o
     e[                     @   s   d dl Z d dlmZ d dlmZ d dlmZmZmZm	Z	m
Z
mZmZmZ ddlmZ ddlmZmZmZmZmZmZ dZdZdZdZeeeeef f Zed	eeef f Zer^dd
lmZ eefdedededefddZ efdedededefddZ!dedededededefddZ"efdede#defddZ$efdede#defdd Z%	d6dedededefd!d"Z&d#ede	e fd$d%Z'd&e	e de	e	e  fd'd(Z(G d)d* d*e)Z*G d+d, d,e*Z+G d-d. d.e)Z,g d/Z-g d0Z.G d1d2 d2e/Z0e0d Z1eG d3d	 d	Z2G d4d5 d5e)Z3dS )7    N)	dataclass)
itemgetter)TYPE_CHECKINGAnyDictListOptionalSetTupleUnion   )utils)T_bboxT_numT_obj
T_obj_iter
T_obj_listT_point   TableSettings)Pageedgesx_tolerancey_tolerancereturnc                 C   sR   g g d}| D ]}||d   | qt|d d|}t|d d|}|| S )zs
    Given a list of edges, snap any within `tolerance` pixels of one another
    to their positional average.
    vhorientationr   x0r   top)appendr   snap_objects)r   r   r   by_orientatione	snapped_v	snapped_h r'   C/var/www/html/venv/lib/python3.10/site-packages/pdfplumber/table.py
snap_edges   s   
	r)   r   	tolerancec           	      C   s   |dkr	d\}}n|dkrd\}}nt dtt| t|d}|d g}|dd	 D ])}|d
 }|| || | krO|| || krNt|||| |d
< q+|| q+|S )z
    Given a list of edges along the same infinite line, join those that
    are within `tolerance` pixels of one another.
    r   )r   x1r   )r    bottomzOrientation must be 'v' or 'h'keyr   r   N)
ValueErrorlistsortedr   r   resize_objectr!   )	r   r   r*   min_propmax_propsorted_edgesjoinedr$   lastr'   r'   r(   join_edge_group'   s   


r9   snap_x_tolerancesnap_y_tolerancejoin_x_tolerancejoin_y_tolerancec           	         sv   dt dtttf fdd}|dks|dkrt| ||} t| |d}tj||d} fdd|D }ttj	| } | S )	z|
    Using the `snap_edges` and `join_edge_group` methods above,
    merge a list of edges into a more "seamless" list.
    edger   c                 S   s$   | d dkrd| d fS d| d fS )Nr   r   r    r   r   r'   )r>   r'   r'   r(   	get_groupP   s   zmerge_edges.<locals>.get_groupr   r-   c                 3   s4    | ]\}}t ||d  |d  dkr nV  qdS )r   r   N)r9   ).0kitemsr<   r=   r'   r(   	<genexpr>[   s    
zmerge_edges.<locals>.<genexpr>)
r   r
   strr   r)   r2   	itertoolsgroupbyr1   chain)	r   r:   r;   r<   r=   r?   _sortededge_groupsedge_genr'   rC   r(   merge_edgesD   s   rL   wordsword_thresholdc           	   
      s   t | tdd}t fdd|}ttt j|}t|dkr"g S tttd|}t	ttd|}g }|D ]"}||||d |d || dd	|||d
 |d
 || dd	g7 }q8|S )zi
    Find (imaginary) horizontal lines that connect the tops
    of at least `word_threshold` words.
    r    r   c                       t |  kS NlenxrN   r'   r(   <lambda>m       z"words_to_edges_h.<locals>.<lambda>r   r   r+   r   )r   r+   r    r,   widthr   r,   )
r   cluster_objectsr   filterr1   mapobjects_to_rectrR   minmax)	rM   rN   by_toplarge_clustersrectsmin_x0max_x1r   rr'   rU   r(   words_to_edges_he   s4   re   c                    sF  t | tdd}t | tdd}dtdtfdd}t | |d}|| | }t|dd	 d
}tfdd	|}ttt j	|}	g }
|	D ] t
 fdd|
D }|sY|
  qEt|
dkrbg S tt j|
}tt|tdd
}tttd|}tttd|tttd|fdd|D || ddg S )zy
    Find (imaginary) vertical lines that connect the left, right, or
    center of at least `word_threshold` words.
    r   r   r+   wordr   c                 S   s   t | d | d  d S )Nr   r+      )float)rf   r'   r'   r(   
get_center   s   z$words_to_edges_v.<locals>.get_centerc                 S   s
   t |  S rP   rQ   rS   r'   r'   r(   rV      s   
 z"words_to_edges_v.<locals>.<lambda>r-   c                    rO   rP   rQ   rS   rU   r'   r(   rV      rW   c                 3   s    | ]	}t  |V  qd S rP   )r   get_bbox_overlapr@   cbboxr'   r(   rD      s    z#words_to_edges_v.<locals>.<genexpr>r   r    r,   c              	      s*   g | ]}|d  |d     ddqS )r   r   r   r+   r    r,   heightr   r'   )r@   b)
max_bottommin_topr'   r(   
<listcomp>   s    	z$words_to_edges_v.<locals>.<listcomp>r   ro   )r   rY   r   r   r   r2   rZ   r1   r[   objects_to_bboxanyr!   rR   bbox_to_rectr^   r]   )rM   rN   by_x0by_x1ri   	by_centerclusterssorted_clustersr`   bboxescondensed_bboxesoverlapcondensed_rectssorted_rectsrc   r'   )rn   rr   rs   rN   r(   words_to_edges_v   sB   
	r   c           	         s   i } fdddD \}}t |tdddD ][}t |tdddD ]O}|d |d | krp|d |d | krp|d |d | krp|d |d | krp|d |d f}||vr^g g d||< || d	 | || d
 | q!q|S )zi
    Given a list of edges, return the points at which they intersect
    within `tolerance` pixels.
    c                    s"   g | ] t t fd dqS )c                    s   | d  kS )Nr   r'   rS   or'   r(   rV      rW   z3edges_to_intersections.<locals>.<listcomp>.<lambda>)r1   rZ   )r@   r   r   r(   rt      s    z*edges_to_intersections.<locals>.<listcomp>r   r   r    r-   r,   r+   r   r   )r2   r   r!   )	r   r   r   intersectionsv_edgesh_edgesr   r   vertexr'   r   r(   edges_to_intersections   s$   

r   r   c                    s   dt dt dtffdd tt tdtt  dtdtt	 f fdd	fd
dt
tD }ttd|S )a8  
    Given a list of points (`intersections`), return all rectangular "cells"
    that those points describe.

    `intersections` should be a dictionary with (x0, top) tuples as keys,
    and a list of edge objects as values. The edge objects should correspond
    to the edges that touch the intersection.
    p1p2r   c                    s   dt dtt fdd}| d |d kr*| |  d | | d }t|r*dS | d |d krI| |  d	 | | d	 }t|rIdS d
S )Nr   r   c                 S   s   t ttj| S rP   )setr[   r   obj_to_bboxr   r'   r'   r(   edges_to_set   s   zCintersections_to_cells.<locals>.edge_connects.<locals>.edges_to_setr   r   Tr   r   F)r   r	   r   intersectionrR   )r   r   r   common)r   r'   r(   edge_connects   s   z-intersections_to_cells.<locals>.edge_connectspointsic                    s   |d krd S | |  | |d d  } fdd|D } fdd|D }|D ];} |s0q(|D ]0} |s:q2|d |d f}|v rb||rb||rb d  d |d |d f    S q2q(d S )Nr   c                        g | ]}|d   d  kr|qS r   r'   r@   rT   ptr'   r(   rt          zFintersections_to_cells.<locals>.find_smallest_cell.<locals>.<listcomp>c                    r   r   r'   r   r   r'   r(   rt     r   r   r'   )r   r   restbelowrightbelow_ptright_ptbottom_right)r   r   n_pointsr   r(   find_smallest_cell
  s,   

$z2intersections_to_cells.<locals>.find_smallest_cellc                 3   s    | ]} |V  qd S rP   r'   )r@   r   )r   r   r'   r(   rD   %  s    z)intersections_to_cells.<locals>.<genexpr>N)r   boolr1   r2   keysrR   r   intr   r   rangerZ   )r   cell_genr'   )r   r   r   r   r   r(   intersections_to_cells   s   
&r   cellsc                    s0  dt dtttttf fdd}t| }t  g }g }t|r|t|}t|D ]<}||}t|dkrC t|O  || || q&t fdd|D }|dkrb t|O  || || q&t||krx|t|  	  |	  t|st|r|t| t
|dd	 d
}	dd |	D }
|
S )z
    Given a list of bounding boxes (`cells`), return a list of tables that
    hold those cells most simply (and contiguously).
    rn   r   c                 S   s(   | \}}}}||f||f||f||ffS rP   r'   )rn   r   r    r+   r,   r'   r'   r(   bbox_to_corners/  s   z(cells_to_tables.<locals>.bbox_to_cornersr   c                 3   s    | ]}| v V  qd S rP   r'   rk   current_cornersr'   r(   rD   H  s    z"cells_to_tables.<locals>.<genexpr>c                 S   s   t dd | D S )Nc                 s   s     | ]}|d  |d fV  qdS )r   r   Nr'   rk   r'   r'   r(   rD   a  s    z4cells_to_tables.<locals>.<lambda>.<locals>.<genexpr>)r]   )tr'   r'   r(   rV   a  s    z!cells_to_tables.<locals>.<lambda>r-   c                 S   s   g | ]
}t |d kr|qS r   rQ   )r@   r   r'   r'   r(   rt   b      z#cells_to_tables.<locals>.<listcomp>)r   r
   r   r1   r   rR   r!   removesumclearr2   )r   r   remaining_cellscurrent_cellstablesinitial_cell_countcellcell_cornerscorner_countrI   filteredr'   r   r(   cells_to_tables)  s:   


r   c                   @   s"   e Zd Zdeee  fddZdS )	CellGroupr   c              	   C   sh   || _ tttdtd |tttdtd |tttdtd |tttdtd |f| _d S Nr   r   rg   r   )r   r]   r[   r   rZ   r^   rn   )selfr   r'   r'   r(   __init__g  s   
zCellGroup.__init__N)__name__
__module____qualname__r   r   r   r   r'   r'   r'   r(   r   f  s    r   c                   @      e Zd ZdS )RowNr   r   r   r'   r'   r'   r(   r   q      r   c                   @   sh   e Zd Zdddee fddZedefddZedee fd	d
Z	de
deeee   fddZdS )Tablepager   r   c                 C   s   || _ || _d S rP   )r   r   )r   r   r   r'   r'   r(   r   v  s   
zTable.__init__r   c                 C   sJ   | j }tttd|tttd|tttd|tttd|fS r   )r   r]   r[   r   r^   )r   rl   r'   r'   r(   rn   z  s   z
Table.bboxc                    s   t | jtddd}tt tttd| j}g }t|tdD ]\}}dd |D  t fdd|D }|	| q"|S )Nr   r   r-   c                 S   s   i | ]}|d  |qS r   r'   )r@   r   r'   r'   r(   
<dictcomp>      zTable.rows.<locals>.<dictcomp>c                    s   g | ]}  |qS r'   )getr   xdictr'   r(   rt     r   zTable.rows.<locals>.<listcomp>)
r2   r   r   r1   r   r[   rF   rG   r   r!   )r   rI   xsrowsy	row_cellsrowr'   r   r(   r     s   z
Table.rowskwargsc                    s   | j j}g }dtdtdtfdd| jD ]cg }fdd|D }jD ]L  d u r.d }n> fdd|D }t|rj d	 |d
<  d |d< d|v r` d  d	  |d<  d  d  |d< tj	|fi |}nd}|
| q%|
| q|S )Ncharrn   r   c                 S   sX   | d | d  d }| d | d  d }|\}}}}t ||ko*||k o*||ko*||k S )Nr    r,   rg   r   r+   )r   )r   rn   v_midh_midr   r    r+   r,   r'   r'   r(   char_in_bbox  s   z#Table.extract.<locals>.char_in_bboxc                    s   g | ]
} |j r|qS r'   rm   r@   r   )r   r   r'   r(   rt     r   z!Table.extract.<locals>.<listcomp>c                    s   g | ]	}| r|qS r'   r'   r   )r   r   r'   r(   rt     s
    
r   x_shiftr   y_shiftlayoutrg   layout_widthr   layout_height )r   charsr   r   r   r   r   rR   r   extract_textr!   )r   r   r   	table_arrarr	row_chars	cell_text
cell_charsr'   )r   r   r   r(   extract  s.   

zTable.extractN)r   r   r   r   r   r   propertyrn   r   r   r   r   rE   r   r'   r'   r'   r(   r   u  s    	"
r   )lineslines_stricttextexplicit)snap_tolerancer:   r;   join_tolerancer<   r=   edge_min_lengthmin_words_verticalmin_words_horizontalintersection_toleranceintersection_x_toleranceintersection_y_tolerancec                   @   r   )
UnsetFloatNr   r'   r'   r'   r(   r     r   r   c                   @   s*  e Zd ZU dZeed< dZeed< dZee	e
eef   ed< dZee	e
eef   ed< eZeed< eZeed< eZeed	< eZeed
< eZeed< eZeed< dZeed< eZeed< eZeed< dZeed< eZeed< eZeed< dZ ee!ee"f  ed< dddZ#e$dee% dd fddZ&dS )r   r   vertical_strategyhorizontal_strategyNexplicit_vertical_linesexplicit_horizontal_linesr   r:   r;   r   r<   r=   r   r   r   r   r   r   r   text_settingsr   c                 C   s   t D ]}t| |p
ddk rtd| dqdD ]}t| |d }|tvr2t| ddt dq| jd	u r;i | _d
D ]}|| jvrN| jdd| j|< q=d| jv rX| jd= dD ]\}}t| |tu rnt| |t| | qZ| S )a  Clean up user-provided table settings.

        Validates that the table settings provided consists of acceptable values and
        returns a cleaned up version. The cleaned up version fills out the missing
        values with the default values in the provided settings.

        TODO: Can be further used to validate that the values are of the correct
            type. For example, raising a value error when a non-boolean input is
            provided for the key ``keep_blank_chars``.

        :param table_settings: User-provided table settings.
        :returns: A cleaned up version of the user-provided table settings.
        :raises ValueError: When an unrecognised key is provided.
        r   zTable setting 'z' cannot be negative)
horizontalvertical	_strategyz_strategy must be one of{,}N)r   r   r*   r   ))r:   r   )r;   r   )r<   r   )r=   r   )r   r   )r   r   )	NON_NEGATIVE_SETTINGSgetattrr0   TABLE_STRATEGIESjoinr   r   UNSETsetattr)r   settingr   strategyattrfallbackr'   r'   r(   __post_init__  s4   


zTableSettings.__post_init__settingsc                 C   s   |d u r|  S t || r|S t |tr@i }i }| D ]\}}|d d dkr0|||dd  < q|||< q||d< | di |S td| )N   text_r   zCannot resolve settings: r'   )
isinstancedictrB   r0   )clsr
  core_settingsr   rA   r   r'   r'   r(   resolve  s   


zTableSettings.resolve)r   r   )'r   r   r   r   rE   __annotations__r   r   r   r   r   r   r   r   DEFAULT_SNAP_TOLERANCEr   r  r:   r;   DEFAULT_JOIN_TOLERANCEr   r<   r=   r   DEFAULT_MIN_WORDS_VERTICALr   r   DEFAULT_MIN_WORDS_HORIZONTALr   r   r   r   r   r   r   r	  classmethodT_table_settingsr  r'   r'   r'   r(   r     s*   
 
5c                   @   s6   e Zd ZdZddddee fddZdefd	d
ZdS )TableFindera0  
    Given a PDF page, find plausible table structures.

    Largely borrowed from Anssi Nurminen's master's thesis:
    http://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3

    ... and inspired by Tabula:
    https://github.com/tabulapdf/tabula-extractor/issues/16
    Nr   r   r
  c                    s^   | _ t| _   _t j jj jj _	t
 j	 _ fddt jD  _d S )Nc                    s   g | ]}t  j|qS r'   )r   r   )r@   
cell_groupr   r'   r(   rt   A  s    z(TableFinder.__init__.<locals>.<listcomp>)r   r   r  r
  	get_edgesr   r   r   r   r   r   r   r   r   )r   r   r
  r'   r  r(   r   7  s   

zTableFinder.__init__r   c              
   C   s  | j }dD ]'}t||d }|dkr,t|d| d }t|dk r,td| d| d	q|j}|j}|d
ks;|d
krG| jjdi |jpDi }g }|j	pMg D ]9}	t
|	trit|	D ]}
|
d dkrg||
 qZqN||	|	| jjd | jjd | jjd | jjd  dd qN|dkrt| jjd}n!|dkrtj| jjddd}n|d
krt||jd}n|dkrg }|| }g }|jpg D ]9}	t
|	trt|	D ]}
|
d dkr||
 qq|| jjd | jjd | jjd | jjd  |	|	dd q|dkr	t| jjd}n$|dkrtj| jjddd}n|d
kr&t||jd}n|dkr-g }|| }t|t| }t||j|j|j|jd}tj||jdS )N)r   r   r   r   	explicit__linesrg   zIf z"_strategy == 'explicit', explicit_zD_lines must be specified as a list/tuple of two or more floats/ints.r   r   r   r   r   ro   r   r   line)	edge_typerU   r   r   )r   r+   rX   r    r,   r   )r:   r;   r<   r=   )
min_lengthr'   )r
  r   rR   r0   r   r   r   extract_wordsr   r   r  r  r   obj_to_edgesr!   rn   filter_edgesr   r   r   r   re   r   r1   rL   r:   r;   r<   r=   r   )r   r
  r   r  r   v_strath_stratrM   
v_explicitdescr$   v_baser   
h_explicith_baser   r   r'   r'   r(   r  E  s   











zTableFinder.get_edgesrP   )	r   r   r   __doc__r   r  r   r   r  r'   r'   r'   r(   r  ,  s    
r  )r   r   )4rF   dataclassesr   operatorr   typingr   r   r   r   r   r	   r
   r   r   r   _typingr   r   r   r   r   r   r  r  r  r  rE   T_intersectionsr  r   r   r)   r9   rL   r   re   r   r   r   r   objectr   r   r   r  r   rh   r   r  r   r  r'   r'   r'   r(   <module>   s    ( 


"
,
@
?=B\