o
     eT                  	   @   s*  d dl Z d dlmZ d dlmZmZmZmZmZm	Z	m
Z
mZmZmZ d dlmZ d dlmZmZmZmZmZmZ d dlmZmZ d dlmZ d dlmZ d	d
lmZ d	dl m!Z!m"Z"m#Z#m$Z$ d	dl%m&Z& d	dl'm(Z(m)Z)m*Z*m+Z+ d	dlm,Z,m-Z-m.Z. d	dl/m0Z0 e 1dZ2e3g dZ4erd	dl5m6Z6 d	dl7m8Z8 ddddddZ9de:de;fddZ<deedf dee
eee=e>f df  e
e; f fd d!Z?dedee
eee=e>f df  e
e; f fd"d#Z@G d$d% d%eZAG d&d' d'e&ZBG d(d) d)eBZCd*e!d+e!ddfd,d-ZDG d.d/ d/eCZEG d0d1 d1eCZFdS )2    N)	lru_cache)
TYPE_CHECKINGAnyCallableDict	GeneratorListOptionalPatternTupleUnion)PDFPageAggregator)LTCharLTComponentLTContainerLTItemLTPageLTTextContainer)PDFPageInterpreter	PDFStackT)PDFPage)	PSLiteral   )utils)T_bboxT_numT_obj
T_obj_list)	Container)T_table_settingsTableTableFinderTableSettings)decode_textresolve_allresolve_and_decode)TextMapz^LT)advheight	linewidthptssizesrcsizewidthx0x1y0y1bitsmatrixuprightfontnametext	imagemask
colorspaceevenoddfillnon_stroking_colorpathstreamstrokestroking_colormcidtag)	PageImage)PDFzSimSun,RegularzSimHei,RegularzSimKai,RegularzSimFang,RegularzSimLi,Regular)s   s   s   _GB2312s   _GB2312s   r5   returnc                 C   sh   d| v r|  dd }| d | | |d  }}nd| }}t|t|dd }t|dd | S )N   +r          )indexCP936_FONTNAMESgetstr)r5   split_atprefixsuffix
suffix_new rQ   B/var/www/html/venv/lib/python3.10/site-packages/pdfplumber/page.pyfix_fontname_bytesV   s   
rS   color.c                 C   s4   t | d tr| d d pd t| d jfS | d fS )NrH   )
isinstancer   r#   name)rT   rQ   rQ   rR   separate_patterna   s   rW   c                 C   sJ   | d u rdS t | tr| }t|S t | trt| }t|S | f}t|S )N)NN)rU   tuplelistrW   )rT   	tuplefiedrQ   rQ   rR   normalize_colorj   s   

r[   c                       s   e Zd ZU dZdZee ed< dZee	 ed< dde
dee ddfdd	Zdd
dZdddZdef fddZd fddZd fddZ  ZS )"PDFPageAggregatorWithMarkedContentzZExtract layout from a specific page, adding marked-content IDs to
    objects where found.Ncur_mcidcur_tagrA   propsrD   c                 C   s6   t |j| _t|trd|v r|d | _dS d| _dS )z5Handle beginning of tag, setting current MCID if any.MCIDN)r#   rV   r^   rU   dictr]   )selfrA   r_   rQ   rQ   rR   	begin_tag   s   
z,PDFPageAggregatorWithMarkedContent.begin_tagc                 C   s   d| _ d| _dS )z/Handle beginning of tag, clearing current MCID.N)r^   r]   rb   rQ   rQ   rR   end_tag   s   
z*PDFPageAggregatorWithMarkedContent.end_tagc                 C   s    | j jd }| j|_| j|_dS )z^Add current MCID to what we hope to be the most recent object created
        by pdfminer.six.rH   N)cur_item_objsr]   r@   r^   rA   )rb   cur_objrQ   rQ   rR   tag_cur_item   s   	z/PDFPageAggregatorWithMarkedContent.tag_cur_itemc                    s   t  j|i |}|   |S )z;Hook for rendering characters, adding the `mcid` attribute.)superrender_charri   )rb   argskwargsr'   	__class__rQ   rR   rk      s   z.PDFPageAggregatorWithMarkedContent.render_charc                       t  j|i | |   dS )z7Hook for rendering images, adding the `mcid` attribute.N)rj   render_imageri   rb   rl   rm   rn   rQ   rR   rq         z/PDFPageAggregatorWithMarkedContent.render_imagec                    rp   )zAHook for rendering lines and curves, adding the `mcid` attribute.N)rj   
paint_pathri   rr   rn   rQ   rR   rt      rs   z-PDFPageAggregatorWithMarkedContent.paint_pathN)rD   N)__name__
__module____qualname____doc__r]   r	   int__annotations__r^   rL   r   r   rc   re   ri   floatrk   rq   rt   __classcell__rQ   rQ   rn   rR   r\   x   s   
 

r\   c                   @   s  e Zd ZU ejdg Zee ed< dZe	ed< dZ
	dbddd	ed
edefddZedefddZedefddZedefddZedefddZedefddZedeeef fddZdeeef deeef fddZdedefdd Zd!ee deeddf fd"d#Z deeef fd$d%Z!	dcd&e"e# de$fd'd(Z%	dcd&e"e# dee& fd)d*Z'	dcd&e"e# de"e& fd+d,Z(	dcd&e"e# deeee"e    fd-d.Z)	dcd&e"e# de"eee"e    fd/d0Z*d1e+de,fd2d3Z-					ddd4e.ee/e f d5e	d6e	d7ed8e	d9e	d1e+deeee+f  fd:d;Z0d1e+defd<d=Z1d1e+defd>d?Z2d1e+defd@dAZ3	dedBe	d8e	d1e+defdCdDZ4	dfdFe5dGe	dHe	ddIfdJdKZ6	dfdFe5dGe	dHe	ddIfdLdMZ7	dfdFe5dGe	dHe	ddIfdNdOZ8dPe9ege	f ddQfdRdSZ:d1e+ddQfdTdUZ;				EdgdVe"e.ee<f  dWe"e.ee<f  dXe"e.ee<f  dYe	ddZf
d[d\Z=dcd]e"ee  deee+f fd^d_Z>defd`daZ?dS )hPage_layoutcached_propertiesTis_originalNr   pdfrC   page_objpage_numberinitial_doctopc           	      C   s4  || _ | | _|| _|| _t| jjddpd}|d | _| j| j_|| _	|jd}|jd}|d ur9t|nd | _
t|pB| j
| _| j}| jdv rnt|d |d t|d |d	 t|d |d t|d |d	 fn!t|d |d	 t|d |d t|d |d	 t|d |d f| _t | j| _d S )
NRotater   ih  CropBoxMediaBox)Z   i  r      rG   )r   	root_pager   r   r$   attrsrK   rotationrotater   cropboxmediaboxminmaxbboxr   _get_textmapget_textmap)	rb   r   r   r   r   	_rotationr   r   mrQ   rQ   rR   __init__   s4   


	zPage.__init__rD   c                 C      | j d | j d  S )NrG   r   r   rd   rQ   rQ   rR   r-         z
Page.widthc                 C   r   )Nr   r   r   rd   rQ   rQ   rR   r(      r   zPage.heightc                 C   sR   t | dr| jS t| jj| j| jjd}t| jj|}|| j	 |
 | _| jS )Nr   )pagenolaparams)hasattrr   r\   r   rsrcmgrr   r   r   process_pager   
get_result)rb   deviceinterpreterrQ   rQ   rR   layout   s   

zPage.layoutc                    s4   dt dt f fdd}t jjpg }tt||S )NannotrD   c                    s  | d }|  di }| d|  d|  dd}| D ]"\}}|d ur?z	|d||< W q ty>   |d||< Y qw q jd	|d
 |d |d |d  j j |d   j|d   j|d  |d |d
  |d |d  d}|| d| v r | d< | |d< |S )NRectAURITContents)urititlecontentszutf-8zutf-16r   r   r   rG   r   )r   object_typer.   r0   r/   r1   doctoptopbottomr-   r(   Pdata)rK   itemsdecodeUnicodeDecodeErrorr   r   r(   update)r   rectaextraskvparsedrd   rQ   rR   parse   s>   
zPage.annots.<locals>.parse)r   r$   r   annotsrY   map)rb   r   rawrQ   rd   rR   r      s   %zPage.annotsc                 C   s   dd | j D S )Nc                 S   s   g | ]
}|d  dur|qS )r   NrQ   ).0r   rQ   rQ   rR   
<listcomp>  s    z#Page.hyperlinks.<locals>.<listcomp>)r   rd   rQ   rQ   rR   
hyperlinks  s   zPage.hyperlinksc                 C   s    t | dr| jS |  | _| jS )N_objects)r   r   parse_objectsrd   rQ   rQ   rR   objects  s   

zPage.objectsptc                 C   s   |d | j |d  fS )Nr   r   )r(   )rb   r   rQ   rQ   rR   point2coord$  s   zPage.point2coordobjc           	      C   s  t td|jj }dtttf dt	tttf  fdd}t
td t||j }||d< | j|d< dD ]}t||rGtt||j||< q6d	D ]\}}||v r^t|| \||< ||< qJt|ttfrl| |d
< t|tr|j}t|j\|d< |d< t|j\|d< |d< t|d trt|d |d< d|v rtt| j|d |d< d|v r| j |d  |d< | j |d  |d< | j!|d  |d< |S )N itemrD   c                 S   s$   | \}}|t v rt|}||fS d S ru   )	ALL_ATTRSr$   )r   r   r   resrQ   rQ   rR   process_attr*  s
   z)Page.process_object.<locals>.process_attrr   r   )ncsscs))r?   stroking_pattern)r;   non_stroking_patternr6   r?   r   r;   r   r5   r*   r0   r1   r   r   r   )"resublt_patro   rv   lowerr   rL   r   r	   ra   filterr   __dict__r   r   r   r%   getattrrV   r[   rU   r   r   get_textgraphicstatescolorncolorbytesrS   rY   r   r(   r   )	rb   r   kindr   attrcs
color_attrpattern_attrgsrQ   rQ   rR   process_object'  s@   &


zPage.process_objectlayout_objectsc                 c   sR    |D ]#}t |tr | jjd ur| |V  | |jE d H  q| |V  qd S ru   )rU   r   r   r   r   iter_layout_objectsrg   )rb   r   r   rQ   rQ   rR   r   d  s   
zPage.iter_layout_objectsc                 C   sR   i }|  | jjD ]}|d }|dv rq	||d u rg ||< || | q	|S )Nr   )anno)r   r   rg   rK   append)rb   r   r   r   rQ   rQ   rR   r   r  s   zPage.parse_objectstable_settingsc                 C   s   t |}t| |S ru   )r"   resolver!   rb   r   tsetrQ   rQ   rR   debug_tablefinder}  s   

zPage.debug_tablefinderc                 C   s   t |}t| |jS ru   )r"   r   r!   tablesr   rQ   rQ   rR   find_tables  s   
zPage.find_tablesc                 C   sX   t |}| |}t|dkrd S dtdttttf fdd}tt	||dd }|S )Nr   xrD   c                 S   s   t | j | jd | jd fS )Nr   r   )lencellsr   r   rQ   rQ   rR   sorter  s   zPage.find_table.<locals>.sorter)key)
r"   r   r   r   r    r   rz   r   rY   sorted)rb   r   r   r   r   largestrQ   rQ   rR   
find_table  s   

zPage.find_tablec                    s&   t | |  } fdd|D S )Nc                    s"   g | ]}|j d i  jpi qS )rQ   )extracttext_settings)r   tabler   rQ   rR   r     s   " z'Page.extract_tables.<locals>.<listcomp>)r"   r   r   )rb   r   r   rQ   r   rR   extract_tables  s   

zPage.extract_tablesc                 C   s6   t |}| |}|d u rd S |jdi |jpi S NrQ   )r"   r   r   r   r   )rb   r   r   r   rQ   rQ   rR   extract_table  s
   

zPage.extract_tablerm   c                 K   sh   t | jd | jd d}d|vr|d| ji d|vr$|d| ji i ||}tj| jfi |S )Nr   r   )x_shifty_shiftlayout_width_charslayout_widthlayout_height_charslayout_height)ra   r   r   r-   r(   r   chars_to_textmapchars)rb   rm   defaultsfull_kwargsrQ   rQ   rR   r     s   zPage._get_textmappatternregexcase
main_groupreturn_charsreturn_groupsc           	      K   s&   | j di |}|j||||||dS )N)r  r  r  r  r  rQ   )r   search)	rb   r  r  r  r  r  r  rm   textmaprQ   rQ   rR   r    s   
zPage.searchc                 K   s   | j di |jS r   )r   	as_stringrb   rm   rQ   rQ   rR   extract_text  s   zPage.extract_textc                 K      t j| jfi |S ru   )r   extract_text_simpler	  r  rQ   rQ   rR   r       zPage.extract_text_simplec                 K   r  ru   )r   extract_wordsr	  r  rQ   rQ   rR   r    r  zPage.extract_wordsstripc                 K   s   | j di |j||dS )N)r  r  rQ   )r   extract_text_lines)rb   r  r  rm   rQ   rQ   rR   r    s   zPage.extract_text_linesFr   relativestrictCroppedPagec                 C   s   t | |||dS )N)r  r  )r  rb   r   r  r  rQ   rQ   rR   crop  s   z	Page.cropc                 C      t | |||tjdS zS
        Same as .crop, except only includes objects fully within the bbox
        )r  r  crop_fn)r  r   within_bboxr   rQ   rQ   rR   r%       zPage.within_bboxc                 C   r"  r#  )r  r   outside_bboxr   rQ   rQ   rR   r'    r&  zPage.outside_bboxtest_functionFilteredPagec                 C   s
   t | |S ru   )r)  )rb   r(  rQ   rQ   rR   r        
zPage.filterc                 K   sB   t | dd }dd | j D |_tj| jfi ||jd< |S )u   
        Removes duplicate chars — those sharing the same text, fontname, size,
        and positioning (within `tolerance`) as other characters on the page.
        c                 S   s   dS )NTrQ   r   rQ   rQ   rR   <lambda>  s    z#Page.dedupe_chars.<locals>.<lambda>c                 S   s   i | ]\}}||qS rQ   rQ   )r   r   objsrQ   rQ   rR   
<dictcomp>  s    z%Page.dedupe_chars.<locals>.<dictcomp>char)r)  r   r   r   r   dedupe_charsr	  )rb   rm   prQ   rQ   rR   r/    s   zPage.dedupe_chars
resolutionr-   r(   	antialiasrB   c                 C   s~   ddl m}m} tdd |||fD }|dkrtd| |dur+d| | j }n|dur6d| | j }|| |p;||dS )	z
        You can pass a maximum of 1 of the following:
        - resolution: The desired number pixels per inch. Defaults to 72.
        - width: The desired image width in pixels.
        - height: The desired image width in pixels.
        r   )DEFAULT_RESOLUTIONrB   c                 s   s    | ]}|d uV  qd S ru   rQ   )r   r   rQ   rQ   rR   	<genexpr>  s    z Page.to_image.<locals>.<genexpr>zUOnly one of these arguments can be provided: resolution, width, height. You provided NH   )r1  r2  )displayr3  rB   sum
ValueErrorr-   r(   )rb   r1  r-   r(   r2  r3  rB   	num_specsrQ   rQ   rR   to_image  s   
zPage.to_imageobject_typesc              	   C   sl   |d u rt | j dg }n|}| j| j| j| j| j| j| j	| j
d}|D ]}t| |d ||d < q&|S )Nr   )r   r   r   r   r   r   r-   r(   s)rY   r   keysr   r   r   r   r   r   r-   r(   r   )rb   r;  _object_typesdtrQ   rQ   rR   to_dict  s   
zPage.to_dictc                 C   s   d| j  dS )Nz<Page:>)r   rd   rQ   rQ   rR   __repr__.  s   zPage.__repr__)r   ru   )TTr   TT)TT)FT)NNNF)@rv   rw   rx   r   r   r   rL   r{   r   boolpagesr   rz   r   r   propertyr-   r(   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r	   r   r!   r   r    r   r   r   r  r   r&   r   r   r
   r  r  r  r  r  r   r!  r%  r'  r   r   r/  r|   r:  rA  rC  rQ   rQ   rQ   rR   r~      s*  
 
*)"=






	






$r~   c                   @   s(   e Zd ZU dZeed< defddZdS )DerivedPageFr   parent_pagec                 C   sD   || _ |j| _|j| _|j| _|j| _| tj t | j	| _
d S ru   )rH  r   r   r   r   flush_cacher   r   r   r   r   )rb   rH  rQ   rQ   rR   r   5  s   zDerivedPage.__init__N)rv   rw   rx   r   rD  r{   r~   r   rQ   rQ   rQ   rR   rG  2  s   
 rG  r   parent_bboxc                 C   st   t | }|dkrtd|  dt | |}|d u r%td|  d| t |}||k r8td|  d| d S )Nr   zBounding box z has an area of zero.z. is entirely outside parent page bounding box z. is not fully within parent page bounding box )r   calculate_arear8  get_bbox_overlap)r   rJ  	bbox_areaoverlapoverlap_arearQ   rQ   rR   test_proposed_bbox?  s$   

rP  c                       sb   e Zd Zejddfdededeeegef de	de	f
 fdd	Z
ed
eeef fddZ  ZS )r  FTrH  	crop_bboxr$  r  r  c                    s   |r|j \}}}} \}	}
}}|	| |
| || || f |r%t |j  dtdtf fdd}t | || _tju rE|j | _ d S  | _ d S )Nr,  rD   c                    s
   |  S ru   rQ   )r,  rQ  r$  rQ   rR   _crop_fnd  r*  z&CroppedPage.__init__.<locals>._crop_fn)r   rP  r   rj   r   rS  r   r'  )rb   rH  rQ  r$  r  r  o_x0o_top_r.   r   r/   r   rS  rn   rR  rR   r   T  s   

zCroppedPage.__init__rD   c                    2   t  dr jS  fdd jj D  _ jS )Nr   c                    s   i | ]
\}}|  |qS rQ   )rS  r   r   r   rd   rQ   rR   r-  u  s    z'CroppedPage.objects.<locals>.<dictcomp>r   r   rH  r   r   rd   rQ   rd   rR   r   q  s   


zCroppedPage.objects)rv   rw   rx   r   crop_to_bboxr~   r   r   r   rD  r   rF  r   rL   r   r}   rQ   rQ   rn   rR   r  S  s"     r  c                       sJ   e Zd Zdedeegef f fddZede	e
ef fddZ  ZS )r)  rH  	filter_fnc                    s   |j | _ || _t | d S ru   )r   r[  rj   r   )rb   rH  r[  rn   rQ   rR   r   |  s   zFilteredPage.__init__rD   c                    rW  )Nr   c                    s"   i | ]\}}|t t j|qS rQ   )rY   r   r[  rX  rd   rQ   rR   r-    s    z(FilteredPage.objects.<locals>.<dictcomp>rY  rd   rQ   rd   rR   r     s   


zFilteredPage.objects)rv   rw   rx   r~   r   r   rD  r   rF  r   rL   r   r   r}   rQ   rQ   rn   rR   r)  {  s      r)  )Gr   	functoolsr   typingr   r   r   r   r   r   r	   r
   r   r   pdfminer.converterr   pdfminer.layoutr   r   r   r   r   r   pdfminer.pdfinterpr   r   pdfminer.pdfpager   pdfminer.psparserr   r   r   _typingr   r   r   r   	containerr   r   r   r    r!   r"   r#   r$   r%   
utils.textr&   compiler   setr   r6  rB   r   rC   rJ   r   rL   rS   r|   rz   rW   r[   r\   r~   rG  rP  r  r)  rQ   rQ   rQ   rR   <module>   s^    0 
!	
"
	"
2   (