o
    e`-                     @   sp   d Z ddlZddlZddlmZ ddlmZ ddl	m
Z
 ejZeeZG dd dejZG dd dejZdS ))PdfTextPagePdfTextSearcher    N)PdfiumErrorc                       s   e Zd ZdZ fddZedd ZdddZdddZdddZ	dd Z
d ddZdd Zd!ddZdd Zd"ddZ  ZS )#r   z
    Text page helper class.
    
    Attributes:
        raw (FPDF_TEXTPAGE): The underlying PDFium textpage handle.
        page (PdfPage): Reference to the page this textpage belongs to.
    c                       || _ || _t tj d S N)rawpagesuper__init__pdfium_cFPDFText_ClosePage)selfr   r   	__class__ N/var/www/html/venv/lib/python3.10/site-packages/pypdfium2/_helpers/textpage.pyr
         zPdfTextPage.__init__c                 C      | j S r   )r   r   r   r   r   parent      zPdfTextPage.parentr   c                 C   sp   ||krdS t | |}|dkr| |d ||d |S t | |}|dkr2| ||d ||d S ||||fS )Nr      )r   "FPDFText_GetTextIndexFromCharIndex_get_active_text_range)r   c_startc_end	l_passive	r_passivet_startt_endr   r   r   r   $   s   z"PdfTextPage._get_active_text_ranger   ignorec                 C   s   |dkr
|   | }| ||| d }|dkrdS |\}}}}||7 }||| 8 }|| d }	t|	d }
t|
ttj}t| |||}|	|ksWJ d|	 d| |
j	d|d d  j
d	|d
S )aj  
        Extract text from a given range.
        
        See `this benchmark <https://github.com/py-pdf/benchmarks>`_ for a performance and quality comparison with other tools.
        
        Parameters:
            index (int): Index of the first char to include.
            count (int): Number of chars to cover, relative to the internal char list. Defaults to -1 for all remaining chars after *index*.
            errors (str): Error handling when decoding the data (see :meth:`bytes.decode`).
        Returns:
            str: The text in the range in question, or an empty string if no text was found.
        
        Note:
            * The returned text's length does not have to match *count*, even if it will for most PDFs.
              This is because the underlying API may exclude/insert chars compared to the internal list, although rare in practice.
              This means, if the char at ``i`` is excluded, ``get_text_range(i, 2)[1]`` will raise an index error.
              Pdfium provides raw APIs ``FPDFText_GetTextIndexFromCharIndex()`` / ``FPDFText_GetCharIndexFromTextIndex()`` to translate between the two views and identify excluded/inserted chars.
            * In case of leading/trailing excluded characters, pypdfium2 modifies *index* and *count* accordingly to prevent pdfium from unexpectedly reading beyond ``range(index, index+count)``.
        r   r   r       zBuffer size mismatch: z vs N	utf-16-leerrors)count_charsr   ctypescreate_string_buffercastPOINTERc_ushortr   FPDFText_GetTextr   decode)r   indexcountr&   active_ranger   r    r   r   in_sizebuffer
buffer_ptrout_sizer   r   r   get_text_range4   s    zPdfTextPage.get_text_rangeNc                 C   s   | j  }|du r|d }|du r|d }|du r|d }|du r%|d }| ||||f}tjg |ddR  }|dkr>dS d| }	t|	}
t|
ttj}tjg |||R   |
j	j
d|dS )	a  
        Extract text from given boundaries in PDF coordinates.
        If a boundary value is None, it defaults to the corresponding value of :meth:`.PdfPage.get_bbox`.
        
        Parameters:
            errors (str): Error treatment when decoding the data (see :meth:`bytes.decode`).
        Returns:
            str: The text on the page area in question, or an empty string if no text was found.
        Nr   r   r#      r"   r$   r%   )r   get_bboxr   FPDFText_GetBoundedTextr(   r)   r*   r+   r,   r   r.   )r   leftbottomrighttopr&   bboxargsn_charsn_bytesr3   r4   r   r   r   get_text_bounded^   s$   

zPdfTextPage.get_text_boundedc                 C   s   t | }|dkrtd|S )zV
        Returns:
            int: The number of characters on the text page.
        r   zFailed to get character count.)r   FPDFText_CountCharsr   )r   r@   r   r   r   r'      s   
zPdfTextPage.count_charsc                 C   s"   t | ||}|dkrtd|S )a  
        Parameters:
            index (int): Start character index.
            count (int): Character count to consider (defaults to -1 for all remaining).
        Returns:
            int: The number of text rectangles in the given character range.
        r   zFailed to count rectangles.)r   FPDFText_CountRectsr   )r   r/   r0   n_rectsr   r   r   count_rects   s   zPdfTextPage.count_rectsc                 C   s"   t | ||||}|dk rdS |S )a  
        Get the index of a character by position.
        
        Parameters:
            x (float): Horizontal position (in PDF canvas units).
            y (float): Vertical position.
            x_tol (float): Horizontal tolerance.
            y_tol (float): Vertical tolerance.
        Returns:
            int | None: The index of the character at or nearby the point (x, y).
            May be None if there is no character or an error occurred.
        r   N)r   FPDFText_GetCharIndexAtPos)r   xyx_toly_tolr/   r   r   r   	get_index   s   zPdfTextPage.get_indexFc           	      C   s   |rt  }t | ||}|j|j|j|jf\}}}}n&t t t t f\}}}}t | |||||}|j	|j	|j	|j	f\}}}}|sHt
d||||fS )a  
        Get the bounding box of a single character.
        
        Parameters:
            index (int):
                Index of the character to work with, in the page's character array.
            loose (bool):
                Get a more comprehensive box covering the entire font bounds, as opposed to the default tight box specific to the one character.
        Returns:
            Float values for left, bottom, right and top in PDF canvas units.
        zFailed to get charbox.)r   FS_RECTFFPDFText_GetLooseCharBoxr:   r;   r<   r=   c_doubleFPDFText_GetCharBoxvaluer   )	r   r/   looserectoklbrtr   r   r   get_charbox   s   zPdfTextPage.get_charboxc                 C   sP   t  t  t  t  f\}}}}t| |||||}|std|j|j|j|jfS )al  
        Get the bounding box of a text rectangle at the given index.
        Note that :meth:`.count_rects` must be called once with default parameters
        before subsequent :meth:`.get_rect` calls for this function to work (due to PDFium's API).
        
        Returns:
            Float values for left, bottom, right and top in PDF canvas units.
        zzFailed to get rectangle. (Make sure count_rects() was called with default params once before subsequent get_rect() calls.))rO   r   FPDFText_GetRectr   rQ   )r   r/   rU   rV   rW   rX   rT   r   r   r   get_rect   s
   	zPdfTextPage.get_rectc                 C   s   t |dkr
tdd}|r|tjO }|r|tjO }|r!|tjO }|d d}t|t	tj
}t| |||}	t|	| }
| |
 |
S )au  
        Locate text on the page.
        
        Parameters:
            text (str):
                The string to search for.
            index (int):
                Character index at which to start searching.
            match_case (bool):
                If True, the search will be case-specific (upper and lower letters treated as different characters).
            match_whole_word (bool):
                If True, substring occurrences will be ignored (e. g. `cat` would not match `category`).
            consecutive (bool):
                If False (the default), :meth:`.search` will skip past the current match to look for the next match.
                If True, parts of the previous match may be caught again (e. g. searching for `aa` in `aaaa` would match 3 rather than 2 times).
        Returns:
            PdfTextSearcher: A helper object to search text.
        r   z#Text length must be greater than 0. r$   )len
ValueErrorr   FPDF_MATCHCASEFPDF_MATCHWHOLEWORDFPDF_CONSECUTIVEencoder(   r*   r+   r,   FPDFText_FindStartr   _add_kid)r   textr/   
match_casematch_whole_wordconsecutiveflagsenc_textenc_text_ptrraw_searchersearcherr   r   r   search   s   




zPdfTextPage.search)r   r   )r   r   r!   )NNNNr!   )r   r   )F)r   FFF)__name__
__module____qualname____doc__r
   propertyr   r   r6   rB   r'   rF   rL   rY   r[   rn   __classcell__r   r   r   r   r      s    



*!

r   c                       sD   e Zd ZdZ fddZedd Zdd Zdd	 Zd
d Z	  Z
S )r   z
    Text searcher helper class.
    
    Attributes:
        raw (FPDF_SCHHANDLE): The underlying PDFium searcher handle.
        textpage (PdfTextPage): Reference to the textpage this searcher belongs to.
    c                    r   r   )r   textpager	   r
   r   FPDFText_FindClose)r   r   ru   r   r   r   r
     r   zPdfTextSearcher.__init__c                 C   r   r   )ru   r   r   r   r   r     r   zPdfTextSearcher.parentc                 C   s,   || }|sd S t | }t | }||fS r   )r   FPDFText_GetSchResultIndexFPDFText_GetSchCount)r   	find_funcrT   r/   r0   r   r   r   _get_occurrence  s   

zPdfTextSearcher._get_occurrencec                 C      |  tjS )z
        Returns:
            (int, int): Start character index and count of the next occurrence,
            or None if the last occurrence was passed.
        )rz   r   FPDFText_FindNextr   r   r   r   get_next     zPdfTextSearcher.get_nextc                 C   r{   )z
        Returns:
            (int, int): Start character index and count of the previous occurrence (i. e. the one before the last valid occurrence),
            or None if the last occurrence was passed.
        )rz   r   FPDFText_FindPrevr   r   r   r   get_prev!  r~   zPdfTextSearcher.get_prev)ro   rp   rq   rr   r
   rs   r   rz   r}   r   rt   r   r   r   r   r      s    
r   )__all__r(   loggingpypdfium2.rawr   r   pypdfium2.internalinternalpdfium_ipypdfium2._helpers.miscr   rO   	getLoggerro   loggerAutoCloseabler   r   r   r   r   r   <module>   s   
 n