
    gm                        d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
mZmZmZ d dlmZ d dlmZ d dlmZmZmZ  G d d	e      Z G d
 de      Zd Zd Zd Zd Zd Zd ZddZddZd Z d Z!d Z"d Z#d Z$y)    N)reduce)ElementTree)FileSystemPathPointerPathPointerSeekableUnicodeStreamReaderZipFilePathPointer)slice_bounds)wordpunct_tokenize)AbstractLazySequenceLazyConcatenationLazySubsequencec                   v    e Zd ZdZddZ ed d      Zd Zd Zd	 Z	d
 Z
d Zd Zd Zd Zd Zd Zd Zd Zy)StreamBackedCorpusViewa^  
    A 'view' of a corpus file, which acts like a sequence of tokens:
    it can be accessed by index, iterated over, etc.  However, the
    tokens are only constructed as-needed -- the entire corpus is
    never stored in memory at once.

    The constructor to ``StreamBackedCorpusView`` takes two arguments:
    a corpus fileid (specified as a string or as a ``PathPointer``);
    and a block reader.  A "block reader" is a function that reads
    zero or more tokens from a stream, and returns them as a list.  A
    very simple example of a block reader is:

        >>> def simple_block_reader(stream):
        ...     return stream.readline().split()

    This simple block reader reads a single line at a time, and
    returns a single token (consisting of a string) for each
    whitespace-separated substring on the line.

    When deciding how to define the block reader for a given
    corpus, careful consideration should be given to the size of
    blocks handled by the block reader.  Smaller block sizes will
    increase the memory requirements of the corpus view's internal
    data structures (by 2 integers per block).  On the other hand,
    larger block sizes may decrease performance for random access to
    the corpus.  (But note that larger block sizes will *not*
    decrease performance for iteration.)

    Internally, ``CorpusView`` maintains a partial mapping from token
    index to file position, with one entry per block.  When a token
    with a given index *i* is requested, the ``CorpusView`` constructs
    it as follows:

      1. First, it searches the toknum/filepos mapping for the token
         index closest to (but less than or equal to) *i*.

      2. Then, starting at the file position corresponding to that
         index, it reads one block at a time using the block reader
         until it reaches the requested token.

    The toknum/filepos mapping is created lazily: it is initially
    empty, but every time a new block is read, the block's
    initial token is added to the mapping.  (Thus, the toknum/filepos
    map has one entry per block.)

    In order to increase efficiency for random access patterns that
    have high degrees of locality, the corpus view may cache one or
    more blocks.

    :note: Each ``CorpusView`` object internally maintains an open file
        object for its underlying corpus file.  This file should be
        automatically closed when the ``CorpusView`` is garbage collected,
        but if you wish to close it manually, use the ``close()``
        method.  If you access a ``CorpusView``'s items after it has been
        closed, the file object will be automatically re-opened.

    :warning: If the contents of the file are modified during the
        lifetime of the ``CorpusView``, then the ``CorpusView``'s behavior
        is undefined.

    :warning: If a unicode encoding is specified when constructing a
        ``CorpusView``, then the block reader may only call
        ``stream.seek()`` with offsets that have been returned by
        ``stream.tell()``; in particular, calling ``stream.seek()`` with
        relative offsets, or with offsets based on string lengths, may
        lead to incorrect behavior.

    :ivar _block_reader: The function used to read
        a single block from the underlying file stream.
    :ivar _toknum: A list containing the token index of each block
        that has been processed.  In particular, ``_toknum[i]`` is the
        token index of the first token in block ``i``.  Together
        with ``_filepos``, this forms a partial mapping between token
        indices and file positions.
    :ivar _filepos: A list containing the file position of each block
        that has been processed.  In particular, ``_toknum[i]`` is the
        file position of the first character in block ``i``.  Together
        with ``_toknum``, this forms a partial mapping between token
        indices and file positions.
    :ivar _stream: The stream used to access the underlying corpus file.
    :ivar _len: The total number of tokens in the corpus, if known;
        or None, if the number of tokens is not yet known.
    :ivar _eofpos: The character position of the last character in the
        file.  This is calculated when the corpus view is initialized,
        and is used to decide when the end of file has been reached.
    :ivar _cache: A cache of the most recently read block.  It
       is encoded as a tuple (start_toknum, end_toknum, tokens), where
       start_toknum is the token index of the first token in the block;
       end_toknum is the token index of the first token not in the
       block; and tokens is a list of the tokens in the block.
    Nc                    |r|| _         dg| _        |g| _        || _        d| _        || _        d| _        d| _        	 d| _        	 	 t        | j
                  t              r | j
                  j                         | _        n.t        j                  | j
                        j                  | _        d| _        y# t         $ r}t#        d|d|       |d}~ww xY w)a  
        Create a new corpus view, based on the file ``fileid``, and
        read with ``block_reader``.  See the class documentation
        for more information.

        :param fileid: The path to the file that is read by this
            corpus view.  ``fileid`` can either be a string or a
            ``PathPointer``.

        :param startpos: The file position at which the view will
            start reading.  This can be used to skip over preface
            sections.

        :param encoding: The unicode encoding that should be used to
            read the file's contents.  If no encoding is specified,
            then the file's contents will be read as a non-unicode
            string (i.e., a str).
        r   NzUnable to open or access z -- )r   N)
read_block_toknum_filepos	_encoding_len_fileid_stream_current_toknum_current_blocknum
isinstancer   	file_size_eofpososstatst_size	Exception
ValueError_cache)selffileidblock_readerstartposencodingexcs         L/var/www/openai/venv/lib/python3.12/site-packages/nltk/corpus/reader/util.py__init__zStreamBackedCorpusView.__init__}   s    & *DOs!
!	#	( "&	(	W$,,4#||557!wwt||4<< %  	W8
$seLMSVV	Ws   A(B7 7	C CCc                     | j                   S N)r   r$   s    r*   <lambda>zStreamBackedCorpusView.<lambda>   s    T\\    za
        The fileid of the file that is accessed by this view.

        :type: str or PathPointer)docc                     t        d      )z
        Read a block from the input stream.

        :return: a block of tokens from the input stream
        :rtype: list(any)
        :param stream: an input stream
        :type stream: stream
        zAbstract Method)NotImplementedError)r$   streams     r*   r   z!StreamBackedCorpusView.read_block   s     ""344r0   c                 <   t        | j                  t              r+| j                  j                  | j                        | _        y| j                  r0t        t        | j                  d      | j                        | _        yt        | j                  d      | _        y)z
        Open the file stream associated with this corpus view.  This
        will be called performed if any value is read from the view
        while its file stream is closed.
        rbN)r   r   r   openr   r   r   r.   s    r*   _openzStreamBackedCorpusView._open   sg     dllK0<<,,T^^<DL^^6T\\4($..DL  d3DLr0   c                 ^    | j                   | j                   j                          d| _         y)as  
        Close the file stream associated with this corpus view.  This
        can be useful if you are worried about running out of file
        handles (although the stream should automatically be closed
        upon garbage collection of the corpus view).  If the corpus
        view is accessed after it is closed, it will be automatically
        re-opened.
        N)r   closer.   s    r*   r:   zStreamBackedCorpusView.close   s%     <<#LL r0   c                     | S r-    r.   s    r*   	__enter__z StreamBackedCorpusView.__enter__   s    r0   c                 $    | j                          y r-   )r:   )r$   typevalue	tracebacks       r*   __exit__zStreamBackedCorpusView.__exit__   s    

r0   c                 x    | j                   #| j                  | j                  d         D ]  } | j                   S Nr   )r   iterate_fromr   r$   toks     r*   __len__zStreamBackedCorpusView.__len__   s9    99 ((b)9: ;yyr0   c                 
   t        |t              rZt        | |      \  }}| j                  d   }||k  r*|| j                  d   k  r| j                  d   ||z
  ||z
   S t	        | ||      S |dk  r|t        |       z  }|dk  rt        d      | j                  d   }||cxk  r| j                  d   k  rn n| j                  d   ||z
     S 	 t        | j                  |            S # t        $ r}t        d      |d }~ww xY w)Nr         zindex out of range)
r   slicer	   r#   r   len
IndexErrornextrE   StopIteration)r$   istartstopoffsetes         r*   __getitem__z"StreamBackedCorpusView.__getitem__   s   a&tQ/KE4[[^F44;;q>#9{{1~efntf}EE"455 1uSY1u !566[[^F+T[[^+{{1~a&j11>D--a011  > !56A=>s   C( (	D1C==Dc              #     K   | j                   d   |cxk  r| j                   d   k  r3n n0| j                   d   || j                   d   z
  d  D ]  }| |dz  } || j                  d   k  rBt        j                  | j                  |      dz
  }| j                  |   }| j                  |   }n6t        | j                        dz
  }| j                  d   }| j                  d   }| j                  | j                          | j                  dk(  rd| _	        || j                  k  r| j                  j                  |       || _        || _        | j                  | j                        }t        |t        t         t"        f      sJ d| j                  j$                  z         t        |      }| j                  j'                         }||kD  s J d| j                  j$                  |fz         |||z   t!        |      f| _         || j                  d   k  sJ |dkD  r|dz  }|| j                  d   k(  rN|| j                  d   kD  sJ | j                  j)                  |       | j                  j)                  ||z          n5|| j                  |   k(  sJ d       ||z   | j                  |   k(  sJ d       || j                  k(  r
||z   | _	        |t+        d||z
        d  D ]  }|  || j                  k  sJ || j                  k(  rn||z  }|}|| j                  k  r| j                  J | j-                          y w)	Nr   rJ   rK   r   z.block reader %s() should return list or tuple.z=block reader %s() should consume at least 1 byte (filepos=%d)z*inconsistent block reader (num chars read)z/inconsistent block reader (num tokens returned))r#   r   bisectbisect_rightr   rM   r   r8   r   r   seekr   r   r   r   tuplelistr   __name__tellappendmaxr:   )	r$   	start_tokrG   block_indextoknumfilepostokensnum_toksnew_fileposs	            r*   rE   z#StreamBackedCorpusView.iterate_from  sC    ;;q>Y7Q7{{1~i$++a.&@&BC	Q	 D t||B'' --dllIFJK\\+.FmmK0Gdll+a/K\\"%FmmB'G <<JJL <<1DI $LLg&#)D %0D"__T\\2Ffud4H&IJ @//**+J 6{H,,++-Kg%N((R % "6H#4d6lCDK T\\"----!|q T\\"--&r)::::MM((5LL''(9: $t}}['AADCDA )T\\+-FFIHIF dll*"X-	 c!Y%78:;	 < $,,...dll*hF!Gk $p yy$$$ 	

s   L M$Mc                     t        | |g      S r-   concatr$   others     r*   __add__zStreamBackedCorpusView.__add__l  s    tUm$$r0   c                     t        || g      S r-   ri   rk   s     r*   __radd__zStreamBackedCorpusView.__radd__o  s    udm$$r0   c                      t        | g|z        S r-   ri   r$   counts     r*   __mul__zStreamBackedCorpusView.__mul__r      tfun%%r0   c                      t        | g|z        S r-   ri   rq   s     r*   __rmul__zStreamBackedCorpusView.__rmul__u  rt   r0   )Nr   utf8)r]   
__module____qualname____doc__r+   propertyr%   r   r8   r:   r=   rB   rH   rV   rE   rm   ro   rs   rv   r<   r0   r*   r   r       sa    Zx8%t !%F	54>6Yz%%&&r0   r   c                   (    e Zd ZdZd Zd Zd Zd Zy)ConcatenatedCorpusViewz
    A 'view' of a corpus file that joins together one or more
    ``StreamBackedCorpusViews<StreamBackedCorpusView>``.  At most
    one file handle is left open at any time.
    c                 4    || _         	 dg| _        	 d | _        y )Nr   )_pieces_offsets_open_piece)r$   corpus_viewss     r*   r+   zConcatenatedCorpusView.__init__  s.    #	 	>  	Jr0   c                     t        | j                        t        | j                        k  r#| j                  | j                  d         D ]  } | j                  d   S rD   )rM   r   r   rE   rF   s     r*   rH   zConcatenatedCorpusView.__len__  sN    t}}T\\!22((r):; < }}R  r0   c                 F    | j                   D ]  }|j                           y r-   )r   r:   )r$   pieces     r*   r:   zConcatenatedCorpusView.close  s    \\EKKM "r0   c              #   `  K   t        j                  | j                  |      dz
  }|t        | j                        k  r| j                  |   }| j                  |   }| j
                  |ur-| j
                  | j
                  j                          || _        |j                  t        d||z
              E d {    |dz   t        | j                        k(  r4| j                  j                  | j                  d   t        |      z          |dz  }|t        | j                        k  ry y 7 sw)NrJ   r   r   )
rX   rY   r   rM   r   r   r:   rE   r`   r_   )r$   ra   piecenumrT   r   s        r*   rE   z#ConcatenatedCorpusView.iterate_from  s    &&t}}i@1DT\\**]]8,FLL*E u,##/$$**,#(  ))#aV1C*DEEE !|s4==11$$T]]2%6U%CD MH% T\\** Fs   B6D.8D,9A0D.*D.N)r]   rx   ry   rz   r+   rH   r:   rE   r<   r0   r*   r}   r}   y  s    J!r0   r}   c                    t        |       dk(  r| d   S t        |       dk(  rt        d      | D ch c]  }|j                   }}t        d | D              rdj	                  |       S |D ]  }t        |t        t        f      r n t        |       S |D ]  }t        |t              r n t        |       S t        |      dk(  rt        |      d   }t        |t              rt        d | g       S t        |t              rt        d | d      S t        j                  |      r/t        j                  d	      }| D ]  }|j!                  |        |S t        d
|z        c c}w )z
    Concatenate together the contents of multiple documents from a
    single corpus, using an appropriate concatenation function.  This
    utility function is used by corpus readers when the user requests
    more than one document at a time.
    rJ   r   z%concat() expects at least one object!c              3   <   K   | ]  }t        |t                y wr-   )r   str).0r1   s     r*   	<genexpr>zconcat.<locals>.<genexpr>  s     
04C:c34s    c                     | |z   S r-   r<   abs     r*   r/   zconcat.<locals>.<lambda>      Ar0   c                     | |z   S r-   r<   r   s     r*   r/   zconcat.<locals>.<lambda>  r   r0   r<   	documentsz'Don't know how to concatenate types: %r)rM   r"   	__class__alljoin
issubclassr   r}   r   r   r\   r   r[   r   	iselementElementr_   )docsdtypestypxmltreer1   s         r*   rj   rj     sY    4yA~Aw
4yA~@AA"&'$QQ[[$E' 
04
00wwt} # 68NOP  &d++ #34  !&& 5zQ5k!nc4 -b99c5!-b99  %!))+6Gs# N >F
GGK (s   E!c                     g }t        d      D ]/  }|j                  | j                         j                                1 |S N   )rangeextendreadlinesplitr4   toksrQ   s      r*   read_whitespace_blockr     s6    D2YFOO%++-. Kr0   c                 z    g }t        d      D ]*  }|j                  t        | j                                      , |S r   )r   r   r
   r   r   s      r*   read_wordpunct_blockr     s3    D2Y&v'89: Kr0   c                     g }t        d      D ]8  }| j                         }|s|c S |j                  |j                  d             : |S )Nr   
)r   r   r_   rstrip)r4   r   rQ   lines       r*   read_line_blockr     sE    D2Y KDKK%&	 
 Kr0   c                 t    d}	 | j                         }|s|r|gS g S |r|j                         s|r|gS ||z  }6)Nr   )r   stripr4   sr   s      r*   read_blankline_blockr     sM    
A
 s
	$**,s
 IA r0   c                     d}	 | j                         }|d   dk(  s|d   dk(  s|d d dk(  r*|s|r|gS g S ||z  }t        j                  d|      |gS Q)Nr   r   =r   rK   z
z^\d+-\d+)r   rematchr   s      r*   read_alignedsent_blockr     su    
A
 7c>T!W_RaF0Bs
	 IAxxT*6s
 r0   c                    	 | j                         }|sg S t        j                  ||      rn,|g}	 | j                         }| j                         }|sdj	                  |      gS |(t        j                  ||      rdj	                  |      gS |9t        j                  ||      r#| j                  |       dj	                  |      gS |j                  |       )a  
    Read a sequence of tokens from a stream, where tokens begin with
    lines that match ``start_re``.  If ``end_re`` is specified, then
    tokens end with lines that match ``end_re``; otherwise, tokens end
    whenever the next line matching ``start_re`` or EOF is found.
    r   )r   r   r   r^   r   rZ   r_   )r4   start_reend_rer   linesoldposs         r*   read_regexp_blockr   )  s      I88Hd#  FE
 GGEN##"((64"8GGEN## >bhhx6KKGGEN##T r0   c                 >   | j                         }| j                  |      }t        | dd      }|t        |t              sJ |dvrddl}|j                  d|z         |r+t        j                  dt        j                  |      z        }	 	 |r.|| j                         z  }t        j                  t        |      }t        |      \  }}	t        j                  d      j                  ||	      j                         }	|| j!                  ||	z          |S | j!                  |t#        |d|	 j%                  |            z          |S # t&        $ rK}
|
j(                  d   dk(  r3| j                  |      }|r
||z  }Y d}
~
|j+                         gcY d}
~
S  d}
~
ww xY w)	a-  
    Read a sequence of s-expressions from the stream, and leave the
    stream's file position at the end the last complete s-expression
    read.  This function will always return at least one s-expression,
    unless there are no more s-expressions in the file.

    If the file ends in in the middle of an s-expression, then that
    incomplete s-expression is returned when the end of the file is
    reached.

    :param block_size: The default block size for reading.  If an
        s-expression is longer than one block, then more than one
        block will be read.
    :param comment_char: A character that marks comments.  Any lines
        that begin with this character will be stripped out.
        (If spaces or tabs precede the comment character, then the
        line will not be stripped.)
    r(   N)Nzutf-8r   zAParsing may fail, depending on the properties of the %s encoding!z
(?m)^%s.*$z\s*Block too small)r^   readgetattrr   r   warningswarnr   compileescaper   sub
_sub_space_parse_sexpr_blocksearchendrZ   rM   encoder"   argsr   )r4   
block_sizecomment_charrR   blockr(   r   COMMENTre   rT   rU   
next_blocks               r*   read_sexpr_blockr   L  s   & KKMEKK
#Evz40H:eS#999&"$,-	
 **\BIIl,CCD
	
 **w
E:/6NFFZZ'..uf=AACF EFN+
 M ECgv(=(=h(G$HHI M 
	vvay--#[[4
Z'E "KKM?*
	s1   BE 0E 	F*F FFFFc                 J    d| j                         | j                         z
  z  S )zrHelper function: given a regexp match, return a string of
    spaces that's the same length as the matched string. )r   rR   )ms    r*   r   r     s      !%%'AGGI%&&r0   c                    g }dx}}|t        |       k  r-t        j                  d      j                  | |      }|s||fS |j	                         }|j                         dk7  rIt        j                  d      j                  | |      }|r|j	                         }n|r||fS t        d      d}t        j                  d      j                  | |      D ]7  }|j                         dk(  r|dz  }n|dz  }|dk(  s'|j                         } n |r||fS t        d      |j                  | ||        |t        |       k  r-||fS )Nr   z\S(z[\s(]r   z[()]rJ   )
rM   r   r   r   rR   groupr"   finditerr   r_   )r   re   rR   r   r   m2nestings          r*   r   r     sH   FOEC
E

JJu$$UC03;	 779H%,,UE:Bhhj!3;& !233 GZZ(11%?779#qLGqLGa<%%'C @ !3;& !233eE#&'C E

F 3;r0   c           
         t        | t              st        d      |dz  }t        | t              r| j                  j                         D cg c]+  }|j                  d      s|t        | j                        d  - }}|D cg c]  }t        j                  ||      s| }}t        |      S t        | t              rg }t        j                  | j                        D ]w  \  }}}dj!                  d t#        | j                  |      D              }||D cg c]   }t        j                  |||z         r||z   " c}z  }d|v sg|j%                  d       y t        |      S t'        d| z        c c}w c c}w c c}w )Nz+find_corpus_fileids: expected a PathPointer$/r   c              3   &   K   | ]	  }d |z    yw)z%s/Nr<   )r   ps     r*   r   z&find_corpus_fileids.<locals>.<genexpr>  s     O0N1UQY0Ns   z.svnzDon't know how to handle %r)r   r   	TypeErrorr   zipfilenamelistendswithrM   entryr   r   sortedr   r   walkpathr   
_path_fromremoveAssertionError)	rootregexpnamefileidsitemsdirnamesubdirsprefixr%   s	            r*   find_corpus_fileidsr     su   dK(EFF
cMF $*+ --/
/==% TZZ"#/ 	 

 #*D'$RXXfd-C'De} 
D/	0)+);%GWgWWO
499g0NOOF%%F88FFVO4 % E  v& *< e} :TABB3

 Es   0E4E9E9%E>c                 d   t         j                  j                  |       d   dk(  r"t         j                  j                  |       d   } g }| |k7  rat         j                  j                  |      \  }}|j                  d|       t         j                  j                  |      d   |k7  sJ | |k7  ra|S )NrJ   r   r   )r   r   r   insert)parentchildr   r   s       r*   r   r     s    	ww}}VQ2%v&q)D
E/u-wAwww}}U#A&%/// E/ Kr0   c                     d}	 | j                         }t        j                  d|      r|j                         r"|gS |dk(  r|j                         r|gS g S ||z  }Y)Nr   z======+\s*$)r   r   r   r   )r4   parar   s      r*   !tagged_treebank_para_block_readerr     s`    D
 88ND)zz|vRZzz|v	 DLD r0   r-   )i @  N)%rX   r   pickler   tempfile	functoolsr   	xml.etreer   	nltk.datar   r   r   r   nltk.internalsr	   nltk.tokenizer
   	nltk.utilr   r   r   r   r}   rj   r   r   r   r   r   r   r   r   r   r   r   r   r<   r0   r*   <module>r     s     	  	   !  ( , N NV&1 V&r
61 6r1Hr&& FCL''^!CH r0   