
    g>                         d Z ddlZddlmZ ddlmZ ddl ddlmZ ddl	m
Z
 ddlmZ  G d	 d
e      Z G d de      Zy)z
Corpus reader for corpora whose documents are xml files.

(note -- not named 'xml' to avoid conflicting w/ standard xml package)
    N)ElementTree)CorpusReader)*)SeekableUnicodeStreamReader)ElementWrapper)WordPunctTokenizerc                   (    e Zd ZdZddZddZddZy)XMLCorpusReadera  
    Corpus reader for corpora whose documents are xml files.

    Note that the ``XMLCorpusReader`` constructor does not take an
    ``encoding`` argument, because the unicode encoding is specified by
    the XML files themselves.  See the XML specs for more info.
    c                 @    || _         t        j                  | ||       y N)_wrap_etreer   __init__)selfrootfileids
wrap_etrees       O/var/www/openai/venv/lib/python3.12/site-packages/nltk/corpus/reader/xmldocs.pyr   zXMLCorpusReader.__init__!   s    %dD'2    Nc                 j   |'t        | j                        dk(  r| j                  d   }t        |t              st	        d      | j                  |      j                         5 }t        j                  |      j                         }d d d        | j                  rt              }S # 1 sw Y   "xY w)N   r   z(Expected a single file identifier string)len_fileids
isinstancestr	TypeErrorabspathopenr   parsegetrootr   r   )r   fileidfpelts       r   xmlzXMLCorpusReader.xml%   s    >c$--0A5]]1%F&#&FGG\\&!&&(B##B'//1C )  %C
 )(s   $$B))B2c                 d   | j                  |      }| j                  |      }t               }	 |j                         }g }|D ]T  }|j
                  }|t        |t              r|j                  |      }|j                  |      }	|j                  |	       V |S #  |j	                         }Y qxY w)aE  
        Returns all of the words and punctuation symbols in the specified file
        that were in text nodes -- ie, tags are ignored. Like the xml() method,
        fileid can only specify one file.

        :return: the given file's text nodes as a list of words and punctuation symbols
        :rtype: list(str)
        )r#   encodingr   getiteratoritertextr   bytesdecodetokenizeextend)
r   r    r"   r%   word_tokenizeriteratoroutnoder(   tokss
             r   wordszXMLCorpusReader.words4   s     hhv==(+-	"(H D99DdE*;;x0D%..t4

4   
	"xxzHs   B B/)Fr   )__name__
__module____qualname____doc__r   r#   r2    r   r   r
   r
      s    3r   r
   c                       e Zd ZdZdZdZddZd Zd Ze	j                  de	j                  e	j                  z        Ze	j                  d	      Ze	j                  d
e	j                  e	j                  z        Zd ZddZy)XMLCorpusViewam  
    A corpus view that selects out specified elements from an XML
    file, and provides a flat list-like interface for accessing them.
    (Note: ``XMLCorpusView`` is not used by ``XMLCorpusReader`` itself,
    but may be used by subclasses of ``XMLCorpusReader``.)

    Every XML corpus view has a "tag specification", indicating what
    XML elements should be included in the view; and each (non-nested)
    element that matches this specification corresponds to one item in
    the view.  Tag specifications are regular expressions over tag
    paths, where a tag path is a list of element tag names, separated
    by '/', indicating the ancestry of the element.  Some examples:

      - ``'foo'``: A top-level element whose tag is ``foo``.
      - ``'foo/bar'``: An element whose tag is ``bar`` and whose parent
        is a top-level element whose tag is ``foo``.
      - ``'.*/foo'``: An element whose tag is ``foo``, appearing anywhere
        in the xml tree.
      - ``'.*/(foo|bar)'``: An wlement whose tag is ``foo`` or ``bar``,
        appearing anywhere in the xml tree.

    The view items are generated from the selected XML elements via
    the method ``handle_elt()``.  By default, this method returns the
    element as-is (i.e., as an ElementTree object); but it can be
    overridden, either via subclassing or via the ``elt_handler``
    constructor parameter.
    Fi   Nc                     |r|| _         t        j                  |dz         | _        	 ddi| _        	 | j                  |      }t        j                  | ||       y)aW  
        Create a new corpus view based on a specified XML file.

        Note that the ``XMLCorpusView`` constructor does not take an
        ``encoding`` argument, because the unicode encoding is
        specified by the XML files themselves.

        :type tagspec: str
        :param tagspec: A tag specification, indicating what XML
            elements should be included in the view.  Each non-nested
            element that matches this specification corresponds to one
            item in the view.

        :param elt_handler: A function used to transform each element
            to a value for the view.  If no handler is specified, then
            ``self.handle_elt()`` is called, which returns the element
            as an ElementTree object.  The signature of elt_handler is::

                elt_handler(elt, tagspec) -> value
        z\Zr   r7   )r%   N)
handle_eltrecompile_tagspec_tag_context_detect_encodingStreamBackedCorpusViewr   )r   r    tagspecelt_handlerr%   s        r   r   zXMLCorpusView.__init__u   s]    * )DO

7U?39G	
 ((0''fx'Hr   c                 2   t        |t              r2	 |j                         }|j                         }|j	                          n%t        |d      5 }|j                         }d d d        j                  t        j                        ry|j                  t        j                        ry|j                  t        j                        ry|j                  t        j                        ry|j                  t        j                        ryt        j                  d|      }|r|j                  d      j                         S t        j                  d	|      }|r|j                  d      j                         S y# j	                          w xY w# 1 sw Y   .xY w)
Nrbz	utf-16-bez	utf-16-lez	utf-32-bez	utf-32-lezutf-8s!   \s*<\?xml\b.*\bencoding="([^"]+)"r   s!   \s*<\?xml\b.*\bencoding='([^']+)')r   PathPointerr   readlineclose
startswithcodecsBOM_UTF16_BEBOM_UTF16_LEBOM_UTF32_BEBOM_UTF32_LEBOM_UTF8r<   matchgroupr*   )r   r    infilesms        r   r@   zXMLCorpusView._detect_encoding   s"   fk*OO%fd#vOO% $<<++,<<++,<<++,<<++,<<(HH:A>771:$$&&HH:A>771:$$&&+ ##s    E7 F7F	Fc                     |S )a  
        Convert an element into an appropriate value for inclusion in
        the view.  Unless overridden by a subclass or by the
        ``elt_handler`` constructor argument, this method simply
        returns ``elt``.

        :return: The view value corresponding to ``elt``.

        :type elt: ElementTree
        :param elt: The element that should be converted.

        :type context: str
        :param context: A string composed of element tags separated by
            forward slashes, indicating the XML context of the given
            element.  For example, the string ``'foo/bar/baz'``
            indicates that the element is a ``baz`` element whose
            parent is a ``bar`` element and whose grandparent is a
            top-level ``foo`` element.
        r7   )r   r"   contexts      r   r;   zXMLCorpusView.handle_elt   s	    ( 
r   a;  
        [^<]*
        (
          ((<!--.*?-->)                         |  # comment
           (<![CDATA[.*?]])                     |  # raw character data
           (<!DOCTYPE\s+[^\[]*(\[[^\]]*])?\s*>) |  # doctype decl
           (<[^!>][^>]*>))                         # tag or PI
          [^<]*)*
        \Zz<\s*(?:/\s*)?([^\s>]+)a6  
        # Include these so we can skip them:
        (?P<COMMENT>        <!--.*?-->                          )|
        (?P<CDATA>          <![CDATA[.*?]]>                     )|
        (?P<PI>             <\?.*?\?>                           )|
        (?P<DOCTYPE>        <!DOCTYPE\s+[^\[^>]*(\[[^\]]*])?\s*>)|
        # These are the ones we actually care about:
        (?P<EMPTY_ELT_TAG>  <\s*[^>/\?!\s][^>]*/\s*>            )|
        (?P<START_TAG>      <\s*[^>/\?!\s][^>]*>                )|
        (?P<END_TAG>        <\s*/[^>/\?!\s][^>]*>               )c                    d}t        |t              r|j                         }	 |j                  | j                        }||z  }| j
                  j                  |      r|S t        j                  d|      j                  d      dk(  rO|j                         t        |      t        j                  d|      j                         z
  z
  }t        d|z        |st        d      |j                  d      }|dkD  ru| j
                  j                  |d|       rWt        |t              r#|j                         |j                  |       n|j                  t        |      |z
   d	       |d| S N)
a{  
        Read a string from the given stream that does not contain any
        un-closed tags.  In particular, this function first reads a
        block from the stream of size ``self._BLOCK_SIZE``.  It then
        checks if that block contains an un-closed tag.  If it does,
        then this function either backtracks to the last '<', or reads
        another block.
         z[<>]r   >zUnexpected ">" near char %sz&Unexpected end of file: tag not closed<Nr   )r   r   tellread_BLOCK_SIZE_VALID_XML_RErP   r<   searchrQ   r   end
ValueErrorrfindseekchar_seek_forward)r   streamfragmentstartpos	xml_blockposlast_open_brackets          r   _read_xml_fragmentz XMLCorpusView._read_xml_fragment   sZ    f9:{{}HD$4$45I	!H !!''1 yy*003s:kkmMBIIfh$?$C$C$EE !!>!DEE  !IJJ
 !)s 3 1$%%++H5G6G,HI!&*EFH-001BCc(m6G&G$H!L#$6%677? r   c                    || j                   }|| j                  }t        | j                  j	                  |j                                     }|J g }d}d}d}|g k(  s|ft        |t              r|j                         }	| j                  |      }
|
s|n/t        d      | j                  j                  |
      D ]q  }| j                  r;t        dj                  dj                  |      dd |j!                                      |j!                  d      r| j"                  j%                  |j!                               j!                  d      }|j'                  |       |t(        j%                  |dj                  |            s|j+                         }t-        |      }|j!                  d	      r| j"                  j%                  |j!                               j!                  d      }|st        d
|z        ||d   k7  rt        d|d    d| d      |L|t-        |      k(  r>||
||j/                          z  }|j'                  |dj                  |      f       dx}}d}|j1                          |j!                  d      s| j"                  j%                  |j!                               j!                  d      }|t(        j%                  |dj                  |      dz   |z         s<|j'                  |j!                         dj                  |      dz   |z   f       t ||g k(  r||
|d z  }d}nw| j                  rt        d       t        |t              r#|j3                  	       |j5                  |       n|j3                  t-        |
      |z
   d       |d|dz
   }dx}}d}|g k(  rb|f|j                         }|| j                  v rt7        |      | j                  |   k(  sJ t7        |      | j                  |<   |D cg c]1  \  }} |t9        j:                  |j=                  dd            |      3 c}}S c c}}w )z
        Read from ``stream`` until we find at least one element that
        matches ``tagspec``, and return the result of applying
        ``elt_handler`` to each element found.
        NrX   zUnexpected end of filez	{:>25} {}/i	START_TAGr   END_TAGzUnmatched tag </%s>zUnmatched tag <z>...</rY   EMPTY_ELT_TAGr   z/                                    (backtrack)asciixmlcharrefreplace)r>   r;   listr?   getr[   r   r   rk   ra   
_XML_PIECEfinditer_DEBUGprintformatjoinrQ   _XML_TAG_NAMErP   appendr<   startr   r`   poprc   rd   tupler   
fromstringencode)r   re   rB   rC   rV   elts	elt_start	elt_depthelt_textrg   xml_fragmentpiecenameri   r"   s                  r   
read_blockzXMLCorpusView.read_block"  s    ?mmG//K t((,,V[[];<"""		bjI1&"=>!;;=226:L  $$%=>> 11,?;;+,,SXXg->st-DekkmTU;;{+--33EKKMBHHKDNN4( (88GSXXg->?(-I(+GI[[+--33EKKMBHHKD"()>)EFFwr{*(?72;-vdVST)UVV ,c'l1J LUYY[$IIXsxx/@$AB044	I#%KKM[[1--33EKKMBHHKD (88GSXXg->-Dt-KL KK8IC8ORV8V(WXE @H $ 2:YZ 88H !I {{67!&*EFH-00;c,&7)&C$DaH%o	A6G,00I	!HO bjI1T kkm$###>T%6%6s%;;;;%*7^Dc" #'

 #'g	 &&szz';N'OP #'
 	
 
s   !6Qr   )NN)r3   r4   r5   r6   rx   r]   r   r@   r;   r<   r=   DOTALLVERBOSEr^   r|   rv   rk   r   r7   r   r   r9   r9   Q   s    < F K"IH:0 JJ	 			BJJM JJ89M 		E 			BJJJ,8bk
r   r9   )r6   rJ   	xml.etreer   nltk.corpus.reader.apir   nltk.corpus.reader.util	nltk.datar   nltk.internalsr   nltk.tokenizer   r
   rA   r9   r7   r   r   <module>r      s=     ! / % 1 ) ,6l 6r|
* |
r   