
    g6                         d Z ddlZddlZddlmZ ddlmZmZ ddlm	Z	 d Z
 G d de	      Z G d	 d
      Z G d d      Z G d de      Zy)z9
A reader for corpora whose documents are in MTE format.
    N)reduce)TaggedCorpusReaderconcat)XMLCorpusViewc                 &    | j                  ||      S N)findall)rootpathnss      K/var/www/openai/venv/lib/python3.12/site-packages/nltk/corpus/reader/mte.pyxpathr      s    <<b!!    c                        e Zd ZdZddZddZy)MTECorpusViewz0
    Class for lazy viewing the MTE Corpus.
    Nc                 4    t        j                  | |||       y r   )r   __init__)selffileidtagspecelt_handlers       r   r   zMTECorpusView.__init__   s    tVWkBr   c                 Z    t        t        d t        j                  | |||                  S )Nc                 
    | d uS r    xs    r   <lambda>z*MTECorpusView.read_block.<locals>.<lambda>       !4-r   )listfilterr   
read_block)r   streamr   r   s       r   r!   zMTECorpusView.read_block   s.    '((vwL
 	
r   r   )NN)__name__
__module____qualname____doc__r   r!   r   r   r   r   r      s    C
r   r   c                       e Zd ZdZdddZdZdZdZdZd	Z	d
 Z
ed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zd Zd Zd Zd Zd Zd Zd Zd Zd Zy)MTEFileReaderz
    Class for loading the content of the multext-east corpus. It
    parses the xml files and does some tag-filtering depending on the
    given method parameters.
    zhttps://www.tei-c.org/ns/1.0z%https://www.w3.org/XML/1998/namespace)teixmlz{https://www.tei-c.org/ns/1.0}z'{https://www.w3.org/XML/1998/namespace}zTEI/text/body/div/div/p/s/(w|c)zTEI/text/body/div/div/p/szTEI/text/body/div/div/pc                     || _         y r   )_MTEFileReader__file_path)r   	file_paths     r   r   zMTEFileReader.__init__3   s
    $r   c                     |j                   S r   )textclseltcontexts      r   	_word_eltzMTEFileReader._word_elt6   s    xxr   c                 v    t        |d| j                        D cg c]  }| j                  |d        c}S c c}w N*)r   r   r4   r1   r2   r3   ws       r   	_sent_eltzMTEFileReader._sent_elt:   4    05c30GH0G1a&0GHHH   6c                 v    t        |d| j                        D cg c]  }| j                  |d        c}S c c}w r6   )r   r   r:   r1   r2   r3   ss       r   	_para_eltzMTEFileReader._para_elt>   r;   r<   c                    d|j                   vr|j                  dfS | j                  dk(  r*| j                  dk(  r|j                  |j                   d   fS | j                  dk(  r=| j                  dk(  r.|j                  t        j                  |j                   d         fS t        j                  dt        j                  dd| j                        z   dz         }|j                  |j                   d         rX| j                  dk(  r|j                  |j                   d   fS |j                  t        j                  |j                   d         fS y )	Nana msd	universal^-.z.*$)
attribr/   _MTEFileReader__tags_MTEFileReader__tagsetMTETagConvertermsd_to_universalrecompilesubmatch)r1   r2   r3   tagss       r   _tagged_word_eltzMTEFileReader._tagged_word_eltB   s   

"HHb>!:: 5HHcjj/00ZZ2#,,+"=HHo>>szz%?PQRR::cBFF3SZZ$@@5HIDzz#**U+,<<5(HHcjj&788 '88E9JK 
 r   c                     t        t        d t        |d| j                        D cg c]  }| j	                  |d        c}            S c c}w )Nc                 
    | d uS r   r   r   s    r   r   z0MTEFileReader._tagged_sent_elt.<locals>.<lambda>\   r   r   r7   )r   r    r   r   rS   r8   s       r   _tagged_sent_eltzMTEFileReader._tagged_sent_eltX   N    '8=c38OP8O1%%a.8OP
 	
 Q   A
c                     t        t        d t        |d| j                        D cg c]  }| j	                  |d        c}            S c c}w )Nc                 
    | d uS r   r   r   s    r   r   z0MTEFileReader._tagged_para_elt.<locals>.<lambda>e   r   r   r7   )r   r    r   r   rV   r>   s       r   _tagged_para_eltzMTEFileReader._tagged_para_elta   rW   rX   c                 p    d|j                   vr|j                  dfS |j                  |j                   d   fS )NlemmarC   )rI   r/   r0   s      r   _lemma_word_eltzMTEFileReader._lemma_word_eltj   s4    #**$HHb>!HHcjj122r   c                 v    t        |d| j                        D cg c]  }| j                  |d        c}S c c}w r6   )r   r   r^   r8   s       r   _lemma_sent_eltzMTEFileReader._lemma_sent_eltq   6    6;Ccff6MN6M##At,6MNNNr<   c                 v    t        |d| j                        D cg c]  }| j                  |d        c}S c c}w r6   )r   r   r`   r>   s       r   _lemma_para_eltzMTEFileReader._lemma_para_eltu   ra   r<   c                 h    t        | j                  t        j                  t        j                        S r   )r   r,   r(   	word_pathr4   r   s    r   wordszMTEFileReader.wordsy   (    m55}7N7N
 	
r   c                 h    t        | j                  t        j                  t        j                        S r   )r   r,   r(   	sent_pathr:   rf   s    r   sentszMTEFileReader.sents~   rh   r   c                 h    t        | j                  t        j                  t        j                        S r   )r   r,   r(   	para_pathr@   rf   s    r   paraszMTEFileReader.paras   rh   r   c                 h    t        | j                  t        j                  t        j                        S r   )r   r,   r(   re   r^   rf   s    r   lemma_wordszMTEFileReader.lemma_words   (    m55}7T7T
 	
r   c                     |t         _        |t         _        t        | j                  t         j
                  t         j                        S r   )r(   rK   rJ   r   r,   re   rS   r   tagsetrR   s      r   tagged_wordszMTEFileReader.tagged_words   8    !'#m55}7U7U
 	
r   c                 h    t        | j                  t        j                  t        j                        S r   )r   r,   r(   rj   r`   rf   s    r   lemma_sentszMTEFileReader.lemma_sents   rq   r   c                     |t         _        |t         _        t        | j                  t         j
                  t         j                        S r   )r(   rK   rJ   r   r,   rj   rV   rs   s      r   tagged_sentszMTEFileReader.tagged_sents   rv   r   c                 h    t        | j                  t        j                  t        j                        S r   )r   r,   r(   rm   rc   rf   s    r   lemma_paraszMTEFileReader.lemma_paras   rq   r   c                     |t         _        |t         _        t        | j                  t         j
                  t         j                        S r   )r(   rK   rJ   r   r,   rm   r[   rs   s      r   tagged_paraszMTEFileReader.tagged_paras   rv   r   N)r#   r$   r%   r&   r   tag_nsxml_nsre   rj   rm   r   classmethodr4   r:   r@   rS   rV   r[   r^   r`   rc   rg   rk   rn   rp   ru   rx   rz   r|   r~   r   r   r   r(   r(   "   s    .6
B .F6F1I+I)I%   I I I I  * 
 
 
 
 3 3 O O O O














r   r(   c                   >    e Zd ZdZdddddddd	d
ddddZed        Zy)rL   zu
    Class for converting msd tags to universal tags, more conversion
    options are currently not implemented.
    ADJADPADVCONJDETNOUNNUMPRTPRONVERBrH   X)ASRCDNMQPVrH   rG   c                 t    | d   dk(  s| d   n| d   }|t         j                  vrd}t         j                  |   S )z
        This function converts the annotation from the Multex-East to the universal tagset
        as described in Chapter 5 of the NLTK-Book

        Unknown Tags will be mapped to X. Punctuation marks are not supported in MSD tags, so
        r   #   rG   )rL   mapping_msd_universal)tag	indicators     r   rM   z MTETagConverter.msd_to_universal   sA     #&a&C-CFSV	OAAAI44Y??r   N)r#   r$   r%   r&   r   staticmethodrM   r   r   r   rL   rL      sK      @ @r   rL   c                   f    e Zd ZdZddZd ZddZddZddZddZ	dd	Z
dd
ZddZddZddZy)MTECorpusReaderz
    Reader for corpora following the TEI-p5 xml scheme, such as MULTEXT-East.
    MULTEXT-East contains part-of-speech-tagged words with a quite precise tagging
    scheme. These tags can be converted to the Universal tagset
    Nc                 B    t        j                  | |||       d| _        y)a.  
        Construct a new MTECorpusreader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = MTECorpusReader(root, 'oana-*.xml', 'utf8') # doctest: +SKIP

        :param root: The root directory for this corpus. (default points to location in multext config file)
        :param fileids: A list or regexp specifying the fileids in this corpus. (default is oana-en.xml)
        :param encoding: The encoding of the given files (default is utf8)
        z00README.txtN)r   r   _readme)r   r
   fileidsencodings       r   r   zMTECorpusReader.__init__   s     	##D$B%r   c                      | j                   }nt        |t              r|g}t         fd|      }t        d |      }|st	        d       |S )Nc                      | j                   v S r   )_fileids)r   r   s    r   r   z+MTECorpusReader.__fileids.<locals>.<lambda>   s    1#5r   c                 
    | dvS )N)zoana-bg.xmlzoana-mk.xmlr   r   s    r   r   z+MTECorpusReader.__fileids.<locals>.<lambda>   s
    1,J#Jr   z$No valid multext-east file specified)r   
isinstancestrr    print)r   r   s   ` r   	__fileidszMTECorpusReader.__fileids   sM    ?mmG%iG5w?JGT89r   c                     t        | j                  |      D cg c]C  }t        t        j                  j                  | j                  |            j                         E c}      S c c}w )z
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        )r   _MTECorpusReader__fileidsr(   osr   join_rootrg   r   r   fs      r   rg   zMTECorpusReader.words   s]      00A bggll4::q9:@@B0
 	
   AA(c                     t        | j                  |      D cg c]C  }t        t        j                  j                  | j                  |            j                         E c}      S c c}w )z
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of sentences or utterances,
                 each encoded as a list of word strings
        :rtype: list(list(str))
        )r   r   r(   r   r   r   r   rk   r   s      r   rk   zMTECorpusReader.sents  ]      00A bggll4::q9:@@B0
 	
r   c                     t        | j                  |      D cg c]C  }t        t        j                  j                  | j                  |            j                         E c}      S c c}w )a  
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of paragraphs, each encoded as a list
                 of sentences, which are in turn encoded as lists of word string
        :rtype: list(list(list(str)))
        )r   r   r(   r   r   r   r   rn   r   s      r   rn   zMTECorpusReader.paras  r   r   c                     t        | j                  |      D cg c]C  }t        t        j                  j                  | j                  |            j                         E c}      S c c}w )a  
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of words, the corresponding lemmas
                 and punctuation symbols, encoded as tuples (word, lemma)
        :rtype: list(tuple(str,str))
        )r   r   r(   r   r   r   r   rp   r   s      r   rp   zMTECorpusReader.lemma_words  s]      00A bggll4::q9:FFH0
 	
r   c                 
   |dk(  s|dk(  rit        | j                  |      D cg c]E  }t        t        j                  j                  | j                  |            j                  ||      G c}      S t        d       yc c}w )a;  
        :param fileids: A list specifying the fileids that should be used.
        :param tagset: The tagset that should be used in the returned object,
                       either "universal" or "msd", "msd" is the default
        :param tags: An MSD Tag that is used to filter all parts of the used corpus
                     that are not more precise or at least equal to the given tag
        :return: the given file(s) as a list of tagged words and punctuation symbols
                 encoded as tuples (word, tag)
        :rtype: list(tuple(str, str))
        rE   rD   Unknown tagset specified.N)	r   r   r(   r   r   r   r   ru   r   r   r   rt   rR   r   s        r   ru   zMTECorpusReader.tagged_words,       [ FeO
 "^^G4	 5 ""'',,tzz1"=>KK 5	  -.   A
B c                     t        | j                  |      D cg c]C  }t        t        j                  j                  | j                  |            j                         E c}      S c c}w )aB  
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of sentences or utterances, each
                 encoded as a list of tuples of the word and the corresponding
                 lemma (word, lemma)
        :rtype: list(list(tuple(str, str)))
        )r   r   r(   r   r   r   r   rx   r   s      r   rx   zMTECorpusReader.lemma_sentsC  ]      00A bggll4::q9:FFH0
 	
r   c                 
   |dk(  s|dk(  rit        | j                  |      D cg c]E  }t        t        j                  j                  | j                  |            j                  ||      G c}      S t        d       yc c}w )aH  
        :param fileids: A list specifying the fileids that should be used.
        :param tagset: The tagset that should be used in the returned object,
                       either "universal" or "msd", "msd" is the default
        :param tags: An MSD Tag that is used to filter all parts of the used corpus
                     that are not more precise or at least equal to the given tag
        :return: the given file(s) as a list of sentences or utterances, each
                 each encoded as a list of (word,tag) tuples
        :rtype: list(list(tuple(str, str)))
        rE   rD   r   N)	r   r   r(   r   r   r   r   rz   r   r   s        r   rz   zMTECorpusReader.tagged_sentsR  r   r   c                     t        | j                  |      D cg c]C  }t        t        j                  j                  | j                  |            j                         E c}      S c c}w )am  
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of paragraphs, each encoded as a
                 list of sentences, which are in turn encoded as a list of
                 tuples of the word and the corresponding lemma (word, lemma)
        :rtype: list(List(List(tuple(str, str))))
        )r   r   r(   r   r   r   r   r|   r   s      r   r|   zMTECorpusReader.lemma_parasi  r   r   c                 
   |dk(  s|dk(  rit        | j                  |      D cg c]E  }t        t        j                  j                  | j                  |            j                  ||      G c}      S t        d       yc c}w )a  
        :param fileids: A list specifying the fileids that should be used.
        :param tagset: The tagset that should be used in the returned object,
                       either "universal" or "msd", "msd" is the default
        :param tags: An MSD Tag that is used to filter all parts of the used corpus
                     that are not more precise or at least equal to the given tag
        :return: the given file(s) as a list of paragraphs, each encoded as a
                 list of sentences, which are in turn encoded as a list
                 of (word,tag) tuples
        :rtype: list(list(list(tuple(str, str))))
        rE   rD   r   N)	r   r   r(   r   r   r   r   r~   r   r   s        r   r~   zMTECorpusReader.tagged_parasx  s     [ FeO
 "^^G4	 5 ""'',,tzz1"=>KK 5	  -.r   )NNutf8r   )NrD   rC   )r#   r$   r%   r&   r   r   rg   rk   rn   rp   ru   rx   rz   r|   r~   r   r   r   r   r      s>    &



/.
/.
/r   r   )r&   r   rN   	functoolsr   nltk.corpus.readerr   r   nltk.corpus.reader.xmldocsr   r   r   r(   rL   r   r   r   r   <module>r      sU    
 	  9 4"
M 
"H
 H
V"@ "@J|/( |/r   