Ë
    Øªg…#  ã                   óŽ   — d Z ddlZddlZddlZddlmZ ddl­ ddlm	Z	 ddl
­ ddl­ ddlmZ  G d„ de«      Z G d	„ d
e«      Zy)zN
A reader for corpora that contain chunked (and optionally tagged)
documents.
é    N)Útagstr2tree)Ú*)ÚBracketParseCorpusReader)ÚTreec                   ó‚   — e Zd ZdZde edd¬«      eddfd„Zdd	„Zdd
„Z	dd„Z
dd„Zdd„Zdd„Zdd„Zdd„Zdd„Zd„ Zy)ÚChunkedCorpusReadera&  
    Reader for chunked (and optionally tagged) corpora.  Paragraphs
    are split using a block reader.  They are then tokenized into
    sentences using a sentence tokenizer.  Finally, these sentences
    are parsed into chunk trees using a string-to-chunktree conversion
    function.  Each of these steps can be performed using a default
    function or a custom function.  By default, paragraphs are split
    on blank lines; sentences are listed one per line; and sentences
    are parsed into chunk trees using ``nltk.chunk.tagstr2tree``.
    Ú Ú
T)ÚgapsÚutf8Nc	                 óJ   — t         j                  | |||«       ||||f| _        y)z’
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        N)ÚCorpusReaderÚ__init__Ú_cv_args)	ÚselfÚrootÚfileidsÚ	extensionÚstr2chunktreeÚsent_tokenizerÚpara_block_readerÚencodingÚtagsets	            úO/var/www/openai/venv/lib/python3.12/site-packages/nltk/corpus/reader/chunked.pyr   zChunkedCorpusReader.__init__&   s/   € ô 	×Ñ˜d D¨'°8Ô<Ø&¨Ð8IÈ6ÐRˆŒð	Aó    c                 óœ   — t        | j                  |d«      D cg c]   \  }}t        ||ddddg| j                  ¢­Ž ‘Œ" c}}«      S c c}}w )z~
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        Tr   ©ÚconcatÚabspathsÚChunkedCorpusViewr   ©r   r   ÚfÚencs       r   ÚwordszChunkedCorpusReader.words:   sZ   € ô ð !%§¡¨g°tÔ <ôá <‘HQ˜ô " ! S¨!¨Q°°1ÐE°t·}±}ÔEØ <òó
ð 	
ùóó   ›%A
c                 óœ   — t        | j                  |d«      D cg c]   \  }}t        ||ddddg| j                  ¢­Ž ‘Œ" c}}«      S c c}}w )z²
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        Tr   é   r   r!   s       r   ÚsentszChunkedCorpusReader.sentsG   óZ   € ô ð !%§¡¨g°tÔ <ôá <‘HQ˜ô " ! S¨!¨Q°°1ÐE°t·}±}ÔEØ <òó
ð 	
ùór%   c                 óœ   — t        | j                  |d«      D cg c]   \  }}t        ||ddddg| j                  ¢­Ž ‘Œ" c}}«      S c c}}w )zÜ
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
        Tr   r'   r   r!   s       r   ÚparaszChunkedCorpusReader.parasU   r)   r%   c                 ó¢   — t        | j                  |d«      D cg c]#  \  }}t        ||ddddg| j                  ¢­d|iŽ‘Œ% c}}«      S c c}}w )z¾
        :return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            ``(word,tag)``.
        :rtype: list(tuple(str,str))
        Tr'   r   Útarget_tagsetr   ©r   r   r   r"   r#   s        r   Útagged_wordsz ChunkedCorpusReader.tagged_wordsc   ól   € ô ð
 !%§¡¨g°tÔ <ô	ñ !=‘HQ˜ô "Øs˜A˜q ! QðØ)-¯©òØFLóð !=ò	ó
ð 	
ùóó   ›(A
c                 ó¢   — t        | j                  |d«      D cg c]#  \  }}t        ||ddddg| j                  ¢­d|iŽ‘Œ% c}}«      S c c}}w )z­
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.

        :rtype: list(list(tuple(str,str)))
        Tr'   r   r-   r   r.   s        r   Útagged_sentsz ChunkedCorpusReader.tagged_sentss   r0   r1   c                 ó¢   — t        | j                  |d«      D cg c]#  \  }}t        ||ddddg| j                  ¢­d|iŽ‘Œ% c}}«      S c c}}w )zð
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of ``(word,tag)`` tuples.
        :rtype: list(list(list(tuple(str,str))))
        Tr'   r   r-   r   r.   s        r   Útagged_parasz ChunkedCorpusReader.tagged_parasƒ   r0   r1   c                 ó¢   — t        | j                  |d«      D cg c]#  \  }}t        ||ddddg| j                  ¢­d|iŽ‘Œ% c}}«      S c c}}w )av  
        :return: the given file(s) as a list of tagged
            words and chunks.  Words are encoded as ``(word, tag)``
            tuples (if the corpus has tags) or word strings (if the
            corpus has no tags).  Chunks are encoded as depth-one
            trees over ``(word,tag)`` tuples or word strings.
        :rtype: list(tuple(str,str) and Tree)
        Tr'   r   r-   r   r.   s        r   Úchunked_wordsz!ChunkedCorpusReader.chunked_words“   ól   € ô ð
 !%§¡¨g°tÔ <ô	ñ !=‘HQ˜ô "Øs˜A˜q ! QðØ)-¯©òØFLóð !=ò	ó
ð 	
ùór1   c                 ó¢   — t        | j                  |d«      D cg c]#  \  }}t        ||ddddg| j                  ¢­d|iŽ‘Œ% c}}«      S c c}}w )a6  
        :return: the given file(s) as a list of
            sentences, each encoded as a shallow Tree.  The leaves
            of these trees are encoded as ``(word, tag)`` tuples (if
            the corpus has tags) or word strings (if the corpus has no
            tags).
        :rtype: list(Tree)
        Tr'   r   r-   r   r.   s        r   Úchunked_sentsz!ChunkedCorpusReader.chunked_sents¥   r8   r1   c                 ó¢   — t        | j                  |d«      D cg c]#  \  }}t        ||ddddg| j                  ¢­d|iŽ‘Œ% c}}«      S c c}}w )ao  
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as a shallow Tree.  The leaves of these
            trees are encoded as ``(word, tag)`` tuples (if the corpus
            has tags) or word strings (if the corpus has no tags).
        :rtype: list(list(Tree))
        Tr'   r-   r   r.   s        r   Úchunked_parasz!ChunkedCorpusReader.chunked_paras·   r8   r1   c                 óP   — t        |«      D cg c]  }t        |«      ‘Œ c}S c c}w ©N)Úread_blankline_blockr   )r   ÚstreamÚts      r   Ú_read_blockzChunkedCorpusReader._read_blockÉ   s%   € Ü(<¸VÔ(DÓEÑ(D 1”˜A•Ð(DÑEÐEùÒEs   Ž#r>   ©NN)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   ÚRegexpTokenizerr?   r   r$   r(   r+   r/   r3   r5   r7   r:   r<   rB   © r   r   r   r      s[   „ ñ	ð Ø!Ù& t°$Ô7Ø.ØØóAó(
ó
ó
ó
ó 
ó 
ó 
ó$
ó$
ó$Fr   r   c                   ó$   — e Zd Z	 	 dd„Zd„ Zd„ Zy)r    Nc                 ó²   — t         j                  | ||¬«       || _        || _        || _        || _        || _        || _        |	| _        |
| _	        || _
        y )N)r   )ÚStreamBackedCorpusViewr   Ú_taggedÚ_group_by_sentÚ_group_by_paraÚ_chunkedÚ_str2chunktreeÚ_sent_tokenizerÚ_para_block_readerÚ_source_tagsetÚ_target_tagset)r   Úfileidr   ÚtaggedÚgroup_by_sentÚgroup_by_paraÚchunkedr   r   r   Úsource_tagsetr-   s               r   r   zChunkedCorpusView.__init__Î   s_   € ô 	×'Ñ'¨¨f¸xÐ'ÔHØˆŒØ+ˆÔØ+ˆÔØˆŒØ+ˆÔØ-ˆÔØ"3ˆÔØ+ˆÔØ+ˆÕr   c                 óø  — g }| j                  |«      D ]ã  }g }| j                  j                  |«      D ]’  }| j                  || j                  | j
                  ¬«      }| j                  s| j                  |«      }| j                  s|j                  «       }| j                  r|j                  |«       Œ‚|j                  |«       Œ” | j                  r|j                  |«       ŒÓ|j                  |«       Œå |S )N)r[   r-   )rS   rR   ÚtokenizerQ   rT   rU   rM   Ú_untagrP   ÚleavesrN   ÚappendÚextendrO   )r   r@   ÚblockÚpara_strÚparaÚsent_strÚsents          r   Ú
read_blockzChunkedCorpusView.read_blockç   sß   € ØˆØ×/Ñ/°Ö7ˆHØˆDØ ×0Ñ0×9Ñ9¸(ÖCØ×*Ñ*ØØ"&×"5Ñ"5Ø"&×"5Ñ"5ð +ó ð —|’|ØŸ;™; tÓ,Dð —}’}ØŸ;™;›=Dð ×&Ò&Ø—K‘K Õ%à—K‘K Õ%ð' Dð, ×"Ò"Ø—‘˜TÕ"à—‘˜TÕ"ð7 8ð< ˆr   c                 ó¶   — t        |«      D ]J  \  }}t        |t        «      r| j                  |«       Œ(t        |t        «      r	|d   ||<   ŒAt        d«      ‚ |S )Nr   z"expected child to be Tree or tuple)Ú	enumerateÚ
isinstancer   r^   ÚtupleÚ
ValueError)r   ÚtreeÚiÚchilds       r   r^   zChunkedCorpusView._untag	  sT   € Ü! $ž‰HˆAˆuÜ˜%¤Ô&Ø—‘˜EÕ"Ü˜E¤5Ô)Ø ™(Q’ä Ð!EÓFÐFð (ð ˆr   rC   )rD   rE   rF   r   rg   r^   rI   r   r   r    r    Í   s   „ ð Øó,ò2 óDr   r    )rG   ÚcodecsÚos.pathÚosÚnltkÚ
nltk.chunkr   Únltk.corpus.reader.apiÚ nltk.corpus.reader.bracket_parser   Únltk.corpus.reader.utilÚnltk.tokenizeÚ	nltk.treer   r   r   rL   r    rI   r   r   Ú<module>rz      sG   ðñó
 Û ã Ý "Ü $Ý EÜ %Ü Ý ôpF˜,ô pFôfDÐ.õ Dr   