
    g                     p    d dl mZ d dlmZmZmZ d dlmZmZ d dl	m
Z
mZ  G d de      Z G d de      Zy	)
    )CorpusReader)StreamBackedCorpusViewconcatread_alignedsent_block)RegexpTokenizerWhitespaceTokenizer)AlignedSent	Alignmentc                   T    e Zd ZdZd e        edd      edfdZdd	Zdd
Z	ddZ
y)AlignedCorpusReaderz
    Reader for corpora of word-aligned sentences.  Tokens are assumed
    to be separated by whitespace.  Sentences begin on separate lines.
    /
T)gapslatin1c                 l    t        j                  | |||       || _        || _        || _        || _        y)a  
        Construct a new Aligned Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        N)r   __init___sep_word_tokenizer_sent_tokenizer_alignedsent_block_reader)selfrootfileidssepword_tokenizersent_tokenizeralignedsent_block_readerencodings           O/var/www/openai/venv/lib/python3.12/site-packages/nltk/corpus/reader/aligned.pyr   zAlignedCorpusReader.__init__   s7    ( 	dD'8<	--)A&    Nc                     t        | j                  |d      D cg c]4  \  }}t        ||dd| j                  | j                  | j
                        6 c}}      S c c}}w )z~
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        TFr   abspathsAlignedSentCorpusViewr   r   r   r   r   fileidencs       r   wordszAlignedCorpusReader.words2   st      &*]]7D%A &BMVS &((((22 &B
 	
   9A
c                     t        | j                  |d      D cg c]4  \  }}t        ||dd| j                  | j                  | j
                        6 c}}      S c c}}w )z
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        TFr"   r%   s       r   sentszAlignedCorpusReader.sentsG   st      &*]]7D%A &BMVS &((((22 &B
 	
r)   c                     t        | j                  |d      D cg c]4  \  }}t        ||dd| j                  | j                  | j
                        6 c}}      S c c}}w )zp
        :return: the given file(s) as a list of AlignedSent objects.
        :rtype: list(AlignedSent)
        Tr"   r%   s       r   aligned_sentsz!AlignedCorpusReader.aligned_sents]   st    
  &*]]7D%A &BMVS &((((22 &B
 	
r)   )N)__name__
__module____qualname____doc__r   r   r   r   r(   r+   r-    r    r   r   r      s7     *,&t$7!7B4
*
,
r    r   c                       e Zd ZdZd Zd Zy)r$   z
    A specialized corpus view for aligned sentences.
    ``AlignedSentCorpusView`` objects are typically created by
    ``AlignedCorpusReader`` (not directly by nltk users).
    c                 z    || _         || _        || _        || _        || _        t        j                  | ||       y )N)r   )_aligned_group_by_sentr   r   r   r   r   )r   corpus_filer   alignedgroup_by_sentr   r   r   s           r   r   zAlignedSentCorpusView.__init__y   s=      +--)A&''kHMr    c                 x   | j                  |      D cg c]=  }| j                  j                  |      D ]  }| j                  j                  |       ? }}}| j                  r5t        j                  dj                  |d               |d<   t        | g}|S | j                  r|d   g}|S |d   }|S c c}}w )N    r   )
r   r   tokenizer   r5   r
   
fromstringjoinr	   r6   )r   streamalignedsent_strsent_strblocks        r   
read_blockz AlignedSentCorpusView.read_block   s     $(#A#A&#I
#I 0099/J   ))(3J 4#I 	 

 == ++q"E!H !%()E    1XJE  !HE
s   AB6N)r.   r/   r0   r1   r   rD   r2   r    r   r$   r$   r   s    N"r    r$   N)nltk.corpus.reader.apir   nltk.corpus.reader.utilr   r   r   nltk.tokenizer   r   nltk.translater	   r
   r   r$   r2   r    r   <module>rI      s8    0 
 ? 1]
, ]
@(2 (r    