
    g                         d dl Z d dlmZ d dl d dl  G d de      Z G d de      Zd Z	d	 Z
ed
k(  r e	         e
        yy)    N)util)*c                   D    e Zd Zd	dZd
dZd
dZd
dZd
dZd
dZd
dZ	y)ChasenCorpusReaderNc                 B    || _         t        j                  | |||       y N)_sent_splitterCorpusReader__init__)selfrootfileidsencodingsent_splitters        N/var/www/openai/venv/lib/python3.12/site-packages/nltk/corpus/reader/chasen.pyr   zChasenCorpusReader.__init__   s    +dD'8<    c                     t        | j                  |d      D cg c]  \  }}t        ||ddd| j                        ! c}}      S c c}}w NTFconcatabspathsChasenCorpusViewr	   r   r   fileidencs       r   wordszChasenCorpusReader.words   sU     &*]]7D%A%AMVS !eUE4CVCVW%A
 	
   $A
c                     t        | j                  |d      D cg c]  \  }}t        ||ddd| j                        ! c}}      S c c}}w r   r   r   s       r   tagged_wordszChasenCorpusReader.tagged_words   sU     &*]]7D%A%AMVS !dE5$BUBUV%A
 	
r   c                     t        | j                  |d      D cg c]  \  }}t        ||ddd| j                        ! c}}      S c c}}w r   r   r   s       r   sentszChasenCorpusReader.sents#   sU     &*]]7D%A%AMVS !eT5$BUBUV%A
 	
r   c                     t        | j                  |d      D cg c]  \  }}t        ||ddd| j                        ! c}}      S c c}}w r   r   r   s       r   tagged_sentszChasenCorpusReader.tagged_sents+   sU     &*]]7D%A%AMVS !dD%ATATU%A
 	
r   c                     t        | j                  |d      D cg c]  \  }}t        ||ddd| j                        ! c}}      S c c}}w r   r   r   s       r   paraszChasenCorpusReader.paras3   sU     &*]]7D%A%AMVS !eT4ATATU%A
 	
r   c                     t        | j                  |d      D cg c]  \  }}t        ||ddd| j                        ! c}}      S c c}}w )NTr   r   s       r   tagged_paraszChasenCorpusReader.tagged_paras;   sU     &*]]7D%A%AMVS !dD$@S@ST%A
 	
r   )utf8Nr   )
__name__
__module____qualname__r   r   r   r!   r#   r%   r'    r   r   r   r      s%    =





r   r   c                        e Zd ZdZ	 ddZd Zy)r   z
    A specialized corpus view for ChasenReader. Similar to ``TaggedCorpusView``,
    but this'll use fixed sets of word and sentence tokenizer.
    Nc                 l    || _         || _        || _        || _        t        j                  | ||       y )Nr   )_tagged_group_by_sent_group_by_parar	   StreamBackedCorpusViewr   )r   corpus_filer   taggedgroup_by_sentgroup_by_parar   s          r   r   zChasenCorpusView.__init__J   s8     +++''kH'Mr   c                    g }t        |dd      D ]k  }g }g }|j                         D ]  }|j                         dk(  }|j                  d      }|d   dj	                  |dd       f}	|s|j                  |	       |s| j                  sb| j                  |	      st| j                  s|D 	
cg c]  \  }	}
|		 }}	}
| j                  r|j                  |       n|j                  |       g } t        |      dkD  rO| j                  s|D 	
cg c]  \  }	}
|		 }}	}
| j                  r|j                  |       n|j                  |       | j                  r|j                  |       [|j                  |       n |S c c}
}	w c c}
}	w )zReads one paragraph at a time..z^EOS\nEOS	r      N)read_regexp_block
splitlinesstripsplitjoinappendr	   r0   r1   extendlenr2   )r   streamblockpara_strparasentline_eos_cellswts              r   
read_blockzChasenCorpusView.read_blockY   s_   )&$	BHDD ++-zz|u,D)AY		&* 56KKND//D4G4G4J<<045fq!5**D)D)D .  4y1}||,01D&1aADD1&&KK%KK%""T"T"A CD +  6 2s   -F
Fr   )r)   r*   r+   __doc__r   rO   r,   r   r   r   r   D   s     N%r   r   c                      dd l } ddlm}  |dt        dd      }t	        dj                  |j                         dd	              t	        d
j                  d |j                         dd D                     y )Nr   LazyCorpusLoaderjeita.*chasenutf-8r/   /iTV  i|V  z
EOS
c              3   L   K   | ]  }d j                  d |D                yw)
c              3   p   K   | ].  }d j                  |d   |d   j                  d      d          0 yw)z{}/{}r   r<   r;      N)formatr@   ).0rM   s     r   	<genexpr>z!demo.<locals>.<genexpr>.<genexpr>   s2     NAgnnQqT1Q4::d+;A+>?s   46N)rA   )r]   rI   s     r   r^   zdemo.<locals>.<genexpr>   s'      
7 IINNN7s   "$iz  i}  )nltknltk.corpus.utilrS   r   printrA   r   r#   )r_   rS   rT   s      r   demorb      si    1W&8+PWXE	#((5;;=u-
./	 
**,T$7
 	
r   c                  |    ddl m}   | dt        dd      }t        |j	                         d   d   t
              sJ y )Nr   rR   rT   rU   rV   r/   r<   )r`   rS   r   
isinstancer   str)rS   rT   s     r   testrf      s:    1W&8+PWXEe((*1-a0#666r   __main__)sysnltk.corpus.readerr   nltk.corpus.reader.apinltk.corpus.reader.utilr
   r   r3   r   rb   rf   r)   r,   r   r   <module>rl      sQ     # $ %3
 3
l:- :z7 zFF r   