
    g                     ~    d dl Z d dlmZmZ d dlmZmZmZ d dlm	Z	 d Z
 G d de      Zd Zd	 Zed
k(  r e        yy)    N)CorpusReaderSyntaxCorpusReader)FileSystemPathPointerfind_corpus_fileidsread_blankline_block)DependencyGraphc                 2    dj                  d | D              S )N/c              3   8   K   | ]  }|d    dk7  s|d      yw)r   EOSN .0ms     L/var/www/openai/venv/lib/python3.12/site-packages/nltk/corpus/reader/knbc.py	<genexpr>z<lambda>.<locals>.<genexpr>   s     -TFqademadFs   
joinmorphss    r   <lambda>r      s    SXX-TF-T%T    c                   6    e Zd ZdZdefdZd Zd Zd	dZd Z	y)
KNBCorpusReadera  
    This class implements:
      - ``__init__``, which specifies the location of the corpus
        and a method for detecting the sentence blocks in corpus files.
      - ``_read_block``, which reads a block from the input stream.
      - ``_word``, which takes a block and returns a list of list of words.
      - ``_tag``, which takes a block and returns a list of list of tagged
        words.
      - ``_parse``, which takes a block and returns a list of parsed
        sentences.

    The structure of tagged words:
      tagged_word = (word(str), tags(tuple))
      tags = (surface, reading, lemma, pos1, posid1, pos2, posid2, pos3, posid3, others ...)

    Usage example

    >>> from nltk.corpus.util import LazyCorpusLoader
    >>> knbc = LazyCorpusLoader(
    ...     'knbc/corpus1',
    ...     KNBCorpusReader,
    ...     r'.*/KN.*',
    ...     encoding='euc-jp',
    ... )

    >>> len(knbc.sents()[0])
    9

    utf8c                 B    t        j                  | |||       || _        y)z
        Initialize KNBCorpusReader
        morphs2str is a function to convert morphlist to str for tree representation
        for _parse()
        N)r   __init__
morphs2str)selfrootfileidsencodingr   s        r   r   zKNBCorpusReader.__init__7   s     	##D$B$r   c                     t        |      S N)r   )r   streams     r   _read_blockzKNBCorpusReader._read_block@   s    #F++r   c                     g }|j                         D ]L  }t        j                  d|      r|j                         j	                  d      }|j                  |d          N |S )NEOS|\*|\#|\+ r   )
splitlinesrematchstripsplitappend)r   treslinecellss        r   _wordzKNBCorpusReader._wordD   sR    LLND88OT2

**3/

58$	 # 
r   Nc           	          g }|j                         D ]`  }t        j                  d|      r|j                         j	                  d      }|j                  |d   dj                  |dd        f       b |S )Nr(   r)   r      )r*   r+   r,   r-   r.   r/   r   )r   r0   tagsetr1   r2   r3   s         r   _tagzKNBCorpusReader._tagO   sf    LLND88OT2

**3/

E!HchhuQRy&9:; # 
r   c                 @   t               }d}|j                         D ]-  }|d   dv r|j                         j                  dd      }t	        j
                  d|d         }|J |j                  |   }|j                  ||j                  d      g d       t        |j                  d            }|d	k(  r||_
        n!|j                  |   d
   j                  |       |dz  }|d   dk7  s|j                         j                  d      }|d   dj                  |dd        f}	|j                  |dz
     d   j                  |	       0 | j                  r6|j                  j                         D ]  }| j                  |d         |d<    |j                         S )Nr   z*+r)      z([\-0-9]*)([ADIP])r6      )addressrelworddeps#r>   )r   r*   r-   r.   r+   r,   nodesupdategroupintr    r/   r   r   valuestree)
r   r0   dgir2   r3   r   node
dep_parentmorphs
             r   _parsezKNBCorpusReader._parseZ   so   LLNDAw$ 

**32HH2E!H=}$}xx{!''!*bIJ _
#"BGHHZ(077:QaC

**3/a#((59"55Q'..u53 #6 ??)#tF|<V * wwyr   r$   )
__name__
__module____qualname____doc___morphs2str_defaultr   r&   r4   r8   rM   r   r   r   r   r      s(    < 06BU %,	"r   r   c                     dd l } ddlm} | j                  j	                  d      }t        t        |      d      D cg c]  }t        j                  d|      r| }}d } |dt        t        ||      d	
      }t        |j                         d d        t        dj                  |j                         d d              t        dj                  d |j                         d d D                     d |_        t        dj                  d |j                         d d D                     t        dj                  d |j#                         dd D                     y c c}w )Nr   LazyCorpusLoaderzcorpora/knbc/corpus1z.*z\d\-\d\-[\d]+\-[\d]+c                 ~    | j                  d      }|d   t        |d         t        |d         t        |d         fS )N-r   r6   r;   r:   )r.   rE   )xr3   s     r   _knbc_fileids_sortz demo.<locals>._knbc_fileids_sort   s:    a#eAh-U1XE!HFFr   knbc/corpus1)keyeuc-jpr"   
    d   z

c              3   2   K   | ]  }t        |        y wr$   )strr   rG   s     r   r   zdemo.<locals>.<genexpr>   s     D,CDc$i,Cs   r;   c                 P    dj                  d | D              j                  d      S )Nr
   c              3      K   | ]7  }|d    dk7  sdj                  |d    |d   j                  d      d          9 yw)r   r   z{}({})r6   r)   r;   Nformatr.   r   s     r   r   z)demo.<locals>.<lambda>.<locals>.<genexpr>   s>      .;AaQqTU]!adjjoa016s   ?/?zutf-8)r   encoder   s    r   r   zdemo.<locals>.<lambda>   s(    SXX .;A. &fWo&r   c              3   &   K   | ]	  }d |z    yw)z%sNr   rc   s     r   r   zdemo.<locals>.<genexpr>   s     F.EddTk.Es   
c              3   L   K   | ]  }d j                  d |D                yw)r)   c              3   p   K   | ].  }d j                  |d   |d   j                  d      d          0 yw)z{}/{}r   r6   r)   r;   Nrf   )r   ws     r   r   z!demo.<locals>.<genexpr>.<genexpr>   s1     Lt!W^^AaD!A$**S/!*<=ts   46Nr   )r   sents     r   r   zdemo.<locals>.<genexpr>   s'      
0 HHLtLL0s   "$)nltknltk.corpus.utilrU   datafindr   r   r+   searchr   sortedprintr!   r   wordsparsed_sentsr   tagged_sents)ro   rU   r    fr!   rY   knbcs          r   demor{      sC   199>>01D %%:4%@$GGA99,a0 	
G  G w./	D 
$,,."
	"''$**,t$
%&	&++DD,=,=,?,CD
DEDO 
&++Fd.?.?.A"1.EF
FG			 
))+Aa0
 	
9s   E"c                  T   ddl m}   | dt        dd      }t        |j	                         d   t
              sJ t        |j                         d   d   t
              sJ t        |j                         d   t              sJ t        |j                         d   d   t              sJ y )Nr   rT   rZ   z.*/KN.*r\   r]   )
rp   rU   r   
isinstancerv   rb   sentstagged_wordstuplerx   )rU   rz   s     r   testr      s    1hD djjl1os+++djjl1oa(#...d'')!,e444d'')!,Q/777r   __main__)r+   nltk.corpus.reader.apir   r   nltk.corpus.reader.utilr   r   r   
nltk.parser   rR   r   r{   r   rN   r   r   r   <module>r      sT    
 C 
 ' U d( dX&R	8 zF r   