
    gT                         d Z ddlZddl ddl ddlmZ ddlmZ ddlm	Z	m
Z
  G d de      Z G d	 d
      Z G d de      Z G d de      Zy)z!
Read CoNLL-style chunk fileids.
    N)*)map_tag)Tree)LazyConcatenationLazyMapc                      e Zd ZdZdZdZdZdZdZdZ	dZ
eeeeee	e
fZd	d
ddded	d	fdZd%dZd%dZd&dZd&dZd'dZd'dZd'dZd%dZd(dZd&dZd&dZd%dZd Zd Zd%dZd%dZd%dZd%d Zd! Z d" Z!d# Z"e#d$        Z$y	))ConllCorpusReadera  
    A corpus reader for CoNLL-style files.  These files consist of a
    series of sentences, separated by blank lines.  Each sentence is
    encoded using a table (or "grid") of values, where each line
    corresponds to a single word, and each column corresponds to an
    annotation type.  The set of columns used by CoNLL-style files can
    vary from corpus to corpus; the ``ConllCorpusReader`` constructor
    therefore takes an argument, ``columntypes``, which is used to
    specify the columns that are used by a given corpus. By default
    columns are split by consecutive whitespaces, with the
    ``separator`` argument you can set a string to split by (e.g.
    ``'	'``).


    @todo: Add support for reading from corpora where different
        parallel files contain different columns.
    @todo: Possibly add caching of the grid corpus view?  This would
        allow the same grid view to be used by different data access
        methods (eg words() and parsed_sents() could both share the
        same grid corpus view object).
    @todo: Better support for -DOCSTART-.  Currently, we just ignore
        it, but it could be used to define methods that retrieve a
        document at a time (eg parsed_documents()).
    wordspostreechunknesrlignoreNSFTutf8c                 V   |D ]  }|| j                   vst        d|z         t        |t              r|g}|| _        t        |      D ci c]  \  }}||
 c}}| _        || _        || _        || _	        |	| _
        t        j                  | |||       |
| _        || _        y c c}}w )NzBad column type %r)COLUMN_TYPES
ValueError
isinstancestr_chunk_types	enumerate_colmap_pos_in_tree_root_label_srl_includes_roleset_tree_classCorpusReader__init___tagsetsep)selfrootfileidscolumntypeschunk_types
root_labelpos_in_treesrl_includes_rolesetencoding
tree_classtagset	separator
columntypeics                  M/var/www/openai/venv/lib/python3.12/site-packages/nltk/corpus/reader/conll.pyr    zConllCorpusReader.__init__C   s     &J!2!22 !5
!BCC & k3'&-K'+4[+AB+A!Q1+AB'%%9"%dD'8< Cs   B%c                     | j                  | j                         t        t        | j                  | j                  |                  S N)_requireWORDSr   r   
_get_words_gridsr#   r%   s     r2   r
   zConllCorpusReader.wordsd   s1    djj! $++g:N!OPP    c                     | j                  | j                         t        | j                  | j	                  |            S r4   )r5   r6   r   r7   r8   r9   s     r2   sentszConllCorpusReader.sentsh   s,    djj!tG(<==r:   c                       j                   j                   j                          fd}t        t	        | j                  |                  S )Nc                 (    j                  |       S r4   _get_tagged_wordsgridr#   r-   s    r2   get_tagged_wordsz8ConllCorpusReader.tagged_words.<locals>.get_tagged_wordso       ))$77r:   )r5   r6   POSr   r   r8   r#   r%   r-   rC   s   ` ` r2   tagged_wordszConllCorpusReader.tagged_wordsl   s<    djj$((+	8 !)94;;w;O!PQQr:   c                       j                   j                   j                          fd}t        | j	                  |            S )Nc                 (    j                  |       S r4   r?   rA   s    r2   rC   z8ConllCorpusReader.tagged_sents.<locals>.get_tagged_wordsw   rD   r:   )r5   r6   rE   r   r8   rF   s   ` ` r2   tagged_sentszConllCorpusReader.tagged_sentst   s7    djj$((+	8 'W)=>>r:   c                       j                   j                   j                   j                          j                   fd}t        t        | j                  |                  S )Nc                 *    j                  |       S r4   _get_chunked_wordsrB   r'   r#   r-   s    r2   get_chunked_wordsz:ConllCorpusReader.chunked_words.<locals>.get_chunked_words       **4fEEr:   )r5   r6   rE   CHUNKr   r   r   r8   r#   r%   r'   r-   rP   s   ` `` r2   chunked_wordszConllCorpusReader.chunked_words|   sT    djj$((DJJ7++K	F !):DKK<P!QRRr:   c                       j                   j                   j                   j                          j                   fd}t        | j                  |            S )Nc                 *    j                  |       S r4   rM   rO   s    r2   rP   z:ConllCorpusReader.chunked_sents.<locals>.get_chunked_words   rQ   r:   )r5   r6   rE   rR   r   r   r8   rS   s   ` `` r2   chunked_sentszConllCorpusReader.chunked_sents   sO    djj$((DJJ7++K	F ($++g*>??r:   c                       j                   j                   j                   j                          j                   fd}t        | j                  |            S )Nc                 *    j                  |       S r4   )_get_parsed_sent)rB   r)   r#   r-   s    r2   get_parsed_sentz7ConllCorpusReader.parsed_sents.<locals>.get_parsed_sent   s    (({FCCr:   )r5   r6   rE   TREEr   r   r8   )r#   r%   r)   r-   r[   s   ` `` r2   parsed_sentszConllCorpusReader.parsed_sents   sN    djj$((DII6++K	D G(<==r:   c                     | j                  | j                         t        | j                  | j	                  |            S r4   )r5   SRLr   _get_srl_spansr8   r9   s     r2   	srl_spanszConllCorpusReader.srl_spans   s.    dhht**DKK,@AAr:   c                       j                   j                   j                   j                   j                          j
                   fd}t        | j                  |            }|rt        |      }|S )Nc                 (    j                  |       S r4   )_get_srl_instances)rB   r)   r#   s    r2   get_srl_instancesz:ConllCorpusReader.srl_instances.<locals>.get_srl_instances   s    **4==r:   )	r5   r6   rE   r\   r_   r   r   r8   r   )r#   r%   r)   flattenre   results   ` `   r2   srl_instanceszConllCorpusReader.srl_instances   se    djj$((DIItxx@++K	> *DKK,@A&v.Fr:   c                       j                   j                   j                   j                          fd}t	        t        | j                  |                  S )z
        :return: a list of word/tag/IOB tuples
        :rtype: list(tuple)
        :param fileids: the list of fileids that make up this corpus
        :type fileids: None or str or list
        c                 (    j                  |       S r4   _get_iob_wordsrA   s    r2   get_iob_wordsz2ConllCorpusReader.iob_words.<locals>.get_iob_words       &&tV44r:   )r5   r6   rE   rR   r   r   r8   r#   r%   r-   rm   s   ` ` r2   	iob_wordszConllCorpusReader.iob_words   sC     	djj$((DJJ7	5 !G8L!MNNr:   c                       j                   j                   j                   j                          fd}t	        | j                  |            S )z
        :return: a list of lists of word/tag/IOB tuples
        :rtype: list(list)
        :param fileids: the list of fileids that make up this corpus
        :type fileids: None or str or list
        c                 (    j                  |       S r4   rk   rA   s    r2   rm   z2ConllCorpusReader.iob_sents.<locals>.get_iob_words   rn   r:   )r5   r6   rE   rR   r   r8   ro   s   ` ` r2   	iob_sentszConllCorpusReader.iob_sents   s>     	djj$((DJJ7	5 }dkk'&:;;r:   c                     t        | j                  |d      D cg c]  \  }}t        || j                  |       c}}      S c c}}w )NT)r+   )concatabspathsStreamBackedCorpusView_read_grid_block)r#   r%   fileidencs       r2   r8   zConllCorpusReader._grids   sQ      &*]]7D%A%AMVS 'vt/D/DsS%A
 	
s   "A
c                    g }t        |      D ]  }|j                         }|s|j                  d      D cg c]  }|j                  | j                         }}|d   | j                  j                  dd         dk(  r|d= |D ]*  }t        |      t        |d         k7  st        d|z         |j                  |        |S c c}w )N
r   r
   z
-DOCSTART-z"Inconsistent number of columns:
%s)	read_blankline_blockstripsplitr"   r   getlenr   append)r#   streamgridsblocklinerB   rows          r2   rx   z"ConllCorpusReader._read_grid_block   s    )&1EKKME5:[[5FG5FTDJJtxx(5FDG Awt||''34DG s8s47|+$%JU%RSS  LL! 2"  Hs   "Cc                 @    | j                  || j                  d         S )Nr
   )_get_columnr   )r#   rB   s     r2   r7   zConllCorpusReader._get_words   s    dll7&;<<r:   c           	         | j                  || j                  d         }|r3|| j                  k7  r$|D cg c]  }t        | j                  ||       }}t	        t        | j                  || j                  d         |            S c c}w )Nr   r
   r   r   r!   r   listzipr#   rB   r-   pos_tagsts        r2   r@   z#ConllCorpusReader._get_tagged_words   sz    ##D$,,u*=>f,BJK(Qfa8(HKC((t||G/DExPQQ Ls   Bc                 T   | j                  || j                  d         }|r3|| j                  k7  r$|D cg c]  }t        | j                  ||       }}t	        t        | j                  || j                  d         || j                  || j                  d                     S c c}w )Nr   r
   r   r   r   s        r2   rl   z ConllCorpusReader._get_iob_words   s    ##D$,,u*=>f,BJK(Qfa8(HK  t||G'<=  t||G'<=
 	
 Ls   B%c                    | j                  || j                  d         }| j                  || j                  d         }|r3|| j                  k7  r$|D cg c]  }t        | j                  ||       }}| j                  || j                  d         }t	        | j
                  g       g}t        |||      D ]  \  }	}
}|dk(  rd\  }}n|j                  d      \  }}|||vrd}|dk(  r||d   j                         k7  rd	}|d
v rt        |      dk(  r|j                          |d	k(  r1t	        |g       }|d   j                  |       |j                  |       |d   j                  |	|
f        |d   S c c}w )Nr
   r   r   O)r    -IBBO   r   )r   r   r!   r   r   r   r   r   labelr   popr   )r#   rB   r'   r-   r
   r   r   
chunk_tagsstackwordpos_tag	chunk_tagstate
chunk_type	new_chunks                  r2   rN   z$ConllCorpusReader._get_chunked_words  sy     t||G'<=##D$,,u*=>f,BJK(Qfa8(HK%%dDLL,AB
d&&+,(+E8Z(H$D'9C$+!z&/ooc&:#
&:[+H|
eBioo.? ?}Uq		| R0	b	  +Y'"IdG_-) )I, Qx7 Ls   E7c           	         | j                  || j                  d         }| j                  || j                  d         }|r3|| j                  k7  r$|D cg c]  }t        | j                  ||       }}| j                  || j                  d         }d}t	        |||      D ]Z  \  }	}
}|	dk(  rd}	|	dk(  rd}	|
dk(  rd}
|
dk(  rd}
|j                  d	      \  }}|j                  d      dz  }|| d
|
 d|	 d| z  }\ 	 | j                  j                  |      }|st|j                         D ]a  }t        |      D ]Q  \  }}t        |t              st!        |      dk(  s&t        |d   t"              s:|d   |j%                         f||<   S c |S c c}w # t        t        f$ r/ | j                  j                  d| j                   d| d      }Y w xY w)Nr
   r   r   r   (z-LRB-)z-RRB-r   z ( z)    r   )r   r   r!   r   r   r   countr   
fromstringr   
IndexErrorr   subtreesr   r   r   r   r   r   )r#   rB   r)   r-   r
   r   r   
parse_tagstreestrr   r   	parse_tagleftrightr   subtreer0   childs                     r2   rZ   z"ConllCorpusReader._get_parsed_sent(  s     t||G'<=##D$,,u*=>f,BJK(Qfa8(HK%%dDLL,@A
(+E8Z(H$D'9s{s{#~!#~!%OOC0MT5KK$s*E$r'!D6E7;;G )I	R##..w7D ==? )' 2HAu"5$/J!O&uQx5&+Ah%>
 !3 + ? L$ J' 	R##..43C3C2DAgYa/PQD	Rs   F F ;GGc                    | j                   r5| j                  || j                  d   dz         }| j                  d   dz   }n1| j                  || j                  d         }| j                  d   dz   }t        |D cg c]
  }|dk7  s	| c}      }g }t	        |      D ]  }| j                  |||z         }g }	g }
t        |      D ]  \  }}|j                  d      \  }}|j                  d      D ]  }|s|
j                  ||f        t	        |j                  d            D ]-  }|
j                         \  }}|	j                  ||dz   f|f       /  |j                  |	        |S c c}w )z;
        list of list of (start, end), tag) tuples
        r   r   r   r   r   r   r   )
r   r   r   r   ranger   r   r   r   r   )r#   rB   
predicates	start_colp	num_preds	spanlistsr0   colspanlistr   wordnumsrl_tagr   r   tagstarts                    r2   r`   z ConllCorpusReader._get_srl_spansM  sm    %%))$U0Ca0GHJU+a/I))$U0CDJU+a/I J;Jq!s(J;<		y!A""4Q7CHE$-cN  'c 2u::c?Cc7^4 + u{{3/0A#(99;LS%OOeWq[%93$?@ 1 %3 X& " # <s   <
E-E-c           
      @   | j                  ||      }| j                  |      }| j                  rB| j                  || j                  d   dz         }| j                  || j                  d         }n.| j                  || j                  d         }d gt        |      z  }t        |      }t        |      D ]e  \  }}	|	dk(  r|D ]&  }
|
D ]  \  \  }}}|t        ||      v s|dv s n & n t        d|	z        |j                  t        |||	||   |
             g |S )Nr   r   r   VzC-VzNo srl column found for %r)rZ   r`   r   r   r   r   ConllSRLInstanceListr   r   r   r   ConllSRLInstance)r#   rB   r)   r   r   r   rolesets	instancesr   	predicater   r   endr   s                 r2   rd   z$ConllCorpusReader._get_srl_instancesm  s:   $$T;7''-	%%))$U0Ca0GHJ''dll5.ABH))$U0CDJvJ/H(.	"+J"7GYC &)1%LUC#%s"33|8K *2  & !!=	!IJJ w	8G;LhW #8& r:   c                 J    |D ]  }|| j                   vst        d|z         y )Nz)This corpus does not contain a %s column.)r   r   )r#   r&   r/   s      r2   r5   zConllCorpusReader._require  s-    %J- BZO  &r:   c                 \    t        t        |             D cg c]
  }| |   |    c}S c c}w r4   )r   r   )rB   column_indexr0   s      r2   r   zConllCorpusReader._get_column  s.    /4SY/?@/?!Q%/?@@@s   )r4   )NN)NNN)NNT)%__name__
__module____qualname____doc__r6   rE   r\   rR   NEr_   IGNOREr   r   r    r
   r<   rG   rJ   rT   rW   r]   ra   rh   rp   rs   r8   rx   r7   r@   rl   rN   rZ   r`   rd   r5   staticmethodr    r:   r2   r	   r	      s    : E
CDE	B
CF 3eRf=L !BQ>R?S@>BO<$	
6=R

 D#J@H A Ar:   r	   c                   "    e Zd ZdZd Zd Zd Zy)r   z|
    An SRL instance from a CoNLL corpus, which identifies and
    providing labels for the arguments of a single verb.
    c           	      N   g | _         	 || _        	 || _        || _        g | _        	 || _        	 || _        	 |j                         | _        	 |D ]T  \  \  }}}|dv r)| xj                   t        t        ||            z  c_         6| j                  j                  ||f|f       V y )Nr   )verb	verb_head	verb_stemroleset	argumentstagged_spansr   leavesr
   r   r   r   )	r#   r   r   r   r   r   r   r   r   s	            r2   r    zConllSRLInstance.__init__  s    		/
 #	"
 #	F
 )	* 	G[[]
	 ".LUC#l"		T%s"344	%%s|S&9:	 ".r:   c                     t        | j                        dk7  rdnd}d| j                  t        | j                        |fz  S )Nr   sr   z,<ConllSRLInstance for %r with %d argument%s>)r   r   r   )r#   plurals     r2   __repr__zConllSRLInstance.__repr__  s?     DNN+q0b=^^S0&9
 	
r:   c                     dj                   fd j                  D              }d|d j                  d}d}t         j                        D ]d  \  }}t        |t              r|d   } j                  D ]   \  \  }}}||k(  r|d|z  z  }||k(  s|d	z  }" | j                  v rd
|z  }||dz   z  }f |t        j                  |j                  dd      dd      z   S )Nr   c              3   B   K   | ]  }j                   |   d      yw)r   N)r
   ).0r0   r#   s     r2   	<genexpr>z*ConllSRLInstance.pprint.<locals>.<genexpr>  s     ?Y4::a=+Ys   zSRL for z (stem=z):
r   r   z[%s z] z<<%s>>z ]]z    )initial_indentsubsequent_indent)joinr   r   r   r
   r   tupler   textwrapfillreplace)	r#   verbstrhdrr   r0   r   r   r   argids	   `        r2   pprintzConllSRLInstance.pprint  s    ((?TYY??74>>*<DA ,GAt$&Aw'+~~#e:%'A8IA	 (6
 DII~$OA - X]]IIdC 6
 
 	
r:   N)r   r   r   r   r    r   r   r   r:   r2   r   r     s    (;T

r:   r   c                   ,    e Zd ZdZddZd ZddZd Zy)	r   z0
    Set of instances for a single sentence
    c                 >    || _         t        j                  | |       y r4   )r   r   r    )r#   r   r   s      r2   r    zConllSRLInstanceList.__init__  s    	dI&r:   c                 "    | j                         S r4   )r   )r#   s    r2   __str__zConllSRLInstanceList.__str__  s    {{}r:   c                    | D ]&  }|j                   | j                   k7  st        d       |rW| j                   j                         }d gt        |      z  }dgt        |      z  }| j	                  | j                   d|||       d}t        t                    D ]  }|r9|d||   z  z  }|d|   z  z  }|dt        |   j                  d            z  z  }| D ]%  }||j                  k(  s|d|j                  z  z  } n	 |ddz  z  }| D ]=  }d}|j                  D ]"  \  \  }	}
}||	k(  rd	| | }||
d
z
  k(  s|dz  }$ |d|z  z  }? |dz  } |S )NzTree mismatch!r   r   r   z%-20s z%-8s z
%15s*%-8s r   r   r   r   z%-12s r|   )r   r   r   r   _tree2conllr   r   r   r   r   r   )r#   include_treeinstr
   r   syntr   r0   argstrr   r   r   s               r2   r   zConllSRLInstanceList.pprint  s   DyyDII% !122 
 II$$&E&3u:%C53u:%DTYY5#t<s5z"AXa((Ws1v%%\E$q'--*<$=== &DNN22A 
 X^#+/+<+<'LUC%Ez#$UGF8!4S1W~#	 ,=
 X&&  IA/ #0 r:   c                    t        |t              sJ t        |      dk(  r8t        |d   t              r%|j	                         ||<   ||   |d   k(  sJ |dz   S t        |      dk(  r9t        |d   t
              r&t        |d         dk(  sJ |d   \  ||<   ||<   |dz   S d|j	                          ||    ||<   |D ]  }| j                  |||||      } ||dz
  xx   dz  cc<   |S )Nr   r   r   r   r   )r   r   r   r   r   r   r   )r#   r   r   r
   r   r   r   s          r2   r   z ConllSRLInstanceList._tree2conll  s    $%%%t9>ja#6::<CL>T!W,,,Q;Y!^
47E :tAw<1$$$)-a&CL#g,Q;

~d7m_=DM**5'5#tL 1$Nr:   N)r   )F)r   r   r   r   r    r   r   r   r   r:   r2   r   r     s    '&Pr:   r   c                       e Zd ZdZ	 ddZy)ConllChunkCorpusReaderz`
    A ConllCorpusReader whose data file contains three columns: words,
    pos, and chunk.
    Nc           
      >    t         j                  | ||d||||       y )N)r
   r   r   )r'   r+   r-   r.   )r	   r    )r#   r$   r%   r'   r+   r-   r.   s          r2   r    zConllChunkCorpusReader.__init__7  s/     	""%# 	# 		
r:   )r   NN)r   r   r   r   r    r   r:   r2   r   r   1  s     SW
r:   r   )r   r   nltk.corpus.reader.apinltk.corpus.reader.utilnltk.tagr   	nltk.treer   	nltk.utilr   r   r   r	   r   r   r   r   r   r:   r2   <module>r     sX     $ %   0DA DANK
 K
\C4 CL
. 
r:   