
    g1                     Z    d dl Z d dlmZ d dlmZmZ d Z G d de      Z G d de      Zy)	    N)CorpusReader)StreamBackedCorpusViewconcatc                 D     t        j                         d fd	       }|S )Nc                 `    |j                  dd        |s| j                         } | |fi |S )Ntags)popfileids)selfr
   kwargsfuns      N/var/www/openai/venv/lib/python3.12/site-packages/nltk/corpus/reader/ipipan.py	decoratorz_parse_args.<locals>.decorator   s1    

64 llnG4+F++    N)	functoolswraps)r   r   s   ` r   _parse_argsr      s&    __S, , r   c                       e Zd ZdZd ZddZddZddZddZe	dd       Z
e	dd	       Ze	dd
       Ze	dd       Ze	dd       Ze	dd       Zd Zd Zd ZddZd Zd Zd Zy)IPIPANCorpusReadera5  
    Corpus reader designed to work with corpus created by IPI PAN.
    See http://korpus.pl/en/ for more details about IPI PAN corpus.

    The corpus includes information about text domain, channel and categories.
    You can access possible values using ``domains()``, ``channels()`` and
    ``categories()``. You can use also this metadata to filter files, e.g.:
    ``fileids(channel='prasa')``, ``fileids(categories='publicystyczny')``.

    The reader supports methods: words, sents, paras and their tagged versions.
    You can get part of speech instead of full tag by giving "simplify_tags=True"
    parameter, e.g.: ``tagged_sents(simplify_tags=True)``.

    Also you can get all tags disambiguated tags specifying parameter
    "one_tag=False", e.g.: ``tagged_paras(one_tag=False)``.

    You can get all tags that were assigned by a morphological analyzer specifying
    parameter "disamb_only=False", e.g. ``tagged_words(disamb_only=False)``.

    The IPIPAN Corpus contains tags indicating if there is a space between two
    tokens. To add special "no space" markers, you should specify parameter
    "append_no_space=True", e.g. ``tagged_words(append_no_space=True)``.
    As a result in place where there should be no space between two tokens new
    pair ('', 'no-space') will be inserted (for tagged data) and just '' for
    methods without tags.

    The corpus reader can also try to append spaces between words. To enable this
    option, specify parameter "append_space=True", e.g. ``words(append_space=True)``.
    As a result either ' ' or (' ', 'space') will be inserted between tokens.

    By default, xml entities like &quot; and &amp; are replaced by corresponding
    characters. You can turn off this feature, specifying parameter
    "replace_xmlentities=False", e.g. ``words(replace_xmlentities=False)``.
    c                 6    t        j                  | ||d d        y r   )r   __init__)r   rootr
   s      r   r   zIPIPANCorpusReader.__init__=   s    dD'4>r   Nc                 J    |s| j                         }| j                  |d      S )Nchannelr
   _parse_headerr   r
   s     r   channelszIPIPANCorpusReader.channels@   s#    llnG!!'955r   c                 J    |s| j                         }| j                  |d      S )Ndomainr   r   s     r   domainszIPIPANCorpusReader.domainsE   s#    llnG!!'844r   c                     |s| j                         }| j                  |d      D cg c]  }| j                  |       c}S c c}w )NkeyTerm)r
   r   _map_category)r   r
   cats      r   
categorieszIPIPANCorpusReader.categoriesJ   sJ    llnG/3/A/A'9/U
/UDs#/U
 	
 
s   Ac                 X   |||t        d      |||t        j                  |       S t        |t              r|g}t        |t              r|g}t        |t              r|g}|r| j                  d|      S |r| j                  d|      S | j                  d|| j                        S )NzNYou can specify only one of channels, domains and categories parameter at oncer   r!   r$   )map)
ValueErrorr   r
   
isinstancestr_list_morph_files_byr%   )r   r   r"   r'   s       r   r
   zIPIPANCorpusReader.fileidsQ   s    G$7J<R3  J4F''--h$ zHgs#iGj#&$J,,YAA,,Xw??,,:4+=+= -  r   c                     t        | j                  |      D cg c]&  } | j                  |ft        j                  dd|( c}      S c c}w NF)moder   r   _list_morph_files_viewIPIPANCorpusView
SENTS_MODEr   r
   r   fileids       r   sentszIPIPANCorpusReader.sentsh   f    
 #44W=	 >F 

!1!<!<5LR >	
 	
   +Ac                     t        | j                  |      D cg c]&  } | j                  |ft        j                  dd|( c}      S c c}w r/   r   r2   r3   r4   
PARAS_MODEr6   s       r   paraszIPIPANCorpusReader.parass   r9   r:   c           
          t        | j                  |      D cg c]  } | j                  |fddi| c}      S c c}w )Nr   Fr   r2   r3   r6   s       r   wordszIPIPANCorpusReader.words~   sQ     #44W==F 

6888=
 	
s   <c           
          t        | j                  |      D cg c]%  } | j                  |fdt        j                  i|' c}      S c c}w Nr0   r1   r6   s       r   tagged_sentszIPIPANCorpusReader.tagged_sents   X     #44W==F 

6N(8(C(CNvN=
 	
   *A
c           
          t        | j                  |      D cg c]%  } | j                  |fdt        j                  i|' c}      S c c}w rC   r<   r6   s       r   tagged_paraszIPIPANCorpusReader.tagged_paras   rE   rF   c           
      ~    t        | j                  |      D cg c]  } | j                  |fi | c}      S c c}w r   r@   r6   s       r   tagged_wordszIPIPANCorpusReader.tagged_words   sA    8<8N8Nw8WX8WfZTZZ)&)8WX
 	
Xs   :c                 J    | j                  |      D cg c]  }| c}S c c}w r   )abspathsr   r
   fs      r   r2   z$IPIPANCorpusReader._list_morph_files   s%    ==121a1222s   	 c                 j    | j                  |      D cg c]  }|j                  dd       c}S c c}w Nz	morph.xmlz
header.xml)r2   replacerM   s      r   _list_header_filesz%IPIPANCorpusReader._list_header_files   s?     ++G4
4 IIk<04
 	
 
s   0c                     t               }| j                  |      D ],  }| j                  ||      }|D ]  }|j                  |        . t	        |      S r   )setrR   _get_tagaddlist)r   r
   tagvaluesrN   values_listvs          r   r   z IPIPANCorpusReader._parse_header   sO    ((1A--3/K 

1 ! 2 F|r   c                    | j                         }t               }|D ]\  }| j                  |      j                  dd      }| j	                  ||      }|D ]"  }	| ||	      }	|	|v s|j                  |       $ ^ t        |      S rP   )r
   rT   abspathrQ   rU   rV   rW   )
r   rX   rY   r)   r
   ret_fileidsrN   fprZ   values
             r   r-   z'IPIPANCorpusReader._list_morph_files_by   s    ,,.eAa((lCB--C0K$?JEF?OOA&	 %  K  r   c                    g }t        |      5 }|j                         }d d d        d}	 j                  d|z   |      }|dk  r|S |j                  d|z   dz   |      }|j                  ||t	        |      z   dz   |        X# 1 sw Y   dxY w)Nr   <z</>   )openreadfindappendlen)r   rN   rX   r   infileheadertag_endtag_poss           r   rU   zIPIPANCorpusReader._get_tag   s    !W[[]F kk#)W5G{kk$*s"2G<GKKwS1A5@A  Ws   BBc                 B    |j                  d      }|dk(  r|S ||dz   d  S )Nrc      )rg   )r   r&   poss      r   r%   z IPIPANCorpusReader._map_category   s+    hhsm"9JsQwy>!r   c                    |j                  dd      }|j                  dd      }|j                  dd      }|j                  dd      }|j                  dd      }|j                  d	d      }|j                  d
d      }	|j                  dd      }
t        |      dkD  rt        d|j                         z        |s|st        d      |s|s|r|st        d      t	        ||||||||	|
	      S )Nr   Tr0   r   simplify_tagsFone_tagdisamb_onlyappend_no_spaceappend_spacereplace_xmlentitieszUnexpected arguments: %sz;You cannot specify both one_tag=False and disamb_only=Falsez[You cannot specify simplify_tags, one_tag or disamb_only with functions other than tagged_*)r   r0   rs   rt   ru   rv   rw   rx   )r	   ri   r*   keysr4   )r   filenamer   r   r0   rs   rt   ru   rv   rw   rx   s              r   r3   zIPIPANCorpusReader._view   s   zz&$'zz&!$

?E:**Y-jj5 **%6>zz.%8$jj)>Ev;?7&++-GHH{P  g[A 
  '#+% 3

 
	
r   r   )NNN)__name__
__module____qualname____doc__r   r   r"   r'   r
   r   r8   r>   rA   rD   rH   rJ   r2   rR   r   r-   rU   r%   r3    r   r   r   r      s    !F?6
5

. 
 
 
 
 
 
 
 
 
 
 
 

3
!
B" 
r   r   c                   8    e Zd ZdZdZdZd
dZd Zd Zd Z	d Z
y	)r4   r   rp   rd   c                    t        j                  | |d |d        d| _        d| _        |j	                  dd      | _        |j	                  dd      | _        |j	                  dt        j                        | _	        |j	                  dd      | _
        |j	                  dd      | _        |j	                  d	d      | _        |j	                  d
d      | _        |j	                  dd      | _        y )NFr   r   Tru   r0   rs   rt   rv   rw   rx   )r   r   in_sentencepositionr	   	show_tagsru   r4   
WORDS_MODEr0   rs   rt   rv   rw   rx   )r   rz   startposr   s       r   r   zIPIPANCorpusView.__init__   s    ''hhM FD1!::mT:JJv'7'B'BC	#ZZ?zz)T2%zz*;UC"JJ~u=#)::.CT#J r   c                 L   g }g }d}d}t               }| j                  |      }	 t        |      dk  r"| j                  |       | j                  |      }|dgk(  r|rJ g S |j	                         }| xj
                  t        |      dz   z  c_        |j                  d      r	d| _        n|j                  d      rn|j                  d      r3| j                  r|r|s| j                  |       d}d}d}	t               }n|j                  d      r| j                  rd| _        | j                  |       | j                  | j                  k(  r|gS | j                  | j                  k(  r| j                  r| j                  |       |S |j                  |       n | j                  | j                  k(  r| j                  |       |gS |j                  d	      r6|d
d }	| j                  r|	j!                  dd      j!                  dd      }	n|j                  d      rZ| j"                  r|j%                  d      dk7  rY||j'                  d      d
z   |j'                  d       }
|j)                  |
       n |j                  d      r| j*                  r| j,                  r!|D cg c]  }|j/                  d      d    }}| j0                  r| j"                  s|j                  	t3        |      f       n|j                  	|j	                         f       n|j                  	       nm|j                  d      rJ| j                  rd}| j4                  rB| j*                  r|j                  d       n$|j                  d       n|j                  d      r	 |c c}w )NFTrp    z<chunk type="s"z<chunk type="p"z<tokz</chunkz<orth   iz&quot;"z&amp;&z<lexzdisamb=ro   z<ctagz</ctagz</tok:r   z<ns/>)r   zno-spacez</cesAna)rT   
_read_datari   _seekr	   r   
startswithr   rw   _append_spacer0   r5   r   rh   r=   rx   rQ   ru   rg   indexrV   r   rs   splitrt   tuplerv   )r   streamsentence	sentencesspaceno_spacer   lineslineorthrX   ts               r   
read_blockzIPIPANCorpusView.read_block  s   	u'5zQ

6"/}$$}	99;DMMSY]*M01#' !23($$x&&x0 u+##',D$JJv&yyDOO3 (z)doo5,, ..x8'!((2YY$//1JJv&%;&)Abz++<<#6>>wLD(''499Y+?2+Etzz'2Q6H9MNCHHSM)>>))9=>AQ><<t/?/? uT{(;< txxz(:;OOD))$$#H''~~ (89 +,E d  ?s   ,N!c                     |j                         | _        |j                  d      }|j                  d      }|j	                          |S )Ni   
)tellr   rf   r   reverse)r   r   buffr   s       r   r   zIPIPANCorpusView._read_dataT  s9    {{4 

4 r   c                 :    |j                  | j                         y r   )seekr   )r   r   s     r   r   zIPIPANCorpusView._seek[  s    DMM"r   c                 b    | j                   r|j                  d       y |j                  d       y )N) r   r   )r   rh   )r   r   s     r   r   zIPIPANCorpusView._append_space^  s!    >>OON+OOC r   N)r   )r{   r|   r}   r   r5   r=   r   r   r   r   r   r   r   r   r4   r4      s-    JJJKL\#!r   r4   )	r   nltk.corpus.reader.apir   nltk.corpus.reader.utilr   r   r   r   r4   r   r   r   <module>r      s3     / BW
 W
to!- o!r   