
    g=                         d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZ d Z	 G d de      Z
 G d de      Z G d	 d
      Z G d de      Z G d de      Z G d de      Zy)    N)concat)XMLCorpusReaderXMLCorpusViewc                 D     t        j                         d fd	       }|S )zj
    Wraps function arguments:
    if fileids not specified then function set NKJPCorpusReader paths.
    c                 4    |s| j                   } | |fi |S N)_paths)selffileidskwargsfuns      L/var/www/openai/venv/lib/python3.12/site-packages/nltk/corpus/reader/nkjp.py	decoratorz_parse_args.<locals>.decorator   s!    kkG4+F++    r   )	functoolswraps)r   r   s   ` r   _parse_argsr      s(     __S, ,
 r   c                       e Zd ZdZdZdZdZddZd Zd Z	dd	Z
d
 Zedd       Zedd       Zedd       Zedd       Zedd       Zy)NKJPCorpusReaderr            c           	          t        |t              rt        j                  | ||dz          n(t        j                  | ||D cg c]  }|dz   	 c}       | j	                         | _        yc c}w )aN  
        Corpus reader designed to work with National Corpus of Polish.
        See http://nkjp.pl/ for more details about NKJP.
        use example:
        import nltk
        import nkjp
        from nkjp import NKJPCorpusReader
        x = NKJPCorpusReader(root='/home/USER/nltk_data/corpora/nkjp/', fileids='') # obtain the whole corpus
        x.header()
        x.raw()
        x.words()
        x.tagged_words(tags=['subst', 'comp'])  #Link to find more tags: nkjp.pl/poliqarp/help/ense2.html
        x.sents()
        x = NKJPCorpusReader(root='/home/USER/nltk_data/corpora/nkjp/', fileids='Wilk*') # obtain particular file(s)
        x.header(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'])
        x.tagged_words(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'], tags=['subst', 'comp'])
        z.*/header.xmlz/header.xmlN)
isinstancestrr   __init__	get_pathsr	   )r
   rootr   fileids       r   r   zNKJPCorpusReader.__init__&   sd    $ gs#$$T4?1JK$$d'J'Vm3'J nn& Ks   A*c           	          | j                   D cg c]G  }t        j                  j                  t	        | j
                        |j                  d      d         I c}S c c}w )N
header.xmlr   )_fileidsospathjoinr   _rootsplitr
   fs     r   r   zNKJPCorpusReader.get_paths@   sR     ]]
" GGLLTZZ!'',*?*BC"
 	
 
s   AAc                 d    | j                   D cg c]  }|j                  d      d    c}S c c}w )zf
        Returns a list of file identifiers for the fileids that make up
        this corpus.
        r!   r   )r"   r'   r(   s     r   r   zNKJPCorpusReader.fileidsF   s.    
 37--@-Q%a(-@@@s   -Nc                 n   |j                  dt        j                        }|t        j                  u rt        ||      S |t        j                  u rt        ||      S |t        j                  u rt        ||      S |t        j                  u rt        ||t        j                        S t        d      )zQ
        Returns a view specialised for use with particular corpus file.
        mode)tags)r-   r,   zNo such mode!)popr   
WORDS_MODENKJPCorpus_Morph_View
SENTS_MODENKJPCorpus_Segmentation_ViewHEADER_MODENKJPCorpus_Header_ViewRAW_MODENKJPCorpus_Text_View	NameError)r
   filenamer-   r   r,   s        r   _viewzNKJPCorpusReader._viewM   s     zz&"2"="=>#...(==%000/tDD%111)(>>%...'t*>*G*G 
 O,,r   c                 @    | j                   |v r|S | j                   |z   S )z<
        Add root if necessary to specified fileid.
        )r   )r
   r   s     r   add_rootzNKJPCorpusReader.add_root`   s$     99Myy6!!r   c           
          t        |D cg c]B  } | j                  | j                  |      fdt        j                  i|j                         D c}      S c c}w )z9
        Returns header(s) of specified fileids.
        r,   )r   r9   r;   r   r3   handle_queryr
   r   r   r   s       r   headerzNKJPCorpusReader.headerh   sp    
 
 &	 &F 

MM&)0@0L0LPV,.! &	
 	
   AAc           
          t        |D cg c]B  } | j                  | j                  |      fdt        j                  i|j                         D c}      S c c}w )z9
        Returns sentences in specified fileids.
        r,   )r   r9   r;   r   r1   r=   r>   s       r   sentszNKJPCorpusReader.sentsv   sp    
 
 &	 &F 

MM&)0@0K0KOU,.! &	
 	
r@   c           
          t        |D cg c]B  } | j                  | j                  |      fdt        j                  i|j                         D c}      S c c}w z5
        Returns words in specified fileids.
        r,   )r   r9   r;   r   r/   r=   r>   s       r   wordszNKJPCorpusReader.words   sp     
 &	 &F 

MM&)0@0K0KOU,.! &	
 	
r@   c                     |j                  dg       }t        |D cg c]C  } | j                  | j                  |      ft        j
                  |d|j                         E c}      S c c}w )z
        Call with specified tags as a list, e.g. tags=['subst', 'comp'].
        Returns tagged words in specified fileids.
        r-   )r,   r-   )r.   r   r9   r;   r   r/   r=   )r
   r   r   r-   r   s        r   tagged_wordszNKJPCorpusReader.tagged_words   s     zz&"% & &F 

MM&))44 	
 ,.! &

 
	
s   AA+c           
          t        |D cg c]B  } | j                  | j                  |      fdt        j                  i|j                         D c}      S c c}w rD   )r   r9   r;   r   r5   r=   r>   s       r   rawzNKJPCorpusReader.raw   sp    
 
 &	 &F 

MM&)0@0I0IMS,.! &	
 	
r@   )z.*r   )__name__
__module____qualname__r/   r1   r3   r5   r   r   r   r9   r;   r   r?   rB   rE   rG   rI    r   r   r   r       s    JJKH'4
A-&" 
 
 
 
 
 
 
 
$ 
 
r   r   c                       e Zd Zd Zd Zd Zy)r4   c                 Z    d| _         t        j                  | |dz   | j                          y)z
        HEADER_MODE
        A stream backed corpus view specialized for use with
        header.xml files in NKJP corpus.
        z.*/sourceDesc$r!   N)tagspecr   r   r
   r8   r   s      r   r   zNKJPCorpus_Header_View.__init__   s&     (tX%<dllKr   c                     | j                          g }	 t        j                  | | j                        }t	        |      dk(  rn|j                  |       A| j                          |S Nr   )_openr   
read_block_streamlenextendclose)r
   r?   segms      r   r=   z#NKJPCorpus_Header_View.handle_query   sX    

 ++D$,,?D4yA~MM$	 
 	

r   c                 0   |j                  d      }g }|rdj                  d |D              }|j                  d      }g }|rdj                  d |D              }|j                  d      }g }|rdj                  d |D              }|j                  d      }	g }
|	rdj                  d	 |	D              }
|j                  d
      }g }|rdj                  d |D              }|j                  d      }g }|rdj                  d |D              }||||
||dS )Nz
bibl/title
c              3   P   K   | ]  }|j                   j                            y wr   textstrip).0titles     r   	<genexpr>z4NKJPCorpus_Header_View.handle_elt.<locals>.<genexpr>   s     EfUejj..0f   $&zbibl/authorc              3   P   K   | ]  }|j                   j                            y wr   r^   )ra   authors     r   rc   z4NKJPCorpus_Header_View.handle_elt.<locals>.<genexpr>   s     Ivv{{002rd   z	bibl/datec              3   P   K   | ]  }|j                   j                            y wr   r^   )ra   dates     r   rc   z4NKJPCorpus_Header_View.handle_elt.<locals>.<genexpr>        A54TYY__.5rd   zbibl/publisherc              3   P   K   | ]  }|j                   j                            y wr   r^   )ra   	publishers     r   rc   z4NKJPCorpus_Header_View.handle_elt.<locals>.<genexpr>   s     !U*Y).."6"6"8*rd   z	bibl/idnoc              3   P   K   | ]  }|j                   j                            y wr   r^   )ra   idnos     r   rc   z4NKJPCorpus_Header_View.handle_elt.<locals>.<genexpr>   ri   rd   z	bibl/notec              3   P   K   | ]  }|j                   j                            y wr   r^   )ra   notes     r   rc   z4NKJPCorpus_Header_View.handle_elt.<locals>.<genexpr>   ri   rd   )rb   rf   rh   rk   rm   ro   )findallr%   )r
   eltcontexttitlesrb   authorsrf   datesrh   
publishersrk   idnosrm   notesro   s                  r   
handle_eltz!NKJPCorpus_Header_View.handle_elt   s   \*IIEfEEE++m,YYIIIFK(99A5AAD[[!12
			!U*!UUIK(99A5AADK(99A5AAD "
 	
r   N)rJ   rK   rL   r   r=   ry   rM   r   r   r4   r4      s    L	&
r   r4   c                   "    e Zd ZdZd Zd Zd Zy)XML_Toola  
    Helper class creating xml file to one without references to nkjp: namespace.
    That's needed because the XMLCorpusView assumes that one can find short substrings
    of XML that are valid XML, which is not true if a namespace is declared at top level
    c                     t         j                  j                  ||      | _        t	        j
                  d      | _        y )NF)delete)r#   r$   r%   	read_filetempfileNamedTemporaryFile
write_file)r
   r   r8   s      r   r   zXML_Tool.__init__   s*    dH5"55UCr   c                    	 t        | j                        }| j                  }d}t        |      r|j	                         }t        j                  d|      }dj                  |      }t        j                  d|      }dj                  |      }t        j                  d|      }dj                  |      }t        j                  d|      }dj                  |      }t        j                  d|      }dj                  |      }|j                  |       t        |      r|j                          |j                          | j                  j                  S # t        $ r}| j                          t        |d }~ww xY w)N znkjp:[^ ]* z<nkjp:paren>z</nkjp:paren>z<choice>z	</choice>)openr~   r   rW   readlinerer'   r%   writerY   name	Exceptionremove_preprocessed_file)r
   frfwlinexretes          r   build_preprocessed_filez XML_Tool.build_preprocessed_file   s"   	#dnn%BBDd){{}HH^T2hhqkHH^S1hhqkHH_c2hhqkHHZ-hhqkHH[#.hhqk d) HHJHHJ??''' 	#))+"	#s   DE  5E 	E;E66E;c                 V    t        j                  | j                  j                         y r   )r#   remover   r   )r
   s    r   r   z!XML_Tool.remove_preprocessed_file  s    
		$//&&'r   N)rJ   rK   rL   __doc__r   r   r   rM   r   r   r{   r{      s    D#2(r   r{   c                   @    e Zd ZdZd Zd Zd Zd Zd Zd Z	d Z
d	 Zy
)r2   zm
    A stream backed corpus view specialized for use with
    ann_segmentation.xml files in NKJP corpus.
    c                    d| _         t        |t        j                        | _        | j                  j	                          t        |d      | _        t        j                  | | j                  j                         | j                          y )Nz.*p/.*s)r,   zann_segmentation.xml)
rP   r6   r1   	text_viewr=   r{   xml_toolr   r   r   rQ   s      r   r   z%NKJPCorpus_Segmentation_View.__init__!  sh     -/::
 	##% +AB$--7794<<	
r   c                 N    |j                  d      d   j                  d      d   S )N(r   ,r   )r'   )r
   example_words     r   get_segm_idz(NKJPCorpus_Segmentation_View.get_segm_id/  s(    !!#&q)//4Q77r   c                 <    t        |j                  d      d         S )Nr   r   )intr'   )r
   beg_words     r   get_sent_begz)NKJPCorpus_Segmentation_View.get_sent_beg2  s    8>>#&q)**r   c                     |j                  d      d   j                  d      }t        |d         t        |d         z   S )N)r   r   r   r   )r'   r   )r
   end_wordsplitteds      r   get_sent_endz)NKJPCorpus_Segmentation_View.get_sent_end6  s=    >>#&q)//48A;#hqk"222r   c                     | j                  |d         }| j                  j                  |   }| j                  |d         }| j	                  |t        |      dz
           }||| S )Nr   r   )r   r   	segm_dictr   r   rW   )r
   	sent_segmidrZ   begends         r   get_sentencesz*NKJPCorpus_Segmentation_View.get_sentences;  sh    il+~~''+	!-	#i.1*< =>C}r   c                     g }d}d}|D ]S  }| j                  |      }| j                  |      |dz
  kD  s||k7  r"|j                  |       | j                  |      }|}U |S )Nr   )r   r   appendr   )r
   rZ   r   prev_txt_endprev_txt_nrwordtxt_nrs          r   remove_choicez*NKJPCorpus_Segmentation_View.remove_choiceC  sr    D%%d+F  &)99[F=R

4 #006 K  
r   c                    	 | j                          g }	 t        j                  | | j                        }t	        |      dk(  rn9|D ]3  }| j                  |      }|j                  | j                  |             5 h| j                          | j                  j                          |S # t        $ r&}| j                  j                          t        |d }~ww xY wrS   )rT   r   rU   rV   rW   r   r   r   rY   r   r   r   )r
   	sentencesr   rZ   r   s        r   r=   z)NKJPCorpus_Segmentation_View.handle_queryQ  s    	#JJLI)44T4<<H	y>Q&%D--d3D$$T%7%7%=> &	  JJLMM224 	#MM224"	#s   B&B) )	C2!CCc                 X    g }|D ]"  }|j                  |j                  d             $ |S )Ncorresp)r   get)r
   rq   rr   r   segs        r   ry   z'NKJPCorpus_Segmentation_View.handle_eltc  s+    CJJswwy)* 
r   N)rJ   rK   rL   r   r   r   r   r   r   r   r=   ry   rM   r   r   r2   r2     s/    

8+3
#$r   r2   c                   8    e Zd ZdZdZdZd Zd Zd
dZd Z	d	 Z
y)r6   za
    A stream backed corpus view specialized for use with
    text.xml files in NKJP corpus.
    r   r   c                     |j                  dd      | _        d| _        t               | _        t        |d      | _        t        j                  | | j                  j                         | j                         y )Nr,   r   z	.*/div/abztext.xml)
r.   r,   rP   dictr   r{   r   r   r   r   rQ   s      r   r   zNKJPCorpus_Text_View.__init__s  sX    JJvq)	" :6$--7794<<	
r   c                    	 | j                          | j                  | j                        }| j                          | j                  j                          |S # t        $ r&}| j                  j                          t        |d }~ww xY wr   )rT   rU   rV   rY   r   r   r   )r
   r   r   s      r   r=   z!NKJPCorpus_Text_View.handle_query~  sh    	#JJL-AJJLMM224H 	#MM224"	#s   AA 	B"!BBNc                     g }	 t        j                  | |      }t        |      dk(  rn|D ]  }|j                  |        >dj	                  |D cg c]  }| c}      gS c c}w )z6
        Returns text as a list of sentences.
        r   r   )r   rU   rW   r   r%   )r
   streamrP   elt_handlertxtrZ   parts          r   rU   zNKJPCorpus_Text_View.read_block  sm      ++D&9D4yA~

4  	  3/34$3/011/s   	A"c                 n    |j                   D ]&  }|j                  d      s|j                  |      c S  y )Nr   )attribendswithr   )r
   rq   attrs      r   r   z NKJPCorpus_Text_View.get_segm_id  s*    JJD}}T"wwt}$ r   c                     | j                   t        j                  u r(|j                  | j                  | j                  |      <   |j                  S r   )r,   r6   r1   r_   r   r   )r
   rq   rr   s      r   ry   zNKJPCorpus_Text_View.handle_elt  s;    99,77747HHDNN4++C01xxr   )NN)rJ   rK   rL   r   r1   r5   r   r=   rU   r   ry   rM   r   r   r6   r6   j  s,    
 JH	
	#2%
r   r6   c                   "    e Zd ZdZd Zd Zd Zy)r0   zm
    A stream backed corpus view specialized for use with
    ann_morphosyntax.xml files in NKJP corpus.
    c                     |j                  dd       | _        d| _        t        |d      | _        t        j                  | | j                  j                         | j                         y )Nr-   z	.*/seg/fszann_morphosyntax.xml)r.   r-   rP   r{   r   r   r   r   rQ   s      r   r   zNKJPCorpus_Morph_View.__init__  sP    JJvt,	" +AB$--7794<<	
r   c                 |   	 | j                          g }	 t        j                  | | j                        }t	        |      dk(  rn|D ]  }||j                  |        K| j                          | j                  j                          |S # t        $ r&}| j                  j                          t        |d }~ww xY wrS   )
rT   r   rU   rV   rW   r   rY   r   r   r   )r
   rE   rZ   r   r   s        r   r=   z"NKJPCorpus_Morph_View.handle_query  s    	#JJLE$//dllCt9> D'T* !	  JJLMM224L 	#MM224"	#s   A	B ?B 	B;!B66B;c                    d}d}d}| j                   d}|D ]6  }d|j                         v r6|j                  d   dk(  r$|D ]  }|j                  dk(  s|j                  }  Ld|j                         v s_|j                  d   dk(  sr|D ]  }d|j                         v s|j                  d   d	k(  s)|D ]  }d|j                         v s|j                  d   d
k(  s)|D ]f  }	d|	j                         v r*| j                   |	j                  d   | j                   v rd}?d|	j                         v sR|	j                  d   dk(  sed}h   9 |r|r|S y y )N FTr   orthstringinterpstypelexctagvalueinterp)r-   keysr   tagr_   )
r
   rq   rr   r   flagis_not_interpchildsymbolsymbol2symbol3s
             r   ry   z NKJPCorpus_Morph_View.handle_elt  sK   99DE%%,,v*>&*H#FzzX-%{{ $ 5::<'ELL,@I,M#F.6==3HE3Q'-G &',,. 8$+NN6$:f$D/6G(/7<<>(A,0II,A,3NN7,Ctyy,P/3(/7<<>(A,3NN7,Cx,O8= 07 (. $ 4 MK "4r   N)rJ   rK   rL   r   r   r=   ry   rM   r   r   r0   r0     s    

#$#r   r0   )r   r#   r   r   nltk.corpus.reader.utilr   nltk.corpus.reader.xmldocsr   r   r   r   r4   r{   r2   r6   r0   rM   r   r   <module>r      so     	 	  * ER
 R
j;
] ;
|%( %(PL= L^6= 6rCM Cr   