
    g.                     6   d dl  d dlmZ ej	                  d      Zej	                  d      Zej	                  d      Zej	                  d      Zej	                  d      Z	ej	                  d      Z
ej	                  d	      Z G d
 de      Z G d dee      Zy)    )*)XMLCorpusReaderz<p(?: [^>]*){0,1}>(.*?)</p>z<s(?: [^>]*){0,1}>(.*?)</s>z#<([wc](?: [^>]*){0,1}>)(.*?)</[wc]>z!<[wc](?: [^>]*){0,1}>(.*?)</[wc]>ztype="(.*?)"zana="(.*?)"ztext id="(.*?)"c                   *    e Zd Z	 	 	 ddZdZd Zd Zy)TEICorpusViewNc                 l    || _         || _        || _        || _        t        j                  | ||       y )N)startpos)_tagged_textids_group_by_sent_group_by_paraStreamBackedCorpusView__init__)selfcorpus_filetaggedgroup_by_sentgroup_by_paratagsethead_lentextidss           N/var/www/openai/venv/lib/python3.12/site-packages/nltk/corpus/reader/pl196x.pyr   zTEICorpusView.__init__   s7     ++''kH'M    i   c           
      d   |j                  | j                        }t        |      }|j                  d      |j                  d      kD  s|j                  d      dk(  r]|j	                         }t        |      dk  rn>||z  }|j                  d      |j                  d      kD  rH|j                  d      dk(  r]|j                  dd      }t        j                  |      }| j                  rX|D ]S  }|| j                  vs|j                  |      dz
  }||d  j                  d      t        d      z   }|d | |||z   d  z   }U g }t        j                  |      D ]  }	g }
t        j                  |	      D ]  }| j                  st        j                  |      }n2t        t!        | j"                  t$        j                  |                  }| j&                  r|
j)                  |       u|
j+                  |        | j,                  r|j)                  |
       |j+                  |
        |S )Nz<text idz</text>r   
    )	readlines	_pagesizeconcatcountreadlinelenreplaceTEXTIDfindallr
   findPARASENTr	   WORDlistmap
_parse_tag
TAGGEDWORDr   appendextendr   )r   streamblocktmpr   tidbegendoutputpara_strparasent_strsents                r   
read_blockzTEICorpusView.read_block,   s     0u{{:&Y)??EKKE
E //#C3x1}SLE {{:&Y)??EKKE
E dB'..'==dmm+**S/A-C+**95IFC!$3K%c	*<<E	  U+HD LL2||<<1DDOOZ5G5G5Q RSD&&KK%KK% 3 ""d#d# , r   c                     |\  }}|j                  d      r(t        j                  |      j                  d      }||fS t        j                  |      j                  d      }||fS )Nwr   )
startswithANAsearchgroupTYPE)r   tag_word_tupletagwords       r   r,   zTEICorpusView._parse_tagS   s`    $d>>#**S/''*C Sy ++c"((+CSyr   )Nr   N)__name__
__module____qualname__r   r   r;   r,    r   r   r   r      s%     N$ I%Nr   r   c                   p    e Zd ZdZd Zd Zd ZddZd ZddZ	dd	Z
dd
ZddZddZddZddZddZy)Pl196xCorpusReaderi
  c                     d|v r|d   | _         nd | _         t        j                  | g|  t        j                  | |       | j	                          y )Ntextid_file)r
   r   r   CategorizedCorpusReader_init_textids)r   argskwargss      r   r   zPl196xCorpusReader.__init___   sL    F""=1DM DM  --((v6r   c           	         t        t              | _        t        t              | _        | j                  t        | j                        5 }|D ]  }|j                         }|j                  dd      \  }}|| j                         vrt        d| j                  d|d      |j                  | j                        D ]  }| j                  ||         	 d d d        y y # 1 sw Y   y xY w)N r   zIn text_id mapping file z: z
 not found)defaultdictr*   _f2t_t2fr
   openstripsplitfileids
ValueError
_delimiter_add_textids)r   fplinefile_idtext_idstext_ids         r   rO   z Pl196xCorpusReader._init_textidsj   s    %	%	==$dmm$D::<D(,

3(:%GXdlln4(#}}g7  $,>>$//#B))'7; $C  %$ %$$s   
BC""C+c                 |    | j                   |   j                  |       | j                  |   j                  |       y N)rU   r.   rV   )r   r`   rb   s      r   r]   zPl196xCorpusReader._add_textidsz   s0    		'!!'*		'!!'*r   Nc           
      l    d }t        t        t        d |||f                  dk7  rt        d      ||d fS | j	                  |      d fS |dt        |t              r|g}t         fd|D        g       }t               }|D ])  }t         j                  |         t        |      z  ||<   + ||fS y )Nc                 
    | d u S rd   rI   )accessors    r   <lambda>z-Pl196xCorpusReader._resolve.<locals>.<lambda>   s	    T)9r   r   z6Specify exactly one of: fileids, categories or textidsc              3   <   K   | ]  }j                   |     y wrd   )rV   ).0tr   s     r   	<genexpr>z.Pl196xCorpusReader._resolve.<locals>.<genexpr>   s     7w!1w   )r"   r*   filterr[   rZ   
isinstancestrsumdictsetrU   )r   rZ   
categoriesr   r2   filestdictfs   `       r   _resolvezPl196xCorpusReader._resolve~   s    9 *g6  K  D= !<<
+T11'3'")7w7<EFEtyy|,s7|;a %< r   c                     |S rd   rI   )r   rD   s     r   
decode_tagzPl196xCorpusReader.decode_tag   s    
r   c                       j                  ||      \  }}|t         j                        S t        |t              r|g}t        t         fd|D        g             S )an  
        In the pl196x corpus each category is stored in single
        file and thus both methods provide identical functionality. In order
        to accommodate finer granularity, a non-standard textids() method was
        implemented. All the main functions can be supplied with a list
        of required chunks---giving much more control to the user.
        c              3   <   K   | ]  }j                   |     y wrd   )rU   )rj   dr   s     r   rl   z-Pl196xCorpusReader.textids.<locals>.<genexpr>   s     9A499Q<rm   )rx   sortedrV   ro   rp   rq   r   rZ   rt   _s   `   r   r   zPl196xCorpusReader.textids   sV     ]]7J7
?$))$$gs#iGc992>??r   c                    | j                  |||      \  }}|| j                  }nt        |t              r|g}|rDt	        |D cg c]/  }t        | j                  |      ddd| j                  ||         1 c}      S t	        |D cg c]+  }t        | j                  |      ddd| j                        - c}      S c c}w c c}w )NFr   r   r   rx   _fileidsro   rp   r   r   abspathr   r   rZ   rt   r   fileids        r   wordszPl196xCorpusReader.words   s    ==*gF?mmG%iG #*
 #* "V,!% ' #*
   #*	 #* "V,!% #*	 
	   4B?0Cc                    | j                  |||      \  }}|| j                  }nt        |t              r|g}|rDt	        |D cg c]/  }t        | j                  |      ddd| j                  ||         1 c}      S t	        |D cg c]+  }t        | j                  |      ddd| j                        - c}      S c c}w c c}w NFTr   r   r   r   s        r   sentszPl196xCorpusReader.sents   s    ==*gF?mmG%iG #*
 #* "V,!% ' #*
  
 #*	 #* "V,eT54== #*	 
r   c                    | j                  |||      \  }}|| j                  }nt        |t              r|g}|rDt	        |D cg c]/  }t        | j                  |      ddd| j                  ||         1 c}      S t	        |D cg c]+  }t        | j                  |      ddd| j                        - c}      S c c}w c c}w r   r   r   s        r   paraszPl196xCorpusReader.paras   s    ==*gF?mmG%iG #*
 #* "V,!% ' #*
  
 #*	 #* "V,eT4$-- #*	 
r   c                    | j                  |||      \  }}|| j                  }nt        |t              r|g}|rDt	        |D cg c]/  }t        | j                  |      ddd| j                  ||         1 c}      S t	        |D cg c]+  }t        | j                  |      ddd| j                        - c}      S c c}w c c}w NTFr   r   r   r   s        r   tagged_wordszPl196xCorpusReader.tagged_words  s    ==*gF?mmG%iG #*
 #* "V,!% ' #*
  
 #*	 #* "V,dE54== #*	 
r   c                    | j                  |||      \  }}|| j                  }nt        |t              r|g}|rDt	        |D cg c]/  }t        | j                  |      ddd| j                  ||         1 c}      S t	        |D cg c]+  }t        | j                  |      ddd| j                        - c}      S c c}w c c}w r   r   r   s        r   tagged_sentszPl196xCorpusReader.tagged_sents2  s    ==*gF?mmG%iG #*
 #* "V,!% ' #*
  
 #*	 #* "V,dD%$-- #*	 
r   c                    | j                  |||      \  }}|| j                  }nt        |t              r|g}|rDt	        |D cg c]/  }t        | j                  |      ddd| j                  ||         1 c}      S t	        |D cg c]+  }t        | j                  |      ddd| j                        - c}      S c c}w c c}w )NTr   r   r   r   s        r   tagged_paraszPl196xCorpusReader.tagged_parasQ  s    ==*gF?mmG%iG #*
 #* "V,!% ' #*
  
 #*	 #* "V,dD$ #*	 
r   c                     | j                  ||      \  }}t        |      dk(  rt        j                  | |d         S t	        d      )Nr   r   zExpected a single file)rx   r"   r   xml	TypeErrorr   s       r   r   zPl196xCorpusReader.xmlp  sE    ]]7J7
w<1"&&tWQZ88455r   rd   )NN)NNN)rF   rG   rH   r   r   rO   r]   rx   rz   r   r   r   r   r   r   r   r   rI   r   r   rK   rK   \   sK    H	< + @@ !F>>>>>6r   rK   N)nltk.corpus.reader.apinltk.corpus.reader.xmldocsr   recompiler'   r(   r-   r)   rB   r?   r$   r   r   rN   rK   rI   r   r   <module>r      s    % 6	zz01	zz01ZZ>?
	zz67	zz/"jj 	&	'B* BJY60/ Y6r   