
    g                         d Z ddlZddlmZ ddl ddl ddl  G d d      Z G d de      Z	 G d	 d
e
      Zd Zy)a  
Read from the Senseval 2 Corpus.

SENSEVAL [http://www.senseval.org/]
Evaluation exercises for Word Sense Disambiguation.
Organized by ACL-SIGLEX [https://www.siglex.org/]

Prepared by Ted Pedersen <tpederse@umn.edu>, University of Minnesota,
https://www.d.umn.edu/~tpederse/data.html
Distributed with permission.

The NLTK version of the Senseval 2 files uses well-formed XML.
Each instance of the ambiguous words "hard", "interest", "line", and "serve"
is tagged with a sense identifier, and supplied with context.
    N)ElementTree)*c                       e Zd Zd Zd Zy)SensevalInstancec                 N    || _         t        |      | _        || _        || _        y N)wordtuplesensespositioncontext)selfr	   r   r   r   s        P/var/www/openai/venv/lib/python3.12/site-packages/nltk/corpus/reader/senseval.py__init__zSensevalInstance.__init__"   s"    	Fm     c           	      p    d| j                   d| j                  d| j                  d| j                  d	S )NzSensevalInstance(word=z, position=z
, context=z	, senses=))r	   r   r   r   )r   s    r   __repr__zSensevalInstance.__repr__(   s(    IIMMLLKK	
 	
r   N)__name__
__module____qualname__r   r    r   r   r   r   !   s    
r   r   c                       e Zd ZddZd Zy)SensevalCorpusReaderNc           
      ~    t        | j                  |d      D cg c]  \  }}t        ||       c}}      S c c}}w )NT)concatabspathsSensevalCorpusView)r   fileidsfileidencs       r   	instanceszSensevalCorpusReader.instances2   sG     &*]]7D%A%AMVS #63/%A
 	
s   9
c                    g }|j                  d      D ]h  }|j                  d      D ]R  }|d   j                  d   }|d   D cg c]  }|j                  |j                  d   f }}|j                  ||f       T j |S c c}w )Nlexeltinstancer   senseid   pos)findallattribtextappend)r   treeeltsr$   instsensewr   s           r   _entryzSensevalCorpusReader._entry:   s    ll8,Fz2Qy1>B1gFgAFFAHHUO4gFUG,- 3 -
  Gs   "Br   )r   r   r   r"   r2   r   r   r   r   r   1   s    
r   r   c                       e Zd Zd Zd Zd Zy)r   c                 r    t         j                  | ||       t               | _        dg| _        d g| _        y )N)encodingr   )StreamBackedCorpusViewr   WhitespaceTokenizer_word_tokenizer_lexelt_starts_lexelts)r   r    r5   s      r   r   zSensevalCorpusView.__init__E   s5    ''fx'H24 cr   c                    t         j                  | j                  |j                               dz
  }| j                  |   }g }d}	 |j                         }|dk(  r	|g k(  sJ g S |j                         j                  d      r|dz  }t        j                  d|      }|J |j                  d      dd }|t        | j                        k  r|| j                  |   k(  sFJ | j                  j                  |       | j                  j                  |j                                |j                         j                  d      r	|g k(  sJ d}|r|j                  |       |j                         j                  d	      rDd
j                  |      }t        |      }t        j                   |      }	| j#                  |	|      gS )Nr'   FT z<lexeltzitem=("[^"]+"|'[^']+')z	<instancez
</instance
)bisectbisect_rightr9   tellr:   readlinelstrip
startswithresearchgrouplenr,   join_fixXMLr   
fromstring_parse_instance)
r   stream
lexelt_numr$   instance_linesin_instancelinem	xml_blockr/   s
             r   
read_blockzSensevalCorpusView.read_blockL   s   (()<)<fkkmLqP
z*??$Drz%+++	 {{}''	2a
II94@}$}Ab)DMM 22!T]]:%>>>>MM((0''..v{{}= {{}''4%+++" %%d+ {{}''5 IIn5	#I.	"--i8,,T6:;;A r   c                 $   g }g }d }|D ]w  }|j                   dk(  r|j                  |j                  d          2|j                   dk(  r$|| j                  j	                  |j
                        z  }|D ]  }|j                   dk(  r|d   }|j                   dk(  r0|J d       |j
                  j                         st        |      dk(  sJ |j
                  j                         rt        |      dk(  rJ t        |      }|j
                  j                         r*|j                  |j
                  j                                n|d   j                   d	k(  rk|j                  |d   j
                  |d   j                  d
   f       |d   j                  r|| j                  j	                  |d   j                        z  }nnJ d       |j                   d	k(  r+|j                  |j
                  |j                  d
   f       n-|j                   dk(  rnt        d|j                          J d       |j                  s|| j                  j	                  |j                        z  } fJ d|j                   z          t        ||||      S )Nanswerr&   r   compoundr   headzhead specified twicer'   wfr(   zexpected CDATA or wf in <head>sACKz expected CDATA or <wf> or <head>zunexpected tag %s)tagr,   r*   r8   tokenizer+   striprH   tailprintr   )r   r%   r$   r   r   r   childcwords           r   rL   z"SensevalCorpusView._parse_instanceu   s*   EyyH$ell956i'4//88DD"EyyJ. %ayyF*'/G1GG/$zz//1SZ1_DD$)JJ$4$4$63u:?KK#&w< ::++-#NN5::+;+;+=>"1X\\T1#NNE!HMM58??5;Q+RS$Qx}} '4+?+?+H+Hq+W WJ*JJ5d*

ELL4G'HIc) eUYY/H&HHuzz4#7#7#@#@#LL; #> >1EII==uI J  '6BBr   N)r   r   r   r   rT   rL   r   r   r   r   r   D   s    '<R)Cr   r   c                    t        j                  dd|       } t        j                  dd|       } t        j                  dd|       } t        j                  dd|       } t        j                  d	d
|       } t        j                  dd|       } t        j                  dd|       } t        j                  dd|       } t        j                  dd|       } t        j                  dd|       } t        j                  dd|       } t        j                  dd|       } t        j                  dd|       } t        j                  dd|       } t        j                  dd|       } | S )z:
    Fix the various issues with Senseval pseudo-XML.
    z	<([~\^])>z\1z(\s+)\&(\s+)z	\1&amp;\2z"""z'"'z(<[^<]*snum=)([^">]+)>z\1"\2"/>z<\&frasl>\s*<p[^>]*>FRASLz
<\&I[^>]*>r<   z<{([^}]+)}>z	<(@|/?p)>z	<&\w+ \.>z<!DOCTYPE[^>]*>z<\[\/?[^>]+\]*>z
<(\&\w+;)>z&(?!amp|gt|lt|apos|quot)z'[ \t]*([^<>\s]+?)[ \t]*<p="([^"]*"?)"/>z <wf pos="\2">\1</wf>z\s*"\s*<p=\'"\'/>z <wf pos='"'>"</wf>)rE   sub)r+   s    r   rJ   rJ      s$   
 66,t,D66/<6D66&&$'D66+[$?D66)7D9D66-T*D66.%.D66,T*D66,T*D66$c40D66$c40D66--D66-sD9D6624LdD 66&(?FDKr   )__doc__rE   	xml.etreer   nltk.corpus.reader.apinltk.corpus.reader.utilnltk.tokenizer   CorpusReaderr   r6   r   rJ   r   r   r   <module>rl      sH     
 ! $ % 
 
 < &ZC/ ZCz#r   