
    g                     T    d Z ddlmZ ddlmZ ddlmZ  G d d      Zd Zd Z	dd
Z
y	)a  
Simple classifier for RTE corpus.

It calculates the overlap in words and named entities between text and
hypothesis, and also whether there are words / named entities in the
hypothesis which fail to occur in the text, since this is an indicator that
the hypothesis is more informative than (i.e not entailed by) the text.

TO DO: better Named Entity classification
TO DO: add lemmatization
    )MaxentClassifier)accuracy)RegexpTokenizerc                   H    e Zd ZdZddZd	dZd
dZed        Zed        Z	y)RTEFeatureExtractorz
    This builds a bag of words for both the text and the hypothesis after
    throwing away some stopwords, then calculates overlap and difference.
    c                 H   || _         h d| _        h d| _        t        d      }|j	                  |j
                        | _        |j	                  |j                        | _        t        | j                        | _
        t        | j                        | _        |r\| j                  D ch c]  }| j                  |       c}| _
        | j                  D ch c]  }| j                  |       c}| _        | j                   r<| j                  | j                  z
  | _
        | j                  | j                  z
  | _        | j                  | j                  z  | _        | j                  | j                  z
  | _        | j                  | j                  z
  | _        yc c}w c c}w )z
        :param rtepair: a ``RTEPair`` from which features should be extracted
        :param stop: if ``True``, stopwords are thrown away.
        :type stop: bool
        >   ,.ainisitoftoandarethehavetheyverywere>   nonotneverdeniedfailedrejectedz[\w.@:/]+|\w+|\$[\d.]+N)stop	stopwordsnegwordsr   tokenizetexttext_tokenshyp
hyp_tokensset
text_words	hyp_words
_lemmatize_overlap
_hyp_extra
_txt_extra)selfrtepairr   use_lemmatize	tokenizertokens         O/var/www/openai/venv/lib/python3.12/site-packages/nltk/classify/rte_classify.py__init__zRTEFeatureExtractor.__init__   s=    	
$ O $$=>	 %--gll;#,,W[[9d../T__-CGCSCSTCS%tu5CSTDOBF//R/dooe4/RDN99"oo>DO!^^dnn<DN8..4??://DNN: URs   )FFc                    | j                   D ch c]  }| j                  |      s| }}|dk(  r|rt        d|       |S |dk(  r*|rt        d| j                   |z
         | j                   |z
  S t        d|z        c c}w )z
        Compute the overlap between text and hypothesis.

        :param toktype: distinguish Named Entities from ordinary words
        :type toktype: 'ne' or 'word'
        nez
ne overlapwordzword overlapzType not recognized:'%s')r*   _neprint
ValueError)r-   toktypedebugr1   
ne_overlaps        r2   overlapzRTEFeatureExtractor.overlapO   s     *.J$((5/e
Jd?lJ/ndmmj&@A==:--7'ABB Ks
   A>A>c                     | j                   D ch c]  }| j                  |      s| }}|dk(  r|S |dk(  r| j                   |z
  S t        d|z        c c}w )z
        Compute the extraneous material in the hypothesis.

        :param toktype: distinguish Named Entities from ordinary words
        :type toktype: 'ne' or 'word'
        r5   r6   zType not recognized: '%s')r+   r7   r9   )r-   r:   r;   r1   ne_extras        r2   	hyp_extrazRTEFeatureExtractor.hyp_extrab   s`     (,Je$((5/EJd?O??X--87BCC Ks
   AAc                 F    | j                         s| j                         ryy)zz
        This just assumes that words in all caps or titles are
        named entities.

        :type token: str
        TF)istitleisupper)r1   s    r2   r7   zRTEFeatureExtractor._neq   s     ==?emmo    c                 T    ddl m} |j                  | |j                        }||S | S )zI
        Use morphy from WordNet to find the base form of verbs.
        r   )wordnet)pos)nltk.corpusrF   morphyVERB)r6   wnlemmas      r2   r)   zRTEFeatureExtractor._lemmatize}   s-    
 	.		$BGG	,LrD   N)TF)F)T)
__name__
__module____qualname____doc__r3   r=   r@   staticmethodr7   r)    rD   r2   r   r      sA    
.;`C&D 	 	 	 	rD   r   c                    t        |       }i }d|d<   t        |j                  d            |d<   t        |j                  d            |d<   t        |j                  d            |d<   t        |j                  d            |d<   t        |j                  |j
                  z        |d	<   t        |j                  |j                  z        |d
<   |S )NTalwaysonr6   word_overlapword_hyp_extrar5   r<   ne_hyp_extraneg_txtneg_hyp)r   lenr=   r@   r    r'   r(   )r.   	extractorfeaturess      r2   rte_featuresr]      s    #G,IHHZ"9#4#4V#<=H^!$Y%8%8%@!AH !2!24!89H\"9#6#6t#<=H^i0093G3GGHHYi0093F3FFGHYOrD   c                 V    | D cg c]  }t        |      |j                  f c}S c c}w N)r]   value)	rte_pairspairs     r2   rte_featurizerc      s(    9BC\$,CCCs   &Nc                    ddl m} |j                  g d      }|j                  g d      }|
|d | }|d | }t        |      }t        |      }t	        d       | dv rt        j                  ||       }n1| dv rt        j                  ||       }nt        d      }t        |      t	        d	       t        ||      }	t	        d
|	z         |S )Nr   )rte)zrte1_dev.xmlzrte2_dev.xmlzrte3_dev.xml)zrte1_test.xmlzrte2_test.xmlzrte3_test.xmlzTraining classifier...)megam)GISIISzFRTEClassifier only supports these algorithms:
 'megam', 'GIS', 'IIS'.
zTesting classifier...zAccuracy: %6.4f)
rH   re   pairsrc   r8   r   trainstr	Exceptionr   )
	algorithmsample_N
rte_corpus	train_settest_setfeaturized_train_setfeaturized_test_setclferr_msgaccs
             r2   rte_classifierrw      s    -  !QRI STHix(	IX&(3'1 

"#I$$%99E	n	$$$%99E'
   	
!"
3+
,C	
c
!"JrD   r_   )rP   nltk.classify.maxentr   nltk.classify.utilr   nltk.tokenizer   r   r]   rc   rw   rR   rD   r2   <module>r{      s2   
 2 ' )n nb
DrD   