
    gP                     <   d dl Z d dlmZ d dlmZ d dlmZ d dlm	Z	 d Z G d d      Z
d	 Z	 ddZ e j                  d      ZddZd Z	 ddZd Z e j                  de j&                        Z e j                  d      Zd Zg dd
fdZd Zedk(  r e        yy)    N)accuracy)map_tag)	str2tuple)Treec                     g }g }|D ]=  }| j                  |j                               }|t        |      z  }|t        |      z  }? t        ||      S )a|  
    Score the accuracy of the chunker against the gold standard.
    Strip the chunk information from the gold standard and rechunk it using
    the chunker, then compute the accuracy score.

    :type chunker: ChunkParserI
    :param chunker: The chunker being evaluated.
    :type gold: tree
    :param gold: The chunk structures to score the chunker on.
    :rtype: float
    )parseflattentree2conlltags	_accuracy)chunkergold	gold_tags	test_tags	gold_tree	test_trees         D/var/www/openai/venv/lib/python3.12/site-packages/nltk/chunk/util.pyr   r      s\     II	MM)"3"3"56	^I..	^I..	  Y	**    c                   f    e Zd ZdZd Zd Zd Zd Zd Zd Z	ddZ
d	 Zd
 Zd Zd Zd Zd Zd Zy)
ChunkScorea;  
    A utility class for scoring chunk parsers.  ``ChunkScore`` can
    evaluate a chunk parser's output, based on a number of statistics
    (precision, recall, f-measure, misssed chunks, incorrect chunks).
    It can also combine the scores from the parsing of multiple texts;
    this makes it significantly easier to evaluate a chunk parser that
    operates one sentence at a time.

    Texts are evaluated with the ``score`` method.  The results of
    evaluation can be accessed via a number of accessor methods, such
    as ``precision`` and ``f_measure``.  A typical use of the
    ``ChunkScore`` class is::

        >>> chunkscore = ChunkScore()           # doctest: +SKIP
        >>> for correct in correct_sentences:   # doctest: +SKIP
        ...     guess = chunkparser.parse(correct.leaves())   # doctest: +SKIP
        ...     chunkscore.score(correct, guess)              # doctest: +SKIP
        >>> print('F Measure:', chunkscore.f_measure())       # doctest: +SKIP
        F Measure: 0.823

    :ivar kwargs: Keyword arguments:

        - max_tp_examples: The maximum number actual examples of true
          positives to record.  This affects the ``correct`` member
          function: ``correct`` will not return more than this number
          of true positive examples.  This does *not* affect any of
          the numerical metrics (precision, recall, or f-measure)

        - max_fp_examples: The maximum number actual examples of false
          positives to record.  This affects the ``incorrect`` member
          function and the ``guessed`` member function: ``incorrect``
          will not return more than this number of examples, and
          ``guessed`` will not return more than this number of true
          positive examples.  This does *not* affect any of the
          numerical metrics (precision, recall, or f-measure)

        - max_fn_examples: The maximum number actual examples of false
          negatives to record.  This affects the ``missed`` member
          function and the ``correct`` member function: ``missed``
          will not return more than this number of examples, and
          ``correct`` will not return more than this number of true
          negative examples.  This does *not* affect any of the
          numerical metrics (precision, recall, or f-measure)

        - chunk_label: A regular expression indicating which chunks
          should be compared.  Defaults to ``'.*'`` (i.e., all chunks).

    :type _tp: list(Token)
    :ivar _tp: List of true positives
    :type _fp: list(Token)
    :ivar _fp: List of false positives
    :type _fn: list(Token)
    :ivar _fn: List of false negatives

    :type _tp_num: int
    :ivar _tp_num: Number of true positives
    :type _fp_num: int
    :ivar _fp_num: Number of false positives
    :type _fn_num: int
    :ivar _fn_num: Number of false negatives.
    c                    t               | _        t               | _        t               | _        t               | _        t               | _        |j                  dd      | _        |j                  dd      | _        |j                  dd      | _	        |j                  dd      | _
        d| _        d| _        d| _        d| _        d| _        d| _        d	| _        y )
Nmax_tp_examplesd   max_fp_examplesmax_fn_exampleschunk_labelz.*r   g        F)set_correct_guessed_tp_fp_fnget_max_tp_max_fp_max_fn_chunk_label_tp_num_fp_num_fn_num_count_tags_correct_tags_total_measuresNeedUpdate)selfkwargss     r   __init__zChunkScore.__init__r   s    555zz"3S9zz"3S9zz"3S9"JJ}d; #( r   c                 |   | j                   r| j                  | j                  z  | _        | j                  | j                  z
  | _        | j                  | j                  z
  | _        t        | j                        | _        t        | j
                        | _        t        | j                        | _	        d| _         y y )NF)
r-   r   r   r   r!   r    lenr'   r(   r)   r.   s    r   _updateMeasureszChunkScore._updateMeasures   s    ##}}t}}4DH}}t}}4DH}}t}}4DHtxx=DLtxx=DLtxx=DL',D$ $r   c           	         | xj                   t        || j                  | j                        z  c_         | xj                  t        || j                  | j                        z  c_        | xj                  dz  c_        d| _        	 t        |      }t        |      }| xj                  t        |      z  c_        | xj                  t        d t        ||      D              z  c_
        y# t        $ r dx}}Y ]w xY w)aU  
        Given a correctly chunked sentence, score another chunked
        version of the same sentence.

        :type correct: chunk structure
        :param correct: The known-correct ("gold standard") chunked
            sentence.
        :type guessed: chunk structure
        :param guessed: The chunked sentence to be scored.
           T c              3   2   K   | ]  \  }}||k(  sd   yw)r6   Nr7   ).0tgs      r   	<genexpr>z#ChunkScore.score.<locals>.<genexpr>   s      "
;&1aqAvA;s   N)r   
_chunksetsr*   r&   r   r-   r
   
ValueErrorr,   r2   r+   sumzip)r.   correctguessedcorrect_tagsguessed_tagss        r   scorezChunkScore.score   s     	GT[[$:K:KLLGT[[$:K:KLLq#' 	-)'2L)'2L 	C--c "
l;"
 
 	
  	- +-,L<		-s   C* *C:9C:c                 T    | j                   dk(  ry| j                  | j                   z  S )z
        Return the overall tag-based accuracy for all text that have
        been scored by this ``ChunkScore``, using the IOB (conll2000)
        tag encoding.

        :rtype: float
        r   r6   )r,   r+   r3   s    r   r   zChunkScore.accuracy   s,     q !!D$4$444r   c                 ~    | j                          | j                  | j                  z   }|dk(  ry| j                  |z  S )z
        Return the overall precision for all texts that have been
        scored by this ``ChunkScore``.

        :rtype: float
        r   )r4   r'   r(   r.   divs     r   	precisionzChunkScore.precision   ;     	llT\\)!8<<#%%r   c                 ~    | j                          | j                  | j                  z   }|dk(  ry| j                  |z  S )z
        Return the overall recall for all texts that have been
        scored by this ``ChunkScore``.

        :rtype: float
        r   r4   r'   r)   rH   s     r   recallzChunkScore.recall   rK   r   c                     | j                          | j                         }| j                         }|dk(  s|dk(  ryd||z  d|z
  |z  z   z  S )a  
        Return the overall F measure for all texts that have been
        scored by this ``ChunkScore``.

        :param alpha: the relative weighting of precision and recall.
            Larger alpha biases the score towards the precision value,
            while smaller alpha biases the score towards the recall
            value.  ``alpha`` should have a value in the range [0,1].
        :type alpha: float
        :rtype: float
        r   r6   )r4   rJ   rN   )r.   alphaprs       r   	f_measurezChunkScore.f_measure   sS     	NNKKM6Q!VEAIUa/00r   c                 |    | j                          t        | j                        }|D cg c]  }|d   	 c}S c c}w )z
        Return the chunks which were included in the
        correct chunk structures, but not in the guessed chunk
        structures, listed in input order.

        :rtype: list of chunks
        r6   )r4   listr!   r.   chunkscs      r   missedzChunkScore.missed   s9     	dhh$%f!f%%%   9c                 |    | j                          t        | j                        }|D cg c]  }|d   	 c}S c c}w )z
        Return the chunks which were included in the guessed chunk structures,
        but not in the correct chunk structures, listed in input order.

        :rtype: list of chunks
        r6   )r4   rU   r    rV   s      r   	incorrectzChunkScore.incorrect   s9     	dhh$%f!f%%%rZ   c                 \    t        | j                        }|D cg c]  }|d   	 c}S c c}w )z
        Return the chunks which were included in the correct
        chunk structures, listed in input order.

        :rtype: list of chunks
        r6   )rU   r   rV   s      r   rA   zChunkScore.correct   .     dmm$$%f!f%%%   )c                 \    t        | j                        }|D cg c]  }|d   	 c}S c c}w )z
        Return the chunks which were included in the guessed
        chunk structures, listed in input order.

        :rtype: list of chunks
        r6   )rU   r   rV   s      r   rB   zChunkScore.guessed  r^   r_   c                 T    | j                          | j                  | j                  z   S )NrM   r3   s    r   __len__zChunkScore.__len__  s!    ||dll**r   c                 6    dt        t        |             z   dz   S )z`
        Return a concise representation of this ``ChunkScoring``.

        :rtype: str
        z<ChunkScoring of z chunks>)reprr2   r3   s    r   __repr__zChunkScore.__repr__  s     #T#d)_4zAAr   c                     dd| j                         dz  ddz   d| j                         dz  ddz   d| j                         dz  ddz   d| j                         dz  dd	z   S )
a-  
        Return a verbose representation of this ``ChunkScoring``.
        This representation includes the precision, recall, and
        f-measure scores.  For other information about the score,
        use the accessor methods (e.g., ``missed()`` and ``incorrect()``).

        :rtype: str
        zChunkParse score:
z    IOB Accuracy: r   z5.1fz%%
z    Precision:    z    Recall:       z    F-Measure:    z%%)r   rJ   rN   rS   r3   s    r   __str__zChunkScore.__str__  s     "#DMMOc$9$#?tDF#DNN$4s$:4#@EG $DKKMC$7#=TBD $DNN$4s$:4#@C	E	
r   N)g      ?)__name__
__module____qualname____doc__r0   r4   rE   r   rJ   rN   rS   rY   r\   rA   rB   rb   re   rg   r7   r   r   r   r   3   sO    <|)&-
:
5&&1&
&	&&&+B
r   r   c                     d}g }| D ]{  }t        |t              rdt        j                  ||j	                               r#|j                  ||f|j                         f       |t        |j                               z  }w|dz  }} t        |      S )Nr   r6   )

isinstancer   rematchlabelappendfreezer2   leavesr   )r:   countr   posrW   childs         r   r=   r=   2  sy    
CFeT"xxU[[]3s|U\\^<=3u||~&&C1HC  v;r   Sc                    t        j                  d      }t        |g       g}|j                  |       D ]	  }|j	                         }	|	d   dk(  r]t        |      dk7  rt        d|j                         d      t        |g       }
|d   j                  |
       |j                  |
       y|	d   dk(  r<t        |      d	k7  rt        d
|j                         d      |j                          ||d   j                  |	       t        |	|      \  }}|r|rt        |||      }|d   j                  ||f        t        |      dk7  rt        dt        |       d      |d   S )aB  
    Divide a string of bracketted tagged text into
    chunks and unchunked tokens, and produce a Tree.
    Chunks are marked by square brackets (``[...]``).  Words are
    delimited by whitespace, and each word should have the form
    ``text/tag``.  Words that do not contain a slash are
    assigned a ``tag`` of None.

    :param s: The string to be converted
    :type s: str
    :param chunk_label: The label to use for chunk nodes
    :type chunk_label: str
    :param root_label: The label to use for the root of the tree
    :type root_label: str
    :rtype: Tree
    z\[|\]|[^\[\]\s]+r   [r6   zUnexpected [ at char d]   zUnexpected ] at char zExpected ] at char )rn   compiler   finditergroupr2   r>   startrq   popr   r   )sr   
root_labelsepsource_tagsettarget_tagsetWORD_OR_BRACKETstackro   textchunkwordtags                r   tagstr2treer   ?  sX   ( jj!45O*b!"E ))!,{{}7c>5zQ #8q8I!JKKb)E"IU#LL!W^5zQ #8q8I!JKKIIK{b	  &%dC0	c ]!-DCb	  $-' -* 5zQ.s1vaj9::8Or   z(\S+)\s+(\S+)\s+([IOB])-?(\S+)?c                 0   t        |g       g}t        | j                  d            D ]  \  }}|j                         st        j                  |      }|t        d|d      |j                         \  }}}	}
||
|vrd}	|	dk(  xr |
|d   j                         k7  }|	dv s|rt        |      dk(  r|j                          |	d	k(  s|r1t        |
g       }|d   j                  |       |j                  |       |d   j                  ||f        |d
   S )a*  
    Return a chunk structure for a single sentence
    encoded in the given CONLL 2000 style string.
    This function converts a CoNLL IOB string into a tree.
    It uses the specified chunk types
    (defaults to NP, PP and VP), and creates a tree rooted at a node
    labeled S (by default).

    :param s: The CoNLL string to be converted.
    :type s: str
    :param chunk_types: The chunk types to be converted.
    :type chunk_types: tuple
    :param root_label: The node label to use for the root.
    :type root_label: str
    :rtype: Tree
    
zError on line rz   OIr{   BOr}   Br   )r   	enumeratesplitstrip_LINE_REro   r>   groupsrp   r2   r   rq   )r   chunk_typesr   r   linenolinero   r   r   state
chunk_type
mismatch_Ir   s                r   conllstr2treer   u  s    $ *b!"E!!''$-0zz| t$=~fQZ899).&sE: "z'DE c\EjE"IOO4E&E
D=J5zQ		 C<:R(E"IU#LL 	b	$%9 1< 8Or   c                    g }| D ]V  }	 |j                         }d}|D ]<  }t        |t              rt        d      |j	                  |d   |d   ||z   f       d}> X |S # t
        $ r |j	                  |d   |d   df       Y w xY w)z
    Return a list of 3-tuples containing ``(word, tag, IOB-tag)``.
    Convert a tree to the CoNLL IOB tag format.

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: list(tuple)
    B-z7Tree is too deeply nested to be printed in CoNLL formatr   r6   I-r   )rp   rm   r   r>   rq   AttributeError)r:   tagsrv   categoryprefixcontentss         r   r
   r
     s     D	3{{}HF!h-$Q  Xa[(1+v7HIJ "	  K  	3KKq58S12	3s   AA  #BBc                 R   t        |g       }| D ]  \  }}}|!|rt        d      |j                  ||f       +|j                  d      r"|j                  t        |dd ||fg             ^|j                  d      rt	        |      dk(  s,t        |d   t               r|d   j                         |dd k7  r/|rt        d      |j                  t        |dd ||fg             |d   j                  ||f       |dk(  r|j                  ||f       	t        d	|       |S )
z1
    Convert the CoNLL IOB format to a tree.
    NzBad conll tag sequencer   r}   r   r   r{   r   zBad conll tag )r   r>   rq   
startswithr2   rm   rp   )sentencer   r   stricttreer   postagchunktags           r   conlltags2treer     s+    
BD"*fh !9:: T6N+  &KKXab\T6N+;<=  &D	Q!$r(D18>>#x|3$%=>> KKXab\T6N3C DERv/_KKv'~h\:;;3 #+4 Kr   c                 |    t        |       D cg c]  }dj                  |       }}dj                  |      S c c}w )z
    Return a multiline string where each line contains a word, tag and IOB tag.
    Convert a tree to the CoNLL IOB string format

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: str
     r   )r
   join)r:   tokenliness      r   tree2conllstrr     s;     +9*;<*;SXXe_*;E<99U =s   9a   <DOC>\s*(<DOCNO>\s*(?P<docno>.+?)\s*</DOCNO>\s*)?(<DOCTYPE>\s*(?P<doctype>.+?)\s*</DOCTYPE>\s*)?(<DATE_TIME>\s*(?P<date_time>.+?)\s*</DATE_TIME>\s*)?<BODY>\s*(<HEADLINE>\s*(?P<headline>.+?)\s*</HEADLINE>\s*)?<TEXT>(?P<text>.*?)</TEXT>\s*</BODY>\s*</DOC>\s*z#<b_\w+\s+[^>]*?type="(?P<type>\w+)"c                 z   t        |g       g}| g S t        j                  d|       D ]  }|j                         }	 |j	                  d      rdt
        j                  |      }|t        d|       t        |j                  d      g       }|d   j                  |       |j                  |       n6|j	                  d      r|j                          n|d   j                  |        t        |      d
k7  rt        d      |d   S # t        t        f$ r$}t        d|j                         dd	      |d }~ww xY w)Nz<[^>]+>|[^\s<]+z<b_XXXXtyper{   z<e_z$Bad IEER string (error at character rz   )r6   zBad IEER stringr   )r   rn   r   r   r   _IEER_TYPE_REro   printrq   r   
IndexErrorr>   r   r2   )r   r   r   piece_mpiecemr   es           r   _ieer_read_textr     s,   *b!"E 	y	;;115	&!''.9&%(QWWV_b1b	  'U#!!%(		
 b	  '! 6* 5zQ*++8O J' 	6w}}q6IK	s   B+DD:D55D:)	LOCATIONORGANIZATIONPERSONDURATIONDATECARDINALPERCENTMONEYMEASUREc           	         t         j                  |       }|rgt        |j                  d      |      |j                  d      |j                  d      |j                  d      t        |j                  d      |      dS t        | |      S )ap  
    Return a chunk structure containing the chunked tagged text that is
    encoded in the given IEER style string.
    Convert a string of chunked tagged text in the IEER named
    entity format into a chunk structure.  Chunks are of several
    types, LOCATION, ORGANIZATION, PERSON, DURATION, DATE, CARDINAL,
    PERCENT, MONEY, and MEASURE.

    :rtype: Tree
    r   docnodoctype	date_timeheadline)r   r   r   r   r   )_IEER_DOC_REro   r   r   )r   r   r   r   s       r   ieerstr2treer   '  s{    8 	1A#AGGFOZ@WWW%wwy)- (
(;ZH
 	
 q*--r   c                  .   d} dd l }|j                  j                  | d      }|j                          t	                d} t        | d      }|j                          t	        d       t	        |j                  j                  |             t	                y )	Nzd[ Pierre/NNP Vinken/NNP ] ,/, [ 61/CD years/NNS ] old/JJ ,/, will/MD join/VB [ the/DT board/NN ] ./.r   NP)r   av  
These DT B-NP
research NN I-NP
protocols NNS I-NP
offer VBP B-VP
to TO B-PP
the DT B-NP
patient NN I-NP
not RB O
only RB O
the DT B-NP
very RB I-NP
best JJS I-NP
therapy NN I-NP
which WDT B-NP
we PRP B-NP
have VBP B-VP
established VBN I-VP
today NN B-NP
but CC B-NP
also RB I-NP
the DT B-NP
hope NN I-NP
of IN B-PP
something NN B-NP
still RB B-ADJP
better JJR I-ADJP
. . O
)r   PP)r   zCoNLL output:)nltkr   r   pprintr   r   r   )r   r   r:   
conll_trees       r   demor   R  sx    nA

qd3AHHJ	G	A< ql;J 
/	$**
"
":
./	Gr   __main__)r   rw   /NN)r   r   VPrw   )r   rw   F)rn   nltk.metricsr   r   nltk.tag.mappingr   nltk.tag.utilr   	nltk.treer   r   r=   r   r~   r   r   r
   r   r   DOTALLr   r   r   r   r   rh   r7   r   r   <module>r      s    
 . $ # +<z
 z
~
 UY.f 2::892j8 FK!H
 rzz II
 

ABD
 (.V,^ zF r   