
    g>                         d Z ddlZddlmZmZ ddlmZ d Zej                  Z
d ZdZ	 ddlmZ dZ	 d
Z	 dZ	  G d de      Z G d de      Z G d de      Z G d de      Z G d d      Zy# e$ r d	 ZY Kw xY w)z
Provides scoring functions for a number of association measures through a
generic, abstract implementation in ``NgramAssocMeasures``, and n-specific
``BigramAssocMeasures`` and ``TrigramAssocMeasures``.
    N)ABCMetaabstractmethodreducec                 ,    t        j                  |       S N)_mathlog2)xs    M/var/www/openai/venv/lib/python3.12/site-packages/nltk/metrics/association.py<lambda>r      s    %**Q-    c                     t        d |       S )Nc                     | |z  S r    )r   ys     r   r   z<lambda>.<locals>.<lambda>   s    Qr   r   )ss    r   r   r      s    V.2r   g#B;)fisher_exactc                      t         r   NotImplementedError)_args_kwargss     r   r   r      s    !!r   c                       e Zd ZdZdZeed               Zeed               Ze	d        Z
ed        Ze	d        Ze	d        Zed	        Ze	d
        Ze	d        Ze	d        Ze	d        Zy)NgramAssocMeasuresa  
    An abstract class defining a collection of generic association measures.
    Each public method returns a score, taking the following arguments::

        score_fn(count_of_ngram,
                 (count_of_n-1gram_1, ..., count_of_n-1gram_j),
                 (count_of_n-2gram_1, ..., count_of_n-2gram_k),
                 ...,
                 (count_of_1gram_1, ..., count_of_1gram_n),
                 count_of_total_words)

    See ``BigramAssocMeasures`` and ``TrigramAssocMeasures``

    Inheriting classes should define a property _n, and a method _contingency
    which calculates contingency values from marginals in order for all
    association measures defined here to be usable.
    r   c                      t        d      )z>Calculates values of a contingency table from marginal values.?The contingency table is not availablein the general ngram caser   	marginalss    r   _contingencyzNgramAssocMeasures._contingencyB        "P
 	
r   c                      t        d      )ACalculates values of contingency table marginals from its values.r   r   )contingencys    r   
_marginalszNgramAssocMeasures._marginalsJ   r#   r   c              #      K   t              }t         j                        D cg c]  }d|z  	 }}t        t                    D ]-  t	         fd|D              | j                  dz
  z  z   / yc c}w w)3Calculates expected values for a contingency table.   c              3   x   K   | ]0  t        fd t        dj                  z        D               2 yw)c              3   @   K   | ]  }|z  z  k(  s|     y wr   r   ).0r   contijs     r   	<genexpr>z@NgramAssocMeasures._expected_values.<locals>.<genexpr>.<genexpr>]   s'     P)9Aa!eQ=OQ)9s   
   N)sumrange_n)r-   r0   clsr.   r/   s    @r   r1   z6NgramAssocMeasures._expected_values.<locals>.<genexpr>\   s1      ! Pq#&&y)9PP!s   6:N)r3   r4   r5   len_product)r6   r.   n_allr/   bitss   `` ` r   _expected_valuesz#NgramAssocMeasures._expected_valuesR   s      D	 %cff.1Q. s4y!A  !  SVVaZ(	* " /s   #B A;AB c                  (    | t            | t           z  S )z Scores ngrams by their frequency)NGRAMTOTALr    s    r   raw_freqzNgramAssocMeasures.raw_freqc   s     )E"222r   c                     |t            t        |t                 |t           | j                  dz
  z  z  z
  |t            t
        z   dz  z  S )zScores ngrams using Student's t test with independence hypothesis
        for unigrams, as in Manning and Schutze 5.3.1.
        r*   g      ?)r=   r8   UNIGRAMSr>   r5   _SMALLr6   r!   s     r   	student_tzNgramAssocMeasures.student_th   sR     ey*+y/?CFFQJ/OPQu&3./ 	/r   c                 z     | j                   | }| j                  |      }t        d t        ||      D              S )zZScores ngrams using Pearson's chi-square as in Manning and Schutze
        5.3.3.
        c              3   F   K   | ]  \  }}||z
  d z  |t         z   z    yw)r2   N)rB   r-   obsexps      r   r1   z,NgramAssocMeasures.chi_sq.<locals>.<genexpr>y   s'     U_cC#I!#sV|4_s   !)r"   r;   r3   zip)r6   r!   r.   expss       r   chi_sqzNgramAssocMeasures.chi_sqr   s=    
  s+##D)USt_UUUr   c                  `    | t            |j                  dd      z  t        | t                 z  S )zScores ngrams using a variant of mutual information. The keyword
        argument power sets an exponent (default 3) for the numerator. No
        logarithm of the result is calculated.
        power   )r=   getr8   rA   )r!   kwargss     r   mi_likezNgramAssocMeasures.mi_like{   s5     6::gq#99Hh=
 
 	
r   c                     t        |t           |t           | j                  dz
  z  z        t        t	        |t
                       z
  S )z^Scores ngrams by pointwise mutual information, as in Manning and
        Schutze 5.4.
        r*   )_log2r=   r>   r5   r8   rA   rC   s     r   pmizNgramAssocMeasures.pmi   sG    
 Yu%	%(8SVVaZ(HHIEYx()M
 
 	
r   c           
      |     | j                   | }dt        d t        || j                  |            D              z  S )zFScores ngrams using likelihood ratios as in Manning and Schutze 5.3.4.r2   c              3   `   K   | ]&  \  }}|t        ||t        z   z  t        z         z   ( y wr   )_lnrB   rG   s      r   r1   z6NgramAssocMeasures.likelihood_ratio.<locals>.<genexpr>   s4      
AS #cS6\*V344As   ,.)r"   r3   rJ   r;   r6   r!   r.   s      r   likelihood_ratioz#NgramAssocMeasures.likelihood_ratio   sI      s+3 
c&:&:4&@A
 
 
 	
r   c                     t        |t                 |t           | j                  dz
  z  z  }|t           t        |t           |z        dz
  z  S )z1Scores ngrams using the Poisson-Stirling measure.r*   )r8   rA   r>   r5   r=   rT   )r6   r!   rI   s      r   poisson_stirlingz#NgramAssocMeasures.poisson_stirling   sN     y*+y/?CFFQJ/OP55)9C)?#@1#DEEr   c                 H     | j                   | }|d   t        |dd       z  S )z&Scores ngrams using the Jaccard index.r   Nr   )r"   r3   rY   s      r   jaccardzNgramAssocMeasures.jaccard   s/      s+AwT#2Y''r   N)__name__
__module____qualname____doc__r5   staticmethodr   r"   r'   classmethodr;   r?   rD   rL   rR   rU   rZ   r\   r^   r   r   r   r   r   -   s    $ 
B
  
 
  
    3 3 / / V V 
 
 
 
 
 
 F F
 ( (r   r   )	metaclassc                       e Zd ZdZdZed        Zed        Zed        Ze	d        Z
e	d        Ze	d        Zed	        Zy
)BigramAssocMeasuresa  
    A collection of bigram association measures. Each association measure
    is provided as a function with three arguments::

        bigram_score_fn(n_ii, (n_ix, n_xi), n_xx)

    The arguments constitute the marginals of a contingency table, counting
    the occurrences of particular events in a corpus. The letter i in the
    suffix refers to the appearance of the word in question, while x indicates
    the appearance of any word. Thus, for example:

    - n_ii counts ``(w1, w2)``, i.e. the bigram being scored
    - n_ix counts ``(w1, *)``
    - n_xi counts ``(*, w2)``
    - n_xx counts ``(*, *)``, i.e. any bigram

    This may be shown with respect to a contingency table::

                w1    ~w1
             ------ ------
         w2 | n_ii | n_oi | = n_xi
             ------ ------
        ~w2 | n_io | n_oo |
             ------ ------
             = n_ix        TOTAL = n_xx
    r2   c                 >    |\  }}|| z
  }|| z
  }| |||| z
  |z
  |z
  fS )zECalculates values of a bigram contingency table from marginal values.r   )n_iin_ix_xi_tuplen_xxn_ixn_xin_oin_ios          r   r"   z BigramAssocMeasures._contingency   s<     %td{d{dD$+"4t";<<r   c                 .    | || z   || z   f||z   |z   | z   fS )r%   r   )ri   rn   ro   n_oos       r   r'   zBigramAssocMeasures._marginals   s,     td{D4K0$+2Dt2KLLr   c              #      K   t        |       }t        d      D ]$  }| |   | |dz     z   | |   | |dz     z   z  |z   & yw)r)      r*   r2   N)r3   r4   )r.   rk   r/   s      r   r;   z$BigramAssocMeasures._expected_values   sQ      4yqA7T!a%[(T!WtAE{-BCdJJ s   ?Ac                 v     | j                   | \  }}}}||z  ||z  z
  dz  ||z   ||z   z  ||z   z  ||z   z  z  S )zdScores bigrams using phi-square, the square of the Pearson correlation
        coefficient.
        r2   )r"   )r6   r!   ri   ro   rn   rq   s         r   phi_sqzBigramAssocMeasures.phi_sq   s`    
 "2!1!19!=dD$tdTk)a/D[TD[)TD[9TD[I
 	
r   c                 <    |\  }}|| j                  |||f|      z  S )zScores bigrams using chi-square, i.e. phi-sq multiplied by the number
        of bigrams, as in Manning and Schutze 5.3.3.
        )ru   )r6   ri   rj   rk   rl   rm   s         r   rL   zBigramAssocMeasures.chi_sq   s)    
 %tcjjd|T:::r   c                 Z     | j                   | \  }}}}t        ||g||ggd      \  }}|S )zScores bigrams using Fisher's Exact Test (Pedersen 1996).  Less
        sensitive to small counts than PMI or Chi Sq, but also more expensive
        to compute. Requires scipy.
        less)alternative)r"   r   )r6   r!   ri   ro   rn   rq   oddspvalues           r   fisherzBigramAssocMeasures.fisher   sB     "2!1!19!=dD$%d|dD\&BPVWvr   c                 "    |\  }}d| z  ||z   z  S )z(Scores bigrams using Dice's coefficient.r2   r   )ri   rj   rk   rl   rm   s        r   dicezBigramAssocMeasures.dice   s      %t4x4$;''r   N)r_   r`   ra   rb   r5   rc   r"   r'   r;   rd   ru   rL   r|   r~   r   r   r   rg   rg      s    6 
B= = M M K K 
 
 ; ; 	 	 ( (r   rg   c                   4    e Zd ZdZdZed        Zed        Zy)TrigramAssocMeasuresa  
    A collection of trigram association measures. Each association measure
    is provided as a function with four arguments::

        trigram_score_fn(n_iii,
                         (n_iix, n_ixi, n_xii),
                         (n_ixx, n_xix, n_xxi),
                         n_xxx)

    The arguments constitute the marginals of a contingency table, counting
    the occurrences of particular events in a corpus. The letter i in the
    suffix refers to the appearance of the word in question, while x indicates
    the appearance of any word. Thus, for example:

    - n_iii counts ``(w1, w2, w3)``, i.e. the trigram being scored
    - n_ixx counts ``(w1, *, *)``
    - n_xxx counts ``(*, *, *)``, i.e. any trigram
    rO   c                     |\  }}}|\  }}}	|| z
  }
|| z
  }|| z
  }|	| z
  |
z
  |z
  }|| z
  |
z
  |z
  }|| z
  |z
  |z
  }|| z
  |
z
  |z
  |z
  |z
  |z
  |z
  }| |
||||||fS )zCalculates values of a trigram contingency table (or cube) from
        marginal values.
        >>> TrigramAssocMeasures._contingency(1, (1, 1, 1), (1, 73, 1), 2000)
        (1, 0, 0, 0, 0, 72, 0, 1927)
        r   )n_iiin_iix_tuplen_ixx_tuplen_xxxn_iixn_ixin_xiin_ixxn_xixn_xxin_oiin_ioin_iion_ooin_oion_ioon_ooos                    r   r"   z!TrigramAssocMeasures._contingency  s     !,u +u%-%-%-%-5=EMueUE5%GGr   c                      | \  }}}}}}}}|||z   ||z   ||z   f||z   |z   |z   ||z   |z   |z   ||z   |z   |z   ft        |       fS )zCalculates values of contingency table marginals from its values.
        >>> TrigramAssocMeasures._marginals(1, 0, 0, 0, 0, 72, 0, 1927)
        (1, (1, 1, 1), (1, 73, 1), 2000)
        r3   )	r&   r   r   r   r   r   r   r   r   s	            r   r'   zTrigramAssocMeasures._marginals&  s     BM>ueUE5%U]EEM55=9%-%-%-
 	
 		
r   Nr_   r`   ra   rb   r5   rc   r"   r'   r   r   r   r   r      s6    & 
BH H$ 
 
r   r   c                   4    e Zd ZdZdZed        Zed        Zy)QuadgramAssocMeasuresaF  
    A collection of quadgram association measures. Each association measure
    is provided as a function with five arguments::

        trigram_score_fn(n_iiii,
                        (n_iiix, n_iixi, n_ixii, n_xiii),
                        (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix),
                        (n_ixxx, n_xixx, n_xxix, n_xxxi),
                        n_all)

    The arguments constitute the marginals of a contingency table, counting
    the occurrences of particular events in a corpus. The letter i in the
    suffix refers to the appearance of the word in question, while x indicates
    the appearance of any word. Thus, for example:

    - n_iiii counts ``(w1, w2, w3, w4)``, i.e. the quadgram being scored
    - n_ixxi counts ``(w1, *, *, w4)``
    - n_xxxx counts ``(*, *, *, *)``, i.e. any quadgram
    rs   c                    |\  }}}}|\  }	}
}}}}|\  }}}}|| z
  }|| z
  }|| z
  }|| z
  |z
  |z
  }|| z
  |z
  |z
  }|| z
  |z
  |z
  }|| z
  |z
  |z
  |z
  |z
  |z
  |z
  }|| z
  }|| z
  |z
  |z
  }|
| z
  |z
  |z
  }|| z
  |z
  |z
  |z
  |z
  |z
  |z
  }|	| z
  |z
  |z
  }|| z
  |z
  |z
  |z
  |z
  |z
  |z
  }|| z
  |z
  |z
  |z
  |z
  |z
  |z
  } || z
  |z
  |z
  |z
  |z
  |z
  |z
  |z
  |z
  |z
  |z
  |z
  |z
  |z
  | z
  }!| |||||||||||||| |!fS )zXCalculates values of a quadgram contingency table from
        marginal values.
        r   )"n_iiiin_iiix_tuplen_iixx_tuplen_ixxx_tuplen_xxxxn_iiixn_iixin_ixiin_xiiin_iixxn_ixixn_ixxin_xixin_xxiin_xiixn_ixxxn_xixxn_xxixn_xxxin_oiiin_ioiin_iioin_ooiin_oioin_iooin_oooin_iiion_oiion_ioion_ooion_iioon_oioon_iooon_oooos"                                     r   r"   z"QuadgramAssocMeasures._contingencyP  s>   
 ,8(;G8+7(&&&&6)F2&6)F2&6)F2&6)F2V;fDvMPVV&&6)F2&6)F2&6)F2V;fDvMPVV&6)F2&6)F2V;fDvMPVV&6)F2V;fDvMPVV  	
     	 
      	( !
 	
r   c                     | \  }}}}}}}}}	}
}}}}}}||	z   }||z   }||z   }||z   }||z   |	z   |z   }||z   |	z   |z   }||z   |z   |z   }||z   |z   |z   }||z   |z   |z   }||z   |	z   |
z   }||z   |z   |	z   |z   |z   |z   |z   }||z   |z   |	z   |z   |
z   |z   |z   }||z   |z   |	z   |z   |z   |
z   |z   }||z   |z   |z   |z   |z   |z   |z   }t        |       }|||||f||||||f||||f|fS )a  Calculates values of contingency table marginals from its values.
        QuadgramAssocMeasures._marginals(1, 0, 2, 46, 552, 825, 2577, 34967, 1, 0, 2, 48, 7250, 9031, 28585, 356653)
        (1, (2, 553, 3, 1), (7804, 6, 3132, 1378, 49, 2), (38970, 17660, 100, 38970), 440540)
        r   ) r&   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r9   s                                    r   r'   z QuadgramAssocMeasures._marginals  s   . #	
 &&&&&6)F2&6)F2&6)F2&6)F2&6)F2&6)F2&6)F2V;fDvMPVV&6)F2V;fDvMPVV&6)F2V;fDvMPVV&6)F2V;fDvMPVVK  VVV,VVVVV<VVV,
 	
r   Nr   r   r   r   r   r   9  s5    ( 
B9
 9
v 1
 1
r   r   c                   &    e Zd ZdZd Zed        Zy)ContingencyMeasureszWraps NgramAssocMeasures classes such that the arguments of association
    measures are contingency table values rather than marginals.
    c                    d|j                   j                  z   | j                   _        t        |      D ]P  }|j                  d      rt	        ||      }|j                  d      s| j                  ||      }t        | ||       R y)zAConstructs a ContingencyMeasures given a NgramAssocMeasures classContingency___N)	__class__r_   dir
startswithgetattr_make_contingency_fnsetattr)selfmeasureskvs       r   __init__zContingencyMeasures.__init__  sr    "/(2D2D2M2M"MXA||D!!$A<<$--h:D!Q r   c                 Z      fd}j                   |_         j                  |_        |S )zFrom an association measure function, produces a new function which
        accepts contingency table values as its arguments.
        c                  (      j                   |   S r   )r'   )r&   r   old_fns    r   resz5ContingencyMeasures._make_contingency_fn.<locals>.res  s    .8..<==r   )rb   r_   )r   r   r   s   `` r   r   z(ContingencyMeasures._make_contingency_fn  s%    	> nn
r   N)r_   r`   ra   rb   r   rc   r   r   r   r   r   r     s     	  
 
r   r   )rb   mathr	   abcr   r   	functoolsr   rT   logrX   r8   rB   scipy.statsr   ImportErrorr=   rA   r>   r   rg   r   r   r   r   r   r   <module>r      s     ' ii2	"( 	
 ) 7
 9t(7 t(nV(, V(r9
- 9
xE
. E
P M  """s   A1 1A<;A<