
    g^                     L    d Z 	 ddlZddZd Zd Zd	dZd
dZy# e$ r Y w xY w)a  
Text Segmentation Metrics

1. Windowdiff

Pevzner, L., and Hearst, M., A Critique and Improvement of
  an Evaluation Metric for Text Segmentation,
  Computational Linguistics 28, 19-36


2. Generalized Hamming Distance

Bookstein A., Kulyukin V.A., Raita T.
Generalized Hamming Distance
Information Retrieval 5, 2002, pp 353-375

Baseline implementation in C++
http://digital.cs.usu.edu/~vkulyukin/vkweb/software/ghd/ghd.html

Study describing benefits of Generalized Hamming Distance Versus
WindowDiff for evaluating text segmentation tasks
Begsten, Y.  Quel indice pour mesurer l'efficacite en segmentation de textes ?
TALN 2009


3. Pk text segmentation metric

Beeferman D., Berger A., Lafferty J. (1999)
Statistical Models for Text Segmentation
Machine Learning, 34, 177-210
    Nc                    t        |       t        |      k7  rt        d      |t        |       kD  rt        d      d}t        t        |       |z
  dz         D ]Q  }t        | |||z    j	                  |      ||||z    j	                  |      z
        }|r||z  }C|t        d|      z  }S |t        |       |z
  dz   z  S )aW  
    Compute the windowdiff score for a pair of segmentations.  A
    segmentation is any sequence over a vocabulary of two items
    (e.g. "0", "1"), where the specified boundary value is used to
    mark the edge of a segmentation.

        >>> s1 = "000100000010"
        >>> s2 = "000010000100"
        >>> s3 = "100000010000"
        >>> '%.2f' % windowdiff(s1, s1, 3)
        '0.00'
        >>> '%.2f' % windowdiff(s1, s2, 3)
        '0.30'
        >>> '%.2f' % windowdiff(s2, s3, 3)
        '0.80'

    :param seg1: a segmentation
    :type seg1: str or list
    :param seg2: a segmentation
    :type seg2: str or list
    :param k: window width
    :type k: int
    :param boundary: boundary value
    :type boundary: str or int or bool
    :param weighted: use the weighted variant of windowdiff
    :type weighted: boolean
    :rtype: float
    z!Segmentations have unequal lengthzCWindow width k should be smaller or equal than segmentation lengthsr            ?)len
ValueErrorrangeabscountmin)seg1seg2kboundaryweightedwdindiffs           N/var/www/openai/venv/lib/python3.12/site-packages/nltk/metrics/segmentation.py
windowdiffr   1   s    < 4yCI<==3t9}Q
 	
 
B3t9q=1$%DQUO))(3d1q1uo6K6KH6UUV%KB#a-B & TQ$%%    c                     t        j                  | |f      }|t        j                  |      z  |dd d f<   |t        j                  |       z  |d d df<   |S )Nr   )npemptyarange)nrowsncolsins_costdel_costmats        r   	_init_matr    b   sO    
((E5>
"C299U++C1I299U++C1IJr   c                 
   t        |      D ]u  \  }}t        |      D ]b  \  }}	|t        ||	z
        z  | ||f   z   }
||	k(  r| ||f   }n ||	kD  r|| ||dz   f   z   }n|| |dz   |f   z   }t        ||
      | |dz   |dz   f<   d w y )Nr   )	enumerater	   r   )r   rowvcolvr   r   shift_cost_coeffr   rowijcolj
shift_costtcosts               r   _ghd_auxr+   i   s    T?4 GAt)Ct,<<s1a4yHJt|AqD	 3q!a%x=0 !3q1uax=0 #E: 6CAq1u ' #r   c                    t        |       D cg c]  \  }}||k(  s| }}}t        |      D cg c]  \  }}||k(  s| }	}}t        |      }
t        |	      }|
dk(  r|dk(  ry|
dkD  r
|dk(  r|
|z  S |
dk(  r
|dkD  r||z  S t        |dz   |
dz   ||      }t        ||	||||       t	        |d         S c c}}w c c}}w )av  
    Compute the Generalized Hamming Distance for a reference and a hypothetical
    segmentation, corresponding to the cost related to the transformation
    of the hypothetical segmentation into the reference segmentation
    through boundary insertion, deletion and shift operations.

    A segmentation is any sequence over a vocabulary of two items
    (e.g. "0", "1"), where the specified boundary value is used to
    mark the edge of a segmentation.

    Recommended parameter values are a shift_cost_coeff of 2.
    Associated with a ins_cost, and del_cost equal to the mean segment
    length in the reference segmentation.

        >>> # Same examples as Kulyukin C++ implementation
        >>> ghd('1100100000', '1100010000', 1.0, 1.0, 0.5)
        0.5
        >>> ghd('1100100000', '1100000001', 1.0, 1.0, 0.5)
        2.0
        >>> ghd('011', '110', 1.0, 1.0, 0.5)
        1.0
        >>> ghd('1', '0', 1.0, 1.0, 0.5)
        1.0
        >>> ghd('111', '000', 1.0, 1.0, 0.5)
        3.0
        >>> ghd('000', '111', 1.0, 2.0, 0.5)
        6.0

    :param ref: the reference segmentation
    :type ref: str or list
    :param hyp: the hypothetical segmentation
    :type hyp: str or list
    :param ins_cost: insertion cost
    :type ins_cost: float
    :param del_cost: deletion cost
    :type del_cost: float
    :param shift_cost_coeff: constant used to compute the cost of a shift.
        ``shift cost = shift_cost_coeff * |i - j|`` where ``i`` and ``j``
        are the positions indicating the shift
    :type shift_cost_coeff: float
    :param boundary: boundary value
    :type boundary: str or int or bool
    :rtype: float
    r   g        r   )r-   )r"   r   r    r+   float)refhypr   r   r%   r   r   valref_idxhyp_idx
nref_bound
nhyp_boundr   s                r   ghdr6   y   s    \ "+3CXa3(?qGC!*3CXa3(?qGCWJWJQ:?	aJ!OH$$	qZ!^H$$
JNJNHh
GCS'7Hh8HIV DCs   B8B8B> B>c                 R   |2t        t        t        |       | j                  |      dz  z              }d}t	        t        |       |z
  dz         D ]A  }| |||z    j                  |      dkD  }||||z    j                  |      dkD  }||k7  s=|dz  }C |t        |       |z
  dz   z  S )a  
    Compute the Pk metric for a pair of segmentations A segmentation
    is any sequence over a vocabulary of two items (e.g. "0", "1"),
    where the specified boundary value is used to mark the edge of a
    segmentation.

    >>> '%.2f' % pk('0100'*100, '1'*400, 2)
    '0.50'
    >>> '%.2f' % pk('0100'*100, '0'*400, 2)
    '0.50'
    >>> '%.2f' % pk('0100'*100, '0100'*100, 2)
    '0.00'

    :param ref: the reference segmentation
    :type ref: str or list
    :param hyp: the segmentation to evaluate
    :type hyp: str or list
    :param k: window size, if None, set to half of the average reference segment length
    :type boundary: str or int or bool
    :param boundary: boundary value
    :type boundary: str or int or bool
    :rtype: float
           @r   r   r   )introundr   r
   r   )r/   r0   r   r   errr   rhs           r   pkr>      s    2 	yc#h#))H"5";<=>
C3s8a<!#$AEN  *Q.AEN  *Q.61HC	 %
 #c(Q,$%%r   )1F)r8   r8   r   r?   )Nr?   )	__doc__numpyr   ImportErrorr   r    r+   r6   r>    r   r   <module>rD      sC   @	
+&b7 =F"&_  		s    ##