
    g!                         d Z ddlZ	 ddlmZ ddlmZ  ej                  d      Z
 G d	 d
      Zd Zd ZefdZefdZd Zd Zy# e$ r d Zd Zd ZY Cw xY w)z

A port of the Gale-Church Aligner.

Gale & Church (1993), A Program for Aligning Sentences in Bilingual Corpora.
https://aclweb.org/anthology/J93-1004.pdf

    N)logsf)normc                     t        |       }ddd|z  z   z  }|t        j                  | |z  dz
  |d|d|d|d|d|d	|d
|d|dz  z   z  z   z  z   z  z   z  z   z  z   z  z   z  z   z  z         z  }| dk\  r|S d|z
  S )zComplementary error function.         ?gś??g5 ?g`yg?gƸ?gꪂIǿg#v?g9)gS?gޅ1Ogv(?g        g       @)absmathexp)xztrs       O/var/www/openai/venv/lib/python3.12/site-packages/nltk/translate/gale_church.pyerfccr      s    FS1WBF"' *"#$/&''1Aq:~9U4V'V'X%X#"!"	

 
< 8H7N    c                 P    ddt        | t        j                  d      z        z  z
  S )u>   Return the area under the normal distribution from M{-∞..x}.r   r      )r   r	   sqrtr   s    r   norm_cdfr   @   s$    3q499Q</0000r   c                 z    	 t        j                  dt        |       z
        S # t        $ r t	        d      cY S w xY w)Nr   -inf)r	   logr   
ValueErrorfloatr   s    r   
norm_logsfr   D   s7    	!88AO,, 	!= 	!s    # ::r   c                   &    e Zd ZdddddddZdZdZy)	LanguageIndependentgׁsF?g{Gz?gbX9ȶ?gI+?))r   r   )r   r   )r   r   )r   r   )r   r   )r   r   r   g333333@N)__name__
__module____qualname__PRIORSAVERAGE_CHARACTERSVARIANCE_CHARACTERS r   r   r   r   N   s+     F r   r   c                    g }t        |      t        |      f}|dk7  rt        d |D              rv	 | |   \  }}t        |      D ]7  }t        |      D ]'  }|j	                  |d   |z
  dz
  |d   |z
  dz
  f       ) 9 |d   |z
  |d   |z
  f}|dk7  rt        d |D              rv|ddd   S # t        $ r |d   dz
  |d   dz
  f}Y w xY w)a  
    Traverse the alignment cost from the tracebacks and retrieves
    appropriate sentence pairs.

    :param backlinks: A dictionary where the key is the alignment points and value is the cost (referencing the LanguageIndependent.PRIORS)
    :type backlinks: dict
    :param source_sents_lens: A list of target sentences' lengths
    :type source_sents_lens: list(int)
    :param target_sents_lens: A list of target sentences' lengths
    :type target_sents_lens: list(int)
    )r   r   c              3   &   K   | ]	  }|d k\    yw)r   Nr%   ).0ps     r   	<genexpr>ztrace.<locals>.<genexpr>n   s     $>XQ!VXs   r   r   N)lenall	TypeErrorrangeappend)		backlinkssource_sents_lenstarget_sents_lenslinkspositionsr   ijs	            r   tracer9   `   s    E%&,=(>?H
f
$>X$>!>	X&DAq qA1XhqkAo18A;?Q3FGH   QK!OXa[1_5 f
$>X$>!> 2;  	 a!q9H	s   B. .C
	C
c                     t         fdt        |d         D              }t        fdt        |d         D              }	 |||j                  z  z   dz  }||j                  z  |z
  t        j                  ||j
                  z        z  }	t        t        t        |	            z   t        j                  |j                  |         z    S # t        $ r t        d      cY S w xY w)aP  Returns the log probability of the two sentences C{source_sents[i]}, C{target_sents[j]}
    being aligned with a specific C{alignment}.

    @param i: The offset of the source sentence.
    @param j: The offset of the target sentence.
    @param source_sents: The list of source sentence lengths.
    @param target_sents: The list of target sentence lengths.
    @param alignment: The alignment type, a tuple of two integers.
    @param params: The sentence alignment parameters.

    @returns: The log probability of a specific alignment between the two sentences, given the parameters.
    c              3   4   K   | ]  }|z
  d z
       ywr   Nr%   )r(   offsetr7   source_sentss     r   r*   z!align_log_prob.<locals>.<genexpr>   !     M9Lvl1v:>*9L   r   c              3   4   K   | ]  }|z
  d z
       ywr<   r%   )r(   r=   r8   target_sentss     r   r*   z!align_log_prob.<locals>.<genexpr>   r?   r@   r   r   r   )sumr/   r#   r	   r   r$   ZeroDivisionErrorr   LOG2r   r   r   r"   )
r7   r8   r>   rB   	alignmentparamsl_sl_tmdeltas
   ````      r   align_log_probrL   |   s     My|9LM
MC
My|9LM
MC 32222a7v00036$))***;
 
 Js5z**TXXfmmI6N-OOPP  V}s   A
C C('C(c                 J   t        |j                  j                               }g g}i }t        t	        |       dz         D ]  }t        t	        |      dz         D ]  }t        d      }d}	|D ]J  }
d|
d   z
  }||
d   z
  }|t	        |       k  s|dk  r(||   |   t        ||| ||
|      z   }||k  sG|}|
}	L |t        d      k(  rd}|	|||f<   |d   j                  |        t	        |      dkD  r|j                  d       |j                  g         t        || |      S )a  Return the sentence alignment of two text blocks (usually paragraphs).

        >>> align_blocks([5,5,5], [7,7,7])
        [(0, 0), (1, 1), (2, 2)]
        >>> align_blocks([10,5,5], [12,20])
        [(0, 0), (1, 1), (2, 1)]
        >>> align_blocks([12,20], [10,5,5])
        [(0, 0), (1, 1), (1, 2)]
        >>> align_blocks([10,2,10,10,2,10], [12,3,20,3,12])
        [(0, 0), (1, 1), (2, 2), (3, 2), (4, 3), (5, 4)]

    @param source_sents_lens: The list of source sentence lengths.
    @param target_sents_lens: The list of target sentence lengths.
    @param params: the sentence alignment parameters.
    @return: The sentence alignments, a list of index pairs.
    r   infNr+   r   r   )
listr"   keysr/   r,   r   rL   r0   popr9   )r2   r3   rG   alignment_typesDr1   r7   r8   min_dist	min_alignaprev_iprev_jr)   s                 r   align_blocksrY      sI   $ 6==--/0O 
AI3()A-.s,-12AU|HI$adQqTSVG#vzfIf%q+->6)  x< H !I % 5<' )Iq!fbELL"' 3* q6A:EE!H	1 /4 -/@AAr   c           	          t        |       t        |      k7  rt        d      t        | |      D cg c]  \  }}t        |||       c}}S c c}}w )a  Creates the sentence alignment of two texts.

    Texts can consist of several blocks. Block boundaries cannot be crossed by sentence
    alignment links.

    Each block consists of a list that contains the lengths (in characters) of the sentences
    in this block.

    @param source_blocks: The list of blocks in the source text.
    @param target_blocks: The list of blocks in the target text.
    @param params: the sentence alignment parameters.

    @returns: A list of sentence alignment lists
    z>Source and target texts do not have the same number of blocks.)r,   r   ziprY   )source_blockstarget_blocksrG   source_blocktarget_blocks        r   align_textsr`      sb     =S//L
 	
 +.m]*K*K&L, 	\<8*K  s   Ac              #   N    K    fd}	  | j                                w)zSplits an iterator C{it} at values of C{split_value}.

    Each instance of C{split_value} is swallowed. The iterator produces
    subiterators which need to be consumed fully before the next subiterator
    can be used.
    c              3   R   K   | }|k7  r| j                         }|k7  ry y wNnext)firstvitsplit_values     r   _chunk_iteratorz!split_at.<locals>._chunk_iterator   s.     ;G	A ;s   !''rd   )rh   ri   rj   s   `` r   split_atrk      s'      bggi(( s   !%c                     t        | |      D cg c],  }t        ||      D cg c]  }t        d |D               c}. c}}S c c}w c c}}w )zParses a stream of tokens and splits it into sentences (using C{soft_delimiter} tokens)
    and blocks (using C{hard_delimiter} tokens) for use with the L{align_texts} function.
    c              3   2   K   | ]  }t        |        y wrc   )r,   )r(   tokens     r   r*   z%parse_token_stream.<locals>.<genexpr>  s     4uE
s   )rk   rC   )streamsoft_delimiterhard_delimiterblock_itsentence_its        r   parse_token_streamrt      sc     !8
 9H  (.A	
A 444A	
 9 	
s   A
AA
A
)__doc__r	   r   r   r   scipy.statsImportErrorr   r   r   rE   r   r9   rL   rY   r`   rk   rt   r%   r   r   <module>rx      s    4!( j txx{ $8Q8 ?R 3Bl 6I :)$
M  1!%N1![1!s   A AA