
    g
                     t    d Z ddlmZ ddlmZ ddlmZ d Z G d de      Z G d d	e      Z	 G d
 de      Z
y)zSmoothing algorithms for language modeling.

According to Chen & Goodman 1995 these should work with both Backoff and
Interpolation.
    )methodcaller)	Smoothing)ConditionalFreqDistc                     t        | t              rt        d      nd t        fd| j	                         D              S )zCount values that are greater than zero in a distribution.

    Assumes distribution is either a mapping with counts as values or
    an instance of `nltk.ConditionalFreqDist`.
    Nc                     | S N )counts    F/var/www/openai/venv/lib/python3.12/site-packages/nltk/lm/smoothing.py<lambda>z'_count_values_gt_zero.<locals>.<lambda>   s    5    c              3   :   K   | ]  } |      d kD  sd  yw)r      Nr
   ).0dist_or_countas_counts     r   	<genexpr>z(_count_values_gt_zero.<locals>.<genexpr>   s#      4m8ORS8S4s   )
isinstancer   r   sumvalues)distributionr   s    @r   _count_values_gt_zeror      sG     l$78 	S    +224  r   c                   4     e Zd ZdZ fdZd Zd Zd Z xZS )
WittenBellzWitten-Bell smoothing.c                 (    t        |   ||fi | y r	   )super__init__)self
vocabularycounterkwargs	__class__s       r   r   zWittenBell.__init__'   s    W77r   c                 t    | j                   |   j                  |      }| j                  |      }d|z
  |z  |fS )Ng      ?)countsfreq_gammar   wordcontextalphagammas        r   alpha_gammazWittenBell.alpha_gamma*   s=    G$))$/G$eu$e++r   c                 x    t        | j                  |         }||| j                  |   j                         z   z  S r	   )r   r%   r   r   r*   n_pluss      r   r'   zWittenBell._gamma/   s7    &t{{7';<$++g"6"8"8"::;;r   c                 L    | j                   j                  j                  |      S r	   r%   unigramsr&   r   r)   s     r   unigram_scorezWittenBell.unigram_score3       {{##((..r   	__name__
__module____qualname____doc__r   r-   r'   r5   __classcell__r#   s   @r   r   r   $   s     8,
</r   r   c                   6     e Zd ZdZd fd	Zd Zd Zd Z xZS )AbsoluteDiscountingz!Smoothing with absolute discount.c                 6    t        |   ||fi | || _        y r	   )r   r   discount)r   r    r!   rA   r"   r#   s        r   r   zAbsoluteDiscounting.__init__:   s    W77 r   c                     t        | j                  |   |   | j                  z
  d      | j                  |   j                         z  }| j	                  |      }||fS )Nr   )maxr%   rA   r   r'   r(   s        r   r-   zAbsoluteDiscounting.alpha_gamma>   s\    G$T*T]]:A>kk'"$$&' 	 G$e|r   c                     t        | j                  |         }| j                  |z  | j                  |   j                         z  S r	   )r   r%   rA   r   r/   s      r   r'   zAbsoluteDiscounting._gammaF   s;    &t{{7';<&$++g*>*@*@*BBBr   c                 L    | j                   j                  j                  |      S r	   r2   r4   s     r   r5   z!AbsoluteDiscounting.unigram_scoreJ   r6   r   )g      ?r7   r=   s   @r   r?   r?   7   s    +!C/r   r?   c                   D     e Zd ZdZd fd	Zd Zd Z e       fdZ xZ	S )	KneserNeya  Kneser-Ney Smoothing.

    This is an extension of smoothing with a discount.

    Resources:
    - https://pages.ucsd.edu/~rlevy/lign256/winter2008/kneser_ney_mini_example.pdf
    - https://www.youtube.com/watch?v=ody1ysUTD7o
    - https://medium.com/@dennyc/a-simple-numerical-example-for-kneser-ney-smoothing-nlp-4600addf38b8
    - https://www.cl.uni-heidelberg.de/courses/ss15/smt/scribe6.pdf
    - https://www-i6.informatik.rwth-aachen.de/publications/download/951/Kneser-ICASSP-1995.pdf
    c                 D    t        |   ||fi | || _        || _        y r	   )r   r   rA   _order)r   r    r!   orderrA   r"   r#   s         r   r   zKneserNey.__init__[   s%    W77 r   c                 4    | j                  |      \  }}||z  S r	   )_continuation_counts)r   r)   word_continuation_counttotal_counts       r   r5   zKneserNey.unigram_score`   s#    /3/H/H/N,&44r   c                     | j                   |   }t        |      dz   | j                  k(  r||   |j                         fn| j	                  ||      \  }}t        || j                  z
  d      |z  }| j                  t        |      z  |z  }||fS )Nr   g        )r%   lenrI   r   rL   rC   rA   r   )r   r)   r*   prefix_countsrM   rN   r+   r,   s           r   r-   zKneserNey.alpha_gammad   s    G, 7|a4;;. 4 -//"34**49 	-
 +dmm;SAKO 5m DD{Re|r   c                     fd| j                   t              dz      j                         D        }d\  }}|D ]$  }|t        ||   dkD        z  }|t	        |      z  }& ||fS )a  Count continuations that end with context and word.

        Continuations track unique ngram "types", regardless of how many
        instances were observed for each "type".
        This is different than raw ngram counts which track number of instances.
        c              3   8   K   | ]  \  }}|d d k(  r|  yw)r   Nr
   )r   prefix_ngramr%   r*   s      r   r   z1KneserNey._continuation_counts.<locals>.<genexpr>v   s-      ,
(M$fAB7* (Ms      )r   r   r   )r%   rP   itemsintr   )r   r)   r*    higher_order_ngrams_with_context#higher_order_ngrams_with_word_counttotalr%   s     `    r   rL   zKneserNey._continuation_countso   s{    ,
(,CL14D(E(K(K(M,
(
 6:2+U6F/3vd|a7G3HH/*622E 7 3E99r   )g?)
r8   r9   r:   r;   r   r5   r-   tuplerL   r<   r=   s   @r   rG   rG   N   s#    

5	 27 :r   rG   N)r;   operatorr   nltk.lm.apir   nltk.probabilityr   r   r   r?   rG   r
   r   r   <module>r_      s>   
 " ! 0"/ /&/) /.1:	 1:r   