
"""Language Model Interface."""

import random
import warnings
from abc import ABCMeta, abstractmethod
from bisect import bisect
from itertools import accumulate

from nltk.lm.counter import NgramCounter
from nltk.lm.util import log_base2
from nltk.lm.vocabulary import Vocabulary


class Smoothing(metaclass=ABCMeta):
    """Ngram Smoothing Interface

    Implements Chen & Goodman 1995's idea that all smoothing algorithms have
    certain features in common. This should ideally allow smoothing algorithms to
    work both with Backoff and Interpolation.
    """

    def __init__(self, vocabulary, counter):
        """
        :param vocabulary: The Ngram vocabulary object.
        :type vocabulary: nltk.lm.vocab.Vocabulary
        :param counter: The counts of the vocabulary items.
        :type counter: nltk.lm.counter.NgramCounter
        N)vocabcounts)self
vocabularycounters      @/var/www/openai/venv/lib/python3.12/site-packages/nltk/lm/api.py__init__zSmoothing.__init__   s      
    c                     t               NNotImplementedError)r   words     r   unigram_scorezSmoothing.unigram_score&       !##r   c                     t               r   r   r   r   contexts      r   alpha_gammazSmoothing.alpha_gamma*   r   r   N)__name__
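

# A minimal illustrative sketch, not part of the original module: one way a
# concrete estimator could fill in the `Smoothing` interface above, using
# absolute discounting. The class name and the `discount` parameter are
# assumptions made for this example, not NLTK API.
class _ExampleAbsoluteDiscounting(Smoothing):
    def __init__(self, vocabulary, counter, discount=0.75):
        super().__init__(vocabulary, counter)
        self.discount = discount

    def unigram_score(self, word):
        # Relative frequency of `word` among all observed unigrams.
        return self.counts.unigrams.freq(word)

    def alpha_gamma(self, word, context):
        # Contexts with no observed counts are assumed to be handled by the
        # caller (interpolated models back off before asking for alpha/gamma).
        prefix_counts = self.counts[len(context) + 1][context]
        total = prefix_counts.N()
        # Mass kept for the observed continuation after discounting...
        alpha = max(prefix_counts[word] - self.discount, 0) / total
        # ...and the mass handed down to the lower-order distribution.
        gamma = self.discount * len(prefix_counts) / total
        return alpha, gamma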
__module____qualname____doc__r   r   r   r    r   r   r   r      s4     $ $ $ $r   r   )	metaclassc                 0    t        |       t        |       z  S )z0Return average (aka mean) for sequence of items.)sumlen)itemss    r   _meanr*   /   s    u:E
""r   c                 d    t        | t        j                        r| S t        j                  |       S r   )
isinstancerandomRandom)seed_or_generators    r   _random_generatorr0   4   s'    #V]]3  ==*++r   c                     | st        d      t        |       t        |      k7  rt        d      t        t        |            }|d   }|j	                         }| t        |||z           S )z`Like random.choice, but with weights.

    Heavily inspired by python 3.6 `random.choices`.
    """
    if not population:
        raise ValueError("Can't choose from empty population")
    if len(population) != len(weights):
        raise ValueError("The number of weights does not match the population")
    cum_weights = list(accumulate(weights))
    total = cum_weights[-1]
    threshold = random_generator.random()
    return population[bisect(cum_weights, total * threshold)]


class LanguageModel(metaclass=ABCMeta):
    """ABC for Language Models.

    Cannot be directly instantiated itself.

    """

    def __init__(self, order, vocabulary=None, counter=None):
        """Creates new LanguageModel.

        :param vocabulary: If provided, this vocabulary will be used instead
            of creating a new one when training.
        :type vocabulary: `nltk.lm.Vocabulary` or None
        :param counter: If provided, use this object to count ngrams.
        :type counter: `nltk.lm.NgramCounter` or None
        :param ngrams_fn: If given, defines how sentences in training text are turned to ngram
            sequences.
        :type ngrams_fn: function or None
        :param pad_fn: If given, defines how sentences in training text are padded.
        :type pad_fn: function or None
        """
        self.order = order
        if vocabulary and not isinstance(vocabulary, Vocabulary):
            warnings.warn(
                f"The `vocabulary` argument passed to {self.__class__.__name__} "
                "must be an instance of `nltk.lm.Vocabulary`.",
                stacklevel=3,
            )
        self.vocab = Vocabulary() if vocabulary is None else vocabulary
        self.counts = NgramCounter() if counter is None else counter

    def fit(self, text, vocabulary_text=None):
        """Trains the model on a text.

        :param text: Training text as a sequence of sentences.

        """
        if not self.vocab:
            if vocabulary_text is None:
                raise ValueError(
                    "Cannot fit without a vocabulary or text to create it from."
                )
            self.vocab.update(vocabulary_text)
        self.counts.update(self.vocab.lookup(sent) for sent in text)

    def score(self, word, context=None):
        """Masks out of vocab (OOV) words and computes their model score.

        For model-specific logic of calculating scores, see the `unmasked_score`
        method.
        """
        return self.unmasked_score(
            self.vocab.lookup(word), self.vocab.lookup(context) if context else None
        )

    @abstractmethod
    def unmasked_score(self, word, context=None):
        """Score a word given some optional context.

        Concrete models are expected to provide an implementation.
        Note that this method does not mask its arguments with the OOV label.
        Use the `score` method for that.

        :param str word: Word for which we want the score
        :param tuple(str) context: Context the word is in.
            If `None`, compute unigram score.
        :param context: tuple(str) or None
        :rtype: float
        """
        raise NotImplementedError()

    def logscore(self, word, context=None):
        """Evaluate the log score of this word in this context.

        The arguments are the same as for `score` and `unmasked_score`.

        """
        return log_base2(self.score(word, context))

    def context_counts(self, context):
        """Helper method for retrieving counts for a given context.

        Assumes context has been checked and oov words in it masked.
        :type context: tuple(str) or None

        """
        return (
            self.counts[len(context) + 1][context] if context else self.counts.unigrams
        )

    def entropy(self, text_ngrams):
        """Calculate cross-entropy of model for given evaluation text.

        This implementation is based on the Shannon-McMillan-Breiman theorem,
        as used and referenced by Dan Jurafsky and Jordan Boyd-Graber.

        :param Iterable(tuple(str)) text_ngrams: A sequence of ngram tuples.
        :rtype: float

        """
        return -1 * _mean(
            [self.logscore(ngram[-1], ngram[:-1]) for ngram in text_ngrams]
        )

    def perplexity(self, text_ngrams):
        """Calculates the perplexity of the given text.

        This is simply 2 ** cross-entropy for the text, so the arguments are the same.

        """
        return pow(2.0, self.entropy(text_ngrams))

    def generate(self, num_words=1, text_seed=None, random_seed=None):
        """Generate words from the model.

        :param int num_words: How many words to generate. By default 1.
        :param text_seed: Generation can be conditioned on preceding context.
        :param random_seed: A random seed or an instance of `random.Random`. If provided,
            makes the random sampling part of generation reproducible.
        :return: One (str) word or a list of words generated from model.

        Examples:

        >>> from nltk.lm import MLE
        >>> lm = MLE(2)
        >>> lm.fit([[("a", "b"), ("b", "c")]], vocabulary_text=['a', 'b', 'c'])
        >>> lm.fit([[("a",), ("b",), ("c",)]])
        >>> lm.generate(random_seed=3)
        'a'
        >>> lm.generate(text_seed=['a'])
        'b'

        """
        text_seed = [] if text_seed is None else list(text_seed)
        random_generator = _random_generator(random_seed)
        if num_words == 1:
            # Base case: sample a single word, backing off to shorter contexts
            # until one with observed continuations is found.
            context = (
                text_seed[-self.order + 1 :]
                if len(text_seed) >= self.order
                else text_seed
            )
            samples = self.context_counts(self.vocab.lookup(context))
            while context and not samples:
                context = context[1:] if len(context) > 1 else []
                samples = self.context_counts(self.vocab.lookup(context))
            # Sorting the samples makes the weighted draw reproducible and turns
            # the Mapping into the Sequence that `_weighted_choice` expects.
            samples = sorted(samples)
            return _weighted_choice(
                samples,
                tuple(self.score(w, context) for w in samples),
                random_generator,
            )
        # Otherwise build up the text one word at a time, feeding each new word
        # back in as part of the context for the next one.
        generated = []
        for _ in range(num_words):
            generated.append(
                self.generate(
                    num_words=1,
                    text_seed=text_seed + generated,
                    random_seed=random_generator,
                )
            )
        return generated