
    gn                     Z   d Z ddlZddlZddlZddlmZmZmZ ddlm	Z	 ddl
mZ ddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZ  edg d      Z G d d      Z  G d d      Z! G d d      Z" G d d      Z# G d de#      Z$d Z%e&dk(  r e%        g dZ'y)a  
This module brings together a variety of NLTK functionality for
text analysis, and provides simple, interactive interfaces.
Functionality includes: concordancing, collocation discovery,
regular expression search over tokenized strings, and
distributional similarity.
    N)Counterdefaultdict
namedtuple)reduce)log)BigramCollocationFinder)MLE)padded_everygram_pipeline)BigramAssocMeasures	f_measure)ConditionalFreqDist)FreqDist)sent_tokenize)LazyConcatenation
cut_string	tokenwrapConcordanceLine)leftqueryrightoffset
left_printright_printlinec                   L    e Zd ZdZed        Zddd fdZd Zd Zd
dZ	dd	Z
y)ContextIndexa  
    A bidirectional index between words and their 'contexts' in a text.
    The context of a word is usually defined to be the words that occur
    in a fixed window around the word; but other definitions may also
    be used by providing a custom context function.
    c                     |dk7  r| |dz
     j                         nd}|t        |       dz
  k7  r| |dz      j                         nd}||fS )z;One left token and one right token, normalized to lowercaser      *START**END*)lowerlen)tokensir   r   s       >/var/www/openai/venv/lib/python3.12/site-packages/nltk/text.py_default_contextzContextIndex._default_context/   sS     )*Qva!e}""$I)*c&kAo)=q1u##%7e}    Nc                     | S N xs    r%   <lambda>zContextIndex.<lambda>6   s    Qr'   c                 &    | _          _        |r| _        n j                   _        |rD cg c]  } ||      s| c}t	         fdt              D               _        t	         fdt              D               _        y c c}w )Nc              3   j   K   | ]*  \  }}j                  |      j                  |      f , y wr)   )_key_context_func.0r$   wselfr#   s      r%   	<genexpr>z(ContextIndex.__init__.<locals>.<genexpr>?   s4      %
FWdaTYYq\4--fa89FW   03c              3   j   K   | ]*  \  }}j                  |      j                  |      f , y wr)   )r1   r0   r2   s      r%   r6   z(ContextIndex.__init__.<locals>.<genexpr>B   s4      %
FWdaT*DIIaL9FWr7   )r0   _tokensr1   r&   CFD	enumerate_word_to_contexts_context_to_words)r5   r#   context_funcfilterkeyts   ``    r%   __init__zContextIndex.__init__6   s    	!-D!%!6!6D!'5A6!9a5F!$ %
FOPVFW%
 "
 "% %
FOPVFW%
 "
	 6s   B Bc                     | j                   S )zw
        :rtype: list(str)
        :return: The document that this context index was
            created from.
        r9   r5   s    r%   r#   zContextIndex.tokensF        ||r'   c                     | j                  |      }t        | j                  |         }i }| j                  j                         D ]  \  }}t	        |t        |            ||<    |S )z
        Return a dictionary mapping from words to 'similarity scores,'
        indicating how often these two words occur in the same
        context.
        )r0   setr<   itemsr   )r5   wordword_contextsscoresr4   
w_contextss         r%   word_similarity_dictz!ContextIndex.word_similarity_dictN   sc     yyD22489!3399;MAz!-ZAF1I < r'   c                 0   t        t              }| j                  | j                  |         D ]L  }| j                  |   D ]8  }||k7  s	||xx   | j                  |   |   | j                  |   |   z  z  cc<   : N t        ||j                  d      d | S )NT)r@   reverse)r   intr<   r0   r=   sortedget)r5   rJ   nrL   cr4   s         r%   similar_wordszContextIndex.similar_words]   s    S!''		$8A++A.91I..q1$7$:P:PQR:STU:VVI / 9 f&**d;BQ??r'   c                     |D cg c]  } j                  |       }}|D cg c]  }t         j                  |          }}t        t	        |            D cg c]  }||   r	||    }}t        t        j                  |      |r|rt        ddj                  |            s
t               S t         fd|D              }|S c c}w c c}w c c}w )a  
        Find contexts where the specified words can all appear; and
        return a frequency distribution mapping each context to the
        number of times that context was used.

        :param words: The words used to seed the similarity search
        :type words: str
        :param fail_on_unknown: If true, then raise a value error if
            any of the given words do not occur at all in the index.
        z%The following word(s) were not found: c              3   T   K   | ]  }j                   |   D ]  }|v s|  ! y wr)   )r<   )r3   r4   rU   commonr5   s      r%   r6   z/ContextIndex.common_contexts.<locals>.<genexpr>|   s/       a$*@*@*CQqF{*C5s   (	()
r0   rH   r<   ranger"   r   intersection
ValueErrorjoinr   )	r5   wordsfail_on_unknownr4   contextsr$   emptyfdrZ   s	   `       @r%   common_contextszContextIndex.common_contextsg   s     (--u!1u-<ABEqC..q12EB#(U#4H#4aHQKq#4H(((3_DchhuoVV:   B I .BHs   CC
C'C   )F)__name__
__module____qualname____doc__staticmethodr&   rB   r#   rN   rV   rd   r*   r'   r%   r   r   '   s>       -1; 
 @r'   r   c                   >    e Zd ZdZd fdZd Zd Zd Zd
dZddZ	y	)ConcordanceIndexzs
    An index that can be used to look up the offset locations at which
    a given word occurs in a document.
    c                     | S r)   r*   r+   s    r%   r-   zConcordanceIndex.<lambda>   s    Qr'   c                     || _         	 || _        	 t        t              | _        	 t        |      D ]4  \  }}| j                  |      }| j                  |   j                  |       6 y)a  
        Construct a new concordance index.

        :param tokens: The document (list of tokens) that this
            concordance index was created from.  This list can be used
            to access the context of a given word occurrence.
        :param key: A function that maps each token to a normalized
            version that will be used as a key in the index.  E.g., if
            you use ``key=lambda s:s.lower()``, then the index will be
            case-insensitive.
        N)r9   r0   r   list_offsetsr;   append)r5   r#   r@   indexrJ   s        r%   rB   zConcordanceIndex.__init__   sb     	  	D#D)L$V,KE499T?DMM$&&u- -r'   c                     | j                   S )z{
        :rtype: list(str)
        :return: The document that this concordance index was
            created from.
        rD   rE   s    r%   r#   zConcordanceIndex.tokens   rF   r'   c                 B    | j                  |      }| j                  |   S )z
        :rtype: list(int)
        :return: A list of the offset positions at which the given
            word occurs.  If a key function was specified for the
            index, then given word's key will be looked up.
        )r0   rq   r5   rJ   s     r%   offsetszConcordanceIndex.offsets   s      yy}}T""r'   c                 \    dt        | j                        t        | j                        fz  S )Nz+<ConcordanceIndex for %d tokens (%d types)>)r"   r9   rq   rE   s    r%   __repr__zConcordanceIndex.__repr__   s-    <@
 
 	
r'   c           
      H   t        |t              r|}n|g}dj                  |      }t        d |D              }||z
  dz
  dz  }|dz  }g }| j	                  |d         }	t        |dd       D ]C  \  }
}| j	                  |      D ch c]
  }||
z
  dz
   }}t        |j                  |	            }	E |	r|	D ]  }
dj                  | j                  |
|
t        |      z          }| j                  t        d|
|z
        |
 }| j                  |
t        |      z   |
|z    }t        dj                  |      |       j                  |      }t        dj                  |      |      }dj                  |||g      }t        ||||
|||      }|j                  |        |S c c}w )z
        Find all concordance lines given the query word.

        Provided with a list of words, these will be found as a phrase.
        rX   c              3   L   K   | ]  }t        j                  |      rd   yw)r   N)unicodedata	combining)r3   chars     r%   r6   z4ConcordanceIndex.find_concordance.<locals>.<genexpr>   s     Uzt9N9Nt9Tzs   $$      r   r   N)
isinstancerp   r^   sumrw   r;   rR   r\   r9   r"   maxr   rjustr   rr   )r5   rJ   widthphrase
phrase_str
phrase_len
half_widthcontextconcordance_listrw   r$   r   word_offsets
query_wordleft_contextright_contextr   r   
line_printconcordance_lines                       r%   find_concordancez!ConcordanceIndex.find_concordance   s    dD!FVFXXf%
UzUU
j(1,2
1* ,,vay) ,GAt9=d9KL9KvFQJN9KLL\66w?@G -  XXdll1q3v;&GH
#||C1w;,?!D $QV_q7{ K'(>LRR
 )-)@*M XXz:{&KL
#2 !$  !''(89- .  5 Ms   Fc                     | j                  ||      }|st        d       yt        |t        |            }t        d| dt        |       d       t	        |d|       D ]  \  }}t        |j
                          y)a  
        Print concordance lines given the query word.
        :param word: The target word or phrase (a list of strings)
        :type word: str or list
        :param lines: The number of lines to display (default=25)
        :type lines: int
        :param width: The width of each line, in characters (default=80)
        :type width: int
        :param save: The option to save the concordance.
        :type save: bool
        )r   z
no matcheszDisplaying z of z	 matches:N)r   printminr"   r;   r   )r5   rJ   r   linesr   r$   r   s          r%   print_concordancez"ConcordanceIndex.print_concordance   s      00U0C,s#345EKwd3/?+@*AKL'01A&51I'J##&++, (Kr'   N)P   )r      )
rg   rh   ri   rj   rB   r#   rw   ry   r   r   r*   r'   r%   rm   rm      s+    
 $/ .4#
. `-r'   rm   c                       e Zd ZdZd Zd Zy)TokenSearchera  
    A class that makes it easier to use regular expressions to search
    over tokenized strings.  The tokenized string is converted to a
    string where tokens are marked with angle brackets -- e.g.,
    ``'<the><window><is><still><open>'``.  The regular expression
    passed to the ``findall()`` method is modified to treat angle
    brackets as non-capturing parentheses, in addition to matching the
    token boundaries; and to have ``'.'`` not match the angle brackets.
    c                 >    dj                  d |D              | _        y )N c              3   ,   K   | ]  }d |z   dz     yw)<>Nr*   )r3   r4   s     r%   r6   z)TokenSearcher.__init__.<locals>.<genexpr>  s     :6aC!GcM6s   )r^   _raw)r5   r#   s     r%   rB   zTokenSearcher.__init__  s    GG:6::	r'   c                    t        j                  dd|      }t        j                  dd|      }t        j                  dd|      }t        j                  dd|      }t        j                  || j                        }|D ]0  }|j	                  d      r|j                  d      s't        d	       |D cg c]  }|d
d j                  d       }}|S c c}w )a  
        Find instances of the regular expression in the text.
        The text is a list of tokens, and a regexp pattern to match
        a single token must be surrounded by angle brackets.  E.g.

        >>> from nltk.text import TokenSearcher
        >>> from nltk.book import text1, text5, text9
        >>> text5.findall("<.*><.*><bro>")
        you rule bro; telling you bro; u twizted bro
        >>> text1.findall("<a>(<.*>)<man>")
        monied; nervous; dangerous; white; white; white; pious; queer; good;
        mature; white; Cape; great; wise; wise; butterless; white; fiendish;
        pale; furious; better; certain; complete; dismasted; younger; brave;
        brave; brave; brave
        >>> text9.findall("<th.*>{3,}")
        thread through those; the thought that; that the thing; the thing
        that; that that thing; through these than through; them that the;
        through the thick; them that they; thought that the

        :param regexp: A regular expression
        :type regexp: str
        z\sr   r   z(?:<(?:r   z)>)z	(?<!\\)\.z[^>]z$Bad regexp for TokenSearcher.findallr   z><)resubfindallr   
startswithendswithr]   splitr5   regexphitshs       r%   r   zTokenSearcher.findall  s    0 r6*i0eV,ff5 zz&$)), A<<$C !GHH 
 .22T!Bd#T2 3s   6CN)rg   rh   ri   rj   rB   r   r*   r'   r%   r   r     s    ;'r'   r   c                       e Zd ZdZdZddZd Zd ZddZddZ	dd	Z
dd
Zd Zd Zd ZddZddZd ZddZddZd Zd Zd Z ej0                  d      Zd Zd Zd Zy) Texta  
    A wrapper around a sequence of simple (string) tokens, which is
    intended to support initial exploration of texts (via the
    interactive console).  Its methods perform a variety of analyses
    on the text's contexts (e.g., counting, concordancing, collocation
    discovery), and display the results.  If you wish to write a
    program which makes use of these analyses, then you should bypass
    the ``Text`` class, and use the appropriate analysis function or
    class directly instead.

    A ``Text`` is typically initialized from a given document or
    corpus.  E.g.:

    >>> import nltk.corpus
    >>> from nltk.text import Text
    >>> moby = Text(nltk.corpus.gutenberg.words('melville-moby_dick.txt'))

    TNc                    | j                   rt        |      }|| _        |r|| _        yd|dd v r5|dd j	                  d      }dj                  d |d| D              | _        ydj                  d |dd D              d	z   | _        y)
zv
        Create a Text object.

        :param tokens: The source text.
        :type tokens: sequence of str
        ]Nrf   rX   c              3   2   K   | ]  }t        |        y wr)   strr3   toks     r%   r6   z Text.__init__.<locals>.<genexpr>b  s      C]cS]   r   c              3   2   K   | ]  }t        |        y wr)   r   r   s     r%   r6   z Text.__init__.<locals>.<genexpr>d  s      @ZcSZr      z...)_COPY_TOKENSrp   r#   namers   r^   )r5   r#   r   ends       r%   rB   zText.__init__S  s     &\FDIF3BK"+##C(C CVAc] CCDI @VBQZ @@5HDIr'   c                      | j                   |   S r)   )r#   )r5   r$   s     r%   __getitem__zText.__getitem__j  s    {{1~r'   c                 ,    t        | j                        S r)   )r"   r#   rE   s    r%   __len__zText.__len__m  s    4;;r'   c                     d| j                   vrt        | j                  d       | _        | j                  j	                  |||      S )a  
        Prints a concordance for ``word`` with the specified context window.
        Word matching is not case-sensitive.

        :param word: The target word or phrase (a list of strings)
        :type word: str or list
        :param width: The width of each line, in characters (default=80)
        :type width: int
        :param lines: The number of lines to display (default=25)
        :type lines: int

        :seealso: ``ConcordanceIndex``
        _concordance_indexc                 "    | j                         S r)   r!   ss    r%   r-   z"Text.concordance.<locals>.<lambda>  
    1779r'   r@   )__dict__rm   r#   r   r   r5   rJ   r   r   s       r%   concordancezText.concordancet  sD      t}}4&6!4'D# &&88ueLLr'   c                     d| j                   vrt        | j                  d       | _        | j                  j	                  ||      d| S )a  
        Generate a concordance for ``word`` with the specified context window.
        Word matching is not case-sensitive.

        :param word: The target word or phrase (a list of strings)
        :type word: str or list
        :param width: The width of each line, in characters (default=80)
        :type width: int
        :param lines: The number of lines to display (default=25)
        :type lines: int

        :seealso: ``ConcordanceIndex``
        r   c                 "    | j                         S r)   r   r   s    r%   r-   z'Text.concordance_list.<locals>.<lambda>  r   r'   r   N)r   rm   r#   r   r   r   s       r%   r   zText.concordance_list  sI      t}}4&6!4'D# &&77eDVeLLr'   c                    d| j                   v r| j                  |k(  r| j                  |k(  s|| _        || _        ddlm} |j                  d      t        j                  | j                  |      }|j                  d       |j                  fd       t               }t        |j                  |j                  |            | _        | j                  S )a  
        Return collocations derived from the text, ignoring stopwords.

            >>> from nltk.book import text4
            >>> text4.collocation_list()[:2]
            [('United', 'States'), ('fellow', 'citizens')]

        :param num: The maximum number of collocations to return.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        :rtype: list(tuple(str, str))
        _collocationsr   )	stopwordsenglishr   c                 H    t        |       dk  xs | j                         v S )N   )r"   r!   )r4   ignored_wordss    r%   r-   z'Text.collocation_list.<locals>.<lambda>  s     s1vz/WQWWY-=W/Wr'   )r   _num_window_sizenltk.corpusr   r_   r   
from_wordsr#   apply_freq_filterapply_word_filterr   rp   nbestlikelihood_ratior   )r5   numwindow_sizer   finderbigram_measuresr   s         @r%   collocation_listzText.collocation_list  s     t}},		S !![0DI +D .%OOI6M,77[QF$$Q'$$%WX13O!%_==sC"D !!!r'   c                     | j                  ||      D cg c]  \  }}|dz   |z    }}}t        t        |d             yc c}}w )a  
        Print collocations derived from the text, ignoring stopwords.

            >>> from nltk.book import text4
            >>> text4.collocations() # doctest: +NORMALIZE_WHITESPACE
            United States; fellow citizens; years ago; four years; Federal
            Government; General Government; American people; Vice President; God
            bless; Chief Justice; one another; fellow Americans; Old World;
            Almighty God; Fellow citizens; Chief Magistrate; every citizen; Indian
            tribes; public debt; foreign nations


        :param num: The maximum number of collocations to print.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        rX   ; )	separatorN)r   r   r   )r5   r   r   w1w2collocation_stringss         r%   collocationszText.collocations  sQ    ( )-(=(=c;(O
(Ofb"BHrM(O 	 
 	i+t<=
s   Ac                 8    | j                   j                  |      S )zJ
        Count the number of times this word appears in the text.
        )r#   countrv   s     r%   r   z
Text.count       {{  &&r'   c                 8    | j                   j                  |      S )zQ
        Find the index of the first occurrence of the word in the text.
        )r#   rs   rv   s     r%   rs   z
Text.index  r   r'   c                     t         r)   )NotImplementedError)r5   methods     r%   readabilityzText.readability  s    !!r'   c                    d| j                   vrt        | j                  d d       | _        j	                         | j                  j
                  j                         v rjt                 t        fdj                         D              }|j                  |      D cg c]  \  }}|	 }}}t        t        |             yt        d       yc c}}w )a~  
        Distributional similarity: find other words which appear in the
        same contexts as the specified word; list most similar words first.

        :param word: The word used to seed the similarity search
        :type word: str
        :param num: The number of words to generate (default=20)
        :type num: int
        :seealso: ContextIndex.similar_words()
        _word_context_indexc                 "    | j                         S r)   )isalphar+   s    r%   r-   zText.similar.<locals>.<lambda>  s
    aiikr'   c                 "    | j                         S r)   r   r   s    r%   r-   zText.similar.<locals>.<lambda>  s
    r'   )r?   r@   c              3   H   K   | ]  }|   D ]  }|v r	|k(  s|   y wr)   r*   )r3   r4   rU   ra   wcirJ   s      r%   r6   zText.similar.<locals>.<genexpr>  s7      )AQA=d  )s   "z
No matchesN)r   r   r#   r   r!   r<   
conditionsrH   r   most_commonr   r   )	r5   rJ   r   rc   r4   _r_   ra   r   s	    `     @@r%   similarzText.similar  s     !5'3$9?R(D$ zz|&&883>>##3t9~H ) B $&>>##67#641aQ#6E7)E"#, 8s   /Cc                 z   d| j                   vrt        | j                  d       | _        	 | j                  j	                  |d      }|st        d       y|j                  |      D cg c]  \  }}|	 }}}t        t        d |D                     yc c}}w # t        $ r}t        |       Y d}~yd}~ww xY w)aY  
        Find contexts where the specified words appear; list
        most frequent common contexts first.

        :param words: The words used to seed the similarity search
        :type words: str
        :param num: The number of words to generate (default=20)
        :type num: int
        :seealso: ContextIndex.common_contexts()
        r   c                 "    | j                         S r)   r   r   s    r%   r-   z&Text.common_contexts.<locals>.<lambda>  r   r'   r   TzNo common contexts were foundc              3   2   K   | ]  \  }}|d z   |z     yw)r   Nr*   )r3   r   r   s      r%   r6   z'Text.common_contexts.<locals>.<genexpr>!  s     LO&"bS2Or   N)	r   r   r#   r   rd   r   r   r   r]   )r5   r_   r   rc   r4   r   ranked_contextses           r%   rd   zText.common_contexts
  s     !5'3!4(D$		))99%FB56131D"E1DA11D"EiLOLLM #F  	!HH	s/   )B B +B7B B 	B:%B55B:c                 "    ddl m}  || |       y)z
        Produce a plot showing the distribution of the words through the text.
        Requires pylab to be installed.

        :param words: The words to be plotted
        :type words: list(str)
        :seealso: nltk.draw.dispersion_plot()
        r   )dispersion_plotN)	nltk.drawr  )r5   r_   r  s      r%   r  zText.dispersion_plot&  s     	.e$r'   c                 `    t        ||      \  }}t        |      }|j                  ||       |S )N)order)r
   r	   fit)r5   tokenized_sentsrT   
train_datapadded_sentsmodels         r%   _train_default_ngram_lmzText._train_default_ngram_lm3  s/    #<Q#P 
L!		*l+r'   c                    t        dj                  | j                              D cg c]  }|j                  d       c}| _        t        | d      s=t        dt        j                         | j                  | j                  d      | _
        g }|dkD  sJ d       t        |      |k  rat        | j                  j                  |||	            D ]#  \  }}|d
k(  r|dk(  r n|j                  |       % |dz  }t        |      |k  ra|rdj                  |      dz   nd}|t        |d|       z   }	t        |	       |	S c c}w )a  
        Print random text, generated using a trigram language model.
        See also `help(nltk.lm)`.

        :param length: The length of text to generate (default=100)
        :type length: int

        :param text_seed: Generation can be conditioned on preceding context.
        :type text_seed: list(str)

        :param random_seed: A random seed or an instance of `random.Random`. If provided,
            makes the random sampling part of generation reproducible. (default=42)
        :type random_seed: int
        rX   _trigram_modelzBuilding ngram index...)filer   )rT   r   z!The `length` must be more than 0.)	text_seedrandom_seedz<s>z</s>r   r   N)r   r^   r#   r   _tokenized_sentshasattrr   sysstderrr  r  r"   r;   generaterr   r   )
r5   lengthr  r  sentgenerated_tokensidxtokenprefix
output_strs
             r%   r  zText.generate9  sX   " )6chht{{6K(L!
(LDJJsO(L!
 t-.+#**="&">">%% #? #D z>>>z"#f,'##,,i[ - 
U
 E>F? ''. 1K "#f, /8)$s*Ri(8&(ABB
j9!
s   Ec                 <     | j                         j                  | S )zc
        See documentation for FreqDist.plot()
        :seealso: nltk.prob.FreqDist.plot()
        )vocabplot)r5   argss     r%   r  z	Text.plotg  s    
 !tzz|  $''r'   c                 V    d| j                   vrt        |       | _        | j                  S )z.
        :seealso: nltk.prob.FreqDist
        _vocab)r   r   r"  rE   s    r%   r  z
Text.vocabn  s%     4==("4.DK{{r'   c                     d| j                   vrt        |       | _        | j                  j                  |      }|D cg c]  }dj	                  |       }}t        t        |d             yc c}w )a  
        Find instances of the regular expression in the text.
        The text is a list of tokens, and a regexp pattern to match
        a single token must be surrounded by angle brackets.  E.g.

        >>> from nltk.book import text1, text5, text9
        >>> text5.findall("<.*><.*><bro>")
        you rule bro; telling you bro; u twizted bro
        >>> text1.findall("<a>(<.*>)<man>")
        monied; nervous; dangerous; white; white; white; pious; queer; good;
        mature; white; Cape; great; wise; wise; butterless; white; fiendish;
        pale; furious; better; certain; complete; dismasted; younger; brave;
        brave; brave; brave
        >>> text9.findall("<th.*>{3,}")
        thread through those; the thought that; that the thing; the thing
        that; that that thing; through these than through; them that the;
        through the thick; them that they; thought that the

        :param regexp: A regular expression
        :type regexp: str
        _token_searcherrX   r   N)r   r   r$  r   r^   r   r   r   s       r%   r   zText.findallw  sc    . DMM1#0#6D ##++F3%)*TT*id#$ +s   A.z\w+|[\.\!\?]c                    |dz
  }|dk\  rG| j                   j                  ||         s)|dz  }|dk\  r| j                   j                  ||         s)|dk7  r||   nd}|dz   }|t        |      k  rP| j                   j                  ||         s2|dz  }|t        |      k  r| j                   j                  ||         s2|t        |      k7  r||   nd}||fS )z
        One left & one right token, both case-normalized.  Skip over
        non-sentence-final punctuation.  Used by the ``ContextIndex``
        that is created for ``similar()`` and ``common_contexts()``.
        r   r   r   r    )_CONTEXT_REmatchr"   )r5   r#   r$   jr   r   s         r%   _contextzText._context  s     E1fT--33F1I>FA 1fT--33F1I>Fvay	 E#f+od&6&6&<&<VAY&GFA #f+od&6&6&<&<VAY&G#f+-q	7e}r'   c                      d| j                   z  S Nz
<Text: %s>r   rE   s    r%   __str__zText.__str__      dii''r'   c                      d| j                   z  S r+  r,  rE   s    r%   ry   zText.__repr__  r.  r'   r)   )O   r   )rf   r   re   )r   )d   N*   )rg   rh   ri   rj   r   rB   r   r   r   r   r   r   r   rs   r   r   rd   r  r  r  r  r  r   r   compiler&  r)  r-  ry   r*   r'   r%   r   r   9  s    . LI. M*M(!"F>0''"  D8%,\(%D "**_-K0((r'   r   c                   (    e Zd ZdZd Zd Zd Zd Zy)TextCollectiona;  A collection of texts, which can be loaded with list of texts, or
    with a corpus consisting of one or more texts, and which supports
    counting, concordancing, collocation discovery, etc.  Initialize a
    TextCollection as follows:

    >>> import nltk.corpus
    >>> from nltk.text import TextCollection
    >>> from nltk.book import text1, text2, text3
    >>> gutenberg = TextCollection(nltk.corpus.gutenberg)
    >>> mytexts = TextCollection([text1, text2, text3])

    Iterating over a TextCollection produces all the tokens of all the
    texts in order.
    c                     t        |d      r,|j                         D cg c]  }|j                  |       }}|| _        t        j                  | t        |             i | _        y c c}w )Nr_   )r  fileidsr_   _textsr   rB   r   
_idf_cache)r5   sourcefs      r%   rB   zTextCollection.__init__  sY    67#/5~~/?@/?!fll1o/?F@d-f56	 As   A'c                 <    |j                  |      t        |      z  S )z"The frequency of the term in text.)r   r"   r5   termtexts      r%   tfzTextCollection.tf  s    zz$#d)++r'   c                 H   | j                   j                  |      }|t        | j                  D cg c]	  }||v sd c}      }t        | j                        dk(  rt	        d      |r!t        t        | j                        |z        nd}|| j                   |<   |S c c}w )zThe number of texts in the corpus divided by the
        number of texts that the term appears in.
        If a term does not appear in the corpus, 0.0 is returned.Tr   z+IDF undefined for empty document collectiong        )r9  rS   r"   r8  r]   r   )r5   r>  idfr?  matchess        r%   rB  zTextCollection.idf  s    
 oo!!$';DKKHKD44<4KHIG4;;1$ !NOO5<#c$++&01#C$'DOOD!
 Is
   	BBc                 J    | j                  ||      | j                  |      z  S r)   )r@  rB  r=  s      r%   tf_idfzTextCollection.tf_idf  s     wwtT"TXXd^33r'   N)rg   rh   ri   rj   rB   r@  rB  rE  r*   r'   r%   r5  r5    s    ,4r'   r5  c                  z   ddl m}  t        | j                  d            }t	        |       t	                t	        d       |j                  d       t	                t	        d       |j                  d       t	                t	        d       |j                          t	                t	        d       |j                  g d	       t	                t	        d
       |j                  d       t	                t	        d       t	        d|d          t	        d|dd        t	        d|j                         d          y )Nr   )brownnews)
categorieszConcordance:zDistributionally similar words:zCollocations:zDispersion plot:)rH  reportsaid	announcedzVocabulary plot:2   z	Indexing:ztext[3]:r   z
text[3:5]:   ztext.vocab()['news']:)r   rG  r   r_   r   r   r   r   r  r  r  )rG  r?  s     r%   demorO    s    !v./D	$K	G	.V	G	
+,LL	G	/	G 

@A	G	
IIbM	G	+	*d1g	,Qq	"	
!4::<#78r'   __main__)r   rm   r   r   r5  )(rj   r   r  r|   collectionsr   r   r   	functoolsr   mathr   nltk.collocationsr   nltk.lmr	   nltk.lm.preprocessingr
   nltk.metricsr   r   nltk.probabilityr   r:   r   nltk.tokenizer   	nltk.utilr   r   r   r   r   rm   r   r   r5  rO  rg   __all__r*   r'   r%   <module>r\     s    
 
  8 8   5  ; 7 7 % ' > >MX Xv|- |-~5 5p~( ~(D+4T +4\9< zFr'   