
r"""
NLTK Tokenizer Package

Tokenizers divide strings into lists of substrings.  For example,
tokenizers can be used to find the words and punctuation in a string:

    >>> from nltk.tokenize import word_tokenize
    >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me
    ... two of them.\n\nThanks.'''
    >>> word_tokenize(s) # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.',
    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']

This particular tokenizer requires the Punkt sentence tokenization
models to be installed. NLTK also provides a simpler,
regular-expression based tokenizer, which splits text on whitespace
and punctuation:

    >>> from nltk.tokenize import wordpunct_tokenize
    >>> wordpunct_tokenize(s) # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.',
    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']

We can also operate at the level of sentences, using the sentence
tokenizer directly as follows:

    >>> from nltk.tokenize import sent_tokenize, word_tokenize
    >>> sent_tokenize(s)
    ['Good muffins cost $3.88\nin New York.', 'Please buy me\ntwo of them.', 'Thanks.']
    >>> [word_tokenize(t) for t in sent_tokenize(s)] # doctest: +NORMALIZE_WHITESPACE
    [['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.'],
    ['Please', 'buy', 'me', 'two', 'of', 'them', '.'], ['Thanks', '.']]

Caution: when tokenizing a Unicode string, make sure you are not
using an encoded version of the string (it may be necessary to
decode it first, e.g. with ``s.decode("utf8")``).
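
For instance, a UTF-8 encoded byte string can be decoded to text before it
is handed to the tokenizer:

    >>> word_tokenize(b"Thanks.".decode("utf8"))
    ['Thanks', '.']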

NLTK tokenizers can produce token-spans, represented as tuples of integers
having the same semantics as string slices, to support efficient comparison
of tokenizers.  (These methods are implemented as generators.)

    >>> from nltk.tokenize import WhitespaceTokenizer
    >>> list(WhitespaceTokenizer().span_tokenize(s)) # doctest: +NORMALIZE_WHITESPACE
    [(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36), (38, 44),
    (45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]
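
Because the spans follow string-slice semantics, the original substrings can
be recovered directly; for illustration:

    >>> [s[start:end] for start, end in WhitespaceTokenizer().span_tokenize(s)] # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.', 'Please', 'buy',
    'me', 'two', 'of', 'them.', 'Thanks.']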

There are numerous ways to tokenize text.  If you need more control over
tokenization, see the other methods provided in this package.
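
For example, a custom ``RegexpTokenizer`` pattern (shown here purely as an
illustration) keeps the currency amount together as a single token:

    >>> from nltk.tokenize import RegexpTokenizer
    >>> RegexpTokenizer(r'\w+|\$[\d\.]+|\S+').tokenize(s) # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.', 'Please',
    'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']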

For further information, please see Chapter 3 of the NLTK book.
    N)load)TweetTokenizercasual_tokenize)NLTKWordTokenizer)LegalitySyllableTokenizer)MWETokenizer)PunktSentenceTokenizerPunktTokenizer)BlanklineTokenizerRegexpTokenizerWhitespaceTokenizerWordPunctTokenizerblankline_tokenizeregexp_tokenizewordpunct_tokenize)ReppTokenizer)SExprTokenizersexpr_tokenize)LineTokenizerSpaceTokenizerTabTokenizerline_tokenize)SyllableTokenizer)StanfordSegmenter)TextTilingTokenizer)ToktokTokenizer)TreebankWordDetokenizerTreebankWordTokenizer)regexp_span_tokenizestring_span_tokenizec                     t        |       S )z
    A constructor for the PunktTokenizer that utilizes
    a lru cache for performance.

    :param language: the model name in the Punkt corpus
    :type language: str
    """
    return PunktTokenizer(language)


def sent_tokenize(text, language="english"):
    """
    Return a sentence-tokenized copy of *text*,
    using NLTK's recommended sentence tokenizer
    (currently :class:`.PunktSentenceTokenizer`
    for the specified language).
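
    For example (the Punkt sentence models must be installed):

        >>> sent_tokenize("Hello, world. How are you?")
        ['Hello, world.', 'How are you?']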

    :param text: text to split into sentences
    :param language: the model name in the Punkt corpus
    """
    tokenizer = _get_punkt_tokenizer(language)
    return tokenizer.tokenize(text)


# Standard word tokenizer.
_treebank_word_tokenizer = NLTKWordTokenizer()


def word_tokenize(text, language="english", preserve_line=False):
    """
    Return a tokenized copy of *text*,
    using NLTK's recommended word tokenizer
    (currently an improved :class:`.TreebankWordTokenizer`
    along with :class:`.PunktSentenceTokenizer`
    for the specified language).
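
    For example (an illustrative doctest; contractions are split by the
    Treebank-style tokenizer):

        >>> word_tokenize("They'll save and invest more.")
        ['They', "'ll", 'save', 'and', 'invest', 'more', '.']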

    :param text: text to split into words
    :type text: str
    :param language: the model name in the Punkt corpus
    :type language: str
    :param preserve_line: A flag to decide whether to sentence tokenize the text or not.
    :type preserve_line: bool
    """
    sentences = [text] if preserve_line else sent_tokenize(text, language)
    return [
        token for sent in sentences for token in _treebank_word_tokenizer.tokenize(sent)
    ]