
r"""
Simple Tokenizers

These tokenizers divide strings into substrings using the string
``split()`` method.
When tokenizing using a particular delimiter string, use
the string ``split()`` method directly, as this is more efficient.

The simple tokenizers are *not* available as separate functions;
instead, you should just use the string ``split()`` method directly:

    >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
    >>> s.split() # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
    'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
    >>> s.split(' ') # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '',
    'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
    >>> s.split('\n') # doctest: +NORMALIZE_WHITESPACE
    ['Good muffins cost $3.88', 'in New York.  Please buy me',
    'two of them.', '', 'Thanks.']

The simple tokenizers are mainly useful because they follow the
standard ``TokenizerI`` interface, and so can be used with any code
that expects a tokenizer.  For example, these tokenizers can be used
to specify the tokenization conventions when building a `CorpusReader`.
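
For instance, a simple tokenizer can be passed to ``PlaintextCorpusReader``,
which accepts a ``word_tokenizer`` argument (a sketch; the directory and
file pattern here are hypothetical):

    >>> from nltk.corpus.reader import PlaintextCorpusReader # doctest: +SKIP
    >>> reader = PlaintextCorpusReader('corpus_dir', r'.*\.txt',
    ...                                word_tokenizer=LineTokenizer()) # doctest: +SKIP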

"""

from nltk.tokenize.api import StringTokenizer, TokenizerI
from nltk.tokenize.util import regexp_span_tokenize, string_span_tokenize


class SpaceTokenizer(StringTokenizer):
    r"""Tokenize a string using the space character as a delimiter,
    which is the same as ``s.split(' ')``.

        >>> from nltk.tokenize import SpaceTokenizer
        >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
        >>> SpaceTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
        ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '',
        'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
    """

    _string = " "


class TabTokenizer(StringTokenizer):
    r"""Tokenize a string using the tab character as a delimiter,
    the same as ``s.split('\t')``.

        >>> from nltk.tokenize import TabTokenizer
        >>> TabTokenizer().tokenize('a\tb c\n\t d')
        ['a', 'b c\n', ' d']
    """

    _string = "\t"


class CharTokenizer(StringTokenizer):
    """Tokenize a string into individual characters.  If this functionality
    is ever required directly, use ``for char in string``.
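
    For example:

        >>> from nltk.tokenize.simple import CharTokenizer
        >>> CharTokenizer().tokenize('abc')
        ['a', 'b', 'c']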
    """

    _string = None

    def tokenize(self, s):
        return list(s)

    def span_tokenize(self, s):
        yield from enumerate(range(1, len(s) + 1))


class LineTokenizer(TokenizerI):
    r"""Tokenize a string into its lines, optionally discarding blank lines.
    This is similar to ``s.split('\n')``.

        >>> from nltk.tokenize import LineTokenizer
        >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
        >>> LineTokenizer(blanklines='keep').tokenize(s) # doctest: +NORMALIZE_WHITESPACE
        ['Good muffins cost $3.88', 'in New York.  Please buy me',
        'two of them.', '', 'Thanks.']
        >>> # same as [l for l in s.split('\n') if l.strip()]:
        >>> LineTokenizer(blanklines='discard').tokenize(s) # doctest: +NORMALIZE_WHITESPACE
        ['Good muffins cost $3.88', 'in New York.  Please buy me',
        'two of them.', 'Thanks.']
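
        >>> # ``discard-eof`` removes only a blank line at the very end:
        >>> LineTokenizer(blanklines='discard-eof').tokenize('one\ntwo\n\n')
        ['one', 'two']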

    :param blanklines: Indicates how blank lines should be handled.  Valid values are:

        - ``discard``: strip blank lines out of the token list before returning it.
           A line is considered blank if it contains only whitespace characters.
        - ``keep``: leave all blank lines in the token list.
        - ``discard-eof``: if the string ends with a newline, then do not generate
           a corresponding token ``''`` after that newline.
    """

    def __init__(self, blanklines="discard"):
        valid_blanklines = ("discard", "keep", "discard-eof")
        if blanklines not in valid_blanklines:
            raise ValueError(
                "Blank lines must be one of: %s" % " ".join(valid_blanklines)
            )

        self._blanklines = blanklines

    def tokenize(self, s):
        lines = s.splitlines()
        # If requested, strip off blank lines.
        if self._blanklines == "discard":
            lines = [l for l in lines if l.rstrip()]
        elif self._blanklines == "discard-eof":
            if lines and not lines[-1].strip():
                lines.pop()
        return lines

    def span_tokenize(self, s):
        if self._blanklines == "keep":
            yield from string_span_tokenize(s, r"\n")
        else:
            # A newline plus any following blank lines counts as a single
            # separator, so blank lines yield no spans of their own.
            yield from regexp_span_tokenize(s, r"\n(\s+\n)*")


def line_tokenize(text, blanklines="discard"):
    return LineTokenizer(blanklines).tokenize(text)
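

if __name__ == "__main__":
    # Minimal demo: exercise each tokenizer on the sample string used in
    # the doctests above.
    demo = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
    print(SpaceTokenizer().tokenize(demo))
    print(TabTokenizer().tokenize(demo))
    print(CharTokenizer().tokenize("abc"))
    print(LineTokenizer(blanklines="discard").tokenize(demo))
    print(line_tokenize(demo))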