
"""
Tokenizer Interface
"""

from abc import ABC, abstractmethod
from typing import Iterator, List, Tuple

from nltk.internals import overridden
from nltk.tokenize.util import string_span_tokenize


class TokenizerI(ABC):
    """
    A processing interface for tokenizing a string.
    Subclasses must define ``tokenize()`` or ``tokenize_sents()`` (or both).
    """

    @abstractmethod
    def tokenize(self, s: str) -> List[str]:
        """
        Return a tokenized copy of *s*.

        :rtype: List[str]
        """
        if overridden(self.tokenize_sents):
            return self.tokenize_sents([s])[0]

    def span_tokenize(self, s: str) -> Iterator[Tuple[int, int]]:
        """
        Identify the tokens using integer offsets ``(start_i, end_i)``,
        where ``s[start_i:end_i]`` is the corresponding token.

        :rtype: Iterator[Tuple[int, int]]
        """
        raise NotImplementedError()

    def tokenize_sents(self, strings: List[str]) -> List[List[str]]:
        """
        Apply ``self.tokenize()`` to each element of ``strings``.  I.e.:

            return [self.tokenize(s) for s in strings]

        :rtype: List[List[str]]
        """
        return [self.tokenize(s) for s in strings]

    def span_tokenize_sents(
        self, strings: List[str]
    ) -> Iterator[List[Tuple[int, int]]]:
        """
        Apply ``self.span_tokenize()`` to each element of ``strings``.  I.e.:

            return [self.span_tokenize(s) for s in strings]

        :yield: List[Tuple[int, int]]
        """
        for s in strings:
            yield list(self.span_tokenize(s))
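

# The block below is an illustrative sketch, not part of the NLTK module:
# it shows one way a subclass can satisfy the TokenizerI contract.  The
# class name and the ``\S+`` regex are assumptions made for this example
# only.  ``span_tokenize()`` yields ``(start, end)`` offsets, and
# ``tokenize()`` simply slices the input with them, so the two views of
# the same string stay in sync.
import re  # needed only by the example class below


class _ExampleRegexpSpanTokenizer(TokenizerI):
    """Treat each maximal run of non-whitespace characters as a token."""

    def tokenize(self, s: str) -> List[str]:
        # Derive the surface tokens from the character offsets.
        return [s[start:end] for start, end in self.span_tokenize(s)]

    def span_tokenize(self, s: str) -> Iterator[Tuple[int, int]]:
        # Each regex match corresponds to exactly one token span.
        for match in re.finditer(r"\S+", s):
            yield match.span()


# Expected behaviour of the sketch above:
#   _ExampleRegexpSpanTokenizer().tokenize("good muffins cost $3.88")
#   -> ['good', 'muffins', 'cost', '$3.88']
#   list(_ExampleRegexpSpanTokenizer().span_tokenize("a  bc"))
#   -> [(0, 1), (3, 5)]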


class StringTokenizer(TokenizerI):
    """A tokenizer that divides a string into substrings by splitting
    on the specified string (defined in subclasses).
    """

    @property
    @abstractmethod
    def _string(self):
        raise NotImplementedError

    def tokenize(self, s):
        return s.split(self._string)

    def span_tokenize(self, s):
        yield from string_span_tokenize(s, self._string)