import re
import warnings
from typing import Iterator, List, Tuple

from nltk.tokenize.api import TokenizerI
from nltk.tokenize.util import align_tokens


class MacIntyreContractions:
    """
    List of contractions adapted from Robert MacIntyre's tokenizer.
    """

    CONTRACTIONS2 = [
        r"(?i)\b(can)(?#X)(not)\b",
        r"(?i)\b(d)(?#X)('ye)\b",
        r"(?i)\b(gim)(?#X)(me)\b",
        r"(?i)\b(gon)(?#X)(na)\b",
        r"(?i)\b(got)(?#X)(ta)\b",
        r"(?i)\b(lem)(?#X)(me)\b",
        r"(?i)\b(more)(?#X)('n)\b",
        r"(?i)\b(wan)(?#X)(na)(?=\s)",
    ]
    CONTRACTIONS3 = [r"(?i) ('t)(?#X)(is)\b", r"(?i) ('t)(?#X)(was)\b"]
    CONTRACTIONS4 = [r"(?i)\b(whad)(dd)(ya)\b", r"(?i)\b(wha)(t)(cha)\b"]
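
# Note: the "(?#X)" groups in the patterns above are regex comments; Python's
# `re` treats them as no-ops, so each pattern simply captures the two (or
# three) pieces of a contraction, e.g. CONTRACTIONS2[0] splits "cannot" into
# ("can", "not"). NLTKWordTokenizer below compiles CONTRACTIONS2/3 and
# rewrites each match as " \1 \2 ", separating the pieces into tokens;
# CONTRACTIONS4 is defined but not used by the tokenizer.
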
                  dej                        df ej
                  d      df ej
                  d      df ej
                  d      df ej
                  d	ej                        d
fgZ ej
                  dej                        df ej
                  d      df ej
                  d      df ej
                  d      df ej
                  d      df ej
                  d      dfgZ ej
                  dej                        df ej
                  d      df ej
                  d      df ej
                  dej                        df ej
                  d      df ej
                  d      df ej
                  d      df ej
                  d      d f ej
                  d!ej                        dfg	Z	 ej
                  d"      dfZ
 ej
                  d#      d$f ej
                  d%      d&f ej
                  d'      d(f ej
                  d)      d*f ej
                  d+      d,f ej
                  d-      d.fgZ ej
                  d/      d0fZ e       Z e eej
                  ej"                              Z e eej
                  ej$                              Z	 d8d1ed2ed3ed4ee   fd5Zd1ed4eeeef      fd6Zy7)9NLTKWordTokenizeraE  
    The NLTK tokenizer that has improved upon the TreebankWordTokenizer.

    This is the method that is invoked by ``word_tokenize()``.  It assumes that the
    text has already been segmented into sentences, e.g. using ``sent_tokenize()``.

    The tokenizer is "destructive" such that the regexes applied will munge the
    input string to a state beyond re-construction. It is possible to apply
    `TreebankWordDetokenizer.detokenize` to the tokenized outputs of
    `NLTKDestructiveWordTokenizer.tokenize` but there's no guarantees to
    revert to the original string.
    u   ([«“‘„]|[`]+)z \1 z^\"``z(``)z([ \(\[{<])(\"|\'{2})z\1 `` z$(?i)(\')(?!re|ve|ll|m|t|s|d|n)(\w)\bz\1 \2u   ([»”’])''z '' "z\s+ z([^' ])('[sS]|'[mM]|'[dD]|') z\1 \2 z)([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) u&   ([^\.])(\.)([\]\)}>"\'»”’ ]*)\s*$z	\1 \2 \3 z([:,])([^\d])z \1 \2z([:,])$z\.{2,}z \g<0> z[;@#$%&]z([^\.])(\.)([\]\)}>"\']*)\s*$z\1 \2\3 z[?!]z([^'])' z\1 ' z[*]z[\]\[\(\)\{\}\<\>]z\(z-LRB-z\)z-RRB-z\[z-LSB-z\]z-RSB-z\{z-LCB-z\}z-RCB-z--z -- textconvert_parentheses
return_strreturnc                    |rt        j                  dt        d       | j                  D ]  \  }}|j	                  ||      } | j
                  D ]  \  }}|j	                  ||      } | j                  \  }}|j	                  ||      }|r&| j                  D ]  \  }}|j	                  ||      } | j                  \  }}|j	                  ||      }d|z   dz   }| j                  D ]  \  }}|j	                  ||      } | j                  D ]  }|j	                  d|      } | j                  D ]  }|j	                  d|      } |j                         S )a  Return a tokenized copy of `text`.

        >>> from nltk.tokenize import NLTKWordTokenizer
        >>> s = '''Good muffins cost $3.88 (roughly 3,36 euros)\nin New York.  Please buy me\ntwo of them.\nThanks.'''
        >>> NLTKWordTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
        ['Good', 'muffins', 'cost', '$', '3.88', '(', 'roughly', '3,36',
        'euros', ')', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
        'of', 'them.', 'Thanks', '.']
        >>> NLTKWordTokenizer().tokenize(s, convert_parentheses=True) # doctest: +NORMALIZE_WHITESPACE
        ['Good', 'muffins', 'cost', '$', '3.88', '-LRB-', 'roughly', '3,36',
        'euros', '-RRB-', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
        'of', 'them.', 'Thanks', '.']


        :param text: A string with a sentence or sentences.
        :type text: str
        :param convert_parentheses: if True, replace parentheses to PTB symbols,
            e.g. `(` to `-LRB-`. Defaults to False.
        :type convert_parentheses: bool, optional
        :param return_str: If True, return tokens as space-separated string,
            defaults to False.
        :type return_str: bool, optional
        :return: List of tokens from `text`.
        :rtype: List[str]
        zHParameter 'return_str' has been deprecated and should no longer be used.   )category
stacklevelr   z \1 \2 )warningswarnDeprecationWarningSTARTING_QUOTESsubPUNCTUATIONPARENS_BRACKETSCONVERT_PARENTHESESDOUBLE_DASHESENDING_QUOTESr   r   split)selfr   r   r   regexpsubstitutions         r   tokenizezNLTKWordTokenizer.tokenizey   s^   8 MM"+	 %)$8$8 FL::lD1D %9 %)$4$4 FL::lD1D %5  $33zz,-(,(@(@$zz,5 )A  $11zz,- TzC$($6$6 FL::lD1D %7 ((F::j$/D )((F::j$/D ) zz|r   c              #   .  K   | j                  |      }d|v sd|v rVt        j                  d|      D cg c]  }|j                          }}|D cg c]  }|dv r|j	                  d      n| }}n|}t        ||      E d{    yc c}w c c}w 7 w)a}  
        Returns the spans of the tokens in ``text``.
        Uses the post-hoc nltk.tokens.align_tokens to return the offset spans.

            >>> from nltk.tokenize import NLTKWordTokenizer
            >>> s = '''Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks).'''
            >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
            ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
            ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
            ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
            >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected
            True
            >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
            ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
            ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
            >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected
            True

        :param text: A string with a sentence or sentences.
        :type text: str
        :yield: Tuple[int, int]
        r   r   z
``|'{2}|\")r   r   r   r   N)r0   refinditergrouppopr   )r-   r   
raw_tokensmmatchedtoktokenss          r   span_tokenizezNLTKWordTokenizer.span_tokenize   s     . ]]4(
 4KTT\*,++mT*JK*JQqwwy*JGK
 &%C #&):":AC%  
  F--- L 	.s(   2BB	BB/BBBN)FF)r
   r   r   r   r2   compileUr%   r+   r'   r(   r)   r*   r	   _contractionslistmapr   r   strboolr   r0   r   r   intr;   r   r   r   r   r   %   s    
*BDD	17;	F	U#	G	g&	,	-y9	;RTT	BHMO 
NBDD	)73	E	F#	D	6"	F	S!	4	5yA	@	A9MM( 
Dbdd	K\Z	$	%y1	J	)BJJy"$$'	
 
K	 *-BJJ78	
 
G	j)	K	 (+BJJvrtt$	
K, "rzz"78*EO 
E	G$	E	G$	E	G$	E	G$	E	G$	E	G$  RZZ&0M *+MRZZ)D)DEFMRZZ)D)DEFM PUFF.2FHLF	cFP).# ).(5c?*C ).r   r   )r2   r"   typingr   r   r   nltk.tokenize.apir   nltk.tokenize.utilr   r	   r   r   r   r   <module>rG      s3    
  ( ( ( +J J&E.
 E.r   