"""
Penn Treebank Tokenizer

The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.
This implementation is a port of the tokenizer sed script written by Robert McIntyre
and available at http://www.cis.upenn.edu/~treebank/tokenizer.sed.
    N)IteratorListTuple)
TokenizerI)MacIntyreContractions)align_tokensc            
       D   e Zd ZdZ ej
                  d      df ej
                  d      df ej
                  d      dfgZ ej
                  d      d	f ej
                  d
      df ej
                  d      df ej
                  d      df ej
                  d      df ej
                  d      df ej
                  d      dfgZ ej
                  d      dfZ ej
                  d      df ej
                  d      df ej
                  d      df ej
                  d      df ej
                  d      df ej
                  d      d fgZ	 ej
                  d!      d"fZ
 ej
                  d#      d$f ej
                  d%      d$f ej
                  d&      d'f ej
                  d(      d'fgZ e       Z e eej
                  ej                               Z e eej
                  ej"                              Z	 d0d)ed*ed+ed,ee   fd-Zd)ed,eeeef      fd.Zy/)1TreebankWordTokenizera	  
    The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.

    This tokenizer performs the following steps:

    - split standard contractions, e.g. ``don't`` -> ``do n't`` and ``they'll`` -> ``they 'll``
    - treat most punctuation characters as separate tokens
    - split off commas and single quotes, when followed by whitespace
    - separate periods that appear at the end of line

    >>> from nltk.tokenize import TreebankWordTokenizer
    >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks.'''
    >>> TreebankWordTokenizer().tokenize(s)
    ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks', '.']
    >>> s = "They'll save and invest more."
    >>> TreebankWordTokenizer().tokenize(s)
    ['They', "'ll", 'save', 'and', 'invest', 'more', '.']
    >>> s = "hi, my name can't hello,"
    >>> TreebankWordTokenizer().tokenize(s)
    ['hi', ',', 'my', 'name', 'ca', "n't", 'hello', ',']
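
    Parentheses and brackets are treated as separate tokens as well:

    >>> TreebankWordTokenizer().tokenize("a (b) c")
    ['a', '(', 'b', ')', 'c']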
    z^\"``z(``)z \1 z([ \(\[{<])(\"|\'{2})z\1 `` z([:,])([^\d])z \1 \2z([:,])$z\.\.\.z ... z[;@#$%&]z \g<0> z([^\.])(\.)([\]\)}>"\']*)\s*$z\1 \2\3 z[?!]z([^'])' z\1 ' z[\]\[\(\)\{\}\<\>]z\(-LRB-z\)-RRB-z\[-LSB-z\]-RSB-z\{-LCB-z\}-RCB--- -- ''z '' "z([^' ])('[sS]|'[mM]|'[dD]|') z\1 \2 z)([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) textconvert_parentheses
return_strreturnc                    |durt        j                  dt        d       | j                  D ]  \  }}|j	                  ||      } | j
                  D ]  \  }}|j	                  ||      } | j                  \  }}|j	                  ||      }|r&| j                  D ]  \  }}|j	                  ||      } | j                  \  }}|j	                  ||      }d|z   dz   }| j                  D ]  \  }}|j	                  ||      } | j                  D ]  }|j	                  d|      } | j                  D ]  }|j	                  d|      } |j                         S )a  Return a tokenized copy of `text`.

        >>> from nltk.tokenize import TreebankWordTokenizer
        >>> s = '''Good muffins cost $3.88 (roughly 3,36 euros)\nin New York.  Please buy me\ntwo of them.\nThanks.'''
        >>> TreebankWordTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
        ['Good', 'muffins', 'cost', '$', '3.88', '(', 'roughly', '3,36',
        'euros', ')', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
        'of', 'them.', 'Thanks', '.']
        >>> TreebankWordTokenizer().tokenize(s, convert_parentheses=True) # doctest: +NORMALIZE_WHITESPACE
        ['Good', 'muffins', 'cost', '$', '3.88', '-LRB-', 'roughly', '3,36',
        'euros', '-RRB-', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
        'of', 'them.', 'Thanks', '.']

        :param text: A string with a sentence or sentences.
        :type text: str
        :param convert_parentheses: if True, convert parentheses to PTB symbols,
            e.g. `(` to `-LRB-`. Defaults to False.
        :type convert_parentheses: bool, optional
        :param return_str: If True, return tokens as space-separated string,
            defaults to False.
        :type return_str: bool, optional
        :return: List of tokens from `text`.
        :rtype: List[str]
        """
        if return_str is not False:
            warnings.warn(
                "Parameter 'return_str' has been deprecated and should no longer be used.",
                category=DeprecationWarning,
                stacklevel=2,
            )

        for regexp, substitution in self.STARTING_QUOTES:
            text = regexp.sub(substitution, text)

        for regexp, substitution in self.PUNCTUATION:
            text = regexp.sub(substitution, text)

        # Handles parentheses and brackets.
        regexp, substitution = self.PARENS_BRACKETS
        text = regexp.sub(substitution, text)

        # Optionally convert parentheses and brackets to PTB symbols.
        if convert_parentheses:
            for regexp, substitution in self.CONVERT_PARENTHESES:
                text = regexp.sub(substitution, text)

        # Handles double dashes.
        regexp, substitution = self.DOUBLE_DASHES
        text = regexp.sub(substitution, text)

        # Add extra space to make things easier.
        text = " " + text + " "

        for regexp, substitution in self.ENDING_QUOTES:
            text = regexp.sub(substitution, text)

        for regexp in self.CONTRACTIONS2:
            text = regexp.sub(r" \1 \2 ", text)
        for regexp in self.CONTRACTIONS3:
            text = regexp.sub(r" \1 \2 ", text)

        return text.split()

    def span_tokenize(self, text: str) -> Iterator[Tuple[int, int]]:
        r"""
        Returns the spans of the tokens in ``text``.
        Uses the post-hoc ``nltk.tokenize.util.align_tokens`` to return the offset spans.

            >>> from nltk.tokenize import TreebankWordTokenizer
            >>> s = '''Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks).'''
            >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
            ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
            ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
            ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
            >>> list(TreebankWordTokenizer().span_tokenize(s)) == expected
            True
            >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
            ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
            ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
            >>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
            True

        :param text: A string with a sentence or sentences.
        :type text: str
        :yield: Tuple[int, int]
        """
        raw_tokens = self.tokenize(text)

        # Convert converted quotes back to the original double quotes.
        # Do this only if the original text contains double quotes or double
        # single-quotes (because '' might be transformed to `` when it is
        # treated as starting quotes).
        if ('"' in text) or ("''" in text):
            # Find the double quotes and converted quotes.
            matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)]

            # Replace converted quotes back to double quotes.
            tokens = [
                matched.pop(0) if tok in ['"', "``", "''"] else tok
                for tok in raw_tokens
            ]
        else:
            tokens = raw_tokens

        yield from align_tokens(tokens, text)


class TreebankWordDetokenizer(TokenizerI):
    r"""
    The Treebank detokenizer uses the reverse regex operations corresponding to
    the Treebank tokenizer's regexes.

    Note:

    - Additional assumptions are made when undoing the padding of ``[;@#$%&]``
      punctuation symbols that are not presupposed by the TreebankWordTokenizer.
    - Additional regexes are added to reverse the parentheses tokenization,
      such as ``r'([\]\)\}\>])\s([:;,.])'``, which removes the extra right
      padding added to closing parentheses preceding ``[:;,.]``.
    - It is not possible to restore the original whitespace exactly, because
      there is no explicit record of where ``'\n'``, ``'\t'`` or ``'\s'`` were
      removed by the ``text.split()`` operation.

    >>> from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer
    >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks.'''
    >>> d = TreebankWordDetokenizer()
    >>> t = TreebankWordTokenizer()
    >>> toks = t.tokenize(s)
    >>> d.detokenize(toks)
    'Good muffins cost $3.88 in New York. Please buy me two of them. Thanks.'

    The MXPOST parentheses substitution can be undone using the ``convert_parentheses``
    parameter:

    >>> s = '''Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks).'''
    >>> expected_tokens = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
    ... 'New', '-LRB-', 'York', '-RRB-', '.', 'Please', '-LRB-', 'buy',
    ... '-RRB-', 'me', 'two', 'of', 'them.', '-LRB-', 'Thanks', '-RRB-', '.']
    >>> expected_tokens == t.tokenize(s, convert_parentheses=True)
    True
    >>> expected_detoken = 'Good muffins cost $3.88 in New (York). Please (buy) me two of them. (Thanks).'
    >>> expected_detoken == d.detokenize(t.tokenize(s, convert_parentheses=True), convert_parentheses=True)
    True

    During tokenization it is safe to add extra spaces, but during
    detokenization simply undoing the padding is not enough:

    - During tokenization, ``[!?]`` is padded on the left and right; when
      detokenizing, only a left shift of the ``[!?]`` is needed.
      Thus ``(re.compile(r'\s([?!])'), r'\g<1>')``.

    - During tokenization ``[:,]`` is padded on the left and right, but when
      detokenizing only a left shift is necessary, and the right pad after a
      comma/colon is kept if the following string is a non-digit.
      Thus ``(re.compile(r'\s([:,])\s([^\d])'), r'\1 \2')``.

    >>> from nltk.tokenize.treebank import TreebankWordDetokenizer
    >>> toks = ['hello', ',', 'i', 'ca', "n't", 'feel', 'my', 'feet', '!', 'Help', '!', '!']
    >>> twd = TreebankWordDetokenizer()
    >>> twd.detokenize(toks)
    "hello, i can't feel my feet! Help!!"

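    ``detokenize`` duck-types the abstract ``tokenize`` method, so the two
    calls are interchangeable:

    >>> twd.detokenize(toks) == twd.tokenize(toks)
    True
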
    >>> toks = ['hello', ',', 'i', "can't", 'feel', ';', 'my', 'feet', '!',
    ... 'Help', '!', '!', 'He', 'said', ':', 'Help', ',', 'help', '?', '!']
    >>> twd.detokenize(toks)
    "hello, i can't feel; my feet! Help!! He said: Help, help?!"
    z(?#X)z\sz+([^' ])\s('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) z\1\2 z([^' ])\s('[sS]|'[mM]|'[dD]|') z(\S)\s(\'\')\1\2z(\'\')\s([.,:)\]>};%])r   r   r   r   r   (r   )r   [r   ]r   {r   }z([\[\(\{\<])\sz\g<1>z\s([\]\)\}\>])z([\]\)\}\>])\s([:;,.])z([^'])\s'\sz\1' z\s([?!])z([^\.])\s(\.)([\]\)}>"\']*)\s*$z\1\2\3z([#$])\sz\s([;%])z
\s\.\.\.\sz...z\s([:,])z\1z([ (\[{<])\s``z\1``z(``)\sr   r;   r   r   c                    dj                  |      }d|z   dz   }| j                  D ]  }|j                  d|      } | j                  D ]  }|j                  d|      } | j                  D ]  \  }}|j                  ||      } |j                         }| j                  \  }}|j                  ||      }|r&| j                  D ]  \  }}|j                  ||      } | j                  D ]  \  }}|j                  ||      } | j                  D ]  \  }}|j                  ||      } | j                  D ]  \  }}|j                  ||      } |j                         S )a  
        Treebank detokenizer, created by undoing the regexes from
        the TreebankWordTokenizer.tokenize.

        :param tokens: A list of strings, i.e. tokenized text.
        :type tokens: List[str]
        :param convert_parentheses: if True, replace PTB symbols with parentheses,
            e.g. `-LRB-` to `(`. Defaults to False.
        :type convert_parentheses: bool, optional
        :return: str
        """
        text = " ".join(tokens)

        # Add extra space to make things easier.
        text = " " + text + " "

        # Reverse the contraction regexes.
        for regexp in self.CONTRACTIONS3:
            text = regexp.sub(r"\1\2", text)
        for regexp in self.CONTRACTIONS2:
            text = regexp.sub(r"\1\2", text)

        # Reverse the regexes applied for ending quotes.
        for regexp, substitution in self.ENDING_QUOTES:
            text = regexp.sub(substitution, text)

        # Undo the space padding.
        text = text.strip()

        # Reverse the padding on double dashes.
        regexp, substitution = self.DOUBLE_DASHES
        text = regexp.sub(substitution, text)

        if convert_parentheses:
            for regexp, substitution in self.CONVERT_PARENTHESES:
                text = regexp.sub(substitution, text)

        # Reverse the padding regexes applied for parentheses and brackets.
        for regexp, substitution in self.PARENS_BRACKETS:
            text = regexp.sub(substitution, text)

        # Reverse the regexes applied for punctuation.
        for regexp, substitution in self.PUNCTUATION:
            text = regexp.sub(substitution, text)

        # Reverse the regexes applied for starting quotes.
        for regexp, substitution in self.STARTING_QUOTES:
            text = regexp.sub(substitution, text)

        return text.strip()

    def detokenize(self, tokens: List[str], convert_parentheses: bool = False) -> str:
        """Duck-typing the abstract *tokenize()*."""
        return self.tokenize(tokens, convert_parentheses)