
"""
This is an NLTK port of the tokenizer used in the NIST BLEU evaluation script,
https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v14.pl#L926
which was also ported into Python in
https://github.com/lium-lst/nmtpy/blob/master/nmtpy/metrics/mtevalbleu.py#L162
    N)perluniprops)
TokenizerI)xml_unescapec            	       z   e Zd ZdZ ej
                  d      dfZ ej
                  d      dfZ ej
                  d      dfZ ej
                  d      d	fZ	 ej
                  d
      dfZ
 ej
                  d      d	fZee	e
egZ edj                   e ej"                  d                        Z edj                   e ej"                  d                        Z edj                   e ej"                  d                        Z ej*                  dde      Z ej*                  dde      Z ej*                  dde      Z ej
                  d      dfZ ej
                  de de d      d	fZ ej
                  de de d      dfZ ej
                  de d      dfZeeeegZd ZddZ	 ddZ y)NISTTokenizeruT  
    This NIST tokenizer is sentence-based instead of the original
    paragraph-based tokenization from mteval-v14.pl; the sentence-based
    tokenization is consistent with the other tokenizers available in NLTK.

    >>> from nltk.tokenize.nist import NISTTokenizer
    >>> nist = NISTTokenizer()
    >>> s = "Good muffins cost $3.88 in New York."
    >>> expected_lower = [u'good', u'muffins', u'cost', u'$', u'3.88', u'in', u'new', u'york', u'.']
    >>> expected_cased = [u'Good', u'muffins', u'cost', u'$', u'3.88', u'in', u'New', u'York', u'.']
    >>> nist.tokenize(s, lowercase=False) == expected_cased
    True
    >>> nist.tokenize(s, lowercase=True) == expected_lower  # Lowercased.
    True

    international_tokenize() is the preferred method when tokenizing
    non-European text, e.g.

    >>> from nltk.tokenize.nist import NISTTokenizer
    >>> nist = NISTTokenizer()

    # Input strings.
    >>> albb = u'Alibaba Group Holding Limited (Chinese: 阿里巴巴集团控股 有限公司) is a Chinese e-commerce company...'
    >>> amz = u'Amazon.com, Inc. (/ˈæməzɒn/) is an American electronic commerce...'
    >>> rkt = u'Rakuten, Inc. (楽天株式会社 Rakuten Kabushiki-gaisha) is a Japanese electronic commerce and Internet company based in Tokyo.'

    # Expected tokens.
    >>> expected_albb = [u'Alibaba', u'Group', u'Holding', u'Limited', u'(', u'Chinese', u':', u'阿里巴巴集团控股', u'有限公司', u')']
    >>> expected_amz = [u'Amazon', u'.', u'com', u',', u'Inc', u'.', u'(', u'/', u'ˈæ', u'm']
    >>> expected_rkt = [u'Rakuten', u',', u'Inc', u'.', u'(', u'楽天株式会社', u'Rakuten', u'Kabushiki', u'-', u'gaisha']

    >>> nist.international_tokenize(albb)[:10] == expected_albb
    True
    >>> nist.international_tokenize(amz)[:10] == expected_amz
    True
    >>> nist.international_tokenize(rkt)[:10] == expected_rkt
    True

    # Doctest for patching issue #1926
    >>> sent = u'this is a foo☄sentence.'
    >>> expected_sent = [u'this', u'is', u'a', u'foo', u'☄', u'sentence', u'.']
    >>> nist.international_tokenize(sent) == expected_sent
    True
    """

    # Strip "skipped" tags.
    STRIP_SKIP = re.compile("<skipped>"), ""
    # Strip end-of-line hyphenation and join lines.
    STRIP_EOL_HYPHEN = re.compile("\u2028"), " "
    # Tokenize punctuation.
    PUNCT = re.compile(r"([\{-\~\[-\` -\&\(-\+\:-\@\/])"), r" \1 "
    # Tokenize period and comma unless preceded by a digit.
    PERIOD_COMMA_PRECEED = re.compile(r"([^0-9])([\.,])"), r"\1 \2 "
    # Tokenize period and comma unless followed by a digit.
    PERIOD_COMMA_FOLLOW = re.compile(r"([\.,])([^0-9])"), r" \1 \2"
    # Tokenize dash when preceded by a digit.
    DASH_PRECEED_DIGIT = re.compile(r"([0-9])(-)"), r"\1 \2 "

    LANG_DEPENDENT_REGEXES = [
        PUNCT,
        PERIOD_COMMA_PRECEED,
        PERIOD_COMMA_FOLLOW,
        DASH_PRECEED_DIGIT,
    ]

    # Perluniprops character classes used by the NIST tokenizer.
    pup_number = str("".join(set(perluniprops.chars("Number"))))  # i.e. \p{N}
    pup_punct = str("".join(set(perluniprops.chars("Punctuation"))))  # i.e. \p{P}
    pup_symbol = str("".join(set(perluniprops.chars("Symbol"))))  # i.e. \p{S}

    # Python regexes need to escape some special symbols inside a character
    # class, i.e. ']', '^', '\' and '-'.
    number_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_number)
    punct_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_punct)
    symbol_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_symbol)

    # Pads non-ASCII strings with space.
    NONASCII = re.compile("([\x00-\x7f]+)"), r" \1 "
    # Tokenize any punctuation unless followed AND preceded by a digit.
    PUNCT_1 = re.compile(f"([{number_regex}])([{punct_regex}])"), r"\1 \2 "
    PUNCT_2 = re.compile(f"([{punct_regex}])([{number_regex}])"), r" \1 \2"
    # Tokenize symbols.
    SYMBOLS = re.compile(f"([{symbol_regex}])"), r" \1 "

    INTERNATIONAL_REGEXES = [NONASCII, PUNCT_1, PUNCT_2, SYMBOLS]
STRIP_SKIPsubr   STRIP_EOL_HYPHEN)selftextregexpsubstitutions       G/var/www/openai/venv/lib/python3.12/site-packages/nltk/tokenize/nist.pylang_independent_subz"NISTTokenizer.lang_independent_sub   sQ    
  $zz,-D!#44zz,-    c                 V   t        |      }| j                  |      }|r@d|z   dz   }|r|j                         }| j                  D ]  \  }}|j	                  ||      } dj                  |j                               }t        |j                               }|r|S |j                         S Nr	   )strr   lowerLANG_DEPENDENT_REGEXESr   joinsplitstrip)r   r   	lowercasewestern_lang
return_strr   r   s          r   tokenizezNISTTokenizer.tokenize   s    4y((.:#Dzz|(,(C(C$zz,5 )D xx

% 4::< !t3tzz|3r   c                    t        |      }| j                  \  }}|j                  ||      }| j                  \  }}|j                  ||      }t	        |      }|r|j                         }| j                  D ]  \  }}|j                  ||      } dj                  |j                         j                               }|r|S |j                         S r   )
r   r   r   r   r   r   INTERNATIONAL_REGEXESr   r   r   )r   r   r    split_non_asciir"   r   r   s          r   international_tokenizez$NISTTokenizer.international_tokenize   s     4y  $zz,-#44zz,-D!::<D$($>$> FL::lD1D %?
 xx

**,-!t3tzz|3r   N)FTF)!__name__