
    g[                     2    d Z ddlZddlmZ  G d de      Zy)a  
The tok-tok tokenizer is a simple, general tokenizer, where the input has one
sentence per line; thus only final period is tokenized.

Tok-tok has been tested on, and gives reasonably good results for English,
Persian, Russian, Czech, French, German, Vietnamese, Tajik, and a few others.
The input should be in UTF-8 encoding.

Reference:
Jon Dehdari. 2014. A Neurophysiologically-Inspired Statistical Language
Model (Doctoral dissertation). Columbus, OH, USA: The Ohio State University.
    N)
TokenizerIc                      e Zd ZdZ ej
                  d      dfZ ej
                  d      dfZ ej
                  d      dfZ ej
                  d      dfZ	 ej
                  d      d	fZ
 ej
                  d
      dfZ ej
                  d      dfZ ej
                  d      dfZ ej
                  d      dfZ ej
                  d      dfZ ej
                  d      dfZ ej
                  d      dfZ ej
                  d      dfZ ej
                  d      dfZ ej
                  d      dfZ ej
                  d      dfZ ed      Z ed      Z ed      Z ej
                  de d      d fZ ej
                  de d      d fZ ej
                  de d      d fZ ej
                  d!      d"fZ ej
                  d#      d$fZ ej
                  d%      d&fZ ej
                  d'      d&fZ  ej
                  d(      d)fZ! ej
                  d*      d+fZ" ej
                  d,      dfZ#eeeeeee e
eeeeeeeeeee	eeeee#gZ$d/d-Z%y.)0ToktokTokenizeru  
    This is a Python port of the tok-tok.pl from
    https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl

    >>> toktok = ToktokTokenizer()
    >>> text = u'Is 9.5 or 525,600 my favorite number?'
    >>> print(toktok.tokenize(text, return_str=True))
    Is 9.5 or 525,600 my favorite number ?
    >>> text = u'The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things'
    >>> print(toktok.tokenize(text, return_str=True))
    The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things
    >>> text = u'¡This, is a sentence with weird» symbols… appearing everywhere¿'
    >>> expected = u'¡ This , is a sentence with weird » symbols … appearing everywhere ¿'
    >>> assert toktok.tokenize(text, return_str=True) == expected
    >>> toktok.tokenize(text) == [u'¡', u'This', u',', u'is', u'a', u'sentence', u'with', u'weird', u'»', u'symbols', u'…', u'appearing', u'everywhere', u'¿']
    True
         u1   ([،;؛¿!"\])}»›”؟¡%٪°±©®।॥…])z \1 u   ([({\[“‘„‚«‹「『])u
   ([–—])z& z&amp; 	z &#9; z\|z &#124; u   (?<!,)([,،])(?![,\d])u	   (['’`])z ` ` z `` z ' ' z '' z
(?<!\.)\.$z .u    (?<!\.)\.\s*(["'’»›”]) *$z . \1z(,{2,})z(-{2,})z(\.{2,})u   ([{༺༼᚛‚„⁅⁽₍〈❨❪❬❮❰❲❴⟅⟦⟨⟪⟬⟮⦃⦅⦇⦉⦋⦍⦏⦑⦓⦕⦗⧘⧚⧼⸢⸤⸦⸨〈《「『【〔〖〘〚〝﴾︗︵︷︹︻︽︿﹁﹃﹇﹙﹛﹝（［｛｟｢u   )]}༻༽᚜⁆⁾₎〉❩❫❭❯❱❳❵⟆⟧⟩⟫⟭⟯⦄⦆⦈⦊⦌⦎⦐⦒⦔⦖⦘⧙⧛⧽⸣⸥⸧⸩〉》」』】〕〗〙〛〞〟﴿︘︶︸︺︼︾﹀﹂﹄﹈﹚﹜﹞）］｝｠｣u   $¢£¤¥֏؋৲৳৻૱௹฿៛₠₡₢₣₤₥₦₧₨₩₪₫€₭₮₯₰₱₲₳₴₵₶₷₸₹₺꠸﷼﹩＄￠￡￥￦z([z])z\1 z:(?!//)z : z\?(?!\S)z ? z(:\/\/)[\S+\.\S+\/\S+][\/]z / z /z^ + z\s+$
z {2,}c                     t        |      }| j                  D ]  \  }}|j                  ||      } t        |j                               }|r|S |j	                         S )N)strTOKTOK_REGEXESsubstripsplit)selftext
return_strregexpsubstitutions        I/var/www/openai/venv/lib/python3.12/site-packages/nltk/tokenize/toktok.pytokenizezToktokTokenizer.tokenize   sU    4y$($7$7 FL::lD1D %8 4::< !t3tzz|3    N)F)&__name__
__module____qualname____doc__recompileNON_BREAKINGFUNKY_PUNCT_1FUNKY_PUNCT_2EN_EM_DASHES	AMPERCENTTABPIPECOMMA_IN_NUMPROB_SINGLE_QUOTESSTUPID_QUOTES_1STUPID_QUOTES_2FINAL_PERIOD_1FINAL_PERIOD_2MULTI_COMMASMULTI_DASHES
MULTI_DOTSr   
OPEN_PUNCTCLOSE_PUNCTCURRENCY_SYMOPEN_PUNCT_RECLOSE_PUNCT_RECURRENCY_SYM_RE	URL_FOE_1	URL_FOE_2	URL_FOE_3	URL_FOE_4LSTRIPRSTRIP	ONE_SPACEr   r    r   r   r   r      s   & 2::h',L BJJSTV]]MBJJABGKM2::l+W4L 

4 (*I
"**T
H
$C2::ej(D 2::78'AL $L17: bjj*G3O bjj*G3O  RZZ.5N  RZZ GH(RN 2::j)72L2::j)72LK('1J 	/
J 	)
K 	5L BJJJ<r23V;MRZZ"[M 45v=N bjj2l^2!67?O 

:&.I

;'/I

895@I

5!6)I RZZ#FRZZ $&F

8$c)I 	1N64r   r   )r   r   nltk.tokenize.apir   r   r<   r   r   <module>r>      s     
 (W4j W4r   