
    g                     z    d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZm	Z	m
Z
 d dlmZ d dlmZ dZ G d de      Zy)	    N)PIPE)_java_optionsconfig_javafind_jarjava)CoreNLPParser)
TokenizerIz1https://nlp.stanford.edu/software/tokenizer.shtmlc                   D    e Zd ZdZdZ	 	 	 	 	 ddZed        Zd Zd	dZ	y)
StanfordTokenizeraF  
    Interface to the Stanford Tokenizer

    >>> from nltk.tokenize.stanford import StanfordTokenizer
    >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks."
    >>> StanfordTokenizer().tokenize(s) # doctest: +SKIP
    ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
    >>> s = "The colour of the wall is blue."
    >>> StanfordTokenizer(options={"americanize": True}).tokenize(s) # doctest: +SKIP
    ['The', 'color', 'of', 'the', 'wall', 'is', 'blue', '.']
    zstanford-postagger.jarNc                    t        j                  t        d      t        d       t	        | j
                  |ddt        |      | _        || _        || _	        |i n|}dj                  d |j                         D              | _        y )	Nzz
The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.parse.corenlp.CoreNLPParser[0m instead.'   )
stacklevel)STANFORD_POSTAGGER )env_vars
searchpathurlverbose,c              3   0   K   | ]  \  }}| d |   yw)=Nr   ).0keyvals      K/var/www/openai/venv/lib/python3.12/site-packages/nltk/tokenize/stanford.py	<genexpr>z-StanfordTokenizer.__init__.<locals>.<genexpr>E   s     $TOSuAcU^Os   )warningswarnstrDeprecationWarningr   _JAR_stanford_url_stanford_jar	_encodingjava_optionsjoinitems_options_cmd)selfpath_to_jarencodingoptionsr   r%   s         r   __init__zStanfordTokenizer.__init__%   s     	W
 	
 &II,
 "("WHH$TGMMO$TT    c                 "    | j                         S )N)
splitlines)ss    r   _parse_tokenized_outputz)StanfordTokenizer._parse_tokenized_outputG   s    ||~r.   c                 J    dg}| j                  | j                  ||            S )zW
        Use stanford tokenizer's PTBTokenizer to tokenize multiple sentences.
        z%edu.stanford.nlp.process.PTBTokenizer)r2   _execute)r)   r1   cmds      r   tokenizezStanfordTokenizer.tokenizeK   s(     77++DMM#q,ABBr.   c                    | j                   }|j                  d|g       | j                  }|r|j                  d| j                  g       dj                  t              }t        | j                  |       t        j                  dd      5 }t        |t              r|r|j                  |      }|j                  |       |j                          |j                  |j                         t!        || j"                  t$        t$              \  }}	|j'                  |      }d d d        t)        j*                  j                         t        |d       S # 1 sw Y   7xY w)	Nz-charsetz-options )r,   r   wbF)modedelete)	classpathstdoutstderr)r$   extendr(   r&   r   r   r%   tempfileNamedTemporaryFile
isinstancer   encodewriteflushappendnamer   r#   r   decodeosunlink)
r)   r5   input_r   r+   r(   default_options
input_filer=   r>   s
             r   r4   zStanfordTokenizer._executeR   s   >>

J)*((JJ
D$5$567((=1 	D--w? ((d5AZ&#&8x0V$JJz' "t11$tNFF ]]8,F B 			*//" 	OU;) BAs   BEE")Nutf8NFz-mx1000m)F)
__name__
__module____qualname____doc__r!   r-   staticmethodr2   r6   r4   r   r.   r   r   r      sE    
 $D  UD  C!r.   r   )jsonrI   r@   r   
subprocessr   nltk.internalsr   r   r   r   nltk.parse.corenlpr   nltk.tokenize.apir	   r"   r   r   r.   r   <module>rY      s5     	    E E , (C]
 ]r.   