
    g]%                     v    d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZm	Z	m
Z
mZmZ d dlmZ dZ G d de      Zy)    N)PIPE)_java_optionsconfig_javafind_dir	find_filefind_jarjava)
TokenizerIz!https://nlp.stanford.edu/softwarec                   f     e Zd ZdZdZ	 	 	 	 	 	 	 	 	 	 	 	 d
dZd Z fdZd Zd Z	d Z
dd	Z xZS )StanfordSegmenteru[  Interface to the Stanford Segmenter

    If stanford-segmenter version is older than 2016-10-31, then path_to_slf4j
    should be provieded, for example::

        seg = StanfordSegmenter(path_to_slf4j='/YOUR_PATH/slf4j-api.jar')

    >>> from nltk.tokenize.stanford_segmenter import StanfordSegmenter
    >>> seg = StanfordSegmenter() # doctest: +SKIP
    >>> seg.default_config('zh') # doctest: +SKIP
    >>> sent = u'这是斯坦福中文分词器测试'
    >>> print(seg.segment(sent)) # doctest: +SKIP
    这 是 斯坦福 中文 分词器 测试
    <BLANKLINE>
    >>> seg.default_config('ar') # doctest: +SKIP
    >>> sent = u'هذا هو تصنيف ستانفورد العربي للكلمات'
    >>> print(seg.segment(sent.split())) # doctest: +SKIP
    هذا هو تصنيف ستانفورد العربي ل الكلمات
    <BLANKLINE>
    zstanford-segmenter.jarc                 T   t        j                  dt               t        j                  t	        d      t        d       t        j                  dt               t        | j                  |ddt        |      }|t        d	|d
dt        |      }nd }t        j                  j                  d ||fD              | _        || _        || _        || _        || _        || _        || _        |	| _        || _        |
i n|
}
dj                  d |
j)                         D              | _        y )Nalwaysz}
The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.parse.corenlp.CoreNLPTokenizer[0m instead.'   )
stacklevelignoreSTANFORD_SEGMENTER )env_vars
searchpathurlverbosezslf4j-api.jar)SLF4Jr   c              3   &   K   | ]	  }||  y wNr   ).0_s     U/var/www/openai/venv/lib/python3.12/site-packages/nltk/tokenize/stanford_segmenter.py	<genexpr>z-StanfordSegmenter.__init__.<locals>.<genexpr>j   s      -
2!amA2s   ,c              3   V   K   | ]!  \  }}| d t        j                  |        # yw)=N)jsondumps)r   keyvals      r   r   z-StanfordSegmenter.__init__.<locals>.<genexpr>x   s,      %
7F83se1TZZ_%&s   '))warningssimplefilterDeprecationWarningwarnstrr   _JAR_stanford_urlospathsepjoin_stanford_jar_java_class_model_sihan_corpora_dict_sihan_post_processing_keep_whitespaces_dict	_encodingjava_optionsitems_options_cmd)selfpath_to_jarpath_to_slf4j
java_classpath_to_modelpath_to_dictpath_to_sihan_corpora_dictsihan_post_processingkeep_whitespacesencodingoptionsr   r9   stanford_segmenterslf4js                  r   __init__zStanfordSegmenter.__init__8   s-     	h(:;Z
 	
 	h(:;%II,
 $8!E E  ZZ__ -
*E2-
 
 &##= &;#!1!
!("WHH %
7>}}%
 
    c                    d}t         j                  j                  d      r>t         j                  j	                  t         j                  j                  d      d      h}d| _        d| _        d| _        |dk(  r
d| _        d}n{|d	k(  rhd
| _        d}d| _        d}	 t        ||t        dd      | _        d}	 t        |t        dd      }t         j                  j	                  ||      | _        nt        d|       	 t        ||t        dd      | _        y# t        $ r}t        d|z        |d}~ww xY w# t        $ r}t        d|z        |d}~ww xY w# t        $ r}t        d|z        |d}~ww xY w)z
        Attempt to initialize Stanford Word Segmenter for the specified language
        using the STANFORD_SEGMENTER and STANFORD_MODELS environment variables
        r   r   dataNfalsearz=edu.stanford.nlp.international.arabic.process.ArabicSegmenterz'arabic-segmenter-atb+bn+arztrain.ser.gzzhz%edu.stanford.nlp.ie.crf.CRFClassifierzpku.gztruezdict-chris6.ser.gzF)STANFORD_MODELS)r   r   r   r   z_Could not find '%s' (tried using env. variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)z./data/r   )r   r   r   zMCould not find '%s' (tried using the STANFORD_SEGMENTER environment variable)zUnsupported language )rQ   r   )r.   environgetpathr0   r7   r4   r5   r2   r   r-   LookupErrorr   r3   )r<   langsearch_pathmodelrA   e	sihan_dirpath_to_sihan_dirs           r   default_configz StanfordSegmenter.default_config|   s    ::>>./77<<

7K(LfUVK 
#' &-#4<O  >ET\FDE*0D'/L& *%!1
 "I$,%!4	%! ,.77<<8I9+U(  5dV<==	#&!BDK3  !P"# 	   !?AJK    	LNST 	sH   D 88D:  E 	D7#D22D7:	EEE	E7#E22E7c                 $    t         |   |       y r   )supertokenize)r<   s	__class__s     r   r_   zStanfordSegmenter.tokenize   s    rJ   c                     | j                   d| j                  d| j                  d|g}| j                  5|j	                  d| j
                  d| j                  d| j                  g       | j                  |      }|S ) -loadClassifier-keepAllWhitespaces	-textFile-serDictionary-sighanCorporaDict-sighanPostProcessing)r2   r3   r6   r4   extendr7   r5   _execute)r<   input_file_pathcmdstdouts       r   segment_filezStanfordSegmenter.segment_file   s     KK!""
 ##/JJ$JJ(,,+//	 s#rJ   c                 &    | j                  |g      S r   )segment_sents)r<   tokenss     r   segmentzStanfordSegmenter.segment   s    !!6(++rJ   c                    | j                   }t        j                  d      \  }| _        t	        j
                  |d      }dj                  d |D              }t        |t              r|r|j                  |      }|j                  |       |j                          | j                  d| j                  d| j                  d| j                  g}| j                  5|j!                  d	| j"                  d
| j                  d| j$                  g       | j'                  |      }t	        j(                  | j                         |S )rc   T)textwb
c              3   >   K   | ]  }d j                  |        yw)rc   N)r0   )r   xs     r   r   z2StanfordSegmenter.segment_sents.<locals>.<genexpr>   s     :	1388A;	s   rd   re   rf   rg   rh   ri   )r8   tempfilemkstemp_input_file_pathr.   fdopenr0   
isinstancer+   encodewritecloser2   r3   r6   r4   rj   r7   r5   rk   unlink)r<   	sentencesrE   	_input_fh_inputrm   rn   s          r   rq   zStanfordSegmenter.segment_sents   s   >>+3+;+;+F(	4( IIi.	:	::fc"x]]8,F KK!""!!
 ##/JJ$JJ(,,+//	 s# 			$''(rJ   c                 v   | j                   }|j                  d|g       | j                  }|r|j                  d| j                  g       dj                  t              }t        | j                  |       t        || j                  t        t              \  }}|j                  |      }t        |d       |S )Nz-inputEncodingz-optionsrc   )rF   r   )	classpathrn   stderrF)r8   rj   r;   r0   r   r   r9   r	   r1   r   decode)r<   rm   r   rE   r;   default_optionsrn   _stderrs           r   rk   zStanfordSegmenter._execute  s    >>

$h/0((JJ
D$5$567((=1 	D--w?4--d4
 x( 	OU;rJ   )NNNNNNrM   rM   zUTF-8NFz-mx2g)F)__name__
__module____qualname____doc__r,   rI   r\   r_   ro   rs   rq   rk   __classcell__)ra   s   @r   r   r       s]    * $D #'% B
HGR6,(TrJ   r   )r#   r.   rz   r'   
subprocessr   nltk.internalsr   r   r   r   r   r	   nltk.tokenize.apir
   r-   r   r   rJ   r   <module>r      s8     	     )3D
 DrJ   