
    gI                         d dl Z d dlZd dlZd dlmZ d dlmZmZmZm	Z	m
Z
 d dlmZ d dlmZ d dlmZ dZ G d d	e      Z G d
 de      Z G d de      Z G d de      Zy)    N)PIPE)_java_optionsconfig_javafind_jar_iterfind_jars_within_pathjava)ParserIDependencyGraph)Treez2https://nlp.stanford.edu/software/lex-parser.shtmlc                   p    e Zd ZdZdZdZdZdZdZ	 	 	 	 	 	 	 ddZ	d Z
dd	Zdd
ZddZddZddZddZy)GenericStanfordParserz Interface to the Stanford Parserz+stanford-parser-(\d+)(\.(\d+))+-models\.jarzstanford-parser\.jarz3edu.stanford.nlp.parser.lexparser.LexicalizedParserFNc                 j   t        t        | j                  |ddt        |d      d       }t        t        | j                  |ddt        |d      d       }	t
        j                  j                  |      d	   }
t        |	gt        |
      z         | _
        || _        || _        || _        || _        y )
N)STANFORD_PARSERSTANFORD_CORENLP T)env_vars
searchpathurlverboseis_regexc                 @    t         j                  j                  |       S Nospathdirname
model_paths    H/var/www/openai/venv/lib/python3.12/site-packages/nltk/parse/stanford.py<lambda>z0GenericStanfordParser.__init__.<locals>.<lambda>;       277??:#>    )key)STANFORD_MODELSr   c                 @    t         j                  j                  |       S r   r   r   s    r    r!   z0GenericStanfordParser.__init__.<locals>.<lambda>H   r"   r#   r   )maxr   _JAR_stanford_url_MODEL_JAR_PATTERNr   r   splittupler   
_classpathr   	_encodingcorenlp_optionsjava_options)selfpath_to_jarpath_to_models_jarr   encodingr   r0   r/   stanford_jar	model_jarstanford_dirs              r    __init__zGenericStanfordParser.__init__&   s     		@! ?
 ''"@! ?
	  ww}}\215.CL.Q QR$!.(r#   c           
         g }g }g }d}|j                  d      D ]  }|dk(  r|r|j                  t        |             g }d})| j                  r4|j                  | j	                  dj                  |                   g }d}i|j                  t        | j	                  dj                  |            g             g }|j                  |       d} t        |      S )NF 
T)
splitlinesappenditer_DOUBLE_SPACED_OUTPUT
_make_treejoin)r1   output_res	cur_lines	cur_treesblanklines          r    _parse_trees_outputz)GenericStanfordParser._parse_trees_outputV   s    		&&u-DrzJJtI/ "I!E//$$T__TYYy5I%JK "I EJJtT__TYYy5I%J$KLM "I  & .  Cyr#   c           
          | j                   d| j                  ddd| j                  dddg
}| j                  | j	                  |dj                  d	 |D              |            S )
a  
        Use StanfordParser to parse multiple sentences. Takes multiple sentences as a
        list where each sentence is a list of words.
        Each sentence will be automatically tagged with this StanfordParser instance's
        tagger.
        If whitespaces exists inside a token, then the token will be treated as
        separate tokens.

        :param sentences: Input sentences to parse
        :type sentences: list(list(str))
        :rtype: iter(iter(Tree))
        -model
-sentencesnewline-outputFormat
-tokenizedz-escaperz-edu.stanford.nlp.process.PTBEscapingProcessorr;   c              3   >   K   | ]  }d j                  |        yw) NrA   ).0sentences     r    	<genexpr>z4GenericStanfordParser.parse_sents.<locals>.<genexpr>   s     L)hsxx1)s   _MAIN_CLASSr   _OUTPUT_FORMATrH   _executerA   r1   	sentencesr   cmds       r    parse_sentsz!GenericStanfordParser.parse_sentsm   sp     OO;
 ''MMTYYL)LLg
 	
r#   c                 :    t        | j                  |g|            S )a&  
        Use StanfordParser to parse a sentence. Takes a sentence as a string;
        before parsing, it will be automatically tokenized and tagged by
        the Stanford Parser.

        :param sentence: Input sentence to parse
        :type sentence: str
        :rtype: iter(Tree)
        )nextraw_parse_sentsr1   rS   r   s      r    	raw_parsezGenericStanfordParser.raw_parse   s     D(((W=>>r#   c                     | j                   d| j                  ddd| j                  g}| j                  | j	                  |dj                  |      |            S )aI  
        Use StanfordParser to parse multiple sentences. Takes multiple sentences as a
        list of strings.
        Each sentence will be automatically tokenized and tagged by the Stanford Parser.

        :param sentences: Input sentences to parse
        :type sentences: list(str)
        :rtype: iter(iter(Tree))
        rJ   rK   rL   rM   r;   rU   rY   s       r    r_   z%GenericStanfordParser.raw_parse_sents   s^     OO
 ''MM#tyy3W=
 	
r#   c                 :    t        | j                  |g|            S )a0  
        Use StanfordParser to parse a sentence. Takes a sentence as a list of
        (word, tag) tuples; the sentence must have already been tokenized and
        tagged.

        :param sentence: Input sentence to parse
        :type sentence: list(tuple(str, str))
        :rtype: iter(Tree)
        )r^   tagged_parse_sentsr`   s      r    tagged_parsez"GenericStanfordParser.tagged_parse   s     D++XJ@AAr#   c                     d| j                   d| j                  ddd| j                  dddd	d
dg}| j                  | j	                  |dj                  fd|D              |            S )ad  
        Use StanfordParser to parse multiple sentences. Takes multiple sentences
        where each sentence is a list of (word, tag) tuples.
        The sentences must have already been tokenized and tagged.

        :param sentences: Input sentences to parse
        :type sentences: list(list(tuple(str, str)))
        :rtype: iter(iter(Tree))
        /rJ   rK   rL   rM   rN   z-tagSeparatorz-tokenizerFactoryz,edu.stanford.nlp.process.WhitespaceTokenizerz-tokenizerMethodnewCoreLabelTokenizerFactoryr;   c              3   R   K   | ]  }d j                  fd|D                 yw)rP   c              3   @   K   | ]  }j                  |        y wr   rQ   )rR   taggedtag_separators     r    rT   zEGenericStanfordParser.tagged_parse_sents.<locals>.<genexpr>.<genexpr>   s     OhF]//7hs   NrQ   )rR   rS   rl   s     r    rT   z;GenericStanfordParser.tagged_parse_sents.<locals>.<genexpr>   s(      $- HHOhOO$-s   $'rU   )r1   rZ   r   r[   rl   s       @r    rd   z(GenericStanfordParser.tagged_parse_sents   s     OO:*
" ''MM		 $-  	
 		
r#   c                    | j                   }|j                  d|g       | j                  r)|j                  | j                  j                                dj	                  t
              }t        | j                  |       t        j                  dd      5 }t        |t              r|r|j                  |      }|j                  |       |j                          | j                  r7|j!                  d       t#        || j$                  |t&        t&              \  }}n?|j)                  |j*                         t#        || j$                  t&        t&        	      \  }}|j-                  d
d      }|j-                  dd      }|j/                  |      }d d d        t1        j2                  j*                         t        |d       S # 1 sw Y   7xY w)Nz	-encodingrP   )optionsr   wbF)modedeleter   )	classpathstdinstdoutstderr)rr   rt   ru   s        s    )r.   extendr/   r+   rA   r   r   r0   tempfileNamedTemporaryFile
isinstancestrencodewriteflush
_USE_STDINseekr   r-   r   r=   namereplacedecoder   unlink)	r1   r[   input_r   r4   default_options
input_filert   ru   s	            r    rX   zGenericStanfordParser._execute   so   >>

K*+JJt++1134((=1 	D--w? ((d5AZ&#&8x0V$ "!%"oo$" 

:??+!%4??4" ^^K6F^^K6F]]8,F3 B6 			*//" 	OU;A BAs   C<G

G)NNz4edu/stanford/nlp/models/lexparser/englishPCFG.ser.gzutf8Fz-mx4gr:   F)__name__
__module____qualname____doc__r*   r(   rV   r   r?   r8   rH   r\   ra   r_   re   rd   rX   r   r#   r    r   r      sb    *G"DGKJ! I.)`.
>
?
.
B%
N,r#   r   c                   ,     e Zd ZdZdZ fdZd Z xZS )StanfordParsera  
    >>> parser=StanfordParser(
    ...     model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
    ... ) # doctest: +SKIP

    >>> list(parser.raw_parse("the quick brown fox jumps over the lazy dog")) # doctest: +NORMALIZE_WHITESPACE +SKIP
    [Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
    Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']),
    Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])])]

    >>> sum([list(dep_graphs) for dep_graphs in parser.raw_parse_sents((
    ...     "the quick brown fox jumps over the lazy dog",
    ...     "the quick grey wolf jumps over the lazy fox"
    ... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
    [Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
    Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']),
    Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])]), Tree('ROOT', [Tree('NP',
    [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['grey']), Tree('NN', ['wolf'])]), Tree('NP',
    [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), Tree('NP', [Tree('DT', ['the']),
    Tree('JJ', ['lazy']), Tree('NN', ['fox'])])])])])])]

    >>> sum([list(dep_graphs) for dep_graphs in parser.parse_sents((
    ...     "I 'm a dog".split(),
    ...     "This is my friends ' cat ( the tabby )".split(),
    ... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
    [Tree('ROOT', [Tree('S', [Tree('NP', [Tree('PRP', ['I'])]), Tree('VP', [Tree('VBP', ["'m"]),
    Tree('NP', [Tree('DT', ['a']), Tree('NN', ['dog'])])])])]), Tree('ROOT', [Tree('S', [Tree('NP',
    [Tree('DT', ['This'])]), Tree('VP', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('NP', [Tree('PRP$', ['my']),
    Tree('NNS', ['friends']), Tree('POS', ["'"])]), Tree('NN', ['cat'])]), Tree('PRN', [Tree('-LRB-', [Tree('', []),
    Tree('NP', [Tree('DT', ['the']), Tree('NN', ['tabby'])]), Tree('-RRB-', [])])])])])])])]

    >>> sum([list(dep_graphs) for dep_graphs in parser.tagged_parse_sents((
    ...     (
    ...         ("The", "DT"),
    ...         ("quick", "JJ"),
    ...         ("brown", "JJ"),
    ...         ("fox", "NN"),
    ...         ("jumped", "VBD"),
    ...         ("over", "IN"),
    ...         ("the", "DT"),
    ...         ("lazy", "JJ"),
    ...         ("dog", "NN"),
    ...         (".", "."),
    ...     ),
    ... ))],[]) # doctest: +NORMALIZE_WHITESPACE +SKIP
    [Tree('ROOT', [Tree('S', [Tree('NP', [Tree('DT', ['The']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
    Tree('NN', ['fox'])]), Tree('VP', [Tree('VBD', ['jumped']), Tree('PP', [Tree('IN', ['over']), Tree('NP',
    [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])]), Tree('.', ['.'])])])]
    pennc                 \    t        j                  dt        d       t        |   |i | y )NzcThe StanfordParser will be deprecated
Please use [91mnltk.parse.corenlp.CoreNLPParser[0m instead.   
stacklevelwarningswarnDeprecationWarningsuperr8   r1   argskwargs	__class__s      r    r8   zStanfordParser.__init__F  s/    R		
 	$)&)r#   c                 ,    t        j                  |      S r   )r   
fromstringr1   results     r    r@   zStanfordParser._make_treeP  s    v&&r#   r   r   r   r   rW   r8   r@   __classcell__r   s   @r    r   r     s    0d N*'r#   r   c                   ,     e Zd ZdZdZ fdZd Z xZS )StanfordDependencyParsera
  
    >>> dep_parser=StanfordDependencyParser(
    ...     model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
    ... ) # doctest: +SKIP

    >>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP
    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])])]

    >>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP
    [[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')),
    ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')),
    ((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')),
    ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]]

    >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents((
    ...     "The quick brown fox jumps over the lazy dog.",
    ...     "The quick grey wolf jumps over the lazy fox."
    ... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])]),
    Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']), Tree('fox', ['over', 'the', 'lazy'])])]

    >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.parse_sents((
    ...     "I 'm a dog".split(),
    ...     "This is my friends ' cat ( the tabby )".split(),
    ... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
    [Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends', ['my', "'"]), Tree('tabby', ['the'])])]

    >>> sum([[list(parse.triples()) for parse in dep_graphs] for dep_graphs in dep_parser.tagged_parse_sents((
    ...     (
    ...         ("The", "DT"),
    ...         ("quick", "JJ"),
    ...         ("brown", "JJ"),
    ...         ("fox", "NN"),
    ...         ("jumped", "VBD"),
    ...         ("over", "IN"),
    ...         ("the", "DT"),
    ...         ("lazy", "JJ"),
    ...         ("dog", "NN"),
    ...         (".", "."),
    ...     ),
    ... ))],[]) # doctest: +NORMALIZE_WHITESPACE +SKIP
    [[((u'jumped', u'VBD'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')),
    ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')),
    ((u'jumped', u'VBD'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')),
    ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]]

    	conll2007c                 \    t        j                  dt        d       t        |   |i | y )NzwThe StanfordDependencyParser will be deprecated
Please use [91mnltk.parse.corenlp.CoreNLPDependencyParser[0m instead.r   r   r   r   s      r    r8   z!StanfordDependencyParser.__init__  s/    \		
 	$)&)r#   c                     t        |d      S )Nroottop_relation_labelr
   r   s     r    r@   z#StanfordDependencyParser._make_tree      v&AAr#   r   r   s   @r    r   r   T  s    .` !N*Br#   r   c                   H     e Zd ZdZdZdZdZdZdZdZ	 fdZ
d
dZd	 Z xZS )StanfordNeuralDependencyParserar  
    >>> from nltk.parse.stanford import StanfordNeuralDependencyParser # doctest: +SKIP
    >>> dep_parser=StanfordNeuralDependencyParser(java_options='-mx4g')# doctest: +SKIP

    >>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP
    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy']), '.'])]

    >>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP
    [[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det',
    (u'The', u'DT')), ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'),
    u'amod', (u'brown', u'JJ')), ((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')),
    ((u'dog', u'NN'), u'case', (u'over', u'IN')), ((u'dog', u'NN'), u'det',
    (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ')), ((u'jumps', u'VBZ'),
    u'punct', (u'.', u'.'))]]

    >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents((
    ...     "The quick brown fox jumps over the lazy dog.",
    ...     "The quick grey wolf jumps over the lazy fox."
    ... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over',
    'the', 'lazy']), '.']), Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']),
    Tree('fox', ['over', 'the', 'lazy']), '.'])]

    >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.parse_sents((
    ...     "I 'm a dog".split(),
    ...     "This is my friends ' cat ( the tabby )".split(),
    ... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
    [Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends',
    ['my', "'"]), Tree('tabby', ['-LRB-', 'the', '-RRB-'])])]
    conllz)edu.stanford.nlp.pipeline.StanfordCoreNLPz%stanford-corenlp-(\d+)(\.(\d+))+\.jarz,stanford-corenlp-(\d+)(\.(\d+))+-models\.jarTc                     t        j                  dt        d       t        |   |i | | xj
                  dz  c_        y )Nz}The StanfordNeuralDependencyParser will be deprecated
Please use [91mnltk.parse.corenlp.CoreNLPDependencyParser[0m instead.r   r   z(-annotators tokenize,ssplit,pos,depparse)r   r   r   r   r8   r/   r   s      r    r8   z'StanfordNeuralDependencyParser.__init__  sA    \		
 	$)&) JJr#   c                     t        d      )z
        Currently unimplemented because the neural dependency parser (and
        the StanfordCoreNLP pipeline class) doesn't support passing in pre-
        tagged tokens.
        zxtagged_parse[_sents] is not supported by StanfordNeuralDependencyParser; use parse[_sents] or raw_parse[_sents] instead.)NotImplementedError)r1   rZ   r   s      r    rd   z1StanfordNeuralDependencyParser.tagged_parse_sents  s     ":
 	
r#   c                     t        |d      S )NROOTr   r
   r   s     r    r@   z)StanfordNeuralDependencyParser._make_tree  r   r#   r   )r   r   r   r   rW   rV   r(   r*   r   r?   r8   rd   r@   r   r   s   @r    r   r     s;    > N=K3DHJ 	K

Br#   r   )r   rx   r   
subprocessr   nltk.internalsr   r   r   r   r   nltk.parse.apir	   nltk.parse.dependencygraphr   	nltk.treer   r)   r   r   r   r   r   r#   r    <module>r      sk    
     # 6 DrG rj@'* @'F>B4 >BB?B%: ?Br#   