
    gb                     ^    d Z ddlmZmZmZ ddlmZmZ ddlm	Z	m
Z
 ddlmZ  G d de      Zy)	a!  
A general interface to the SENNA pipeline that supports any of the
operations specified in SUPPORTED_OPERATIONS.

Applying multiple operations at once has the speed advantage. For example,
Senna will automatically determine POS tags if you are extracting named
entities. Applying both of the operations will cost only the time of
extracting the named entities.

The SENNA pipeline has a fixed maximum size of the sentences that it can read.
By default it is 1024 token/sentence. If you have larger sentences, changing
the MAX_SENTENCE_SIZE value in SENNA_main.c should be considered and your
system specific binary should be rebuilt. Otherwise this could introduce
misalignment errors.

The input is:

- path to the directory that contains SENNA executables. If the path is incorrect,
  Senna will automatically search for executable file specified in SENNA environment variable
- List of the operations needed to be performed.
- (optionally) the encoding of the input data (default:utf-8)

Note: Unit tests for this module can be found in test/unit/test_senna.py

>>> from nltk.classify import Senna
>>> pipeline = Senna('/usr/share/senna-v3.0', ['pos', 'chk', 'ner'])  # doctest: +SKIP
>>> sent = 'Dusseldorf is an international business center'.split()
>>> [(token['word'], token['chk'], token['ner'], token['pos']) for token in pipeline.tag(sent)]  # doctest: +SKIP
[('Dusseldorf', 'B-NP', 'B-LOC', 'NNP'), ('is', 'B-VP', 'O', 'VBZ'), ('an', 'B-NP', 'O', 'DT'),
('international', 'I-NP', 'O', 'JJ'), ('business', 'I-NP', 'O', 'NN'), ('center', 'I-NP', 'O', 'NN')]
    )environpathsep)architecturesystem)PIPEPopen)TaggerIc                   4    e Zd Zg dZddZd Zd Zd Zd Zy)	Senna)poschknerc                    || _         t        j                  |      t        z   | _        | j                  | j                        }t        j                  |      srdt        v rjt        j                  t        d         t        z   | _        | j                  | j                        }t        j                  |      st        d|d|d      || _	        y )NSENNAzSenna executable expected at z or z but not found)
	_encodingr   normpathr   _path
executableisfiler   LookupError
operations)self
senna_pathr   encoding
exe_file_1
exe_file_2s         H/var/www/openai/venv/lib/python3.12/site-packages/nltk/classify/senna.py__init__zSenna.__init__2   s    !]]:.4
 __TZZ0
{{:&'!!]]77+;<sB
!__TZZ8
{{:.%%z3 
 %    c                 4   t               }|dk(  r>t               d   }|dk(  rt        j                  |d      S t        j                  |d      S |dk(  rt        j                  |d      S |dk(  rt        j                  |d	      S t        j                  |d
      S )z
        The function that determines the system specific binary that should be
        used in the pipeline. In case, the system is not known the default senna binary will
        be used.
        Linuxr   64bitzsenna-linux64zsenna-linux32Windowszsenna-win32.exeDarwinz	senna-osxsenna)r   r   r   join)r   	base_pathos_namebitss       r   r   zSenna.executableG   s     (g>!$DwyyO<<99Y88i99Y(9::h99Y44yyG,,r    c                 j    i }d}t         j                  D ]  }|| j                  v s|||<   |dz  } |S )z
        A method that calculates the order of the columns that SENNA pipeline
        will output the tags into. This depends on the operations being ordered.
           )r   SUPPORTED_OPERATIONSr   )r   _mapi	operations       r   r.   z
Senna._mapY   sC    
 33IDOO+"#YQ 4 r    c                 ,    | j                  |g      d   S )zI
        Applies the specified operation(s) on a list of tokens.
        r   )	tag_sents)r   tokenss     r   tagz	Senna.tagf   s     ~~vh'**r    c                 Z   | j                   }t        j                  | j                  | j                              s't        d| j                  | j                        z        | j                  | j                        d| j                  ddg}|j                  | j                  D cg c]  }d|z   	 c}       dj                  d |D              dz   }t        |t              r|r|j                  |      }t        |t        t        t              }|j                  |	      \  }}|}	|j                  d
k7  rt!        d|z        |r|j#                  |      }	| j%                         }
g g}d
}d
}|	j'                         j)                  d      D ]u  }|s|j+                  g        |dz  }d
}|j)                  d      }i }|
D ]  }||
|      j'                         ||<    	 ||   |   |d<   |d   j+                  |       |dz  }w |S c c}w # t,        $ r}t-        d|z        |d}~ww xY w)z
        Applies the tag method over a list of sentences. This method will return a
        list of dictionaries. Every dictionary will contain a word with its
        calculated annotations/tags.
        z-Senna executable expected at %s but not foundz-pathz
-usrtokensz-iobtags-
c              3   >   K   | ]  }d j                  |        yw) N)r'   ).0xs     r   	<genexpr>z"Senna.tag_sents.<locals>.<genexpr>   s     :	1388A;	s   )stdinstdoutstderr)inputr   z!Senna command failed! Details: %sr,   	wordzMisalignment error occurred at sentence number %d. Possible reason is that the sentence size exceeded the maximum size. Check the documentation of Senna class for more information.N)r   r   r   r   r   r   extendr   r'   
isinstancestrencoder	   r   communicate
returncodeRuntimeErrordecoder.   stripsplitappend
IndexError)r   	sentencesr   
_senna_cmdop_inputpr>   r?   senna_outputmap_tagged_sentencessentence_indextoken_indextagged_wordtagsresultr4   es                      r   r2   zSenna.tag_sentsl   s?    >>{{4??4::67?//$**-.  OODJJ'JJ

 	doo>o38o>? :	::TAfc"x]]8,F *DdC==v=6 <<1BVKLL!==2L yy{4'--/55d;K ''+!#$$T*DF"49o335s !*>!:;!Gv R ''/1K) <*  [ ?H   I %%
 s   %H H	H*H%%H*N)zutf-8)	__name__
__module____qualname__r-   r   r   r.   r4   r2    r    r   r   r   /   s"    0%*-$+C r    r   N)__doc__osr   r   r   platformr   r   
subprocessr   r	   nltk.tag.apir
   r   ra   r    r   <module>rg      s*   @ " ! ) "  @ G @ r    