
    g~0                     2   d Z ddlZddlZddlmZ ddlmZmZ 	 ddl	m
Z
 ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ  G d de      Z G d de      Zd Zd Zd ZddZd Zd Z G d de      ZddZ	 e dk(  r ed        ed       yy# e$ r Y uw xY w)z
Named entity chunker
    N)ElementTree)ClassifierBasedTaggerpos_tag)MaxentClassifier)ChunkParserI)
ChunkScorefind)word_tokenize)Treec                   *    e Zd ZdZddZd Zd Zd Zy)NEChunkParserTaggerz2
    The IOB tagger used by the chunk parser.
    Nc                 J    t        j                  | || j                  |       y )N)trainclassifier_builder
classifier)r   __init___classifier_builder)selfr   r   s      L/var/www/openai/venv/lib/python3.12/site-packages/nltk/chunk/named_entity.pyr   zNEChunkParserTagger.__init__$   s"    &&#77!		
    c                 4    t        j                  |ddd      S )Niis      )	algorithmgaussian_prior_sigmatrace)r   r   r   r   s     r   r   z'NEChunkParserTagger._classifier_builder,   s!    %%!"
 	
r   c                     	 | j                   }|S # t        $ r5 ddlm} t	        |j                  d            | _         | j                   }Y |S w xY w)Nr   )wordszen-basic)_en_wordlistAttributeErrornltk.corpusr!   set)r   wlr!   s      r   _english_wordlistz%NEChunkParserTagger._english_wordlist5   sS    	#""B 	  	#) #EKK
$; <D""B		#s    :AAc                    ||   d   }t        ||   d         }|dk(  rd x}}d x}}	d x}
x}}n|dk(  rA||dz
     d   j                         }d }t        ||dz
     d         }d }	||dz
     d   }d x}
}nu||dz
     d   j                         }||dz
     d   j                         }t        ||dz
     d         }t        ||dz
     d         }	||dz
     }||dz
     }t        |      }
|t        |      dz
  k(  r	d x}}d x}}n|t        |      dz
  k(  r7||dz      d   j                         }||dz      d   j                         }d }d }nd||dz      d   j                         }||dz      d   j                         }||dz      d   j                         }||dz      d   j                         }i dddt        |      dt        |      d|d d	 j                         d
|dd  j                         d|d|d|| j	                         v d|d|d|d|d|d|j                          d| d| d| d|
 d| }|S )Nr   r   r   biasTshapewordlenprefix3   suffix3poswordzen-wordlistprevtagprevposnextposprevwordnextwordzword+nextpos+zpos+prevtagzshape+prevtag)simplify_poslowerr*   lenr'   )r   tokensindexhistoryr1   r0   r5   prevprevwordr3   prevprevpos	prevshaper2   prevprevtagr6   nextnextwordr4   nextnextposfeaturess                     r   _feature_detectorz%NEChunkParserTagger._feature_detector?   sC   e}Q6%=+,A:&**H|$((Gk044I4+aZeai(+113HL"6%!)#4Q#78GKeai(+G&**Ieai(+113H!%!),Q/557L"6%!)#4Q#78G&veai'8';<Keai(G!%!),KhICK!O#&**H|$((Gkc&kAo%eai(+113HUQY'*002GLKeai(+113HUQY'*002G!%!),Q/557L +A.446K
D
U4[
 s4y
 tBQx~~'	

 tBCy(
 3
 D
 DD$:$:$<<
 w
 w
 w
 
 
 tzz|nAgY7
 cU!G9-
  	{!G95!
& r   )NN)__name__
__module____qualname____doc__r   r   r'   rE    r   r   r   r      s    

8r   r   c                   8    e Zd ZdZd Zd Zd Zd Zed        Z	y)NEChunkParser2
    Expected input: list of pos-tagged words
    c                 &    | j                  |       y N)_trainr   s     r   r   zNEChunkParser.__init__   s    Er   c                 ^    | j                   j                  |      }| j                  |      }|S )z8
        Each token should be a pos-tagged word
        )_taggertag_tagged_to_parse)r   r;   taggedtrees       r   parsezNEChunkParser.parse   s-     !!&)$$V,r   c                 l    |D cg c]  }| j                  |       }}t        |      | _        y c c}w )N)r   )_parse_to_taggedr   rR   )r   corpusss      r   rP   zNEChunkParser._train   s2    4:;Fq$''*F;*8 <s   1c                    t        dg       }|D ]  \  }}|dk(  r|j                  |       |j                  d      r |j                  t        |dd |g             N|j                  d      s`|rAt        |d   t               r.|d   j	                         |dd k(  r|d   j                  |       |j                  t        |dd |g              |S )zH
        Convert a list of tagged tokens to a chunk-parse tree.
        SOB-r   NI-)r   append
startswith
isinstancelabel)r   tagged_tokenssenttokrS   s        r   rT   zNEChunkParser._tagged_to_parse   s     C}%HCczC %DQR3%01%JtBx648>>;KsSTSUw;VHOOC(KKSWse 45 & r   c                 >   g }| D ]  }t        |t              rpt        |      dk(  rt        d       -|j	                  |d   d|j                          f       |dd D ]&  }|j	                  |d|j                          f       ( |j	                  |df        |S )zH
        Convert a chunk-parse tree to a list of tagged tokens.
        r   z"Warning -- empty chunk in sentencer_   r   Nr`   r^   )rd   r   r:   printrb   re   )rg   tokschildrh   s       r   rY   zNEChunkParser._parse_to_tagged   s    
 E%&u:?>?U1XEKKM?';<= 9CKK5;;=/&: ;< % UCL)  r   N)
rF   rG   rH   rI   r   rW   rP   rT   staticmethodrY   rJ   r   r   rL   rL   z   s/    9$  r   rL   c                 ,   t        j                  d| t         j                        ryt        j                  d| t         j                        ryt        j                  d| t         j                        r#| j                         ry| j	                         ryyy	)
Nz![0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$numberz\W+$punctz\w+$upcasedowncase	mixedcaseother)rematchUNICODEistitleislower)r1   s    r   r*   r*      sc    	xx4dBJJG	'4	,	'4	,<<>\\^r   c                 N    | j                  d      ry| j                  d      d   S )NV-r   )rc   split)r[   s    r   r8   r8      s#    ||Cwws|Ar   c                 h   | j                         }d t        |      D        }t        dg       }| D ]~  }t        |t              rP|j	                  t        |j                         g              |D ]!  }|d   j	                  |t        |      f       # c|j	                  |t        |      f        |S )Nc              3   &   K   | ]	  \  }}|  y wrO   rJ   ).0r1   r0   s      r   	<genexpr>zpostag_tree.<locals>.<genexpr>   s     6~s~s   r]   ra   )leavesr   r   rd   rb   re   next)rV   r!   tag_iternewtreerl   subchilds         r   postag_treer      s    KKME6wu~6H3mGeT"NN4r23!""Hd8n#=> " NNE4>23  Nr   binaryc           	   #     K   | D ]  }t        j                  |      D ]e  \  }}}|j                  d      r|r|D ]F  }|j                  d      st        t         j                  j                  ||      |      E d {    H g  y 7 w)Nbnewsz.sgm)oswalkendswithload_ace_filepathjoin)rootsfmt
skip_bnewsrootdirsfilesfs          r   load_ace_datar      su     !#D$}}W%*::f%,RWW\\$-BCHHH  "/  Is   AB
-B
<B
=B
c           	   #     K   t        dt        j                  j                  |       d           | dz   }g }t	        |      5 }t        j                  |      j                         }d d d        j                  d      D ]  }|j                  d      j                  }|j                  d      D ]v  }|j                  d      dk7  rt        |j                  d	      j                        }	t        |j                  d
      j                        dz   }
|j                  |	|
|f       x  t	        |       5 }|j                         }d d d        t        j                   dd      }d }t        j                   d||      }t        j                   dd|      }t        j                   dd|      }t        j                   dd|      }|D 	
ch c]  \  }	}
}|
 }}
}	}|dk(  rd}t#        dg       }t%        |      D ]^  \  }	}
}|	|k  r|}	|
|	k  r|j'                  t)        |||	              |j                  t#        d||	|
 j                                      |
}` |j'                  t)        ||d               | y |dk(  rd}t#        dg       }t%        |      D ]^  \  }	}
}|	|k  r|}	|
|	k  r|j'                  t)        |||	              |j                  t#        |||	|
 j                                      |
}` |j'                  t)        ||d               | y t+        d      # 1 sw Y   xY w# 1 sw Y   xY wc c}}
}	w w)Nz  - r   z.tmx.rdc.xmlzdocument/entityentity_typeentity_mentionTYPENAMEzhead/charseq/startzhead/charseq/endz<(?!/?TEXT)[^>]+> c                 P    d| j                         | j                         z
  dz
  z  S )N    )endstart)ms    r   subfunczload_ace_file.<locals>.subfunc   s#    aeeg	)A-..r   z[\s\S]*<TEXT>z</TEXT>[\s\S]*z``z "z''z" r   r   r]   NE
multiclasszbad fmt value)rj   r   r   r}   openETrW   getrootfindallr
   textgetintrb   readru   subr   sortedextendr   
ValueError)textfiler   annfileentitiesinfilexmlentitytypmentionr[   er   r   entity_typesirk   s                   r   r   r      s    	Dx(+,
-.'G H	g&hhv&&( 
++/0kk-(--~~&67G{{6"f,GLL!56;;<AGLL!3499:Q>AOOQ3K( 8 1 
h6{{} 
 66%r40D/ 66"GT2D66#R.D 66$d#D66$d#D+348KQ3C8L4 hC})IAq#1uAvKKd1Qi01KKT4!9??#456A * 	M$qr(+,
 
	C})IAq#1uAvKKd1Qi01KKS$q)//"345A * 	M$qr(+,
 ))} 
 
" 5sE   AM	$L('CM	5L5BM	
MEM	(L2-M	5L?:M	c           	      L   t         j                  |       } t         j                  |      }d}t        | |      D ]i  \  \  }}\  }}||cxk(  rdk(  r;n n8|rt        d|dd|dd|        t        dj	                  ddd             d}Rd}t        d|dd|dd|        k y )	NFr^   z  15r   z  {:15} {:15} {2}z...T)rL   rY   ziprj   format)correctguessedellipsiswctgts         r   
cmp_chunksr   .  s    ,,W5G,,W5GH1B!R?s?2b'2b'1#./)00uEFHBr"gQr"gQqc*+ 2r   c                   $    e Zd ZdZddZd Zd Zy)Maxent_NE_ChunkerrM   c                 `    ddl m} || _         |d| d      | _        | j	                          y )Nr   r	   z+chunkers/maxent_ne_chunker_tab/english_ace_/)	nltk.datar
   _fmt_tab_dirload_params)r   r   r
   s      r   r   zMaxent_NE_Chunker.__init__E  s.    "	J3%qQRr   c                     ddl m}m}  || j                        \  }}}}t	         ||||      |      }t        |      | _        y )Nr   )BinaryMaxentFeatureEncodingload_maxent_params)alwayson_features)r   )nltk.classify.maxentr   r   r   r   r   rR   )r   r   r   wgtmpglabaonmcs           r   r   zMaxent_NE_Chunker.load_paramsL  sB    X/>S#s'SCH#
 +b9r   c           	          ddl m} | j                  j                  }|j                  }|j
                  }|j                  }|j                  }|j                  }| j                  } |||||d| d       y )Nr   )save_maxent_paramsz/tmp/english_ace_r   )tab_dir)
r   r   rR   _classifier	_encoding_weights_mapping_labels	_alwaysonr   )	r   r   classifecgr   r   r   r   r   s	            r   save_paramszMaxent_NE_Chunker.save_paramsU  sh    ;,,**llkkmmii3S#9J3%q7QRr   Nr   )rF   rG   rH   rI   r   r   r   rJ   r   r   r   r   @  s    :
Sr   r   r   c                 <    t        |       }|j                          |S rO   )r   r   )r   chunkers     r   build_modelr   b  s    $GNr   __main__)r   Tr   )!rI   r   ru   	xml.etreer   r   nltk.tagr   r   nltk.classifyr   ImportErrornltk.chunk.apir   nltk.chunk.utilr   r   r
   nltk.tokenizer   	nltk.treer   r   rL   r*   r8   r   r   r   r   r   r   rF   rJ   r   r   <module>r      s    
 	 ' 3	. ( &  ' X/ Xv8L 8v ID*R,$S SD(T z   		s   B BB