
    gF:                     0   d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZm	Z	 d dl
mZmZ d dlmZmZ d Zd Zd Zd	 Zd
 Zd Zd Zd Zd Zd Zd Zd Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZd ZddZ eddg      Z eg d      Z d Z!e"dk(  r e        yy)    N)treebank)BrillTaggerTrainerRegexpTaggerUnigramTagger)PosWord)Template
error_listc                      t                y)z
    Run a demo with defaults. See source comments for details,
    or docstrings of any of the more specific demo_* functions.
    Npostag     B/var/www/openai/venv/lib/python3.12/site-packages/nltk/tbl/demo.pydemor      s	    
 Hr   c                      t        d       y)N
    Exemplify repr(Rule) (see also str(Rule) and Rule.format("verbose"))
    repr
ruleformatNr   r   r   r   demo_repr_rule_formatr      s     fr   c                      t        d       y)r   strr   Nr   r   r   r   demo_str_rule_formatr   $   s     er   c                      t        d       y)z*
    Exemplify Rule.format("verbose")
    verboser   Nr   r   r   r   demo_verbose_rule_formatr   +   s     i r   c                  F    t        t        t        g d            g       y)a  
    The feature/s of a template takes a list of positions
    relative to the current word where the feature should be
    looked for, conceptually joined by logical OR. For instance,
    Pos([-1, 1]), given a value V, will hold whenever V is found
    one step to the left and/or one step to the right.

    For contiguous ranges, a 2-arg form giving inclusive end
    points can also be used: Pos(-3, -1) is the same as the arg
    below.
    )	templatesN)r   r	   r   r   r   r   demo_multiposition_featurer$   2   s     hs<0123r   c            	      \    t        t        t        dg      t        ddg            g       y)z8
    Templates can have more than a single feature.
    r   r    r!   r"   N)r   r	   r   r   r   r   r   demo_multifeature_templater&   A   s$     htQCy#r2h-89:r   c                      t        dd       y)ah  
    Show aggregate statistics per template. Little used templates are
    candidates for deletion, much used templates may possibly be refined.

    Deleting unused templates is mostly about saving time and/or space:
    training is basically O(T) in the number of templates T
    (also in terms of memory usage, which often will be the limiting factor).
    T)incremental_statstemplate_statsNr   r   r   r   demo_template_statisticsr*   H   s     T$7r   c                     t        j                  g dddgd      } t        j                  g dddgd      }t        t	        j                  | |gd	            }t        d
j                  t        |                   t        |dd       y)a	  
    Template.expand and Feature.expand are class methods facilitating
    generating large amounts of templates. See their documentation for
    details.

    Note: training with 500 templates can easily fill all available
    even on relatively small corpora
    )r!   r      r,      F)excludezero)r    r!   r   r,   T)r,      )combinationsz8Generated {} templates for transformation-based learning)r#   r(   r)   N)	r   expandr   listr	   printformatlenr   )wordtplstagtplsr#   s      r   demo_generated_templatesr8   T   su     {{:1v5AHjj!QTBGX__h%8vNOI	BII	N	

 Y$tLr   c                       t        ddd       y)z
    Plot a learning curve -- the contribution on tagging accuracy of
    the individual rules.
    Note: requires matplotlib
    Tzlearningcurve.png)r(   separate_baseline_datalearning_curve_outputNr   r   r   r   demo_learning_curver<   h   s     #1r   c                      t        d       y)zW
    Writes a file with context for each erroneous word after tagging testing data
    z
errors.txt)error_outputNr   r   r   r   demo_error_analysisr?   u   s     %r   c                      t        d       y)zm
    Serializes the learned tagger to a file in pickle format; reloads it
    and validates the process.
    z
tagger.pcl)serialize_outputNr   r   r   r   demo_serialize_taggerrB   |   s    
 L)r   c                       t        ddd       y)z
    Discard rules with low accuracy. This may hurt performance a bit,
    but will often produce rules which are more interesting read to a human.
    i  gQ?
   )	num_sentsmin_acc	min_scoreNr   r   r   r   demo_high_accuracy_rulesrH      s    
 T426r   c           	         |xs t         }| ddlm}m}  |       } t	        |||||      \  }}}}|rt
        j                  j                  |      sRt        ||      }t        |d      5 }t        j                  ||       ddd       t        dj                  |             t        |      5 }t        j                  |      }t        d|        ddd       nt        ||      }t        d       |r)t        d	j                  j                  |                   t!        j                          }t#        | ||	
      }t        d       |j%                  ||||      }t        dt!        j                          |z
  dd       |rt        d|j                  |      z         |dk(  rNt        d       t'        |j)                         d      D ]&  \  }}t        |dd|j                  |	      d       ( |
r{t        d       |j+                  ||      \  } }!t        d       |st        d       |j-                         }"|r|j/                  |!       |rLt1        ||!|"|       t        d|        n.t        d       |j3                  |      } |r|j/                          |st        |d      5 }#|#j5                  d|z         |#j5                  dj7                  t9        ||             j;                  d      dz          ddd       t        d|        ||j3                  |      } t        |d      5 }t        j                  ||       ddd       t        d|        t        |      5 }t        j                  |      }$ddd       t        d|        |j3                  |      }%| |%k(  rt        d        yt        d!       yy# 1 sw Y   JxY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xxY w)"a
  
    Brill Tagger Demonstration
    :param templates: how many sentences of training and testing data to use
    :type templates: list of Template

    :param tagged_data: maximum number of rule instances to create
    :type tagged_data: C{int}

    :param num_sents: how many sentences of training and testing data to use
    :type num_sents: C{int}

    :param max_rules: maximum number of rule instances to create
    :type max_rules: C{int}

    :param min_score: the minimum score for a rule in order for it to be considered
    :type min_score: C{int}

    :param min_acc: the minimum score for a rule in order for it to be considered
    :type min_acc: C{float}

    :param train: the fraction of the the corpus to be used for training (1=all)
    :type train: C{float}

    :param trace: the level of diagnostic tracing output to produce (0-4)
    :type trace: C{int}

    :param randomize: whether the training data should be a random subset of the corpus
    :type randomize: C{bool}

    :param ruleformat: rule output format, one of "str", "repr", "verbose"
    :type ruleformat: C{str}

    :param incremental_stats: if true, will tag incrementally and collect stats for each rule (rather slow)
    :type incremental_stats: C{bool}

    :param template_stats: if true, will print per-template statistics collected in training and (optionally) testing
    :type template_stats: C{bool}

    :param error_output: the file where errors will be saved
    :type error_output: C{string}

    :param serialize_output: the file where the learned tbl tagger will be saved
    :type serialize_output: C{string}

    :param learning_curve_output: filename of plot of learning curve(s) (train and also test, if available)
    :type learning_curve_output: C{string}

    :param learning_curve_take: how many rules plotted
    :type learning_curve_take: C{int}

    :param baseline_backoff_tagger: the file where rules will be saved
    :type baseline_backoff_tagger: tagger

    :param separate_baseline_data: use a fraction of the training data exclusively for training baseline
    :type separate_baseline_data: C{bool}

    :param cache_baseline_tagger: cache baseline tagger to this file (only interesting as a temporary workaround to get
                                  deterministic output from the baseline unigram tagger between python versions)
    :type cache_baseline_tagger: C{string}


    Note on separate_baseline_data: if True, reuse training data both for baseline and rule learner. This
    is fast and fine for a demo, but is likely to generalize worse on unseen data.
    Also cannot be sensibly used for learning curves on training data (the baseline will be artificially high).
    Nr   )brill24describe_template_sets)backoffwz)Trained baseline tagger, pickled it to {}zReloaded pickled tagger from zTrained baseline taggerz!    Accuracy on test set: {:0.4f}r   zTraining tbl tagger...zTrained tbl tagger in z0.2fz secondsz    Accuracy on test set: %.4fr,   z
Learned rules: 4d szJIncrementally tagging the test data, collecting individual rule statisticsz    Rule statistics collectedzbWARNING: train_stats asked for separate_baseline_data=True; the baseline will be artificially high)takez Wrote plot of learning curve to zTagging the test datazErrors for Brill Tagger %r


zutf-8z)Wrote tagger errors including context to zWrote pickled tagger to z4Reloaded tagger tried on test set, results identicalz;PROBLEM: Reloaded tagger gave different results on test set)REGEXP_TAGGERnltk.tag.brillrJ   rK   _demo_prepare_dataospathexistsr   openpickledumpr3   r4   loadaccuracytimer   train	enumeraterulesbatch_tag_incrementaltrain_statsprint_template_statistics
_demo_plot	tag_sentswritejoinr
   encode)&r#   tagged_datarE   	max_rulesrG   rF   r_   trace	randomizer   r(   r)   r>   rA   r;   learning_curve_takebaseline_backoff_taggerr:   cache_baseline_taggerrJ   rK   training_databaseline_data	gold_datatesting_databaseline_taggerprint_rulestbrilltrainerbrill_taggerrulenorule
taggedtest	teststats
trainstatsfbrill_tagger_reloadedtaggedtest_reloadeds&                                         r   r   r      s   p 6FB
 I	>PUIy2H?;]M9l ww~~34+'>O +S1[O[9 2;BB)
 '(K$kk+6O12G1HIJ )( (?VW'(/66((3	
 YY[F EjG 

"#==	9gNL	"499;#7"=X
FG.1F1Fy1QQR z!"%l&8&8&:A>LFDVBKqZ!8 ;<= ?
 X	
 #/"D"D)#
Y 	-.%, "--/
229= %y*CV 45J4KLM%&!++L9
224 ,$GG47GGHGGDIIjJ?@GGPSWWX % 	9,HI #!++L9
"C(KKKk2 )()9(:;<"#{$*KK$<! $-.>-?@A*44\B,,HIOP $U 21 )(z %$ )( $#s=   *O	.$O0AO#6O/.O;	OO #O,/O8;Pc           	         | t        d       t        j                         } |t        |       |k  rt        |       }|r3t	        j
                  t        |              t	        j                  |        t        ||z        }| d | }| || }|D 	cg c]  }|D 	cg c]  }	|	d   	 c}	 }
}}	|s|}nt        |      dz  }|d | ||d  }}t        |      \  }}t        |
      \  }}t        |      \  }}t        d|dd|dd       t        d|dd|dd       t        d	j                  |||rd
nd             ||||
fS c c}	w c c}	}w )Nz%Loading tagged data from treebank... r   r/   zRead testing data (dz sents/z wds)zRead training data (z-Read baseline data ({:d} sents/{:d} wds) {:s} z[reused the training set])
r3   r   tagged_sentsr5   randomseedshuffleintcorpus_sizer4   )rj   r_   rE   rm   r:   cutoffrq   rs   senttrt   rr   	bl_cutoff	trainseqstraintokenstestseqs
testtokensbltrainseqsbltraintokenss                      r   rU   rU   Q  s   
 56++-C,	9$	C$%{#U"#F(MF9-I5>?YT4(4aQqT4(YL?!%&!+	*9%)*% &  +=9Y(6Xz#.}#= [-	|7:a.
FG	 1W[O5
IJ	7>>(B.I	
 =)\BB+ )?s   	EE$EEc                    |d   g}|d   D ]  }|j                  |d   |z
          |d | D cg c]  }d||d   z  z
   }}|d   g}|d   D ]  }|j                  |d   |z
          |d | D cg c]  }d||d   z  z
   }}dd lm} t        t	        t        |                  }	|j                  |	||	|       |j                  g d       |j                  |        y c c}w c c}w )Ninitialerrors
rulescoresr!   r,   
tokencountr   )NNNg      ?)	appendmatplotlib.pyplotpyplotr2   ranger5   plotaxissavefig)
r;   r}   r~   rQ   	testcurve	rulescorex
traincurvepltrs
             r   re   re   y  s   ?+,I|,	223 -:CET:JK:JQQ<000:JIK_-.J-	*R.945 .<Fu<MN<Mq!a*\222<MJN#U3y>"#AHHQ	1j)HH$%KK%& L
 Os   C!1C&z^-?[0-9]+(\.[0-9]+)?$CDz.*NN)	r   )z(The|the|A|a|An|an)$AT)z.*able$JJ)z.*ness$r   )z.*ly$RB)z.*s$NNS)z.*ing$VBG)z.*ed$VBDr   c                 <    t        |       t        d | D              fS )Nc              3   2   K   | ]  }t        |        y w)N)r5   ).0r   s     r   	<genexpr>zcorpus_size.<locals>.<genexpr>  s     04a3q64s   )r5   sum)seqss    r   r   r     s    Is040011r   __main__)NNi  ,  r/   Ng?r/   Fr   FFNNNr   NFN)NN)#rV   rZ   r   r^   nltk.corpusr   nltk.tagr   r   r   rT   r   r   nltk.tblr	   r
   r   r   r   r   r$   r&   r*   r8   r<   r?   rB   rH   r   rU   re   NN_CD_TAGGERrS   r   __name__r   r   r   <module>r      s    
      D D $ )!4;	8M(
&*7 

  'BQJ%CP'& =}MN
2 z r   