
    g                       d Z ddlZddlZddlZddlmZ ddlmZmZm	Z	m
Z
mZmZmZmZ ddlmZ ddlmZ dZ	 dZ	 d	Z	 d
Z	 dZ	 dZ	 eez   ez   Z	 eez   ez   Z	 eeeeeedZ	 dZdZdZdZdZ dZ!dZ" G d d      Z# ejH                  dejJ                        Z&	 d Z' G d d      Z( G d d      Z) G d d      Z* G d de*      Z+ G d  d!e*e      Z, G d" d#e,      Z-d$ Z.d)d%Z/d&Z0d' Z1e,e+fd(Z2y)*a}  
Punkt Sentence Tokenizer

This tokenizer divides a text into a list of sentences
by using an unsupervised algorithm to build a model for abbreviation
words, collocations, and words that start sentences.  It must be
trained on a large collection of plaintext in the target language
before it can be used.

The NLTK data package includes a pre-trained Punkt tokenizer for
English.

    >>> from nltk.tokenize import PunktTokenizer
    >>> text = '''
    ... Punkt knows that the periods in Mr. Smith and Johann S. Bach
    ... do not mark sentence boundaries.  And sometimes sentences
    ... can start with non-capitalized words.  i is a good variable
    ... name.
    ... '''
    >>> sent_detector = PunktTokenizer()
    >>> print('\n-----\n'.join(sent_detector.tokenize(text.strip())))
    Punkt knows that the periods in Mr. Smith and Johann S. Bach
    do not mark sentence boundaries.
    -----
    And sometimes sentences
    can start with non-capitalized words.
    -----
    i is a good variable
    name.

(Note that whitespace from the original text, including newlines, is
retained in the output.)

Punctuation following sentences is also included by default
(from NLTK 3.0 onwards). It can be excluded with the realign_boundaries
flag.

    >>> text = '''
    ... (How does it deal with this parenthesis?)  "It should be part of the
    ... previous sentence." "(And the same with this one.)" ('And this one!')
    ... "('(And (this)) '?)" [(and this. )]
    ... '''
    >>> print('\n-----\n'.join(
    ...     sent_detector.tokenize(text.strip())))
    (How does it deal with this parenthesis?)
    -----
    "It should be part of the
    previous sentence."
    -----
    "(And the same with this one.)"
    -----
    ('And this one!')
    -----
    "('(And (this)) '?)"
    -----
    [(and this. )]
    >>> print('\n-----\n'.join(
    ...     sent_detector.tokenize(text.strip(), realign_boundaries=False)))
    (How does it deal with this parenthesis?
    -----
    )  "It should be part of the
    previous sentence.
    -----
    " "(And the same with this one.
    -----
    )" ('And this one!
    -----
    ')
    "('(And (this)) '?
    -----
    )" [(and this.
    -----
    )]

However, Punkt is designed to learn parameters (a list of abbreviations, etc.)
unsupervised from a corpus similar to the target domain. The pre-packaged models
may therefore be unsuitable: use ``PunktSentenceTokenizer(text)`` to learn
parameters from the given text.

:class:`.PunktTrainer` learns parameters such as a list of abbreviations
(without supervision) from portions of text. Using a ``PunktTrainer`` directly
allows for incremental training and modification of the hyper-parameters used
to decide what is considered an abbreviation, etc.

The algorithm for this tokenizer is described in::

  Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence
    Boundary Detection.  Computational Linguistics 32: 485-525.
    N)defaultdict)AnyDictIteratorListMatchOptionalTupleUnionFreqDist)
TokenizerI                @   ))initialupper)internalr   )unknownr   )r   lower)r   r   )r   r   zdefault decisionzknown collocation (both words)z%abbreviation + orthographic heuristicz(abbreviation + frequent sentence starterz initial + orthographic heuristicz(initial + special orthographic heuristicc                       e Zd ZdZdZd Zd ZdZ	 ed        Z	dZ
	  ej                  dej                        Z	 d	Z	 ed
        Z	 dZ	 dZ	 d Zd ZdZ	 d Zy)PunktLanguageVarsaX  
    Stores variables, mostly regular expressions, which may be
    language-dependent for correct application of the algorithm.
    An extension of this class may modify its properties to suit
    a language other than English; an instance can then be passed
    as an argument to PunktSentenceTokenizer and PunktTrainer
    constructors.
    )_re_period_context_re_word_tokenizerc                      yN    selfs    H/var/www/openai/venv/lib/python3.12/site-packages/nltk/tokenize/punkt.py__getstate__zPunktLanguageVars.__getstate__   s         c                      yr   r!   )r#   states     r$   __setstate__zPunktLanguageVars.__setstate__   s    r&   ).?!c                 d    dt        j                  dj                  | j                              z  S )Nz[%s] )reescapejoinsent_end_charsr"   s    r$   _re_sent_end_charsz$PunktLanguageVars._re_sent_end_chars   s%    		"''$*=*=">???r&   z,:;z["\')\]}]+?(?:\s+|(?=--)|$)z[^\(\"\`{\[:;&\#\*@\)}\]\-,]c                 ~    dt        j                  dj                  t        | j                        dhz
              z  S )Nz(?:[)\";}\]\*:@\'\({\[%s])r.   r*   )r/   r0   r1   setr2   r"   s    r$   _re_non_word_charsz$PunktLanguageVars._re_non_word_chars   s8    ,ryyGGC++,u450
 
 	
r&   z (?:\-{2,}|\.{2,}|(?:\.\s){2,}\.)a  (
        %(MultiChar)s
        |
        (?=%(WordStart)s)\S+?  # Accept word characters until end is found
        (?= # Sequences marking a word's end
            \s|                                 # White-space
            $|                                  # End-of-string
            %(NonWord)s|%(MultiChar)s|          # Punctuation
            ,(?=$|\s|%(NonWord)s|%(MultiChar)s) # Comma if at end of word
        )
        |
        \S
    )c                 $   	 | j                   S # t        $ rx t        j                  | j                  | j
                  | j                  | j                  dz  t        j                  t        j                  z        | _         | j                   cY S w xY w)z?Compiles and returns a regular expression for word tokenization)NonWord	MultiChar	WordStart)
r   AttributeErrorr/   compile_word_tokenize_fmtr6   _re_multi_char_punct_re_word_startUNICODEVERBOSEr"   s    r$   _word_tokenizer_rez$PunktLanguageVars._word_tokenizer_re   s    	+*** 
	+&(jj''#66!%!:!:!%!4!4 

RZZ''D# ***
	+s    A>BBc                 @    | j                         j                  |      S )z=Tokenize a string to split off punctuation other than periods)rB   findall)r#   ss     r$   word_tokenizezPunktLanguageVars.word_tokenize  s    &&(0033r&   a   
        %(SentEndChars)s             # a potential sentence ending
        (?=(?P<after_tok>
            %(NonWord)s              # either other punctuation
            |
            \s+(?P<next_tok>\S+)     # or whitespace and some other token
        ))c                     	 | j                   S #  t        j                  | j                  | j                  | j
                  dz  t        j                  t        j                  z        | _         | j                   cY S xY w)zjCompiles and returns a regular expression to find contexts
        including possible sentence boundaries.)r8   SentEndChars)r   r/   r<   _period_context_fmtr6   r3   r@   rA   r"   s    r$   period_context_rez#PunktLanguageVars.period_context_re  sp    	+***		+&(jj((#66$($;$;
 

RZZ''D# ***s
    A,A<N)__name__
__module____qualname____doc__	__slots__r%   r)   r2   propertyr3   internal_punctuationr/   r<   	MULTILINEre_boundary_realignmentr?   r6   r>   r=   rB   rF   rI   rJ   r!   r&   r$   r   r      s     =I %NA@ @ !) )bjj)GV1 5N<
 

 5>=	+ 4L+r&   r   z[^\W\d]c              #      K   t        |       } 	 t        |       }| D ]
  }||f |} |df y# t        $ r Y yw xY ww)z
    Yields pairs of tokens from the given iterator such that each input
    token will appear as the first element in a yielded tuple. The last
    pair will have None as its second element.
    N)iternextStopIteration)iteratorprevels      r$   
_pair_iterr[   9  sY      H~HH~ Rj  ,  s   ?0 ?	<?<?c                   :    e Zd ZdZd Zd Zd Zd Zd Zd Z	d Z
y	)
PunktParameterszCStores data used to perform sentence boundary detection with Punkt.c                     t               | _        	 t               | _        	 t               | _        	 t	        t
              | _        y N)r5   abbrev_typescollocationssent_startersr   intortho_contextr"   s    r$   __init__zPunktParameters.__init__R  sC    E:E	 !U	# )-	5r&   c                 "    t               | _        y r_   )r5   r`   r"   s    r$   clear_abbrevszPunktParameters.clear_abbrevsf      Er&   c                 "    t               | _        y r_   )r5   ra   r"   s    r$   clear_collocationsz"PunktParameters.clear_collocationsi  rh   r&   c                 "    t               | _        y r_   )r5   rb   r"   s    r$   clear_sent_startersz#PunktParameters.clear_sent_startersl  s     Ur&   c                 ,    t        t              | _        y r_   )r   rc   rd   r"   s    r$   clear_ortho_contextz#PunktParameters.clear_ortho_contexto  s    (-r&   c                 2    | j                   |xx   |z  cc<   y r_   )rd   )r#   typflags      r$   add_ortho_contextz!PunktParameters.add_ortho_contextr  s    34'r&   c              #      K   | j                   |   }|t        z  rd |t        z  rd |t        z  rd |t        z  rd |t
        z  rd |t        z  rd y y w)NzBEG-UCzMID-UCzUNK-UCzBEG-LCzMID-LCzUNK-LC)rd   _ORTHO_BEG_UC_ORTHO_MID_UC_ORTHO_UNK_UC_ORTHO_BEG_LC_ORTHO_MID_LC_ORTHO_UNK_LC)r#   rp   contexts      r$   _debug_ortho_contextz$PunktParameters._debug_ortho_contextu  si     $$S)]"N]"N]"N]"N]"N]"N #s   A A"N)rK   rL   rM   rN   re   rg   rj   rl   rn   rr   r{   r!   r&   r$   r]   r]   O  s(    M5(""#.(r&   r]   c                      e Zd ZdZg dZg dez   Zd Z ej                  d      Z	 ej                  d      Z
 ej                  dej                        Z ej                  dej                        Zd	 Zed
        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zd Zd Zy)
PunktTokenzXStores a token of text with annotations produced during
    sentence boundary detection.)	parastart	linestart	sentbreakabbrellipsis)toktypeperiod_finalc                     || _         | j                  |      | _        |j                  d      | _        | j
                  D ]  }t        | |d         |D ]  }t        | |||           y )Nr*   )r   	_get_typer   endswithr   _propertiessetattr)r#   r   paramspropks        r$   re   zPunktToken.__init__  s`    NN3'	LL-$$DD$% %AD!VAY' r&   z\.\.+$z^-?[\.,]?\d[\d,\.-]*\.?$z
[^\W\d]\.$z	[^\W\d]+$c                 V    | j                   j                  d|j                               S )z6Returns a case-normalized representation of the token.
##number##)_RE_NUMERICsubr   )r#   r   s     r$   r   zPunktToken._get_type  s!    ##L#))+>>r&   c                     t        | j                        dkD  r!| j                  d   dk(  r| j                  dd S | j                  S )zG
        The type with its final period removed if it has one.
        r    r*   N)lenr   r"   s    r$   type_no_periodzPunktToken.type_no_period  s=    
 tyy>A$))B-3"699Sb>!yyr&   c                 J    | j                   r| j                  S | j                  S )ze
        The type with its final period removed if it is marked as a
        sentence break.
        )r   r   r   r"   s    r$   type_no_sentperiodzPunktToken.type_no_sentperiod  s!     >>&&&yyr&   c                 <    | j                   d   j                         S )z1True if the token's first character is uppercase.r   )r   isupperr"   s    r$   first_upperzPunktToken.first_upper       xx{""$$r&   c                 <    | j                   d   j                         S )z1True if the token's first character is lowercase.r   )r   islowerr"   s    r$   first_lowerzPunktToken.first_lower  r   r&   c                 8    | j                   ry| j                  ryy)Nr   r   none)r   r   r"   s    r$   
first_casezPunktToken.first_case  s    r&   c                 L    | j                   j                  | j                        S )z.True if the token text is that of an ellipsis.)_RE_ELLIPSISmatchr   r"   s    r$   is_ellipsiszPunktToken.is_ellipsis  s       &&txx00r&   c                 8    | j                   j                  d      S )z+True if the token text is that of a number.r   )r   
startswithr"   s    r$   	is_numberzPunktToken.is_number  s     yy##L11r&   c                 L    | j                   j                  | j                        S )z-True if the token text is that of an initial.)_RE_INITIALr   r   r"   s    r$   
is_initialzPunktToken.is_initial  s     %%dhh//r&   c                 L    | j                   j                  | j                        S )z)True if the token text is all alphabetic.)	_RE_ALPHAr   r   r"   s    r$   is_alphazPunktToken.is_alpha  s     ~~##DHH--r&   c                 @    t         j                  | j                        S )z6True if the token is either a number or is alphabetic.)_re_non_punctsearchr   r"   s    r$   is_non_punctzPunktToken.is_non_punct  s     ##DII..r&   c                 (     j                    j                  k7  rdt         j                         z  nd}dj                   fd j                  D              }dj                   j                  j                  t         j                        ||      S )z
        A string representation of the token that can reproduce it
        with eval(), which lists all the token's non-default
        annotations.
        z	 type=%s,r.   z, c           	   3   j   K   | ]*  }t        |      r| d t        t        |              , yw)=N)getattrrepr).0pr#   s     r$   	<genexpr>z&PunktToken.__repr__.<locals>.<genexpr>  s;      
%tQ c4a()*+%s   03z{}({},{} {}))r   r   r   r1   r   format	__class__rK   )r#   typestrpropvalss   `  r$   __repr__zPunktToken.__repr__  s|     48993H+TYY/b99 
%%
 
 $$NN##N	
 	
r&   c                     | j                   }| j                  r|dz  }| j                  r|dz  }| j                  r|dz  }|S )zO
        A string representation akin to that used by Kiss and Strunk.
        z<A>z<E>z<S>)r   r   r   r   )r#   ress     r$   __str__zPunktToken.__str__  sB     hh995LC==5LC>>5LC
r&   N)rK   rL   rM   rN   r   rO   re   r/   r<   r   r   r@   r   r   r   rP   r   r   r   r   r   r   r   r   r   r   r   r   r!   r&   r$   r}   r}     sD   $ NK/+=I( 2::i(L"**89K"**]BJJ7K

<4I?     % % % %   1 1 2 2 0 0 . . / /
*r&   r}   c                   P    e Zd ZdZdedfdZd Zdee   dee   fdZdeddfd	Z	y)
PunktBaseClasszP
    Includes common components of PunktTrainer and PunktSentenceTokenizer.
    Nc                 ^    |
t               }|
t               }|| _        || _        || _        y r_   )r   r]   _params
_lang_vars_Token)r#   	lang_vars	token_clsr   s       r$   re   zPunktBaseClass.__init__  s7    )+I>$&F#	#r&   c              #   D  K   d}|j                  d      D ]w  }|j                         rct        | j                  j	                  |            }	 t        |      }| j                  ||d       d}|D ]  }| j                  |        vd}y y# t        $ r Y w xY ww)aB  
        Divide the given text into tokens, using the punkt word
        segmentation regular expression, and generate the resulting list
        of tokens augmented as three-tuples with two boolean values for whether
        the given token occurs at the start of a paragraph or a new line,
        respectively.
        F
T)r~   r   N)splitstriprU   r   rF   rV   rW   r   )r#   	plaintextr~   line	line_toksr   s         r$   _tokenize_wordszPunktBaseClass._tokenize_words*  s      	OOD)Dzz| !>!>t!DE	y/C kk#dkKK!	$C++c** % !	 * % s*   AB B8B 	BB BB tokensreturnc              #   D   K   |D ]  }| j                  |       |  yw)a  
        Perform the first pass of annotation, which makes decisions
        based purely based on the word type of each word:

          - '?', '!', and '.' are marked as sentence breaks.
          - sequences of two or more periods are marked as ellipsis.
          - any word ending in '.' that's a known abbreviation is
            marked as an abbreviation.
          - any other word ending in '.' is marked as a sentence break.

        Return these annotations as a tuple of three sets:

          - sentbreak_toks: The indices of all sentence breaks.
          - abbrev_toks: The indices of all abbreviations.
          - ellipsis_toks: The indices of all ellipsis marks.
        N)_first_pass_annotationr#   r   aug_toks      r$   _annotate_first_passz#PunktBaseClass._annotate_first_passH  s%     & G''0M s    r   c                    |j                   }|| j                  j                  v rd|_        y|j                  rd|_        y|j                  r|j                  d      ss|dd j                         | j                  j                  v s;|dd j                         j                  d      d   | j                  j                  v rd|_        yd|_        y)zC
        Performs type-based annotation on a single token.
        Tz..Nr   -)r   r   r2   r   r   r   r   r   r   r   r`   r   r   )r#   r   r   s      r$   r   z%PunktBaseClass._first_pass_annotation_  s    
 kk$//000 $G 	   #G 	 !!#,,t*<CR DLL$=$==s8>>#))#.r2dll6O6OO# 	 %)!r&   )
rK   rL   rM   rN   r}   re   r   r   r   r   r!   r&   r$   r   r     sL     "&D 	#!<z*	*	.j T r&   r   c                       e Zd ZdZdddefdZd ZdZ	 dZ	 dZ		 dZ
	 d	Z	 dZ	 dZ	 	 d
Z	 ddZddZd Zd ZddZ	 ddZd Zd Zd Zd Zd Zed        Zed        Zd Zd Zd Zd Z d Z!y) PunktTrainerz<Learns parameters used in Punkt sentence boundary detection.NFc                     t         j                  | ||       t               | _        	 d| _        	 t               | _        	 t               | _        	 d| _        	 d| _        	 |r| j                  ||d       y y )Nr   r   r   T)finalize)
r   re   r   _type_fdist_num_period_toks_collocation_fdist_sent_starter_fdist_sentbreak_count
_finalizedtrainr#   
train_textverboser   r   s        r$   re   zPunktTrainer.__init__~  s     		YO#:	< !"H"**	H $,: 	 !"	@ 	% JJz7TJ: r&   c                 R    | j                   s| j                          | j                  S )zl
        Calculates and returns parameters for sentence boundary detection as
        derived from training.)r   finalize_trainingr   r"   s    r$   
get_paramszPunktTrainer.get_params  s!     ""$||r&   g333333?   gQ@   r    c                 n    | j                  | j                  |      |       |r| j                  |       yy)a8  
        Collects training data from a given text. If finalize is True, it
        will determine all the parameters for sentence boundary detection. If
        not, this will be delayed until get_params() or finalize_training() is
        called. If verbose is True, abbreviations found will be listed.
        N)_train_tokensr   r   )r#   textr   r   s       r$   r   zPunktTrainer.train  s5     	4//5w?""7+ r&   c                 d      j                   fd|D        |       |r j                  |       yy)zE
        Collects training data from a given list of tokens.
        c              3   @   K   | ]  }j                  |        y wr_   r   r   tr#   s     r$   r   z,PunktTrainer.train_tokens.<locals>.<genexpr>  s     ;FqDKKNF   N)r   r   )r#   r   r   r   s   `   r$   train_tokenszPunktTrainer.train_tokens  s1     	;F;WE""7+ r&   c                    d| _         t        |      }|D ]E  }| j                  |j                  xx   dz  cc<   |j                  s1| xj
                  dz  c_        G | j                  |      }| j                  |      D ]  \  }}}|| j                  k\  r>|s| j                  j                  j                  |       |sAt        d|dd|        T|rW| j                  j                  j                  |       |st        d|dd|         t        | j                  |            }| j                  |       | xj                   | j#                  |      z  c_        t%        |      D ]  \  }}	|j                  r|	s| j'                  ||	      rI| j                  j                  j                  |j(                         |rt        d|j                  z         | j+                  |	|      r!| j,                  |	j                  xx   dz  cc<   | j/                  ||	      s| j0                  |j(                  |	j2                  fxx   dz  cc<    y )NFr    z  Abbreviation: [6.4f] z  Removed abbreviation: [z  Rare Abbrev: %s)r   listr   r   r   r   _unique_types_reclassify_abbrev_typesABBREVr   r`   addprintremover   _get_orthography_datar   _get_sentbreak_countr[   _is_rare_abbrev_typer   _is_potential_sent_starterr   _is_potential_collocationr   r   )
r#   r   r   r   unique_typesr   scoreis_addaug_tok1aug_tok2s
             r$   r   zPunktTrainer._train_tokens  s    f
 GW\\*a/*##%%*%  ))&1#'#@#@#ND%#LL--11$7 1%RvFGLL--44T: 9%RvNO $O d//78 	""6* 	!:!:6!BB #-V"4Hh(( ((8<))--h.E.EF-=> ..xB((71<7 --hA'',,h.I.IJ ! #5r&   c                 @    |D ch c]  }|j                    c}S c c}w r_   )r   r   s      r$   r   zPunktTrainer._unique_types-  s    ,23FF333s   c           	         | j                   j                          | j                         D ]?  \  }}| j                   j                  j	                  |       |s.t        d|dd|       A | j                   j                          | j                         D ]G  \  \  }}}| j                   j                  j	                  ||f       |s3t        d|dd|d|       I d| _	        y)z~
        Uses data that has been gathered in training to determine likely
        collocations and sentence starters.
        z  Sent Starter: [r   r   z  Collocation: [+TN)
r   rl   _find_sent_startersrb   r  r  rj   _find_collocationsra   r   )r#   r   rp   log_likelihoodtyp1typ2s         r$   r   zPunktTrainer.finalize_training0  s    
 	((*#'#;#;#=CLL&&**3/).)>bHI $>
 	''),0,C,C,E(LT4.LL%%))4,7((=RxqQR -F
 r&   c                    |dkD  rr| j                   j                  }| j                   j                          | j                  D ]3  }| j                  |   }||k\  s||   | j                   j                  |<   5 | j	                  | j                  |      | _        | j	                  | j
                  |      | _        | j	                  | j                  |      | _        y)a  
        Allows memory use to be reduced after much training by removing data
        about rare tokens that are unlikely to have a statistical effect with
        further training. Entries occurring above the given thresholds will be
        retained.
        r    N)r   rd   rn   r   _freq_thresholdr   r   )r#   ortho_threshtype_threshcolloc_thressentstart_threshold_ocr   counts           r$   freq_thresholdzPunktTrainer.freq_thresholdG  s     !\\//FLL,,.''((-L(6<SkDLL..s3 (
  //0@0@+N"&"6"6##\#
 $(#7#7$$&6$
 r&   c                     t               }d}|D ]  }||   }||k  r|dz  }||xx   |z  cc<   ! |dxx   |z  cc<   |S )z
        Returns a FreqDist containing only data with counts below a given
        threshold, as well as a mapping (None -> count_removed).
        r   r    Nr   )r#   fdist	thresholdr   num_removedr   r  s          r$   r  zPunktTrainer._freq_threshold`  sX     jC#JEy q CE!  	D	[ 	
r&   c                    d}t        |      }|D ]  }|j                  r|dk7  rd}|j                  r|dk(  rd}|j                  }t        j                  ||j                  fd      }|r| j                  j                  ||       |j                  r|j                  s|j                  sd}d}|j                  s|j                  rd}d} y)z
        Collect information about whether each token type occurs
        with different case patterns (i) overall, (ii) at
        sentence-initial positions, and (iii) at sentence-internal
        positions.
        r   r   r   r   N)r   r~   r   r   
_ORTHO_MAPgetr   r   rr   r   r   r   r   r   )r#   r   rz   r   rp   rq   s         r$   r  z"PunktTrainer._get_orthography_datav  s     fG
   W	%9#   W
%:# ,,C >>7G,>,>"?CD..sD9   ))W-?-?'G'G!!W\\#$? r&   c              #     K   |D ]B  }t         j                  |      r|dk(  r|j                  d      r!|| j                  j                  v rI|dd }d}n|| j                  j                  vrjd}|j                  d      dz   }t        |      |z
  dz   }| j                  |dz      }| j                  |   }| j                  ||z   | j                  || j                  j                               }t        j                  |       }	|}
t        | j                        xs t        j                  ||       }||	z  |
z  |z  }|||f E yw)a  
        (Re)classifies each given token if
          - it is period-final and not a known abbreviation; or
          - it is not period-final and is otherwise a known abbreviation
        by checking whether its previous classification still holds according
        to the heuristics of section 3.
        Yields triples (abbr, score, is_add) where abbr is the type in question,
        score is its log-likelihood with penalties applied, and is_add specifies
        whether the present type is a candidate for inclusion or exclusion as an
        abbreviation, such that:
          - (is_add and score >= 0.3)    suggests a new abbreviation; and
          - (not is_add and score < 0.3) suggests excluding an abbreviation.
        r   r*   Nr   TFr    )r   r   r   r   r`   r  r   r   _dunning_log_likelihoodr   Nmathexprc   IGNORE_ABBREV_PENALTYpow)r#   typesrp   r  num_periodsnum_nonperiodscount_with_periodcount_without_periodr  f_length	f_periods	f_penaltyr
  s                r$   r   z%PunktTrainer._reclassify_abbrev_types  sa    $ C !'',|0C||C $,,333#2hdll777 ))C.1,K X3a7N !% 0 0s ;#'#3#3C#8 !99!$88%%!  ""$	N xx0H#ID667 488!5 5<I #X-	9IEEuf$$_ s   E
Ec                     | j                   j                          d | j                  D        }| j                  |      D ];  \  }}}|| j                  k\  s| j                   j
                  j                  |       = y)z
        Recalculates abbreviations given type frequencies, despite no prior
        determination of abbreviations.
        This fails to include abbreviations otherwise found as "rare".
        c              3   J   K   | ]  }|s|j                  d       s|  yw)r*   N)r   )r   rp   s     r$   r   z1PunktTrainer.find_abbrev_types.<locals>.<genexpr>  s      O!1#SS\\#=N#!1s   ###N)r   rg   r   r   r   r`   r  )r#   r   r   r
  _is_adds        r$   find_abbrev_typeszPunktTrainer.find_abbrev_types  se     	""$O!1!1O$($A$A&$I D%#))--d3 %Jr&   c                    |j                   s|j                  sy|j                  }| j                  |   | j                  |dd    z   }|| j                  j
                  v s|| j                  k\  ry|j                  dd | j                  j                  v ry|j                  r:|j                  }| j                  j                  |   }|t        z  r|t        z  syyyy)a  
        A word type is counted as a rare abbreviation if...
          - it's not already marked as an abbreviation
          - it occurs fewer than ABBREV_BACKOFF times
          - either it is followed by a sentence-internal punctuation
            mark, *or* it is followed by a lower-case word that
            sometimes appears with upper case, but never occurs with
            lower case at the beginning of sentences.
        FNr   r    T)r   r   r   r   r   r`   ABBREV_BACKOFFr   r   rQ   r   rd   rt   ru   )r#   cur_toknext_tokrp   r  r  typ2ortho_contexts          r$   r  z!PunktTrainer._is_rare_abbrev_type  s     <<w00 ((   %(8(8Sb(BB$,,+++u8K8K/K
 <<tCCC ..D $ : :4 @!M1!M1 2 2  r&   c                    ||z  }d}|t        j                  |dz         z  | |z
  t        j                  d|z
  dz         z  z   }|t        j                  |      z  | |z
  t        j                  d|z
        z  z   }||z
  }d|z  S )z
        A function that calculates the modified Dunning log-likelihood
        ratio scores for abbreviation candidates.  The details of how
        this works is available in the paper.
        gGz?g:0yE>      ?       )r)  log)	count_acount_bcount_abr(  p1p2	null_hypoalt_hypo
likelihoods	            r$   r'  z$PunktTrainer._dunning_log_likelihood(  s     q[txxT	22g6HDHH"HtOM
 6
 
	 dhhrl*g.@DHHSSUXDV-VV)
j  r&   c                    ||z  }|| z  }	 ||z
  || z
  z  }	 |t        j                  |      z  | |z
  t        j                  d|z
        z  z   }	 ||z
  t        j                  |      z  || z
  |z
  |z   t        j                  d|z
        z  z   }| |k(  s
|dk  s|dk\  rd}	n7|t        j                  |      z  | |z
  t        j                  d|z
        z  z   }	||k(  s
|dk  s|dk\  rd}
n@||z
  t        j                  |      z  || z
  |z
  |z   t        j                  d|z
        z  z   }
||z   |	z
  |
z
  }d|z  S # t         $ r d}Y 2w xY w# t        $ r d}Y w xY w# t        $ r d}Y w xY w)a=  
        A function that will just compute log-likelihood estimate, in
        the original paper it's described in algorithm 6 and 7.

        This *should* be the original Dunning log-likelihood values,
        unlike the previous log_l function where it used modified
        Dunning log-likelihood values
        r    r?  r   r@  )ZeroDivisionErrorr)  rA  
ValueError)rB  rC  rD  r(  r   rE  rF  summand1summand2summand3summand4rI  s               r$   _col_log_likelihoodz PunktTrainer._col_log_likelihood;  s    aK	H$W5B	$((1+-81CtxxPSVWPWGX0XXH	(*dhhqk9Gg%0q!=" "H h"'R1WH$((2,.'H2DbI 2 H h"'R1WH(*dhhrl:Gg%0r"># #H (83h>
j  ? ! 	B	
  	H	  	H	s5   D; 7E A E ;E
	E
EEE-,E-c                     | j                   xsD | j                  xr |j                  xs( |j                  xr |j                  xs |j
                  xr |j                  xr |j                  S )zt
        Returns True if the pair of tokens may form a collocation given
        log-likelihood statistics.
        )INCLUDE_ALL_COLLOCSINCLUDE_ABBREV_COLLOCSr   r   r   r   r   )r#   r  r  s      r$   r  z&PunktTrainer._is_potential_collocationn  so     (( X//AHMMX&&VH,>,>,U(BUBU&
 %%& %%	
r&   c              #   f  K   | j                   D ]  }	 |\  }}|| j                  j                  v r#| j                   |   }| j                  |   | j                  |dz      z   }| j                  |   | j                  |dz      z   }|dkD  s||dkD  s| j
                  |cxk  rt        ||      k  sn | j                  |||| j                  j                               }|| j                  k\  s| j                  j                         |z  ||z  kD  s||f|f  y# t        $ r Y w xY ww)zI
        Generates likely collocations and their log-likelihood.
        r*   r    N)
r   	TypeErrorr   rb   r   MIN_COLLOC_FREQminrQ  r(  COLLOCATION)r#   r-  r  r  	col_count
typ1_count
typ2_countr  s           r$   r  zPunktTrainer._find_collocations}  s8     ,,E"
d t||111//6I))$/$2B2B4#:2NNJ))$/$2B2B4#:2NNJQN((9SJ
8SS!%!9!9
It7G7G7I7I7K" "T%5%55$$&&(:5
Y8NN,661 -  sF   D1D!A1D1D1D11>D10#D1D1!	D.*D1-D..D1c                 p    |j                   xr) |j                  xs |j                   xr |j                  S )z
        Returns True given a token and the token that precedes it if it
        seems clear that the token is beginning a sentence.
        )r   r   r   r   )r#   r;  prev_toks      r$   r  z'PunktTrainer._is_potential_sent_starter  s<      !''>8+>+>?!  	
r&   c              #     K   | j                   D ]  }|s| j                   |   }| j                  |   | j                  |dz      z   }||k  r=| j                  | j                  ||| j                  j	                               }|| j
                  k\  s| j                  j	                         | j                  z  ||z  kD  s||f  yw)z~
        Uses collocation heuristics for each candidate token to
        determine if it frequently starts sentences.
        r*   N)r   r   rQ  r   r(  SENT_STARTER)r#   rp   typ_at_break_count	typ_countr  s        r$   r  z PunktTrainer._find_sent_starters  s     
 ++C!%!9!9#!>((-0@0@s0KKI--!55%%"  ""$	N $"3"33$$&&(4+@+@@001 >))- ,s   BC	-C	 	C	c                 &    t        d |D              S )zj
        Returns the number of sentence breaks marked in a given set of
        augmented tokens.
        c              3   :   K   | ]  }|j                   sd   yw)r    N)r   )r   r   s     r$   r   z4PunktTrainer._get_sentbreak_count.<locals>.<genexpr>  s     @Fg.?.?1Fs   )sumr#   r   s     r$   r  z!PunktTrainer._get_sentbreak_count  s    
 @F@@@r&   )FTF)r   r   r   r   )"rK   rL   rM   rN   r}   re   r   r   r+  r:  rY  r`  rS  rT  rW  r   r   r   r   r   r  r  r  r   r8  r  staticmethodr'  rQ  r  r  r  r  r  r!   r&   r$   r   r   {  s   F u
&;P F<! NLK L&  L #$
 OO,,;z40 OP
2,*%`A%F
4)` ! !$ ,! ,!d
7D
*:Ar&   r   c            
       
   e Zd ZdZdddefdZd dZd!dedede	e   fd	Z
dedeeeef      fd
Z	 d!dededeeeef      fdZ	 d!dedede	e   fdZdedefdZdedeeeef      fdZdedee   fdZdedee   dee   fdZdedefdZdedee   fdZdee   dee   fdZdee   dee   fdZdedee   dee   fdZdee   ddfdZ ed      Z dee   dee   fdZ!dede"e   de"e   fdZ#dede$eef   fdZ%y)"PunktSentenceTokenizera'  
    A sentence tokenizer which uses an unsupervised algorithm to build
    a model for abbreviation words, collocations, and words that start
    sentences; and then uses that model to find sentence boundaries.
    This approach has been shown to work well for many European
    languages.
    NFc                 h    t         j                  | ||       |r| j                  ||      | _        yy)z
        train_text can either be the sole training text for this sentence
        boundary detector, or can be a PunktParameters object.
        r   N)r   re   r   r   r   s        r$   re   zPunktSentenceTokenizer.__init__  s3     		YO::j':DL r&   c                     t        |t              s|S t        || j                  | j                        j                         S )z
        Derives parameters from a given training text, or uses the parameters
        given. Repeated calls to this method destroy previous parameters. For
        incremental training, instantiate a separate PunktTrainer instance.
        r   )
isinstancestrr   r   r   r   )r#   r   r   s      r$   r   zPunktSentenceTokenizer.train  s7     *c*$//T[[

*,	r&   r   realign_boundariesr   c                 8    t        | j                  ||            S )zM
        Given a text, returns a list of the sentences in that text.
        )r   sentences_from_text)r#   r   ro  s      r$   tokenizezPunktSentenceTokenizer.tokenize  s     D,,T3EFGGr&   c              #     K   | j                  |      D ]  \  }}| j                  |      }t        | j                  |            }|rx|d   j                  j                  | j                  j                        sF|j                  d       |r3|d   j                  j                  | j                  j                        sF|j                         dz
  ||d   j                  |d   j                  t        |d   j                        t        |d   j                        |d   j                  | j                  j                   v | j#                  |d         t%        | j                  j'                  |d   j                              |d   j                  |d   j                  f| j                  j(                  v | j+                  |d   |d         xs t,        |d   j.                  d  yw)z
        Classifies candidate periods as sentence breaks, yielding a dict for
        each that may be used to understand why the decision was made.

        See format_debug_decision() to help make this output readable.
        r   r    )period_indexr   type1type2type1_in_abbrstype1_is_initialtype2_is_sent_startertype2_ortho_heuristictype2_ortho_contextscollocationreasonbreak_decisionN)_match_potential_end_contextsr   r   r   r   r   r   r2   popendr   boolr   r   r   r   rb   _ortho_heuristicr5   r{   ra   _second_pass_annotationREASON_DEFAULT_DECISIONr   )r#   r   r   decision_textr   s        r$   debug_decisionsz&PunktSentenceTokenizer.debug_decisions  s     %)$F$Ft$L E=))-8F$33F;<F!7!78V8V!W

1 !7!78V8V!W !&		a%"&vay~~"6$()=)=$>)/)E)E<<--*.)-)>)>vay)I(+LL55fQi6R6RS) 1I001I00  <<,,	 -
 66vay&)L +*"()"5"5)  %Ms   B=G- D-G-c              #      K   | j                  |      }|r| j                  ||      }|D ]  }|j                  |j                  f  yw)z^
        Given a text, generates (start, end) spans of sentences
        in the text.
        N)_slices_from_text_realign_boundariesstartstop)r#   r   ro  slicessentences        r$   span_tokenizez$PunktSentenceTokenizer.span_tokenize&  sJ      ''---dF;FH>>8==11 s   AA
c                 ^    | j                  ||      D cg c]
  \  }}|||  c}}S c c}}w )z
        Given a text, generates the sentences in that text by only
        testing candidate sentence breaks. If realign_boundaries is
        True, includes in the sentence closing punctuation that
        follows the period.
        )r  )r#   r   ro  rE   es        r$   rq  z*PunktSentenceTokenizer.sentences_from_text3  s6     '+&8&8?Q&RS&RdaQq	&RSSSs   )c                 r    t        t        |      dz
  dd      D ]  }||   t        j                  v s|c S  y)z
        Given a text, find the index of the *last* occurrence of *any*
        whitespace character, i.e. " ", "
", "	", "", etc.
        If none is found, return 0.
        r    r   r   )ranger   string
whitespace)r#   r   is      r$   _get_last_whitespace_indexz1PunktSentenceTokenizer._get_last_whitespace_index>  s;     s4y1}b"-AAw&+++ . r&   c              #   J  K   t        dd      }d}| j                  j                         j                  |      D ]  }||j                  |j                          }| j                  |      }|r||j                  dz   z  }n|j
                  }t        ||j                               }|rE|j                  |j
                  k  r,|||   |j                         z   |j                  d      z   f |}|} |r-|||   |j                         z   |j                  d      z   f yyw)a  
        Given a text, find the matches of potential sentence breaks,
        alongside the contexts surrounding these sentence breaks.

        Since the fix for the ReDOS discovered in issue #2866, we no longer match
        the word before a potential end of sentence token. Instead, we use a separate
        regex for this. As a consequence, `finditer`'s desire to find non-overlapping
        matches no longer aids us in finding the single longest match.
        Where previously, we could use::

            >>> pst = PunktSentenceTokenizer()
            >>> text = "Very bad acting!!! I promise."
            >>> list(pst._lang_vars.period_context_re().finditer(text)) # doctest: +SKIP
            [<re.Match object; span=(9, 18), match='acting!!!'>]

        Now we have to find the word before (i.e. 'acting') separately, and `finditer`
        returns::

            >>> pst = PunktSentenceTokenizer()
            >>> text = "Very bad acting!!! I promise."
            >>> list(pst._lang_vars.period_context_re().finditer(text)) # doctest: +NORMALIZE_WHITESPACE
            [<re.Match object; span=(15, 16), match='!'>,
            <re.Match object; span=(16, 17), match='!'>,
            <re.Match object; span=(17, 18), match='!'>]

        So, we need to find the word before the match from right to left, and then manually remove
        the overlaps. That is what this method does::

            >>> pst = PunktSentenceTokenizer()
            >>> text = "Very bad acting!!! I promise."
            >>> list(pst._match_potential_end_contexts(text))
            [(<re.Match object; span=(17, 18), match='!'>, 'acting!!! I')]

        :param text: String of one or more sentences
        :type text: str
        :return: Generator of match-context tuples.
        :rtype: Iterator[Tuple[Match, str]]
        r   Nr    	after_tok)slicer   rJ   finditerr  r  r  group)r#   r   previous_sliceprevious_matchr   before_textindex_after_last_spaceprev_word_slices           r$   r  z4PunktSentenceTokenizer._match_potential_end_contextsI  s=    N q!__668AA$GE~22U[[]CK%)%D%D[%Q"%&.*=*=*AA&)7)=)=&#$:EKKMJO
 ."5"59N9N"N"($**,-$**;78  #N,N- H2 ^$ &&() &&{34  s   D!D#c              #   N  K   d}| j                  |      D ]f  \  }}| j                  |      st        ||j                                |j	                  d      r|j                  d      }W|j                         }h t        |t        |j                                      y w)Nr   r<  )r  text_contains_sentbreakr  r  r  r  r   rstrip)r#   r   
last_breakr   rz   s        r$   r  z(PunktSentenceTokenizer._slices_from_text  s     
"@@FNE7++G4J		44;;z*!&Z!8J "'J G JDKKM 233s   +B%A7B%r  c           
   #     K   d}t        |      D ]  \  }}t        |j                  |z   |j                        }|s
||   r| 5| j                  j
                  j                  ||         }|r\t        |j                  |j                  t        |j                  d      j                               z          |j                         }d}||   s|  yw)a@  
        Attempts to realign punctuation that falls after the period but
        should otherwise be included in the same sentence.

        For example: "(Sent1.) Sent2." will otherwise be split as::

            ["(Sent1.", ") Sent1."].

        This method will produce::

            ["(Sent1.)", "Sent2."].
        r   N)r[   r  r  r  r   rS   r   r   r  r  r  )r#   r   r  realign	sentence1	sentence2ms          r$   r  z*PunktSentenceTokenizer._realign_boundaries  s      $.v$6 Iyioo7HI	?#O77==d9oNAIOOY__s1771:CTCTCV?W-WXX%%'	?#O %7s   CCCc                 x    d}| j                  | j                  |            D ]  }|r y|j                  sd} y)zK
        Returns True if the given text includes a sentence break.
        FT)_annotate_tokensr   r   )r#   r   foundr   s       r$   r  z.PunktSentenceTokenizer.text_contains_sentbreak  sA     (()=)=d)CDC}}	 E
 r&   c                 f    | j                  | j                  |            }| j                  ||      S )z
        Given a text, generates the sentences in that text. Annotates all
        tokens, rather than just those with possible sentence breaks. Should
        produce the same results as ``sentences_from_text``.
        )r  r   _build_sentence_list)r#   r   r   s      r$   sentences_from_text_legacyz1PunktSentenceTokenizer.sentences_from_text_legacy  s2     &&t';';D'AB((v66r&   r   c              #       K   t         j                   fd|D                    }g }|D ]0  }|j                  |j                         |j                  s+| g }2 |r| yyw)zw
        Given a sequence of tokens, generates lists of tokens, each list
        corresponding to a sentence.
        c              3   @   K   | ]  }j                  |        y wr_   r   r   s     r$   r   z?PunktSentenceTokenizer.sentences_from_tokens.<locals>.<genexpr>  s     +KFqDKKNFr   N)rU   r  appendr   r   )r#   r   r  r   s   `   r$   sentences_from_tokensz,PunktSentenceTokenizer.sentences_from_tokens  se      d+++KF+KKLGOOGKK(  	 
 N s   AA&A&c                 J    | j                  |      }| j                  |      }|S )z
        Given a set of tokens augmented with markers for line-start and
        paragraph-start, returns an iterator through those tokens with full
        annotation including predicted sentence breaks.
        )r   _annotate_second_passrf  s     r$   r  z'PunktSentenceTokenizer._annotate_tokens  s-     **62
 ++F3 r&   c              #     K   d}t        j                  d      }d}|D ]  }|j                  }|j                  ||      j	                         }|t        |      z  }|||t        |      z    |k7  rOdj                  d |D              }	t        j                  |	      j                  ||      }
|
r|
j	                         }|||t        |      z    |k(  sJ |t        |      z  }|r||z  }||z  }|j                  s| d} |r| yyw)z
        Given the original text and the list of augmented word tokens,
        construct and return a tokenized list of sentence strings.
        r   z\s*r.   c              3   F   K   | ]  }t        j                  |        y wr_   )r/   r0   )r   cs     r$   r   z>PunktSentenceTokenizer._build_sentence_list.<locals>.<genexpr>  s     !<1"))A,s   !N)r/   r<   r   r   r  r   r1   r   )r#   r   r   poswhite_space_regexpr  r   r   white_spacepatr  s              r$   r  z+PunktSentenceTokenizer._build_sentence_list  s!       ZZ/G++C -224=CCEK3{##C C#C.)S0kk!<!<<JJsO))$4'')C cCHn-4443s8OC
 K'OH   A F N s   C8D;Dc                 :   t        d       t        dd      5 }|D ]i  }|j                  r|j                  d       n/|j                  r|j                  d       n|j                  d       |j                  t        |             k 	 d d d        y # 1 sw Y   y xY w)Nzwriting to /tmp/punkt.new...z/tmp/punkt.newwz

r    )r  openr~   writer   rn  )r#   r   outfiler   s       r$   dumpzPunktSentenceTokenizer.dump5  st    ,-"C(G!$$MM&)&&MM$'MM#&c'l+ " )((s   A/BBz;:,.!?c              #   ^   K   t        |      D ]  \  }}| j                  ||       |  yw)z
        Performs a token-based classification (section 4) over the given
        tokens, making use of the orthographic heuristic (4.1.1), collocation
        heuristic (4.1.2) and frequent sentence starter heuristic (4.1.3).
        N)r[   r  )r#   r   token1token2s       r$   r  z,PunktSentenceTokenizer._annotate_second_passL  s0      )0NFF((8L 1s   +-r  r  c                    |sy|j                   sy|j                  }|j                  }|j                  }||f| j                  j
                  v rd|_        d|_        t        S |j                  s|j                  rV|sT| j                  |      }|dk(  rd|_        t        S |j                  r%|| j                  j                  v rd|_        t        S |s|dk(  ry| j                  |      }|dk(  rd|_        d|_        |rt        S t         S |dk(  rB|r@|j                  r4| j                  j"                  |   t$        z  sd|_        d|_        t&        S y)zr
        Performs token-based classification over a pair of contiguous tokens
        updating the first.
        NFTr   r   )r   r   r   r   r   ra   r   r   REASON_KNOWN_COLLOCATIONr   r  'REASON_ABBR_WITH_ORTHOGRAPHIC_HEURISTICr   rb   !REASON_ABBR_WITH_SENTENCE_STARTER*REASON_INITIAL_WITH_ORTHOGRAPHIC_HEURISTIC)REASON_NUMBER_WITH_ORTHOGRAPHIC_HEURISTICrd   	_ORTHO_LC2REASON_INITIAL_WITH_SPECIAL_ORTHOGRAPHIC_HEURISTIC)r#   r  r  rp   next_typtok_is_initialis_sent_starters          r$   r  z.PunktSentenceTokenizer._second_pass_annotationX  sR    $$%%..!,, ?dll777!&H HM++
 MMX.. #33H=O$&%)">> ##DLL4N4N(N%)"88
 SL0 #33H=O%'%*" $!EE@@  9,"((33H=	I%*" $IIr&   r   c                     |j                   | j                  v ry| j                  j                  |j                     }|j
                  r|t        z  r
|t        z  sy|j                  r|t        z  s	|t        z  syy)zR
        Decide whether the given token is the first token in a sentence.
        FTr   )r   PUNCTUATIONr   rd   r   r   r  ru   r   	_ORTHO_UCrw   )r#   r   rd   s      r$   r  z'PunktSentenceTokenizer._ortho_heuristic  ss    
 ;;$***2273M3MN *"]2 Y&0M r&   rg  )T)&rK   rL   rM   rN   r}   re   r   rn  r  r   rr  r   r   r   r  r
   rc   r  rq  r  r   r  r  r  r  r  r  r  r  r  r  tupler  r  r	   r  r   r  r!   r&   r$   rj  rj    sD    u

;
 HS Hd Hd3i H"C "HT#s(^,D "J 5922-12	%S/	"2 59	T	T-1	T	c	T	s 	s 	H# H(5PSCT:U HT4c 4huo 4$$!)%$	%$@
C 
D 
7s 7x} 7z*	*	"x
'; @T *66!)*!56	#6r,8J/ ,D ," /K
z*
	*	
N"N.6z.BN	#N`
 uT3Y7G r&   rj  c                   &    e Zd ZdZddZddZd Zy)PunktTokenizerzU
    Punkt Sentence Tokenizer that loads/saves its parameters from/to data files
    c                 P    t         j                  |        | j                  |       y r_   )rj  re   	load_lang)r#   langs     r$   re   zPunktTokenizer.__init__  s    ''-tr&   c                 V    ddl m}  |d| d      }t        |      | _        || _        y )Nr   )findztokenizers/punkt_tab//)	nltk.datar  load_punkt_paramsr   _lang)r#   r  r  lang_dirs       r$   r  zPunktTokenizer.load_lang  s,    "/vQ78(2
r&   c                 L    t        | j                  d| j                          y )Nz/tmp/)dir)save_punkt_paramsr   r  r"   s    r$   save_paramszPunktTokenizer.save_params  s    $,,eDJJ<,@Ar&   N)english)rK   rL   rM   rN   re   r  r  r!   r&   r$   r  r    s    Br&   r  c                    ddl m}  |       }t               }t        |  dd      5 }|j	                  |      |_        d d d        t        |  dd      5 }|j                  |      |_        d d d        t        |  dd      5 }|j                  |      |_        d d d        t        |  dd      5 }|j                  |      |_
        d d d        |S # 1 sw Y   xY w# 1 sw Y   uxY w# 1 sw Y   RxY w# 1 sw Y   |S xY w)	Nr   )PunktDecoder/collocations.tabzutf-8)encoding/sent_starters.txt/abbrev_types.txt/ortho_context.tab)nltk.tabdatar  r]   r  tab2tupsra   txt2setrb   r`   tab2intdictrd   )r  r  pdecr   fs        r$   r  r    s    )>DF	
+,w	?1"mmA. 
@	
,-	@A#||A 
A	
+,w	?1"ll1o 
@	
,-	@A#//2 
AM 
@	?	@	@	?	?	@Ms/   CC"C.5C:C"C+.C7:Dc                    ddl m} ddlm} ddlm}  ||      s ||        |       }t        | dd      5 }|j                  |j                  | j                                d d d        t        | dd      5 }|j                  |j                  | j                                d d d        t        | dd      5 }|j                  |j                  | j                                d d d        t        | d	d      5 }|j                  |j                  | j                                d d d        y # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   exY w# 1 sw Y   y xY w)
Nr   )mkdir)isdir)
TabEncoderr  r  r  r  r  )osr  os.pathr  r  r  r  r  tups2tabra   set2txtrb   r`   
ivdict2tabrd   )r   r  r  r  r  tencr  s          r$   r  r    s   ':c
<D	&'	-	4==!4!4568 
.	'(#	.!	4<< 4 4568 
/	&'	-	4<< 3 3457 
.	'(#	.!	4??6#7#789; 
/	. 
.	-	.	.	-	-	.	.s/   ,D7<,E?,E,E7E EEE$a  Text: {text!r} (at offset {period_index})
Sentence break? {break_decision} ({reason})
Collocation? {collocation}
{type1!r}:
    known abbreviation: {type1_in_abbrs}
    is initial: {type1_is_initial}
{type2!r}:
    known sentence starter: {type2_is_sent_starter}
    orthographic heuristic suggests is a sentence starter? {type2_ortho_heuristic}
    orthographic contexts in training: {type2_ortho_contexts}
c                 ,    t        j                  di | S )Nr!   )DEBUG_DECISION_FMTr   )ds    r$   format_debug_decisionr    s    $$)q))r&   c                     d } |       }d|_         |j                  |         ||j                               }|j                  |       D ]  }t	         ||              y)z4Builds a punkt model and applies it to the same textc                     t        j                  dt         j                        j                  d|       j	                  dd      S )Nz(?:\r|^\s+)r.   r   r  )r/   r<   rR   r   replace)rE   s    r$   <lambda>zdemo.<locals>.<lambda>  s.    "**^R\\:>>r1EMMdTWXr&   TN)rS  r   r   rq  r  )r   tok_cls	train_clscleanuptrainersbdr  s          r$   demor    s_     	Y  kG"&GMM$
'$$&
'C++D1gh  2r&   )z/tmp/punkt_tab)3rN   r)  r/   r  collectionsr   typingr   r   r   r   r   r	   r
   r   nltk.probabilityr   nltk.tokenize.apir   rt   ru   rv   rw   rx   ry   r  r  r$  r  r  r  r  r  r  r  r   r<   r@   r   r[   r]   r}   r   r   rj  r  r  r  r   r  r  r!   r&   r$   <module>r     sv  X|  	  # K K K % (  D A K D A KM)M9	 3M)M9	 3 ((''('
* - ; *Q '$N !-O *,N ). 3o+ o+d 

:rzz2F,3 3vD DX] ]JT	A> T	Axo^Z odB+ B("<2
 * . 
!r&   