
    gS@                         d dl Z d dlZ	 d dlZd dlmZ d\  ZZd\  ZZ	d gZ
 G d de      Z G d d      Z G d d	      Zdd
ZddZy# e$ r Y Bw xY w)    N)
TokenizerI)r      c            	       `    e Zd ZdZddededdedf	dZd	 Zd
 Z	d Z
d Zd Zd Zd Zd Zd Zy)TextTilingTokenizera  Tokenize a document into topical sections using the TextTiling algorithm.
    This algorithm detects subtopic shifts based on the analysis of lexical
    co-occurrence patterns.

    The process starts by tokenizing the text into pseudosentences of
    a fixed size w. Then, depending on the method used, similarity
    scores are assigned at sentence gaps. The algorithm proceeds by
    detecting the peak differences between these scores and marking
    them as boundaries. The boundaries are normalized to the closest
    paragraph break and the segmented text is returned.

    :param w: Pseudosentence size
    :type w: int
    :param k: Size (in sentences) of the block used in the block comparison method
    :type k: int
    :param similarity_method: The method used for determining similarity scores:
       `BLOCK_COMPARISON` (default) or `VOCABULARY_INTRODUCTION`.
    :type similarity_method: constant
    :param stopwords: A list of stopwords that are filtered out (defaults to NLTK's stopwords corpus)
    :type stopwords: list(str)
    :param smoothing_method: The method used for smoothing the score plot:
      `DEFAULT_SMOOTHING` (default)
    :type smoothing_method: constant
    :param smoothing_width: The width of the window used by the smoothing method
    :type smoothing_width: int
    :param smoothing_rounds: The number of smoothing passes
    :type smoothing_rounds: int
    :param cutoff_policy: The policy used to determine the number of boundaries:
      `HC` (default) or `LC`
    :type cutoff_policy: constant

    >>> from nltk.corpus import brown
    >>> tt = TextTilingTokenizer(demo_mode=True)
    >>> text = brown.raw()[:4000]
    >>> s, ss, d, b = tt.tokenize(text)
    >>> b
    [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0]
       
   N   r   Fc
                     |ddl m} |j                  d      }| j                  j	                  t                      | j                  d= y )Nr   )	stopwordsenglishself)nltk.corpusr   words__dict__updatelocals)
r   wksimilarity_methodr   smoothing_methodsmoothing_widthsmoothing_roundscutoff_policy	demo_modes
             M/var/www/openai/venv/lib/python3.12/site-packages/nltk/tokenize/texttiling.py__init__zTextTilingTokenizer.__init__@   s;     -!	2IVX&MM&!    c                    |j                         }| j                  |      }t        |      }dj                  d |D              }| j                  |      }| j	                  |      }|D ]3  }|j
                  D 	cg c]  }	|	d   | j                  vs|	 c}	|_        5 | j                  ||      }
| j                  t        k(  r| j                  ||
      }n7| j                  t        k(  rt        d      t        d| j                   d      | j                  t        k(  r| j!                  |      }nt        d| j                   d      | j#                  |      }| j%                  |      }| j'                  |||      }g }d}|D ]  }|dk(  r	|j)                  |||        |}  ||k  r|j)                  ||d        |s|g}| j*                  r||||fS |S c c}	w )	zZReturn a tokenized copy of *text*, where each "token" represents
        a separate topic. c              3   N   K   | ]  }t        j                  d |      s|  yw)z[a-z\-' \n\t]N)rematch).0cs     r   	<genexpr>z/TextTilingTokenizer.tokenize.<locals>.<genexpr>^   s"      
%!2BA)FA~s   %%r   z'Vocabulary introduction not implementedzSimilarity method z not recognizedzSmoothing method N)lower_mark_paragraph_breakslenjoin_divide_to_tokensequenceswrdindex_listr   _create_token_tabler   BLOCK_COMPARISON_block_comparisonVOCABULARY_INTRODUCTIONNotImplementedError
ValueErrorr   DEFAULT_SMOOTHING_smooth_scores_depth_scores_identify_boundaries_normalize_boundariesappendr   )r   textlowercase_textparagraph_breakstext_lengthnopunct_textnopunct_par_breakstokseqstswitoken_table
gap_scoressmooth_scoresdepth_scoressegment_boundariesnormalized_boundariessegmented_textprevbbs                      r   tokenizezTextTilingTokenizer.tokenizeS   s    66t<.)
 ww 
%
 
 "88F00> B-- -rAdnn1L- B 
 ..w8JK !!%55//EJ##'>>%&OPP$T%;%;$<OL    $55 //
;M01F1F0GWXX ))-8!66|D $ : :$&6!
 &AAv!!$uQ-0E	 ' ;!!$uv,/"VN>>}l<NNNa s   :GGc                 p   fd}g }t        |      dz
  }t        |      D ]  }d\  }}}	d}
|| j                  dz
  k  r|dz   }n$||| j                  z
  kD  r||z
  }n| j                  }|||z
  dz   |dz    D cg c]  }|j                   }}||dz   ||z   dz    D cg c]  }|j                   }}D ]6  }| |||       |||      z  z  }| |||      dz  z  }|	 |||      dz  z  }	8 	 |t	        j
                  ||	z        z  }
|j                  |
        |S c c}w c c}w # t        $ r Y *w xY w)z&Implements the block comparison methodc                 f    t        fd|    j                        }t        d |D              }|S )Nc                     | d   v S Nr    )oblocks    r   <lambda>zHTextTilingTokenizer._block_comparison.<locals>.blk_frq.<locals>.<lambda>   s    qtu}r   c              3   &   K   | ]	  }|d      yw)r   NrO   )r#   tsoccs     r   r%   zITextTilingTokenizer._block_comparison.<locals>.blk_frq.<locals>.<genexpr>   s     5WEuQxWs   )filterts_occurencessum)tokrQ   ts_occsfreqrA   s    `  r   blk_frqz6TextTilingTokenizer._block_comparison.<locals>.blk_frq   s0    4k#6F6T6TUG5W55DKr   r   )        r\   r\   r\   r	   )r(   ranger   indexmathsqrtZeroDivisionErrorr7   )r   r>   rA   r[   rB   numgapscurr_gapscore_dividendscore_divisor_b1score_divisor_b2scorewindow_sizer?   b1b2ts     `             r   r.   z%TextTilingTokenizer._block_comparison   s   	
 
g,"gHAN>N,.>E$&&1*$&lGdff,,%0"ff%,X-Ca-G(UV,%WX%Wr"((%WBX%,X\H{<RUV<V%WX%Wr"((%WBX '!R.71b>"AA GArNa$77  GArNa$77  !&3CFV3V)WW e$/ '2  YX % s   9DD$/D))	D54D5c           	      t    t        t        t        j                  |dd       | j                  dz               S )z1Wraps the smooth function from the SciPy CookbookNr   )
window_len)listsmoothnumpyarrayr   )r   rB   s     r   r3   z"TextTilingTokenizer._smooth_scores   s2    5;;z!}-$:N:NQR:RS
 	
r   c                     d}t        j                  d      }|j                  |      }d}dg}|D ]H  }|j                         |z
  |k  r|j	                  |j                                |j                         }J |S )zNIdentifies indented text or line breaks as the beginning of
        paragraphsd   z[ 	]*
[ 	]*
[ 	]*r   )r!   compilefinditerstartr7   )r   r8   MIN_PARAGRAPHpatternmatches
last_breakpbreakspbs           r   r'   z*TextTilingTokenizer._mark_paragraph_breaks   sx     **GH""4(
#BxxzJ&6rxxz*XXZ
  r   c           
      .   | j                   }g }t        j                  d|      }|D ]1  }|j                  |j	                         |j                         f       3 t        dt        |      |      D cg c]  }t        ||z  ||||z           c}S c c}w )z3Divides the text into pseudosentences of fixed sizez\w+r   )	r   r!   ru   r7   grouprv   r]   r(   TokenSequence)r   r8   r   r+   ry   r"   is          r   r*   z-TextTilingTokenizer._divide_to_tokensequences   s    FF++fd+E  %++-!?@  1c-0!4
4 !a%q1q5!9:4
 	
 
s   3Bc           
         i }d}d}|j                         }t        |      }|dk(  r	 t        |      }|D ]  }	|	j                  D ]  \  }
}	 ||kD  rt        |      }|dz  }||kD  r|
|v r||
   xj
                  dz  c_        ||
   j                  |k7  r"|||
   _        ||
   xj                  dz  c_        ||
   j                  |k7  r+|||
   _        ||
   j                  j                  |dg       ||
   j                  d   dxx   dz  cc<   t        ||dggdd||      ||
<    |dz  } |S # t        $ r}t        d      |d}~ww xY w# t        $ r Y w xY w)z#Creates a table of TokenTableFieldsr   z7No paragraph breaks were found(text too short perhaps?)Nr   )	first_posrV   total_count	par_countlast_parlast_tok_seq)__iter__nextStopIterationr1   r+   r   r   r   r   rV   r7   TokenTableField)r   token_sequences
par_breaksrA   current_parcurrent_tok_seqpb_itercurrent_par_breaker?   wordr^   s               r   r,   z'TextTilingTokenizer._create_token_table   s   %%' M!$(M!
 "B!//e"33,0M)#q(  "33 ;&%11Q61"4(11[@5@D)2#D)33q83"4(55H9HD)6#D)77>>QR?ST#D)77;A>!C>(7"'(7';&<$%"#!,%4)K%-  0> q OA "D M !  M % s)   D3 E3	E<EE	EEc           
        
 |D cg c]  }d }}t        |      t        |      z  }t        j                  |      }| j                  t
        k(  r||z
  
n||dz  z
  
t        t        |t        t        |                        }|j                          t        t        
fd|            }|D ]I  }d||d   <   |D ]:  }	|d   |	d   k7  st        |	d   |d   z
        dk  s'||	d      dk(  s3d||d   <   < K |S c c}w )zJIdentifies boundaries at the peaks of similarity score
        differencesr   g       @c                     | d   kD  S rN   rO   )xcutoffs    r   rR   z:TextTilingTokenizer._identify_boundaries.<locals>.<lambda>,  s    1Q4&=r   r      )rW   r(   rp   stdr   LCsortedzipr]   reversern   rU   abs)r   rD   r   
boundariesavgstdevdepth_tupleshpdtdt2r   s             @r   r5   z(TextTilingTokenizer._identify_boundaries  s    "..Aa
.,#l"33		,'#5[F53;&Fc,c,6G0HIJ&0,?@B !Jr!uqESVOCFRUN+a/"3q6*a/()Jr!u%   / /s   	C=c                    |D cg c]  }d }}t        t        t        |      dz  d      d      }|}|||  D ]B  }|}||dd   D ]  }||k\  r|} n |}	||d D ]  }||	k\  r|}	 n ||	z   d|z  z
  ||<   |dz  }D |S c c}w )zzCalculates the depth of each gap, i.e. the average difference
        between the left and right peaks and the gap's scorer   r   r	      Nr   r   )minmaxr(   )
r   scoresr   rD   clipr^   gapscorelpeakrg   rpeaks
             r   r4   z!TextTilingTokenizer._depth_scores9  s     $**6a6*
 3s6{b(!,a0tTE*HE	r	*E>!E	 +
 EE>!E	 (
 #(%-!h,">LQJE +  1 +s   	Bc                 v   g }d\  }}}d}|D ]  }	|dz  }|	dv r	|rd}|dz  }|	dvr|sd}|t        |      k  s,|t        || j                  z  | j                        kD  sS||   dk(  rJt        |      }
|D ]%  }|
t        ||z
        kD  rt        ||z
        }
|}% n |vr|j	                  |       |dz  } |S )zSNormalize the boundaries identified to the original text's
        paragraph breaks)r   r   r   Fr   z 	
T)r(   r   r   r   r7   )r   r8   r   r:   norm_boundaries
char_count
word_count	gaps_seen	seen_wordcharbest_fitbrbestbrs                r   r6   z)TextTilingTokenizer._normalize_boundariesW  s     ,3)
J		D!OJw9!	a
7"9 	3z?*zI&/0 i(A-"4yH.#c"z/&::'*2
?';H%'F! / _4'..v6Q	+ . r   )__name__
__module____qualname____doc__r-   r2   HCr   rJ   r.   r3   r'   r*   r,   r5   r4   r6   rO   r   r   r   r      s[    %R 
**"&KZ$L
$

0d:<r   r   c                        e Zd ZdZ	 	 	 	 ddZy)r   z[A field in the token table holding parameters for each token,
    used later in the processNc                 d    | j                   j                  t                      | j                   d= y Nr   )r   r   r   )r   r   rV   r   r   r   r   s          r   r   zTokenTableField.__init__}  s$     	VX&MM&!r   )r   r   r   Nr   r   r   r   r   rO   r   r   r   r   y  s    ! 
"r   r   c                       e Zd ZdZddZy)r   z3A token list with its original length and its indexNc                     |xs t        |      }| j                  j                  t                      | j                  d= y r   )r(   r   r   r   )r   r^   r+   original_lengths       r   r   zTokenSequence.__init__  s1    )?S-?VX&MM&!r   Nr   rO   r   r   r   r     s
    9"r   r   c                    | j                   dk7  rt        d      | j                  |k  rt        d      |dk  r| S |dvrt        d      t        j                  d| d   z  | |dd	   z
  | d| d	   z  | d	| d	   z
  f   }|d
k(  rt        j
                  |d      }nt        d|z   dz         }t        j                  ||j                         z  |d      }||dz
  | dz    S )a  smooth the data using a window with requested size.

    This method is based on the convolution of a scaled window with the signal.
    The signal is prepared by introducing reflected copies of the signal
    (with the window size) in both ends so that transient parts are minimized
    in the beginning and end part of the output signal.

    :param x: the input signal
    :param window_len: the dimension of the smoothing window; should be an odd integer
    :param window: the type of window from 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'
        flat window will produce a moving average smoothing.

    :return: the smoothed signal

    example::

        t=linspace(-2,2,0.1)
        x=sin(t)+randn(len(t))*0.1
        y=smooth(x)

    :see also: numpy.hanning, numpy.hamming, numpy.bartlett, numpy.blackman, numpy.convolve,
        scipy.signal.lfilter

    TODO: the window parameter could be the window itself if an array instead of a string
    r   z'smooth only accepts 1 dimension arrays.z1Input vector needs to be bigger than window size.   )flathanninghammingbartlettblackmanzDWindow is on of 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'r	   r   r   r   dznumpy.z(window_len)same)mode)	ndimr1   sizerp   r_onesevalconvolverW   )r   rm   windowsr   ys         r   ro   ro     s   6 	vv{BCCvv
LMMA~KKR
 	
 	QqTAj2o..1qu9qZKPRAR?S3SSTA JJz3'F"^34q1557{AF3AZ!^zkAo..r   c                 :   ddl m} ddlm} t	        d      }| |j                         d d } |j                  |       \  }}}}|j                  d       |j                  d       |j                  t        t        |            |d	       |j                  t        t        |            |d
	       |j                  t        t        |            |d	       |j                  t        t        |            |       |j                          |j                          y )Nr   )pylab)brownT)r   i'  zSentence Gap indexz
Gap Scores)labelzSmoothed Gap scoreszDepth scores)
matplotlibr   r   r   r   rawrJ   xlabelylabelplotr]   r(   stemlegendshow)r8   r   r   ttr   ssr   rI   s           r   demor     s     !	t	,B|yy{6E"++d#KAr1a	LL%&	LL	JJuSV}a|J4	JJuSW~r)>J?	JJuSV}a~J6	JJuSV}a 	LLN	JJLr   )   r   r   )r_   r!   rp   ImportErrornltk.tokenize.apir   r-   r/   r   r   r2   r   r   r   ro   r   rO   r   r   <module>r      sz     		 ),0 ) )	BC ^* ^B" """ "3/ly  		s   A	 	AA