
    g                         d dl mZmZ d dlZd dlmZ d dlZddlmZ ddl	  G d de      Z
 G d	 d
e
      Z G d de
      Z G d de
      Z G d de
      Z G d de
      Z G d de
      Z G d de
      Zy)    )ABCabstractmethodN)Counter   load_nltk_punkt)*c                   *    e Zd ZdZededefd       Zy)ChunkingStrategyz6
    Abstract base class for chunking strategies.
    textreturnc                      y)z
        Abstract method to chunk the given text.
        
        Args:
            text (str): The text to chunk.
        
        Returns:
            list: A list of chunks.
        N selfr   s     O/var/www/openai/venv/lib/python3.12/site-packages/crawl4ai/chunking_strategy.pychunkzChunkingStrategy.chunk   s     	    N)__name__
__module____qualname____doc__r   strlistr   r   r   r   r   r   	   s)     
# 
$ 
 
r   r   c                        e Zd ZdZdedefdZy)IdentityChunkingzJ
    Chunking strategy that returns the input text as a single chunk.
    r   r   c                     |gS Nr   r   s     r   r   zIdentityChunking.chunk    s	    vr   N)r   r   r   r   r   r   r   r   r   r   r   r      s    # $ r   r   c                   (    e Zd ZdZddZdedefdZy)RegexChunkingzR
    Chunking strategy that splits text based on regular expression patterns.
    Nc                     |dg}|| _         y)z
        Initialize the RegexChunking object.
        
        Args:
            patterns (list): A list of regular expression patterns to split text.
        Nz\n\n)patterns)r   r"   kwargss      r   __init__zRegexChunking.__init__(   s     yH r   r   r   c                     |g}| j                   D ]2  }g }|D ]'  }|j                  t        j                  ||             ) |}4 |S r   )r"   extendresplit)r   r   
paragraphspatternnew_paragraphs	paragraphs         r   r   zRegexChunking.chunk3   sL    V
}}GN'	%%bhhw	&BC ('J	 %
 r   r   r   r   r   r   r$   r   r   r   r   r   r   r    r    $   s    	!# $ r   r    c                   &    e Zd ZdZd ZdedefdZy)NlpSentenceChunkingz\
    Chunking strategy that splits text into sentences using NLTK's sentence tokenizer.
    c                     t                y)z<
        Initialize the NlpSentenceChunking object.
        Nr   )r   r#   s     r   r$   zNlpSentenceChunking.__init__A   s
     	r   r   r   c                     ddl m}  ||      }|D cg c]  }|j                          }}t        t	        |            S c c}w )Nr   )sent_tokenize)nltk.tokenizer2   stripr   set)r   r   r2   	sentencessentsenss         r   r   zNlpSentenceChunking.chunkH   s>     	0!$'	*34)$)4CI 5s   A Nr-   r   r   r   r/   r/   =   s    # $ r   r/   c                   H    e Zd ZdZd	dZdedefdZdedefdZdedefdZ	y)
TopicSegmentationChunkingz
    Chunking strategy that segments text into topics using NLTK's TextTilingTokenizer.
    
    How it works:
    1. Segment the text into topics using TextTilingTokenizer
    2. Extract keywords for each topic segment
    c                 X    ddl }|j                  j                         | _        || _        y)z
        Initialize the TopicSegmentationChunking object.
        
        Args:
            num_keywords (int): The number of keywords to extract for each topic segment.
        r   N)nltktokenizeTextTilingTokenizer	tokenizernum_keywords)r   r@   r#   nls       r   r$   z"TopicSegmentationChunking.__init___   s#     	88:(r   r   r   c                 <    | j                   j                  |      }|S r   )r?   r=   )r   r   segmented_topicss      r   r   zTopicSegmentationChunking.chunkj   s    >>2248r   c                 z   dd l }|j                  j                  |      }|D cg c]M  }||j                  j                  j                  d      vs+|t        j                  vs>|j                         O }}t        |      }|j                  | j                        D cg c]  \  }}|	 }}}|S c c}w c c}}w )Nr   english)r<   toknizeword_tokenizecorpus	stopwordswordsstringpunctuationlowerr   most_commonr@   )	r   r   rA   tokenstoken	freq_distwordfreqkeywordss	            r   extract_keywordsz*TopicSegmentationChunking.extract_keywordso   s    ))$/-3  NVEuBIIDWDWD]D]^gDh7hmr  {A  {M  {M  nM%++-V  N FO	+4+@+@ARAR+ST+SZT4D+ST N Us   ,B2B2$B2!B7c                 r    | j                  |      }|D cg c]  }|| j                  |      f }}|S c c}w r   )r   rU   )r   r   segmentssegmentsegments_with_topicss        r   chunk_with_topicsz+TopicSegmentationChunking.chunk_with_topicsz   sC    ::d#YabYag$*?*?*H IYab##  cs   4N)   )
r   r   r   r   r$   r   r   r   rU   rZ   r   r   r   r:   r:   V   sE    	) #  $  
	S 	T 	$c $d $r   r:   c                   (    e Zd ZdZddZdedefdZy)FixedLengthWordChunkingz
    Chunking strategy that splits text into fixed-length word chunks.
    
    How it works:
    1. Split the text into words
    2. Create chunks of fixed length
    3. Return the list of chunks
    c                     || _         y)z
        Initialize the fixed-length word chunking strategy with the given chunk size.
        
        Args:
            chunk_size (int): The size of each chunk in words.
        N)
chunk_size)r   r_   r#   s      r   r$   z FixedLengthWordChunking.__init__   s     %r   r   r   c           	          |j                         }t        dt        |      | j                        D cg c]#  }dj	                  |||| j                  z          % c}S c c}w Nr    )r(   rangelenr_   join)r   r   rJ   is       r   r   zFixedLengthWordChunking.chunk   sT    

@EaUUYUdUd@ef@e1qT__!456@efffs   (AN)d   r-   r   r   r   r]   r]      s"    %g# g$ gr   r]   c                   (    e Zd ZdZddZdedefdZy)SlidingWindowChunkingz
    Chunking strategy that splits text into overlapping word chunks.
    
    How it works:
    1. Split the text into words
    2. Create chunks of fixed length
    3. Return the list of chunks
    c                      || _         || _        y)a  
        Initialize the sliding window chunking strategy with the given window size and
        step size.
        
        Args:
            window_size (int): The size of the sliding window in words.
            step (int): The step size for sliding the window in words.
        N)window_sizestep)r   rk   rl   r#   s       r   r$   zSlidingWindowChunking.__init__   s     '	r   r   r   c                    |j                         }g }t        |      | j                  k  r|gS t        dt        |      | j                  z
  dz   | j                        D ]4  }dj                  |||| j                  z          }|j                  |       6 | j                  z   t        |      k  r.|j                  dj                  || j                   d               |S )Nr   r   rb   )r(   rd   rk   rc   rl   re   append)r   r   rJ   chunksrf   r   s         r   r   zSlidingWindowChunking.chunk   s    

u:)))6Mq#e*t'7'77!;TYYGAHHU1Q)9)9%9:;EMM%  H
 t#e*,MM#((5$*:*:):);#<=>r   N)rg   2   r-   r   r   r   ri   ri      s    
# $ r   ri   c                   (    e Zd ZdZddZdedefdZy)OverlappingWindowChunkinga  
    Chunking strategy that splits text into overlapping word chunks.
    
    How it works:
    1. Split the text into words using whitespace
    2. Create chunks of fixed length equal to the window size
    3. Slide the window by the overlap size
    4. Return the list of chunks
    c                      || _         || _        y)a)  
        Initialize the overlapping window chunking strategy with the given window size and
        overlap size.
        
        Args:
            window_size (int): The size of the window in words.
            overlap (int): The size of the overlap between consecutive chunks in words.
        N)rk   overlap)r   rk   rt   r#   s       r   r$   z"OverlappingWindowChunking.__init__   s     'r   r   r   c                 F   |j                         }g }t        |      | j                  k  r|gS d}|t        |      k  rc|| j                  z   }dj                  |||       }|j	                  |       |t        |      k\  r	 |S || j
                  z
  }|t        |      k  rc|S ra   )r(   rd   rk   re   rn   rt   )r   r   rJ   ro   startendr   s          r   r   zOverlappingWindowChunking.chunk   s    

u:)))6Mc%j $***CHHU5-.EMM% c%j   $,,&E c%j  r   N)i  rg   r-   r   r   r   rr   rr      s    
# $ r   rr   )abcr   r   r'   collectionsr   rK   model_loaderr   utilsr   r   r    r/   r:   r]   ri   rr   r   r   r   <module>r|      s    # 	   ) s &' $ 2* 2)$ 0 )$Xg. g.$, $L( 0 (r   