
    gP                        d dl Z d dlmZ d dlZd dlmZmZmZmZm	Z	 d dl
Z
d dlmZ d dlZd dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d
dlmZ d dlZd dlZd dlZd dlmZ d dl Z de_!        dede"fdZ# G d d      Z$y)    N)Path)DictListTupleOptionalAny)tqdm)	BM25Okapi)word_tokenize)	stopwords)WordNetLemmatizer)
completionbatch_completion   )AsyncLogger)fnmatchF	file_pathreturnc                    t        j                         }| j                  d      5 t        fdd      D ]  }|j	                  |        	 ddd       |j                         S # 1 sw Y   |j                         S xY w)z/Compute MD5 hash for the file's entire content.rbc                  &     j                  d      S )Ni   )read)fs   D/var/www/openai/venv/lib/python3.12/site-packages/crawl4ai/llmtxt.py<lambda>z$_compute_file_hash.<locals>.<lambda>   s    !&&,    r   N)hashlibmd5openiterupdate	hexdigest)r   hash_md5chunkr   s      @r   _compute_file_hashr%      sg    {{}H		.4EOOE" 5 
  
 s   &A&&A>c                   2   e Zd Z	 	 	 d"dedee   dededdf
dZdee   ddfd	Z	d
e
deeee
   f   fdZdedefdZdededdfdZde
dee
   fdZd#defdZd#d$dZd%dededdfdZd&dee
   de
de
fdZd'de
dede
fdZdee   dedee
   defd Zd$d!Zy)(AsyncLLMTextManagerNdocs_dirloggermax_concurrent_calls
batch_sizer   c                     || _         || _        || _        || _        d | _        i | _        g | _        | j                   dz  | _        y )Nzbm25_index.pkl)r(   r)   r*   r+   
bm25_indexdocument_maptokenized_factsbm25_index_file)selfr(   r)   r*   r+   s        r   __init__zAsyncLLMTextManager.__init__!   sJ     !$8!$,.*,#}}/??r   	doc_batchc           
        K   g }|D ]9  }	 t        |dd      5 }|j                  |j                                ddd       ; d}|D cg c]  }|rd	| d
| dg }}	 t        d|d      }	t        |	|      D ]  \  }
}	 t        j                  d|
j                  d   j                  j                  t        j                        }|s| j                  j                  d|        ot        j                   dd|j#                  d            j%                         }|rX|j'                  d      }t        |dd      5 }|j)                  |       ddd       | j                  j+                  d|        n| j                  j                  d|         y# 1 sw Y   bxY w# t        $ rF}| j                  j                  d| dt        |              |j                  d       Y d}~d}~ww xY wc c}w # 1 sw Y   xY w# t        $ r5}| j                  j                  d| dt        |              Y d}~d}~ww xY w# t        $ r1}| j                  j                  dt        |              Y d}~yd}~ww xY ww)z(Process a batch of documents in parallelrutf-8encodingNError reading :  a  Given a documentation file, generate a list of atomic facts where each fact:
1. Represents a single piece of knowledge
2. Contains variations in terminology for the same concept
3. References relevant code patterns if they exist
4. Is written in a way that would match natural language queries

Each fact should follow this format:
<main_concept>: <fact_statement> | <related_terms> | <code_reference>

Example Facts:
browser_config: Configure headless mode and browser type for AsyncWebCrawler | headless, browser_type, chromium, firefox | BrowserConfig(browser_type="chromium", headless=True)
redis_connection: Redis client connection requires host and port configuration | redis setup, redis client, connection params | Redis(host='localhost', port=6379, db=0)
pandas_filtering: Filter DataFrame rows using boolean conditions | dataframe filter, query, boolean indexing | df[df['column'] > 5]

Wrap your response in <index>...</index> tags.
userz*

Generate index for this documentation:

)rolecontentz"anthropic/claude-3-5-sonnet-latest)modelmessages	logger_fnz<index>(.*?)</index>r   z(No <index>...</index> content found for z\n\s*\n
r   .q.mdwzCreated index file: z'No index content found in response for zError processing response for zError in batch completion: )r   appendr   	Exceptionr)   errorstrr   zipresearchchoicesmessager>   DOTALLwarningsubgroupstripwith_suffixwriteinfo)r1   r3   contentsr   r   epromptr>   messages_list	responsesresponseindex_content_matchindex_content
index_files                 r   _process_document_batchz+AsyncLLMTextManager._process_document_batch1   sx    "I$)S7;qOOAFFH- < #, $	
 $w  vh6deldm,no $	 	 
"	F(:&I (+9i'@#)^*,))/ ((+33;;		+'
 /++.VW`Va,bc $&FF"D*=*C*CA*F%eg " %%.%:%:7%C
!*cGDGGM2 E((+?
|)LM++.UV_U`,ab) (AO <; $!!N9+RAx"HI##$,
> ED ! ^KK%%(FykQSTWXYTZS[&\]]^  	FKK ;CF8DEE	Fs   JF& FF&J	G8J !I
 A&H	(I
 )AH	>G=AH	I
 JF#	F&&	G5/;G0*J0G55J=H	H			I*I<I
 II
 
	J'I?:J?JJlinec                     d|vry|j                  d      D cg c]  }|j                          }}t        |      dk7  rddt        |       fS |d   }d|vryy	c c}w )
N|)FzMissing separator '|'   FzExpected 3 parts, got r   :)Fz!Missing ':' in concept definition)TN)splitrR   len)r1   r`   ppartsconcept_parts        r   _validate_fact_linez'AsyncLLMTextManager._validate_fact_liney   sl    d?1$(JJsO4OqO4u:?23u:,???Qxl"= 5s   A	fact_filec           	      "   |j                  d      }t        |      }|j                         r`	 t        |d      5 }t	        j
                  |      }ddd       j                  d      |k(  r|S | j                  j                  d| d       i |dS # 1 sw Y   CxY w# t        j                  $ r" | j                  j                  d| d       Y Et        $ r4}| j                  j                  d	| d
t        |              Y d}~|d}~ww xY w)z
        Load token cache from .q.tokens if present and matching file hash.
        Otherwise return a new structure with updated file-hash.
        	.q.tokensr5   Ncontent_hashzHash changed for z, reindex needed.zCorrupt token cache for z, rebuilding.zError reading cache for r:   factsrn   )rS   r%   existsr   jsonloadgetr)   rU   JSONDecodeErrorrO   rF   rH   )r1   rk   
cache_filecurrent_hashr   cacherW   s          r   _load_or_create_token_cachez/AsyncLLMTextManager._load_or_create_token_cache   s    
 **;7
))4V*c*a IIaLE + 99^,< L  #4YK?P!QR \:: +* '' Y##&>yk$WX V##&>ykCPQF8$TUUVs:   B BB .B BB 2DD*D		Drx   c                     |j                  d      }t        |      |d<   t        |d      5 }t        j                  ||       d d d        y # 1 sw Y   y xY w)Nrm   rn   rD   )rS   r%   r   rr   dump)r1   rk   rx   rv   r   s        r   _save_token_cachez%AsyncLLMTextManager._save_token_cache   sG    **;7
 29 =n*c"aIIeQ #""s   AAtextc           	         d|v r-|j                  d      D cg c]  }|j                          c}n|g}t        j                  dd|d         |d<   t	               }t        t        j                  d            h dz
  }g }|D ]|  }d|v r+d|v r't        j                  d	|      }|j                  |       t        |j                               }	|j                  |	D 
cg c]  }
|
|vr|j                  |
       c}
       ~ |S c c}w c c}
w )
Nrb   z^(.*?):z\1r   english>   howwhywhatwhenwherewhich()z.[\w_]+(?=\()|[\w_]+(?==[\'"]{1}[\w_]+[\'"]{1}))re   rR   rJ   rP   r   setr   wordsfindallextendr   lower	lemmatize)r1   r}   xrh   
lemmatizer
stop_wordstokenspartcode_tokensr   tokens              r   preprocess_textz#AsyncLLMTextManager.preprocess_text   s	   8;tDJJsO4OqO4$66*eU1X6a&(
34 8
 

 Dd{sd{ jjEt k*!$**,/EMM "'!&J. ((/!&   5 5&s   DD	
c                 :   |st         j                  j                  | j                        rd| j                  j                  d       t        | j                  d      5 }t        j                  |      }ddd       d   | _	        |d   | _
        yy# 1 sw Y   xY w)zW
        Load existing BM25 index from disk, if present and clear_cache=False.
        z&Loading existing BM25 index from disk.r   Nr/   r-   TF)ospathrq   r0   r)   rU   r   picklers   r/   r-   )r1   clear_cacher   datas       r   maybe_load_bm25_indexz)AsyncLLMTextManager.maybe_load_bm25_index   s|     rww~~d.B.BCKKEFd**D1Q{{1~ 2#'(9#:D "<0DO 21s   BBc                 X   |rO| j                   j                  d       | j                  j                         r| j                  j	                          t        j                         }| j                   j                  d       t        j                  | j                        D cg c]#  }|j                  d      s| j                  |z  % }}g }g }g }g }|D ]  }	|	j                  d      }
|s|
j                         s|j                  |	       8| j                  |	      }t        |d         dk(  s|j                  d      t!        |	      k7  r|j                  |	       |d   j#                         D ]9  \  }}|j                  |       |j                  |d          |	| j$                  |<   ;  |s|s| j'                  d	
      r| j                   j                  d       y| j                   j                  d       |r| j                   j                  dt        |       d       t)        |      | _        || _        t/        | j                  d      5 }t1        j2                  | j*                  | j,                  d|       ddd       y| j                   j5                  d       y| j                   j                  t        |       d       g }g }t7        t        |      d      5 }|D ]  }i t!        |      d}	 t/        |dd      5 }|j9                         j;                         }|j=                  d      D cg c]#  }|j;                         s|j;                         % }}ddd       D ]  }| j?                  |      \  }}|s|j                  |||f       .| jA                  |      }|tC        jB                         d|d   |<   |j                  |       |j                  |       || j$                  |<    | jE                  ||       |jG                         jH                  dz  dz  }| j                   jK                  d|jL                   d|dd        |jU                  d"        	 ddd       |rZ| j                   j5                  d#t        |       d$       |D ]-  \  }}}| j                   j5                  | d| d%|dd&  d'       / ||z   }||z   }| j                   j                  dt        |       d(       t)        |      | _        || _        t/        | j                  d      5 }t1        j2                  | j*                  | j,                  d|       ddd       |jG                         jH                  dz  dz  }| j                   j                  d)|dd        yc c}w # 1 sw Y   yxY wc c}w # 1 sw Y   _xY w# tN        $ r5}| j                   jQ                  d!| dtS        |              Y d}~d}~ww xY w# 1 sw Y   xY w# 1 sw Y   xY w)*a  
        Checks for new or modified .q.md files by comparing file-hash.
        If none need reindexing and clear_cache is False, loads existing index if available.
        Otherwise, reindexes only changed/new files and merges or creates a new index.
        z0Clearing cache and rebuilding full search index.z/Checking which .q.md files need (re)indexing...rC   rm   rp   r   rn   r   Fr   z<No new/changed .q.md files found. Using existing BM25 index.Nz9No existing BM25 index found. Building from cached facts.zBuilding BM25 index with z cached facts.wb)r-   r/   z+No facts found at all. Index remains empty.z( file(s) need reindexing. Parsing now...zIndexing changed files)totaldescro   r5   r6   r7   rB   )r   addedi   zMemory usage after r:   z.2fMBzError processing r   zFound z invalid fact lines:z
 in line: 2   z...z total facts (old + new).z*Search index updated. Final memory usage: )+r)   rU   r0   rq   unlinkpsutilProcessr   listdirr(   endswithrS   rE   ry   rf   rt   r%   itemsr.   r   r
   r-   r/   r   r   r{   rO   r	   r   rR   re   rj   r   timer|   memory_inforssdebugnamerF   rG   rH   r!   )r1   r   processr   q_filesexisting_factsexisting_tokensinvalid_linesneedSetqftoken_cache_filerx   r`   
cache_data	new_facts
new_tokens	file_pbarfilefresh_cachef_objr>   llinesis_validrG   r   	mem_usagerW   	all_facts
all_tokens	final_mems                                  r   build_search_indexz&AsyncLLMTextManager.build_search_index   s    KKOP##**,$$++-.."JK /1jj.G_.G1::V]K^4==1$.G_ %'+- B!~~k: "2"9"9";r" 44R8E5>"a'599^+DHZ[]H^+^r" ).g(<(<(>$D*"))$/#**:h+?@.0D%%d+ )? ( {))e)<  !_`   !\]!KK$$'@^AT@UUc%de&/&@DO+9D(d22D9Q*.///3/C/C%  :  KK''(UV 	CL>)QRS 	
G+CD	(*<Nt<TULdC':e"'**,"4"4"64;MM$4G U4Gq17794G U ; !&*.*B*B4*H%')00$e1DE$!%!5!5d!;&,%)YY[6G,T2 "((."))&126))$/ !&  **4= ' 3 3 5 9 9D @4 GIKK%%(;DII;bSVWY&Z[
   #A   EF KK&]);(<<P QR%2!dE##tfBugZSb	{#$NO &3 #Y.	$z1
 	4S^4DD]^_#J/( $&&-KK"oo#'#7#7  . '')--4t;	EiPS_TVWXm `T : & !V ;:2 ! LKK%%(9$r#a&&JKKL= EDd .-s   "V/9V/.V4(X>W2W>W 
W 
&W(C8W X8.X 4V= WW
W	X*XXXXX X)force_generate_factsclear_bm25_cachec           	      t  K   | j                   j                  d       t        j                  | j                        D cg c]6  j                  d      r#t        fddD              s| j                  z  8 }}|sF|D cg c];  }| j                  |j                  j                  dd      z  j                         s|= }}|s| j                   j                  d       nt        dt        |      | j                        D ]w  }|||| j                  z    }| j                   j                  d|| j                  z  d	z    d
t        |      | j                  z  d	z           | j                  |       d{    y | j                   j                  d       | j                  |       yc c}w c c}w 7 >w)a	  
        Generate index files for all documents in parallel batches
        
        Args:
            force_generate_facts (bool): If True, regenerate indexes even if they exist
            clear_bm25_cache (bool): If True, clear existing BM25 index cache
        z2Starting index generation for documentation files..mdc              3   @   K   | ]  }j                  |        y wN)r   ).0r   r   s     r   	<genexpr>z;AsyncLLMTextManager.generate_index_files.<locals>.<genexpr>e  s     ,XDWqQZZ]DWs   )rC   .xs.mdrC   z4All index files exist. Use force=True to regenerate.r   zProcessing batch r   /Nz:Index generation complete, building/updating search index.r   )r)   rU   r   r   r(   r   anyr   replacerq   rangerf   r+   r_   r   )r1   r   r   r   md_filesibatchs      `   r   generate_index_filesz(AsyncLLMTextManager.generate_index_filesY  s     	MN (*zz$--'@
'@!zz% ,XDW,X)X MMA'@ 	 
 $##aug(FFNNP 8  
 KKST 1c(mT__= 1t#67  #4Q5G!5K4LAsS[}^b^m^mOmqrNrMs!tu225999 >
 	UV,<=-
 :s+   >F8;F,<F8A F1B3F87F68?F8sectionsmodec                 n   t        j                   t        | j                  dz              t        j                   t        | j                  dz              z   }|D ch c]M  }t        |      j                  j                  d      s't        |      j                  j                  d      d   O }}|r$|D ch c]  t        fd|D              r }}g }t        |d       D ]  |d	k(  rQ| j                   d
z  }| j                   dz  }	|j                  t        |j                         r|n|	             Y|j                  t        | j                   dz                g }
|D ]Y  }	 t        |dd      5 }t        |      j                  }|
j                  d d| dd d|j                                 d d d        [ |
rdj!                  |
      S dS c c}w c c}w # 1 sw Y   *xY w# t        $ r4}| j                  j                  d| dt        |              Y d }~d }~ww xY w)Nz	[0-9]*.mdz[0-9]*.xs.mdrC   .r   c              3   ^   K   | ]$  }|j                         j                         v  & y wr   )r   )r   sectiondocs     r   r   z/AsyncLLMTextManager.generate.<locals>.<genexpr>  s#     VX'w}}#))+=Xs   *-c                     | j                  d      d   j                         rt        | j                  d      d         S dS )N_r   i?B )re   isdigitintr   s    r   r   z.AsyncLLMTextManager.generate.<locals>.<lambda>  s7    177SV<XY?KbKbKd3qwws|A3G3pjp3pr   )key	condensedr   r   r5   r6   r7   ####################z
# rB   z

r9   r:   

---

r;   )globrH   r(   r   r   r   re   r   sortedrE   rq   r   r   rF   r)   rG   join)r1   r   r   	all_filesr   	base_docsr   filesxs_fileregular_filer>   r   fnamerW   s         `       r   generatezAsyncLLMTextManager.generate{  s	   IIc$--+"=>?IIc$--."@ABC	 :C ?A#Aw||44W= !W\\'',Q/	 ? (1 X	VXVV 	I X ))pqC{"--SE.8#}}#c{:SGNN,<,OPSC5!<=> r DE$g6! JOOENNfXT%6($qvvxj#QR 7  /6}!!'*=2=9?
X" 76  E!!N4&3q6("CDDEs>   AG!8G&4G7A G+G7+G4	0G77	H4 *H//H4querytop_kc                    | j                   sy| j                  |      }| j                   j                  |      }t        j                  |      }t        j
                  |      }|d|z  z   }| j                  |||      }t        |j                         d d      d | }	g }
|	D ]  \  }}t        |      j                  dd      }t        j                  j                  | j                  |z        sNt        | j                  |z  d	d
      5 }|j!                  d      d   }dd| dd|j#                         g}|
j%                  dj'                  |             d d d         dj'                  |
      S # 1 sw Y   xY w)Nz;No search index available. Call build_search_index() first.g      ?)
doc_scoresscore_thresholdquery_tokensc                 B    | d   d   dz  | d   d   dz  z   | d   d   z   S )Nr   code_match_scoreg       @match_countg      ?total_score r   s    r   r   z,AsyncLLMTextManager.search.<locals>.<lambda>  s;    !'(3.A$}%+,A$}%&r   T)r   reverserC   r   r5   r6   r7   r   r   z# r;   rB   r   )r-   r   
get_scoresnpmeanstd_aggregate_search_scoresr   r   rH   r   r   r   rq   r(   r   re   r   rE   r   )r1   r   r   r   r   
mean_score	std_scorer   	file_dataranked_filesresultsr   r   main_docr   only_file_namer>   s                    r   rK   zAsyncLLMTextManager.search  st   P++E2__//=
WWZ(
FF:&	$y(8911!+% 2 
	 OO
 
 5 #GD!4y((%8Hww~~dmmh67$--(2C'Ja%-^^C%8%<N()FFHG NN499W#56 KJ $ !!'** KJs   AE55E>	r   r   r   c                 $   i }t        |      D ]  \  }}||k  r| j                  |   }| j                  |   }||vr
dddg d||<   d|v r|j                  d      n|g}	d}
t	        |	      dk(  rP|	d   j                         }| j                  |      }t	        t        |      t        |      z        t	        |      z  }
||   dxx   |z  cc<   ||   dxx   dz  cc<   t        ||   d	   |
      ||   d	<   ||   d
   j                  |        |S )Nr   )r   r   r   matched_factsrb   rc      r   r   r   r   r  )
	enumerater/   r.   re   rf   rR   r   r   maxrE   )r1   r   r   r   r  idxscorefactr   
componentsr   code_refr   s                r   r  z,AsyncLLMTextManager._aggregate_search_scores  sK    	#J/JC''',D))$/I	)#$#$()%'	(	)$ -04KCdVJ :!#%a=..0"228<#&s<'83{;K'K#LsS_O`#` i /58/i /14/7:)$%78:J8Ii !34 i 188>9 0< r   c                 (    | j                  d       y)z&Convenience method for a full rebuild.Tr   N)r   )r1   s    r   refresh_indexz!AsyncLLMTextManager.refresh_index  s    D1r   )N   rc   )F)r   N)FF)extended)r  )__name__
__module____qualname__r   r   r   r   r2   r   r_   rH   r   boolrj   r   ry   r|   r   r   r   r   r   rK   floatr  r  r   r   r   r'   r'       so    )-$%@@ %@ "	@
 @ 
@ FFtDz FFd FFP dHSM6I0J ;T ;d ;2 4      C DI :$ FYP >t  >_c  >pt  >D">c "># ">s ">H*+C *+ *+C *+X#u+#8=#MQRUY#	#J2r   r'   )%r   pathlibr   rJ   typingr   r   r   r   r   rr   r	   r   r   numpyr   	rank_bm25r
   nltk.tokenizer   nltk.corpusr   	nltk.stemr   litellmr   r   async_loggerr   r   r   r   r   set_verboserH   r%   r'   r   r   r   <module>r(     sk    	  	 3 3       ' ! ' 0 %       $  3  R2 R2r   