
    kg^                        S SK r S SKrS SKrS SKrS SKJr  S SKJrJrJ	r	  S SK
Jr  S SKJrJr  S SKJr  S SKJr  S SKJrJrJrJr  S S	KJr  S S
KJr  \" 5         \" \ R6                  " S5      S9r\" 5       r\ " S S5      5       rS#S\S\ S\\   4S jjr!S\S\S\\\4   4S jr"S\S\\#   4S jr$S\S\ S\S\4S jr%S\4S jr&S\S\4S jr'S$S\\   S\ 4S jjr(S\S\\   4S  jr)S! r*\+S":X  a  \RX                  " \*" 5       5        gg)%    N)ElementTree)ListDictAny)	dataclass)datetimetimezone)urlparse)load_dotenv)AsyncWebCrawlerBrowserConfigCrawlerRunConfig	CacheMode)AsyncOpenAI)init_collectionOPENAI_API_KEY)api_keyc                   l    \ rS rSr% \\S'   \\S'   \\S'   \\S'   \\S'   \\\4   \S'   \	\
   \S'   S	rg
)ProcessedChunk   urlchunk_numbertitlesummarycontentmetadata	embedding N)__name__
__module____qualname____firstlineno__str__annotations__intr   r   r   float__static_attributes__r       QC:\xampp\htdocs\LEARNing\AI\Crawl4AI\crawl4ai-rag-system\gen-rag-crawl\crawler.pyr   r      s4    	HJLL38nE{r(   r   text
chunk_sizereturnc                    / nSn[        U 5      nX4:  a  X1-   nXT:  a$  UR                  XS R                  5       5         U$ XU nUR                  S5      nUS:w  a  XqS-  :  a  X7-   nOJSU;   a  UR                  S5      nXS-  :  a  X8-   nO&SU;   a   UR                  S5      n	XS-  :  a  X9-   S-   nXU R                  5       nU(       a  UR                  U5        [	        US-   U5      nX4:  a  M  U$ )	z>Split text into chunks, respecting code blocks and paragraphs.r   Nz```g333333?z

z.    )lenappendstriprfindmax)
r*   r+   chunksstarttext_lengthendchunk
code_block
last_breaklast_periods
             r)   
chunk_textr=   &   s   FEd)K

 MM$v,,,./( M% 3[['

#-= =$Cu_V,J,,(U]++d+K#--)A-3%%'MM% EAIs#- 
0 Mr(   r9   r   c                   #    Sn [         R                  R                  R                  [        R
                  " SS5      SUS.SSU SU S	S
  S3S./SS0S9I S	h  vN n[        R                  " UR                  S   R                  R                  5      $  N;! [         a  n[        SU 35        SSS.s S	nA$ S	nAff = f7f)z&Extract title and summary using GPT-4.a  You are an AI that extracts titles and summaries from web content chunks.
    Return a JSON object with 'title' and 'summary' keys.
    For the title: If this seems like the start of a document, extract its title. If it's a middle chunk, derive a descriptive title.
    For the summary: Create a concise summary of the main points in this chunk.
    Keep both title and summary concise but informative.	LLM_MODELzgpt-4-0125-previewsystem)roler   userzURL: z

Content:
Ni  z...typejson_object)modelmessagesresponse_formatr   z!Error getting title and summary: zError processing titlezError processing summary)r   r   )openai_clientchatcompletionscreateosgetenvjsonloadschoicesmessager   	Exceptionprint)r9   r   system_promptresponsees        r)   get_title_and_summaryrW   G   s     <M
&++77>>))K)=>!m<"!&se>%,sK $]3 ? 

 

 zz(**1-55==>>

  
1!56-1
 	

sG   CAB B:B CB 
C #B;5C 6C;C  Cc                    #     [         R                  R                  SU S9I Sh  vN nUR                  S   R                  $  N! [
         a  n[        SU 35        S/S-  s SnA$ SnAff = f7f)z!Get embedding vector from OpenAI.ztext-embedding-3-small)rE   inputNr   zError getting embedding: i   )rH   
embeddingsrK   datar   rR   rS   )r*   rU   rV   s      r)   get_embeddingr\   d   sx     &1188*$ 9 
 
 }}Q)))
  )!-.sTzsD   A0!A AA A0A 
A-A("A-#A0(A--A0r   c           
      T  #    [        X5      I Sh  vN n[        U 5      I Sh  vN n[        U5      R                  [	        U 5      [
        R                  " [        R                  5      R                  5       [        U5      R                  S.n[        UUUS   US   U UUS9$  N N7f)zProcess a single chunk of text.N)sourcer+   
crawled_aturl_pathr   r   )r   r   r   r   r   r   r   )rW   r\   r
   netlocr0   r   nowr	   utc	isoformatpathr   )r9   r   r   	extractedr   r   s         r)   process_chunkrg   p   s     +E77I#E**I 3-&&%jll8<<0::<SM&&	H ! )$  8*s    B(B$B(B&B B(&B(c           	        #     [         R                  U R                  /U R                  /U R                  U R
                  U R                  U R                  S.U R                  E/U R                   SU R
                   3/S9  [        SU R
                   SU R                   35        g! [         a  n[        SU 35         SnAgSnAff = f7f)z'Insert a processed chunk into ChromaDB.)r   r   r   r   _)	documentsrZ   	metadatasidszInserted chunk z for zError inserting chunk: N)chroma_collectionaddr   r   r   r   r   r   r   rS   rR   )r9   rV   s     r)   insert_chunkro      s     -}}o( !99$)$6$6"[[$}}	
 nn II;a 2 2345 	 	
 	 2 235DE -'s+,,-s/   CB%B* )C*
C4CCCCmarkdownc           	      4  #    [        U5      n[        U5       VVs/ s H  u  p4[        XCU 5      PM     nnn[        R                  " U6 I Sh  vN nU Vs/ s H  n[        U5      PM     nn[        R                  " U6 I Sh  vN   gs  snnf  NAs  snf  N7f)z4Process a document and store its chunks in parallel.N)r=   	enumeraterg   asynciogatherro   )r   rp   r5   ir9   tasksprocessed_chunksinsert_taskss           r)   process_and_store_documentry      s     !F:CF:KL:Kha]5S):KEL$^^U335EF5EEL'5ELF
..,
''' M3F's9   BB	BBBB+BBBBurlsmax_concurrentc                 D  ^^^^^	#    [        S[        U 5       S35        [        SS/ SQS9n[        [        R
                  S9m[        US9mTR                  5       I S	h  vN    [        R                  " U5      m[        U 5      m	S
mS[        4UUUUU	4S jjn[        R                  " U  Vs/ s H
  oC" U5      PM     sn6 I S	h  vN   [        ST ST	 S35        TR                  5       I S	h  vN   g	 Ns  snf  N6 N! TR                  5       I S	h  vN    f = f7f)z9Crawl multiple URLs in parallel with a concurrency limit.Found z URLs to crawlTF)z--disable-gpuz--disable-dev-shm-usagez--no-sandbox)headlessverbose
extra_args)
cache_mode)configNr   r   c           
        >#    T IS h  vN   TR                  U TSS9I S h  vN nUR                  (       aB  TS-  m[        SU  ST ST S35        [        XR                  R
                  5      I S h  vN   O[        SU  S	UR                   35        S S S 5      IS h  vN   g  N N N5 N! , IS h  vN  (       d  f       g = f7f)
Nsession1)r   r   
session_idr/   zSuccessfully crawled: z (/)zFailed: z
 - Error: )arunsuccessrS   ry   markdown_v2raw_markdownerror_message)r   resultcrawl_configcrawlerprocessed_urls	semaphore
total_urlss     r)   process_url#crawl_parallel.<locals>.process_url   s      y&||LZ  ,    >>"a'N0R7GqTUV 5//<<   HSEF4H4H3IJK !yy !yyysg   C	B'C	B/B)AB/5B+6 B/C	!B-"C	)B/+B/-C	/C5B86CC	zCompleted crawling z out of z URLs)rS   r0   r   r   r   BYPASSr   r6   rs   	Semaphorer#   rt   close)
rz   r{   browser_configr   r   r   r   r   r   r   s
        @@@@@r)   crawl_parallelr      s     
F3t9+^
,-"ON
 $y/?/?@L^4G
--/%%n5	 Y
	L3 	L 	L" nn4@4C{3/4@AAA#N#38J<uMNmmo; 4 AA 	gmmosg   AD  C7!D &A	D /C9 D C>D D 1D 2D 9D  D DDDD sitemap_urlc                 ~    [         R                  " U 5      nUR                  5         [        R                  " UR
                  5      nSS0nUR                  SU5       Vs/ s H  oDR                  PM     nn[        S[        U5       SU  35        U$ s  snf ! [         a  n[        SU 35        / s SnA$ SnAff = f)zGet URLs from a sitemap.nsz+http://www.sitemaps.org/schemas/sitemap/0.9z	.//ns:locr}   z URLs in sitemap: zError fetching sitemap: N)requestsgetraise_for_statusr   
fromstringr   findallr*   rS   r0   rR   )r   rU   root	namespacelocrz   rV   s          r)   get_urls_from_sitemapr      s    <<,!!#%%h&6&67HI	$(LLi$HI$HS$HIs4yk!3K=AB J  (,-	s0   AB  B5B B 
B<"B71B<7B<c                  <   #    S/n [        U 5      I S h  vN   g  N7f)Nzhttps://example.com)r   )rz   s    r)   mainr      s     !"D

s   __main__)i  )   )-rL   rN   rs   r   	xml.etreer   typingr   r   r   dataclassesr   r   r	   urllib.parser
   dotenvr   crawl4air   r   r   r   openair   dbr   rM   rH   rm   r   r#   r%   r=   rW   r&   r\   rg   ro   ry   r   r   r   r   runr   r(   r)   <module>r      sZ   	    ! " " ! ' !  P P    BII.>$?@ $%    S c T#Y B
s 
 
c3h 
:	c 	d5k 	s # C N .-n -,(# ( (*tCy *# *Zs tCy " zKK r(   