
    Ogb%                        d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZm	Z	 d dl
mZ d dlmZmZ d dlmZ d dlmZ d dlmZmZmZmZ d d	lmZ d d
lmZ  e         e e j6                  d            Z e       Ze G d d             Zd#dede dee   fdZ!dededeeef   fdZ"dedee#   fdZ$dede dedefdZ%defdZ&dedefdZ'd$dee   de fdZ(dedee   fd Z)d! Z*e+d"k(  r ejX                   e*              yy)%    N)ElementTree)ListDictAny)	dataclass)datetimetimezone)urlparse)load_dotenv)AsyncWebCrawlerBrowserConfigCrawlerRunConfig	CacheMode)AsyncOpenAI)init_collectionOPENAI_API_KEY)api_keyc                   d    e Zd ZU eed<   eed<   eed<   eed<   eed<   eeef   ed<   ee	   ed<   y)	ProcessedChunkurlchunk_numbertitlesummarycontentmetadata	embeddingN)
__name__
__module____qualname__str__annotations__intr   r   r   float     (/var/www/openai/gen-rag-crawl/crawler.pyr   r      s4    	HJLL38nE{r%   r   text
chunk_sizereturnc                    g }d}t        |       }||k  r||z   }||k\  r%|j                  | |d j                                	 |S | || }|j                  d      }|dk7  r||dz  kD  r||z   }nHd|v r|j                  d      }||dz  kD  r+||z   }n%d|v r!|j                  d      }	|	|dz  kD  r||	z   dz   }| || j                         }|r|j                  |       t	        |dz   |      }||k  r|S )	z>Split text into chunks, respecting code blocks and paragraphs.r   Nz```g333333?z

z.    )lenappendstriprfindmax)
r'   r(   chunksstarttext_lengthendchunk
code_block
last_breaklast_periods
             r&   
chunk_textr:   &   s.   FEd)K
+
j +MM$uv,,,./( M% U3[['

Z#-= =*$Cu_V,JJ,,j(U]++d+KZ#--k)A-U3%%'MM% EAIs#- +
0 Mr%   r6   r   c                   K   d}	 t         j                  j                  j                  t	        j
                  dd      d|ddd| d| d	d
  ddgddi       d	{   }t        j                  |j                  d   j                  j                        S 7 :# t        $ r}t        d|        dddcY d	}~S d	}~ww xY ww)z&Extract title and summary using GPT-4.a:  You are an AI that extracts titles and summaries from web content chunks.
    Write an 800 word article describing the things to see and do close and Return  Refer to the data repo that you created when you crawled <siteurl>
    and include internal links to relevant pages.
    Include images in the article taken from the data repo and include our affiliate links directly to our partners where appropriate.
    Add shortcodes that populate our article with a Places to Stay slider unit with 8 hotels that are close to the location. Make these hotels all 4* and above.	LLM_MODELzgpt-4-0125-previewsystem)roler   userzURL: z

Content:
Ni  z...typejson_object)modelmessagesresponse_formatr   z!Error getting title and summary: zError processing titlezError processing summary)r   r   )openai_clientchatcompletionscreateosgetenvjsonloadschoicesmessager   	Exceptionprint)r6   r   system_promptresponsees        r&   get_title_and_summaryrT   G   s     dM
&++77>>))K)=>!m<"!&se>%,sK $]3 ? 

 

 zz(**1-55==>>

  
1!56-1
 	

sG   CAB B9B CB 	C #B;5C 6C;C  Cc                    K   	 t         j                  j                  d|        d{   }|j                  d   j                  S 7 # t
        $ r}t        d|        dgdz  cY d}~S d}~ww xY ww)z!Get embedding vector from OpenAI.ztext-embedding-3-small)rB   inputNr   zError getting embedding: i   )rE   
embeddingsrH   datar   rO   rP   )r'   rR   rS   s      r&   get_embeddingrY   d   sy     &1188*$ 9 
 
 }}Q)))
  )!-.sTzsD   A2$A AA A2A 	A/A*$A/%A2*A//A2r   c           	      Z  K   t        | |       d{   }t        |        d{   }t        |      j                  t	        |       t        j                  t        j                        j                         t        |      j                  d}t        |||d   |d   | ||      S 7 7 w)zProcess a single chunk of text.N)sourcer(   
crawled_aturl_pathr   r   )r   r   r   r   r   r   r   )rT   rY   r
   netlocr-   r   nowr	   utc	isoformatpathr   )r6   r   r   	extractedr   r   s         r&   process_chunkrd   p   s     +E377I#E**I 3-&&%jll8<<0::<SM&&	H ! )$  8*s    B+B'B+B)BB+)B+c           	        K   	 t         j                  | j                  g| j                  g| j                  | j
                  | j                  | j                  d| j                  g| j                   d| j
                   g       t        d| j
                   d| j                          y# t        $ r}t        d|        Y d}~yd}~ww xY ww)z'Insert a processed chunk into ChromaDB.)r   r   r   r   _)	documentsrW   	metadatasidszInserted chunk z for zError inserting chunk: N)chroma_collectionaddr   r   r   r   r   r   r   rP   rO   )r6   rS   s     r&   insert_chunkrl      s     -}}o( !99$)$6$6"[[$}}	
 nn II;a 2 2345 	 	
 	 2 235DE -'s+,,-s/   CB(B- ,C-	C6C	C	CCmarkdownc           	      (  K   t        |      }t        |      D cg c]  \  }}t        |||        }}}t        j                  |  d{   }|D cg c]  }t        |       }}t        j                  |  d{    yc c}}w 7 =c c}w 7 w)z4Process a document and store its chunks in parallel.N)r:   	enumeraterd   asynciogatherrl   )r   rm   r2   ir6   tasksprocessed_chunksinsert_taskss           r&   process_and_store_documentrv      s     !F:CF:KL:Kha]5!S):KEL$^^U335EF5EEL'5ELF
..,
''' M3F's9   BBBB	BB&B=B>BBurlsmax_concurrentc                   	
K   t        dt        |        d       t        ddg d      }t        t        j
                        t        |      j                          d {    	 t        j                  |      	t        |       
d	d
t        f	
fd}t        j                  | D cg c]
  } ||       c}  d {    j                          d {    t        d       y 7 c c}w 7 /# t        $ r}t        d|        Y d }~Id }~ww xY w7 =# j                          d {  7   t        d       w xY ww)NFound z URLs to crawlTF)z--disable-gpuz--disable-dev-shm-usagez--no-sandbox)headlessverbose
extra_args)
cache_mode)configr   r   c           
        K   4 d {    j                  | d       d {   }|j                  rCdz  t        d|  d d d       t        | |j                  j
                         d {    nt        d|  d	|j                          d d d       d {    y 7 7 7 57 # 1 d {  7  sw Y   y xY ww)
Nsession1)r   r   
session_idr,   u   ✅ Successfully crawled: z (/)u   ❌ Failed: z
 - Error: )arunsuccessrP   rv   markdown_v2raw_markdownerror_message)r   resultcrawl_configcrawlerprocessed_urls	semaphore
total_urlss     r&   process_urlz#crawl_parallel.<locals>.process_url   s      y&||LZ  ,    >>"a'N6se2n=MQzlZ[\]4S&:L:L:Y:YZZZLZ8L8L7MNO !yy [ !yyysg   CB&CB.B(AB.4B*5 B.C B,!C(B.*B.,C.C 4B75C <Cu   ❌ Error in crawl_parallel: u    ✅ Crawler closed successfully.)rP   r-   r   r   r   BYPASSr   r3   rp   	Semaphorer    rq   rO   close)rw   rx   browser_configr   r   rS   r   r   r   r   r   s         @@@@@r&   crawl_parallelr      s!    	F3t9+^
,-"ON
 $y/?/?@L^4G
--/2%%n5	Y
	P3 	P 	P nn4@4C{3/4@AAA mmo017 ( AA 3-aS1223 	gmmo01s   A"E)C3*E/AC< 4C5C< 
C:C< E"D #E5C< <	DDD" DD"  E"E6D97EEsitemap_urlc                 t   	 t        j                  |       }|j                          t        j                  |j
                        }ddi}|j                  d|      D cg c]  }|j                   }}t        dt        |       d|         |S c c}w # t        $ r}t        d|        g cY d}~S d}~ww xY w)zGet URLs from a sitemap.nsz+http://www.sitemaps.org/schemas/sitemap/0.9z	.//ns:locrz   z URLs in sitemap: zError fetching sitemap: N)requestsgetraise_for_statusr   
fromstringr   findallr'   rP   r-   rO   )r   rR   root	namespacelocrw   rS   s          r&   get_urls_from_sitemapr      s    <<,!!#%%h&6&67HI	$(LLi$HI$HS$HIs4yk!3K=AB J  (,-	s0   AB B1B B 	B7B2,B72B7c                  <   K   dg} t        |        d {    y 7 w)Nzhttps://example.com)r   )rw   s    r&   mainr     s     !"D

s   __main__)i  )   )-rI   rK   rp   r   	xml.etreer   typingr   r   r   dataclassesr   r   r	   urllib.parser
   dotenvr   crawl4air   r   r   r   openair   dbr   rJ   rE   rj   r   r    r"   r:   rT   r#   rY   rd   rl   rv   r   r   r   r   runr$   r%   r&   <module>r      s_   	    ! " " ! ' !  P P    IBII.>$?@ $%    S c T#Y B
s 
 
c3h 
:	c 	d5k 	s # C N .-n -,(# ( (p&2tCy &2# &2Vs tCy " zGKK r%   