
    g1)                        d dl Z d dlZde j                  d<   d dlmZ ddlmZmZ ddlm	Z	m
Z
mZmZmZ ddl ddl ddl ddl d d	lmZ d d
lmZ ddlmZ ddl d dlZd dlZ ej6                  dd        G d d      Zy)    NfalseTOKENIZERS_PARALLELISM)Path   )UrlModelCrawlResult)init_dbget_cached_url	cache_urlDB_PATHflush_db)*)List)ThreadPoolExecutor)WebScrapingStrategyignorezBField "model_name" has conflict with protected namespace "model_".)messagec                   H   e Zd ZddededefdZd Zeddedddd e	       f	d	e
d
ededededededededefdZeddedddd e	       f	dee
   d
ededededededededee   fdZed e	       dddddfdedededededededefdZdededededededededededefdZy) 
WebCrawlerNFcrawler_strategyalways_by_pass_cacheverbosec                 ~   |xs t        |      | _        || _        t        j                  j                  t        j                  dt        j                               d      | _	        t        j                  | j                  d       t        j                  | j                   dd       t                d| _        y )N)r   CRAWL4_AI_BASE_DIRECTORYz	.crawl4aiT)exist_okz/cacheF)LocalSeleniumCrawlerStrategyr   r   ospathjoingetenvr   homecrawl4ai_foldermakedirsr	   ready)selfr   r   r   s       I/var/www/openai/venv/lib/python3.12/site-packages/crawl4ai/web_crawler.py__init__zWebCrawler.__init__   s     0 a4PY`4a$8!!ww||BII6PRVR[R[R],^`kl
D((48
t++,F3dC	
    c                 z    t        d       | j                  ddt               dd       d| _        t        d       y )Nu(   [LOG] 🌤️  Warming up the WebCrawlerzhttps://google.com/   F)urlword_count_thresholdextraction_strategybypass_cacher   Tu'   [LOG] 🌞 WebCrawler is ready to crawl)printrunNoExtractionStrategyr$   )r%   s    r&   warmupzWebCrawler.warmup   s@    89%!" 4 6 	 	
 
78r(   T	url_modelprovider	api_tokenextract_blocks_flagcss_selector
screenshotuse_cached_htmlr-   chunking_strategyreturnc                 v     | j                   |j                  ||	xs
 t               |
f|j                  ||d|S )N)r.   r7   r8   )r0   r+   r1   forced)r%   r3   r4   r5   r6   r,   r7   r8   r9   r-   r:   kwargss               r&   
fetch_pagezWebCrawler.fetch_page*   sP     txxMM 9#7#9		

 #))%!	
 	
 		
r(   
url_modelsc                     |	xs
 t               }	 fd}t               5 }t         |j                  |||gt	        |      z  |gt	        |      z  |gt	        |      z  |gt	        |      z  |gt	        |      z  |gt	        |      z  |gt	        |      z  |	gt	        |      z  |
gt	        |      z  g|gt	        |      z         }d d d        |S # 1 sw Y   S xY w)Nc                 0     j                   | g|i |S )N)r?   )r3   argsr>   r%   s      r&   fetch_page_wrapperz2WebCrawler.fetch_pages.<locals>.fetch_page_wrapperS   s    "4??9>t>v>>r(   )r1   r   listmaplen)r%   r@   r4   r5   r6   r,   r9   r7   r8   r-   r:   r>   rD   executorresultss   `              r&   fetch_pageszWebCrawler.fetch_pagesD   s     2K5I5K	?  !X&JZ0K#j/1()C
O;)*S_<!NS_4L3z?2$%J7()C
O;&'#j/9 XJ/G "$ % "$ s   B)CCr+   r.   
user_agentc
                     	 |xs
 t               }|	|_        t        |t              st	        d      t        |t
              st	        d      t        |t              }d }d }d }|s| j                  st        |      }|
j                  dd      r| j                  sy |r't        |d         }t        |d         }|r	|d   }|sd }|rs|r| j                  j                  |       t        j                         }t         | j                  j                   |fi |
      }t        j                         }|	r"t#        d| d	t%        |       d
||z
  dd       |r| j                  j'                         } | j(                  |||||||||	t%        |      f
i |
}t%        |      |_        |S # t,        $ rZ}t/        |d      st1        |      |_        t#        d| d|j2                          t5        |dd|j2                        cY d }~S d }~ww xY w)NzUnsupported extraction strategyzUnsupported chunking strategyr2   Tr      	   u   [LOG] 🚀 Crawling done for z, success: , time taken: .2f secondsmsgu   [ERROR] 🚫 Failed to crawl z	, error:  F)r+   htmlsuccesserror_message)r1   r   
isinstanceExtractionStrategy
ValueErrorChunkingStrategymaxMIN_WORD_THRESHOLDr   r
   getr$   sanitize_input_encoder   update_user_agenttimecrawlr/   booltake_screenshotprocess_htmlrU   	ExceptionhasattrstrrR   r   )r%   r+   r,   r-   r:   r.   r7   r8   rK   r   r>   cachedscreenshot_dataextracted_contentrT   t1t2crawl_resultes                      r&   r0   zWebCrawler.runj   s:   .Y&9&S=Q=S#.5#+!"57IJ$%FGG!"35EF$%DEE'*+?AS'T$"&$(!#D,E,E+C0F::h-djj0;D(=fQi(H%!*0).%)FT!--??
KB01L1F1F1L1LS1[TZ1[\DB =cU+dSWj\Yghjmohopsgtt|}~!*.*?*?*O*O*Q  1t00d<MOcex  {L  NZ  \k  mt  vz  {A  vB   M  FL   M'+Dz$## Yq%(FAE5cU)AEE7KL"sURSRWRWXX	Ys&   BF* DF* *	H3AHHHrT   rj   r,   	is_cachedc                    t        j                          }	 t        j                          }t               }|j                         D ci c]  \  }}|dvs|| }}} |j                  ||f|||j	                  dd      |j	                  dt
              d|}|	r(t        d| dt        j                          |z
  dd	       |t        d
|       	 t        |j	                  dd            }t        |j	                  dd            }|j	                  dg       }|j	                  dg       }|j	                  di       }||	rt        d| d|j                          |j                  |      }|j                  ||      }t        j                  |dt        d      }|	r(t        d| dt        j                          |z
  dd       |sd n|}|
sNt!        |||||dt        j                  |      t        j                  |      t        j                  |      |
       t#        ||t%        |      ||||||dd      S c c}}w # t        $ r}t        t        |            d }~ww xY w)N)	only_text$image_description_min_word_thresholdrq   Frr   )r,   r7   rq   rr   u!   [LOG] 🚀 Content extracted for z, success: True, time taken: rP   rQ   z,Failed to extract content from the website: cleaned_htmlrS   markdownmedialinksmetadatau*   [LOG] 🔥 Extracting semantic blocks for z, Strategy: rM   )indentdefaultensure_asciiu   [LOG] 🚀 Extraction done for rO   z	 seconds.T)r8   )r+   rT   rs   rt   ru   rv   rw   r8   rj   rU   rV   )r`   r   itemsscrapr]   $IMAGE_DESCRIPTION_MIN_WORD_THRESHOLDr/   rY   InvalidCSSSelectorErrorrg   r^   namechunkr0   jsondumpsr   r   format_html)r%   r+   rT   rj   r,   r-   r:   r7   r8   r   ro   r>   trk   scrapping_strategykvextra_paramsresultrn   rs   rt   ru   rv   rw   sectionss                             r&   rd   zWebCrawler.process_html   s    		A)YY[%8%:"17   CA1  MB  DB1   C1+11
 *>!-$jje<9?>@d:
 #
 =cUB_`d`i`i`knp`pqt_uu}~>$'STWSX%YZZ "
 1NB1OPL,VZZ
B-GHHJJw+EJJw+Ezz*b1H (Fse<XkXpXpWqrs,228<$7$;$;C$J!$(JJ/@TWfk$l!;C5tyy{]^_bNcclmn%/ZJ %JJu%JJu%JJx() (6!!%"3  o  C& + ) Q(()s0   1I H>H>A:I >I 	I&I!!I&)NFF)__name__
__module____qualname__CrawlerStrategyrb   r'   r2   DEFAULT_PROVIDERr\   RegexChunkingr   rg   rX   rZ   r   r?   r   rJ   r0   intrd    r(   r&   r   r      s/    W[ nr 
9 )$(/   %26.;o  	
 "    0 , 
: )$(/ %  26.;o$N$ $ 	$
 "$ $ $ $ 0$ ,$ 
k	$R "46:2?/!& $$";Y;Y "4	;Y
  0;Y ;Y ;Y ;Y ;Y ;YzVV V  #	V
 #&V "4V  0V V V V V Vr(   r   )r   r`   environpathlibr   modelsr   r   databaser	   r
   r   r   r   utilsr:   r-   r   typingr   concurrent.futuresr   content_scraping_strategyr   configwarningsr   filterwarningsr   r   r(   r&   <module>r      s`    '.

# $  ) K K    "   1 :      *n oi ir(   