
    g<                        d dl mZmZ d dlmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZmZ d	d
l d dlZd dlZd dlZd dlmZmZmZ d dlmZ d dlmZmZ d dl Z d dl!Z!d dl"m#Z# d	d
l$  ejJ                  d      Z&e&jO                  ejP                          ejJ                  d      Z)e)jO                  ejP                          ejJ                  d      Z*e*jO                  ejP                          ejJ                  d      Z+e+jO                  ejP                          ejJ                  d      Z,e,jO                  ejP                          G d de      Z- G d de-      Z. G d de-      Z/y)    )ABCabstractmethod)	webdriver)Service)By)WebDriverWait)expected_conditions)Options)InvalidArgumentExceptionWebDriverException   )*N)Image	ImageDraw	ImageFont)BytesIO)ListCallable)Pathz+selenium.webdriver.remote.remote_connectionz!selenium.webdriver.common.servicezurllib3.connectionpoolzhttp.clientz'selenium.webdriver.common.driver_finderc                   l    e Zd Zededefd       Zedefd       Zedefd       Zeded	efd
       Z	y)CrawlerStrategyurlreturnc                      y N )selfr   kwargss      N/var/www/openai/venv/lib/python3.12/site-packages/crawl4ai/crawler_strategy.pycrawlzCrawlerStrategy.crawl-           	save_pathc                      y r   r   )r   r#   s     r   take_screenshotzCrawlerStrategy.take_screenshot1   r!   r"   
user_agentc                      y r   r   r   r&   s     r   update_user_agentz!CrawlerStrategy.update_user_agent5   r!   r"   	hook_typehookc                      y r   r   r   r*   r+   s      r   set_hookzCrawlerStrategy.set_hook9   r!   r"   N)
__name__
__module____qualname__r   strr    r%   r)   r   r.   r   r"   r   r   r   ,   sz     3      C   # X  r"   r   c                   0     e Zd Zd fd	ZdedefdZ xZS )CloudCrawlerStrategyc                 0    t         |           || _        y r   )super__init__use_cached_html)r   r8   	__class__s     r   r7   zCloudCrawlerStrategy.__init__>   s    .r"   r   r   c                     |gdddd}t        j                  d|      }|j                         }|d   d   d   }t        |      S )	NTF)urlsinclude_raw_htmlforcedextract_blockszhttp://crawl4ai.uccode.io/crawl)jsonresultsr   html)requestspostr?   sanitize_input_encode)r   r   dataresponserA   s        r   r    zCloudCrawlerStrategy.crawlB   sT    E $#	
 ==!BN==?	"1%f-$T**r"   )F)r/   r0   r1   r7   r2   r    __classcell__r9   s   @r   r4   r4   =   s    /+ + +r"   r4   c                   ~     e Zd Zd fd	ZdedefdZdefdZdefdZde	fd	Z
dd
ZdedefdZdefdZd Z xZS )LocalSeleniumCrawlerStrategyc                    t         |           t        d       t               | _        d| j                  _        |j                  d      r9| j                  j                  dj                  |j                  d                   |j                  d      r.| j                  j                  d|j                  d      z          nK|j                  dd      }| j                  j                  d|        | j                  j                  d       |j                  d	d      | j                  _        | j                  j
                  r| j                  j                  d
       | j                  j                  d       | j                  j                  d       | j                  j                  d       | j                  j                  d       | j                  j                  d       | j                  j                  d       | j                  j                  d       || _	        || _	        || _
        |j                  dd      | _        d d d d d d| _        t               | _        t        j                   | j                        | _        | j%                  d| j"                        | _        |j                  d      r2|j                  d      D ]  }| j"                  j'                  |        y y )Nu4   [LOG] 🚀 Initializing LocalSeleniumCrawlerStrategyTproxyz--proxy-server={}r&   z--user-agent=zsMozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36z~user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36headlessz
--headlessz--disable-gpuz--window-size=1920,1080z--no-sandboxz--disable-dev-shm-usagez---disable-blink-features=AutomationControlledz--log-level=3verboseF)on_driver_createdon_user_agent_updatedbefore_get_urlafter_get_urlbefore_return_html)optionsrO   cookies)r6   r7   printr
   rT   rM   getadd_argumentformatr8   js_coderN   hooksr   servicer   Chromedriverexecute_hook
add_cookie)r   r8   rZ   r   r&   cookier9   s         r   r7   z%LocalSeleniumCrawlerStrategy.__init__P   sB   DEy $::gLL%%&9&@&@GAT&UV::l#LL%%o

<8P&PQL  3h  iJLL%%j\&BCLL%%  'g  h &

:t <<<  LL%%l3!!/2!!";<!!.1!!";<!!"QR 	!!/2 	!!/2..zz)U3 "&%)"!"&

. y&&t||<''(;T[[I::i  **Y/&&v. 0 !r"   r*   r+   c                 Z    || j                   v r|| j                   |<   y t        d|       )NzInvalid hook type: )r[   
ValueErrorr-   s      r   r.   z%LocalSeleniumCrawlerStrategy.set_hook   s/    

"$(DJJy!29+>??r"   c                     | j                   j                  |      }|r2 || }|+t        |t        j                        r|S t        d| d      | j                  S )NzHook z5 must return an instance of webdriver.Chrome or None.)r[   rW   
isinstancer   r]   	TypeErrorr^   )r   r*   argsr+   results        r   r_   z)LocalSeleniumCrawlerStrategy.execute_hook   sZ    zz~~i(4[F!fi&6&67!M#eI;6k$lmm{{r"   r&   c                    | j                   j                  d|        | j                  j                          t	        j
                  | j                  | j                         | _        | j                  d| j                        | _        y )Nzuser-agent=r\   rT   rP   )rT   rX   r^   quitr   r]   r\   r_   r(   s     r   r)   z.LocalSeleniumCrawlerStrategy.update_user_agent   s`    !!K
|"<=&&t||T\\R''(?Mr"   headersc                 x    | j                   j                  di        | j                   j                  dd|i       y )NzNetwork.enablezNetwork.setExtraHTTPHeadersrl   )r^   execute_cdp_cmd)r   rl   s     r   set_custom_headersz/LocalSeleniumCrawlerStrategy.set_custom_headers   s1    ##$4b9##$AIwCWXr"   c                 ,   t        | j                  j                        }t        |      D ]R  }t	        j
                  |       t        | j                  j                        }||k7  s= | j                  j                  S  | j                  j                  S r   )lenr^   page_sourcerangetimesleep)r   
max_checkscheck_intervalinitial_lengthixcurrent_lengths         r   _ensure_page_loadz.LocalSeleniumCrawlerStrategy._ensure_page_load   sr    T[[445
#BJJ~& !8!89N/{{&&& $ {{&&&r"   r   r   c                 R   dd l }|j                  |j                               j                         }| j                  rt
        j                  j                  t        j                  dt        j                               dd|      }t
        j                  j                  |      r/t        |d      5 }t        |j                               cd d d        S 	 | j                  d| j                         | _        | j"                  rt%        d| d       | j                   j'                  |       t)        | j                   d	      j+                  d
        t)        | j                   d      j+                  t-        j.                  t0        j2                  df             | j                   j5                  d       | j                  d| j                         | _        t        | j7                               }d}|j'                  dd      s|dk(  rt%        d       d}t9               }	d|	_        |	j=                  d       t?        j@                  | jB                  |	      }
|
j'                  |       | j                  d|
      | _        t        |
jD                        }|
jG                          |j'                  d| jH                        | _$        | jH                  rhtK        | jH                        tL        k(  rL| j                   j5                  | jH                         t)        | j                   d      j+                  d        nz| jH                  rntK        | jH                        tN        k(  rR| jH                  D ]C  }| j                   j5                  |       t)        | j                   d      j+                  d        E |j'                  dd      }|rtQ        |      r1t%        d       t)        | j                   d	      j+                  |       nSt%        d       t)        | j                   d	      j+                  t-        jR                  t0        jT                  |f             |st        | j                   jD                        }| j                  d| j                   |      | _        t
        j                  j                  t        j                  dt        j                               dd|      }t        |dd      5 }|jW                  |       d d d        | j"                  rt%        d| d        |S # 1 sw Y   ]xY w# 1 sw Y   3xY w# tX        $ rE}t[        |d!      st        tM        |            |_.        tY        d"| d#|j\                         d }~wt^        $ rE}t[        |d!      st        tM        |            |_.        t_        d"| d#|j\                         d }~wt`        $ rE}t[        |d!      st        tM        |            |_.        ta        d"| d#|j\                         d }~ww xY w)$Nr   CRAWL4_AI_BASE_DIRECTORYz	.crawl4aicacherrQ   u   [LOG] 🕸️ Crawling z& using LocalSeleniumCrawlerStrategy...   c                 *    | j                  d      dk(  S Nzreturn document.readyStatecompleteexecute_script)ds    r   <lambda>z4LocalSeleniumCrawlerStrategy.crawl.<locals>.<lambda>   s    !**+GHJVr"   
   bodyz/window.scrollTo(0, document.body.scrollHeight);rR   Fbypass_headlessz'<html><head></head><body></body></html>uQ   [LOG] 🙌 Page could not be loaded in headless mode. Trying non-headless mode...Tz--window-size=5,5rj   rZ   c                 *    | j                  d      dk(  S r   r   r^   s    r   r   z4LocalSeleniumCrawlerStrategy.crawl.<locals>.<lambda>   s    6#8#89U#VZd#dr"   c                 *    | j                  d      dk(  S r   r   r   s    r   r   z4LocalSeleniumCrawlerStrategy.crawl.<locals>.<lambda>  s    v'<'<=Y'Z^h'hr"   wait_foru#   [LOG] 🔄 Waiting for condition...rS   wutf-8)encodingu   [LOG] ✅ Crawled z successfully!msgzFailed to crawl z: )1hashlibmd5encode	hexdigestr8   ospathjoingetenvr   homeexistsopenrD   readr_   r^   rN   rV   rW   r   untilEC presence_of_all_elements_locatedr   TAG_NAMEr   r{   r
   rM   rX   r   r]   r\   rr   rk   rZ   typer2   listcallablepresence_of_element_locatedCSS_SELECTORwriter   hasattrr   r   	Exception)r   r   r   r   url_hashcache_file_pathfrA   can_not_be_done_headlessrT   r^   jsr   es                 r   r    z"LocalSeleniumCrawlerStrategy.crawl   s   ;;szz|,668 ggll2995OQUQZQZQ\+]_jlsu}~Oww~~o./3/10: 0/U	?++,<dkkJDK||/u4Z[\KKOOC $++r*00V $++r*0033R[[&4IJ KK&&'XY++OT[[IDK()?)?)ABD',$ zz+U3t?h7hij+/(!)#( $$%89"))$,,P

3"//H,V-?-?@ "::i>DL||T\\ 2c 9**4<<8dkk2.44d $t||"4"<,,BKK..r2!$++r288h ' zz*e4HH%?@!$++r288B?@!$++r288667RS ,,T[[-D-DE++,@$++tTDK !ggll2995OQUQZQZQ\+]_jlsu}~OosW= > ||*3%~>?KW 0/J >= ( 	N1e$-c!f5*-=cU"QUUG+LMM! 	H1e$-c!f5$'7uBquug%FGG 	?1e$-c!f5.se2aeeW=>>	?sX   1T'PU  0T4$U  'T14T=9U   	X&	A V		X&A WX&!A X!!X&c                 B   	 | j                   j                  d      }| j                   j                  d      }| j                   j                  ||       | j                   j                         }t	        j
                  t        |            }|j                  d      }t               }|j                  |dd       t        j                  |j                               j                  d      }| j                  rt        d       |S # t        $ r}t!        d	t#        |             }	t        |	       t	        j$                  dd
d      }
t'        j(                  |
      }	 t+        j,                  dd      }n## t.        $ r t+        j0                         }Y nw xY wd}d}t3        ||	||      }d}|j5                  ||||       t               }|
j                  |d       t        j                  |j                               j                  d      }|cY d }~S d }~ww xY w)Nz return document.body.scrollWidthz!return document.body.scrollHeightRGBJPEGU   )rY   qualityr   u3   [LOG] 📸 Screenshot taken and converted to base64zFailed to take screenshot: )i   iX  black)colorz	arial.ttf(   )   r   r   i  )r   r   )fillfont)rY   )r^   r   set_window_sizeget_screenshot_as_pngr   r   r   convertsavebase64	b64encodegetvaluedecoderN   rV   r   rD   r2   newr   Drawr   truetypeIOErrorload_default	wrap_texttext)r   total_widthtotal_height
screenshotimage	rgb_imagebuffered
img_base64r   error_messageimgdrawr   
text_color	max_widthwrapped_texttext_positions                    r   r%   z,LocalSeleniumCrawlerStrategy.take_screenshot,  s   8	++445WXK;;556YZL KK''\B ::<J JJwz23E e,I yHNN8FBN?))(*;*;*=>EEgNJ||KM 	14OPSTUPVx2XYM-  ))E:W=C>>#&D0 ))+r: 0 --/0 )JI$T=$	JL %M IIm\
IN yHHHXfH-))(*;*;*=>EEgNJ=	sD   DD 
HAH E76H7FHFA<HHHc                 8    | j                   j                          y r   )r^   rk   )r   s    r   rk   z!LocalSeleniumCrawlerStrategy.quitg  s    r"   )FN)   g{Gz?)r/   r0   r1   r7   r2   r   r.   r_   r)   dictro   r{   r    r%   rk   rG   rH   s   @r   rJ   rJ   O   sv    K/^@# @X @
c 
NC NY$ Y'`? `?3 `?D9 9vr"   rJ   )0abcr   r   seleniumr   !selenium.webdriver.chrome.servicer   selenium.webdriver.common.byr   selenium.webdriver.support.uir   selenium.webdriver.supportr	   r   !selenium.webdriver.chrome.optionsr
   selenium.common.exceptionsr   r   configloggingrt   r   PILr   r   r   ior   typingr   r   rB   r   pathlibr   utils	getLoggerloggersetLevelWARNINGlogger_driverurllib3_loggerhttp_client_loggerdriver_finder_loggerr   r4   rJ   r   r"   r   <module>r      s"   #  5 + 7 @ 5 S
    + +  !  	  			H	I   !!!"EF   w '"""#;<    ( 'W&&}5    GOO , )w(()RS    goo .
c "+? +$Y? Yr"   