
    gW                       d dl Z d dlZd dlZd dlmZmZ d dlmZmZm	Z	m
Z
mZmZmZ d dlZd dlZd dlZd dlZd dlZd dlmZmZmZmZmZ d dlmZ d dlmZ d dlmZmZm Z  d dl!m"Z" d d	lm#Z# d d
l$m%Z% d dl&Z&d dl'Z'd dl(Z(ddl)m*Z* ddl+m,Z, ddl-m.Z. ddl/m0Z0 ddl1m2Z2m3Z3 ddl4m5Z5m6Z6 ddl7m8Z8 d dl9m:Z:m;Z; ddl<m=Z=  e:dddddddddddd      Z>g dZ? G d d      Z@ G d d      ZA G d de      ZB G d deB      ZCy)     N)ABCabstractmethod)CallableDictAnyListOptional	AwaitableUnion)async_playwrightPageBrowserErrorBrowserContext)TimeoutError)BytesIO)Image	ImageDraw	ImageFont)PathProxySettings)	BaseModel   )load_js_script)AsyncCrawlResponse)get_error_context)UserAgentGenerator)SCREENSHOT_HEIGHT_TRESHOLDDOWNLOAD_PAGE_TIMEOUT)BrowserConfigCrawlerRunConfig)AsyncLogger)StealthConfigstealth_async)SSLCertificateT)	webdriver
chrome_app
chrome_csichrome_load_timeschrome_runtimenavigator_languagesnavigator_pluginsnavigator_permissionswebgl_vendorouterdimensionsnavigator_hardware_concurrencymedia_codecs)z--disable-background-networking%--disable-background-timer-throttlingz(--disable-backgrounding-occluded-windowsz--disable-breakpadz(--disable-client-side-phishing-detectionz4--disable-component-extensions-with-background-pagesz--disable-default-appsz--disable-extensionsz--disable-features=TranslateUIz--disable-hang-monitor!--disable-ipc-flooding-protectionz--disable-popup-blockingz--disable-prompt-on-repostz--disable-sync--force-color-profile=srgbz--metrics-recording-only--no-first-runz--password-store=basicz--use-mock-keychainc                       e Zd ZU dZeed<   eed<   eed<   ej                  ed<   eed<   e	ed<   eed<   	 	 	 	 	 	 ddede
e   dedede	f
d
ZdefdZd ZdefdZdee   fdZd Zy	)ManagedBrowsera6  
    Manages the browser process and context. This class allows to connect to the browser using CDP protocol.
    
    Attributes:
        browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit".
                            Default: "chromium".
        user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a
                                     temporary directory may be used. Default: None.
        headless (bool): Whether to run the browser in headless mode (no visible GUI).
                         Default: True.
        browser_process (subprocess.Popen): The process object for the browser.
        temp_dir (str): Temporary directory for user data if not provided.  
        debugging_port (int): Port for debugging the browser.
        host (str): Host for debugging the browser.
        
        Methods:
            start(): Starts the browser process and returns the CDP endpoint URL.
            _get_browser_path(): Returns the browser executable path based on OS and browser type.
            _get_browser_args(): Returns browser-specific command line arguments.
            _get_user_data_dir(): Returns the user data directory path.
            _cleanup(): Terminates the browser process and removes the temporary directory. 
    browser_typeuser_data_dirheadlessbrowser_processtemp_dirdebugging_porthostNc                     || _         || _        || _        d| _        d| _        || _        || _        || _        d| _        y)a;  
        Initialize the ManagedBrowser instance.
        
        Args:
            browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit".
                                Default: "chromium".
            user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a
                                         temporary directory may be used. Default: None.
            headless (bool): Whether to run the browser in headless mode (no visible GUI).
                             Default: True.
            logger (logging.Logger): Logger instance for logging messages. Default: None.
            host (str): Host for debugging the browser. Default: "localhost".
            debugging_port (int): Port for debugging the browser. Default: 9222.
        NF)	r9   r:   r;   r<   r=   r>   r?   loggershutting_down)selfr9   r:   r;   rA   r?   r>   s          T/var/www/openai/venv/lib/python3.12/site-packages/crawl4ai/async_crawler_strategy.py__init__zManagedBrowser.__init__a   sH    . )* #,	"    returnc                 Z  K   | j                   s,t        j                  d      | _        | j                  | _         | j	                         }| j                         }	 t        j                  |t        j                  t        j                        | _	        t        j                  | j                                t        j                  d       d{    d| j                   d| j                   S 7  # t         $ r,}| j#                          d{  7   t!        d|       d}~ww xY ww)	z
        Starts the browser process and returns the CDP endpoint URL.
        If user_data_dir is not provided, creates a temporary directory.
        zbrowser-profile-)prefix)stdoutstderr   Nhttp://:zFailed to start browser: )r:   tempfilemkdtempr=   _get_browser_path_get_browser_args
subprocessPopenPIPEr<   asynciocreate_task_monitor_browser_processsleepr?   r>   	Exceptioncleanup)rC   browser_pathargses       rD   startzManagedBrowser.start   s      !!$,,4FGDM!%D --/%%'
	=#-#3#3Z__Z__$D   = = ?@--"""TYYKq)<)<(=>> # 	=,,.  7s;<<	=sI   AD+A4C3 C1C3 0D+1C3 3	D(<D#DD##D((D+c           	      d  K   | j                   rK	 t        j                  t        j                  | j                   j                  j
                        t        j                  | j                   j                  j
                               d{   \  }}| j                   j                         | j                  sk| j                  j                  dd| j                   j                  |j                         |j                         d       | j                          d{    y| j                  j                  ddd| j                   j                  i       yyy7 7 ># t        $ rD}| j                  s.| j                  j                  d	dd
t!        |      i       Y d}~yY d}~yd}~ww xY ww)a  
        Monitor the browser process for unexpected termination.
        
        How it works:
        1. Read stdout and stderr from the browser process.
        2. If the process has terminated, log the error message and terminate the browser.
        3. If the shutting_down flag is set, log the normal termination message.
        4. If any other error occurs, log the error message.
        
        Note: This method should be called in a separate task to avoid blocking the main event loop.
        Nz\Browser process terminated unexpectedly | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}ERROR)coderJ   rK   messagetagparamsz2Browser process terminated normally | Code: {code}INFOrb   z)Error monitoring browser process: {error}error)r<   rV   gather	to_threadrJ   readrK   pollrB   rA   rh   
returncodedecoder[   inforZ   str)rC   rJ   rK   r^   s       rD   rX   z'ManagedBrowser._monitor_browser_process   sw     '.~~%%d&:&:&A&A&F&FG%%d&:&:&A&A&F&FG( " '',,.:--)) %C '(,(<(<(G(G*0--/*0--/$ *  #lln,,(($X &$*D,@,@,K,K#L )  ;  "" -  ))KK%% K# 'Q0 &   *s`   F0A;E  EBE  E E  $F0%4E  F0E  E   	F-)5F(
F0(F--F0c                     t         j                  dk(  rdddd}n t         j                  dk(  rddd	d}nd
dd	d}|j                  | j                        S )z@Returns the browser executable path based on OS and browser typedarwinz</Applications/Google Chrome.app/Contents/MacOS/Google Chromez0/Applications/Firefox.app/Contents/MacOS/firefoxz./Applications/Safari.app/Contents/MacOS/Safari)chromiumfirefoxwebkitwin32z5C:\Program Files\Google\Chrome\Application\chrome.exez,C:\Program Files\Mozilla Firefox\firefox.exeNzgoogle-chromert   )sysplatformgetr9   )rC   pathss     rD   rQ   z ManagedBrowser._get_browser_path   se    <<8#ZMJE
 \\W$XLE ,$E yy**++rF   c                    | j                         g}| j                  dk(  r@d| j                   d| j                   g}| j                  r|j                  d       ||z   S | j                  dk(  rEdt        | j                        d| j                  g}| j                  r|j                  d       ||z   S t        d	| j                   d
      )z/Returns browser-specific command line argumentsrs   z--remote-debugging-port=z--user-data-dir=z--headless=newrt   z--remote-debugging-portz	--profilez
--headlesszBrowser type z not supported)rQ   r9   r>   r:   r;   appendrp   NotImplementedError)rC   	base_argsr]   s      rD   rR   z ManagedBrowser._get_browser_args   s    ++-.	
**4+>+>*?@"4#5#5"67D }},- 4 )+)D''(""	D }}L) 4 &d6G6G5H&WXXrF   c           	      N  K   d| _         | j                  r	 | j                  j                          t        d      D ];  }| j                  j	                          nt        j                  d       d{    = | j                  j	                         7| j                  j                          t        j                  d       d{    | j                  rKt        j                  j                  | j                        r!	 t!        j"                  | j                         yyy7 7 ^# t        $ r3}| j                  j                  dddt        |      i       Y d}~d}~ww xY w# t        $ r3}| j                  j                  d	ddt        |      i       Y d}~yd}~ww xY ww)
z/Cleanup browser process and temporary directoryT
   N皙?z"Error terminating browser: {error}ra   rh   rc   z+Error removing temporary directory: {error})rB   r<   	terminaterangerl   rV   rY   killrZ   rA   rh   rp   r=   ospathexistsshutilrmtree)rC   _r^   s      rD   r[   zManagedBrowser.cleanup   sb     "$$..0rA++002>!--,,, # '',,.6((--/!--,,, ==RWW^^DMM:dmm, ;= -
 - !!@#SV, "    !!I#SV, "  s~   F%AD' 3D#4AD' D%D' 5F%E&  F%#D' %D' '	E#0)EF%E##F%&	F"/)FF%F""F%)rs   NFN	localhosti$  )__name__
__module____qualname____doc__rp   __annotations__boolrS   rT   intr	   rE   r_   rX   rQ   r   rR   r[    rF   rD   r8   r8   B   s    . N%%%M
I ''+"##  }# 	# # #B=S =8,\,3 ,. 49  2"rF   r8   c                   n    e Zd ZdZddefdZd ZdefdZ	 dde	d	e
fd
Zd Zd	e
fdZdefdZd Zd Zy)BrowserManageraO  
    Manages the browser instance and context.
    
    Attributes: 
        config (BrowserConfig): Configuration object containing all browser settings
        logger: Logger instance for recording events and errors
        browser (Browser): The browser instance
        default_context (BrowserContext): The default browser context    
        managed_browser (ManagedBrowser): The managed browser instance
        playwright (Playwright): The Playwright instance
        sessions (dict): Dictionary to store session information
        session_ttl (int): Session timeout in seconds
    Nbrowser_configc                    || _         || _        d| _        d| _        d| _        d| _        i | _        d| _        | j                   j                  rpt        | j                   j                  | j                   j                  | j                   j                  | j                  | j                   j                        | _        yy)a  
        Initialize the BrowserManager with a browser configuration.

        Args:
            browser_config (BrowserConfig): Configuration object containing all browser settings
            logger: Logger instance for recording events and errors
        Ni  )r9   r:   r;   rA   r>   )configrA   browserdefault_contextmanaged_browser
playwrightsessionssession_ttluse_managed_browserr8   r9   r:   r;   r>   )rC   r   rA   s      rD   rE   zBrowserManager.__init__/  s     &4 ##  ;;**#1![[55"kk77--{{#{{99$D  +rF   c                   K   | j                   (ddlm}  |       j                          d{   | _         | j                  j
                  r| j                  j                          d{   }| j                   j                  j                  |       d{   | _	        | j                  j                  }|r|d   | _        n| j                          d{   | _        | j                  | j                         d{    y| j                         }| j                  j                  dk(  r4 | j                   j                   j"                  di | d{   | _	        n| j                  j                  dk(  r4 | j                   j$                  j"                  di | d{   | _	        n3 | j                   j                  j"                  di | d{   | _	        | j                  | _        y7 7 7 c7 &7 7 7 `7 .w)a  
        Start the browser instance and set up the default context.
        
        How it works:
        1. Check if Playwright is already initialized.
        2. If not, initialize Playwright.
        3. If managed browser is used, start it and connect to the CDP endpoint.
        4. If managed browser is not used, launch the browser and set up the default context.
        
        Note: This method should be called in a separate task to avoid blocking the main event loop.
        Nr   )r   rt   ru   r   )r   playwright.async_apir   r_   r   r   r   rs   connect_over_cdpr   contextsr   create_browser_contextsetup_context_build_browser_argsr9   rt   launchru   )rC   r   cdp_urlr   browser_argss        rD   r_   zBrowserManager.startN  s     ??"=$4$6$<$<$>>DO;;** 006688G!%!9!9!J!J7!SSDL||,,H'/{$-1-H-H-J'J$ $$T%9%9:::335L {{''94%CT__%<%<%C%C%Sl%SS))X5%BT__%;%;%B%B%R\%RR%DT__%=%=%D%D%T|%TT#'<<D G ? 9S
 (K ;  TRTs   +H	G4<H	*G7+,H	G:?H	G='H	?H  AH	HAH	$H%3H	HH	7H	:H	=H	 H	H	H	H	rG   c                    ddddddddd	d
ddddddddd| j                   j                   d| j                   j                   g}| j                   j                  r|j	                  t
               | j                   j                  r|j	                  g d       | j                   j                  r%|j	                  | j                   j                         | j                   j                  |d}| j                   j                  r| j                   j                  |d<   | j                   j                  rg| j                   j                  xs2 t        j                  j                  t        j                         d      |d<   t        j                   |d   d       | j                   j"                  s| j                   j$                  rddlm} | j                   j"                  r || j                   j"                        ns || j                   j$                  j+                  d      | j                   j$                  j+                  d       | j                   j$                  j+                  d!      "      }||d#<   |S )$z+Build browser launch arguments from config.z--disable-gpuz--disable-gpu-compositing--disable-software-rasterizerz--no-sandbox--disable-dev-shm-usager6   z--no-default-browser-checkz--disable-infobarsz--window-position=0,0z--ignore-certificate-errorsz%--ignore-certificate-errors-spki-listz---disable-blink-features=AutomationControlledz--window-position=400,0z --disable-renderer-backgroundingr4   r5   z--mute-audior3   z--window-size=,)z$--blink-settings=imagesEnabled=falsez--disable-remote-fontsz--disable-imagesz--disable-javascriptr   r   )r;   r]   channel	downloadsdownloads_pathT)exist_okr   r   )serverr   usernamepassword)r   r   r   proxy)r   viewport_widthviewport_height
light_modeextendBROWSER_DISABLE_OPTIONS	text_mode
extra_argsr;   chrome_channelaccept_downloadsr   r   r   joingetcwdmakedirsr   proxy_configr   r   ry   )rC   r]   r   r   proxy_settingss        rD   r   z"BrowserManager._build_browser_args  s    '+%( #)3;%./(3T[[778$++:U:U9VW)
. ;;!!KK/0;;  KK	 ;;!!KK../$(KK$8$8$G;;%%&*kk&@&@L#;;''-1[[-G-G .277<<		[LL)* KK%56F;; 8 8: ;;$$ T[[%6%67";;3377A![[5599*E![[5599*E  %3L!rF   contextcrawlerRunConfigc                   K   | j                   j                  r-|j                  | j                   j                         d{    | j                   j                  r-|j	                  | j                   j                         d{    | j                   j
                  r|j                  d       d{    | j                   j                  r|j                  t               |j                  t               | j                   j                  rFd|j                  j                  d<   | j                   j                  |j                  j                  d<   | j                   j                  rk| j                   j                  | j                   j                  d}|j                  | j                   j                         |j                  |       d{    |j	                  dd|j                   d	g       d{    |j"                  s|j$                  s|j&                  r#|j)                  t+        d
             d{    yy7 7 7 7 }7 W7 w)aW  
        Set up a browser context with the configured options.

        How it works:
        1. Set extra HTTP headers if provided.
        2. Add cookies if provided.
        3. Load storage state if provided.
        4. Accept downloads if enabled.
        5. Set default timeouts for navigation and download.
        6. Set user agent if provided.
        7. Set browser hints if provided.
        8. Set proxy if provided.
        9. Set downloads path if provided.
        10. Set storage state if provided.
        11. Set cache if provided.
        12. Set extra HTTP headers if provided.
        13. Add cookies if provided.
        14. Set default timeouts for navigation and download if enabled.
        15. Set user agent if provided.
        16. Set browser hints if provided.
        
        Args:
            context (BrowserContext): The browser context to set up
            crawlerRunConfig (CrawlerRunConfig): Configuration object containing all browser settings
            is_default (bool): Flag indicating if this is the default context        
        Returns:
            None
        Nr   Tr   r   )
User-Agentz	sec-ch-uacookiesEnabledtruenamevalueurlnavigator_overrider)r   headersset_extra_http_headerscookiesadd_cookiesstorage_stater   set_default_timeoutr    set_default_navigation_timeoutr   	_impl_obj_options
user_agentbrowser_hintupdater   override_navigatorsimulate_usermagicadd_init_scriptr   )rC   r   r   
is_defaultcombined_headerss        rD   r   zBrowserManager.setup_context  s    D ;;001D1DEEE;;%%dkk&9&9:::;;$$''T'222;;''''(=>223HI{{))AE!!**+=>KK.. !!**+;<
 ;;!!"kk44![[55  ##DKK$7$78001ABBB !!&@P@T@TUV
 	
 	
 //--%%)).9N*OPPP &G F ; 3$ C	
 Qsn   ?IIAII	/I4I5DII'I9I:AI?I I	IIIIIc           	        K   | j                   j                  j                  d| j                   j                        }| j                   j                  | j                   j
                  d}| j                   j                  rd| j                   j                  ind}g d}|||| j                   j                  | j                   j                  | j                   j                  d| j                   j                  d}| j                   j                  rddd	}|j                  |        | j                  j                  di | d{   }| j                   j                  r%|D ]   }|j                  d
| d        d{    " |S 7 A7 
w)a  
        Creates and returns a new browser context with configured settings.
        Applies text-only mode settings if text_mode is enabled in config.
        
        Returns:
            Context: Browser context object with the specified configurations
        r   widthheightr   N),jpgjpegpnggifwebpsvgicobmptiffpsdwoffwoff2ttfotfeotmp4webmoggavimovwmvflvm4vmp3wavaacm4aopusflacpdfdocdocxxlsxlsxpptpptxziprar7ztargzxmlswfwasmg      ?)r   viewportr   r   r   ignore_https_errorsdevice_scale_factorjava_script_enabledF)	has_touch	is_mobilez**/*.c                 "    | j                         S N)abort)routes    rD   <lambda>z7BrowserManager.create_browser_context.<locals>.<lambda>O  s
    rF   r   )r   r   ry   r   r   r   r   r   r   r  r  r   r   r   new_contextr  )	rC   r   viewport_settingsr   blocked_extensionscontext_settingstext_mode_settingsr   exts	            rD   r   z%BrowserManager.create_browser_context  sZ     [[((,,\4;;;Q;QR
[[//kk11
 ;?++:K:K(DKK$5$56QU
( %)# $ < <![[66#';;#B#B#&#';;#B#B	
 ;;  """
 ##$67 100D3CDD ;;  )mmeC5M3NOOO * E Ps$   EF
F8F
=F>	F
F
c                   K   | j                          |j                  rg|j                  | j                  v rO| j                  |j                     \  }}}||t        j                         f| j                  |j                  <   ||fS | j                  j
                  r%| j                  }|j                          d{   }nJ| j                          d{   }| j                  ||       d{    |j                          d{   }|j                  r.||t        j                         f| j                  |j                  <   ||fS 7 7 v7 ^7 Hw)ai  
        Get a page for the given session ID, creating a new one if needed.
        
        Args:
            crawlerRunConfig (CrawlerRunConfig): Configuration object containing all browser settings

        Returns:
            Page: The page object for the given session ID.
            BrowserContext: The browser context for the given session ID.
        N)
_cleanup_expired_sessions
session_idr   timer   r   r   new_pager   r   )rC   r   r   pager   s        rD   get_pagezBrowserManager.get_pageS  s     	&&(&&+;+F+F$--+W#}}-=-H-HIGT1:A49UDMM*556= ;;****G ))++D 7799G$$W.>??? ))++D&&:A49UDMM*556W} ,9?+sI   B9E;E	<EEE.E/EEAEEEEr%  c                    K   || j                   v rg| j                   |   \  }}}|j                          d{    | j                  j                  s|j                          d{    | j                   |= yy7 A7 w)z
        Kill a browser session and clean up resources.  
        
        Args:
            session_id (str): The session ID to kill.
        N)r   closer   r   )rC   r%  r   r(  r   s        rD   kill_sessionzBrowserManager.kill_sessionr  sl      &#}}Z8GT1**,;;22mmo%%j) '%s!   5A=A9-A=%A;&A=;A=c                    t        j                          }| j                  j                         D cg c]  \  }\  }}}||z
  | j                  kD  r| }}}}|D ]&  }t	        j
                  | j                  |             ( yc c}}}w )z'Clean up expired sessions based on TTL.N)r&  r   itemsr   rV   rW   r,  )rC   current_timesidr   	last_usedexpired_sessionss         rD   r$  z(BrowserManager._cleanup_expired_sessions  s    yy{ +/--*=*=*?
*?&&aIi'$*:*:: *? 	 

 $C 1 1# 67 $
s   "Bc                   K   | j                   j                  rt        j                  d       d{    t	        | j
                  j                               }|D ]  }| j                  |       d{     | j                  r)| j                  j                          d{    d| _        | j                  rFt        j                  d       d{    | j                  j                          d{    d| _
        | j                  r*| j                  j                          d{    d| _        yy7 7 7 7 k7 K7 w)z)Close all browser resources and clean up.g      ?N)r   sleep_on_closerV   rY   listr   keysr,  r   r+  r   r[   r   stop)rC   session_idsr%  s      rD   r+  zBrowserManager.close  s    ;;%%--$$$4==--/0%J##J/// & <<,,$$&&&DL--$$$&&..000#'D ??//&&((("DO  % 0 ' %0 )sj   /ED7A E2D:3/E"D<#/ED>!E4E 54E)E*E:E<E>E EEr  )F)r   r   r   r   r!   rE   r_   dictr   r   r"   r   r   r)  rp   r,  r$  r+  r   rF   rD   r   r   !  st    } >20hDT DT 	HQHQ +HQT>B/? >*S *	8#rF   r   c                   *    e Zd ZdZededefd       Zy)AsyncCrawlerStrategyze
    Abstract base class for crawler strategies.
    Subclasses must implement the crawl method.
    r   rG   c                    K   y wr  r   )rC   r   kwargss      rD   crawlzAsyncCrawlerStrategy.crawl  s	     s   N)r   r   r   r   r   rp   r   r>  r   rF   rD   r;  r;    s*     s 1C  rF   r;  c            
       ,   e Zd ZdZ	 d8dedefdZd Zd Zd Z	d	 Z
d
efdZdedefdZdefdZdefdZdeeef   fdZd9dededefdZd9dededefdZd ZdefdZdededefdZdededefd Zd:ded!efd"Zd# Zdeddfd$Zdede fd%Z!defd&Z"d'e defd(Z#dedefd)Z$dedefd*Z%d;d+ede&fd,Z'ded-e(ee)e   f   deee*f   fd.Z+ded-e(ee)e   f   deee*f   fd/Z,d0 Z-d:ded1e.d2e.d3efd4Z/ded1e.d2e.deee*f   fd5Z0defd6Z1dede2fd7Z3y)<AsyncPlaywrightCrawlerStrategya  
    Crawler strategy using Playwright.
    
    Attributes:
        browser_config (BrowserConfig): Configuration object containing browser settings.
        logger (AsyncLogger): Logger instance for recording events and errors.
        _downloaded_files (List[str]): List of downloaded file paths.   
        hooks (Dict[str, Callable]): Dictionary of hooks for custom behavior.
        browser_manager (BrowserManager): Manager for browser creation and management.

        Methods:
            __init__(self, browser_config=None, logger=None, **kwargs):
                Initialize the AsyncPlaywrightCrawlerStrategy with a browser configuration.
            __aenter__(self):
                Start the browser and initialize the browser manager.
            __aexit__(self, exc_type, exc_val, exc_tb):
                Close the browser and clean up resources.
            start(self):
                Start the browser and initialize the browser manager.
            close(self):
                Close the browser and clean up resources.
            kill_session(self, session_id):
                Kill a browser session and clean up resources.
            crawl(self, url, **kwargs):
                Run the crawler for a single URL.
            
    Nr   rA   c           	          |xs t        j                  |      | _        || _        g | _        ddddddddd| _        t        | j                  | j                        | _        y)a  
        Initialize the AsyncPlaywrightCrawlerStrategy with a browser configuration.

        Args:
            browser_config (BrowserConfig): Configuration object containing browser settings.
                                          If None, will be created from kwargs for backwards compatibility.
            logger: Logger instance for recording events and errors.
            **kwargs: Additional arguments for backwards compatibility and extending functionality.
        N)on_browser_createdon_page_context_createdon_user_agent_updatedon_execution_startedbefore_goto
after_gotobefore_return_htmlbefore_retrieve_html)r   rA   )r!   from_kwargsr   rA   _downloaded_fileshooksr   browser_manager)rC   r   rA   r=  s       rD   rE   z'AsyncPlaywrightCrawlerStrategy.__init__  sn     -Q0I0I&0Q "$ #''+%)$("&$(	

  ...t{{ 
rF   c                 B   K   | j                          d {    | S 7 wr  )r_   rC   s    rD   
__aenter__z)AsyncPlaywrightCrawlerStrategy.__aenter__  s     jjl 	s   c                 @   K   | j                          d {    y 7 wr  )r+  )rC   exc_typeexc_valexc_tbs       rD   	__aexit__z(AsyncPlaywrightCrawlerStrategy.__aexit__  s     jjls   c                    K   | j                   j                          d{    | j                  d| j                   j                  | j                   j                         d{    y7 I7 w)zG
        Start the browser and initialize the browser manager.
        NrB  r   )rM  r_   execute_hookr   r   rO  s    rD   r_   z$AsyncPlaywrightCrawlerStrategy.start  si      ""((***   ((((88   
 	
 	
 	+	
s"   A.A*AA.$A,%A.,A.c                 T   K   | j                   j                          d{    y7 w)z;
        Close the browser and clean up resources.
        N)rM  r+  rO  s    rD   r+  z$AsyncPlaywrightCrawlerStrategy.close  s       ""((***s   (&(r%  c                    K   | j                   j                  dd       | j                  j                  |       d{    y7 w)z
        Kill a browser session and clean up resources.
        
        Args:
            session_id (str): The ID of the session to kill.
            
        Returns:
            None
        zSSession auto-kill is enabled in the new version. No need to manually kill sessions.WARNINGrd   re   N)rA   warningrM  r,  )rC   r%  s     rD   r,  z+AsyncPlaywrightCrawlerStrategy.kill_session  sA      	i 	 	
 ""//
;;;s   <AAA	hook_typehookc                 Z    || j                   v r|| j                   |<   yt        d|       )a  
        Set a hook function for a specific hook type. Following are list of hook types:
        - on_browser_created: Called when a new browser instance is created.
        - on_page_context_created: Called when a new page context is created.    
        - on_user_agent_updated: Called when the user agent is updated.    
        - on_execution_started: Called when the execution starts.    
        - before_goto: Called before a goto operation.    
        - after_goto: Called after a goto operation.    
        - before_return_html: Called before returning HTML content.    
        - before_retrieve_html: Called before retrieving HTML content.  
        
        All hooks except on_browser_created accepts a context and a page as arguments and **kwargs. However, on_browser_created accepts a browser and a context as arguments and **kwargs.
        
        Args:
            hook_type (str): The type of the hook.
            hook (Callable): The hook function to set.
            
        Returns:
            None
        zInvalid hook type: N)rL  
ValueError)rC   r^  r_  s      rD   set_hookz'AsyncPlaywrightCrawlerStrategy.set_hook  s1    * 

"$(DJJy!29+>??rF   c                    K   | j                   j                  |      }|r-t        j                  |      r ||i | d{   S  ||i |S |r|d   S dS 7 w)aH  
        Execute a hook function for a specific hook type.
        
        Args:
            hook_type (str): The type of the hook.
            *args: Variable length positional arguments.
            **kwargs: Keyword arguments.
            
        Returns:
            The return value of the hook function, if any.
        Nr   )rL  ry   rV   iscoroutinefunction)rC   r^  r]   r=  r_  s        rD   rX  z+AsyncPlaywrightCrawlerStrategy.execute_hook0  sd      zz~~i(**40!426222T,V,,tAw(D( 3s   >A AAr   c                     || _         y)z
        Update the user agent for the browser.
        
        Args:
            user_agent (str): The new user agent string.
            
        Returns:
            None
        N)r   )rC   r   s     rD   update_user_agentz0AsyncPlaywrightCrawlerStrategy.update_user_agentD  s     %rF   r   c                     || _         y)z 
        Set custom headers for the browser. 
        
        Args:
            headers (Dict[str, str]): A dictionary of headers to set.
            
        Returns:
            None
        N)r   )rC   r   s     rD   set_custom_headersz1AsyncPlaywrightCrawlerStrategy.set_custom_headersP  s     rF   r(  wait_fortimeoutc                 P  K   |j                         }|j                  d      r.|dd j                         }| j                  |||       d{   S |j                  d      r0|dd j                         }	 |j                  ||       d{    y|j                  d      s|j                  d      r| j                  |||       d{   S 	 |j                  ||       d{    y7 7 a# t        $ r3}dt        |      v rt        d| d	| d
      t        d| d
      d}~ww xY w7 d7 J# t        $ rf}dt        |      v rt        d| d	| d
      	 | j                  |d| d|       d{  7  cY d}~S # t        $ r t        d| d      w xY wd}~ww xY ww)a   
        Wait for a condition in a smart way. This functions works as below:
        
        1. If wait_for starts with 'js:', it assumes it's a JavaScript function and waits for it to return true.
        2. If wait_for starts with 'css:', it assumes it's a CSS selector and waits for it to be present.
        3. Otherwise, it tries to evaluate wait_for as a JavaScript function and waits for it to return true.
        4. If it's not a JavaScript function, it assumes it's a CSS selector and waits for it to be present.
        
        This is a more advanced version of the wait_for parameter in CrawlerStrategy.crawl().        
        Args:
            page: Playwright page object
            wait_for (str): The condition to wait for. Can be a CSS selector, a JavaScript function, or explicitly prefixed with 'js:' or 'css:'.
            timeout (float): Maximum time to wait in milliseconds
            
        Returns:
            None
        zjs:   Nzcss:   rj  TimeoutzTimeout after zms waiting for selector ''zInvalid CSS selector: 'z()functionz() => {}zInvalid wait_for parameter: 'zp'. It should be either a valid CSS selector, a JavaScript function, or explicitly prefixed with 'js:' or 'css:'.)strip
startswithcsp_compliant_waitwait_for_selectorr   rp   r   ra  )rC   r(  ri  rj  js_codecss_selectorr^   s          rD   
smart_waitz)AsyncPlaywrightCrawlerStrategy.smart_wait\  s    $ >>#u%qrl((*G00wHHH  (#AB<--/LP,,\7,KKK ""4(H,?,?
,K!44T8WMMM0070KKK+ I
 L PA&&(	1J<.XYZ  %'>|nA%NOOP N L  CF**,WI5NxjXYZ 	)-)@)@ $
"&=w* $ $   % ","?z JO !O# s   AF&C-(F&7C1 C/C1 9F&D0F&D4 'D2(D4 ,F&/C1 1	D-:.D((D--F&2D4 4	F#=FF7E:8F<F#=F&FFF##F&user_wait_functionc                    K   d| d| d}	 |j                  |       d{   }|S 7 # t        $ r.}dt        |      v rt        dt        |             Y d}~yd}~ww xY ww)a  
        Wait for a condition in a CSP-compliant way.
        
        Args:
            page: Playwright page object
            user_wait_function: JavaScript function as string that returns boolean
            timeout: Maximum time to wait in milliseconds
            
        Returns:
            bool: True if condition was met, False if timed out
            
        Raises:
            RuntimeError: If there's an error evaluating the condition
        z8
        async () => {
            const userFunction = z;
            const startTime = Date.now();
            try {
                while (true) {
                    if (await userFunction()) {
                        return true;
                    }
                    if (Date.now() - startTime > aR  ) {
                        return false;  // Return false instead of throwing
                    }
                    await new Promise(resolve => setTimeout(resolve, 100));
                }
            } catch (error) {
                throw new Error(`Error evaluating condition: ${error.message}`);
            }
        }
        NzError evaluating conditionz#Failed to evaluate wait condition: F)evaluaterZ   rp   RuntimeError)rC   r(  rz  rj  
wrapper_jsresultr^   s          rD   ru  z1AsyncPlaywrightCrawlerStrategy.csp_compliant_wait  s~     ""4!5 62 3: 	;	
(	==44FM 5 	+s1v5"%HQ#QRR		s6   
A$* (* A$* 	A!$AA$A!!A$c           
        K   |j                  d       d{   }t        |      D ]  \  }}	 |j                  d| d       d{    |j                          d{   }|ro|j	                  dd       d{    |j                  d       d{   }d	| }|j                  d
d      }|j                  d| d| d| d       d{    n | j                  j                  ddd|i        |S 7 7 7 7 7 k7 3# t        $ r5}	| j                  j                  dd|t        |	      d       Y d}	~	d}	~	ww xY ww)a  
        Process iframes on a page. This function will extract the content of each iframe and replace it with a div containing the extracted content.
        
        Args:
            page: Playwright page object
            
        Returns:
            Playwright page object
        iframeNz"(element) => element.id = "iframe-"load0u  rn  z() => document.body.innerHTMLzextracted-iframe-content-`z\`zl
                        () => {
                            const iframe = document.getElementById('iframe-zx');
                            const div = document.createElement('div');
                            div.innerHTML = `z0`;
                            div.className = 'zf';
                            iframe.replaceWith(div);
                        }
                    z1Could not access content frame for iframe {index}SCRAPEindexrc   z(Error processing iframe {index}: {error}ra   )r  rh   )query_selector_all	enumerater|  content_framewait_for_load_statereplacerA   r]  rZ   rh   rp   )
rC   r(  iframesir  frameiframe_content
class_name_iframer^   s
             rD   process_iframesz.AsyncPlaywrightCrawlerStrategy.process_iframes  s     //99"7+IAv-oo(J1#Q&OPPP %224433 4   
 ,1>>7, &N
 $=QC!@J -44S%@G--LLM3 O..5Y 7..8\ :
 
 
 KK'' S$ '| ( I ,b g :
 Q 5
&
"  !!F%&Q8 "  s   EC8EDC:DC<D;C><DD 9DD%D4E:D<D>D DD	E*D=7E=EErG   c                 ,  K   | j                          d{    |j                  d      xs t        t        j                               }|j                  d| j
                        }| j                  j                  ||       d{   \  }}|S 7 y7 w)a  
        Creates a new browser session and returns its ID. A browse session is a unique openned page can be reused for multiple crawls.
        This function is asynchronous and returns a string representing the session ID.
        
        Args:
            **kwargs: Optional keyword arguments to configure the session.
        
        Returns:
            str: The session ID.
        Nr%  r   )r_   ry   rp   uuiduuid4r   rM  r)  )rC   r=  r%  r   r(  r   s         rD   create_sessionz-AsyncPlaywrightCrawlerStrategy.create_session  sz      jjlZZ-BTZZ\1B
ZZdoo>
"22;;J
SSg 	 Ts"   BBA/BB
BBr   r   c                   K   |xs t        j                  |      }i }d}d}|j                  d      r| j                  ||       d{   S |j                  d      r|dd }t        j
                  j                  |      st        d|       t        |dd	      5 }|j                         }	ddd       |j                  r| j                  	       d{   }t        	|||d
      S |j                  d      s|j                  d      rI|dd dk(  r|dd n|dd }
|
}	|j                  r| j                  |	       d{   }t        |	|||d
      S t        d      7 # 1 sw Y   xY w7 7 0w)a  
        Crawls a given URL or processes raw HTML/local file content based on the URL prefix.

        Args:
            url (str): The URL to crawl. Supported prefixes:
                - 'http://' or 'https://': Web URL to crawl.
                - 'file://': Local file path to process.
                - 'raw://': Raw HTML content to process.
            **kwargs: Additional parameters:
                - 'screenshot' (bool): Whether to take a screenshot.
                - ... [other existing parameters]

        Returns:
            AsyncCrawlResponse: The response containing HTML, headers, status code, and optional screenshot.
           N)rM   zhttps://zfile://   zLocal file not found: rutf-8)encoding)htmlresponse_headersstatus_code
screenshotget_delayed_contentzraw:zraw://rm  z?URL must start with 'http://', 'https://', 'file://', or 'raw:')r"   rJ  rt  
_crawl_webr   r   r   FileNotFoundErroropenrk   r  _generate_screenshot_from_htmlr   ra  )rC   r   r   r=  r  r  screenshot_datalocal_file_pathfr  raw_htmls              rD   r>  z$AsyncPlaywrightCrawlerStrategy.crawl  s      ?+77?>>12f555^^I&!!"gO77>>/2'*@@Q(RSSosW=vvx >  (,(K(KD(Q"Q%!1'*$(  ^^F#s~~h'?"%bq'V"3s12wQRHD  (,(K(KD(Q"Q%!1'*$(  Q C 6 >= #R #RsJ   AE5E"	AE5E%/(E5E1A*E5E3 E5%E.*E53E5c           
      n   K   |_         i }d}g  _         j                  j                  }|j                  rZ j                  j
                  dk7  rA t               j                  dNi  j                  j                  xs i  j                  _         j                  j                  |       d{   \  }|j                  dddg       d{    |j                  s|j                  s|j                  r"|j                  t        d             d{     j!                  d|	       d{    |j"                  r.	 dO fd
	j%                  d       j%                  dfd       	 d}|j&                  rt)        j*                        } j                  j,                  rj%                  d fd       |j.                  s j!                  d|       d{    	 t1        j2                  t5        j6                  d            j9                         }j;                  dd| di       d{    j=                  |j>                  |j@                         d{   }	 j!                  d||	       d{    |	d}i }n|	jH                  }|	jJ                  }nd}i }	 jM                  ddd       d{     jO                  dd        d{   }|s3|jP                  s' jS                         d{   }tC        d!|        j                  j\                  s|j^                  s|j`                  r}jc                  d&       d{    te        jf                  d'       d{     jO                  d(d)        d{   }|s) jX                  r jX                  ji                  d*d+,        j                  j\                  s|j`                  r	  jk                         d{   }|d-   }|d.   } j                  jl                  }to        ||z  |z  d/z        }jq                  ||d0       d{    ts        ||z  ||z        }jt                  jw                         d{   }|jy                  d1||d2d3|d4       d{    |j|                  r$ j                  |j                         d{    |j                  rt j                  |j                         d{   }|d8   s/ jX                  ji                  d9d:d7|j                  d7      i%        j!                  d;|	       d{    |j                  s|j                  rj                  j                  d<d<       d{    j                  j                          d{    j                  j                          d{    j                  j                  d=       d{    |j                  r1	  j                  |j                  |j@                          d{     j                  j\                  sAt        d?      }	 	 jc                  d&d@        d{    j                  |       d{    |j                  r j                         d{    j!                  dC|	       d{    |j                  r'te        jf                  |j                         d{    |j                  r j                         d{    j                          d{   } j!                  dD||E       d{    t        j                         }d}d}|j                  r j                         d{   }|j                  rX|j                  r'te        jf                  |j                         d{     j                  |j                  F       d{   }|s|r5 jX                  j                  dGdHdIt        j                         |z
  i%       dPdJt        dKtF        f fdL}t        ||||||| j                  r j                  ndM      |j                  sj                          d{    S S 7 7 7 7 v7 7 t7 G# tB        $ r}
tE        dtG        |
             d}
~
ww xY w7 T7 7 7 # tB        $ ru}
 jS                         d{  7  } jT                  jV                  r  jX                  j[                  d"d#d$|i%       |jP                  stC        d!|       Y d}
~
Bd}
~
ww xY w7 7 7 7 q7 $7 7 # tz        $ r4}
 jX                  ji                  d5d6d7tG        |
      i%       Y d}
~
d}
~
ww xY w7 7 7 n7 57 7 7 7 # tz        $ r}
tE        d>tG        |
             d}
~
ww xY w7 # t        $ r Y w xY w7 # tz        $ r4}
 jX                  j                  dAdBd7tG        |
      i%       Y d}
~
d}
~
ww xY w7 7 7 Y7 77 "7 7 7 7 p7 # tz        $ r}
|
d}
~
ww xY w# |j                  sj                          d{  7   w w xY ww)Qac  
        Internal method to crawl web URLs with the specified configuration.

        Args:
            url (str): The web URL to crawl
            config (CrawlerRunConfig): Configuration object controlling the crawl behavior

        Returns:
            AsyncCrawlResponse: The response containing HTML, headers, status code, and optional data
        Nrandom)r   r   r   r   r   rC  rW  c                     |dk(  r.j                   j                  d|  dd| j                  i       y |dk(  r.j                   j                  d|  dd| j                  i       y y )Nrh   zConsole error: CONSOLEmsgrc   debugz	Console: )rA   rh   textr  )r  console_log_typerC   s     rD   
log_consolz=AsyncPlaywrightCrawlerStrategy._crawl_web.<locals>.log_consol  s|     $w.KK%%"1# 7% %sxx0 & 
 &0KK%%"+C5 1% %sxx0 &  1rF   console	pageerrorc                      | d      S )Nrh   r   )r^   r  s    rD   r  z;AsyncPlaywrightCrawlerStrategy._crawl_web.<locals>.<lambda>  s    :a+ArF   downloadc                 L    t        j                  j                  |             S r  )rV   rW   _handle_download)r  rC   s    rD   r  z;AsyncPlaywrightCrawlerStrategy._crawl_web.<locals>.<lambda>  s    W%8%8--h7&rF   rF  )r   r       zContent-Security-Policyz-default-src 'self'; script-src 'self' 'nonce-z' 'strict-dynamic')
wait_untilrj  zFailed on navigating ACS-GOTO:
rG  )r   r   responser  bodyattachedr  )staterj  a  () => {
                        const element = document.body;
                        if (!element) return false;
                        const style = window.getComputedStyle(element);
                        const isVisible = style.display !== 'none' && 
                                        style.visibility !== 'hidden' && 
                                        style.opacity !== '0';
                        return isVisible;
                    }rn  zBody element is hidden: zBody visibility info: {info}DEBUGro   rc   domcontentloadedr   zQ() => Array.from(document.getElementsByTagName('img')).every(img => img.complete)  z)Some images failed to load within timeoutr  r\  r   r   gffffff?r   z"Emulation.setDeviceMetricsOverrider   F)r   r   deviceScaleFactormobilescalez-Failed to adjust viewport to content: {error}VIEWPORTrh   successz)User script execution had issues: {error}JS_EXECrE  d   	ArrowDownzWait condition failed: update_image_dimensions   z(Error updating image dimensions: {error}ra   rI  rH  )r(  r  r   )screenshot_height_thresholdz8Exporting PDF and taking screenshot took {duration:.2f}sEXPORTdurationdelayrG   c                    K   j                   j                  dd| d       t        j                  |        d {    j	                          d {   S 7 7 w)Nz?Waiting for {delay} seconds before retrieving content for {url}rg   )r  r   rc   )rA   ro   rV   rY   content)r  r(  rC   r   s    rD   r  zFAsyncPlaywrightCrawlerStrategy._crawl_web.<locals>.get_delayed_content  sX       ]%*37 ! 
 mmE***!\\^++ ++s!   :AAAAAA)r  r  r  r  pdf_datar  ssl_certificatedownloaded_filesr   )r  )g      @)`r   rK  r   r   r   user_agent_moder   generateuser_agent_generator_configrM  r)  r   r   r   r   r   rX  log_consoleonfetch_ssl_certificater&   from_urlr   js_onlyhashlibsha256r   urandom	hexdigestr   gotor  page_timeoutr   r}  rp   statusr   rv  ru  ignore_body_visibilitycheck_visibilityr   verboserA   r  r   wait_for_imagesadjust_viewport_to_contentr  rV   rY   r]  get_page_dimensionsr   r   set_viewport_sizeminr   new_cdp_sessionsendrZ   scan_full_page_handle_full_page_scanscroll_delayrw  robust_execute_user_scriptry   mousemovedownupkeyboardpressri  ry  PlaywrightTimeoutErrorr|  rh   r  delay_before_return_htmlremove_overlay_elementsr  r&  perf_counterr  
export_pdfr  screenshot_wait_fortake_screenshotr  ro   floatr   r%  r+  )rC   r   r   r  r  r   r   ssl_certnoncer  r^   
is_visiblevisibility_infoimages_loaded
dimensionspage_height
page_widthtarget_widthtarget_heightr  cdpexecution_resultupdate_image_dimensions_jsr  start_export_timer  r  r  r  r(  s   ``                          @@rD   r  z)AsyncPlaywrightCrawlerStrategy._crawl_webW  s
     
 "$ ((33
<<D//??8K-J-?-A-J-J .&&BBHb.D*
 #22;;V;TTg !!&DE
 	
 	

 $$(<(<)).9N*OPPP  94QQQ  '.  GGIz*GGK!ABu	#H++)2237 ""33 >>''tWRU'VVVT#NN2::b>:DDFE 5515bchbii{3|7    &*YY(9(96CVCV &/ &  H ''dGQT_g'hhh#"%K')$"*//K'/'7'7$ "#% !N,,V:u,UUU $(#:#: " $; $ 
 "&*G*G,0,A,A$,G&GO":?:K LMMv &&00&&&*K*K../ABBBmmC((( '+&=&=g  '> ' ! %KK'' K$ (  &&00V5V5V"'+'?'?'E!EJ",X"6K!+G!4J $(#6#6#E#EL$'z(AK(ORV(V$WM00".-H    z 9=;;VWE $ < <T BBC((<%/&112&+%*	 	 	$ $$11$8K8KLLL ~~)-)H)Hv~~)^#^ '	2KK'' K% ')9)=)=g)FG (  ''(>g'VVV ##v||jjooc3///jjoo'''jjmmo%%mm))+666 K//foov7J7J *    &&00-;<U-V*"667IST6UUU --(BCCC %%!11$77 ##$:D'#RRR..mmF$C$CDDD --224888 'D##$8d\c#ddd !% 1 1 3H"Ozz!%!66  --!--(B(BCCC(,(<(<f6X6X )= ) # (  V &(9(9(;>O(OP ! , , , &!1'*!$7 (.2.D.DD**$$ $$jjl"" %q
 U	
 Q 	RR W   T&)I#a&'RSST i V 'H  N(,(=(=d(C"C"C;;&&KK%% ># &8 &  44":?:K LMM 5Nx C(! "F
 C	 ! KK'' O& 'Q0 (   M $_ W 0'%6
 ! K&)@Q'IJJK V1 C  KK%% J# 'Q0 &   8 SD 9 (d 7 D#R #  	G	
 $$jjl"" %s  B.j54b$5 j5b'Aj5b*j57b-8>j57A2i7 )b0*i7 /Ab9 ?b3 /b9 /b60b9 4i7 c!(i7 7c- c$c- *c'+&c- c*c- $Ai7 &e.'i7 e1i7 e4 Ai7 2f e7Af e:4f e=f +f ,f 0+i7 g/i7 gAi7 g	;i7 g!i7 9g:!i7 g"i7 >g?i7 +g ;g<g  !i7 #h 9h:h >h hh  i7 7i8i7 i2i7 i$i7 +i",i7 i%i7  i(!<i7 i+>i7 i.$i7 i1A<i7 >j5i4j5'j5*j5-j50i7 3b9 6b9 9	ccci7 $c- 'c- *c- -	e+6e&
dAe& i7 &e++i7 1i7 4i7 7f :f =f  f 	g )f;5i7 ;g  i7 i7 	i7 i7 i7 i7 i7 g 	h $g;;h  i7 h 	hh hh 	i")ii7 ii7 i7 i7 "i7 %i7 (i7 +i7 .i7 1i7 4j57	j jjj
 
 j2*j-+j22j5r  c           	        K   	 |j                   j                  d| j                  j                        }|}| j	                  |d||       d{    | j                  |       d{   }|d   }||k  rWt        ||z   |      }| j	                  |d||       d{    | j                  |       d{   }|d   }||kD  r|}||k  rW| j	                  |dd       d{    | j	                  |d|       d{    y7 7 7 j7 S7 (7 # t        $ r3}| j                  j                  dddt        |      i       Y d}~yd}~ww xY ww)	a  
        Helper method to handle full page scanning. 
        
        How it works:
        1. Get the viewport height.
        2. Scroll to the bottom of the page.
        3. Get the total height of the page.
        4. Scroll back to the top of the page.
        5. Scroll to the bottom of the page again.  
        6. Continue scrolling until the bottom of the page is reached.
        
        Args:
            page (Page): The Playwright page object
            scroll_delay (float): The delay between page scrolls
        
        r   r   )r  Nz)Failed to perform full page scan: {error}	PAGE_SCANrh   rc   )viewport_sizery   r   r   safe_scrollr  r  rZ   rA   r]  rp   )	rC   r(  r  r   current_positionr  total_height
new_heightr^   s	            rD   r  z5AsyncPlaywrightCrawlerStrategy._handle_full_page_scan  s    "'	:"0044$--==O  / ""4,<L"QQQ
  $77==J%h/L"\1#&'7/'I<#X &&tQ0@&UUU
 $(#;#;D#AA
'1
,#-L #\1 ""4A... ""4L999A R
 >
 V
 B / :  	KKCQ(    	s   EA
D DD 'D(5D DD 6D
7D D #D$D (E>D?ED D D 
D D E	E)EEEEc           	      H  K   	 |j                   }t        j                  j                  | j                  |      }| j
                  j                  dd||d       t        j                         }|j                  |       d{    t        j                         }| j                  j                  |       | j
                  j                  dd||||z
  dd	d
       y7 ]# t        $ r3}| j
                  j                  dddt        |      i       Y d}~yd}~ww xY ww)a  
        Handle file downloads.
        
        How it works:
        1. Get the suggested filename.
        2. Get the download path.
        3. Log the download.
        4. Start the download.
        5. Save the downloaded file.
        6. Log the completion.
        
        Args:
            download (Download): The Playwright download object
            
        Returns:
            None
        z Downloading {filename} to {path}FETCH)filenamer   rc   Nz"Downloaded {filename} successfullyCOMPLETEz.2fs)r  r   r  z"Failed to handle download: {error}ra   rh   )suggested_filenamer   r   r   r   rA   ro   r&  r  save_asrK  r|   r  rZ   rh   rp   )rC   r  r  download_path
start_timeend_timer^   s          rD   r  z/AsyncPlaywrightCrawlerStrategy._handle_download  s    $	!)!<!<GGLL)<)<>PQMKK:$6N   **,J""=111((*H""))-8KK< 2)#+j#8"=Q ?   	 2  	KK<Q(   	sB   D"A?C# C!AC#  D"!C# #	D,)DD"DD"c           	        K   t        d      }	 |j                  d| d       d{    |j                  d       d{    y7 7 # t        $ r3}| j                  j                  dddt        |      i	       Y d}~yd}~ww xY ww)
z
        Removes popup overlays, modals, cookie notices, and other intrusive elements from the page.

        Args:
            page (Page): The Playwright page instance
        r  zL
                (() => {
                    try {
                        a\  
                        return { success: true };
                    } catch (error) {
                        return {
                            success: false,
                            error: error.toString(),
                            stack: error.stack
                        };
                    }
                })()
            Ni  z*Failed to remove overlay elements: {error}r  rh   rc   )r   r|  wait_for_timeoutrZ   rA   r]  rp   )rC   r(  remove_overlays_jsr^   s       rD   r  z6AsyncPlaywrightCrawlerStrategy.remove_overlay_elements7  s      ,,EF	-- % ,, 
-!    '',,, - 	KKDQ(    	sP   B	A
 AA
  AA
 B	A
 A
 
	B)B<B	BB	c                 F   K   |j                  d       d{   }|S 7 w)z
        Exports the current page as a PDF.
        
        Args:
            page (Page): The Playwright page object
            
        Returns:
            bytes: The PDF data
        T)print_backgroundN)r  )rC   r(  r  s      rD   r  z)AsyncPlaywrightCrawlerStrategy.export_pdfW  s%      488 9s   !!c                    K   | j                  |       d{   }|s| j                  |       d{   S  | j                  |fi | d{   S 7 :7 !7 w)a  
        Take a screenshot of the current page.
        
        Args:
            page (Page): The Playwright page object
            kwargs: Additional keyword arguments
        
        Returns:
            str: The base64-encoded screenshot data
        N)page_need_scrolltake_screenshot_naivetake_screenshot_scroller)rC   r(  r=  need_scrolls       rD   r  z.AsyncPlaywrightCrawlerStrategy.take_screenshotd  sa      !11$7733D999 766tFvFFF 8 : Gs1   AAAAAAAAAr  c                   K   	 ddl m}  ||      }|d   j                  d      }t               }|j	                  |d       t        j                  |j                               j                  d      S # t        $ r}dt        |       }| j                  j                  dd	d
|i       t        j                  ddd      }t        j                   |      }	t#        j$                         }
|	j'                  d|d|
       t               }|j	                  |d       t        j                  |j                               j                  d      cY d}~S d}~ww xY ww)a
  
        Convert the first page of the PDF to a screenshot.     
        
        Requires pdf2image and poppler.
        
        Args:
            pdf_data (bytes): The PDF data
        
        Returns:
            str: The base64-encoded screenshot data
        r   )convert_from_bytesRGBJPEGformatr  z%Failed to take PDF-based screenshot: zPDF Screenshot failed: {error}ra   rh   rc   i   iX  blackcolorr   r      r6  r6  fillfontN)	pdf2imager+  convertr   savebase64	b64encodegetvaluern   rZ   rp   rA   rh   r   newr   Drawr   load_defaultr  )rC   r  r+  images	final_imgbufferedr^   error_messageimgdrawr9  s              rD   take_screenshot_from_pdfz7AsyncPlaywrightCrawlerStrategy.take_screenshot_from_pdfy  s/    	I4'1Fq	))%0IyHNN8FN3##H$5$5$78??HH 	ICCF8LMKK8/   ))E:W=C>>#&D))+DIIhO$IOyHHHXfH-##H$5$5$78??HH	Is6   EA0A5 4E5	E>CEEEEEc                   K   	 | j                  |       d{   }|d   }|d   }t        ||j                  dt                    }|j	                  ||d       d{    g }|j
                  }|d   }	||	z  dz   }
t        |
      D ]  }||	z  }|j                  d| d       d{    t        j                  d	       d{    |j                  d
       d{   }t        j                  t        |            j                  d      }|j                  |        t!        d |D              }t        j"                  d|d   j$                  |f      }d}|D ]4  }|j'                  |j                  d      d|f       ||j(                  z  }6 t               }|j                  d      }|j+                  |dd       t-        j.                  |j1                               j3                  d      }||j5                          d{    S 7 7 7 o7 U7 >7 # t6        $ r}dt9        |       }| j:                  j=                  ddd|i       t        j"                  ddd      }t?        j@                  |      }tC        jD                         }|jG                  d|d|       t               }|j+                  |d       t-        j.                  |j1                               j3                  d      cY d}~|j5                          d{  7   S d}~ww xY w# |j5                          d{  7   w xY ww) a  
        Attempt to set a large viewport and take a full-page screenshot.
        If still too large, segment the page as before.
        
        Requires pdf2image and poppler.
        
        Args:
            page (Page): The Playwright page object
            kwargs: Additional keyword arguments
            
        Returns:
            str: The base64-encoded screenshot data
        Nr   r   r  r   r   zwindow.scrollTo(0, )g{Gz?F	full_pager,  c              3   4   K   | ]  }|j                     y wr  )r   ).0rG  s     rD   	<genexpr>zJAsyncPlaywrightCrawlerStrategy.take_screenshot_scroller.<locals>.<genexpr>  s     >XcszzXs   r   BMPU   )r/  qualityr  z*Failed to take large viewport screenshot: z)Large viewport screenshot failed: {error}ra   rh   rc   r0  r1  r2  r4  r5  r7  r-  r.  )$r  r  ry   r   r  r  r   r|  rV   rY   r  r   r  r   r;  r|   sumr@  r   paster   r<  r=  r>  r?  rn   r+  rZ   rp   rA   rh   r   rA  r   rB  r  )rC   r(  r=  r  r  r  large_viewport_heightsegmentsr  r   num_segmentsr  y_offsetseg_shotrG  r  stitchedoffsetrE  encodedr^   rF  rH  r9  s                           rD   r(  z7AsyncPlaywrightCrawlerStrategy.take_screenshot_scroller  s    =	#77==J#G,J$X.K
 %(

8:TU%! (($0EF  
 H ..M+H5O'?:a?L<(.mm&9(1$EFFFmmD)))!%5!AAjj!23;;EB$ ) >X>>Lyy!):):L(IJHFs{{51Av;?#**$  
 yH''.HMM(5"M=&&x'8'8':;BB7KG" **,w > G)AD !  	IHQQMKKC/   ))E:W=C>>#&D))+DIIhO$IOyHHHXfH-##H$5$5$78??HH**,!	I  **,s   MH# HAH# HA
H# )H*H# HH#  H!DH# 9MH!MH# H# H# H# H# !M#	L",CL>L"?L% MLML""L% %M 9L<:M  Mc                    K   	 |j                  d       d{   }t        j                  |      j                  d      |j	                          d{    S 7 @7 # t
        $ r}dt        |       }| j                  j                  ddd|i	       t        j                  d
dd      }t        j                  |      }t        j                         }|j                  d|d|       t!               }|j#                  |d       t        j                  |j%                               j                  d      cY d}~|j	                          d{  7   S d}~ww xY w# |j	                          d{  7   w xY ww)z
        Takes a screenshot of the current page.

        Args:
            page (Page): The Playwright page instance

        Returns:
            str: Base64-encoded screenshot image
        FrL  Nr  zFailed to take screenshot: zScreenshot failed: {error}ra   rh   rc   r,  r0  r1  r2  r4  r5  r7  r-  r.  )r  r=  r>  rn   r+  rZ   rp   rA   rh   r   r@  r   rA  r   rB  r  r   r<  r?  )	rC   r(  r  r^   rF  rG  rH  r9  rE  s	            rD   r'  z4AsyncPlaywrightCrawlerStrategy.take_screenshot_naive  s=    	#??J##J/66w?& **,) @( %  	I9#a&BMKK4/   ))E:W=C>>#&D))+DIIhO$IOyHHHXfH-##H$5$5$78??HH**,%	I$ **,s   E>A A'A E>AE>A E>	E'CE9E:E  >E>EE>EE   E;4E75E;;E>r   c                    K   | j                   rF| j                   j                  |       d{   }| j                  j                  ddd|i       |S | j                  j	                  dd	       y7 Dw)
a,  
        Exports the current storage state (cookies, localStorage, sessionStorage)
        to a JSON file at the specified path.
        
        Args:
            path (str): The path to save the storage state JSON file
        
        Returns:
            dict: The exported storage state
        r   Nz Exported storage state to {path}rg   r   rc   z5No default_context available to export storage state.r[  r\  )r   r   rA   ro   r]  )rC   r   r  s      rD   export_storage_statez3AsyncPlaywrightCrawlerStrategy.export_storage_state  s{      ..<<$<GGEKK:~  
 LKKO    Hs   ,A5A3AA5rw  c                   K   	 |j                  d       d{    t        |t              r|g}n|}g }|D ]  }	 d}	 |j                  d| d       d{   }t        j                         }		 |j                  dd       d{    t        dt        j                         |	z
         |j                  |r|nddi        d|dS 7 7 u# t        $ r,}dt        |      v r| j
                  j                  dd       	 |j                  d	d
       d{  7   n?# t        $ r3}| j
                  j                  dddt        |      i       Y d}~nd}~ww xY w	 |j                  dd
       d{  7   n?# t        $ r3}| j
                  j                  dddt        |      i       Y d}~nd}~ww xY wddd}n7| j
                  j                  dddt        |      i       dt        |      d}Y d}~d}~ww xY w7 # t        $ r4}| j
                  j                  dddt        |      i       Y d}~d}~ww xY w# t        $ rQ}| j
                  j                  dddt        |      i       |j                  dt        |      d       Y d}~hd}~ww xY w# t        $ rA}| j
                  j                  dddt        |      i       dt        |      dcY d}~S d}~ww xY ww)a  
        Executes user-provided JavaScript code with proper error handling and context,
        supporting both synchronous and async user code, plus navigations.
        
        How it works:
        1. Wait for load state 'domcontentloaded'
        2. If js_code is a string, execute it directly
        3. If js_code is a list, execute each element in sequence
        4. Wait for load state 'networkidle'        
        5. Return results   
        
        Args:    
            page (Page): The Playwright page instance
            js_code (Union[str, List[str]]): The JavaScript code to execute
        
        Returns:
            Dict[str, Any]: The results of the execution
        r  Nzj
                        (async () => {
                            try {
                                a  
                                return { success: true };
                            } catch (err) {
                                return { success: false, error: err.toString(), stack: err.stack };
                            }
                        })();
                        zExecution context was destroyedz6Navigation triggered by script, waiting for load stater  )re   r  r  rn  zNavigation wait failed: {error}rh   rc   networkidlez!Network idle wait failed: {error}Tz6Navigation triggered, ignoring context destroyed error)r  ro   #Playwright execution error: {error}Fr  rh     ,DOM content loaded after script execution inz!DOM content load timeout: {error}r  zScript chunk failed: {error}r  results Script execution failed: {error})r  
isinstancerp   r|  r   rA   ro   r]  rh   r&  printr|   rZ   )
rC   r(  rw  scriptsrh  scriptr  r^   nav_errt1s
             rD   r  z9AsyncPlaywrightCrawlerStrategy.robust_execute_user_script%  s    &f	7**+=>>>'3'")!G!RH "F+I'+}} :! "( )	6 	( 	"Z B"667ISW6XXXLdiik\^N^_& NNV6)T9JKU "j  $88{ ?	" !  I<AF KK,,-ekt,u"&*&>&>vu&>&U U U#( " $ 3 3,M(1,3S\+B !4 !" !"""&*&>&>}V[&>&\ \ \#( " $ 3 3,O(1,3S\+B !4 !" !"" ,0(`&F !KK--(M$-(/Q'8 . 
 27Q%HFA IJ Y  ++$G )$+SV#4 ,  ( ! HKK%% >% 'Q0 & 
 NNus1v#FGGH  	7KK:Q(  
  %s1v66	7sR  K4J' C
!J' I
CCCI
0H
H%H
,I
J' 	K4
J' C
H*G?D DD G? 	E))EG?EG? E=6E97E=<G?=	F9)F4/G?4F99A G?9I
?HI
H

	I)I<I
II

	J$AJJ' J$$J' '	K106K,&K1'K4,K11K4c           
      $  K   	 |j                  d       d{    t        |t              r|g}n|}g }|D ]  }	 |j                  d| d       d{   }t	        j                         }|j                  dd       d{    t        dt	        j                         |z
         t	        j                         }|j                  dd       d{    t        d	t	        j                         |z
         |j                  |r|nd
di        d|dS 7 7 7 7 I# t        $ rQ}| j                  j                  dddt        |      i       |j                  dt        |      d       Y d}~<d}~ww xY w# t        $ rA}| j                  j                  dddt        |      i       dt        |      dcY d}~S d}~wt        $ rA}| j                  j                  dddt        |      i       dt        |      dcY d}~S d}~ww xY ww)aO  
        Executes user-provided JavaScript code with proper error handling and context.
        
        Args:
            page: Playwright page object
            js_code: Single JavaScript string or list of JavaScript code strings
            
        Returns:
            Dict containing execution status and results/errors
        r  Nz
                        (() => {
                            return new Promise((resolve) => {
                                try {
                                    const result = (function() {
                                        a'  
                                    })();
                                    
                                    // If result is a promise, wait for it
                                    if (result instanceof Promise) {
                                        result.then(() => {
                                            // Wait a bit for any triggered effects
                                            setTimeout(() => resolve({ success: true }), 100);
                                        }).catch(error => {
                                            resolve({
                                                success: false,
                                                error: error.toString(),
                                                stack: error.stack
                                            });
                                        });
                                    } else {
                                        // For non-promise results, still wait a bit for effects
                                        setTimeout(() => resolve({ success: true }), 100);
                                    }
                                } catch (error) {
                                    resolve({
                                        success: false,
                                        error: error.toString(),
                                        stack: error.stack
                                    });
                                }
                            });
                        })()
                    re  rn  rf  rb  z&Network idle after script execution inr  Trc  r  rh   rc   Frd  rg  ri  )r  rj  rp   r|  r&  rk  r|   r   rA   rh   rZ   )	rC   r(  rw  rl  rh  rm  r  ro  r^   s	            rD   execute_user_scriptz2AsyncPlaywrightCrawlerStrategy.execute_user_script  s    U	7**+=>>> '3'")!G!7H#'== 6)
 *0 1!2 !$ !FH B223Et2TTTH$))+XZJZ[B22=$2OOOBDIIKRTDTUNNV6)T9JK_ "t  $88G ?!J U P
  HKK%% E% 'Q0 & 
 NNus1v#FGGH  	7KK:Q(  
  %s1v66 	7KK:Q(  
  %s1v66	7s   HE; D!E; DD.DDADD<DE; HE; DDD	E8'AE3-E; 3E88E; ;	H6G :H;H H6HHHHHc                 @   K   |j                  d       d{   S 7 w)z
        Checks if an element is visible on the page.
        
        Args:
            page: Playwright page object
            
        Returns:
            Boolean indicating visibility
        a  
            () => {
                const element = document.body;
                if (!element) return false;
                const style = window.getComputedStyle(element);
                const isVisible = style.display !== 'none' && 
                                style.visibility !== 'hidden' && 
                                style.opacity !== '0';
                return isVisible;
            }
        Nr|  rC   r(  s     rD   r  z/AsyncPlaywrightCrawlerStrategy.check_visibility  s(      ]] 
$ 
 
 
	 
   xyr  c                    K   | j                  |||       d{   }|d   r|j                  |dz         d{    |S 7 '7 w)z
        Safely scroll the page with rendering time.
        
        Args:
            page: Playwright page object
            x: Horizontal scroll position
            y: Vertical scroll position
        Nr  r  )csp_scroll_tor!  )rC   r(  rv  rw  r  r  s         rD   r  z*AsyncPlaywrightCrawlerStrategy.safe_scroll  sM      ))$155)''555 65s   AA AAAAc                 n  K   	 |j                  d| d| d| d| d| d| d       d{   }|d	   s/| j                  j                  d
dd|j                  d      i       |S 7 :# t        $ rA}| j                  j                  dddt        |      i       dt        |      dcY d}~S d}~ww xY ww)aM  
        Performs a CSP-compliant scroll operation and returns the result status.
        
        Args:
            page: Playwright page object
            x: Horizontal scroll position
            y: Vertical scroll position
            
        Returns:
            Dict containing scroll status and position information
        z() => {
                    try {
                        const startX = window.scrollX;
                        const startY = window.scrollY;
                        window.scrollTo(z, a  );
                        
                        // Get final position after scroll
                        const endX = window.scrollX;
                        const endY = window.scrollY;
                        
                        return {
                            success: true,
                            startPosition: { x: startX, y: startY },
                            endPosition: { x: endX, y: endY },
                            targetPosition: { x: z, y: z\ },
                            delta: {
                                x: Math.abs(endX - z6),
                                y: Math.abs(endY - a  )
                            }
                        };
                    } catch (e) {
                        return {
                            success: false,
                            error: e.toString()
                        };
                    }
                }Nr  z Scroll operation failed: {error}SCROLLrh   rc   z!Failed to execute scroll: {error}Frd  )r|  rA   r]  ry   rZ   rh   rp   )rC   r(  rv  rw  r  r^   s         rD   ry  z,AsyncPlaywrightCrawlerStrategy.csp_scroll_to&  s     1	==) *+2aS 
13 45#U1# >4453 74453 	7# F< )$##> #VZZ%89 $  MKN  		KK;Q(   !Q 		sD   B5'A( A&9A( %B5&A( (	B216B-'B2(B5-B22B5c                 @   K   |j                  d       d{   S 7 w)z
        Get the dimensions of the page.
        
        Args:
            page: Playwright page object
            
        Returns:
            Dict containing width and height of the page
        z
            () => {
                const {scrollWidth, scrollHeight} = document.documentElement;
                return {width: scrollWidth, height: scrollHeight};
            }
        Nrs  rt  s     rD   r  z2AsyncPlaywrightCrawlerStrategy.get_page_dimensionse  s(      ]] $   	 ru  c           	         K   	 |j                  d       d{   }|S 7 # t        $ r3}| j                  j                  dddt	        |      i       Y d}~yd}~ww xY ww)z
        Determine whether the page need to scroll
        
        Args:
            page: Playwright page object
            
        Returns:
            bool: True if page needs scrolling
        z
            () => {
                const scrollHeight = document.documentElement.scrollHeight;
                const viewportHeight = window.innerHeight;
                return scrollHeight > viewportHeight;
            }
            NzDFailed to check scroll need: {error}. Defaulting to True for safety.r{  rh   rc   T)r|  rZ   rA   r]  rp   )rC   r(  r)  r^   s       rD   r&  z/AsyncPlaywrightCrawlerStrategy.page_need_scrollv  so     	 $ / ! K   	KK^Q(   
 	s6   A ! ! A ! 	A)AA AA )NN)r  )r   r  )4r   r   r   r   r!   r#   rE   rP  rU  r_   r+  rp   r,  r   rb  rX  rf  r   rh  r   r   ry  ru  r  r  r"   r   r>  r  r  r  r  bytesr  r  rI  r(  r'  r9  r`  r   r   r   r  rq  r  r   r  ry  r  r   r&  r   rF   rD   r@  r@    s   8 KO"
+"
<G"
H	
+<S <"@# @X @4)C )(
%C 
%
$sCx. 
>T >S >5 >@*d * *V[ *X>@ (9 9-= 9M_ 9vs#S s#2B s#HZ s#j8: 8:U 8:t/b$ 4 @T e Gs G*"Iu "I "IHK4 Kc KZ     Ds d 2y7T y7E#tTWy.DY y7^bcfhkck^l y7v`7d `7U3S	>=R `7W[\_ad\dWe `7D,d s s 5 = = = =c3h =~d "4 D rF   r@  )DrV   r=  r&  abcr   r   typingr   r   r   r   r	   r
   r   r   rw   r   rO   rS   r   r   r   r   r   r   r   r  ior   PILr   r   r   pathlibr   r   pydanticr   r  jsonr  
js_snippetr   modelsr   utilsr   user_agent_generatorr   r   r   r    async_configsr!   r"   async_loggerr#   playwright_stealthr$   r%   r  r&   stealth_configr   r8   r   r;  r@  r   rF   rD   <module>r     s       # H H H    W W G  + +  .     & & $ 4 E : % ; +#' .\ \~~# ~#B3 b%9 brF   