
import os
import sys
import time
import warnings
from enum import Enum
from colorama import init, Fore, Back, Style
from pathlib import Path
from typing import Optional, List, Union
import json
import asyncio
from contextlib import asynccontextmanager
from .models import CrawlResult, MarkdownGenerationResult
from .async_database import async_db_manager
from .chunking_strategy import *
from .content_filter_strategy import *
from .extraction_strategy import *
from .async_crawler_strategy import (
    AsyncCrawlerStrategy,
    AsyncPlaywrightCrawlerStrategy,
    AsyncCrawlResponse,
)
from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode
from .markdown_generation_strategy import (
    DefaultMarkdownGenerator,
    MarkdownGenerationStrategy,
)
from .content_scraping_strategy import WebScrapingStrategy
from .async_logger import AsyncLogger
from .async_configs import BrowserConfig, CrawlerRunConfig
from .config import (
    MIN_WORD_THRESHOLD,
    IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
    URL_LOG_SHORTEN_LENGTH,
)
from .utils import (
    sanitize_input_encode,
    InvalidCSSSelectorError,
    format_html,
    fast_format_html,
    create_box_message,
    get_error_context,
)
from urllib.parse import urlparse
import random
from .__version__ import __version__ as crawl4ai_version


class AsyncWebCrawler:
    """
    Asynchronous web crawler with flexible caching capabilities.
    
    There are two ways to use the crawler:

    1. Using context manager (recommended for simple cases):
        ```python
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(url="https://example.com")
        ```

    2. Using explicit lifecycle management (recommended for long-running applications):
        ```python
        crawler = AsyncWebCrawler()
        await crawler.start()
        
        # Use the crawler multiple times
        result1 = await crawler.arun(url="https://example.com")
        result2 = await crawler.arun(url="https://another.com")
        
        await crawler.close()
        ```
    
    Migration Guide:
    Old way (deprecated):
        crawler = AsyncWebCrawler(always_by_pass_cache=True, browser_type="chromium", headless=True)
    
    New way (recommended):
        browser_config = BrowserConfig(browser_type="chromium", headless=True)
        crawler = AsyncWebCrawler(config=browser_config)
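
    Note: if both a config object and legacy keyword arguments are supplied,
    the config object takes precedence and a warning is logged. For example
    (illustrative):
        crawler = AsyncWebCrawler(config=browser_config, headless=False)
        # headless=False is ignored; browser_config.headless wins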
    
    
    Attributes:
        browser_config (BrowserConfig): Configuration object for browser settings.
        crawler_strategy (AsyncCrawlerStrategy): Strategy for crawling web pages.
        logger (AsyncLogger): Logger instance for recording events and errors.
        always_bypass_cache (bool): Whether to always bypass cache.
        crawl4ai_folder (str): Directory for storing cache.
        base_directory (str): Base directory for storing cache.
        ready (bool): Whether the crawler is ready for use.
        
    Methods:
        start(): Start the crawler explicitly without using context manager.
        close(): Close the crawler explicitly without using context manager.
        arun(): Run the crawler for a single source: URL (web, local file, or raw HTML).
        awarmup(): Perform the warmup sequence.
        arun_many(): Run the crawler for multiple sources concurrently.
        aprocess_html(): Process HTML content into a CrawlResult.
    
    Typical Usage:
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(url="https://example.com")
            print(result.markdown)
            
        Using configuration:
        browser_config = BrowserConfig(browser_type="chromium", headless=True)
        async with AsyncWebCrawler(config=browser_config) as crawler:
            crawler_config = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS                
            )
            result = await crawler.arun(url="https://example.com", config=crawler_config)
            print(result.markdown)
    """

    _domain_last_hit = {}

    def __init__(
        self,
        crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
        config: Optional[BrowserConfig] = None,
        always_bypass_cache: bool = False,
        always_by_pass_cache: Optional[bool] = None,
        base_directory: str = str(
            os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())
        ),
        thread_safe: bool = False,
        **kwargs,
    ):
        """
        Initialize the AsyncWebCrawler.

        Args:
            crawler_strategy: Strategy for crawling web pages. If None, will create AsyncPlaywrightCrawlerStrategy
            config: Configuration object for browser settings. If None, will be created from kwargs
            always_bypass_cache: Whether to always bypass cache (new parameter)
            always_by_pass_cache: Deprecated, use always_bypass_cache instead
            base_directory: Base directory for storing cache
            thread_safe: Whether to use thread-safe operations
            **kwargs: Additional arguments for backwards compatibility
        Nc              3   &   K   | ]  }|v  
 y wN ).0kkwargss     N/var/www/openai/venv/lib/python3.12/site-packages/crawl4ai/async_webcrawler.py	<genexpr>z+AsyncWebCrawler.__init__.<locals>.<genexpr>   s     j(i11;(is   )browser_typeheadlessviewport_widthviewport_heightz`Both browser_config and legacy browser parameters provided. browser_config will take precedence.WARNINGmessagetagz	.crawl4aizcrawler.log
   )log_fileverbose	tag_width)browser_congiglogger)browser_configrG   warningTz'always_by_pass_cache' is deprecated and will be removed in version 0.5.0. Use 'always_bypass_cache' instead. Pass warning=False to suppress this warning.   
stacklevel)exist_okz/cacheFr4   )anyrG   rI   r   from_kwargsrH   r   ospathjoinrD   itemsr   r+   getwarningswarnDeprecationWarningr-   asyncioLock_lockcrawl4ai_foldermakedirsready)selfr+   r,   r-   r.   r/   r0   r7   rH   r6   vparamss          `    r8   __init__zAsyncWebCrawler.__init__k   s   .  %j(ijj##~! $  +66v>N, "WW\\.+}M''//
 #LLN
*DAqa3O.OAaCN 	 
 !1 !
4R 5
);;5
 5
 $$+++/;;D!!(  +zz)T*C '  (<D$':D$ (3W\\^
  "ww||NKH
D((48
t++,F3dC
G
s   -G":G"c                    K   | j                   j                          d{    | j                          d{    | S 7 7 w)a  
        Start the crawler explicitly without using context manager.
        This is equivalent to using 'async with' but gives more control over the lifecycle.
        
        This method will:
        1. Initialize the browser and context
        2. Perform warmup sequence
        3. Return the crawler instance for method chaining
        
        Returns:
            AsyncWebCrawler: The initialized crawler instance
        N)r+   
        """
        await self.crawler_strategy.__aenter__()
        await self.awarmup()
        return self

    async def close(self):
        """
        Close the crawler explicitly without using context manager.
        This should be called when you're done with the crawler if you used start().
        
        This method will:
        1. Clean up browser resources
        2. Close any open pages and contexts
        """
        await self.crawler_strategy.__aexit__(None, None, None)

    async def __aenter__(self):
        return await self.start()

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self.close()

    async def awarmup(self):
        """
        Initialize the crawler with warm-up sequence.
        
        This method:
        1. Logs initialization info
        2. Sets up browser configuration
        3. Marks the crawler as ready
        """
        self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT")
        self.ready = True

    @asynccontextmanager
    async def nullcontext(self):
        """Async no-op context manager, used when no thread-safety lock is set."""
        yield

    async def arun(
        self,
        url: str,
        config: CrawlerRunConfig = None,
        # Legacy parameters, maintained for backwards compatibility
        word_count_threshold=MIN_WORD_THRESHOLD,
        extraction_strategy: ExtractionStrategy = None,
        chunking_strategy: ChunkingStrategy = RegexChunking(),
        content_filter: RelevantContentFilter = None,
        cache_mode: Optional[CacheMode] = None,
        # Deprecated cache boolean flags
        bypass_cache: bool = False,
        disable_cache: bool = False,
        no_cache_read: bool = False,
        no_cache_write: bool = False,
        # Other legacy parameters
        css_selector: str = None,
        screenshot: bool = False,
        pdf: bool = False,
        user_agent: str = None,
        verbose=True,
        **kwargs,
    ) -> CrawlResult:
        """
            Runs the crawler for a single source: URL (web, local file, or raw HTML).

            Migration Guide:
            Old way (deprecated):
                result = await crawler.arun(
                    url="https://example.com",
                    word_count_threshold=200,
                    screenshot=True,
                    ...
                )

            New way (recommended):
                config = CrawlerRunConfig(
                    word_count_threshold=200,
                    screenshot=True,
                    ...
                )
                result = await crawler.arun(url="https://example.com", config=config)
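
            Cache control example (illustrative; CacheMode values such as
            BYPASS, ENABLED, and DISABLED come from crawl4ai's cache_context):
                config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
                result = await crawler.arun(url="https://example.com", config=config)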

            Args:
                url: The URL to crawl (http://, https://, file://, or raw:)
                config: Configuration object controlling crawl behavior
                [other parameters maintained for backwards compatibility]

            Returns:
                CrawlResult: The result of crawling and processing
            """
        crawler_config = config
        if not isinstance(url, str) or not url:
            raise ValueError("Invalid URL, make sure the URL is a non-empty string")

        async with self._lock or self.nullcontext():
            try:
                # Resolve configuration: an explicit CrawlerRunConfig wins over
                # the legacy keyword parameters.
                if crawler_config is not None:
                    config = crawler_config
                else:
                    config_kwargs = {
                        "word_count_threshold": word_count_threshold,
                        "extraction_strategy": extraction_strategy,
                        "chunking_strategy": chunking_strategy,
                        "content_filter": content_filter,
                        "cache_mode": cache_mode,
                        "bypass_cache": bypass_cache,
                        "disable_cache": disable_cache,
                        "no_cache_read": no_cache_read,
                        "no_cache_write": no_cache_write,
                        "css_selector": css_selector,
                        "screenshot": screenshot,
                        "pdf": pdf,
                        "verbose": verbose,
                        **kwargs,
                    }
                    config = CrawlerRunConfig.from_kwargs(config_kwargs)

                # Handle deprecated cache boolean flags
                if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]):
                    if kwargs.get("warning", True):
                        warnings.warn(
                            "Cache control boolean flags are deprecated and will be removed in version 0.5.0. "
                            "Use 'cache_mode' parameter instead.",
                            DeprecationWarning,
                            stacklevel=2,
                        )
                    # Convert the legacy flags if cache_mode was not provided
                    if config.cache_mode is None:
                        config.cache_mode = _legacy_to_cache_mode(
                            disable_cache=disable_cache,
                            bypass_cache=bypass_cache,
                            no_cache_read=no_cache_read,
                            no_cache_write=no_cache_write,
                        )

                # Default to ENABLED if no cache mode was specified
                if config.cache_mode is None:
                    config.cache_mode = CacheMode.ENABLED

                # The cache context decides whether this call may read/write cache
                cache_context = CacheContext(url, config.cache_mode, self.always_bypass_cache)

                async_response: AsyncCrawlResponse = None
                cached_result: CrawlResult = None
                screenshot_data = None
                pdf_data = None
                extracted_content = None
                html = None
                start_time = time.perf_counter()

                # Try to get a cached result if appropriate
                if cache_context.should_read():
                    cached_result = await async_db_manager.aget_cached_url(url)

                if cached_result:
                    html = sanitize_input_encode(cached_result.html)
                    extracted_content = sanitize_input_encode(cached_result.extracted_content or "")
                    extracted_content = (
                        None
                        if not extracted_content or extracted_content == "[]"
                        else extracted_content
                    )
                    screenshot_data = cached_result.screenshot
                    pdf_data = cached_result.pdf
                    # If a screenshot or PDF was requested but is missing from
                    # the cached entry, treat the lookup as a miss and re-crawl.
                    if (config.screenshot and not screenshot_data) or (config.pdf and not pdf_data):
                        cached_result = None

                    self.logger.url_status(
                        url=cache_context.display_url,
                        success=bool(html),
                        timing=time.perf_counter() - start_time,
                        tag="FETCH",
                    )

                # Fetch fresh content if needed
                if not cached_result or not html:
                    t1 = time.perf_counter()

                    if user_agent:
                        self.crawler_strategy.update_user_agent(user_agent)

                    async_response = await self.crawler_strategy.crawl(url, config=config)

                    html = sanitize_input_encode(async_response.html)
                    screenshot_data = async_response.screenshot
                    pdf_data = async_response.pdf_data

                    t2 = time.perf_counter()
                    self.logger.url_status(
                        url=cache_context.display_url,
                        success=bool(html),
                        timing=t2 - t1,
                        tag="FETCH",
                    )

                    # Process the fetched HTML into a CrawlResult
                    crawl_result = await self.aprocess_html(
                        url=url,
                        html=html,
                        extracted_content=extracted_content,
                        config=config,
                        screenshot=screenshot_data,
                        pdf_data=pdf_data,
                        verbose=config.verbose,
                        is_raw_html=True if url.startswith("raw:") else False,
                        **kwargs,
                    )

                    crawl_result.status_code = async_response.status_code
                    crawl_result.response_headers = async_response.response_headers
                    crawl_result.downloaded_files = async_response.downloaded_files
                    crawl_result.ssl_certificate = async_response.ssl_certificate
                    crawl_result.success = bool(html)
                    crawl_result.session_id = getattr(config, "session_id", None)

                    self.logger.success(
                        message="{url:.50}... | Status: {status} | Total: {timing}",
                        tag="COMPLETE",
                        params={
                            "url": cache_context.display_url,
                            "status": crawl_result.success,
                            "timing": f"{time.perf_counter() - start_time:.2f}s",
                        },
                        colors={
                            "status": Fore.GREEN if crawl_result.success else Fore.RED,
                            "timing": Fore.YELLOW,
                        },
                    )

                    # Update the cache if this context allows writing
                    if cache_context.should_write() and not bool(cached_result):
                        await async_db_manager.acache_url(crawl_result)

                    return crawl_result

                else:
                    # Serve the result straight from cache
                    self.logger.success(
                        message="{url:.50}... | Status: {status} | Total: {timing}",
                        tag="COMPLETE",
                        params={
                            "url": cache_context.display_url,
                            "status": True,
                            "timing": f"{time.perf_counter() - start_time:.2f}s",
                        },
                        colors={"status": Fore.GREEN, "timing": Fore.YELLOW},
                    )

                    cached_result.success = bool(html)
                    cached_result.session_id = getattr(config, "session_id", None)
                    return cached_result

            except Exception as e:
                error_context = get_error_context(sys.exc_info())

                error_message = (
                    f"Unexpected error in _crawl_web at line {error_context['line_no']} "
                    f"in {error_context['function']} ({error_context['filename']}):\n"
                    f"Error: {str(e)}\n\n"
                    f"Code context:\n{error_context['code_context']}"
                )

                self.logger.error_status(
                    url=url,
                    error=create_box_message(error_message, type="error"),
                    tag="ERROR",
                )

                return CrawlResult(url=url, html="", success=False, error_message=error_message)

    async def aprocess_html(
        self,
        url: str,
        html: str,
        extracted_content: str,
        config: CrawlerRunConfig,
        screenshot: str,
        pdf_data: str,
        verbose: bool,
        **kwargs,
    ) -> CrawlResult:
        """
            Process HTML content using the provided configuration.
            
            Args:
                url: The URL being processed
                html: Raw HTML content
                extracted_content: Previously extracted content (if any)
                config: Configuration object controlling processing behavior
                screenshot: Screenshot data (if any)
                pdf_data: PDF data (if any)
                verbose: Whether to enable verbose logging
                **kwargs: Additional parameters for backwards compatibility
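
            Example (illustrative; this method is normally invoked internally
            by arun() rather than called directly). A "raw:" URL routes an
            in-memory HTML string through this processing path:
                raw = "raw:<html><body><h1>Hello</h1></body></html>"
                result = await crawler.arun(url=raw)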
            
            Returns:
                CrawlResult: Processed result containing extracted and formatted content
            """
        try:
            _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
            t1 = time.perf_counter()

            # Initialize the scraping strategy
            scrapping_strategy = WebScrapingStrategy(logger=self.logger)

            # Build scraping parameters from the config, letting extra kwargs
            # fill in anything the config does not carry.
            params = {k: v for k, v in config.to_dict().items() if k not in ["url"]}
            params.update({k: v for k, v in kwargs.items() if k not in params.keys()})

            result = scrapping_strategy.scrap(url, html, **params)

            if result is None:
                raise ValueError(f"Process HTML, Failed to extract content from the website: {url}")

        except InvalidCSSSelectorError as e:
            raise ValueError(str(e))
        except Exception as e:
            raise ValueError(
                f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}"
            )

        # Extract results from the scraping strategy's output
        cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
        fit_markdown = sanitize_input_encode(result.get("fit_markdown", ""))
        fit_html = sanitize_input_encode(result.get("fit_html", ""))
        media = result.get("media", [])
        links = result.get("links", [])
        metadata = result.get("metadata", {})

        # Generate markdown from the cleaned HTML
        markdown_generator: MarkdownGenerationStrategy = (
            config.markdown_generator or DefaultMarkdownGenerator()
        )
        markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown(
            cleaned_html=cleaned_html,
            base_url=url,
        )
        markdown_v2 = markdown_result
        markdown = sanitize_input_encode(markdown_result.raw_markdown)

        self.logger.info(
            message="Processed {url:.50}... | Time: {timing}ms",
            tag="SCRAPE",
            params={"url": _url, "timing": int((time.perf_counter() - t1) * 1000)},
        )

        # Run the extraction strategy, unless content was already extracted
        # (e.g. served from cache) or extraction is disabled.
        if (
            extracted_content is None
            and config.extraction_strategy
            and config.chunking_strategy
            and not isinstance(config.extraction_strategy, NoExtractionStrategy)
        ):
            t1 = time.perf_counter()

            # Choose the content to feed the extraction strategy
            content_format = config.extraction_strategy.input_format
            if content_format == "fit_markdown" and not markdown_result.fit_markdown:
                self.logger.warning(
                    message="Fit markdown requested but not available. Falling back to raw markdown.",
                    tag="EXTRACT",
                    params={"url": _url},
                )
                content_format = "markdown"

            content = {
                "markdown": markdown,
                "html": html,
                "fit_markdown": markdown_result.raw_markdown,
            }.get(content_format, markdown)

            # Use IdentityChunking for HTML input; otherwise use the configured strategy
            chunking = IdentityChunking() if content_format == "html" else config.chunking_strategy
            sections = chunking.chunk(content)
            extracted_content = config.extraction_strategy.run(url, sections)
            extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False)

            self.logger.info(
                message="Completed for {url:.50}... | Time: {timing}s",
                tag="EXTRACT",
                params={"url": _url, "timing": time.perf_counter() - t1},
            )

        # Normalize screenshot and PDF payloads
        screenshot_data = None if not screenshot else screenshot
        pdf_data = None if not pdf_data else pdf_data

        # Apply HTML formatting if requested
        if config.prettiify:
            cleaned_html = fast_format_html(cleaned_html)

        # Return the complete crawl result
        return CrawlResult(
            url=url,
            html=html,
            cleaned_html=cleaned_html,
            markdown_v2=markdown_v2,
            markdown=markdown,
            fit_markdown=fit_markdown,
            fit_html=fit_html,
            media=media,
            links=links,
            metadata=metadata,
            screenshot=screenshot_data,
            pdf=pdf_data,
            extracted_content=extracted_content,
            success=True,
            error_message="",
        )

    async def arun_many(
        self,
        urls: List[str],
        config: Optional[CrawlerRunConfig] = None,
        # Legacy parameters, maintained for backwards compatibility
        word_count_threshold=MIN_WORD_THRESHOLD,
        extraction_strategy: ExtractionStrategy = None,
        chunking_strategy: ChunkingStrategy = RegexChunking(),
        content_filter: RelevantContentFilter = None,
        cache_mode: Optional[CacheMode] = None,
        bypass_cache: bool = False,
        css_selector: str = None,
        screenshot: bool = False,
        pdf: bool = False,
        user_agent: str = None,
        verbose=True,
        **kwargs,
    ) -> List[CrawlResult]:
        """
            Runs the crawler for multiple URLs concurrently.

            Migration Guide:
            Old way (deprecated):
                results = await crawler.arun_many(
                    urls,
                    word_count_threshold=200,
                    screenshot=True,
                    ...
                )
            
            New way (recommended):
                config = CrawlerRunConfig(
                    word_count_threshold=200,
                    screenshot=True,
                    ...
                )
                results = await crawler.arun_many(urls, config=config)
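
            Concurrency tuning (illustrative sketch): the number of
            simultaneous crawls is bounded by config.semaphore_count, and
            per-domain politeness delays are drawn from config.mean_delay
            and config.max_range:
                config = CrawlerRunConfig(semaphore_count=5, mean_delay=0.5, max_range=0.3)
                results = await crawler.arun_many(urls, config=config)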

            Args:
                urls: List of URLs to crawl
                config: Configuration object controlling crawl behavior for all URLs
                [other parameters maintained for backwards compatibility]
            
            Returns:
                List[CrawlResult]: Results for each URL
            """
        crawler_config = config
        if crawler_config is not None:
            # Warn when a config object and legacy parameters are both supplied
            if any(
                param is not None
                for param in [
                    extraction_strategy,
                    content_filter,
                    cache_mode,
                    css_selector,
                    user_agent,
                ]
            ) or any([bypass_cache, screenshot, pdf]):
                self.logger.warning(
                    message="Both crawler_config and legacy parameters provided. crawler_config will take precedence.",
                    tag="WARNING",
                )
            config = crawler_config
        else:
            # Merge the legacy parameters into a single config object
            config = CrawlerRunConfig(
                word_count_threshold=word_count_threshold,
                extraction_strategy=extraction_strategy,
                chunking_strategy=chunking_strategy,
                content_filter=content_filter,
                cache_mode=cache_mode,
                bypass_cache=bypass_cache,
                css_selector=css_selector,
                screenshot=screenshot,
                pdf=pdf,
                verbose=verbose,
                **kwargs,
            )

        # Handle the deprecated bypass_cache flag
        if bypass_cache:
            if kwargs.get("warning", True):
                warnings.warn(
                    "'bypass_cache' is deprecated and will be removed in version 0.5.0. "
                    "Use 'cache_mode=CacheMode.BYPASS' instead. "
                    "Pass warning=False to suppress this warning.",
                    DeprecationWarning,
                    stacklevel=2,
                )
            if config.cache_mode is None:
                config.cache_mode = CacheMode.BYPASS

        semaphore_count = config.semaphore_count or 5
        semaphore = asyncio.Semaphore(semaphore_count)

        async def crawl_with_semaphore(url):
            # Rate limiting is applied per domain
            domain = urlparse(url).netloc
            current_time = time.time()

            self.logger.debug(
                message="Started task for {url:.50}...",
                tag="PARALLEL",
                params={"url": url},
            )

            # Delay settings come from the config
            mean_delay = config.mean_delay
            max_range = config.max_range

            # If this domain was hit recently, wait before hitting it again
            if domain in self._domain_last_hit:
                time_since_last = current_time - self._domain_last_hit[domain]
                if time_since_last < mean_delay:
                    delay = mean_delay + random.uniform(0, max_range)
                    await asyncio.sleep(delay)

            self._domain_last_hit[domain] = current_time

            async with semaphore:
                return await self.arun(
                    url,
                    config=config,  # pass the config object directly
                    **kwargs,
                )

        self.logger.info(
            message="Starting concurrent crawling for {count} URLs...",
            tag="INIT",
            params={"count": len(urls)},
        )

        start_time = time.perf_counter()
        tasks = [crawl_with_semaphore(url) for url in urls]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        end_time = time.perf_counter()

        self.logger.success(
            message="Concurrent crawling completed for {count} URLs | Total time: {timing}",
            tag="COMPLETE",
            params={
                "count": len(urls),
                "timing": f"{end_time - start_time:.2f}s",
            },
            colors={"timing": Fore.YELLOW},
        )

        return [result if not isinstance(result, Exception) else str(result) for result in results]

    async def aclear_cache(self):
        """Clear the cache database."""
        await async_db_manager.cleanup()

    async def aflush_cache(self):
        """Flush the cache database."""
        await async_db_manager.aflush_db()

    async def aget_cache_size(self):
        """Get the total number of cached items."""
        return await async_db_manager.aget_total_count()