
    gz                         d dl mZmZmZmZmZmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ ddlmZmZ  G d d	      Z G d
 d      Zy)   )MIN_WORD_THRESHOLD$IMAGE_DESCRIPTION_MIN_WORD_THRESHOLDSCREENSHOT_HEIGHT_TRESHOLDPAGE_TIMEOUTIMAGE_SCORE_THRESHOLDSOCIAL_MEDIA_DOMAINS)UserAgentGenerator)ExtractionStrategy)ChunkingStrategy)MarkdownGenerationStrategy    )UnionListc            5           e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d!dedededededed	ed
ededededededededededededededededededef4dZ	e
dedd fd        Zy)"BrowserConfiga  
    Configuration class for setting up a browser instance and its context in AsyncPlaywrightCrawlerStrategy.

    This class centralizes all parameters that affect browser and context creation. Instead of passing
    scattered keyword arguments, users can instantiate and modify this configuration object. The crawler
    code will then reference these settings to initialize the browser in a consistent, documented manner.

    Attributes:
        browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit".
                            Default: "chromium".
        headless (bool): Whether to run the browser in headless mode (no visible GUI).
                         Default: True.
        use_managed_browser (bool): Launch the browser using a managed approach (e.g., via CDP), allowing
                                    advanced manipulation. Default: False.
        debugging_port (int): Port for the browser debugging protocol. Default: 9222.
        use_persistent_context (bool): Use a persistent browser context (like a persistent profile).
                                       Automatically sets use_managed_browser=True. Default: False.
        user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a
                                     temporary directory may be used. Default: None.
        chrome_channel (str): The Chrome channel to launch (e.g., "chrome", "msedge"). Only applies if browser_type
                              is "chromium". Default: "chromium".
        channel (str): The channel to launch (e.g., "chromium", "chrome", "msedge"). Only applies if browser_type
                              is "chromium". Default: "chromium".
        proxy (str or None): Proxy server URL (e.g., "http://username:password@proxy:port"). If None, no proxy is used.
                             Default: None.
        proxy_config (dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
                                     If None, no additional proxy config. Default: None.
        viewport_width (int): Default viewport width for pages. Default: 1080.
        viewport_height (int): Default viewport height for pages. Default: 600.
        verbose (bool): Enable verbose logging.
                        Default: True.
        accept_downloads (bool): Whether to allow file downloads. If True, requires a downloads_path.
                                 Default: False.
        downloads_path (str or None): Directory to store downloaded files. If None and accept_downloads is True,
                                      a default path will be created. Default: None.
        storage_state (str or dict or None): Path or object describing storage state (cookies, localStorage).
                                             Default: None.
        ignore_https_errors (bool): Ignore HTTPS certificate errors. Default: True.
        java_script_enabled (bool): Enable JavaScript execution in pages. Default: True.
        cookies (list): List of cookies to add to the browser context. Each cookie is a dict with fields like
                        {"name": "...", "value": "...", "url": "..."}.
                        Default: [].
        headers (dict): Extra HTTP headers to apply to all requests in this context.
                        Default: {}.
        user_agent (str): Custom User-Agent string to use. Default: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                           "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36".
        user_agent_mode (str or None): Mode for generating the user agent (e.g., "random"). If None, use the provided
                                       user_agent as-is. Default: None.
        user_agent_generator_config (dict or None): Configuration for user agent generation if user_agent_mode is set.
                                                    Default: None.
        text_mode (bool): If True, disables images and other rich content for potentially faster load times.
                          Default: False.
        light_mode (bool): Disables certain background features for performance gains. Default: False.
        extra_args (list): Additional command-line arguments passed to the browser.
                           Default: [].
    Nbrowser_typeheadlessuse_managed_browseruse_persistent_contextuser_data_dirchrome_channelchannelproxyproxy_configviewport_widthviewport_heightaccept_downloadsdownloads_pathignore_https_errorsjava_script_enabledsleep_on_closeverbosecookiesheaders
user_agentuser_agent_modeuser_agent_generator_config	text_mode
light_mode
extra_argsdebugging_portc                 t   || _         || _        || _        || _        || _        |xs | j                   xs d| _        |xs | j                   xs d| _        || _        |	| _        |
| _	        || _
        || _        || _        || _        || _        || _        ||ng | _        ||ni | _        || _        || _        || _        || _        || _        ||ng | _        || _        || _        || _        t7               }| j&                  dk7  r2| j(                  r& |j8                  di | j(                  xs i | _        n&| j&                  dk(  r|j9                         | _        n	 |j;                  | j$                        | _        | j"                  j?                  d| j<                         | j                  rd| _        y y )Nchromiumrandomz	sec-ch-uaT ) r   r   r   r   r   r   r   r   r   r   r   r   r   storage_stater   r    r#   r$   r%   r&   r'   r(   r)   r*   r!   r"   r+   r	   generategenerate_client_hintsbrowser_hint
setdefault)selfr   r   r   r   r   r   r   r   r   r   r   r   r   r0   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   user_agenr_generators                                K/var/www/openai/venv/lib/python3.12/site-packages/crawl4ai/async_configs.py__init__zBrowserConfig.__init__K   s   B ) #6 &<#*,O0A0AOZA$"3"3Az
(,. 0,*#6 #6 ")"5w2")"5w2$.+F("$(2(>*B,,138+0P0P;2;; 339rDO !!X-2;;=DO0FFtWT->->? &&'+D$ '    kwargsreturnc           	         t        di d| j                  dd      d| j                  dd      d| j                  dd      d| j                  dd      d| j                  d      d	| j                  d	d      d
| j                  d
d      d| j                  d      d| j                  d      d| j                  dd      d| j                  dd      d| j                  dd      d| j                  d      d| j                  d      d| j                  dd      d| j                  dd      d| j                  dg       d| j                  di       d| j                  dd      d| j                  d      d| j                  d      d| j                  dd      d| j                  dd      d| j                  dg       S ) Nr   r-   r   Tr   Fr   r   r   r   r   r   r   8  r   X  r   r   r0   r   r    r#   r$   r%   zuMozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36r&   r'   r(   r)   r*   r/   )r   getr:   s    r7   from_kwargszBrowserConfig.from_kwargs   s    
NJ?
ZZ
D1
 !'

+@% H
 $*::.F#N	

 !**_5
 "::&6
C
 JJy*5
 **W%
  N3
 "::&6=
 #JJ'8#>
 $ZZ(:EB
 "::&67
 !**_5
 !'

+@$ G
  !'

+@$ G!
" JJy"-#
$ JJy"-%
& zzX'
0 #JJ'891
2 )/

3P(Q3
4 jje45
6 zz,67
8 zz,39
 	
r9   )r-   TFFNr-   r-   NNr=   r>   FNNTTFTNNzMozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47NNFFNi$  )__name__
__module____qualname____doc__strbooldictintlistr8   staticmethodrA   r/   r9   r7   r   r      s   7v '$)',!(!!""!&"$($($W  $,0 #?L,L, L, "	L,
 !%L, L, L, L, L, L, L, L, L, L,  "!L," "#L,$ %L,& 'L,( )L,* +L,, -L,4 5L,6 &*7L,8 9L,: ;L,< =L,> ?L,\ 
D 
_ 
 
r9   r   c            g          e Zd ZdZeddddddddddddddddddddeddddddddd	dd
ddddddddedeedddddd	ddf5de	de
dededededededededededededededededede	ded ed!ed"ed#ed$e	d%eeee   f   d&ed'ed(ed)ed*ed+ed,ed-ed.ed/ed0ed1ed2e	d3ed4e	d5e	d6ed7ed8ed9ed:ed;ed<ed=effd>Zed?ed@d fdA       ZdB Zy)CCrawlerRunConfiga   
    Configuration class for controlling how the crawler runs each crawl operation.
    This includes parameters for content extraction, page manipulation, waiting conditions,
    caching, and other runtime behaviors.

    This centralizes parameters that were previously scattered as kwargs to `arun()` and related methods.
    By using this class, you have a single place to understand and adjust the crawling options.

    Attributes:
        # Content Processing Parameters
        word_count_threshold (int): Minimum word count threshold before processing content.
                                    Default: MIN_WORD_THRESHOLD (typically 200).
        extraction_strategy (ExtractionStrategy or None): Strategy to extract structured data from crawled pages.
                                                          Default: None (NoExtractionStrategy is used if None).
        chunking_strategy (ChunkingStrategy): Strategy to chunk content before extraction.
                                              Default: RegexChunking().
        markdown_generator (MarkdownGenerationStrategy): Strategy for generating markdown.
                                                         Default: None.
        content_filter (RelevantContentFilter or None): Optional filter to prune irrelevant content.
                                                        Default: None.
        only_text (bool): If True, attempt to extract text-only content where applicable.
                          Default: False.
        css_selector (str or None): CSS selector to extract a specific portion of the page.
                                    Default: None.
        excluded_tags (list of str or None): List of HTML tags to exclude from processing.
                                             Default: None.
        excluded_selector (str or None): CSS selector to exclude from processing.
                                         Default: None.
        keep_data_attributes (bool): If True, retain `data-*` attributes while removing unwanted attributes.
                                     Default: False.
        remove_forms (bool): If True, remove all `<form>` elements from the HTML.
                             Default: False.
        prettiify (bool): If True, apply `fast_format_html` to produce prettified HTML output.
                          Default: False.
        parser_type (str): Type of parser to use for HTML parsing.
                           Default: "lxml".

        # Caching Parameters
        cache_mode (CacheMode or None): Defines how caching is handled.
                                        If None, defaults to CacheMode.ENABLED internally.
                                        Default: None.
        session_id (str or None): Optional session ID to persist the browser context and the created
                                  page instance. If the ID already exists, the crawler does not
                                  create a new page and uses the current page to preserve the state.
        bypass_cache (bool): Legacy parameter, if True acts like CacheMode.BYPASS.
                             Default: False.
        disable_cache (bool): Legacy parameter, if True acts like CacheMode.DISABLED.
                              Default: False.
        no_cache_read (bool): Legacy parameter, if True acts like CacheMode.WRITE_ONLY.
                              Default: False.
        no_cache_write (bool): Legacy parameter, if True acts like CacheMode.READ_ONLY.
                               Default: False.

        # Page Navigation and Timing Parameters
        wait_until (str): The condition to wait for when navigating, e.g. "domcontentloaded".
                          Default: "domcontentloaded".
        page_timeout (int): Timeout in ms for page operations like navigation.
                            Default: 60000 (60 seconds).
        wait_for (str or None): A CSS selector or JS condition to wait for before extracting content.
                                Default: None.
        wait_for_images (bool): If True, wait for images to load before extracting content.
                                Default: False.
        delay_before_return_html (float): Delay in seconds before retrieving final HTML.
                                          Default: 0.1.
        mean_delay (float): Mean base delay between requests when calling arun_many.
                            Default: 0.1.
        max_range (float): Max random additional delay range for requests in arun_many.
                           Default: 0.3.
        semaphore_count (int): Number of concurrent operations allowed.
                               Default: 5.

        # Page Interaction Parameters
        js_code (str or list of str or None): JavaScript code/snippets to run on the page.
                                              Default: None.
        js_only (bool): If True, indicates subsequent calls are JS-driven updates, not full page loads.
                        Default: False.
        ignore_body_visibility (bool): If True, ignore whether the body is visible before proceeding.
                                       Default: True.
        scan_full_page (bool): If True, scroll through the entire page to load all content.
                               Default: False.
        scroll_delay (float): Delay in seconds between scroll steps if scan_full_page is True.
                              Default: 0.2.
        process_iframes (bool): If True, attempts to process and inline iframe content.
                                Default: False.
        remove_overlay_elements (bool): If True, remove overlays/popups before extracting HTML.
                                        Default: False.
        simulate_user (bool): If True, simulate user interactions (mouse moves, clicks) for anti-bot measures.
                              Default: False.
        override_navigator (bool): If True, overrides navigator properties for more human-like behavior.
                                   Default: False.
        magic (bool): If True, attempts automatic handling of overlays/popups.
                      Default: False.
        adjust_viewport_to_content (bool): If True, adjust viewport according to the page content dimensions.
                                           Default: False.

        # Media Handling Parameters
        screenshot (bool): Whether to take a screenshot after crawling.
                           Default: False.
        screenshot_wait_for (float or None): Additional wait time before taking a screenshot.
                                             Default: None.
        screenshot_height_threshold (int): Threshold for page height to decide screenshot strategy.
                                           Default: SCREENSHOT_HEIGHT_TRESHOLD (from config, e.g. 20000).
        pdf (bool): Whether to generate a PDF of the page.
                    Default: False.
        image_description_min_word_threshold (int): Minimum words for image description extraction.
                                                    Default: IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD (e.g., 50).
        image_score_threshold (int): Minimum score threshold for processing an image.
                                     Default: IMAGE_SCORE_THRESHOLD (e.g., 3).
        exclude_external_images (bool): If True, exclude all external images from processing.
                                         Default: False.

        # Link and Domain Handling Parameters
        exclude_social_media_domains (list of str): List of domains to exclude for social media links.
                                                    Default: SOCIAL_MEDIA_DOMAINS (from config).
        exclude_external_links (bool): If True, exclude all external links from the results.
                                       Default: False.
        exclude_social_media_links (bool): If True, exclude links pointing to social media domains.
                                           Default: False.
        exclude_domains (list of str): List of specific domains to exclude from results.
                                       Default: [].

        # Debugging and Logging Parameters
        verbose (bool): Enable verbose logging.
                        Default: True.
        log_console (bool): If True, log console messages from the page.
                            Default: False.
    NFlxmldomcontentloaded皙?333333?   T皙?word_count_thresholdextraction_strategychunking_strategymarkdown_generator	only_textcss_selectorexcluded_tagsexcluded_selectorkeep_data_attributesremove_forms	prettiifyparser_typefetch_ssl_certificate
session_idbypass_cachedisable_cacheno_cache_readno_cache_write
wait_untilpage_timeoutwait_forwait_for_imagesdelay_before_return_html
mean_delay	max_rangesemaphore_countjs_codejs_onlyignore_body_visibilityscan_full_pagescroll_delayprocess_iframesremove_overlay_elementssimulate_useroverride_navigatormagicadjust_viewport_to_content
screenshotscreenshot_wait_forscreenshot_height_thresholdpdf$image_description_min_word_thresholdimage_score_thresholdexclude_external_imagesexclude_social_media_domainsexclude_external_linksexclude_social_media_linksexclude_domainsr"   log_consoleurlc6                    |5| _         || _        || _        || _        || _        || _        || _        || _        |xs g | _        |	xs d| _	        |
| _
        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        | | _         |!| _!        |"| _"        |#| _#        |$| _$        |%| _%        |&| _&        |'| _'        |(| _(        |)| _)        |*| _*        |+| _+        |,| _,        |-| _-        |.| _.        |/xs t^        | _0        |0| _1        |1| _2        |2xs g | _3        |3| _4        |4| _5        | j                  %tm        | j                  tn              stq        d      | j                  %tm        | j                  tr              stq        d      | j                  ddlm:}6  |6       | _        y y )N z=extraction_strategy must be an instance of ExtractionStrategyz9chunking_strategy must be an instance of ChunkingStrategyr   )RegexChunking);r   rT   rU   rV   rW   content_filterrX   rY   rZ   r[   r\   r]   r^   r_   r`   
cache_modera   rb   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   rm   rn   ro   rp   rq   rr   rs   rt   ru   rv   rw   rx   ry   rz   r{   r|   r}   r~   r   r   r   r   r   r   r"   r   
isinstancer
   
ValueErrorr   r   )7r5   rT   rU   rV   rW   r   rX   rY   rZ   r[   r\   r]   r^   r_   r`   r   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   rm   rn   ro   rp   rq   rr   rs   rt   ru   rv   rw   rx   ry   rz   r{   r|   r}   r~   r   r   r   r   r   r"   r   r   r   s7                                                          r7   r8   zCrawlerRunConfig.__init__<  s@   P  %9!#6 !2"4,"(*0b!2!8b$8!("& &;" %$(**, %( .(@%$". &<#,(.'>$*"4
*D' %#6 +F(4X1%:"'>$ -I,`L`)&<#*D'.4" & ##/
$$&89
 \]]!!-j""$47
 XYY !!)8%2_D" *r9   r:   r;   c           	         t        dAi d| j                  dd      d| j                  d      d| j                  d      d| j                  d      d| j                  d      d| j                  dd      d	| j                  d	      d
| j                  d
g       d| j                  dd      d| j                  dd      d| j                  dd      d| j                  dd      d| j                  dd      d| j                  dd      d| j                  d      d| j                  d      d| j                  dd      d| j                  dd      d| j                  dd      d| j                  dd      d| j                  dd      d| j                  dd      d| j                  d      d| j                  dd      d| j                  dd       d!| j                  d!d       d"| j                  d"d#      d$| j                  d$d%      d&| j                  d&      d'| j                  d'd      d(| j                  d(d)      d*| j                  d*d      d+| j                  d+d,      d-| j                  d-d      d.| j                  d.d      d/| j                  d/d      d0| j                  d0d      d1| j                  d1d      d2| j                  d2d      d3| j                  d3d      d4| j                  d4      d5| j                  d5t              d6| j                  d6d      d7| j                  d7t              d8| j                  d8t              d9| j                  d9d      d:| j                  d:t
              d;| j                  d;d      d<| j                  d<d      d=| j                  d=g       d>| j                  d>d)      d?| j                  d?d      d@| j                  d@      S )BNrT      rU   rV   rW   r   rX   FrY   rZ   r[   r   r\   r]   r^   r_   rN   r`   r   ra   rb   rc   rd   re   rf   rO   rg   i`  rh   ri   rj   rP   rk   rl   rQ   rm   rR   rn   ro   rp   Trq   rr   rS   rs   rt   ru   rv   rw   rx   ry   rz   r{   r|   r}   r~   r   r   r   r   r   r"   r   r   r/   )rM   r?   r   r   r   r   r@   s    r7   rA   zCrawlerRunConfig.from_kwargs  s    F
!',BC!HF
 !'

+@ AF
 %jj)<=	F

  &zz*>?F
 "::&67F
 jje4F
  N3F
 !**_b9F
 %jj)<bAF
 "(,BE!JF
  NE:F
 jje4F
 

=&9F
" #)**-De"L#F
( zz,/)F
* zz,/+F
,  NE:-F
. !**_e</F
0 !**_e<1F
2 "::&6>3F
8 zz,0BC9F
:  NE:;F
< ZZ
+=F
> #JJ'8%@?F
@ &,ZZ0JC%PAF
B zz,4CF
D jjc2EF
F #JJ'8!<GF
L JJy)MF
N JJy%0OF
P $*::.F#MQF
R "::&6>SF
T  NC8UF
V #JJ'8%@WF
X %+JJ/H%$PYF
Z !**_e<[F
\  &zz*>F]F
^ **We,_F
` (.zz2NPU'VaF
f zz,6gF
h !'

+@ AiF
j )/

3PRl(mkF
l 

5%(mF
n 28<b  eI  2JoF
p #)**-DF["\qF
r %+JJ/H%$PsF
x *04RTh)iyF
z $*::.F#N{F
| (.zz2NPU'V}F
~ #JJ'8"=F
D JJy$/EF
F 

=%8GF
J 

5!KF
 F	
r9   c                 n   i d| j                   d| j                  d| j                  d| j                  d| j                  d| j
                  d| j                  d| j                  d	| j                  d
| j                  d| j                  d| j                  d| j                  d| j                  d| j                  d| j                  d| j                   i d| j"                  d| j$                  d| j&                  d| j(                  d| j*                  d| j,                  d| j.                  d| j0                  d| j2                  d| j4                  d| j6                  d| j8                  d| j:                  d| j<                  d | j>                  d!| j@                  d"| jB                  i d#| jD                  d$| jF                  d%| jH                  d&| jJ                  d'| jL                  d(| jN                  d)| jP                  d*| jR                  d+| jT                  d,| jV                  d-| jX                  d.| jZ                  d/| j\                  d0| j^                  d1| j`                  d2| jb                  d3| jd                  | jf                  | jh                  d4S )5NrT   rU   rV   rW   r   rX   rY   rZ   r[   r\   r]   r^   r_   r`   r   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   rm   rn   ro   rp   rq   rr   rs   rt   ru   rv   rw   rx   ry   rz   r{   r|   r}   r~   r   r   r   r   r   r"   )r   r   )5rT   rU   rV   rW   r   rX   rY   rZ   r[   r\   r]   r^   r_   r`   r   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   rm   rn   ro   rp   rq   rr   rs   rt   ru   rv   rw   rx   ry   rz   r{   r|   r}   r~   r   r   r   r   r   r"   r   r   )r5   s    r7   to_dictzCrawlerRunConfig.to_dict$  s   6
"D$=$=6
!4#;#;6
  !7!76
 !$"9"9	6

 d116
 6
 D--6
 T//6
  !7!76
 #D$=$=6
 D--6
 6
 4++6
 $T%?%?6
 $//6
  $//!6
" D--#6
$ T//%6
& T//'6
( d11)6
* $//+6
, D---6
. /6
0 t3316
2 '(E(E36
4 $//56
6 76
8 t3396
: t||;6
< t||=6
> %d&A&A?6
@ d11A6
B D--C6
D t33E6
F &t'C'CG6
H T//I6
J !$"9"9K6
L TZZM6
N )$*I*IO6
P $//Q6
R "4#;#;S6
T *4+K+KU6
V 488W6
X 3D4]4]Y6
Z $T%?%?[6
\ &t'C'C]6
^ +D,M,M_6
` %d&A&Aa6
b )$*I*Ic6
d t33e6
f t||g6
h  ++88k6
 6	
r9   )rB   rC   rD   rE   r   r   r   r   r   rI   r
   r   r   rG   rF   rJ   floatr   r   r8   rK   rH   rA   r   r/   r9   r7   rM   rM      sg   ~F %726.29= "!%%*"! ', "##$ -( %*-  *.'+$! %(-##(+0 !%)+E4X%:(- .2',+0 $ !M[5 "[5 0	[5
 ,[5 7[5 [5 [5 [5 [5 #[5 [5 [5 [5$  $%[5, -[5. /[50 1[52 3[54 5[5: ;[5< =[5> ?[5@ A[5B #(C[5D E[5F G[5H I[5N sDI~&O[5P Q[5R !%S[5T U[5V W[5X Y[5Z "&[[5\ ][5^ !_[5` a[5b %)c[5h i[5j #k[5l &)m[5n o[5p /2q[5r  #s[5t "&u[5z '+{[5| !%}[5~ %)[5@ A[5F G[5H I[5L M[5z G
D G
%7 G
 G
T7
r9   rM   N)configr   r   r   r   r   r   user_agent_generatorr	   rU   r
   rV   r   markdown_generation_strategyr   typingr   r   r   rM   r/   r9   r7   <module>r      s9     5 3 / D g
 g
T`
 `
r9   