
    g                        d dl Z d dlmZ d dlmZmZ d dlmZmZm	Z	m
Z
mZ d dlZd dlZd dlZd dlZd dlZddlmZ ddl d dlmZ d d	lmZmZ d d
lmZ d dlZd dlmZ d dlmZmZmZmZ d dlZd dl m!Z!m"Z"m#Z# d dl$Z$d dl%Z%d dl&Z&d dl'm(Z( d dl)Z) G d de*      Z+dEde,de,de-de.de.de,fdZ/d Z0d Z1d Z2d Z3d Z4d Z5de,de,fdZ6d  Z7dFd!Z8e9dfd"Z:e9dfd#e,d$e,d%e-d&e,dee,ef   f
d'Z;dGd(Z<d) Z=d* Z>	 	 dHd+Z?e@ddfd,ZAdId-ZBd. ZCdGd#e,d/eDd0e,d1e,deDf
d2ZEd3 ZFd4 ZGd5 ZHd6 ZId7 ZJd#e,de,fd8ZKd#e,d9e,de.fd:ZLd;eDe,   deDe,   fd<ZMd= ZNd>e,de,fd?ZOd@e,dee,e,f   fdAZPdB ZQdJdCe-fdDZRy)K    N)urlparse)ThreadPoolExecutoras_completed)BeautifulSoupCommentelementTagNavigableString   )PROMPT_EXTRACT_BLOCKS)*)Path)DictAny)urljoin)InvalidSchema)OptionalTupler   r   )ForeStyleinitwrapsc                       e Zd Zy)InvalidCSSSelectorErrorN)__name__
__module____qualname__     C/var/www/openai/venv/lib/python3.12/site-packages/crawl4ai/utils.pyr   r      s    r    r   messagetypewidthadd_newlinesdouble_linereturnc                 6   t                t        j                  t        j                  dft        j                  t        j
                  dft        j                  t        j                  dft        j                  t        j                  dfd}|j                  |j                         |d         \  }}}ddd	}	|rd
nd}
|	|
   \  }}}}}}g }| j                  d      }|r| d|d   j                          }t        j                  ||dz
        }|j!                  |j                  d             |dd D ]o  }|j                         rLt        j                  d|j                          |dz
        }|j!                  |j                  d             _|j#                  d       q ||dz
  z  }| | | | g|D cg c]  }| | | d|d|dz
   | |  c}| | | | t$        j&                   }dj)                  |      }|rd| d}|S c c}w )a  
    Create a styled message box with colored borders and formatted text.

    How it works:
    1. Determines box style and colors based on the message type (e.g., info, warning).
    2. Wraps text to fit within the specified width.
    3. Constructs a box using characters (single or double lines) with appropriate formatting.
    4. Adds optional newlines before and after the box.

    Args:
        message (str): The message to display inside the box.
        type (str): Type of the message (e.g., "info", "warning", "error", "success"). Defaults to "info".
        width (int): Width of the box. Defaults to 120.
        add_newlines (bool): Whether to add newlines before and after the box. Defaults to True.
        double_line (bool): Whether to use double lines for the box border. Defaults to False.

    Returns:
        str: A formatted string containing the styled message box.
    u   ⚠u   ℹu   ✓   ×)warninginfosuccesserrorr+   )u   ─u   │u   ┌u   ┐u   └u   ┘)u   ═u   ║u   ╔u   ╗u   ╚u   ╝)singledoubler/   r.   
 r      )r$   r   N   <   )r   r   YELLOWLIGHTYELLOW_EXBLUELIGHTBLUE_EXGREENLIGHTGREEN_EXREDLIGHTRED_EXgetlowersplitstriptextwrapfillextendappendr   	RESET_ALLjoin)r"   r#   r$   r%   r&   stylesborder_color
text_colorprefix	box_chars
line_styleh_linev_linetltrblbrformatted_lines	raw_lines
first_linewrapped_firstlinewrappedhorizontal_lineboxresults                             r!   create_box_messager^      s5   * 	F KK!4!4e<D--u5JJ 2 2E:((D,,d3	F (.zz$**,v'O$L*f =<I )hJ%.z%:"FFBB Od#Ixq1!3!3!5 67
 ja@}22489abMDzz|"--"TZZ\N(;57K&&w}}T':;&&r* " 	*O._-bT2ds	tds\`\N6(:,aQuQwiK/@vh
Wds	t ._-bT%//1BCC YYs^FfXRM 
us    Hc                      t        j                         } t               dz  }t        d| dz        }t	        |dz        }t        ||      S )a  
    Calculate the optimal semaphore count based on system resources.

    How it works:
    1. Determines the number of CPU cores and total system memory.
    2. Sets a base count as half of the available CPU cores.
    3. Limits the count based on memory, assuming 2GB per semaphore instance.
    4. Returns the minimum value between CPU and memory-based limits.

    Returns:
        int: The calculated semaphore count.
    i   @r   r6   )os	cpu_countget_system_memorymaxintmin)ra   	memory_gb
base_countmemory_based_caps       r!   calculate_semaphore_countri   e   sI     I!#y1IQ	Q'J9q=)z+,,r    c                    	 t        j                         } | dk(  rYt        dd      5 }|D ]=  }|j                  d      st	        |j                         d         dz  c cddd       S  	 ddd       y| dk(  r?d	dl}|j                  g d
      j                  d      }t	        |j                               S | dk(  rd	dl
		j                  j                  }	j                   G 	fdd	j                        } |       }	j                  |      |_        |j#                  	j%                  |             |j&                  S t)        d      # 1 sw Y   yxY w)af  
    Get the total system memory in bytes.

    How it works:
    1. Detects the operating system.
    2. Reads memory information from system-specific commands or files.
    3. Converts the memory to bytes for uniformity.

    Returns:
        int: The total system memory in bytes.

    Raises:
        OSError: If the operating system is unsupported.
    Linuxz/proc/meminforz	MemTotal:r   i   NDarwinr   )sysctlz-nz
hw.memsizeutf-8Windowsc            
           e Zd ZdW j                  fdW j                  fdW  fdW  fdW  fdW  fdW  fdW  fd	W  fg	Zy
))get_system_memory.<locals>.MEMORYSTATUSEXdwLengthdwMemoryLoadullTotalPhysullAvailPhysullTotalPageFileullAvailPageFileullTotalVirtualullAvailVirtualullAvailExtendedVirtualN)r   r   r   c_ulong_fields_)c_ulonglongctypess   r!   MEMORYSTATUSEXrr      sZ    V^^,0--#[1#[1"K0"K0*K8
Hr    r   zUnsupported operating system)platformsystemopen
startswithrd   rA   
subprocesscheck_outputdecoderB   r   windllkernel32r~   	Structuresizeofrs   GlobalMemoryStatusExbyrefru   OSError)
r   memrY   r   outputr   r   memoryStatusr~   r   s
           @@r!   rb   rb   y   s(     __F/3'3??;/tzz|A/$66 (' (' 
8	(()GHOOPWX6<<>""	9	==))((	V-- 	 &' &n =%%fll<&@A(((455; ('s   E  E*EEc            
      P   t         j                  j                  t        j                  dt        j                  dt	        j
                                     d      } t        j                  | d       t        j                  |  dd       t        j                  |  dd       | S )at  
    Get or create the home folder for Crawl4AI configuration and cache.

    How it works:
    1. Uses environment variables or defaults to the user's home directory.
    2. Creates `.crawl4ai` and its subdirectories (`cache`, `models`) if they don't exist.
    3. Returns the path to the home folder.

    Returns:
        str: The path to the Crawl4AI home folder.
    CRAWL4_AI_BASE_DIRECTORYz	.crawl4aiTexist_okz/cachez/models)r`   pathrH   getenvr   homemakedirs)home_folders    r!   get_home_folderr      s     '',,ryy)CRYYOikoktktkvEwx  {F  GKKKd+KK;-v&6KK;-w'$7r    c                 h    t        j                  |       }t        |d      }|j                         }|S )z
    Beautifies an escaped HTML string.
    
    Parameters:
    escaped_html (str): A string containing escaped HTML.
    
    Returns:
    str: A beautifully formatted HTML string.
    html.parser)htmlunescaper   prettify)escaped_htmlunescaped_htmlsouppretty_htmls       r!   beautify_htmlr      s1     ]]<0N 7D--/Kr    c                    | j                  d      r$| j                  d      r| dd j                         } g }d}d}t        |       D ]?  \  }}|dk(  r|dk(  r|}|dz  }|dk(  s|dz  }|dk(  s)|j	                  | ||dz           A g }g }|D ])  }	 t        j                  |      }	|j	                  |	       + ||fS # t
        j                  $ r |j	                  |       Y Vw xY w)a  
    Splits a JSON string which is a list of objects and tries to parse each object.
    
    Parameters:
    json_string (str): A string representation of a list of JSON objects, e.g., '[{...}, {...}, ...]'.
    
    Returns:
    tuple: A tuple containing two lists:
        - First list contains all successfully parsed JSON objects.
        - Second list contains the string representations of all segments that couldn't be parsed.
    []r   r   {})r   endswithrB   	enumeraterF   jsonloadsJSONDecodeError)
json_stringsegmentsdepthstart_indexicharparsed_objectsunparsed_segmentssegmentobjs
             r!   split_and_parse_json_objectsr      s    c"{';';C'@!!B'--/ HEK[)43;zQJES[QJEzK! <= * N	.**W%C!!#&  ,,, ## 	.$$W-	.s   &B??$C&%C&c                 N    | }|j                  dd      j                  dd      }|S )a<  
    Sanitize an HTML string by escaping quotes.

    How it works:
    1. Replaces all unwanted and special characters with an empty string.
    2. Escapes double and single quotes for safe usage.

    Args:
        html (str): The HTML string to sanitize.

    Returns:
        str: The sanitized HTML string.
    "\"'z\')replace)r   sanitized_htmls     r!   sanitize_htmlr      s0      N $++C7??UKNr    textc                 .   	 	 | sy| j                  dd      j                  d      S # t        $ r:}t        d|        | j                  dd      j                  d      cY d}~S d}~ww xY w# t        $ r}t        dt        |             |d}~ww xY w)	z3Sanitize input to handle potential encoding issues.r4   ro   ignore)errorszFWarning: Encoding issue detected. Some characters may be lost. Error: asciiNzError sanitizing input: )encoder   UnicodeEncodeErrorprint	Exception
ValueErrorstr)r   es     r!   sanitize_input_encoder     s    E	I;;wx;8??HH! 	IZ[\Z]^_;;wx;8??HH	I  E3CF8<=1DEs<   ( !( 	A+/A& A+!A. &A++A. .	B7BBc                 2   | j                  dd      } | j                  dd      } | j                  dd      } | j                  dd      } | j                  d	d
      } | j                  dd      } | j                  dd      } t        j                  dd |       } | S )z
    Escapes characters in a string to be JSON safe.

    Parameters:
    s (str): The input string to be escaped.

    Returns:
    str: The escaped string, safe for JSON encoding.
    \z\\r   r   z\bz\fr0   z\nz\r	z\tz[\x00-\x1f\x7f-\x9f]c                 R    dj                  t        | j                                     S )Nz\u{:04x})formatordgroup)xs    r!   <lambda>z$escape_json_string.<locals>.<lambda><  s    +2D2DS^2Tr    )r   resub)ss    r!   escape_json_stringr   #  s     	
		$A 	
		#uA 	
		$A			$A			$A			$A			$A 	&(TVWXAHr    c                    i dd dd dd dd d	d
 dd dd dd dd dd dd dd dd dd dd dd  d!d" d# d$ d% d& d'}|D cg c]  }||j                  |d(       f }}|D ]B  \  }}| j                  |      D ])  }|r|j                  n ||      }|j                  |       + D | S c c}w ))aN  
    Replace inline HTML tags with Markdown-style equivalents.

    How it works:
    1. Maps specific tags (e.g., <b>, <i>) to Markdown syntax.
    2. Finds and replaces all occurrences of these tags in the provided BeautifulSoup object.
    3. Optionally replaces tags with their text content only.

    Args:
        soup (BeautifulSoup): Parsed HTML content.
        tags (List[str]): List of tags to replace.
        only_text (bool): Whether to replace tags with plain text. Defaults to False.

    Returns:
        BeautifulSoup: Updated BeautifulSoup object with replaced tags.
    bc                 "    d| j                    dS Nz**r   tags    r!   r   z%replace_inline_tags.<locals>.<lambda>S      2chhZr*r    r   c                 "    d| j                    dS Nr   r   r   s    r!   r   z%replace_inline_tags.<locals>.<lambda>T      1SXXJar    uc                 "    d| j                    dS )N__r   r   s    r!   r   z%replace_inline_tags.<locals>.<lambda>U  r   r    spanc                     | j                    S Nr   r   s    r!   r   z%replace_inline_tags.<locals>.<lambda>V      sxxjMr    delc                 "    d| j                    dS Nz~~r   r   s    r!   r   z%replace_inline_tags.<locals>.<lambda>W      Rz,r    insc                 "    d| j                    dS )Nz++r   r   s    r!   r   z%replace_inline_tags.<locals>.<lambda>X  r   r    r   c                 "    d| j                    dS )N~r   r   s    r!   r   z%replace_inline_tags.<locals>.<lambda>Y      Qsxxj?r    supc                 "    d| j                    dS )Nz^^r   r   s    r!   r   z%replace_inline_tags.<locals>.<lambda>Z  r   r    strongc                 "    d| j                    dS r   r   r   s    r!   r   z%replace_inline_tags.<locals>.<lambda>[  s    388*B/r    emc                 "    d| j                    dS r   r   r   s    r!   r   z%replace_inline_tags.<locals>.<lambda>\  s    AchhZq/r    codec                 "    d| j                    dS N`r   r   s    r!   r   z%replace_inline_tags.<locals>.<lambda>]      azOr    kbdc                 "    d| j                    dS r  r   r   s    r!   r   z%replace_inline_tags.<locals>.<lambda>^  r   r    varc                 "    d| j                    dS N_r   r   s    r!   r   z%replace_inline_tags.<locals>.<lambda>_  r   r    r   c                 "    d| j                    dS r   r   r   s    r!   r   z%replace_inline_tags.<locals>.<lambda>`  r   r    qc                 "    d| j                    dS )Nr   r   r   s    r!   r   z%replace_inline_tags.<locals>.<lambda>a  r   r    abbrc                 F    | j                    d| j                  dd       dS )Nz (titler4   ))r   r?   r   s    r!   r   z%replace_inline_tags.<locals>.<lambda>b  s!    sxxj3777B+?*@Br    citec                 "    d| j                    dS r
  r   r   s    r!   r   z%replace_inline_tags.<locals>.<lambda>c  r  r    c                 "    d| j                    dS r
  r   r   s    r!   r   z%replace_inline_tags.<locals>.<lambda>d  r   r    c                     | j                    S r   r   r   s    r!   r   z%replace_inline_tags.<locals>.<lambda>e  r   r    c                 "    d| j                    dS )Nz<small>z</small>r   r   s    r!   r   z%replace_inline_tags.<locals>.<lambda>f  s    wsxxj9r    c                 "    d| j                    dS )Nz==r   r   s    r!   r   z%replace_inline_tags.<locals>.<lambda>g  s    b
"-r    )dfntimesmallmarkc                     | j                   S r   r   )ts    r!   r   z%replace_inline_tags.<locals>.<lambda>j  s    !&&r    )r?   find_allr   replace_with)	r   tags	only_texttag_replacementsr   replacement_datatag_namereplacement_funcreplacement_texts	            r!   replace_inline_tagsr(  @  sx   $*( 	* 	)	
 	, 	, 	* 	, 	/ 	) 	+ 	* 	* 	* 	(  	B!" 	+#$ +)9-+0 W[[VZs.2238HIJVZ[&6""==*C+4sxx:J3:O-. + '7
 K \s   B=c                 ~   	 |syt        |d      }|j                  }|rL|j                  |      }|st        d|       |j	                  d      }|D ]  }	|j                  |	        |}g g d}
|j                  dd      D ]{  }|d	   }| j                  d
      d   }|j                  d      r*||vr&|
d   j                  ||j                         d       W|
d   j                  ||j                         d       } |j                  g d      D ]  }|j                           |j                         D ]  }|j                  dk7  si |_         g g g d}|j                  d      D ]8  }|d   j                  |j                  d      |j                  d      dd       : |j                  d      D ]8  }|d   j                  |j                  d      |j                  d      dd       : |j                  d      D ]8  }|d   j                  |j                  d      |j                  d      dd       : |j                  d      D ]F  }|j                  d      }|r!|j                  |j                  |             7|j                          H d } ||      }t!        |g d|j                  dd             }fd! ||      }t"        fd"t$        d#t&        fd$} |||      }d%t$        ffd&d"t$        ffd'} ||      }fd( |      }|j                  d) *      D ]  }|j)                           t+        |      j-                  d+d,      j-                  d-d.      }t/        |      }t0        j3                         }t5               }d|_        |j9                  |      }|j-                  d/d0      }	 t;        ||      }||d||
|d2S # t<        $ r!}t?        d1t+        |             i }Y d}~.d}~ww xY w# t<        $ r)}t?        d3t+        |             t        d4|       |d}~ww xY w)5a4  
    Extract structured content, media, and links from website HTML.

    How it works:
    1. Parses the HTML content using BeautifulSoup.
    2. Extracts internal/external links and media (images, videos, audios).
    3. Cleans the content by removing unwanted tags and attributes.
    4. Converts cleaned HTML to Markdown.
    5. Collects metadata and returns the extracted information.

    Args:
        url (str): The website URL.
        html (str): The HTML content of the website.
        word_count_threshold (int): Minimum word count for content inclusion. Defaults to MIN_WORD_THRESHOLD.
        css_selector (Optional[str]): CSS selector to extract specific content. Defaults to None.

    Returns:
        Dict[str, Any]: Extracted content including Markdown, cleaned HTML, media, links, and metadata.
    Nr   z;Invalid CSS selector , No elements found for CSS selector: divinternalexternalaT)hrefr/  /r6   httpr-  r/  r   r,  scriptstylelinkmetanoscriptimgimagesvideosaudiosr;  srcaltimage)r>  r?  r#   videor<  audior=  c                 \    | j                  d      D ]  }|j                         |_         | S )Npre)r  get_textstring)nodechilds     r!   replace_pre_tags_with_textz:get_content_of_website.<locals>.replace_pre_tags_with_text  s*    u-$~~/ . Kr    r   r   r   r   r   r   r   r   r   r   r  r  r  r   r  r  r  r  r  r  r  r"  F)r"  c                 D   | j                   D ]  }t        |t        j                        s ||       t	        |j                  d      j                               }t	        |j                         dk(  r|j                  d      r||k  s|j                           | S )NTrB   r   )contents
isinstancer   r	   lenrE  rA   	decompose)rG  word_count_thresholdrH  
word_count(remove_empty_and_low_word_count_elementss       r!   rS  zHget_content_of_website.<locals>.remove_empty_and_low_word_count_elements  s    eW[[1<UDXY!$U^^$^%?%E%E%G!HJENN+q0d9SXbeyXy) ' Kr    bodyrQ  c                 D   g }| j                  d      D ]r  }|j                  s|j                  j                         s+t        |j                  j                         j	                               }||k  sb|j                  |       t |D ]  }|j                           | S )NT)r  rF  rB   rO  rA   rF   rP  )rT  rQ  tags_to_remover   rR  s        r!   remove_small_text_tagsz6get_content_of_website.<locals>.remove_small_text_tags  s    N }}T*::#**"2"2"4!$SZZ%5%5%7%=%=%?!@J!$88&--c2 + & & Kr    r   c                     t        | t              r| j                          S | j                  syt	        fd| j                  D              S )NTc              3   .   K   | ]  } |        y wr   r   ).0rH  is_empty_or_whitespaces     r!   	<genexpr>zIget_content_of_website.<locals>.is_empty_or_whitespace.<locals>.<genexpr>%  s     O,-e4,s   )rN  r
   rB   rM  all)r   r[  s    r!   r[  z6get_content_of_website.<locals>.is_empty_or_whitespace  s:    #/99;&<<O#,,OOOr    c                     d}|rEd}| j                  d      D cg c]  } |      s| }}|D ]  }|j                          d} |rE| S c c}w )NTF)r  rP  )rT  changesr   
empty_tagsr[  s       r!   remove_empty_tagsz1get_content_of_website.<locals>.remove_empty_tags'  sb    G-1]]4-@`-@cDZ[^D_c-@
`%CMMO"G &	  K as
   AAc                 0   | j                   D ]  }t        |t        j                        s |       t	        |j                         dk(  s?|j                   d   j
                  |j
                  k(  sf|j                   d   }|j                  |        | S Nr   r   )rM  rN  r   r	   rO  namer   )rG  rH  child_contentflatten_nested_elementss      r!   rf  z7get_content_of_website.<locals>.flatten_nested_elements:  su    eW[[1+E25>>*a/ENN14E4J4Jejj4X(-q(9**=9 ' Kr    c                 "    t        | t              S r   )rN  r   r   s    r!   r   z(get_content_of_website.<locals>.<lambda>J  s    D'9Rr    )rF  

r0   r3   r1       ``````Error extracting metadata:markdowncleaned_htmlr,   medialinksmetadatazError processing HTML content:zInvalid CSS selector: ) r   rT  selectr   new_tagrF   r  rA   r   rE  rP  rd  attrsr?   r   
new_stringr(  MIN_WORD_THRESHOLDr	   rd   extractr   r   r   	html2text	HTML2TextCustomHTML2Textignore_linkshandleextract_metadatar   r   ) urlr   rQ  css_selectorkwargsr   rT  selected_elementsdiv_tagelrp  r.  r/  url_baser   ro  r9  rA  rB  alt_textrI  rW  ra  commentrn  hrm  r7  r   rf  r[  rS  s                                 @@@r!   get_content_of_websiter  }  sO   *\VT=1 yy  $L 9$-0klxky.z{{ll5)G'r" (D 
 s.AV9Dyy~a(Hv&84+?j!(( JJL* 
 j!(( $ !

 /" ==!PQCMMO R ==?Cxx5 	 # 

 =='C(O""wwu~wwu~$  ( ]]7+E(O""yy'yy'$  , ]]7+E(O""yy'yy'$  , =='Cwwu~H  !:; (	 *$/ # `jje4
	 8>RSJ\ 	 	C 	, &d,@A	P 	P	C 	  !&		 't,
 }},R}SGOO T 4y((6>>tSI %\2 !88L)##Iu5	#D$/D !(
 	
  	.A7D	  V.A7%(>|n&MNTUUVsN   P
 EP
 I8P
 O P
 	P&P=P
 PP
 
	P<$P77P<r~  r   rQ  r  c                     |sy t        |d      }|j                  }j                  dt              j                  dg       xs g D ](  }|j	                  |      D ]  }|j                           * |rJ|j	                  |      }	|	st        d|       |j                  d      }|	D ]  }|j                  |        g g dg g g dfdfd	d
t        j                  dt        f fd|j                  d       fdt              D        D 
cg c]  }
|
|
 c}
d<    |       fd |      }t        j                  d      }D ];  }	 |j                  dd      }|j!                  |      r|j#                  d|      |d<   = t%        |      j'                  dd      j'                  dd      }t)        |      }t+               }d|_        |j/                  |      }|j'                  dd      }	 t1        ||      }||d|dS c c}
w #  Y xY w# t2        $ r!}t5        dt%        |             i }Y d }~:d }~ww xY w)Nr   $image_description_min_word_thresholdexcluded_tagsz:Invalid CSS selector, No elements found for CSS selector: r*  r+  r:  c                     | }|rB|j                   }|r1|j                  dd      }t        |j                               k\  r|S |rBy )Nr1   T)	separatorrB   )parentrE  rO  rA   )r   current_tagtext_contentr  s      r!   $find_closest_parent_with_useful_textzNget_content_of_website_optimized.<locals>.find_closest_parent_with_useful_text  sU    K)00#.#7#7#D#7#QL<--/04XX++  r    c                 0   d }d } || | j                   | j                   j                  dg             sy  || |||      }|t        k  ry | j                  dd      j                  dd      j	                         | j                  dd       |       |d	d
S )Nc                     | j                  dd      }| j                  dd      }g dddg}t        d|v|t        fd|| j                  d	d      g|D               |j                  |vg      S )
Nr5  r4   r>  )buttoniconlogor  inputzdisplay:nonec              3   4   K   | ]  }D ]  }||v  
  y wr   r   )rZ  r  r   classes_to_checks      r!   r\  zbget_content_of_website_optimized.<locals>.process_image.<locals>.is_valid_image.<locals>.<genexpr>  s#     q,VS`p[\S`p,Vs   r?  )r?   r]  anyrd  )r9  r  parent_classesr5  r>  tags_to_checkr  s         @r!   is_valid_imagezOget_content_of_website_optimized.<locals>.process_image.<locals>.is_valid_image  s    GGGR(E''%$C9%w/Me+qS#''%2D,V~,Vqqq=0	  r    c                 <   d }d }| j                  d      } ||      \  }}| j                  d      }	 ||	      \  }
}d}t        j                  j                  | j                  dd            d   j	                         j                  d	      d}|r|d
k(  r
|dkD  r|dz  }|dv r
|dkD  r|dz  }|
r|d
k(  r
|
dkD  r|dz  }|dv r
|
dkD  r|dz  }|dkD  r|dz  }| j                  d      dk7  r|dz  }t        fddD              r|dz  }||z  dk  r|dz  }|S )Nc                     | rKt        j                  d|       }|r3t        |j                  d            }|j                  d      xs d}||fS y)Nz
(\d+)(\D*)r   r6   px)NN)r   matchrd   r   )	dimensionr  numberunits       r!   parse_dimensionztget_content_of_website_optimized.<locals>.process_image.<locals>.score_image_for_usefulness.<locals>.parse_dimension  sI    HH]I>E!$U[[^!4${{1~5%t|+!r    c                    t        || j                  d            }	 t        j                  |      }|j                  dk(  r|j
                  j                  dd        y t        d|        	 y # t        $ r
}Y d }~y d }~ww xY w#  Y y xY w)Nr>     zContent-Lengthz!Failed to retrieve file size for )r   r?   requestsheadstatus_codeheadersr   r   )r9  base_urlimg_urlresponser   s        r!   fetch_image_file_sizezzget_content_of_website_optimized.<locals>.process_image.<locals>.score_image_for_usefulness.<locals>.fetch_image_file_size  s    !(3775>:
'}}W5H++s2'//334DTJ   A'KL#  %    s)   ?A. A. .	B7B <BB Bheightr$   r   r>  r4   r   .r     )%vhvminvmax   i'  r?  c              3   (   K   | ]	  }|k(    y wr   r   )rZ  r   image_formats     r!   r\  znget_content_of_website_optimized.<locals>.process_image.<locals>.score_image_for_usefulness.<locals>.<genexpr>  s     K6JF<'6Js   )jpgpngwebp      ?)r?   r`   r   splitextr@   rB   r  )r9  r  indeximages_countr  r  image_heightheight_valueheight_unitimage_widthwidth_value
width_unit
image_sizescorer  s                 @r!   score_image_for_usefulnessz[get_content_of_website_optimized.<locals>.process_image.<locals>.score_image_for_usefulness  sT   " 778,L(7(E%L+777+K&5k&B#KJ77++CGGE",=>qAGGIL'--c2LE$&<#+=QJE"::|R?OQJE%+*;QJE!99k2oQJEE!
wwu~#qK6JKKq\!#%qLr    classr>  r4   r   r   r?  r@  )r>  r?  descr  r#   )r  r?   IMAGE_SCORE_THRESHOLDr   rB   )r9  r~  r  total_imagesr  r  r  r  s          r!   process_imagez7get_content_of_website_optimized.<locals>.process_image  s    
	6	p c3::szz~~gr/JK*3ULI))775"%--eS9??A775"%8=
 	
r    r   r'   c           	      0   	 t        | t              r"t        | t              r | j                          y| j                  dv r | j
                          yd}| j                  dk(  r | j                  d      rn| d   }j                  d      d   }| | j                         d}|j                  d      r||vrd	   j                  |       nd
   j                  |       d}n| j                  dk(  ry| j                  dv rŉ| j                   d   j                   | j                  d       | j                  d      | j                   
|       d        | j                  d      }|D ]W  }| j                   d   j                  |j                  d       | j                  d      | j                   
|       d       Y y| j                  dk7  rj| j                  dv rFj                  dd      r" | j                   | j                                n( | j                          n| j                  dk7  ri | _        t        | j                         D ]M  }t        |t              r0t        |t              s t#        |j%                               dkD  s@d}C |      sLd}O |s/t#         | j                  d      j                               }|k\  }|s | j
                          |S # t&        $ r}	t)        dt+        |	             Y d }	~	yd }	~	ww xY w)NFr3  r.  r/  r0  r6   r2  r1  r-  r,  Tr9  )rA  rB  r   r>  r?  )r>  r?  r#   descriptionsourcerD  rJ  r"  r   rL  zError processing element:)rN  r
   r   rw  rd  rP  r?   rA   rE  r   rF   r  r   unwraprt  listchildrenrO  rB   r   r   r   )r   keep_elementr/  r  	link_datasource_tags
source_tagrH  rR  r   r  r  rp  ro  process_elementr~  rQ  s             r!   r  z9get_content_of_website_optimized.<locals>.process_element  s   H	'?3gw/#GOO%||NN!!!# L||s"{w{{6':v99S>!,%)3C73C3C3EF	??6*xt/C*%,,Y7*%,,Y7#&!33a()00&7;;u-&7;;u-#LL#G#P	2  /g..x8"-JW\\N!,-44%>>%0&7;;u-#LL#G#P	6  #. ||u$<<  $w  wzz+u5,,,-=W-=-=-?@&(\\U*$&GM g../e_5jPW>X5;;=)A-'+&u-'+ 0   !1!1!1!=!C!C!EF
)-AA!!!# 	-s1v6	s>   1K- K- B K- 6CK- 	CK- K- #A	K- -	L6LLr9  c           	   3   L   K   | ]  \  }} ||t                      y wr   )rO  )rZ  r   r9  imgsr  r~  s      r!   r\  z3get_content_of_website_optimized.<locals>.<genexpr>8  s%     M_61csCCI	._s   !$r;  c                 t   t        | t              r| S t        | j                        dk(  rbt        | j                  d   t        j
                        r;| j                  d   j                  | j                  k(  r | j                  d         S | j                  D cg c]
  } |       c}| _        | S c c}w rc  )rN  r
   rO  rM  r   r	   rd  )rG  rH  rf  s     r!   rf  zAget_content_of_website_optimized.<locals>.flatten_nested_elements>  s    dO,Kt}}"z$--2BGKK'PUYUbUbcdUeUjUjnrnwnwUw*4==+;<<EI]]S]E07]S Ts   B5zdata:image/[^;]+;base64,([^"]+)r>  r4   rh  r0   r3   r1   Tri  rj  rk  rl  )r   rT  r?   $IMAGE_DESCRIPTION_MIN_WORD_THRESHOLDrr  rP  r   rs  rF   r   PageElementboolr  r   r   compiler  r   r   r   r   rz  r{  r|  r}  r   r   )r~  r   rQ  r  r  r   rT  r   r  r  r]   base64_patternr9  r>  rn  r  rm  r7  r   r  rf  r  r  rp  ro  r  r  s   ` ` `              @@@@@@@@r!    get_content_of_website_optimizedr  p  s~   }-D99D+1::6\  _C  ,D(zz/2.4"4++c"BLLN # 5  KK5 ),fgsft*uvv||E"#BKKO $ ,ER26E
R
hI!4!4 I I IX ==D 	NYt_MM  	ME(O D #4(DZZ BCN	''%$C##C(+//C8E
	  t9$$VT2::4EL .LAANxx%H	51Hd+ $ Q.	  *CF3s*   *H%+8H*H1 *H.1	I:IIc                    i }| s|si S |st        | d      }|j                  }|s|S |j                  d      }|r&|j                  r|j                  j	                         nd|d<   |j                  dddi      }|r |j                  dd	      j	                         nd|d<   |j                  ddd
i      }|r |j                  dd	      j	                         nd|d
<   |j                  dddi      }|r |j                  dd	      j	                         nd|d<   |j                  ddt        j                  d      i      }|D ]M  }	|	j                  dd	      j	                         }
|	j                  dd	      j	                         }|
sF|sI|||
<   O |j                  ddt        j                  d      i      }|D ]M  }	|	j                  dd	      j	                         }
|	j                  dd	      j	                         }|
sF|sI|||
<   O |S )aR  
    Extract optimized content, media, and links from website HTML.

    How it works:
    1. Similar to `get_content_of_website`, but optimized for performance.
    2. Filters and scores images for usefulness.
    3. Extracts contextual descriptions for media files.
    4. Handles excluded tags and CSS selectors.
    5. Cleans HTML and converts it to Markdown.

    Args:
        url (str): The website URL.
        html (str): The HTML content of the website.
        word_count_threshold (int): Minimum word count for content inclusion. Defaults to MIN_WORD_THRESHOLD.
        css_selector (Optional[str]): CSS selector to extract specific content. Defaults to None.
        **kwargs: Additional options for customization.

    Returns:
        Dict[str, Any]: Extracted content including Markdown, cleaned HTML, media, links, and metadata.
    lxmlr  Nr7  rd  r  )rt  contentr4   keywordsauthorpropertyz^og:z	^twitter:)	r   r  findrF  rB   r?   r  r   r  )r   r   rq  r  	title_tagdescription_tagkeywords_tag
author_tagog_tagsr   property_namer  twitter_tagss                r!   r}  r}  g  s   , H	T6*99D 		'"I4=)BRBR	((..0X\HW iiv}.EiFOL[o11)R@FFHaeH] 99VFJ+?9@LFR<++Ir:@@BX\HZ 6&();<JBL	26<<>RVHX mmF:rzz'7J*KmLG
B/557'')R(..0W&-H]#	  ==

<8P/Q=RL+113'')R(..0W&-H]#	  Or    c                 V    t        j                  d|       }t        t        |            S )z
    Extracts XML tags from a string.

    Args:    
        string (str): The input string containing XML tags.

    Returns:
        List[str]: A list of XML tags extracted from the input string.
    z<(\w+)>)r   findallr  set)rF  r!  s     r!   extract_xml_tagsr    s"     ::j&)DD	?r    c                     i }| D ]Z  }d| d| d}t        j                  ||t         j                        }|r#|j                  d      j	                         ||<   Vd||<   \ |S )a  
    Extract data for specified XML tags from a string.

    How it works:
    1. Searches the string for each tag using regex.
    2. Extracts the content within the tags.
    3. Returns a dictionary of tag-content pairs.

    Args:
        tags (List[str]): The list of XML tags to extract.
        string (str): The input string containing XML data.

    Returns:
        Dict[str, str]: A dictionary with tag names as keys and extracted content as values.
    r5   z>(.*?)</>r   r4   )r   searchDOTALLr   rB   )r!  rF  datar   patternr  s         r!   extract_xml_datar    sj    " DcU(3%q)		'62995A,,.DIDI  Kr    c                    ddl m} ddlm} d}d}	d||d}
|rdd	i|
d
<   |j	                  d      r|
j                  |d          t        |      D ]  }	  |d| d|dgd|
}|c S  y# |$ rd}t        dt        |             ||dz
  k  r-|	d|z  z  }t        d| d       t        j                  |       nddgdgdgcY d}~c S Y d}~}d}~ww xY w)a  
    Perform an API completion request with exponential backoff.

    How it works:
    1. Sends a completion request to the API.
    2. Retries on rate-limit errors with exponential delays.
    3. Returns the API response or an error after all retries.

    Args:
        provider (str): The name of the API provider.
        prompt_with_variables (str): The input prompt for the completion request.
        api_token (str): The API token for authentication.
        json_response (bool): Whether to request a JSON response. Defaults to False.
        base_url (Optional[str]): The base URL for the API. Defaults to None.
        **kwargs: Additional arguments for the API request.

    Returns:
        dict: The API response or an error message after all retries.
    r   )
completion)RateLimitError   r6   {Gz?)temperatureapi_keyr  r#   json_objectresponse_format
extra_argsuserroler  )modelmessageszRate limit error:r   zWaiting for z seconds before retrying...r-   z)Rate limit error. Please try again later.)r  r!  r  Nr   )litellmr  litellm.exceptionsr  r?   updateranger   r   r  sleep)providerprompt_with_variables	api_tokenjson_responser  r  r  r  max_attempts
base_delayr  attemptr  r   delays                  r!   perform_completion_with_backoffr    s   8 #1LJ J
 *0-(A
$%zz,&./&	  #0EF
 H O '  	%s1v. ))"a7l3UG+FGH

5! $I KL   "	s   A))C.AC CCc                 @   |st         j                  |d      n|}| t        t        |            d}t        }|D ]  }|j                  d|z   dz   ||         } t        ||||      }	 t        dg|j                  d   j                  j                        d   }	t        j                  |	      }	|	D ]  }
d|
d	<   	 	 |	S # t        $ rU}t        |j                  d   j                  j                        \  }}|}	|r|	j                  dd
d	g|d       Y d}~|	S d}~ww xY w)a  
    Extract content blocks from website HTML using an AI provider.

    How it works:
    1. Prepares a prompt by sanitizing and escaping HTML.
    2. Sends the prompt to an AI provider with optional retries.
    3. Parses the response to extract structured blocks or errors.

    Args:
        url (str): The website URL.
        html (str): The HTML content of the website.
        provider (str): The AI provider for content extraction. Defaults to DEFAULT_PROVIDER.
        api_token (Optional[str]): The API token for authentication. Defaults to None.
        base_url (Optional[str]): The base URL for the API. Defaults to None.

    Returns:
        List[dict]: A list of extracted content blocks.
    NURLHTMLr   r   r  blocksr   Fr-   T)r  r-   r!  r  )PROVIDER_MODELSr?   r   r   r   r   r  r  choicesr"   r  r   r   r   r   rF   )r~  r   r  r  r  variable_valuesr  variabler  r  blockr   parsedunparseds                 r!   extract_blocksr&    s>   * <E##Hd3)I "=#67O
 2# 5 = =(NS /(";!
 $
 /x9NPYdlmH!8*h.>.>q.A.I.I.Q.QRS[\F#E"E'N  M  
78H8H8K8S8S8[8[\MM 	#	  M
s   *AB? ?	DA
DDc                    |st        j                  dd      n|}ddlm} g }| D ]G  \  }}||d}t        }|D ]  }	|j                  d|	z   dz   ||	         } |j                  d|d	g       I  |||d
      }
g }|
D ]Z  }	 t        dg|j                  d   j                  j                        d   }t        j                  |      }|j                  |       \ t        |g       S # t        $ r}ddgdgdgdg}Y d}~8d}~ww xY w)at  
    Extract content blocks from a batch of website HTMLs.

    How it works:
    1. Prepares prompts for each URL and HTML pair.
    2. Sends the prompts to the AI provider in a batch request.
    3. Parses the responses to extract structured blocks or errors.

    Args:
        batch_data (List[Tuple[str, str]]): A list of (URL, HTML) pairs.
        provider (str): The AI provider for content extraction. Defaults to "groq/llama3-70b-8192".
        api_token (Optional[str]): The API token for authentication. Defaults to None.

    Returns:
        List[dict]: A list of extracted content blocks from all batch items.
    GROQ_API_KEYNr   )batch_completionr  r   r   r  r  r   )r	  r
  r  r  r-   zZError extracting blocks from the HTML content. Choose another provider/model or try again.z4What went wrong during the block extraction process?)r  r!  r  	questions)r`   r   r  r)  r   r   rF   r  r   r"   r  r   r   r   sum)
batch_datar  r  r)  r
  r~  r   r!  r  r"  	responses
all_blocksr  r  r   s                  r!   extract_blocks_batchr/  T  s>   $ 8A		.$/iI(H	T

 !6'H$9$A$Ah$oh&?%! (
 	&5JKLM   !I J
	%xj(2B2B12E2M2M2U2UVW_`FZZ'F 	&!  z2  	 	xyTU	 F	s   AC''	D0D  Dc                 (   g }g }d}| D ]d  }t        |j                               dz  }||z   |k  r|j                  |       ||z  }>|r |j                  dj                  |             |g}|}f |r |j                  dj                  |             |S )a  
    Merges small chunks into larger ones based on the total token threshold.

    :param chunks: List of text chunks to be merged based on token count.
    :param token_threshold: Max number of tokens for each merged chunk.
    :return: List of merged text chunks.
    r   g?rh  )rO  rA   rF   rH   )chunkstoken_thresholdmerged_sectionscurrent_chunktotal_token_so_farchunkchunk_token_counts          r!   %merge_chunks_based_on_token_thresholdr8    s     OM.4 11OC  '"33&&v{{='AB"GM!2  v{{=9:r    sectionsr  r  c                    g }|j                  d      r=|D ]6  }|j                  t        | ||||             t        j                  d       8 |S t               5 }|D cg c]  }|j                  t        | ||||       }}t        |      D ]!  }	|j                  |	j                                # 	 ddd       |S c c}w # 1 sw Y   |S xY w)a  
    Process sections of HTML content sequentially or in parallel.

    How it works:
    1. Sequentially processes sections with delays for "groq/" providers.
    2. Uses ThreadPoolExecutor for parallel processing with other providers.
    3. Extracts content blocks for each section.

    Args:
        url (str): The website URL.
        sections (List[str]): The list of HTML sections to process.
        provider (str): The AI provider for content extraction.
        api_token (str): The API token for authentication.
        base_url (Optional[str]): The base URL for the API. Defaults to None.

    Returns:
        List[dict]: The list of extracted content blocks from all sections.
    zgroq/r  r  N)	r   rE   r&  r  r  r   submitr   r]   )
r~  r9  r  r  r  extracted_contentsectionexecutorfuturesfutures
             r!   process_sectionsrA    s    ( 7#G$$^C(I`h%ijJJsO     !X }E  F  }Eqxx~sGXyckl  }EG  F&w/!((9 0 "
 	 F "
 s   C "B>1C>CCc                 0   g }|j                         }|rrd}|rZ| j                  d||d   z   |      d   |k  r:||j                  d      dz   z  }|r!| j                  d||d   z   |      d   |k  r:|j                  |       |rrdj	                  |      S )a3  
    Wrap text to fit within a specified width for rendering.

    How it works:
    1. Splits the text into words.
    2. Constructs lines that fit within the maximum width using the provided font.
    3. Returns the wrapped text as a single string.

    Args:
        draw (ImageDraw.Draw): The drawing context for measuring text size.
        text (str): The text to wrap.
        font (ImageFont.FreeTypeFont): The font to use for measuring text size.
        max_width (int): The maximum width for each line.

    Returns:
        str: The wrapped text.
    r4   )r   r   r   )fontr6   r1   r0   )rA   textbboxpoprF   rH   )drawr   rC  	max_widthlineswordsrY   s          r!   	wrap_textrJ    s    ( EJJLE
fdU1XoDI!LPYYUYYq\C'(D fdU1XoDI!LPYYT	 
 99Ur    c                 :    t        | d      }|j                         S )aO  
    Prettify an HTML string using BeautifulSoup.

    How it works:
    1. Parses the HTML string with BeautifulSoup.
    2. Formats the HTML with proper indentation.
    3. Returns the prettified HTML string.

    Args:
        html_string (str): The HTML string to format.

    Returns:
        str: The prettified HTML string.
    zlxml.parser)r   r   )html_stringr   s     r!   format_htmlrM    s      m4D==?r    c                 F   d}d}g }d}| j                  dd      j                  dd      j                  d      }|D ]  }|j                         s|j                  d	      r|d
z  }|j	                  ||z  |z          B|j                  d      r)|j                  d      r|j	                  ||z  |z          ||j                  d      r|j	                  ||z  |z          |d
z  }|j                         }|s|j	                  ||z  |z           dj                  |      S )z
    A fast HTML formatter that uses string operations instead of parsing.
    
    Args:
        html_string (str): The HTML string to format
        
    Returns:
        str: The formatted HTML string
    r   r3   Fr  z>
r5   z
<r0   z</r   z/>)r   rA   rB   r   rF   r   rH   )rL  indent
indent_str	formatted
in_contentpartspartr  s           r!   fast_format_htmlrU     s"    FJIJ U+33C?EEdKEzz| ??4 aKFZ&0478 __S!dmmD&9Z&0478 __S!Z&0478aKF jjlG  f!4w!>?- 0 99Yr    c                     ddl m}m}  ||      }|j                  r|j                  st        d|        ||| j                               }|S )*Normalize URLs to ensure consistent formatr   )r   r   Invalid base URL format: )urllib.parser   r   schemenetlocr   rB   )r/  r  r   r   parsed_base
normalizeds         r!   normalize_urlr^  -  sO    . 8$K[%7%74XJ?@@ 4::<0Jr    c                     	 |j                  d      }|d   }|d   }h d}t         fd|D              r j	                         S  j                  d      r|   S  j                  d      r|   S  j                  d      r	| d|   S  j                  d	      s j                  d
       | d| d  S  j	                         S # t        $ r t        d|       w xY w)rW  r0  r   r6   rX     ftp:tel:data:file:mailto:javascript:c              3   \   K   | ]#  }j                         j                  |       % y wr   r@   r   )rZ  protor/  s     r!   r\  z$normalize_url_tmp.<locals>.<genexpr>F  s%     
I7He4::<""5)7H   ),#z//)zhttp://zhttps://z./)rA   
IndexErrorr   r  rB   r   lstrip)r/  r  
base_partsprotocoldomainspecial_protocolss   `     r!   normalize_url_tmprr  :  s   A^^C(
a=A
 U

I7H
IIzz| sD6"" tD6"" s2fXdV,, ??23{{4 2fXQtf--::<5  A4XJ?@@As   C
 
C"c                 `   	 t        |       j                  j                         }|sy|j                  d      d   }t	        j
                  dd|      }|j                  d      }t        |      dkD  r|d   dv rdj                  |d	d
       S dj                  |dd
       S # t        $ r Y yw xY w)  
    Extract the base domain from a given URL, handling common edge cases.

    How it works:
    1. Parses the URL to extract the domain.
    2. Removes the port number and 'www' prefix.
    3. Handles special domains (e.g., 'co.uk') to extract the correct base.

    Args:
        url (str): The URL to extract the base domain from.

    Returns:
        str: The extracted base domain or an empty string if parsing fails.
    r4   :r   z^www\.r  r6   >   acadaeafagcocomedugovrd   milnetorgN)	r   r[  r@   rA   r   r   rO  rH   r   )r~  rp  rS  s      r!   get_base_domainr  ]  s    #%%++- c"1% 	2v. S!u:>eBi ,
 
 88E"#J''xxbc
## s   %B! A$B! B! !	B-,B-base_domainc                 >    h d}t         fd|D              ry	 t               }|j                  sy|j                  j                         j	                  dd      }|j                         j	                  dd      }|j                  |       S # t        $ r Y yw xY w)rt  r`  c              3   \   K   | ]#  }j                         j                  |       % y wr   rh  )rZ  pr~  s     r!   r\  z"is_external_url.<locals>.<genexpr>  s#     
6g399;!!!$grj  TFzwww.r4   )r  r   r[  r@   r   r   r   )r~  r  specialr$  
url_domainbases   `     r!   is_external_urlr    s     KG

6g
66#}} ]]((*2262>
  "**626 &&t,,, s   B AB 	BBtokensc                     h d}h d}| D cg c]M  }t        |      dkD  r=||vr9||vr5|j                  d      s$|j                  d      s|j                  d      s|O c}S c c}w )u  
    Clean a list of tokens by removing noise, stop words, and short tokens.

    How it works:
    1. Defines a set of noise words and stop words.
    2. Filters tokens based on length and exclusion criteria.
    3. Excludes tokens starting with certain symbols (e.g., "↑", "▲").

    Args:
        tokens (list[str]): The list of tokens to clean.

    Returns:
        list[str]: The cleaned list of tokens.
    >      ⬆️r.  anatbyinofontoupccpthe   ↑   ▲>   n'tcan'twon'tmustn'tcouldn'twouldn't	shouldn'tr.  r   amr  asr  ber  doher  isitmemynor  r  orsor  r  uswer]  andr  arebutcandidfewforhadhasherhimhishowitsmaynornotoffouroutsher  waswhowhyyetyoubeenbothdoesdowneachfromhavehersintominemoremostmustnearnoneoursoverpastsomesuchthatthemtheythisuponwerewhatwhenwhomwillwithyouraboutaboveafteralongamongbeingbelowcoulddoingmightothershallsincetheirthesethoseunderuntilwherewhichwhosewouldyoursacrossaroundbeforebehindbesidebeyondcannotduringexcepthavinginsideitselfmyselfshouldtheirstowardunlesswithinagainstbecausebeneathbetweenherselfhimselfoutsidethroughalthoughyourself	ourselves
themselves
underneathr6   r  r  u   ⬆)rO  r   )r  noise
STOP_WORDStokens       r!   clean_tokensr$    s|    " hE(JV  & -ve5zA~U"Z'$$U+$$U+$$U+ v - - -s   AA"c                 .     t                fd       }|S )a  
    Decorator to profile a function's execution time and performance.

    How it works:
    1. Records the start time before executing the function.
    2. Profiles the function's execution using `cProfile`.
    3. Prints the elapsed time and profiling statistics.

    Args:
        func (Callable): The function to decorate.

    Returns:
        Callable: The decorated function with profiling and timing enabled.
    c                 l   t        j                         }t        j                         }|j	                           | g|i |}|j                          t        j                         |z
  }t        d|dd       t        j                  |      }|j                  d       |j                  d       |S )Nz![PROFILER] Scraping completed in z.2fz seconds
cumulative   )r  perf_countercProfileProfileenabledisabler   pstatsStats
sort_statsprint_stats)	selfargsr  
start_timeprofilerr]   elapsed_timestatsfuncs	           r!   wrapperz!profile_and_time.<locals>.wrapper  s     &&(
 ##% d,T,V, 	 ((*Z7 	1,s1C8LM X&&"r    r   )r8  r9  s   ` r!   profile_and_timer:    s"      4[ 4 Nr    r  c                 d    t        j                  | j                               j                         S )z"Generate a unique hash for content)xxhashxxh64r   	hexdigest)r  s    r!   generate_content_hashr?    s!    <<()3355r    	base_pathc                     ddddddd}i }|j                         D ]A  \  }}t        j                  j                  | |      }t        j                  |d       |||<   C |S )	z.Create content directories if they don't existhtml_contentrn  markdown_contentr<  screenshots)r   cleanedrm  	extractedrD  
screenshotTr   )itemsr`   r   rH   r   )r@  dirscontent_pathskeydirnamer   s         r!   ensure_content_dirsrM    sl     !&($#D M

Www||Iw/
D4(!c %
 r    c                      t        j                         dk(  r(t        j                  t        j                                yy)a,  
    Configure the Windows event loop to use ProactorEventLoop.
    This resolves the NotImplementedError that occurs on Windows when using asyncio subprocesses.
    
    This function should only be called on Windows systems and before any async operations.
    On non-Windows systems, this function does nothing.
    
    Example:
        ```python
        from crawl4ai.async_configs import configure_windows_event_loop
        
        # Call this before any async operations if you're on Windows
        configure_windows_event_loop()
        ```
    rp   N)r   r   asyncioset_event_loop_policyWindowsProactorEventLoopPolicyr   r    r!   configure_windows_event_looprR  0  s/      I%%%g&L&L&NO &r    context_linesc                    ddl }ddl}ddl}|j                  | d         }|d   }|j                  }|j
                  }|j                  }	t        d||z
        }
||z   dz   }g }t        |
|      D ]J  }|j                  ||      }|s|j                         }||k(  rdnd}|j                  |dd| d|        L d	j                  |      }	 |j                  j                  |      }|||	|d
S # t        $ r |}Y w xY w)a  
    Extract error context with more reliable line number tracking.
    
    Args:
        exc_info: The exception info from sys.exc_info()
        context_lines: Number of lines to show before and after the error
    
    Returns:
        dict: Error context information
    r   Nr6   r   r   u   →r1   4dr0   )filenameline_nofunctioncode_context)	traceback	linecacher`   
extract_tbrV  linenord  rc   r  getlinerstriprF   rH   r   relpathr   )exc_inforS  rZ  r[  r`   tb
last_framerV  rW  	func_namecontext_startcontext_endr   rY   pointerrY  rel_paths                    r!   get_error_contextri  C  s*     
		hqk	*B BJ""HGI 7]23MM)A-K M=+.  1-;;=D G|eG  Ab67)1TF!;< / 99]+L77??8, $	 	  s   C0 0C>=C>)r+   x   TF)Fr   )FN)zgroq/llama3-70b-8192N)   )Sr  rY  r   concurrent.futuresr   r   bs4r   r   r   r	   r
   r   r   r   r`   r   promptsr   configpathlibr   typingr   r   r   r  requests.exceptionsr   r   r   r<  coloramar   r   r   rC   r*  r.  	functoolsr   rO  r   r   r   rd   r  r^   ri   rb   r   r   r   r   r   r   r(  rv  r  r  r}  r  r  r  DEFAULT_PROVIDERr&  r/  r8  r  rA  rJ  rM  rU  r^  rr  r  r  r$  r:  r?  rM  rR  ri  r   r    r!   <module>rv     sg    ! ? E E   	 	  *       - - -  & &     	i 	E E3 E EY] Esw E  EH EN-(/6b&&*-X0E E E:1z >P`d qVf Wi  C u# uS uPS ux{ u  RV  WZ  \_  W_  R` unBHB FP *:tX\ 5n:x:!# ! ! ! !`d !F:&+ Z!F% % %N 3 4 BD-c D-tCy D-L+Z63 63 6
3 4S> &P&6s 6r    