
    g+x                        d dl Z d dlZd dlmZmZ d dlmZmZmZ d dl	m
Z
 d dlmZ d dlZd dlZd dl Z d dlZddl d dl	mZmZmZ d d	l	mZmZ d d
lmZ d dlmZ ddlmZmZ ddlmZmZ ddl m!Z! ddl"m#Z#m$Z$m%Z%m&Z&  e jN                  d      Z( e jN                  d      Z) e jN                  d      Z*d Z+d Z, G d de      Z- G d de-      Z.y)    N)ABCabstractmethod)DictAnyOptional)BeautifulSoup)ThreadPoolExecutor   )*)elementNavigableStringComment)PageElementTag)urljoin)InvalidSchema)RelevantContentFilterBM25ContentFilter)MarkdownGenerationStrategyDefaultMarkdownGenerator)MarkdownGenerationResult)extract_metadatanormalize_urlis_external_urlget_base_domainz^og:z	^twitter:z
(\d+)(\D*)c                     | rJt         j                  |       }|r3t        |j                  d            }|j                  d      xs d}||fS y)Nr
      px)NN)DIMENSION_REGEXmatchintgroup)	dimensionr    numberunits       W/var/www/openai/venv/lib/python3.12/site-packages/crawl4ai/content_scraping_strategy.pyparse_dimensionr'      sI    %%i0Q(F;;q>)TD4<    c                    t        || j                  d            }	 t        j                  |      }|j                  dk(  r|j
                  j                  dd        y t        d|        	 y # t        $ r
}Y d }~y d }~ww xY w#  Y y xY w)Nsrc   zContent-Lengthz!Failed to retrieve file size for )r   getrequestsheadstatus_codeheadersprintr   )imgbase_urlimg_urlresponsees        r&   fetch_image_file_sizer7   *   s    hswwu~.G
==)3&##''(8> 	 5gY?@ 	   	s)   ?A. A. .	B7B <BB Bc            	       \    e Zd Zedededeeef   fd       Zedededeeef   fd       Zy)ContentScrapingStrategyurlhtmlreturnc                      y N selfr:   r;   kwargss       r&   scrapzContentScrapingStrategy.scrap:   s    r(   c                    K   y wr>   r?   r@   s       r&   ascrapzContentScrapingStrategy.ascrap>   s	     s   N)	__name__
__module____qualname__r   strr   r   rC   rE   r?   r(   r&   r9   r9   9   sb     C d38n    3 T#s(^  r(   r9   c                       e Zd ZdZddZddZdededeeef   fdZ	dededeeef   fd	Z
d
 Zd ZddZd Zdedeeef   fdZdedeeef   deeef   deeef   def
dZedfdededededeeef   f
dZy)WebScrapingStrategya  
    Class for web content scraping. Perhaps the most important class. 
    
    How it works:
    1. Extract content from HTML using BeautifulSoup.
    2. Clean the extracted content using a content cleaning strategy.
    3. Filter the cleaned content using a content filtering strategy.
    4. Generate markdown content from the filtered content.
    5. Return the markdown content.
    Nc                     || _         y r>   )logger)rA   rM   s     r&   __init__zWebScrapingStrategy.__init__N   s	    r(   c                 `    | j                   r"t        | j                   |      } |d||d| yy)z#Helper method to safely use logger.messagetagNr?   )rM   getattr)rA   levelrQ   rR   rB   
log_methods         r&   _logzWebScrapingStrategy._logQ   s0    ;; e4J:wC:6: r(   r:   r;   r<   c                 .     | j                   ||fddi|S )a  
        Main entry point for content scraping.  

        Args:
            url (str): The URL of the page to scrape.
            html (str): The HTML content of the page.
            **kwargs: Additional keyword arguments.

        Returns:
            Dict[str, Any]: A dictionary containing the scraped content. This dictionary contains the following keys:

            - 'markdown': The generated markdown content, type is str, however soon will become MarkdownGenerationResult via 'markdown.raw_markdown'.
            - 'fit_markdown': The generated markdown content with relevant content filtered, this will be removed soon and available in 'markdown.fit_markdown'.
            - 'fit_html': The HTML content with relevant content filtered, this will be removed soon and available in 'markdown.fit_html'.
            - 'markdown_v2': The generated markdown content with relevant content filtered, this is temporary and will be removed soon and replaced with 'markdown'
        is_asyncF)_scrapr@   s       r&   rC   zWebScrapingStrategy.scrapW   s!    " t{{3?u???r(   c                 b   K   t        j                  | j                  ||fi | d{   S 7 w)a  
        Main entry point for asynchronous content scraping.

        Args:
            url (str): The URL of the page to scrape.
            html (str): The HTML content of the page.
            **kwargs: Additional keyword arguments.

        Returns:
            Dict[str, Any]: A dictionary containing the scraped content. This dictionary contains the following keys:

            - 'markdown': The generated markdown content, type is str, however soon will become MarkdownGenerationResult via 'markdown.raw_markdown'.
            - 'fit_markdown': The generated markdown content with relevant content filtered, this will be removed soon and available in 'markdown.fit_markdown'.
            - 'fit_html': The HTML content with relevant content filtered, this will be removed soon and available in 'markdown.fit_html'.
            - 'markdown_v2': The generated markdown content with relevant content filtered, this is temporary and will be removed soon and replaced with 'markdown'
        N)asyncio	to_threadrY   r@   s       r&   rE   zWebScrapingStrategy.ascrapj   s,     " &&t{{CHHHHHs   &/-/c                    t        |t              r|S t        |j                        dk(  rat        |j                  d   t              rD|j                  d   j
                  |j
                  k(  r| j                  |j                  d         S |j                  D cg c]  }| j                  |       c}|_        |S c c}w )z
        Flatten nested elements in a HTML tree.

        Args:
            node (Tag): The root node of the HTML tree.

        Returns:
            Tag: The flattened HTML tree.
        r
   r   )
isinstancer   lencontentsr   nameflatten_nested_elements)rA   nodechilds      r&   rb   z+WebScrapingStrategy.flatten_nested_elements}   s     dO,Kt}}"z$--2BC'HT]][\M]MbMbfjfofoMo//a0@AAJN--X-55e<-X Ys   B<c                     |j                  dt              }|}|rB|j                  }|r1|j                  dd      }t	        |j                               |k\  r|S |rBy)a  
        Find the closest parent with useful text.

        Args:
            tag (Tag): The starting tag to search from.
            **kwargs: Additional keyword arguments.

        Returns:
            Tag: The closest parent with useful text, or None if not found.
        $image_description_min_word_threshold T)	separatorstripN)r,   $IMAGE_DESCRIPTION_MIN_WORD_THRESHOLDparentget_textr_   split)rA   rR   rB   rf   current_tagtext_contents         r&   $find_closest_parent_with_useful_textz8WebScrapingStrategy.find_closest_parent_with_useful_text   sp     06zz:`  cG  0H,%,,K*33c3M|))+,0TT''  r(   c                     g }|j                   D ]>  }||vs|r$|j                  d      r|j                  |       .|j                  |       @ |D ]  }||=  y)aS  
        Remove unwanted attributes from an HTML element.

        Args:    
            element (Tag): The HTML element to remove attributes from.
            important_attrs (list): List of important attributes to keep.
            keep_data_attributes (bool): Whether to keep data attributes.

        Returns:
            None
        data-N)attrs
startswithappend)rA   r   important_attrskeep_data_attributesattrs_to_removeattrs         r&   remove_unwanted_attributesz.WebScrapingStrategy.remove_unwanted_attributes   s]     MMD?*'??73'..t4#**40 " $D $r(   c                    !"#$ d }t        g d      t        ddg      }t        g d      !|j                  dd      }|j                  dd      |j                  d	d      #|j                  d
d      }	|j                  dd      }
|j                  dd      }|j                  d      }|j                  d      }|j                  }|j                  dg       }d|v sJ|j                  |v s<t	        fd|D              s(t	        #fdD              st	        fdD              ryd}|r'|j                         rt        |      }||dkD  rdndz  }|r'|j                         rt        |      }||dkD  rdndz  }r|dz  }|||z  dk  z  }!fd t	         fd#|	|
|fD              r|dz  }|
s|r|dz  }|j                  d      r|dz  }d}#|	|
|fD ]0  }|s!D cg c]  }||j                         v s| }}|s+|d   } n ||j                  dt              k  ryt               $g "|}|j                  dt              } | j                  |fi ||d||dd&"$fd 	} |#        ||	       d!D ]2  }|j                  |      x}s ||      D ]  } ||d"   |d           4 |j                  d      x}rF|j                  d#      D ]2  }|j                  d      x}
s ||
      D ]  # |#d"   #d           4 |j                  j                         D ]-  \  }}|j!                  d$      sd	|v sd|v s!d%|v s& ||       / "r"S dS c c}w )'a2  
        Process an image element.
        
        How it works:
        1. Check if the image has valid display and inside undesired html elements.
        2. Score an image for it's usefulness.
        3. Extract image file metadata to extract size and extension.
        4. Generate a dictionary with the processed image information.
        5. Return the processed image information.

        Args:
            img (Tag): The image element to process.
            url (str): The URL of the page containing the image.
            index (int): The index of the image in the list of images.
            total_images (int): The total number of images in the list.
            **kwargs: Additional keyword arguments.

        Returns:
            dict: A dictionary containing the processed image information.
        c                 (   | j                  d      D cg c]
  }|sd|  c}D cg c]Z  }|j                         j                         d   d|v r0|j                         j                         d   j                  d      nd d\ c}S c c}w c c}w )Nhttpr   rg   w)r:   width)rm   ri   rstrip)spus      r&   <lambda>z3WebScrapingStrategy.process_image.<locals>.<lambda>   s    67ggfo!KoD*o!K"M!KA +,'')//*;A*>!8 JKIZ[]I^IeIefiIj)-#/!K"M!K"Ms   B
B
AB)buttoniconlogor   input)jpgjpegpngwebpavifgifstyle altr*   data-srcsrcsetdata-srcsetr   heightclasszdisplay:nonec              3   4   K   | ]  }D ]  }||v  
  y wr>   r?   ).0cclsclasses_to_checks      r&   	<genexpr>z4WebScrapingStrategy.process_image.<locals>.<genexpr>   s      M.Q<LSS<L.s   c              3   &   K   | ]  }|v  
 y wr>   r?   )r   r   r*   s     r&   r   z4WebScrapingStrategy.process_image.<locals>.<genexpr>        3"2QS"2   c              3   &   K   | ]  }|v  
 y wr>   r?   )r   r   r   s     r&   r   z4WebScrapingStrategy.process_image.<locals>.<genexpr>   r   r   Nr      r
   g      ?c                 .     t         fdD              S )Nc              3   B   K   | ]  }|j                         v   y wr>   )lower)r   fmtr:   s     r&   r   zNWebScrapingStrategy.process_image.<locals>.has_image_format.<locals>.<genexpr>  s     C]csciik)]s   )any)r:   image_formatss   `r&   has_image_formatz;WebScrapingStrategy.process_image.<locals>.has_image_format  s    C]CCCr(   c              3   .   K   | ]  } |        y wr>   r?   )r   r:   r   s     r&   r   z4WebScrapingStrategy.process_image.<locals>.<genexpr>
  s     U0T$0Ts   pictureimage_score_thresholdrf   image)r   descscoretypegroup_idformatc                     | rA| j                  d      s/| vr*j                  |        j                  i | |d       y y y y )Nzdata:)r*   r   )rt   addru   )r*   r   	base_infoimage_variantsunique_urlss     r&   add_variantz6WebScrapingStrategy.process_image.<locals>.add_variant0  sJ    3>>'2s+7M$%%&O&O3&OP 8N2sr(   )r   r   r:   sourcerr   r}   r>   )	frozensetr,   rk   ra   r   isdigitr!   find_parentr   IMAGE_SCORE_THRESHOLDsetrj   rp   find_allrs   itemsrt   )%rA   r2   r:   indextotal_imagesrB   parse_srcsettags_to_checkr   data_srcr   data_srcsetr   r   rk   parent_classesr   	width_val
height_valdetected_formatr   format_matchesr   rf   r   ry   valuer   r   r   r   r   r   r   r   r*   r   s%                                @@@@@@@@r&   process_imagez!WebScrapingStrategy.process_image   s   *M
 %%?@!8W"56!"OP $ggeR ggeR 77:r*2&ggmR0 "GR0 e#KK=(M.MM3"2333"233 U]]_E
I)c/Qq0Efnn&VJ*s*Q1EQJE|#c))	D Uh0TUUQJE[QJE??9%QJE 6;7C1>!U##BT#!U!&4Q&7O 8 FJJ68MNN e  06zz:`  cG  0H,=D==cLVL %
		Q 	CH .D%u%*51Fuvg? 2 . ooi0070!**84#ZZ1161+F3#CJG=  4 5 99??,KD%w'Ud]h$>NTZ^cTcE" - "0~9T9m "Vs   'M ?M r   c                 P    g g g d}i }i } | j                   |||||fi | |||dS )aD  
        Process an HTML element.
        
        How it works:
        1. Check if the element is an image, video, or audio.
        2. Extract the element's attributes and content.
        3. Process the element based on its type.
        4. Return the processed element information.

        Args:
            url (str): The URL of the page containing the element.
            element (Tag): The HTML element to process.
            **kwargs: Additional keyword arguments.

        Returns:
            dict: A dictionary containing the processed element information.
        imagesvideosaudios)mediainternal_links_dictexternal_links_dict)_process_element)rA   r:   r   rB   r   r   r   s          r&   process_elementz#WebScrapingStrategy.process_elementM  sZ    $ r:  	
 	
 #6#6
 	
r(   r   r   r   c                 f   	 t        |t              r"t        |t              r |j                          y|j	                  dt        |            }|j                  dv r |j                          yd}|j	                  dg       }		 |j                  dk(  r |j                  d      r |j                  dd      j                         }
|
sy|j                  d      d	   }	 t        |
|      }| |j                         j                          |j                  dd      j                         |d}t        ||      }d}|rLt        |      }||d<   |j	                  dd      r |j                          y|	r||	v r |j                          y|r
||vr|||<   n	||vr|||<   	 |j                  dk(  rg d} |j                  dd      }|s)|r' |j                  |j!                  d      d      }|s|r'|s |j                          yd|j"                  v r3|j"                  d   j                  d      d   j                  d      d   }t        ||      syt        |      }|j	                  dd      r |j                          y|	r||	v r |j                          yy	 |j	                  dd      r!|j                  dk(  r |j                          y|j                  dv r||j                   d   j%                   |j                  d       |j                  d      |j                   | j&                  |fi |d        |j(                  d      }|D ]b  }||j                   d   j%                  |j	                  d       |j                  d      |j                   | j&                  |fi |d       d y|j                  t*        v r3|j	                  d d      r! |j,                   |j                                	 | j/                  |t0        |j	                  d!d             t5        |j6                        D ]\  }t        |t              r0t        |t              s t9        |j                               dkD  s@d}C | j:                  |||||fi |s[d}^ |j	                  d&t<              }|s/t9         |j                  d'      j                               }||k\  }|s |j                          |S # t        $ r
}Y d
}~yd
}~ww xY w# t        $ r}t        dt        |             d
}~ww xY w# t        $ r}dd
}~ww xY w# t        $ r+}| j3                  d"d#d$d"t        |      i%       Y d
}~Pd
}~ww xY w# t        $ r*}| j3                  d"d(d$d"t        |      i%       Y d
}~yd
}~ww xY w))z2
        Process an HTML element.        
        Fbase_domain)scriptr   linkmetanoscriptexclude_domainsahrefr   /r   Ntitle)r   textr   r   Texclude_external_linkszError processing links: r2   )r*   r   zsrcsetdata-lazy-srczdata-originalr*   r   r   ,rg   exclude_external_imageszError processing imagesremove_formsform)videoaudior   r   )r*   r   r   descriptionr   	only_textrw   errorz+Error removing unwanted attributes: {error}SCRAPErQ   rR   paramsword_count_threshold)ri   z!Error processing element: {error})r^   r   r   extractr,   r   ra   	decomposeri   rm   r   
ValueErrorrl   r   	ExceptionrI   poprs   ru   rp   r   ONLY_TEXT_ELIGIBLE_TAGSreplace_withrz   IMPORTANT_ATTRSrV   listchildrenr_   r   MIN_WORD_THRESHOLD)rA   r:   r   r   r   r   rB   r   keep_elementr   r   url_basenormalized_hrefr6   	link_datais_externallink_base_domainpotential_sourcesr*   image_src_base_domainsource_tags
source_tagrd   r   
word_counts                            r&   r   z$WebScrapingStrategy._process_elementp  s   E	'?3gw/#GOO%
 !**]OC4HIK||NN!!!# L$jj):B?O
9E<<3&;7;;v+>&7;;vr288:D$"yy~a0H%*7c*B !0 0 0 0 2 8 8 :!,Wb!9!?!?!A'2	!I #2/;"OK#'L #+:?+K(3C	-0!::&>F-G--/#( -/?B 1 1 1 3',
 #*2EECL/@*2EECL/@30<<5((f%%'++eR0C!&7)gkk*;*?*?*BBG "&7)))+$  7==0%mmH5;;C@CII#NqQ +3<#,;C,@) zz";UC)))+$$ '0OC-G--/#(
  a )l zz.%0W\\V5K!!!#||11a()00&7;;u-&7;;u-#LL#L4#L#LW#_X^#_	2  /g..x8"-JW\\N!,-44%>>%0&7;;u-#LL#L4#L#LW#_X^#_	6  #. ||66::k51(G(()9)9)9);<//&**UkmrJst g../e_5jPW>X5;;=)A-'+,t,,S%@SUhslrs'+ 0 $*::.DFX#Y  !1!1!1!=!C!C!EF
)-AA!!!#y & %$%Z  E":3q6( CDDEj  0//0@  		'I #SV,   6  	IIg;Q(  
 	s   1U= :U= /U= AT 
T S5 +BT 2T 
T  AT3 1T3 AT3 .T3 T3 3U= C(U= 8AU= >'U %AU= :U= AU= 5	T>T TT 	T0T++T00U= 3	U<T>>UU= 	U: U5/U= 5U::U= =	V0 V++V0r   css_selectorc           
          d}|sy|j                  dd      }t        ||      }|j                  }	t              }
	 t	        d|      }t        |j                  d
g       xs g       r*|	j                  fd      D ]  } |j                           |j                  dd      }|rld|vxr d|v}|r9|	j                  |      x}rM |j                          |	j                  |      x}r%n'|	j                  |      D ]  } |j                           |rP|	j                  |      }|sdddg g g dg g di d| dS |j                  d      }	|D ]  }|	j                  |        t        |j                  dg       t        z         |d<   t        |j                  dg             |d<   |j                  dd      r|d   j!                  |d         |d<     j"                  |	f||
d|}g g d}|d   }|d   }|d   }t%        |j'                               |d<   t%        |j'                               |d<   |	j                  d       fdt)              D        D cg c]  }|	|D ]  }|  c}}|d <    j+                  |	      }	t-        j.                  d!      }D ];  }|j                  d"d      }|j1                  |      s'|j3                  d|      |d"<   = d}	 |	j5                         j7                  d#      }|j=                  d+d,      j=                  d-d      }|||||d.S # t
        $ r-} j                  ddddt        |      i	       i }Y d}~d}~ww xY wc c}}w # t
        $ r}d}t        |d$      }	|	j                  dd%&      }d'|_        |	j                  j                  |       |	j5                         j7                  d#      }t;        d(        j                  dd)d*       Y d}~d}~ww xY w)/a  
        Extract content from HTML using BeautifulSoup.

        Args:
            url (str): The URL of the page to scrape.
            html (str): The HTML content of the page to scrape.
            word_count_threshold (int): The minimum word count threshold for content extraction.
            css_selector (str): The CSS selector to use for content extraction.
            **kwargs: Additional keyword arguments.

        Returns:
            dict: A dictionary containing the extracted content.
        TNparserlxmlr   r   z"Error extracting metadata: {error}r   r   excluded_tagsc                      | j                   v S r>   )ra   )rR   r  s    r&   r   z,WebScrapingStrategy._scrap.<locals>.<lambda>_  s    SXX5Nr(   excluded_selectorr   rg   r   )internalexternalz$No elements found for CSS selector: )markdowncleaned_htmlsuccessr   linksmetadatarQ   divexclude_social_media_domainsr   exclude_social_media_linksF)r   r   r   r   r   r  r  r2   c           	   3   ^   K   | ]$  \  }}j                  ||t                     & y wr>   )r   r_   )r   ir2   imgsrA   r:   s      r&   r   z-WebScrapingStrategy._scrap.<locals>.<genexpr>  s2      ;*93  $11#sAs4yI*9s   *-r   zdata:image/[^;]+;base64,([^"]+)r*   zutf-8zhtml.parsercrawl4ai_error_message)ida8  
            Crawl4AI Error: This page is not fully supported.
            
            Possible reasons:
            1. The page may have restrictions that prevent crawling.
            2. The page might not be fully loaded.
            
            Suggestions:
            - Try calling the crawl function with these parameters:
            magic=True,
            - Set headless=False to visualize what's happening on the page.
            
            If the issue persists, please check the page's structure and any potential anti-crawling measures.
            u   [LOG] 😧 Error: After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details.zAfter processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details.rP   z


z  )r  r  r   r  r  )r,   r   bodyr   r   r   rV   rI   r   r   r   
select_oneselectnew_tagru   SOCIAL_MEDIA_DOMAINSunionr   r   values	enumeraterb   recompiler    subencode_contentsdecodestringr1   replace) rA   r:   r;   r   r  rB   r  parser_typesoupr  r   r   r6   r   r  is_single_selectorselected_elementsel
result_objr  r   r   r   resultr2   base64_patternr*   str_body	error_divr  r  r  s    ``                            @@r&   rY   zWebScrapingStrategy._scrap;  s    jj62T;/yy%c*	#B-D FJJ;ArB==)NO! P #JJ':B?!$,=!=!^#M^B^!!%1B!CCgC#GOO% "&1B!CCgC  ${{+<=G#GOO%  >  $L 9$ "$&#(*bBG*,"= "!El^T  <<&D'B ( 25VZZ@^`b5cfz5z1{-.$'

3Db(I$J !::2E:(./@(A(G(GOmHn(oF$%)T))
 $8#	

 

  R07#()>?()>? !!4!;!;!=>j !4!;!;!=>j }}U#;*3D/;
;F!    ;
h ++D1$FGC''%$C##C(+//C8E
	   	++-44W=HB  ''5==dCH
 )
 	
q  	IIg<Q(  
 D	F
$  	G }5D U/GHI I  IIY'++-44W=H  l  nIIg `   7	s7    L0 %M)'M/ 0	M&9"M!!M&/	P8BO>>Pr>   )r   )F)rF   rG   rH   __doc__rN   rV   rI   r   r   rC   rE   rb   rp   rz   r   r   r   boolr   r   r!   rY   r?   r(   r&   rK   rK   B   sT   	;@ @C @d38n @&I I3 IT#s(^ I&".0N:`!
K !
d3PS8n !
FI[ I$sCx. Igkloqtltgu I  MQ  RU  WZ  RZ  M[ I  jn IV GYnr X
# X
S X
 X
hk X
  BF  GJ  LO  GO  BP X
r(   rK   )/r'  timeabcr   r   typingr   r   r   bs4r   concurrent.futuresr	   r[   r-   osconfigr   r   r   r   r   urllib.parser   requests.exceptionsr   content_filter_strategyr   r   markdown_generation_strategyr   r   modelsr   utilsr   r   r   r   r(  OG_REGEXTWITTER_REGEXr   r'   r7   r9   rK   r?   r(   r&   <module>rI     s    	  # & &  1      1 1     - M ^ ,  2::g

<("**]+c Q

1 Q

r(   