
    gT                     h   d dl mZmZ d dlmZmZmZmZmZ d dl	m
Z
mZ d dlZd dlZddl ddl ddl ddl d dlmZ ddl d dlZd dlZd dlZd dlmZ d d	lmZmZ d d
lmZ  G d de      Z  G d de       Z! G d de       Z" G d de       Z# G d de       Z$ G d de$      Z% G d de$      Z&y)    )ABCabstractmethod)AnyListDictOptionalUnion)ThreadPoolExecutoras_completedN   )*)partialBeautifulSoup)htmletree)	dataclassc            
       v    e Zd ZdZddefdZedededeeee	f      fd       Z
dedee   deeee	f      fd	Zy
)ExtractionStrategyz<
    Abstract base class for all extraction strategies.
    input_formatc                     || _         d| _        | j                  j                  | _        |j                  dd      | _        y)a  
        Initialize the extraction strategy.

        Args:
            input_format: Content format to use for extraction.
                         Options: "markdown" (default), "html", "fit_markdown"
            **kwargs: Additional keyword arguments
        z<|DEL|>verboseFN)r   DEL	__class____name__namegetr   )selfr   kwargss      Q/var/www/openai/venv/lib/python3.12/site-packages/crawl4ai/extraction_strategy.py__init__zExtractionStrategy.__init__   s6     )NN++	zz)U3    urlr   returnc                      y)z
        Extract meaningful blocks or chunks from the given HTML.

        :param url: The URL of the webpage.
        :param html: The HTML content of the webpage.
        :return: A list of extracted blocks or chunks.
        N r   r#   r   qr   s        r    extractzExtractionStrategy.extract&   s     	r"   sectionsc           	         g }t               5 }|D cg c]!  } |j                  | j                  ||fi |# }}t        |      D ]!  }	|j	                  |	j                                # 	 ddd       |S c c}w # 1 sw Y   |S xY w)z
        Process sections of text in parallel by default.

        :param url: The URL of the webpage.
        :param sections: List of sections (strings) to process.
        :return: A list of processed JSON blocks.
        N)r
   submitr)   r   extendresult)
r   r#   r*   r(   r   extracted_contentexecutorsectionfuturesfutures
             r    runzExtractionStrategy.run1   s     !X\de\dQXxt||S'LVL\dGe&w/!((9 0 " !  f " ! s   A9&A41A94A99BN)markdown)r   
__module____qualname____doc__strr!   r   r   r   r   r)   r4   r&   r"   r    r   r      su    4S 4 3 c Dc3h<P  !s !d3i !$tCQTH~BV !r"   r   c            	       ^    e Zd ZdZdededeeeef      fdZdedee   deeeef      fdZ	y)	NoExtractionStrategyz
    A strategy that does not extract any meaningful content from the HTML. It simply returns the entire HTML as a single block.
    r#   r   r$   c                     d|dgS )zJ
        Extract meaningful blocks or chunks from the given HTML.
        r   )indexcontentr&   r'   s        r    r)   zNoExtractionStrategy.extractD   s     -..r"   r*   c                 R    t        |      D cg c]  \  }}|g |d c}}S c c}}w )Nr=   tagsr>   )	enumerate)r   r#   r*   r(   r   ir1   s          r    r4   zNoExtractionStrategy.runJ   s-    OXYaObcObG!RG<Obcccs   #N)
r   r6   r7   r8   r9   r   r   r   r)   r4   r&   r"   r    r;   r;   @   s^    /3 /c /Dc3h<P /ds dd3i d$tCQTH~BV dr"   r;   c                        e Zd ZdZeddddfdedee   dedef fdZd	ed
e	dede
eeef      fdZd Zd	ede
e   de
eeef      fdZddZ xZS )LLMExtractionStrategya  
    A strategy that uses an LLM to extract meaningful content from the HTML.
    
    Attributes:
        provider: The provider to use for extraction. It follows the format <provider_name>/<model_name>, e.g., "ollama/llama3.3".
        api_token: The API token for the provider.
        instruction: The instruction to use for the LLM model.  
        schema: Pydantic model schema for structured data.
        extraction_type: "block" or "schema".
        chunk_token_threshold: Maximum tokens per chunk.
        overlap_rate: Overlap between chunks.
        word_token_rate: Word to token conversion rate.
        apply_chunking: Whether to apply chunking.
        base_url: The base URL for the API request.
        api_base: The base URL for the API request.
        extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc.
        verbose: Whether to print verbose output.
        usages: List of individual token usages.
        total_usage: Accumulated token usage.
    Nblockprovider	api_tokeninstructionschemac                    t        |   di | || _        |xs- t        j	                  |d      xs t
        j                  d      | _        || _        || _	        || _
        |rd| _	        |j	                  dt              | _        |j	                  dt              | _        |j	                  dt              | _        |j	                  dd      | _        |j	                  d	d
      | _        |j	                  d|j	                  d	d
            | _        |j	                  di       | _        | j"                  sd| _        |j	                  dd      | _        g | _        t/               | _        | j                  st3        d      y
)a*  
        Initialize the strategy with clustering parameters.
        
        Args:
            provider: The provider to use for extraction. It follows the format <provider_name>/<model_name>, e.g., "ollama/llama3.3".
            api_token: The API token for the provider.
            instruction: The instruction to use for the LLM model.  
            schema: Pydantic model schema for structured data.
            extraction_type: "block" or "schema".
            chunk_token_threshold: Maximum tokens per chunk.
            overlap_rate: Overlap between chunks.
            word_token_rate: Word to token conversion rate.
            apply_chunking: Whether to apply chunking.
            base_url: The base URL for the API request.
            api_base: The base URL for the API request.
            extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc.
            verbose: Whether to print verbose output.
            usages: List of individual token usages.
            total_usage: Accumulated token usage.   

        zno-tokenOPENAI_API_KEYrJ   chunk_token_thresholdoverlap_rateword_token_rateapply_chunkingTbase_urlNapi_base
extra_argsg    eAr   FzvAPI token must be provided for LLMExtractionStrategy. Update the config.py or set OPENAI_API_KEY environment variable.r&   )superr!   rG   PROVIDER_MODELSr   osgetenvrH   rI   extract_typerJ   CHUNK_TOKEN_THRESHOLDrM   OVERLAP_RATErN   WORD_TOKEN_RATErO   rP   rQ   rR   rS   r   usages
TokenUsagetotal_usage
ValueError)r   rG   rH   rI   rJ   extraction_typer   r   s          r    r!   zLLMExtractionStrategy.__init__f   sO   0 	"6" "no&9&9(J&OnSUS\S\]mSn&+ (D%+ZZ0GI^%_""JJ~|D%zz*;_M$jj)94@

:t4

:vzz*d/KL **\26""),D&zz)U3%<~~  V  W  W r"   r#   ixr   r$   c                 d   | j                   rt        d| d|        |t        t        |            d}t        }| j
                  r| j
                  |d<   t        }| j                  dk(  r6| j                  r*t        j                  | j                  d      |d<   t        }|D ]  }|j                  d	|z   d
z   ||         } t        | j                  || j                  | j                   xs | j"                  | j$                        }t'        |j(                  j*                  |j(                  j,                  |j(                  j.                  |j(                  j0                  r |j(                  j0                  j2                  ni |j(                  j4                  r |j(                  j4                  j2                  ni       }| j6                  j9                  |       | j:                  xj*                  |j*                  z  c_        | j:                  xj,                  |j,                  z  c_        | j:                  xj.                  |j.                  z  c_        	 t=        dg|j>                  d   j@                  jB                        d   }	t        jD                  |	      }	|	D ]  }
d|
d<   	 	 | j                   rt        dtK        |	      d|d|       |	S # tF        $ rT}tI        |j>                  d   j@                  jB                        \  }}|}	|r|	j9                  dddg|d       Y d}~d}~ww xY w)a  
        Extract meaningful blocks or chunks from the given HTML using an LLM.
        
        How it works:
        1. Construct a prompt with variables.
        2. Make a request to the LLM using the prompt.
        3. Parse the response and extract blocks or chunks.
        
        Args:
            url: The URL of the webpage.
            ix: Index of the block.
            html: The HTML content of the webpage.
            
        Returns:
            A list of extracted blocks or chunks.
        z[LOG] Call LLM for z - block index: )URLHTMLREQUESTrJ      )indentSCHEMA{})rQ   rS   )completion_tokensprompt_tokenstotal_tokenscompletion_tokens_detailsprompt_tokens_detailsblocksr   FerrorTr=   rq   rA   r>   Nz[LOG] Extractedzblocks from URL:zblock index:)&r   printescape_json_stringsanitize_htmlPROMPT_EXTRACT_BLOCKSrI   &PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTIONrX   rJ   jsondumps&PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTIONreplaceperform_completion_with_backoffrG   rH   rR   rQ   rS   r]   usagerk   rl   rm   rn   __dict__ro   r\   appendr^   extract_xml_datachoicesmessager>   loads	Exceptionsplit_and_parse_json_objectslen)r   r#   ra   r   variable_valuesprompt_with_variablesvariableresponser}   rp   rF   eparsedunparseds                 r    r)   zLLMExtractionStrategy.extract   s   " <<'u,<RDAB &}T':;

 !6)-)9)9OI&$J!(T[[(,

4;;q(IOH%$J!'H$9$A$Ah$oh&?%! (
 3MM!NN]]3dmm &nn>>"..66!44[c[i[i  \D  \Dhnn&N&N&W&W  JLS[SaSaSwSw(.."F"F"O"O}
 	5! 	**e.E.EE*&&%*=*==&%%););;%	%xj(2B2B12E2M2M2U2UVW_`FZZ'F!&g   <<#S[2Dc>[]^  		;H<L<LQ<O<W<W<_<_`FHF!$I'	 			s   AK 	L/A
L**L/c                 p   g }g }d}|D ],  }|t        |j                  d            | j                  z  z  }. t        j                  ||z        }|dk  rd}||z  }	d}
g }|D ]  }|j                  d      }t        |      | j                  z  }|
|z   |	k  r|j                  |       |
|z  }
Kt        |      |dz
  k(  r|j                  |       n|dkD  r|r|| d }|j                  |       |j                  dj                  |             |}|}
 |r |j                  dj                  |             |S )z[
        Merge documents into sections based on chunk_token_threshold and overlap.
        r    r   N)r   splitrO   mathfloorr-   r   join)r   	documentsrM   overlapchunksr*   rm   documentnum_sectionsadjusted_chunk_thresholdtotal_token_so_farcurrent_chunktokenstoken_countoverlap_tokenss                  r    _mergezLLMExtractionStrategy._merge   sX     "HCs 34t7K7KKKL " zz,1F"FG!L#/,#> !H^^C(Ff+(<(<<K!K/3KK$$V,"k1" x=L1$44!((0 Q;=%2G89%=N!((8 78 &%0") ". OOCHH]34r"   r*   c                 j   | j                  || j                  t        | j                  | j                  z              }g }| j                  j                  d      rat        |      D ]Q  \  }}t        | j                  |      }|j                   ||t        |                   t        j                  d       S |S t        d      5 }t        | j                  |      }t        |      D cg c]!  \  }}|j                  ||t        |            # }	}}t        |	      D ]"  }
	 |j                  |
j!                                $ 	 ddd       |S c c}}w # t"        $ rD}| j$                  rt'        d|        |j)                  ddd	gt+        |      d
       Y d}~~d}~ww xY w# 1 sw Y   |S xY w)aK  
        Process sections sequentially with a delay for rate limiting issues, specifically for LLMExtractionStrategy.
        
        Args:
            url: The URL of the webpage.
            sections: List of sections (strings) to process.
            
        Returns:
            A list of extracted blocks or chunks.
        )r   zgroq/g      ?   )max_workerszError in thread execution: r   Trq   rr   N)r   rM   intrN   rG   
startswithrB   r   r)   r-   sanitize_input_encodetimesleepr
   r,   r   r.   r   r   rs   r   r9   )r   r#   r*   merged_sectionsr/   ra   r1   extract_funcr0   r2   r3   r   s               r    r4   zLLMExtractionStrategy.run  s    ++d0033d6G6GGH & 
 ==##G,(9G&t||S9!((b:OPW:X)YZ

3  :: ! ' $2h&t||S9py  {J  qK  L  qKalacel8??<=RSZ=[\  qK  L*73F)00A 4	 3& ! # L
 % 	<<!$?s"CD)00%&%)%,I'*1v	2  		 3& ! sH   	%F(.&EF(&EF(F(	F%!:F F( F%%F((F2c                 D   t        d       t        ddddd       t        d       t        ddd| j                  j                  d	       t        d
dd| j                  j                  d	       t        ddd| j                  j                  d	       t        d       t        ddddddd
dddd       t        d       t        | j                  d      D ]=  \  }}t        |dd|j                  d	d|j                  d	d|j                  d	       ? y)zHPrint a detailed token usage report showing total and per-request usage.z
=== Token Usage Summary ===Typez<15r   Countz>12z------------------------------
Completionz>12,PromptTotalz
=== Usage History ===z	Request #z<10z0------------------------------------------------r   N)rs   r^   rk   rl   rm   rB   r\   )r   rC   r}   s      r    
show_usagez LLMExtractionStrategy.show_usageP  s(   -.Agc]+,hc"!D$4$4$F$Ft#LMN#a 0 0 > >tDEFQt//<<TBCD'(S!<"4Ahs^1WSMRSh!$++q1HAuQsG1U44T:!E<O<OPT;UUVW\WiWijnVopq 2r"   )r$   N)r   r6   r7   r8   DEFAULT_PROVIDERr9   r   r   r!   r   r   r   r)   r   r4   r   __classcell__r   s   @r    rE   rE   P   s    , "2d#'tw0W0W>Fsm0W 0W040WfP3 P3 Pc Pd4S>6J Pd/d0!s 0!d3i 0!Dc3h4H 0!frr"   rE   c            
            e Zd ZdZd fd	Zddee   dededee   fdZddee   fd	Z	ddee   fd
Z
deeee   f   deeee   f   fdZdededeeeef      fdZdedee   deeeef      fdZ xZS )CosineStrategya  
    Extract meaningful blocks or chunks from the given HTML using cosine similarity.
    
    How it works:
    1. Pre-filter documents using embeddings and semantic_filter.
    2. Perform clustering using cosine similarity.
    3. Organize texts by their cluster labels, retaining order.
    4. Filter clusters by word count.
    5. Extract meaningful blocks or chunks from the filtered clusters.
    
    Attributes:
        semantic_filter (str): A keyword filter for document filtering.
        word_count_threshold (int): Minimum number of words per cluster.
        max_dist (float): The maximum cophenetic distance on the dendrogram to form clusters.
        linkage_method (str): The linkage method for hierarchical clustering.
        top_k (int): Number of top categories to extract.
        model_name (str): The name of the sentence-transformers model.
        sim_threshold (float): The similarity threshold for clustering.
    c                    t        |   di | ddl}	|| _        || _        || _        || _        || _        || _        t        j                         | _
        |j                  dd      | _         |	j                  g       | _        d| _        t!               | _        t%        | j"                        | _        | j                  r#t)        d| j"                  j*                   d       t-        |      \  | _        | _        | j0                  j3                  | j"                         | j0                  j5                          d| _         |	j                  g       | _        | j                  r#t)        d	| j"                  j*                   d       t7               \  | _        }
| j                  r>t)        d
| dt;        t        j                         | j                  z
        z   dz          yy)a  
        Initialize the strategy with clustering parameters.

        Args:
            semantic_filter (str): A keyword filter for document filtering.
            word_count_threshold (int): Minimum number of words per cluster.
            max_dist (float): The maximum cophenetic distance on the dendrogram to form clusters.
            linkage_method (str): The linkage method for hierarchical clustering.
            top_k (int): Number of top categories to extract.
        r   Nr   Fdirectz#[LOG] Loading Extraction Model for z device.batchz([LOG] Loading Multilabel Classifier for z[LOG] Model loaded z, models/reuters, took  secondsr&   )rT   r!   numpysemantic_filterword_count_thresholdmax_distlinkage_methodtop_ksim_thresholdr   timerr   r   arraybuffer_embeddingsget_embedding_method
get_devicedevicecalculate_batch_sizedefault_batch_sizers   typeload_HF_embedding_model	tokenizermodeltoevalload_text_multilabel_classifiernlpr9   )r   r   r   r   r   r   
model_namer   r   np_r   s              r    r!   zCosineStrategy.__init__w  s    	"6".$8! ,
*YY[
zz)U3!)"$,! l #7t{{"C<<78H8H7IRS &=Z%H"


dkk"

$+!!)"  <<<T[[=M=M<NhWX57! <<'
|3JKcRVR[R[R]`d`j`jRjNkknxxy r"   r   r   
at_least_kr$   c                 d   |s|S t        |      |k  rt        |      dz  }ddlm} | j                  |g      d   }| j                  |      } ||g|      j	                         }t        ||      D 	cg c]  \  }}	|	| j                  k\  s||	f }
}}	t        |
      |k  rdt        ||      D 	cg c]  \  }}	|	| j                  k  s||	f }}}	|j                  d d       |
j                  |d|t        |
      z
          |
D cg c]  \  }}|	 }
}}|
d| S c c}	}w c c}	}w c c}}w )a  
        Filter and sort documents based on the cosine similarity of their embeddings with the semantic_filter embedding.

        Args:
            documents (List[str]): A list of document texts.
            semantic_filter (str): A keyword filter for document filtering.
            at_least_k (int): The minimum number of documents to return.

        Returns:
            List[str]: A list of filtered and sorted document texts.
        rf   r   )cosine_similarityc                     | d   S )Nr   r&   )xs    r    <lambda>z<CosineStrategy.filter_documents_embeddings.<locals>.<lambda>  s    adr"   T)keyreverseN)	r   sklearn.metrics.pairwiser   get_embeddingsflattenzipr   sortr-   )r   r   r   r   r   query_embeddingdocument_embeddingssimilaritiesdocsimfiltered_docsremaining_docsr   s                r    filter_documents_embeddingsz*CosineStrategy.filter_documents_embeddings  sV    y>J&Y1,J> --.?@C #11)< )/):<OPXXZ 58	<4Pn4PSTW[_[m[mTm#s4Pn }
*9<Y9Ur9UXS#Y\_c_q_qYqsCj9UNrNDA  0Pc->P1P!QR ,99=a=9[j)) o s
 :s   4D D 2D&
D&D,	sentencesc           	         | j                   j                  dv r"ddl}|| j                  }g }t	        dt        |      |      D ]  }||||z    }| j                  |ddd      }|j                         D 	
ci c]!  \  }	}
|	|
j                  | j                         # }}	}
|j                         5   | j                  d
i |}ddd       j                  j                  d      j                         j                         }|j                  |        t!        j"                  |      | _        | j$                  S | j                   j                  d	k(  ro|| j                  }g }t	        dt        |      |      D ],  }||||z    }| j                  |      }|j                  |       . t!        j"                  |      | _        | j$                  S c c}
}	w # 1 sw Y   xY w)z
        Get BERT embeddings for a list of sentences.

        Args:
            sentences (List[str]): A list of text chunks (sentences).

        Returns:
            NumPy array of embeddings.
        )cpugpucudampsr   NTpt)padding
truncationreturn_tensorsr   )dimr   r&   )r   r   torchr   ranger   r   itemsr   no_gradr   last_hidden_statemeanr   r   r   r   vstackr   )r   r   
batch_sizebypass_bufferr   all_embeddingsrC   batch_sentencesencoded_inputr   tensormodel_output
embeddingss                r    r   zCosineStrategy.get_embeddings  s    ;;==!!44
N1c)nj9"+Aa*n"= $Y]nr sP]PcPcPe fPeffii&<!<Pe f ]]_#-4::#>#>L % *;;@@Q@GKKMSSU
%%j1 : &(YY~%>D" %%% [[&!!44
N1c)nj9"+Aa*n"=!ZZ8
%%j1 :
 &(YY~%>D"%%%/ !g %_s   7&G0GG 	c                     ddl m}m} ddlm} t        j
                         | _        | j                  |d      } ||d      } ||| j                        } ||| j                  d	      }|S )
z
        Perform hierarchical clustering on sentences and return cluster labels.

        Args:
            sentences (List[str]): A list of text chunks (sentences).

        Returns:
            NumPy array of cluster labels.
        r   )linkagefcluster)pdistT)r   cosine)methoddistance)	criterion)
scipy.cluster.hierarchyr  r  scipy.spatial.distancer  r   r   r   r   r   )	r   r   r  r  r  r  distance_matrixlinkedlabelss	            r    hierarchical_clusteringz&CosineStrategy.hierarchical_clustering  sc     	>0YY[
(($(G
  
H51D1DE&$--:Fr"   clustersc                     i }|j                         D ]D  \  }}dj                  |      }t        |j                               }|| j                  k\  s@|||<   F |S )a  
        Filter clusters to remove those with a word count below the threshold.

        Args:
            clusters (Dict[int, List[str]]): Dictionary of clusters.

        Returns:
            Dict[int, List[str]]: Filtered dictionary of clusters.
        r   )r   r   r   r   r   )r   r  filtered_clusters
cluster_idtexts	full_text
word_counts          r    filter_clusters_by_word_countz,CosineStrategy.filter_clusters_by_word_count4  sa     !)!1JIY__./J T66605!*- "2 ! r"   r#   r   c           	      V   t        j                          } |j                  | j                        }| j                  || j                        }|sg S | j                  |      }t        j                          }i }t        |      D ])  \  }	}
|j                  |
g       j                  ||	          + | j                  |      }t        |      D cg c]#  }t        |      g dj                  ||         d% }}| j                  rt        d| j                          | j                  j                   dv r;| j#                  |D cg c]  }|d   	 c}      }t%        ||      D ]
  \  }}
|
|d<    | j                  r%t        dt        j                          |z
  dd	       |S c c}w c c}w )
a  
        Extract clusters from HTML content using hierarchical clustering.

        Args:
            url (str): The URL of the webpage.
            html (str): The HTML content of the webpage.

        Returns:
            List[Dict[str, Any]]: A list of processed JSON blocks.
        r   r@   u   [LOG] 🚀 Assign tags using )r   r   r   r   r>   rA   u"   [LOG] 🚀 Categorization done in z.2fr   )r   r   r   r   r   r  rB   
setdefaultr   r  sortedr   r   r   rs   r   r   r   r   )r   r#   r   r(   r   ttext_chunksr  r  r=   labelr  idxcluster_listclusters                  r    r)   zCosineStrategy.extractK  s    IIK djj* 66{DDXDXYI --k: IIK%f-LE5r*11+e2DE . !>>xH rx  yJ  rK  L  rKjm#c(RCHHM^_bMcDde  rK  L<<1$++?@;;<<XXNgwy1NOF"%lF";"' #<& <<6tyy{Qs6K8TU= L Os   (F!F&r*   c                 \     | j                   || j                  j                  |      fi |S )z
        Process sections using hierarchical clustering.

        Args:
            url (str): The URL of the webpage.
            sections (List[str]): List of sections (strings) to process.

        Returns:
        )r)   r   r   )r   r#   r*   r(   r   s        r    r4   zCosineStrategy.run  s)     t||Cx!8CFCCr"   )N
   g?ward   z&sentence-transformers/all-MiniLM-L6-v2g333333?)   )NFN)r   r6   r7   r8   r!   r   r9   r   r   r   r  r   r  r   r)   r4   r   r   s   @r    r   r   c  s    &GzR**T#Y **QT **be **ostwox **X.&S	 .&`c 0!d3S	>6J !tTWY]^aYbTbOc !.A3 Ac ADc3h<P AFDs Dd3i D$tCQTH~BV Dr"   r   c            	       ,    e Zd ZdZdZdeeef   f fdZdedede	eeef      fdZ
edefd	       Zed
efd       Zed
efd       Zd Zd Zd Zd Zd Zd Zdede	e   de	eeef      fdZedefd       Zedefd       Zedefd       Z xZS )JsonElementExtractionStrategya  
    Abstract base class for extracting structured JSON from HTML content.

    How it works:
    1. Parses HTML content using the `_parse_html` method.
    2. Uses a schema to define base selectors, fields, and transformations.
    3. Extracts data hierarchically, supporting nested fields and lists.
    4. Handles computed fields with expressions or functions.

    Attributes:
        DEL (str): Delimiter used to combine HTML sections. Defaults to '
'.
        schema (Dict[str, Any]): The schema defining the extraction rules.
        verbose (bool): Enables verbose logging for debugging purposes.

    Methods:
        extract(url, html_content, *q, **kwargs): Extracts structured data from HTML content.
        _extract_item(element, fields): Extracts fields from a single element.
        _extract_single_field(element, field): Extracts a single field based on its type.
        _apply_transform(value, transform): Applies a transformation to a value.
        _compute_field(item, field): Computes a field value using an expression or function.
        run(url, sections, *q, **kwargs): Combines HTML sections and runs the extraction strategy.

    Abstract Methods:
        _parse_html(html_content): Parses raw HTML into a structured format (e.g., BeautifulSoup or lxml).
        _get_base_elements(parsed_html, selector): Retrieves base elements using a selector.
        _get_elements(element, selector): Retrieves child elements using a selector.
        _get_element_text(element): Extracts text content from an element.
        _get_element_html(element): Extracts raw HTML from an element.
        _get_element_attribute(element, attribute): Extracts an attribute's value from an element.
    
rJ   c                 `    t        |   di | || _        |j                  dd      | _        y)z
        Initialize the JSON element extraction strategy with a schema.

        Args:
            schema (Dict[str, Any]): The schema defining the extraction rules.
        r   FNr&   )rT   r!   rJ   r   r   r   rJ   r   r   s      r    r!   z&JsonElementExtractionStrategy.__init__  s.     	"6"zz)U3r"   r#   html_contentr$   c                    | j                  |      }| j                  || j                  d         }g }|D ]  }i }	d| j                  v r1| j                  d   D ]  }
| j                  ||
      }|||	|
d   <   ! | j	                  || j                  d         }|	j                  |       |	sw|j                  |	        |S )a  
        Extract structured data from HTML content.

        How it works:
        1. Parses the HTML content using the `_parse_html` method.
        2. Identifies base elements using the schema's base selector.
        3. Extracts fields from each base element using `_extract_item`.

        Args:
            url (str): The URL of the page being processed.
            html_content (str): The raw HTML content to parse and extract.
            *q: Additional positional arguments.
            **kwargs: Additional keyword arguments for custom extraction.

        Returns:
            List[Dict[str, Any]]: A list of extracted items, each represented as a dictionary.
        baseSelector
baseFieldsr   fields)_parse_html_get_base_elementsrJ   _extract_single_field_extract_itemupdater   )r   r#   r-  r(   r   parsed_htmlbase_elementsresultselementitemfieldvalue
field_datas                r    r)   z%JsonElementExtractionStrategy.extract  s    & &&|4//T[[=XY$GDt{{*![[6E 66wFE(.3U6]+ 7 ++GT[[5JKJKK
#t$ %  r"   c                      y)z*Parse HTML content into appropriate formatNr&   r   r-  s     r    r2  z)JsonElementExtractionStrategy._parse_html       	r"   selectorc                      y)z(Get all base elements using the selectorNr&   r   r7  rB  s      r    r3  z0JsonElementExtractionStrategy._get_base_elements  rA  r"   c                      y)z%Get child elements using the selectorNr&   r   r:  rB  s      r    _get_elementsz+JsonElementExtractionStrategy._get_elements  rA  r"   c           	      Z   	 |d   dk(  r7| j                  ||d         }|r|d   nd }|r| j                  ||d         S i S |d   dk(  r8| j                  ||d         }|D cg c]  }| j                  ||d          c}S |d   dk(  r8| j                  ||d         }|D cg c]  }| j                  ||d          c}S | j                  ||      S c c}w c c}w # t        $ rD}| j
                  rt        d|d	    d
t        |              |j                  d      cY d }~S d }~ww xY w)Nr   nestedrB  r   r1  listnested_listzError extracting field r   : default)	rG  r5  _extract_list_itemr4  r   r   rs   r9   r   )r   r:  r<  nested_elementsnested_elementelementselr   s           r    _extract_fieldz,JsonElementExtractionStrategy._extract_field  sN   	(V}("&"4"4WeJ>O"P7F!3DN\t)).%/JdbddV}&--guZ7HIOWXx//E(ODxXXV}---guZ7HIJRS(B**2uX?(SS--gu== Y T  	(||/fbQIJ99Y''	(sR   <C C !C "C>C !C "C>C C 
C 	D*&9D%D*%D*c                    d|v r.| j                  ||d         }|s|j                  d      S |d   }n|}d}|d   dk(  r| j                  |      }n|d   dk(  r| j                  ||d         }na|d   dk(  r| j	                  |      }nG|d   d	k(  r?| j                  |      }t        j                  |d
   |      }|r|j                  d      nd}d|v r| j                  ||d         }||S |j                  d      S )a  
        Extract a single field based on its type.

        How it works:
        1. Selects the target element using the field's selector.
        2. Extracts the field value based on its type (e.g., text, attribute, regex).
        3. Applies transformations if defined in the schema.

        Args:
            element: The base element to extract the field from.
            field (Dict[str, Any]): The field definition in the schema.

        Returns:
            Any: The extracted field value.
        rB  rM  r   Nr   text	attributer   regexpatternr   	transform)	rG  r   _get_element_text_get_element_attribute_get_element_htmlresearchgroup_apply_transform)r   r:  r<  selectedr=  rU  matchs          r    r4  z3JsonElementExtractionStrategy._extract_single_field  s   " ))'53DEHyy++{HH=F"**84E6]k)//%:LME6]f$**84E6]g%))(3DIIeI.5E&+EKKNE%))%{1CDE)uCuyy/CCr"   c                 R    i }|D ]  }| j                  ||      }||||d   <   ! |S )Nr   )r4  r   r:  r1  r;  r<  r=  s         r    rN  z0JsonElementExtractionStrategy._extract_list_itemE  s>    E..w>E &+U6]#  r"   c                     i }|D ]:  }|d   dk(  r| j                  ||      }n| j                  ||      }|3|||d   <   < |S )a   
        Extracts fields from a given element.

        How it works:
        1. Iterates through the fields defined in the schema.
        2. Handles computed, single, and nested field types.
        3. Updates the item dictionary with extracted field values.

        Args:
            element: The base element to extract fields from.
            fields (List[Dict[str, Any]]): The list of fields to extract.

        Returns:
            Dict[str, Any]: A dictionary representing the extracted item.
        r   computedr   )_compute_fieldrS  rd  s         r    r5  z+JsonElementExtractionStrategy._extract_itemM  s^    " EV}
*++D%8++GU; &+U6]#  r"   c                     |dk(  r|j                         S |dk(  r|j                         S |dk(  r|j                         S |S )a  
        Apply a transformation to a value.

        How it works:
        1. Checks the transformation type (e.g., `lowercase`, `strip`).
        2. Applies the transformation to the value.
        3. Returns the transformed value.

        Args:
            value (str): The value to transform.
            transform (str): The type of transformation to apply.

        Returns:
            str: The transformed value.
        	lowercase	uppercasestrip)lowerupperrk  )r   r=  rY  s      r    r`  z.JsonElementExtractionStrategy._apply_transformh  sE    " #;;= +%;;= '!;;= r"   c           	          	 d|v rt        |d   i |      S d|v r |d   |      S y # t        $ rD}| j                  rt        d|d    dt	        |              |j                  d      cY d }~S d }~ww xY w)N
expressionfunctionzError computing field r   rL  rM  )r   r   r   rs   r9   r   )r   r;  r<  r   s       r    rg  z,JsonElementExtractionStrategy._compute_field  s    	(u$E,/T::u$(uZ(.. % 	(||.uV}oRAxHI99Y''	(s   & & 	A39A.(A3.A3r*   c                 `    | j                   j                  |      } | j                  ||fi |S )a4  
        Run the extraction strategy on a combined HTML content.

        How it works:
        1. Combines multiple HTML sections using the `DEL` delimiter.
        2. Calls the `extract` method with the combined HTML.

        Args:
            url (str): The URL of the page being processed.
            sections (List[str]): A list of HTML sections.
            *q: Additional positional arguments.
            **kwargs: Additional keyword arguments for custom extraction.

        Returns:
            List[Dict[str, Any]]: A list of extracted items.
        )r   r   r)   )r   r#   r*   r(   r   combined_htmls         r    r4   z!JsonElementExtractionStrategy.run  s.    $ h/t||C9&99r"   c                      y)zGet text content from elementNr&   r   r:  s     r    rZ  z/JsonElementExtractionStrategy._get_element_text  rA  r"   c                      y)zGet HTML content from elementNr&   rt  s     r    r\  z/JsonElementExtractionStrategy._get_element_html  rA  r"   rV  c                      y)z Get attribute value from elementNr&   r   r:  rV  s      r    r[  z4JsonElementExtractionStrategy._get_element_attribute  rA  r"   )r   r6   r7   r8   r   r   r9   r   r!   r   r)   r   r2  r3  rG  rS  r4  rN  r5  r`  rg  r4   rZ  r\  r[  r   r   s   @r    r)  r)    s;   @ C	4tCH~ 	4'3 'c 'DcSVhDX 'R       s  (*(DT62	(:s :d3i :$tCQTH~BV :* C   C     r"   r)  c                   z     e Zd ZdZdeeef   f fdZdefdZdefdZ	defdZ
d	efd
Zd	efdZdefdZ xZS )JsonCssExtractionStrategya  
    Concrete implementation of `JsonElementExtractionStrategy` using CSS selectors.

    How it works:
    1. Parses HTML content with BeautifulSoup.
    2. Selects elements using CSS selectors defined in the schema.
    3. Extracts field data and applies transformations as defined.

    Attributes:
        schema (Dict[str, Any]): The schema defining the extraction rules.
        verbose (bool): Enables verbose logging for debugging purposes.

    Methods:
        _parse_html(html_content): Parses HTML content into a BeautifulSoup object.
        _get_base_elements(parsed_html, selector): Selects base elements using a CSS selector.
        _get_elements(element, selector): Selects child elements using a CSS selector.
        _get_element_text(element): Extracts text content from a BeautifulSoup element.
        _get_element_html(element): Extracts the raw HTML content of a BeautifulSoup element.
        _get_element_attribute(element, attribute): Retrieves an attribute value from a BeautifulSoup element.
    rJ   c                 0    d|d<   t        |   |fi | y Nr   r   rT   r!   r,  s      r    r!   z"JsonCssExtractionStrategy.__init__      !'~*6*r"   r-  c                     t        |d      S )Nzhtml.parserr   r@  s     r    r2  z%JsonCssExtractionStrategy._parse_html  s    \=99r"   rB  c                 $    |j                  |      S r'  selectrD  s      r    r3  z,JsonCssExtractionStrategy._get_base_elements  s    !!(++r"   c                 $    |j                  |      S r'  r  rF  s      r    rG  z'JsonCssExtractionStrategy._get_elements  s     ~~h''r"   r$   c                 &    |j                  d      S )NT)rk  )get_textrt  s     r    rZ  z+JsonCssExtractionStrategy._get_element_text  s    d++r"   c                     t        |      S r'  )r9   rt  s     r    r\  z+JsonCssExtractionStrategy._get_element_html  s    7|r"   rV  c                 $    |j                  |      S r'  r   rw  s      r    r[  z0JsonCssExtractionStrategy._get_element_attribute      {{9%%r"   )r   r6   r7   r8   r   r9   r   r!   r2  r3  rG  rZ  r\  r[  r   r   s   @r    ry  ry    sd    *+tCH~ +: :, ,(s (
,C ,C & &r"   ry  c                        e Zd ZdZdeeef   f fdZdefdZdefdZ	ded	efd
Z
ded	efdZdefdZd	efdZd	efdZdefdZ xZS )JsonXPathExtractionStrategya  
    Concrete implementation of `JsonElementExtractionStrategy` using XPath selectors.

    How it works:
    1. Parses HTML content into an lxml tree.
    2. Selects elements using XPath expressions.
    3. Converts CSS selectors to XPath when needed.

    Attributes:
        schema (Dict[str, Any]): The schema defining the extraction rules.
        verbose (bool): Enables verbose logging for debugging purposes.

    Methods:
        _parse_html(html_content): Parses HTML content into an lxml tree.
        _get_base_elements(parsed_html, selector): Selects base elements using an XPath selector.
        _css_to_xpath(css_selector): Converts a CSS selector to an XPath expression.
        _get_elements(element, selector): Selects child elements using an XPath selector.
        _get_element_text(element): Extracts text content from an lxml element.
        _get_element_html(element): Extracts the raw HTML content of an lxml element.
        _get_element_attribute(element, attribute): Retrieves an attribute value from an lxml element.
    rJ   c                 0    d|d<   t        |   |fi | y r{  r|  r,  s      r    r!   z$JsonXPathExtractionStrategy.__init__  r}  r"   r-  c                 ,    t        j                  |      S r'  )r   
fromstringr@  s     r    r2  z'JsonXPathExtractionStrategy._parse_html  s    |,,r"   rB  c                 $    |j                  |      S r'  )xpathrD  s      r    r3  z.JsonXPathExtractionStrategy._get_base_elements  s      **r"   css_selectorr$   c                 0    d|v r|S | j                  |      S )z'Convert CSS selector to XPath if needed/)_basic_css_to_xpath)r   r  s     r    _css_to_xpathz)JsonXPathExtractionStrategy._css_to_xpath  s!    ,''55r"   c                     d|v r%|j                  d      }ddj                  |      z   S d|v r%|j                  d      }ddj                  |      z   S d|z   S )z.Basic CSS to XPath conversion for common casesz > z//r  r   )r   r   )r   r  partss      r    r  z/JsonXPathExtractionStrategy._basic_css_to_xpath  sd    L  &&u-E#((5/)), &&s+E$))E***l""r"   c                 r    | j                  |      }|j                  d      sd|z   }|j                  |      S )N.)r  r   r  )r   r:  rB  r  s       r    rG  z)JsonXPathExtractionStrategy._get_elements  s8    ""8,$%KE}}U##r"   c                 ^    dj                  |j                  d            j                         S )N z	.//text())r   r  rk  rt  s     r    rZ  z-JsonXPathExtractionStrategy._get_element_text  s#    www}}[1288::r"   c                 0    t        j                  |d      S )Nunicode)encoding)r   tostringrt  s     r    r\  z-JsonXPathExtractionStrategy._get_element_html  s    ~~g	::r"   rV  c                 $    |j                  |      S r'  r  rw  s      r    r[  z2JsonXPathExtractionStrategy._get_element_attribute  r  r"   )r   r6   r7   r8   r   r9   r   r!   r2  r3  r  r  rG  rZ  r\  r[  r   r   s   @r    r  r    s    ,+tCH~ +- -+ +6# 6# 6# # #$s $;C ;;C ;& &r"   r  )'abcr   r   typingr   r   r   r   r	   concurrent.futuresr
   r   rx   r   promptsconfigutilsmodels	functoolsr   model_loaderr   r   r   r]  bs4r   lxmlr   r   dataclassesr   r   r;   rE   r   r)  ry  r  r&   r"   r    <module>r     s    # 3 3 ?          	   !+! +!Zd- d Mr. MrfwD' wDz	N$6 N`,& = ,&\>&"? >&r"   