
    g$                         d dl mZmZ d dlmZmZmZmZ ddlm	Z	 ddl
mZ ddlmZmZ d dlZd dlmZ  ej$                  d	      Zd
ededefdZ G d de      Z G d de      Zy)    )ABCabstractmethod)OptionalDictAnyTuple   )MarkdownGenerationResult)CustomHTML2Text)RelevantContentFilterBM25ContentFilterN)urljoinz+!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)baseurlreturnc                     |j                  d      r|S |j                  d      r| j                  d      r| dd |z   S | |z   S t        | |      S )z"Fast URL joining for common cases.)http://https://mailto:z///N)
startswithendswithr   )r   r   s     Z/var/www/openai/venv/lib/python3.12/site-packages/crawl4ai/markdown_generation_strategy.pyfast_urljoinr      sS    
~~>?

~~c==9s?"cz4    c                       e Zd ZdZddee   deeeef      fdZ	e
	 	 	 	 ddededeeeef      dee   d	ed
efd       Zy)MarkdownGenerationStrategyz7Abstract base class for markdown generation strategies.Ncontent_filteroptionsc                 (    || _         |xs i | _        y N)r   r    )selfr   r    s      r   __init__z#MarkdownGenerationStrategy.__init__   s    ,}"r   cleaned_htmlbase_urlhtml2text_options	citationsr   c                      y)z$Generate markdown from cleaned HTML.N )r#   r%   r&   r'   r   r(   kwargss          r   generate_markdownz,MarkdownGenerationStrategy.generate_markdown   s     	r   NN) NNT)__name__
__module____qualname____doc__r   r   r   strr   r$   r   boolr
   r,   r*   r   r   r   r      s    A%x0E'F %X`aefiknfnaoXp %  *,GKKO+/'*#& -5T#s(^,D *22G)H	
 %) '? r   r   c                        e Zd ZdZddee   deeeef      f fdZ	ddedede
eef   fdZ	 	 	 	 	 dd	eded
eeeef      deeeef      dee   dedefdZ xZS )DefaultMarkdownGeneratoraw  
    Default implementation of markdown generation strategy.
    
    How it works:
    1. Generate raw markdown from cleaned HTML.
    2. Convert links to citations.
    3. Generate fit markdown if content filter is provided.
    4. Return MarkdownGenerationResult.
    
    Args:
        content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown.
        options (Optional[Dict[str, Any]]): Additional options for markdown generation. Defaults to None.
        
    Returns:
        MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown.
    r   r    c                 &    t         |   ||       y r"   )superr$   )r#   r   r    	__class__s      r   r$   z!DefaultMarkdownGenerator.__init__9   s    1r   markdownr&   r   c                 @   i }i }g }d}d}t         j                  |      D ]  }|j                  |||j                                 |j	                         \  }	}
}|r)|
j                  d      s|
|vrt        ||
      ||
<   ||
   }
|
|vrOg }|r|j                  |       |	r|	|k7  r|j                  |	       ||rddj                  |      z   ndf||
<   |dz  }||
   d   }|j                  |j                  d      j                  d      s|	 d| d	nd
|	 d| d       |j                         } |j                  ||d        dj                  |      }dg}|j                  d t        |j                         d       D               |dj                  |      fS )a"  
        Convert links in markdown to citations.
        
        How it works:
        1. Find all links in the markdown.
        2. Convert links to citations.
        3. Return converted markdown and references markdown.
        
        Note:
        This function uses a regex pattern to find links in markdown.
        
        Args:
            markdown (str): Markdown text.
            base_url (str): Base URL for URL joins.
            
        Returns:
            Tuple[str, str]: Converted markdown and references markdown.
        r   r	   )r   r   r   z: z - r.   !   ⟨u   ⟩z![u   ⟩]Nz

## References

c              3   >   K   | ]  \  }\  }}d | d| | d  yw)r=   u   ⟩ 
Nr*   ).0r   numdescs       r   	<genexpr>zFDefaultMarkdownGenerator.convert_links_to_citations.<locals>.<genexpr>o   s4      
$S [c4 #d3%vR($S   c                     | d   d   S )Nr	   r   r*   )xs    r   <lambda>zEDefaultMarkdownGenerator.convert_links_to_citations.<locals>.<lambda>q   s    1Q4PQ7r   )key)LINK_PATTERNfinditerappendstartgroupsr   r   joingroupendextendsorteditems)r#   r:   r&   link_map	url_cachepartslast_endcountermatchtextr   titlerB   rA   converted_text
referencess                   r   convert_links_to_citationsz3DefaultMarkdownGenerator.convert_links_to_citations<   s   & 	!**84ELL(5;;=9:$||~D#u /Q Ri'%1(C%@IcNn("$++e,DEM4;;t+<!(T$D1A*Ar R13-"CLLU[[^5N5Ns5SD6SE-[]^b]ccfgjfkkoYpqyy{H' 5* 	Xhi() ..
 
$*8>>+;AR$S
 	

 rwwz222r   r%   r'   r(   c           	         	 t        |      }ddddddddd}	|r|	j                  |       n;|r|	j                  |       n'| j                  r|	j                  | j                          |j                  di |	 |sd}nt	        |t
              st        |      }	 |j                  |      }
|
j                  d	d
      }
|
}d}|r	 | j                  |
|      \  }}d}d}|s| j                  rK	 |xs | j                  }|j                  |      }dj                  d |D              }|j                  |      }t        |
xs d|xs d|xs d|xs d|xs d      S # t        $ r}dt        |       }
Y d}~d}~ww xY w# t        $ r}|
}dt        |       }Y d}~d}~ww xY w# t        $ r}dt        |       }d}Y d}~d}~ww xY w# t        $ r(}dt        |       }t        ||ddd      cY d}~S d}~ww xY w)a  
        Generate markdown with citations from cleaned HTML.
        
        How it works:
        1. Generate raw markdown from cleaned HTML.
        2. Convert links to citations.
        3. Generate fit markdown if content filter is provided.
        4. Return MarkdownGenerationResult.
        
        Args:
            cleaned_html (str): Cleaned HTML content.
            base_url (str): Base URL for URL joins.
            html2text_options (Optional[Dict[str, Any]]): HTML2Text options.
            options (Optional[Dict[str, Any]]): Additional options for markdown generation.
            content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown.
            citations (bool): Whether to generate citations.
            
        Returns:
            MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown.
        )baseurlr   FT)
body_widthignore_emphasisignore_linksignore_imagesprotect_linkssingle_line_break	mark_codeescape_snobr.   z#Error converting HTML to markdown: Nz    ```z```zError generating citations: r?   c              3   >   K   | ]  }d j                  |        yw)z<div>{}</div>N)format)r@   ss     r   rC   z=DefaultMarkdownGenerator.generate_markdown.<locals>.<genexpr>   s     -_Q^Ao.D.DQ.GQ^rD   zError generating fit markdown: )raw_markdownmarkdown_with_citationsreferences_markdownfit_markdownfit_htmlzError in markdown generation: r*   )r   updater    update_params
isinstancer3   handle	Exceptionreplacer^   r   filter_contentrN   r
   )r#   r%   r&   r'   r    r   r(   r+   hdefault_optionsrl   erm   rn   ro   filtered_html	error_msgs                    r   r,   z*DefaultMarkdownGenerator.generate_markdownv   sA   8O	1A#( %!&!%%)!$	O !&&'89&&w/&&t||4AOO.o.  !c2"<0N xx5 (//	5AL ,8#')RCGCbCb$hD@+-@ +-L+-M!4!4'%3%Jt7J7JN$2$A$A,$OM$(II-_Q^-_$_M#$88M#:L
 ,)/R(?(E2$7$=2)/R&," =  N!DSVHMN ! R.:+,HQ*Q'R ! '%DSVH#ML$&M'  		8QAI+&(1$& 		s   BG E ,G E? G -A
F% 7#G 	E<$E72G 7E<<G ?	F"FG F""G %	G.G>G GG 	G<G71G<7G<r-   )r.   )r.   NNNT)r/   r0   r1   r2   r   r   r   r3   r   r$   r   r^   r4   r
   r,   __classcell__)r9   s   @r   r6   r6   (   s     2x0E'F 2X`aefiknfnaoXp 2833 83# 83uUXZ]U] 83x *,GK=AKO+/k'*k#&k -5T#s(^,Dk #+4S>":	k
 *22G)Hk %)k '?kr   r6   )abcr   r   typingr   r   r   r   modelsr
   	html2textr   content_filter_strategyr   r   reurllib.parser   compilerI   r3   r   r   r6   r*   r   r   <module>r      se    # - - , & M 	   rzzHI	s 	 	 	 "y9 yr   