
    gQ                         d dl Z d dlZd dlmZ d dlZd dlZd dlmZmZm	Z	 d dl
mZ d dlZd dlZddlmZmZ ddlmZmZ d dlZd dlZddlmZ dd	lmZ dd
lmZ ddlmZmZ  ej<                  ej>                          ej@                  e!      Z"e jF                  jI                   e jJ                  d ejL                               d      xZ'Z( e jR                  e(d       e jF                  jI                  e'd      Z( G d d      Z* e*       Z+y)    N)Path)OptionalTupleDict)asynccontextmanager   )ensure_content_dirsgenerate_content_hash)CrawlResultMarkdownGenerationResult)NEED_MIGRATION)VersionManager)AsyncLogger)get_error_contextcreate_box_message)levelCRAWL4_AI_BASE_DIRECTORY	.crawl4aiTexist_okzcrawl4ai.dbc                       e Zd ZddedefdZd Zd Zed        Zd Z	d Z
d	 Zd
efdZdedee   fdZdefdZdefdZd Zd ZdededefdZdededee   fdZy)AsyncDatabaseManager	pool_sizemax_retriesc                    t         | _        t        t        j                  j                  t                     | _        || _        || _        i | _	        t        j                         | _        t        j                         | _        t        j                  |      | _        d| _        t#               | _        t'        t        j                  j)                  t*        dd      dd      | _        y )NFr   zcrawler_db.log
   )log_fileverbose	tag_width)DB_PATHdb_pathr	   ospathdirnamecontent_pathsr   r   connection_poolasyncioLock	pool_lock	init_lock	Semaphoreconnection_semaphore_initializedr   version_managerr   joinbase_directorylogger)selfr   r   s      L/var/www/openai/venv/lib/python3.12/site-packages/crawl4ai/async_database.py__init__zAsyncDatabaseManager.__init__   s    01IJ"&@B  $+$5$5i$@!!-/!WW\\.+?OP
    c           	        K   	 | j                   j                  dd       t        j                  t        j                  j                  | j                        d       | j                  j                         }| j                          d{    t        j                  | j                  d      4 d{   }|j                  d	      4 d{   }|j                          d{   }|st        d
      ddd      d{    ddd      d{    |r| j                   j                  dd       | j                          d{    ddlm}  |        d{    | j                  j%                          | j                   j'                  dd       y| j                   j'                  dd       y7 07 	7 7 7 # 1 d{  7  sw Y   xY w7 # 1 d{  7  sw Y   xY w7 7 # t        $ rL}| j                   j)                  dddt+        |      i       | j                   j                  dd        d}~ww xY ww)z+Initialize the database and connection poolzInitializing databaseINIT)tagTr   N      >@timeoutzISELECT name FROM sqlite_master WHERE type='table' AND name='crawled_data'z"crawled_data table was not createdz%New version detected, running updatesr   )run_migrationz%Version update completed successfullyCOMPLETEz.Database initialization completed successfullyz&Database initialization error: {error}ERRORerrormessager8   paramsz)Database will be initialized on first userA   r8   )r1   infor"   makedirsr#   r$   r!   r.   needs_updateainit_db	aiosqliteconnectexecutefetchone	Exceptionupdate_db_schema
migrationsr<   update_versionsuccessr?   str)r2   rF   dbcursorresultr<   es          r3   
initializezAsyncDatabaseManager.initialize,   s    +	KK4&AKK5E  //<<>L --/!! !((tDD::_ #)??#44F!'(LMM  ED   !Hf U++---5#o%%$$335##$KQ[#\##$TZd#e) " E 5    EDDD .%  	KK@Q(  
 KKC  
 	s  IBG8 F<)G8 6F?7G8 :GGGG(G)G:GGG
G8 G6G8 G4G8 !G6";G8 IG8 ;I<G8 ?G8 GGGG	GG	GG8 G1%G(&G1-G8 6G8 8	IAIIIc                 .  K   | j                   4 d{    | j                  j                         D ]  }|j                          d{     | j                  j	                          ddd      d{    y7 f7 37 	# 1 d{  7  sw Y   yxY ww)z&Cleanup connections when shutting downN)r)   r&   valuescloseclear)r2   conns     r3   cleanupzAsyncDatabaseManager.cleanup\   sc     >>>,,335jjl"" 6  &&( ">>" ">>>sV   BA:B1B A<	 B )B4A>5B<B >B BB	BBc                	  K   | j                   sQ| j                  4 d{    | j                   s 	 | j                          d{    d| _         ddd      d{    | j                  j                          d{    t        t        j                               }	 | j                  4 d{    || j                   vr	 t#        j$                  | j&                  d
       d{   }|j)                  d       d{    |j)                  d       d{    |j)                  d      4 d{   }|j+                          d{   }|D cg c]  }|d   	 }	}h d}
|
t-        |	      z
  }|rt/        d|       ddd      d{    || j                   |<   ddd      d{    | j                   |    	 | j                  4 d{    || j                   v r2| j                   |   j3                          d{    | j                   |= ddd      d{    | j                  j5                          y7 (7 # t        $ rV}ddl}t         |j                               }| j                  j                  dddt        |      |d   |d   d	        d}~ww xY w7 T# 1 d{  7  sw Y   exY w7 K7 7 7 7 7 7 c c}w 7 R# 1 d{  7  sw Y   cxY w# t        $ rq}ddl}t         |j                               }d|d    d|d    d|d    dt        |       d|d    
}| j                  j                  t1        |d              d}~ww xY w7 # 1 d{  7  sw Y   xY w# t        $ rq}ddl}t         |j                               }d|d    d|d    d|d    dt        |       d|d    
}| j                  j                  t1        |d              d}~ww xY w7 77 7 # 1 d{  7  sw Y   xY w# | j                  4 d{  7   || j                   v r3| j                   |   j3                          d{  7   | j                   |= ddd      d{  7   n# 1 d{  7  sw Y   nxY w| j                  j5                          w xY ww)z4Connection pool manager with enhanced error handlingNTr   zSDatabase initialization failed:
{error}

Context:
{context}

Traceback:
{traceback}r>   code_contextfull_traceback)r?   context	tracebackrA   r8   force_verboserB   r9   r:   zPRAGMA journal_mode = WALzPRAGMA busy_timeout = 5000PRAGMA table_info(crawled_data)r   >   urlhtmllinksmediarP   markdownmetadata
screenshotcleaned_htmldownloaded_filesresponse_headersextracted_contentzDatabase missing columns: z.Unexpected error in db get_connection at line line_noz in functionz (filenamez
):
Error: z

Code context:
r?   )type)rA   )r-   r*   rV   rL   sysr   exc_infor1   r?   rQ   r,   acquireidr'   current_taskr)   r&   rH   rI   r!   rJ   fetchallset
ValueErrorr   rY   release)r2   rU   rt   error_contexttask_idr[   rS   columnscolcolumn_namesexpected_columnsmissing_columnserror_messages                r3   get_connectionz#AsyncDatabaseManager.get_connectionc   sh       ~~~(("oo///,0)	 &~( ''//111W))+,<	0~~~$"6"66#%.%6%6 LL$(&   #ll+FGGG"ll+GHHH $(<<0Q#R#RV\,2OO,=&=G>E+FgsCFgL+F0,
 /?\AR.RO.&03MoM^1_&` ` $S#R 9=,,W5/ &~N &&w//  ~~~d222..w7==???,,W5 &~ %%--/g & 0$ "(9,#,,.(I)) %A '*.),Q+8+H-:;K-L$	 * 	  &~~~( 	2 &  HH $S&=+F $S#R#R#R % "(9,#,,.(IL][dMeLf g""/
";!<B}Z?X>Y Z&&)!fX ...;N.K-LN & ))$6}G$T *  1 &~~~R  	-lclln=M@yAY@Z [#J/0=3L2M Na& """/"?!@B  KK*=H   	 &? &~~~t~~~d222..w7==???,,W5 &~~~~ %%--/s\  S
IS
J2II
IS
J/!S
=K>!S
 N 0K1N 4M;$K;(K)K;KK;KK;4K5K;8K%KK%K!$K%K;K"K;$N /M80N S
PS
/P
PPS
'P( S

I	J,AJ''J,,J2/S
2K8J;9K 	S
N K;K;K;K;K%K%"K;%K8	+K.,K8	3K;;	M5A,M00M55M;8N ;NNN	N 	PA,PPP- S
PS
P*P P*%S
-S>Q
?S/R2Q5
3RSRSR)R R)%"SS
c                 .  K   t        | j                        D ]Y  }	 | j                         4 d{   } ||g|  d{   }|j                          d{    |cddd      d{    c S  y7 D7 57 7 # 1 d{  7  sw Y   nxY w{# t        $ rv}|| j                  dz
  k(  r6| j
                  j                  ddd| j                  t        |      d        t        j                  d|dz   z         d{  7   Y d}~d}~ww xY ww)z,Execute database operations with retry logicNr   z2Operation failed after {retries} attempts: {error}r>   T)retriesr?   rb   )
ranger   r   commitrL   r1   r?   rQ   r'   sleep)r2   	operationargsattemptrR   rT   rU   s          r3   execute_with_retryz'AsyncDatabaseManager.execute_with_retry   s     T--.G7..00B#,R#7$#77F))+%%! 100 /07% 10000  7d..22KK%% T#&*'+'7'7%(V 	 &  mmA1$56667s   DBA5BA=A7A=A9A=B+A;,B0D5B7A=9A=;B=B	BB	BD	DA&DDDDDDc                   K   t        j                  | j                  d      4 d{   }|j                  d       d{    |j	                          d{    ddd      d{    y7 F7 /7 7 # 1 d{  7  sw Y   yxY ww)zInitialize database schemar9   r:   Na{  
                CREATE TABLE IF NOT EXISTS crawled_data (
                    url TEXT PRIMARY KEY,
                    html TEXT,
                    cleaned_html TEXT,
                    markdown TEXT,
                    extracted_content TEXT,
                    success BOOLEAN,
                    media TEXT DEFAULT "{}",
                    links TEXT DEFAULT "{}",
                    metadata TEXT DEFAULT "{}",
                    screenshot TEXT DEFAULT "",
                    response_headers TEXT DEFAULT "{}",
                    downloaded_files TEXT DEFAULT "{}"  -- New column added
                )
            )rH   rI   r!   rJ   r   )r2   rR   s     r3   rG   zAsyncDatabaseManager.ainit_db   st     $$T\\4@@B**      ))+# A@@  # A@@@sh   &BA/BA7A1A7A3A7B)A5*B1A73A75B7B	=B >B	Bc                   K   t        j                  | j                  d      4 d{   }|j                  d       d{   }|j	                          d{   }|D cg c]  }|d   	 }}g d}|D ]!  }||vs| j                  ||       d{    # |j                          d{    ddd      d{    y7 7 7 mc c}w 7 :7 "7 # 1 d{  7  sw Y   yxY ww)z Update database schema if neededr9   r:   Nrd   r   )rh   rg   rj   rk   rn   rm   )rH   rI   r!   rJ   ry   aalter_db_add_columnr   )r2   rR   rS   r   columnr   new_columnss          r3   rM   z%AsyncDatabaseManager.update_db_schema   s     $$T\\4@@B::&GHHF"OO--G4;<G&F1IGL< oK%-33FB??? & ))+ A@@H-< @ A@@@s   &C)CC)CCCCC"C	.C>CCC-C.C2C)=C>C)CC	CCC)C&CC&"C)
new_columnc                    K   |dk(  r|j                  d| d       d{    n|j                  d| d       d{    | j                  j                  ddd|i	       y7 C7 'w)
zAdd new column to the databasern   z$ALTER TABLE crawled_data ADD COLUMN z TEXT DEFAULT "{}"Nz TEXT DEFAULT ""z'Added column '{column}' to the databaser7   r   r@   )rJ   r1   rD   )r2   r   rR   s      r3   r   z)AsyncDatabaseManager.aalter_db_add_column   sx     ++**CJ<Ocdeee**CJ<O_`aaa=j) 	 	
 fas   A(A$A(A&&A(&A(re   returnc           
          K    fd}	  j                  |       d{   S 7 # t        $ r4} j                  j                  ddddt	        |      i       Y d}~yd}~ww xY ww)z'Retrieve cached URL data as CrawlResultc                   K   | j                  df      4 d {   }|j                          d {   }|s	 d d d       d {    y |j                  D cg c]  }|d   	 }}t        t	        ||            }|d   |d   |d   |d   |d   |d   d}|j                         D ]B  \  }}|r6j                  ||j                  d	      d          d {   }	|	xs d
||<   >d
||<   D g d}
|
D ]%  }	 ||   rt        j                  ||         ni ||<   ' t        |d   t              r'|d   |d<   |d   j                  d      r|d   d   |d<   	 |d   rt        j                  |d         ng |d<   t        j                  j!                         }|j                         D ci c]  \  }}||v s|| }}}t        di |cd d d       d {    S 7 7 7 c c}w 7 # t        j                  $ r	 i ||<   Y w xY w# t        j                  $ r g |d<   Y w xY wc c}}w 7 Y# 1 d {  7  sw Y   y xY ww)Nz(SELECT * FROM crawled_data WHERE url = ?r   rf   rl   ri   ro   rk   )rf   rl   ri   ro   rk   screenshots_ )rh   rg   rj   rn   ri   markdown_v2raw_markdownrm    )rJ   rK   descriptiondictzipitems_load_contentsplitjsonloadsJSONDecodeError
isinstancer   getr   __annotations__keys)rR   rS   rowr   r   row_dictcontent_fieldsfield
hash_valuecontentjson_fieldsvalid_fieldskvfiltered_dictr2   re   s                  r3   _getz2AsyncDatabaseManager.aget_cached_url.<locals>._get  s    zz:SF "OO--   >D=O=OP=Ok;q>=OPGS 12 %V,$,^$< ( 4)12E)F"*<"8#+L#9" *8)=)=)?%E:!(,(:(:&!KK,Q/) # +2-R*, *@ ](E-IQRW$**Xe_*E^` ) hz2D9.6z.BH]+
+//?/7
/CN/S,6_ghz_{4::hGY>Z3[  BDH/0
  +::??A2:..2B X2B$!Qa<FWA2B X"3]3o   . Q #  // -*,- ++ 635H/06
 !Yk   s   IGIH0GH0IG IH0G#&A1H0G(H07"G+<H0"H
81H0)H(6H(;H0IH.IH0 I#H0+HH0HH0
H%"H0$H%%	H0.I0I6H97I>INz$Error retrieving cached URL: {error}r>   Tr?   rb   r   rL   r1   r?   rQ   )r2   re   r   rU   s   ``  r3   aget_cached_urlz$AsyncDatabaseManager.aget_cached_url  si     8	4t		006666 	KK>"Q(	   	s6   A'' %' A'' 	A$*AA'A$$A'rT   c           
         	K   j                   dfj                  xs ddfdj                  xs ddfj                  xs ddfd}	 t	        j
                  t              r j
                  j                         df|d<   nt        d	      r j                  j                         df|d<   nct	        j
                  t              r,t        j
                  
      }|j                         df|d<   nt               j                         df|d<   i 	|j                         D ]%  \  }\  }}| j                  ||       d{   	|<   ' 	fd}	 | j!                  |       d{    y# t        $ rP}| j                  j                  dt        |       d       t               j                         df|d<   Y d}~d}~ww xY w7 7 c# t        $ r4}| j                  j#                  ddddt        |      i       Y d}~yd}~ww xY ww)zCache CrawlResult datarf   r   cleanedN	extractedr   )rf   rl   ri   ro   rk   ri   r   )r   z#Error processing markdown content: WARNINGrC   c                   K   | j                  dj                  d   d   d   d   j                  t        j                  j
                        t        j                  j                        t        j                  j                  xs i       d   t        j                  j                  xs i       t        j                  j                  xs g       f       d {    y 7 w)Na  
                INSERT INTO crawled_data (
                    url, html, cleaned_html, markdown,
                    extracted_content, success, media, links, metadata,
                    screenshot, response_headers, downloaded_files
                )
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                ON CONFLICT(url) DO UPDATE SET
                    html = excluded.html,
                    cleaned_html = excluded.cleaned_html,
                    markdown = excluded.markdown,
                    extracted_content = excluded.extracted_content,
                    success = excluded.success,
                    media = excluded.media,
                    links = excluded.links,
                    metadata = excluded.metadata,
                    screenshot = excluded.screenshot,
                    response_headers = excluded.response_headers,
                    downloaded_files = excluded.downloaded_files
            rf   rl   ri   ro   rk   )
rJ   re   rP   r   dumpsrh   rg   rj   rn   rm   )rR   content_hashesrT   s    r3   _cachez/AsyncDatabaseManager.acache_url.<locals>._cachen  s     ** ( 

v&~.z*23

6<<(

6<<(

6??0b1|,

6228b9

6228b9'     s   C"C-%C+&C-zError caching URL: {error}r>   Tr?   rb   )rf   rl   ro   rk   r   ri   r   model_dump_jsonhasattrr   rQ   rL   r1   warningr   _store_contentr   r?   )
r2   rT   content_mapmarkdown_resultrU   r   r   content_typer   r   s
    `       @r3   
acache_urlzAsyncDatabaseManager.acache_urlM  s	     [[&)#006B	B"(":":"@b+!N!,,2MB
	a&//+CD+1??+J+J+Lj*YJ'/+1+=+=+M+M+OQ[*\J'FOOS1":"X+:+J+J+Lj*YJ'+C+E+U+U+WYc*dJ' .9.?.?.A*E*G\*.*=*=g|*T$TN5! /B!	F	))&111a  	aKK=c!fXF   
 (@'A'Q'Q'SU_&`K
#	a %UL 2 	KK4"Q(	   	sz   AH
C	E. 0HG
HG (G)G -H.	G7AG=HGHG 	H*HHHHc           
         K   d }	 | j                  |       d{   S 7 # t        $ r4}| j                  j                  ddddt	        |      i       Y d}~yd}~ww xY ww)	zGet total number of cached URLsc                    K   | j                  d      4 d {   }|j                          d {   }|r|d   ndcd d d       d {    S 7 67  7 	# 1 d {  7  sw Y   y xY ww)Nz!SELECT COUNT(*) FROM crawled_datar   )rJ   rK   )rR   rS   rT   s      r3   _countz5AsyncDatabaseManager.aget_total_count.<locals>._count  sS     zz"EFF&%00$*vay GFF0 GFFFsS   A*AA*AAAA*	A
A*AA*A'AA'#A*Nz"Error getting total count: {error}r>   Tr?   rb   r   r   )r2   r   rU   s      r3   aget_total_countz%AsyncDatabaseManager.aget_total_count  sg     	2
		008888 	KK<"Q(	   	s6   A""  " A"" 	A*AA"AA"c           
         K   d }	 | j                  |       d{    y7 # t        $ r4}| j                  j                  ddddt	        |      i       Y d}~yd}~ww xY ww)z Clear all data from the databasec                 B   K   | j                  d       d {    y 7 w)NzDELETE FROM crawled_datarJ   rR   s    r3   _clearz.AsyncDatabaseManager.aclear_db.<locals>._clear  s     **7888   Nz Error clearing database: {error}r>   Tr?   rb   r   )r2   r   rU   s      r3   	aclear_dbzAsyncDatabaseManager.aclear_db  sd     	9	))&111 	KK:"Q(	   	6   A## !# A## 	A *AA#A  A#c           
         K   d }	 | j                  |       d{    y7 # t        $ r4}| j                  j                  ddddt	        |      i       Y d}~yd}~ww xY ww)zDrop the entire tablec                 B   K   | j                  d       d {    y 7 w)Nz!DROP TABLE IF EXISTS crawled_datar   r   s    r3   _flushz.AsyncDatabaseManager.aflush_db.<locals>._flush  s     **@AAAr   Nz Error flushing database: {error}r>   Tr?   rb   r   )r2   r   rU   s      r3   	aflush_dbzAsyncDatabaseManager.aflush_db  se     	B	))&111 	KK:"Q(	   	r   r   r   c                   K   |syt        |      }t        j                  j                  | j                  |   |      }t        j                  j                  |      sLt        j                  |dd      4 d{   }|j                  |       d{    ddd      d{    |S |S 7 17 7 # 1 d{  7  sw Y   |S xY ww)z+Store content in filesystem and return hashr   wutf-8encodingN)	r
   r"   r#   r/   r%   existsaiofilesopenwrite)r2   r   r   content_hash	file_pathfs         r3   r   z#AsyncDatabaseManager._store_content  s     ,W5GGLL!3!3L!A<P	 ww~~i(}}YgFF!ggg&&& GF | G& GFFF sZ   A7C9B+:C=B1B-B1C"B/#	C-B1/C1C7B:8C?Cr   c                   K   |syt         j                  j                  | j                  |   |      }	 t	        j
                  |dd      4 d{   }|j                          d{   cddd      d{    S 7 -7 7 	# 1 d{  7  sw Y   yxY w#  | j                  j                  dddd|i	       Y yxY ww)
z$Load content from filesystem by hashNrr   r   z#Failed to load content: {file_path}r>   Tr   rb   )	r"   r#   r/   r%   r   r   readr1   r?   )r2   r   r   r   r   s        r3   r   z"AsyncDatabaseManager._load_content  s     GGLL!3!3L!A<P	
	}}YgFF!VVX~ GFF% GFFF	KK="#Y/	   s   1CB A>B B(B )B,B 8B9B =C>B  BB B
BBB CB #B><CN)r      )__name__
__module____qualname__intr4   rV   r\   r   r   r   rG   rM   rQ   r   r   r   r   r   r   r   r   r   r   r   r5   r3   r   r      s    
# 
 
$-`) V0 V0r7,.

S 

E E+1F ENL{ L^ $ C s s  3 8TW= r5   r   ),r"   rt   pathlibr   rH   r'   typingr   r   r   
contextlibr   loggingr   utilsr	   r
   modelsr   r   xxhashr   configr   r.   r   async_loggerr   r   r   basicConfigINFO	getLoggerr   r1   r#   r/   getenvhomer0   r    rE   r   async_db_managerr   r5   r3   <module>r      s        ( ( *   = 9   " + % 8   ',, '			8	$77<<			2Lidiik(Z\gh h Gd #
'',,~}
5S Sl () r5   