
    g#                     V   d dl mZ d dlmZ d dlZd dlZd dlZd dlZddl d dl	Z	d dl
Zd dlmZ ej                  j                  ej                  j!                   ej"                         ej                  j%                  e                  Z e       d        Z e       d        Z e       d	        Zd
 Z e       d        Z e       d        Z e       ddefd       Z e       d        Z e       d        Z e       d        Z e       d        Z ddZ!d Z"e#dk(  r e"        yy)    )	lru_cache)PathN   )*)MODEL_REPO_BRANCHc                     dd l }| j                  dk(  r%|j                  j                  |       j                  S | j                  dk(  ryy)Nr   cudampsl       0 )torchtyper	   get_device_propertiestotal_memory)devicer   s     J/var/www/openai/venv/lib/python3.12/site-packages/crawl4ai/model_loader.pyget_available_memoryr      s>    {{fzz//7DDD		    c                 |    t        |       }| j                  dk(  ry| j                  dv r|dk\  ry|dk\  ry|dk\  ry	y
y)Ncpu   )r	   r
   l           l           l        @       )r   r   )r   available_memorys     r   calculate_batch_sizer      sO    +F3{{e		'~-/.r   c                      dd l } | j                  j                         r| j                  d      }|S | j                  j
                  j                         r| j                  d      }|S | j                  d      }|S )Nr   r	   r
   r   )r   r	   is_availabler   backendsr
   )r   r   s     r   
get_devicer   )   sl    zz f%
 M	 
			(	(	*e$ M e$Mr   c                 @    t               }| j                  |       | |fS )N)r   to)modelr   s     r   set_model_devicer#   4   s    \F	HHV&=r   c                  (   t         j                  j                  t        j                  dt	        j
                               d      } t        j                  | d       t        j                  |  dd       t        j                  |  dd       | S )NCRAWL4_AI_BASE_DIRECTORYz	.crawl4aiT)exist_okz/cachez/models)ospathjoingetenvr   homemakedirs)home_folders    r   get_home_folderr.   9   sg    '',,ryy)CTYY[QS^_KKKd+KK;-v&6KK;-w'$7r   c                      ddl m} m}m}m} | j                  dd       }|j                  dd       }|j                          t        |      \  }}||fS )Nr   BertTokenizer	BertModelAutoTokenizer	AutoModelzbert-base-uncasedresume_downloadtransformersr1   r2   r3   r4   from_pretrainedevalr#   )r1   r2   r3   r4   	tokenizerr"   r   s          r   load_bert_base_uncasedr<   A   sW    OO--.ASW-XI%%&94%PE	JJL$U+ME6er   returnc                     ddl m}m}m}m} |j                  | d      }|j                  | d      }|j                          t        |      \  }}||fS )zLoad the Hugging Face model for embedding.
    
    Args:
        model_name (str, optional): The model name to load. Defaults to "BAAI/bge-small-en-v1.5".
        
    Returns:
        tuple: The tokenizer and model.
    r   r0   Nr5   r7   )
model_namer1   r2   r3   r4   r;   r"   r   s           r   load_HF_embedding_modelr@   J   sW     PO--j$-OI%%j$%GE	JJL$U+ME6er   c                      ddl m} m} ddl m} dd l}| j                  d      }|j                  d      }|j                          t        |      \  }} |d||      }|S )Nr   )r3   "AutoModelForSequenceClassification)pipelinez1dstefa/roberta-base_topic_classification_nyt_newsztext-classification)r"   r;   )r8   r3   rB   rC   r   r9   r:   r#   )r3   rB   rC   r   r;   r"   r   pipes           r   load_text_classifierrE   [   sX    N%--.abI.>>?rsE	JJL$U+ME6))LDKr   c                    	
 ddl m} m} dd l}ddlm dd l
d}|j                  |d       	| j                  |d       j                          t              \  j                  j                  d	
fd	}|fS )Nr   )rB   r3   )expitzcardiffnlp/tweet-topic-21-multir5   c                     | ddd|      }|j                         D ci c]  \  }}||j                         }}}j                         5   di |}d d d        j                  j	                         j                         j                         } |      }||k\  dz  }g }	|D ]9  }
t        |
      D cg c]  \  }}|dk(  s|    }}}|	j                  |       ; |	S c c}}w # 1 sw Y   xY wc c}}w )NptT)return_tensorspadding
truncation
max_lengthr    )	itemsr!   no_gradlogitsdetachr   numpy	enumerateappend)texts	thresholdrM   tokenskeyvaloutputscorespredictionsbatch_labels
predictionivaluelabelsclass_mappingr   rG   r"   r;   r   s                 r   _classifierz4load_text_multilabel_classifier.<locals>._classifier   s    5tPTakl6<llnEn(#s#svvf~%nE]]__V_F  %%'++-335v*a/%J7@7L[7L81ePUYZPZmA&7LF[' &  F_ \s   C*	C0=C<C<0C9)g      ?r   )r8   rB   r3   rS   scipy.specialrG   r   r9   r:   r#   configid2label)rB   r3   npMODELrd   rc   r   rG   r"   r;   r   s        @@@@@@r   load_text_multilabel_classifierrj   h   s~    N# .E--eT-JI.>>uVZ>[E	JJL$U+ME6LL))M $ r   c                      dd l } 	 | j                  j                  d       | j                  j                  d      S # t        $ r | j	                  d       Y 7w xY w)Nr   ztokenizers/punktpunkt)nltkdatafindLookupErrordownload)rm   s    r   load_nltk_punktrr      sN    		)* 99>>,--  gs   < AAc            
      6   dd l } d}t               }t        |      |z  }|j                         rt	        |j                               sd}t        }t        |      dz  }t        d       |j                         r;	 t        j                  |       |j                         rt        j                  |       	 t        j                  dd	d
||t        |      gt        j                  t        j                  d       t        |      dz  }|j                  dd       |dz  dz  }t        j                   ||       t        j                  |       t        d       	 | j'                  t        |            S # t        $ r* t        d       t        d|        t        d|        Y y w xY w# t        j"                  $ r}	t        d|	        Y d }	~	y d }	~	wt$        $ r}	t        d|	        Y d }	~	y d }	~	ww xY w# t$        $ r}	t        d|	        Y d }	~	y d }	~	ww xY w)Nr   models/reutersz)https://github.com/unclecode/crawl4ai.gitcrawl4aiu7   [LOG] ⏬ Downloading Spacy model for the first time...zh[WARNING] Unable to remove existing folders. Please manually delete the following folders and try again:z- gitclonez-bT)stdoutstderrcheckmodels)parentsr&   reutersu-   [LOG] ✅ Spacy Model downloaded successfullyz0An error occurred while cloning the repository: zAn error occurred: zError loading spacy model: )spacyr.   r   existsanyiterdirr   printshutilrmtreePermissionError
subprocessrunstrDEVNULLmkdircopytreeCalledProcessError	Exceptionload)
r~   namer-   model_folderrepo_urlbranchrepo_foldermodels_foldersource_folderes
             r   load_spacy_modelr      s   D!#K$t+L !c,*>*>*@&A>";'*4GH k*&&(MM,/	NNvx[9IJ!))!))	 !-8Mt< ((2Y>MOOM<8 MM+&ABzz#l+,,G #   A  B;-()<.)*	6 ,, 	DQCHI 	's+,	  +A3/0sO   ;:E4 6B#F* G7 40F'&F'*G4=GG4G//G47	H HHc                    | rt        d       t               }t        j                  j	                  |d      t        j                  j	                  |d      g}|D ]1  }t        |      j                         st        j                  |       3 t        d       t        d       t               \  }}t        d|        t        d       t                t        d       y	)
z*Download all models required for Crawl4AI.z![LOG] Removing existing models...rt   r{   z[LOG] Existing models removed.z$[LOG] Downloading text classifier...z [LOG] Text classifier loaded on z,[LOG] Downloading custom NLTK Punkt model...u-   [LOG] ✅ All models downloaded successfully.N)r   r.   r'   r(   r)   r   r   r   r   rj   rr   )remove_existingr-   model_foldersfolder_r   s         r   download_all_modelsr      s    12%'GGLL&67GGLLh/
 $FF|""$f% $ 	./ 

01/1IAv	,VH
56	
89	
9:r   c                      t        d       t        d       t        j                  d      } | j                  ddd       | j	                         }t        |j                  	       y )
Nz/[LOG] Welcome to the Crawl4AI Model Downloader!zE[LOG] This script will download all the models required for Crawl4AI.zCrawl4AI Model Downloader)descriptionz--remove-existing
store_truez)Remove existing models before downloading)actionhelp)r   )r   argparseArgumentParseradd_argument
parse_argsr   r   )parserargss     r   mainr      sX    	
;<	
QR$$1LMF
+LGrsD(<(<=r   __main__)zBAAI/bge-small-en-v1.5)F)$	functoolsr   pathlibr   r   r'   r   tarfilemodel_loaderr   urllib.requesturllibcrawl4ai.configr   r(   realpathr)   getcwddirname__file____location__r   r   r   r#   r.   r<   tupler@   rE   rj   rr   r   r   r   __name__rN   r   r   <module>r      s^           -wwYRYY["''//(:S TU
   $  
     E    
 
 ) )V . . : :x;8> zF r   