from typing import Dict, Iterator, List, Optional, Tuple, Union

from tokenizers import AddedToken, Tokenizer, decoders, pre_tokenizers, trainers
from tokenizers.models import BPE
from tokenizers.normalizers import NFKC

from .base_tokenizer import BaseTokenizer


class SentencePieceBPETokenizer(BaseTokenizer):
    """SentencePiece BPE Tokenizer

    Represents the BPE algorithm, with the pretokenization used by SentencePiece
    """

    def __init__(
        self,
        vocab: Optional[Union[str, Dict[str, int]]] = None,
        merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
        unk_token: Union[str, AddedToken] = "<unk>",
        replacement: str = "▁",
        add_prefix_space: bool = True,
        dropout: Optional[float] = None,
        fuse_unk: Optional[bool] = False,
    ):
        # Build the underlying BPE model, from an existing vocab/merges pair
        # when both are provided, otherwise empty (to be trained later).
        if vocab is not None and merges is not None:
            tokenizer = Tokenizer(BPE(vocab, merges, dropout=dropout, unk_token=unk_token, fuse_unk=fuse_unk))
        else:
            tokenizer = Tokenizer(BPE(dropout=dropout, unk_token=unk_token, fuse_unk=fuse_unk))

        # If the unknown token is already in the vocabulary, register it as a
        # special token so it is never split.
        if tokenizer.token_to_id(str(unk_token)) is not None:
            tokenizer.add_special_tokens([str(unk_token)])

        # SentencePiece-style pipeline: NFKC normalization, then Metaspace
        # pre-tokenization (spaces become the replacement character, "▁" by
        # default), mirrored by the Metaspace decoder.
        tokenizer.normalizer = NFKC()
        prepend_scheme = "always" if add_prefix_space else "never"
        tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
        tokenizer.decoder = decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)

        parameters = {
            "model": "SentencePieceBPE",
            "unk_token": unk_token,
            "replacement": replacement,
            "add_prefix_space": add_prefix_space,
            "dropout": dropout,
        }

        super().__init__(tokenizer, parameters)

    @staticmethod
    def from_file(vocab_filename: str, merges_filename: str, **kwargs):
        vocab, merges = BPE.read_file(vocab_filename, merges_filename)
        return SentencePieceBPETokenizer(vocab, merges, **kwargs)

    def train(
        self,
        files: Union[str, List[str]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        show_progress: bool = True,
    ):
        """Train the model using the given files"""

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=special_tokens,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            show_progress=show_progress,
        )
        if isinstance(files, str):
            files = [files]
        self._tokenizer.train(files, trainer=trainer)

    def train_from_iterator(
        self,
        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        show_progress: bool = True,
        length: Optional[int] = None,
    ):
        """Train the model using the given iterator"""

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=special_tokens,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            show_progress=show_progress,
        )
        self._tokenizer.train_from_iterator(
            iterator,
            trainer=trainer,
            length=length,
        )
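
# ---------------------------------------------------------------------------
# Usage sketch (illustrative; not part of the library source). It shows one
# way to train this tokenizer from an in-memory corpus and encode a sentence;
# the corpus lines and vocab_size are assumptions for the example only.
# Because this module uses a relative import, run it as a module, e.g.:
#   python -m tokenizers.implementations.sentencepiece_bpe
if __name__ == "__main__":
    corpus = [
        "Hello world",
        "SentencePiece-style BPE marks word starts with the ▁ character",
    ]
    tokenizer = SentencePieceBPETokenizer()
    tokenizer.train_from_iterator(corpus, vocab_size=200, special_tokens=["<unk>"])

    encoding = tokenizer.encode("Hello world")
    # With the defaults (add_prefix_space=True, Metaspace pre-tokenization),
    # every word in the output begins with the replacement character "▁".
    print(encoding.tokens)

    # A vocab/merges pair saved on disk could be reloaded with (hypothetical
    # file names):
    #   SentencePieceBPETokenizer.from_file("vocab.json", "merges.txt")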