
    8g                         d dl mZmZmZmZmZmZ d dlmZm	Z	m
Z
mZmZmZ d dlmZ d dlmZmZmZ ddlmZ  G d de      Zy	)
    )DictIteratorListOptionalTupleUnion)
AddedToken	Tokenizerdecoderspre_tokenizers
processorstrainers)BPE)	LowercaseSequenceunicode_normalizer_from_str   )BaseTokenizerc                   z    e Zd ZdZ	 	 	 	 	 	 	 	 	 ddeeeeeef   f      deeeee	eef   e	eef   f   f      de
de
dee   dee   d	ee   d
ee   de
f fdZededefd       Zdddg fdeeee   f   dedede
deeeef      f
dZdddg dfdeee   eee      f   dedede
deeeef      dee   fdZ xZS )ByteLevelBPETokenizerzjByteLevelBPETokenizer

    Represents a Byte-level BPE as introduced by OpenAI with their GPT-2 model
    Nvocabmergesadd_prefix_space	lowercasedropoutunicode_normalizercontinuing_subword_prefixend_of_word_suffixtrim_offsetsc
           
          |$|"t        t        ||||xs d|xs d            }
nt        t                     }
g }|r|t        |      gz  }|r|t               gz  }t	        |      dkD  r)t	        |      dkD  rt        |      |
_        n
|d   |
_        t        j                  |      |
_	        t        j                         |
_        t        j                  |	      |
_        d|||||||	d}t        | =  |
|       y )	N )r   r   r   r   r   )r   )r   ByteLevelBPE)modelr   r   r   r   r   r   r   )r
   r   r   r   lenr   
normalizerr   	ByteLevelpre_tokenizerr   decoderr   post_processorsuper__init__)selfr   r   r   r   r   r   r   r   r   	tokenizernormalizers
parameters	__class__s                ^/var/www/openai/venv/lib/python3.12/site-packages/tokenizers/implementations/byte_level_bpe.pyr+   zByteLevelBPETokenizer.__init__   s    !3!#.G.M2'9'?RI "#%(I 78JKLLKIK=(K {a;!#'/'<	$'21~	$"0":":L\"]	$..0	#-#7#7\#R	  $ 0""4)B"4(	

 	J/    vocab_filenamemerges_filenamec                 N    t        j                  | |      \  }}t        ||fi |S )N)r   	read_filer   )r3   r4   kwargsr   r   s        r1   	from_filezByteLevelBPETokenizer.from_fileJ   s(    noFv$UF=f==r2   i0u     Tfiles
vocab_sizemin_frequencyshow_progressspecial_tokensc                     t        j                  ||||t        j                  j	                               }t        |t              r|g}| j                  j                  ||       y)z%Train the model using the given filesr;   r<   r=   r>   initial_alphabet)trainerN)	r   
BpeTrainerr   r&   alphabet
isinstancestr
_tokenizertrain)r,   r:   r;   r<   r=   r>   rB   s          r1   rH   zByteLevelBPETokenizer.trainO   s\     %%!'')+55>>@
 eS!GEeW5r2   iteratorlengthc                     t        j                  ||||t        j                  j	                               }| j
                  j                  |||       y)z(Train the model using the given iteratorr@   )rB   rJ   N)r   rC   r   r&   rD   rG   train_from_iterator)r,   rI   r;   r<   r=   r>   rJ   rB   s           r1   rL   z)ByteLevelBPETokenizer.train_from_iteratord   sT     %%!'')+55>>@
 	++ 	, 	
r2   )	NNFFNNNNF)__name__
__module____qualname____doc__r   r   rF   r   intr   boolfloatr+   staticmethodr8   r   r	   rH   r   rL   __classcell__)r0   s   @r1   r   r   
   s    7;OS!&#',037,0"80c4S>12380 sDsCx%S/)I$JJKL80 	80
 80 %80 %SM80 $,C=80 %SM80 80t ># > > >  "796S$s)^$6 6 	6
 6 U3
?3460  "79 $
x'>>?
 
 	

 
 U3
?34
 
r2   r   N)typingr   r   r   r   r   r   
tokenizersr	   r
   r   r   r   r   tokenizers.modelsr   tokenizers.normalizersr   r   r   base_tokenizerr   r    r2   r1   <module>r\      s+    ? ? \ \ ! S S )p
M p
r2   