
from typing import Dict, Iterator, List, Optional, Tuple, Union

from .. import AddedToken, Tokenizer, decoders, pre_tokenizers, trainers
from ..models import BPE
from ..normalizers import BertNormalizer, Lowercase, Sequence, unicode_normalizer_from_str

from .base_tokenizer import BaseTokenizer


class CharBPETokenizer(BaseTokenizer):
    """Original BPE Tokenizer

    Represents the BPE algorithm, as introduced by Rico Sennrich
    (https://arxiv.org/abs/1508.07909)

    The default settings correspond to those of the OpenAI GPT BPE tokenizer and differ from the
    original Sennrich subword-nmt implementation in the following options, which you can deactivate:
        - adding a normalizer to clean up the text (deactivate with `bert_normalizer=False`) by:
            * removing any control characters and replacing all whitespace with the classic space.
            * handling Chinese characters by putting spaces around them.
            * stripping all accents.
        - splitting on punctuation in addition to whitespace (deactivate it with
          `split_on_whitespace_only=True`)
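
    A minimal usage sketch (the corpus and file names are illustrative, not part of this
    module): train on plain-text files, then encode. Tokens produced by `encode` carry the
    end-of-word `suffix` ("</w>" by default).

        tokenizer = CharBPETokenizer()
        tokenizer.train(["corpus.txt"], vocab_size=5000)
        encoding = tokenizer.encode("A short example sentence.")
        print(encoding.tokens)

    A tokenizer saved to vocab/merges files can be reloaded with
    `CharBPETokenizer.from_file("vocab.json", "merges.txt")`.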
    N<unk></w>Tvocabmerges	unk_tokensuffixdropout	lowercaseunicode_normalizerbert_normalizersplit_on_whitespace_onlyc
           
         |%|#t        t        |||t        |      |            }
n t        t        t        |      ||            }
|
j                  t        |            |
j	                  t        |      g       g }|r|t        |      gz  }|r|t        d      gz  }|r|t               gz  }t        |      dkD  r)t        |      dkD  rt        |      |
_
        n
|d   |
_
        |	rt        j                         |
_        nt        j                         |
_        t        j                   |      |
_        d|||||||	d	}t$        | M  |
|       y )
N)r   r   end_of_word_suffix)r   r   r$   F)r   r   r   )r   r   )modelr   r   r   r   r    r!   r"   )r   r   strtoken_to_idadd_special_tokensr   r   r   lenr   
normalizerr   WhitespaceSplitpre_tokenizerBertPreTokenizerr   
BPEDecoderdecodersuper__init__)selfr   r   r   r   r   r   r    r!   r"   	tokenizernormalizers
parameters	__class__s                ^/var/www/openai/venv/lib/python3.12/site-packages/tokenizers/implementations/char_level_bpe.pyr1   zCharBPETokenizer.__init__   s\    !3!#!)n'-I "#Idj"klI  Y0<((#i.)9: 78JKLLKNU;<<KIK=(K {a;!#'/'<	$'21~	$#&4&D&D&FI#&4&E&E&GI#$//v>	 """4.(@	

 	J/    vocab_filenamemerges_filenamec                 N    t        j                  | |      \  }}t        ||fi |S )N)r   	read_filer   )r9   r:   kwargsr   r   s        r7   	from_filezCharBPETokenizer.from_file\   s(    noFvv888r8   i0u  r	   i  files
vocab_sizemin_frequencyspecial_tokenslimit_alphabetinitial_alphabetshow_progressc	           	          t        j                  |||||||      }	t        |t              r|g}| j                  j                  ||	       y)z%Train the model using the given filesr@   rA   rB   rC   rD   r$   rE   )trainerN)r   
BpeTrainer
isinstancer&   
_tokenizertrain)
r2   r?   r@   rA   rB   rC   rD   r   rE   rH   s
             r7   rL   zCharBPETokenizer.traina   sS     %%!'))-%'
 eS!GEeW5r8   iteratorlengthc
           	      x    t        j                  |||||||      }
| j                  j                  ||
|	       y)z(Train the model using the given iteratorrG   )rH   rN   N)r   rI   rK   train_from_iterator)r2   rM   r@   rA   rB   rC   rD   r   rE   rN   rH   s              r7   rP   z$CharBPETokenizer.train_from_iterator{   sK     %%!'))-%'
 	++ 	, 	
r8   )	NNr   r   NFNTF)__name__
__module____qualname____doc__r   r   r&   r   intr   r
   floatboolr1   staticmethodr>   r   rL   r   rP   __classcell__)r6   s   @r7   r   r   	   s9   " 7;OS,3#',0 $).A0c4S>123A0 sDsCx%S/)I$JJKLA0 j)	A0
 A0 %A0 A0 %SMA0 A0 #'A0F 9# 9 9 9  8?y"&( &"6S$s)^$6 6 	6
 U3
?346 6 s)6 6 6:  8?y"&( &" $
x'>>?
 
 	

 U3
?34
 
 s)
 
 
 
r8   r   N)typingr   r   r   r   r   r    r
   r   r   r   r   modelsr   r4   r   r   r   r   base_tokenizerr   r    r8   r7   <module>r_      s+    ? ? H H  Z Z )M
} M
r8   