
from typing import Dict, Iterator, List, Optional, Union

from tokenizers import AddedToken, Tokenizer, decoders, trainers
from tokenizers.models import WordPiece
from tokenizers.normalizers import BertNormalizer
from tokenizers.pre_tokenizers import BertPreTokenizer
from tokenizers.processors import BertProcessing

from .base_tokenizer import BaseTokenizer


class BertWordPieceTokenizer(BaseTokenizer):
    """Bert WordPiece Tokenizer"""

    def __init__(
        self,
        vocab: Optional[Union[str, Dict[str, int]]] = None,
        unk_token: Union[str, AddedToken] = "[UNK]",
        sep_token: Union[str, AddedToken] = "[SEP]",
        cls_token: Union[str, AddedToken] = "[CLS]",
        pad_token: Union[str, AddedToken] = "[PAD]",
        mask_token: Union[str, AddedToken] = "[MASK]",
        clean_text: bool = True,
        handle_chinese_chars: bool = True,
        strip_accents: Optional[bool] = None,
        lowercase: bool = True,
        wordpieces_prefix: str = "##",
    ):
        if vocab is not None:
            tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(unk_token)))
        else:
            tokenizer = Tokenizer(WordPiece(unk_token=str(unk_token)))

        # Register the special tokens that are already present in the vocabulary
        if tokenizer.token_to_id(str(unk_token)) is not None:
            tokenizer.add_special_tokens([str(unk_token)])
        if tokenizer.token_to_id(str(sep_token)) is not None:
            tokenizer.add_special_tokens([str(sep_token)])
        if tokenizer.token_to_id(str(cls_token)) is not None:
            tokenizer.add_special_tokens([str(cls_token)])
        if tokenizer.token_to_id(str(pad_token)) is not None:
            tokenizer.add_special_tokens([str(pad_token)])
        if tokenizer.token_to_id(str(mask_token)) is not None:
            tokenizer.add_special_tokens([str(mask_token)])

        tokenizer.normalizer = BertNormalizer(
            clean_text=clean_text,
            handle_chinese_chars=handle_chinese_chars,
            strip_accents=strip_accents,
            lowercase=lowercase,
        )
        tokenizer.pre_tokenizer = BertPreTokenizer()

        if vocab is not None:
            # The BERT post-processor needs the ids of [SEP] and [CLS], so it
            # can only be configured when a vocabulary is provided
            sep_token_id = tokenizer.token_to_id(str(sep_token))
            if sep_token_id is None:
                raise TypeError("sep_token not found in the vocabulary")
            cls_token_id = tokenizer.token_to_id(str(cls_token))
            if cls_token_id is None:
                raise TypeError("cls_token not found in the vocabulary")

            tokenizer.post_processor = BertProcessing(
                (str(sep_token), sep_token_id), (str(cls_token), cls_token_id)
            )
        tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)

        parameters = {
            "model": "BertWordPiece",
            "unk_token": unk_token,
            "sep_token": sep_token,
            "cls_token": cls_token,
            "pad_token": pad_token,
            "mask_token": mask_token,
            "clean_text": clean_text,
            "handle_chinese_chars": handle_chinese_chars,
            "strip_accents": strip_accents,
            "lowercase": lowercase,
            "wordpieces_prefix": wordpieces_prefix,
        }

        super().__init__(tokenizer, parameters)

    @staticmethod
    def from_file(vocab: str, **kwargs):
        vocab = WordPiece.read_file(vocab)
        return BertWordPieceTokenizer(vocab, **kwargs)

    def train(
        self,
        files: Union[str, List[str]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        special_tokens: List[Union[str, AddedToken]] = [
            "[PAD]",
            "[UNK]",
            "[CLS]",
            "[SEP]",
            "[MASK]",
        ],
        show_progress: bool = True,
        wordpieces_prefix: str = "##",
    ):
        """Train the model using the given files"""

        trainer = trainers.WordPieceTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            special_tokens=special_tokens,
            show_progress=show_progress,
            continuing_subword_prefix=wordpieces_prefix,
        )
        if isinstance(files, str):
            files = [files]
        self._tokenizer.train(files, trainer=trainer)

    def train_from_iterator(
        self,
        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        special_tokens: List[Union[str, AddedToken]] = [
            "[PAD]",
            "[UNK]",
            "[CLS]",
            "[SEP]",
            "[MASK]",
        ],
        show_progress: bool = True,
        wordpieces_prefix: str = "##",
        length: Optional[int] = None,
    ):
        """Train the model using the given iterator"""

        trainer = trainers.WordPieceTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            special_tokens=special_tokens,
            show_progress=show_progress,
            continuing_subword_prefix=wordpieces_prefix,
        )
        self._tokenizer.train_from_iterator(
            iterator,
            trainer=trainer,
            length=length,
        )
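

# Usage sketch (illustrative, not part of the upstream library source): a
# minimal train-then-encode round trip, assuming a plain-text corpus file
# named "corpus.txt" (one sentence per line) exists in the working directory.
if __name__ == "__main__":
    tokenizer = BertWordPieceTokenizer(lowercase=True)
    # Learn a 30k WordPiece vocabulary from the corpus.
    tokenizer.train(files="corpus.txt", vocab_size=30000)
    # Encode a sentence. Note that the BertProcessing post-processor (which
    # adds [CLS]/[SEP]) is only attached when a vocab is passed to __init__.
    output = tokenizer.encode("Hello, how are you?")
    print(output.tokens)
    # Persist the full tokenizer (model, normalizer, pre-tokenizer, decoder).
    tokenizer.save("bert-wordpiece-tokenizer.json")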