
    8g!9                        d dl Z d dlZd dlZd dlmZ d dlmZmZmZm	Z	m
Z
mZmZ d dlmZmZ ej                   j#                  e      Zej                   j'                  ed      Z ee      5 Zej/                         Zddd        G d d      Ze	e   Ze	ee      Z G d d	e
      Z G d
 d      Z G d d      Z G d d      Z efde	e!   de!fdZ"y# 1 sw Y   WxY w)    N)Template)AnyCallableDictList
NamedTupleOptionalTuple)Encoding	Tokenizerzvisualizer-styles.cssc                   @    e Zd ZU eed<   eed<   eed<   dededefdZy)
Annotationstartendlabelc                 .    || _         || _        || _        y N)r   r   r   )selfr   r   r   s       P/var/www/openai/venv/lib/python3.12/site-packages/tokenizers/tools/visualizer.py__init__zAnnotation.__init__   s    

    N)__name__
__module____qualname__int__annotations__strr    r   r   r   r      s+    J	HJc  C r   r   c                   .    e Zd ZU ee   ed<   ee   ed<   y)CharStateKeytoken_ixanno_ixN)r   r   r   r	   r   r   r   r   r   r    r       s    smc]r   r    c                   P    e Zd ZU ee   ed<   d Zed        Zed        Z	de
fdZy)	CharStatechar_ixc                 .    || _         d | _        g | _        y r   )r%   r"   tokens)r   r%   s     r   r   zCharState.__init__'   s    &*!#r   c                 T    t        | j                        dkD  r| j                  d   S d S )Nr   lenr'   r   s    r   r!   zCharState.token_ix-   s%    !$T[[!1A!5t{{1~?4?r   c                 2    t        | j                        dkD  S )zJ
        BPE tokenizers can output more than one token for a char
           r)   r+   s    r   is_multitokenzCharState.is_multitoken1   s    
 4;;!##r   returnc                 D    t        | j                  | j                        S )N)r!   r"   )r    r!   r"   r+   s    r   partition_keyzCharState.partition_key8   s    ]]LL
 	
r   N)r   r   r   r	   r   r   r   propertyr!   r.   r    r1   r   r   r   r$   r$   $   sG    c]$ @ @ $ $
| 
r   r$   c                       e Zd Zy)AlignedN)r   r   r   r   r   r   r4   r4   ?   s    r   r4   c            
       R   e Zd ZdZ ej
                  dej                        Z	 	 ddede	de
eegef      fdZg dfd	ed
ede
e	   de
e   fdZed
edeeef   fd       Zedee   d	edefd       Zed	eded
edefd       Zed	ed
edefd       Zed	eded
edee   fd       Zy)EncodingVisualizera  
    Build an EncodingVisualizer

    Args:

         tokenizer (:class:`~tokenizers.Tokenizer`):
            A tokenizer instance

         default_to_notebook (:obj:`bool`):
            Whether to render html output in a notebook by default

         annotation_converter (:obj:`Callable`, `optional`):
            An optional (lambda) function that takes an annotation in any format and returns
            an Annotation object
    z(.{1})?(unk|oov)(.{1})?)flagsN	tokenizerdefault_to_notebookannotation_converterc                 t    |r		 ddl m}m} || _        || _        || _        y # t        $ r t	        d      w xY w)Nr   HTMLdisplayzWe couldn't import IPython utils for html display.
                        Are you running in a notebook?
                        You can also pass `default_to_notebook=False` to get back raw HTML
                    )IPython.core.displayr=   r>   ImportError	Exceptionr8   r9   annotation_coverter)r   r8   r9   r:   r=   r>   s         r   r   zEncodingVisualizer.__init__V   sN     > ##6 #7    s   " 7textannotationsr/   c                 H   | j                   }||}|r		 ddlm}m} | j                  t        t        | j                  |            }| j                  j                  |      }t        j                  |||      }|r  |             y|S # t        $ r t        d      w xY w)a  
        Build a visualization of the given text

        Args:
            text (:obj:`str`):
                The text to tokenize

            annotations (:obj:`List[Annotation]`, `optional`):
                An optional list of annotations of the text. The can either be an annotation class
                or anything else if you instantiated the visualizer with a converter function

            default_to_notebook (:obj:`bool`, `optional`, defaults to `False`):
                If True, will render the html in a notebook. Otherwise returns an html string.

        Returns:
            The HTML string if default_to_notebook is False, otherwise (default) returns None and
            renders the HTML in the notebook

        Nr   r<   zeWe couldn't import IPython utils for html display.
                    Are you running in a notebook?)r9   r?   r=   r>   r@   rA   rB   listmapr8   encoder6   _EncodingVisualizer__make_html)	r   rC   rD   r9   final_default_to_notebookr=   r>   encodinghtmls	            r   __call__zEncodingVisualizer.__call__l   s    2 %)$<$<!*(;%$> ##/s4#;#;[IJK>>((.!--dHkJ$DJK  6 s   B B!c                     t        |       dk(  ri S t        t        d |             }t        |      }t        d|z        }|dk  rd}d}d}d}i }t	        |      D ]  }d| d	| d
| d||<   ||z  } |S )a  
        Generates a color palette for all the labels in a given set of annotations

        Args:
          annotations (:obj:`Annotation`):
            A list of annotations

        Returns:
            :obj:`dict`: A dictionary mapping labels to colors in HSL format
        r   c                     | j                   S r   )r   )xs    r   <lambda>z;EncodingVisualizer.calculate_label_colors.<locals>.<lambda>   s    177r             @   
   zhsl(,z%,%)r*   setrG   r   sorted)	rD   labels
num_labelsh_stepslhcolorsr   s	            r   calculate_label_colorsz)EncodingVisualizer.calculate_label_colors   s     {q IS*K89[
S:%&B;FF^E"1#QqcA3a0F5MKA $ r   consecutive_chars_listrK   c                    | d   }|j                   |j                  |j                     }d| dS | d   }|j                   }|j                   dz   }||| }g }	i }
|j                  |	j                  d       |j                  r|	j                  d       |j                  dz  r|	j                  d	       n|	j                  d
       t
        j                  j                  |j                  |j                           ?|	j                  d       |j                  |j                     |
d<   n|	j                  d       ddj                  |	       d}d}|
j                         D ]  \  }}|d| d| dz  } d| d| d| dS )a  
        Converts a list of "consecutive chars" into a single HTML element.
        Chars are consecutive if they fall under the same word, token and annotation.
        The CharState class is a named tuple with a "partition_key" method that makes it easy to
        compare if two chars are consecutive.

        Args:
            consecutive_chars_list (:obj:`List[CharState]`):
                A list of CharStates that have been grouped together

            text (:obj:`str`):
                The original text being processed

            encoding (:class:`~tokenizers.Encoding`):
                The encoding returned from the tokenizer

        Returns:
            :obj:`str`: The HTML span for a set of consecutive chars
        r   z(<span class="special-token" data-stoken=z></span>r-   tokenzmulti-token   z	odd-tokenz
even-tokenzspecial-tokenstokz	non-tokenzclass=" " z data-z="z<span z ></span>)
r%   r'   r!   appendr.   r6   unk_token_regexsearchjoinitems)rc   rC   rK   firststokenlastr   r   	span_textcss_classes
data_itemscssdatakeyvals                  r   consecutive_chars_to_htmlz,EncodingVisualizer.consecutive_chars_to_html   s   2 'q)== __U^^4F >fXXNN%b)llQsO	
>>%w'""""=1~~!
 "";/ ""<0!11889XYe""?3%-__U^^%D
6" {+#((;/04"((*HCfSEC5**D +uAdV2i[88r   c                 B   t         j                  | ||      }|d   g}|d   j                  }g }t         j                  |      }|d   j                  }|.||   }	|	j                  }
||
   }|j                  d| d|
 d       |dd  D ]  }|j                  }||k7  rm|j                  t         j                  || |             |g}||j                  d       |.||   }	|	j                  }
||
   }|j                  d| d|
 d       |}|j                         |d   j                         k(  r|j                  |       |j                  t         j                  || |             |g} |j                  t         j                  || |             t        |      }|S )Nr   z&<span class="annotation" style="color:z" data-label="z">r-   )rC   rK   rl   )	r6   %_EncodingVisualizer__make_char_statesr"   rb   r   rm   r|   r1   HTMLBody)rC   rK   rD   char_statescurrent_consecutive_charsprev_anno_ixspanslabel_colors_dictcur_anno_ixannor   colorcsress                 r   __make_htmlzEncodingVisualizer.__make_html   s   (;;D(KX%0^$4!"1~--.EEkR!!n,,"{+DJJE%e,ELLA%W\V]]_`aab/B**Kl*&@@1!!) A  .0D)+LL+*&{3D JJE-e4ELL#I%P^_d^eeg!hi&L!%>q%A%O%O%QQ)004 &@@1!!) A  .0D)M "R 	88)! 9 	
 uo
r   c                     dgt        |       z  }t        |      D ]/  \  }}t        |j                  |j                        D ]  }|||<   	 1 |S )a  
        Args:
            text (:obj:`str`):
                The raw text we want to align to

            annotations (:obj:`AnnotationList`):
                A (possibly empty) list of annotations

        Returns:
            A list of  length len(text) whose entry at index i is None if there is no annotation on
            charachter i or k, the index of the annotation that covers index i where k is with
            respect to the list of annotations
        N)r*   	enumerateranger   r   )rC   rD   annotation_mapr"   ais         r   __make_anno_mapz"EncodingVisualizer.__make_anno_map<  sR     #d)+#K0JGQ177AEE*$+q! + 1 r   c                    t         j                  | |      }t        t        |             D cg c]  }t	        |       }}t        |j                        D ]M  \  }}|j                  |      }||\  }	}
t        |	|
      D ]   }||   j                  j                  |       " O t        |      D ]  \  }}|||   _	         |S c c}w )a  
        For each character in the original text, we emit a tuple representing it's "state":

            * which token_ix it corresponds to
            * which word_ix it corresponds to
            * which annotation_ix it corresponds to

        Args:
            text (:obj:`str`):
                The raw text we want to align to

            annotations (:obj:`List[Annotation]`):
                A (possibly empty) list of annotations

            encoding: (:class:`~tokenizers.Encoding`):
                The encoding returned from the tokenizer

        Returns:
            :obj:`List[CharState]`: A list of CharStates, indicating for each char in the text what
            it's state is
        )
r6   "_EncodingVisualizer__make_anno_mapr   r*   r$   r   r'   token_to_charsrm   r"   )rC   rK   rD   r   r%   r   r!   rf   offsetsr   r   r   r"   s                r   __make_char_statesz%EncodingVisualizer.__make_char_statesQ  s    . ,;;D+NJOPSTXPYJZ'[JZw	'(:JZ'[(9OHe--h7G"$
suc*AN))00: +	  : !*. 9GW+2K ( !:  (\s   C)TN)r   r   r   __doc__recompile
IGNORECASErn   r   boolr	   r   r   r   r   r   AnnotationListrM   staticmethodr   rb   r   r$   r   r|   rI   PartialIntListr   r~   r   r   r   r6   r6   C   s     !bjj!>bmmTO
 %)FJ	 " 'xz0A'BC	2 ').2	++ $+ &d^	+
 
#+Z N tCH~  8 A9 $YA9A9 A9 A9F ?# ? ? ?SV ? ?B c  >  ( " " "~ "Z^_hZi " "r   r6   childrenr/   c                 6    dj                  |       }d| d| dS )a[  
    Generates the full html with css from a list of html spans

    Args:
        children (:obj:`List[str]`):
            A list of strings, assumed to be html elements

        css_styles (:obj:`str`, `optional`):
            Optional alternative implementation of the css

    Returns:
        :obj:`str`: An HTML string with style markup
    rk   z?
    <html>
        <head>
            <style>
                zs
            </style>
        </head>
        <body>
            <div class="tokenized-text" dir=auto>
            z4
            </div>
        </body>
    </html>
    )rp   )r   
css_styleschildren_texts      r   r   r   w  s9     GGH%M  
 O  r   )#	itertoolsosr   stringr   typingr   r   r   r   r   r	   r
   
tokenizersr   r   pathdirname__file__rp   css_filenameopenfreadrx   r   r   r   r   r    r$   r4   r6   r   r   r   r   r   <module>r      s     	 	  I I I * ''//(
#ww||G%<=	,1
&&(C   j!hsm$: 

 
6	 	q qh	 .1 tCy S W s   ,CC