
    gS@                         d dl Z d dlZd dlZd dlmZmZ d dlZd dlmZm	Z	m
Z
mZmZ d dlmZmZ d dlmZmZ d dlmZ  ej*                  e      Z G d d      Z G d	 d
e      Z G d de      Z G d d      Zd Zd Zd Zedk(  r e        yy)    N)ListUnion)AttentionInputIDsAttentionOutputIDsMultiHeadAttentionInputIDsMultiHeadAttentionOutputIDs	Operators)helper
load_model)	NodeProto	OnnxModel)SymbolicShapeInferenceHelperc                       e Zd ZdedefdZdeedf   fdZdeedf   fdZ	deedf   fdZ
defd	Zd
ee   dee   ddfdZd
ee   dee   ddfdZdededdfdZdeedf   fdZddeddfdZy)PackingAttentionBasemodelattention_op_typec                     || _         g | _        g | _        d| _        i | _        | j                   j                   j
                  j                  | _        || _        | j                   j                  |      | _
        y )NF)r   nodes_to_removenodes_to_addprune_graphnode_name_to_graph_namegraphnamethis_graph_namer   get_nodes_by_op_typeattention_nodes)selfr   r   s      e/var/www/openai/venv/lib/python3.12/site-packages/onnxruntime/transformers/convert_to_packing_mode.py__init__zPackingAttentionBase.__init__   sg     %
%'"$!&-/$$(JJ$4$4$:$:$?$?!2#zz>>?PQ    returnNc                 n   | j                   t        j                  k(  rt        j                  nt
        j                  }| j                         }|rt        |j                        |k  ry |j                  |   }| j                  D ].  }t        |j                        |k  s|j                  |   |k7  s. y  |S N)r   r	   	ATTENTIONr   
MASK_INDEXr   KEY_PADDING_MASK_try_getting_first_attentionleninputr   )r   
mask_indexfirst_attention_nodeattention_masknodes        r   _try_getting_attention_maskz0PackingAttentionBase._try_getting_attention_mask%   s     %%)<)<< ((+<< 	
  $@@B#s+?+E+E'F*'T-33J? ((D4::*,

:0F.0X ) r    c                 R    t        | j                        dk  ry | j                  d   S )Nr   )r(   r   r   s    r   r'   z1PackingAttentionBase._try_getting_first_attention9   s)    t##$)##A&&r    c                     d }| j                   j                         D ]?  }|j                  t        j                  k(  s|j                  t        j
                  k(  s>|}A |S r#   )r   nodesop_typer	   	LAYERNORMSKIPLAYERNORM)r   last_layernorm_noder-   s      r   _try_getting_last_layernormz0PackingAttentionBase._try_getting_last_layernorm?   sP    "JJ$$&D||y222dlliF]F]6]&*# ' #"r    c                     t               r#   NotImplementedErrorr0   s    r   _are_attentions_supportedz.PackingAttentionBase._are_attentions_supportedF       !##r    inputsoutputsc                 *   t        j                  t        j                  ||| j                  j                  t        j                              }d|_        | j                  j                  |       | j                  | j                  |j                  <   y Nr=   r>   r   com.microsoft)r
   	make_noder	   REMOVEPADDINGr   create_node_namedomainr   appendr   r   r   r   r=   r>   new_nodes       r   _insert_removepadding_nodez/PackingAttentionBase._insert_removepadding_nodeI   sp    ####,,Y-D-DE	
 *  *6:6J6J$$X]]3r    c                 *   t        j                  t        j                  ||| j                  j                  t        j                              }d|_        | j                  j                  |       | j                  | j                  |j                  <   y r@   )r
   rC   r	   RESTOREPADDINGr   rE   rF   r   rG   r   r   r   rH   s       r   _insert_restorepadding_nodez0PackingAttentionBase._insert_restorepadding_nodeU   sp    ##$$,,Y-E-EF	
 *  *6:6J6J$$X]]3r    token_offsetcumulative_sequence_lengthc                     t               r#   r9   )r   rN   rO   s      r   )_replace_attention_with_packing_attentionz>PackingAttentionBase._replace_attention_with_packing_attentiona   r<   r    c                 x    | j                   t        j                  k(  r|j                  t        j
                     S y r#   )r   r	   r$   r)   r   INPUT)r   r+   s     r   _get_input_to_remove_paddingz1PackingAttentionBase._get_input_to_remove_paddingd   s1    !!Y%8%88'--.?.E.EFFr    use_symbolic_shape_inferc                 p   t         j                  d       | j                         sy | j                         }|sy | j	                         }| j                         }|sy | j                  |      }|sy |dz   }|dz   }|dz   }|dz   }	| j                  ||g||||	g       | j                  j                  ||       t         j                  d       |j                  d   dz   }
| j                  |
|g|j                  d   g       | j                  j                  |j                  d   |
       t         j                  d	|j                   d
       | j                  ||       t         j                  d| j                   d| j                          | j                  j!                  | j"                         | j                  j%                  | j&                  | j(                         | j*                  r| j                  j+                          n2| j"                  s| j&                  r| j                  j-                          | j                  j/                          |r^t1        | j                  j                  d      }|j3                  | j                  j                  dd      }|r|| j                  _        y y y )Nz$start converting to packing model..._no_padding_token_offset_cumulated_seq_len_max_seq_lenz'inserted RemovePadding before Attentionr   _restore_inputz#inserted RestorePadding after last z layerz	replaced z with PackedverboseTF)
auto_mergeguess_output_rank)loggerdebugr;   r.   r'   r7   rT   rJ   r   replace_input_of_all_nodesoutputrM   replace_output_of_all_nodesr3   rQ   r   remove_nodesr   	add_nodesr   r   r   update_graphclean_shape_inferr   infer_shapes)r   rU   r,   r+   r6   input_to_remove_paddingoutput_without_paddingrN   cumulated_seq_lenmax_seq_lenrestorepadding_inputshape_infer_helperinferred_models                r   convertzPackingAttentionBase.converti   sw   ;<--/99;#@@B">>@" #'"C"CDX"Y&!8=!H.@36JJ->''$n5#\3DkR	
 	

--.EG]^>?  399!<?OO((*>)MPcPjPjklPmOno

../B/I/I!/LNbc:;N;V;V:WW]^_ 	66|EVWy!7!7 8TE[E[D\]^

 4 45

T..0L0LMJJ""$!!T%6%6JJ##%

$$&# ">djj>N>NXY!Z/<<TZZ=M=MZ^rw<xN#1

   $r    T)__name__
__module____qualname__r   strr   r   r.   r   r'   r7   boolr;   r   rJ   rM   rQ   rT   rq    r    r   r   r      s    Ri RC RU39-= ('eItO.D '#U9d?-C #$4 $
Kc 
KT#Y 
KSW 
K
K$s) 
Kd3i 
KTX 
K$c $gj $os $E#t)DT 
72 72 72r    r   c                   D     e Zd Zdef fdZdefdZdededdfdZ xZ	S )	PackingAttentionr   c                 B    t         |   |t        j                         y r#   )superr   r	   r$   r   r   	__class__s     r   r   zPackingAttention.__init__   s    	 3 34r    r!   c                    | j                   D ]  }t        j                  |d       yt        j                  |d       yt        j                  |d      }||dk7  r yt        |j                        t
        j                  kD  r|j                  t
        j                     s yt        |j                        t
        j                  kD  s|j                  t
        j                     r y y)Npast_present_share_bufferF	do_rotaryunidirectionalr   T)r   r   get_node_attributer(   r)   r   PASTPAST_SEQUENCE_LENGTH)r   r-   unidirection_attrs      r   r;   z*PackingAttention._are_attentions_supported   s    ((D++D2MNZ++D+>J ) < <TCS T ,1Ba1G4::!2!7!77

K\KaKa@bDJJ"3"H"HH

#4#I#IJ ) r    rN   rO   Nc           
         | j                   D ]  }t        |j                        t        j                  kD  r|j                  t        j                     nd}t        j                  t        j                  |j                  t        j                     |j                  t        j                     |j                  t        j                     |||g|j                  t        j                     g| j                  j!                  t        j                              }g }|j"                  D ]"  }|j$                  dv s|j'                  |       $ |j"                  j)                  |       d|_        | j,                  j'                  |       | j.                  j'                  |       | j0                  | j2                  |j$                  <    t4        j7                  dt        | j                                y )N rA   )	num_headsqkv_hidden_sizesscalerB   z0Converted %d Attention nodes to PackedAttention.)r   r(   r)   r   ATTENTION_BIASr
   rC   r	   PACKEDATTENTIONrS   WEIGHTSBIASrc   r   OUTPUTr   rE   	attributer   rG   extendrF   r   r   r   r   r`   info)r   rN   rO   	attentionattention_biaspacked_attention
attributesattrs           r   rQ   z:PackingAttention._replace_attention_with_packing_attention   s   --I y'*;*J*JJ  1 @ @A 
  &//))OO$5$;$;<OO$5$=$=>OO$5$:$:; ." #))*<*C*CDEZZ001J1JK  J!++99 JJ%%d+ , &&--j9&5#$$%56  ''	2BFBVBVD(()9)>)>?; .> 	FDL`L`Habr    )
rs   rt   ru   r   r   rw   r;   rv   rQ   __classcell__r~   s   @r   rz   rz      s;    5i 54 $ cc  cgj  cos  cr    rz   c                   z     e Zd Zdef fdZdedefdZdedefdZde	fdZ
d	ed
eddfdZdeedf   fdZ xZS )PackingMultiHeadAttentionr   c                 B    t         |   |t        j                         y r#   )r|   r   r	   MULTI_HEAD_ATTENTIONr}   s     r   r   z"PackingMultiHeadAttention.__init__   s    	 > >?r    indexr   c                     t        |j                        |kD  r:t        |j                  |         dkD  rt        j                  d| d| d|        yy)'Check a node does not have given input.r   znode input  (0) is not supported in PackedMultiHeadAttention: FT)r(   r)   r`   errorr   r-   r   r   s       r   _check_empty_inputz,PackingMultiHeadAttention._check_empty_input   sP    tzz?U"4::e$%){5'D69ijniopqr    c                     t        |j                        |kD  r:t        |j                  |         dkD  rt        j                  d| d| d|        yy)r   r   znode output r   r   FT)r(   rc   r`   r   r   s       r   _check_empty_outputz-PackingMultiHeadAttention._check_empty_output   sQ    t{{e#4;;u%&*|E7"TF:jkojpqrr    r!   c                 f   | j                   D ]!  }|j                  D ]8  }|j                  dvst        j	                  d|j                   d|          y |j
                  t        j                     r4|j
                  t        j                     st        j	                  d        y| j                  |t        j                  d      re| j                  |t        j                  d      rD| j                  |t        j                  d      r#| j                  |t        j                  d      r" y y)	Nr   mask_filter_valuer   znode attribute z/ is not supported in PackedMultiHeadAttention: Fz=packed kv format is not supported in PackedMultiHeadAttentionpast_keypresent_keyT)r   r   r   r`   r   r)   r   KEYVALUEr   PAST_KEY
PAST_VALUEr   r   PRESENT_KEYPRESENT_VALUE)r   r-   r   s      r   r;   z3PackingMultiHeadAttention._are_attentions_supported   s    ((D99$OOLL?499+=lmqlr!st  '
 zz4889$**MgMmMmBn\] ''.H.Q.QS]^++D2L2W2WYcd,,T3N3Z3Z\ij,,T3N3\3\^kl! )$ r    rN   rO   Nc                 F   d}| j                   D ]P  }t        |j                        t        j                  kD  r|j                  t        j                     nd}t        j                  t        j                  |j                  t        j                     |j                  t        j                     |j                  t        j                     |j                  t        j                     |||g|j                  t        j                     g| j                   j#                  t        j                              }g }|j$                  D ]"  }|j&                  dv s|j)                  |       $ |j$                  j+                  |       d|_        | j.                  j)                  |       | j0                  j)                  |       | j2                  | j4                  |j&                  <   |s| j                   j7                  |t        j                        }	|	s|	j8                  dk(  st        |	j                        dk(  s1|	j                  j)                  |       |dz  }S t:        j=                  d	t        | j                                t:        j=                  d
|       y )Nr   r   rA   r   rB   GatedRelativePositionBias      zBConverted %d MultiHeadAttention nodes to PackedMultiHeadAttention.z=Converted %d GatedRelativePositionBias nodes to packing mode.)r   r(   r)   r   r   r
   rC   r	   PACKED_MULTI_HEAD_ATTENTIONQUERYr   r   r   rc   r   r   r   rE   r   r   rG   r   rF   r   r   r   r   
get_parentr3   r`   r   )
r   rN   rO   gated_relative_pos_bias_countmhar   
packed_mhar   r   rel_pos_bias_nodes
             r   rQ   zCPackingMultiHeadAttention._replace_attention_with_packing_attention  s   ()%''C syy>$>$M$MM 		4CCD 
  ))55II8>>?II8<<=II8>>?II8==> ." $?$F$FGHZZ001V1VWJ J99 KK%%d+ &   ''
3 /J$$Z0  '',<@<P<PD((9 $(JJ$9$9#?Y?h?h$i!%)115PP-3349%++22<@1Q61S (V 	XZ]^b^r^rZstSUrsr    c                 |    | j                   j                  |d      }|r|j                  dk(  r|j                  d   S y )Nr   MatMul)r   r   r3   r)   )r   r+   matmuls      r   rT   z6PackingMultiHeadAttention._get_input_to_remove_padding5  s8    &&';Q?fnn0<<?"r    )rs   rt   ru   r   r   intrv   r   r   rw   r;   rQ   r   rT   r   r   s   @r   r   r      sy    @i @c  s # 4 *.tc .tgj .tos .t`E#t)DT r    r   c                   *    e Zd ZdefdZddeddfdZy)PackingModer   c                     || _         y r#   )r   )r   r   s     r   r   zPackingMode.__init__>  s	    
r    rU   r!   Nc                    | j                   j                  t        j                        re| j                   j                  t        j                        rt
        j                  d       y t        | j                         }|j                  |      S | j                   j                  t        j                        r&t        | j                         }|j                  |      S t
        j                  d       y )NzRPacking mode does not support both Attention and MultiHeadAttention in same graph.zPPacking mode requires either Attention or MultiHeadAttention node in onnx graph.)
r   r   r	   r$   r   r`   r   rz   rq   r   )r   rU   packings      r   rq   zPackingMode.convertA  s    ::**9+>+>?zz..y/M/MNqr&tzz2G??#;<<ZZ,,Y-K-KL/

;G??#;<<LLklr    rr   )rs   rt   ru   r   r   rw   rq   rx   r    r   r   r   =  s!    i   r    r   c                  R   t        j                  d      } | j                  ddt        d       | j                  ddt        d       | j                  d	d
dd       | j	                  d
       | j                  dd
dd       | j	                  d
       | j                         }|S )Nz_Convert to packing mode tool for ONNX Runtime. It converts BERT like model to use packing mode.)descriptionz--inputTzinput onnx model path)requiredtypehelpz--outputzoptimized onnx model pathz	--verboseF
store_truezshow debug information.)r   actionr   r\   z--use_external_data_formatz4use external data format to store large model (>2GB)use_external_data_format)argparseArgumentParseradd_argumentrv   set_defaults
parse_args)parserargss     r   _parse_argumentsr   P  s    $$uF 	DsAXY

TB]^
eLOhi
&
$C	   7DKr    c                 d    | rt        j                  dd       y t        j                  d       y )NDEBUGz8[%(filename)s:%(lineno)s - %(funcName)20s()] %(message)s)levelfmtz%(funcName)20s: %(message)s)r   )coloredlogsinstallr\   s    r   _setup_loggerr   h  s*    J	

 	 =>r    c                     t               } t        | j                         t        j	                  d|         t
        j                  j                  | j                        t
        j                  j                  | j                        k(  rt        j                  d       t        | j                        }t        t        |            }|j                          |j                  j!                  | j                  | j"                         y )Nz
arguments:zYSpecified the same input and output path. Note that this may overwrite the original modelr   )r   r   r]   r`   ra   ospathrealpathr)   rc   warningr   r   r   rq   r   save_model_to_filer   )r   r   packing_modes      r   mainr   r  s    D$,,
LL:dV$%	ww

#rww'7'7'DDrstzz"Ey/0L))$++PTPmPm)nr    __main__)r   loggingr   typingr   r   r   	constantsr   r   r   r   r	   onnxr
   r   
onnx_modelr   r   ro   r   	getLoggerrs   r`   r   rz   r   r   r   r   r   rx   r    r   <module>r      s      	    $ + ;			8	$F2 F2R6c+ 6cr^ 4 ^B &0?o  zF r    