
    g                        d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dl	m
Z
mZmZmZ d dlZd dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlm Z m!Z!m"Z"m#Z#  e jH                  e%      Z&g dZ'ejP                  dejR                  dejT                  diZ+ G d de"      Z, G d de#      Z- G d de"      Z. G d de!      Z/ G d de!      Z0e/ddfe0ddfe.ddfdZ1 G d d       Z2 G d! d"      Z3y)#    N)Path)DictListTupleUnion)	Precision)float_to_float16_max_diff)FusionOptions)IOBindingHelper)	OnnxModel)optimize_model)torch_onnx_export)
GPT2ConfigGPT2LMHeadModel	GPT2ModelTFGPT2Model)
distilgpt2gpt2zgpt2-mediumz
gpt2-largezgpt2-xlMb@?g?g      @c                   ,     e Zd ZdZ fdZ fdZ xZS )GPT2ModelNoPastState2Here we wrap a class to disable past state output.c                 $    t         |   |       y Nsuper__init__selfconfig	__class__s     e/var/www/openai/venv/lib/python3.12/site-packages/onnxruntime/transformers/models/gpt2/gpt2_helper.pyr   zGPT2ModelNoPastState.__init__+            c                 (    t         |   |dd      S )NF)	use_cachereturn_dict)r   forwardr   	input_idsr!   s     r"   r(   zGPT2ModelNoPastState.forward.   s    wyEuMMr$   __name__
__module____qualname____doc__r   r(   __classcell__r!   s   @r"   r   r   (   s    <!N Nr$   r   c                   ,     e Zd ZdZ fdZ fdZ xZS )TFGPT2ModelNoPastStater   c                 2    d|_         t        | 	  |       y )NF)r&   r   r   r   s     r"   r   zTFGPT2ModelNoPastState.__init__5   s      r$   c                 &    t         |   |d      S )NF)r&   )r   callr)   s     r"   r(   zTFGPT2ModelNoPastState.forward9   s    w|I|77r$   r+   r1   s   @r"   r3   r3   2   s    <!8 8r$   r3   c                   <     e Zd ZdZ fdZed        Z fdZ xZS )MyGPT2ModelzMHere we wrap a class for Onnx model conversion for GPT2Model with past state.c                 $    t         |   |       y r   r   r   s     r"   r   zMyGPT2Model.__init__@   r#   r$   c           	         t        | d   d   t        t        f      rt        | d         |k(  rt        | d   d         dk(  sJ g }t	        |      D ]Z  }|j                  t        j                  | d   |   d   j                  d      | d   |   d   j                  d      fd             \ | d   t        |      fS | S )N   r      )dim)	
isinstancetuplelistlenrangeappendtorchcat	unsqueeze)result	num_layerpresentis       r"   post_processzMyGPT2Model.post_processC   s    fQilUDM2vay>Y.3vay|3D3IIIG9% II1a2215vay|A7P7PQR7ST & 1IuW~..r$   c                     t         |   ||||d      }t        j                  || j                  j
                        S NF)position_idsattention_maskpast_key_valuesr'   r   r(   r8   rK   r    n_layerr   r*   rN   rO   pastrG   r!   s         r"   r(   zMyGPT2Model.forwardV   sD    %)  ! 
 ''0C0CDDr$   )	r,   r-   r.   r/   r   staticmethodrK   r(   r0   r1   s   @r"   r8   r8   =   s+    W!  $E Er$   r8   c                   ,     e Zd ZdZ fdZ fdZ xZS )MyGPT2LMHeadModelzSHere we wrap a class for Onnx model conversion for GPT2LMHeadModel with past state.c                 $    t         |   |       y r   r   r   s     r"   r   zMyGPT2LMHeadModel.__init__d   r#   r$   c                     t         |   ||||d      }t        j                  || j                  j
                        S rM   rQ   rS   s         r"   r(   zMyGPT2LMHeadModel.forwardg   sD    %)  ! 
 ''0C0CDDr$   r+   r1   s   @r"   rW   rW   a   s    ]!	E 	Er$   rW   c                   ,     e Zd ZdZ fdZ fdZ xZS )MyGPT2LMHeadModel_NoPaddinga  Here we wrap a class for Onnx model conversion for GPT2LMHeadModel with past state and no padding.
    When you always use batch_size=1 in inference, there is no padding in inputs. In such case, position_ids
    and attention_mask need no be in inputs.
    c                 $    t         |   |       y r   r   r   s     r"   r   z$MyGPT2LMHeadModel_NoPadding.__init__y   r#   r$   c                 |    t         |   ||d      }t        j                  || j                  j
                        S )NF)rP   r'   rQ   )r   r*   rT   rG   r!   s       r"   r(   z#MyGPT2LMHeadModel_NoPadding.forward|   s4    DeT''0C0CDDr$   r+   r1   s   @r"   r[   r[   s   s    
!E Er$   r[   logitsTF
last_state)r   GPT2LMHeadModel_NoPaddingr   c                   0    e Zd Zd ZdefdZdefdZd Zy)
Gpt2Inputsc                 <    || _         || _        || _        || _        y r   )r*   rN   rO   rT   )r   r*   rN   rO   rT   s        r"   r   zGpt2Inputs.__init__   s     +4.:\jLP	r$   returnc                     | j                   | j                  | j                  fD cg c]  }||	 }}| j                  r|j	                  | j                         |S c c}w r   )r*   rN   rO   rT   extend)r   v
input_lists      r"   to_listzGpt2Inputs.to_list   sZ    "&..$2C2CTEXEX!Yk!YA]^]ja!Y
k99dii(	 ls
   AAc                 ~    t        d | j                  | j                  | j                  | j                  fD              S )Nc              3   &   K   | ]	  }||  y wr    ).0rg   s     r"   	<genexpr>z&Gpt2Inputs.to_tuple.<locals>.<genexpr>   s     u c1ghgtQ cs   )r?   r*   rN   rO   rT   )r   s    r"   to_tuplezGpt2Inputs.to_tuple   s3    u1B1BDDWDWY]YbYb cuuur$   c                    d }| j                   ]| j                   j                  t        j                  k(  r*| j                   j	                  t        j
                        n| j                   }| j                  D cg c]"  }|j	                  t        j
                        $ }}t        | j                  | j                  ||      S c c}w )N)dtype)
rO   rq   rD   float16tofloat32rT   rb   r*   rN   )r   rO   prT   s       r"   to_fp32zGpt2Inputs.to_fp32   s    * ''--> ##&&U]]&;((  4899=9a5==)9=$..$*;*;^TRR >s   :'CN)	r,   r-   r.   r   r   ri   r   ro   rv   rl   r$   r"   rb   rb      s(    Q v% vSr$   rb   c            "       p   e Zd ZdZedddej                  ej                  ej                  dfdededededed	ed
edej                  de	de	de	dej                  dej                  dej                  de	def d       Ze	 dAdedededededeeee   f   fd       Zed        ZedBd       ZedBd       ZedCd       ZedDd       Zeddddej                  ej                  ej                  fdede	d e	de	de	dej                  dej                  dej                  fd!       Ze	 	 	 dEd#       Zeg d$fd%ed&ee   fd'       ZedFd(ed)efd*       ZedFd(ed)efd+       Zed,        ZedGd-       Ze	 	 	 dHd(ed.eeej>                  f   d/eeee   f   d)ed0e	d1e	fd2       Z ed3        Z!ed4        Z"edd5d5d6d7ddddej                  ej                  ej                  d"ddfd8       Z#edd9ddddej                  ej                  ej                  d:d7d;fd<       Z$edId=       Z%edddg d>fdefd?       Z&y@)J
Gpt2HelperzEA helper class for Gpt2 model conversion, inference and verification.FT
batch_sizepast_sequence_lengthsequence_lengthnum_attention_headshidden_sizerH   
vocab_sizedevicerr   has_position_idshas_attention_maskinput_ids_dtypeposition_ids_dtypeattention_mask_dtypeleft_side_paddingrd   c                    |rt         j                  nt         j                  }d| ||t        ||z        g}t	        |      D cg c]   }t        j
                  |||      dz  dz
  " }}t        j                  d|dz
  | |f||      }d}|
re||z   }t        j                  | |g||      }|dk\  rAt	        |       D ]3  }t        j                  d|dz
        }|r
d||d|f<   (d||||z
  df<   5 d}|	rQ|j                         j                  d	      dz
  }|j                  |dk  d       |dd|df   j                  |      }t        ||||      S c c}w )
zCreate random inputs for GPT2 model.
        Returns torch tensors of input_ids, position_ids, attention_mask and a list of past state tensors.
        r<   rq   r   g       @      ?r   r;   )lowhighsizerq   r   N)rD   rr   rt   intrB   randrandintonesrandomlongcumsummasked_fill_rs   rb   )ry   rz   r{   r|   r}   rH   r~   r   rr   r   r   r   r   r   r   
float_type
past_shape_rT   r*   rO   total_sequence_lengthrJ   padding_lengthrN   s                            r"   get_dummy_inputszGpt2Helper.get_dummy_inputs   s   * '.U]]5==
 112

 `een_op_oZ[JjH3NQTT_opMMao.!
	 $8?$J!"ZZ23*N %)z*A%+^^A7Lq7P%QN(=>q/>/'9:VWq*?.*P*R'RS + )..077;a?L%%lQ&6:'+?+@(@ADDEWXL)\>4HHC qs   %Er   r    model_classc                    |j                   }|j                  }|j                  }|j                  }t        |   d   }	| ||	dk(  r|n|g}
d| |||z   t        ||z        g}|	|
i}t        |      D ]  }||dt        |      z   <    |S )zAReturns a dictionary with output name as key, and shape as value.r;   r^   r<   present_)r|   r}   num_hidden_layersr~   MODEL_CLASSESr   rB   str)ry   rz   r{   r    r   r|   r}   rH   r~   output_namelast_state_shapepresent_state_shapeoutput_shapesrJ   s                 r"   get_output_shapeszGpt2Helper.get_output_shapes   s     %88((,,	&&
#K03 %1J{
  ?2112
 %&67y!A1DM*s1v-. " r$   c                    |D ]|  }|| v sJ | |   }t        j                  ||         |j                         kD  s8t        j                  t        j                  ||         |j
                  |j                        | |<   ~ y )Nr   )numpyprodnelementrD   emptyrq   r   )output_buffersr   keybuffers       r"   auto_increase_buffer_sizez$Gpt2Helper.auto_increase_buffer_size  ss     C.(((#C(Fzz-,-0AA&+kkJJ}S12 ,,!=='s#	 !r$   c                     |rt         j                  nt         j                  }i }| j                         D ]3  \  }}t        j                  t        j                  |      ||      ||<   5 |S )zpReturns a dictionary of output name as key, and 1D tensor as value. The tensor has enough space for given shape.r   )rD   rr   rt   itemsr   r   r   )r   r   
is_float16	data_typer   nameshapes          r"   get_output_bufferszGpt2Helper.get_output_buffers  sX     &0EMMU]]	(..0KD%#(;;uzz%/@	Z`#aN4  1r$   c                    | d   j                         j                         }t        j                  ||d   z
        }|r.t        j                  |t        j                  |      dz   z        S t        j                  |      S )zGReturns the maximum difference between PyTorch and OnnxRuntime outputs.r   ư>)cpur   absamax)torch_outputsort_outputsrelativeexpected_outputsdiffs        r"   diff_outputszGpt2Helper.diff_outputs&  sn     )+//1779yy)KN:;::deii0@&AD&HIJJ::d##r$   c           	         t        j                  |d   | d   j                         j                         ||      }t        j	                  d|        |}t        |      dz
  }t        |      D ]g  }t        j                  |d|z      | d   |   j                         j                         ||      }t        j	                  d| d| d|        |xr |}i |s/t        j                  | |      }	t        j                  d|	d	       |S )
zReturns True if torch and ORT outputs are close for given thresholds, and False otherwise.
        Note: need kwargs since Gpt2BeamSearchHelper.compare_outputs has an extra parameter model_class
        r   )rtolatolz9PyTorch and OnnxRuntime output 0 (last_state) are close: r;   zPyTorch and OnnxRuntime layer z state (present_z) are close:z@PyTorch and OnnxRuntime results are not all close: max_abs_diff=.5f)
r   allcloser   loggerdebugrA   rB   rx   r   info)
r   r   r   r   kwargsis_closeis_all_close
num_layerslayermax_abs_diffs
             r"   compare_outputszGpt2Helper.compare_outputs0  s   
 >>+a.-2B2F2F2H2N2N2PW[bfgPQYPZ[\%)
:&E~~AI&a '++-335	H LL9%@PQVPWWcdlcmno'4HL ' %22=+NLKKZ[ghkZlmnr$   c                    d}d}g }g }t        t        |            D ]  }||   }|dk(  r| d   n
| d   |dz
     j                         j                         }	t        j                  ||	|d      }
|j                  t        j                  t        j                  |	|z
                     |xr |
}t        j                  |	      j                         rt        j                  d| d       t        j                  |	      j                         rt        j                  d| d       t        j                  |      j                         rt        j                  d	| d       t        j                  |      j                         rt        j                  d	| d       t        j                  ||	z
        }t        j                  |j                         |j                         }|j                  d
||   dd| d||   ddt#        |	|         d       |dk(  st        j                  t        j                  |d      |j                         }t        j                  t        j                  |	d      |	j                         }t        j$                  ||      } |j'                  t)        |            }|t)        |      |||fS )a  Compare outputs from PyTorch and OnnxRuntime

        Args:
            torch_outputs (Tuple[Torch.Tensor]): PyTorch model output
            ort_outputs (List[numpy.ndarray]): OnnxRuntime output
            atol (float, optional): Absolute tollerance. Defaults to 1e-06.

        Returns:
            is_all_close(bool): whether all elements are close.
            max_abs_diff(float): maximum absolute difference.
            messages(str): a list of debug message for each output
        TFr   r;   )r   r   zPyTorch output z has nanz has infzORT output zdiff=z.9fz index=z ort=z torch=N)axis)rB   rA   r   r   r   rC   r   r   isnananyr   r   isinffabsunravel_indexargmaxr   floatarray_equalindexmax)r   r   r   r   is_top1_matched	max_diffsmessagesrJ   
ort_outputtorch_outputr   r   idxort_max_indextorch_max_indexmax_diff_output_indexs                   r"   compare_outputs_v2zGpt2Helper.compare_outputs_v2K  s`    	s;'(A$QJ01QM!,M!<LQQRU<SXXZ``bL~~j,TPQRHUZZ		,2K(LMN'4HL{{<(,,.qc:;{{<(,,.qc:;{{:&**,{1#X67{{:&**,{1#X67::j<78D%%dkkmTZZ@COOS	#gcU%
37LGTYZfgjZkTlmpSqr Av % 3 3ELLRV4WYcYiYi j"'"5"5ell<VZ6[]i]o]o"p"'"3"3M?"S3 )6 !*I ?	N!
 	
r$   onnx_model_pathverboseuse_external_data_formatc
                    | j                   }
|
j                  }t        j                  ddd|
j                  |
j
                  ||
j                  |d|||||	      }|j                         }t        j                         5   | | }ddd       t        |      D cg c]  }d| 	 }}t        |      D cg c]  }d| 	 }}d   j                  d   |
j                  k(  s!|d   j                  d   |
j
                  k(  sJ |d   j                  d   |
j                  k(  rd	nd
g|}dddd|d   dddi}|D ]
  }ddd||<    |D ]
  }ddd||<    dg}|rddd|d<   |j                  d       |rddd|d<   |j                  d       |j                  |       t        |      dk(  rt        |d         |k(  sJ t        j!                  d|j"                  j                   d|j$                  d   j                   d|d   j                   d|d   d   j                          t'        |      j(                  j+                  dd       |rt-        j.                         5 }t0        j2                  j5                  |d      }t'        |      j(                  j+                  dd       t7        | t9        |      |d|||ddd|       t;        j<                  |d      } t?        j@                  | |dd       ddd       yt7        | t9        |      |d|||ddd|       y# 1 sw Y   xY wc c}w c c}w # 1 sw Y   yxY w)z1Export GPT-2 model with past state to ONNX model.r;   F)ry   rz   r{   r|   r}   rH   r~   r   rr   r   r   r   r   r   Npast_r   r   r<   r^   r_   r*   ry   seq_len)r   r;   past_seq_len)r;      total_seq_lenrN   rO   zShapes: input_ids=z past=z output=z	 present=T)parentsexist_okz	gpt2.onnx   )
argsfexport_paramsinput_namesoutput_namesdynamic_axesopset_versiondo_constant_foldingr   r   )load_external_data)save_as_external_dataall_tensors_to_one_file)!r    rR   rx   r   r|   r}   r~   ri   rD   no_gradrB   r   rC   rf   rA   r   r   r*   rT   r   parentmkdirtempfileTemporaryDirectoryospathjoinr   r?   onnx
load_modelr   save)modelr   r   r   r   r   r   r   r   r   r    rH   dummy_inputsrh   outputsrJ   
past_namespresent_namesr   r   r   r   tmp_dir_nametemp_onnx_model_paths                           r"   export_onnxzGpt2Helper.export_onnx  s    #\\NN	!22!" & : :**((-1+1!5 3 
  "))+
]]_Z(G  ,1+;<+;aaSk+;
<16y1AB1AA8A31AB qz"f&7&7771:;K;KA;NRXRdRd;ddd$+AJ$4$4Q$76;L;L$LR^oano \i8O)<
 D%1n!EL !D%1o!FL " #m/;	+JL(~.1=/-RL)*/0:&7|q S_	%AAA !7!7!=!= >f\EVEVWXEYE_E_D``hipqrisiyiyhz  {D  EL  MN  EO  PQ  ER  EX  EX  DY  Z	
 	_$$**4$*G#,,.,')ww||L+'N$)*1177t7T!z**"& +!-!-"$(,-1# (<QUV#*.,0	' /.4 :&!"')) $().Q _ =BT /.s%   >L$L14L6%BL;$L.;Mr   c           	          t        d      }	t        | d||d|	d      }
|r5|rt        j                  |
       nd|vrd|d<    |
j                  dddi| |
j                  ||       |
S )	zHOptimize ONNX model with an option to convert it to use mixed precision.r   r   F)
model_type	num_headsr}   	opt_leveloptimization_optionsuse_gpukeep_io_typesuse_symbolic_shape_inferTrl   )r
   r   rx   auto_mixed_precisionconvert_float_to_float16save_model_to_file)r   optimized_model_pathr   r|   r}   r   r  stager   r  ms              r"   optimize_onnxzGpt2Helper.optimize_onnx  s      -V4)#!5
 #//2"&0.3F?+***SDSFS	13KLr$   )AddLayerNormalizationSkipLayerNormalizationFastGeluEmbedLayerNormalization
onnx_modelop_block_listc                 n   | j                         D ch c]  }|j                   }}t        |      }|j                  |      }t        j                  d| d|        | j                         j                  d   j                  }d}| j                         }||v sJ ||   }d}	|j                  dk(  r|}	t        j                  d|j                          d}
|j                  D ]  }| j                  |      }
|
 n t        |
      }t        j                  d|j                   d	|        |d
k  }n/t        j                  d|j                   d|j                          g }g }|s|	|g}|	j                  g}||||d}t        j                  d|         | j                  dddi| |S c c}w )a?  Convert GPT-2 model to mixed precision.
           It detects whether original model has fp16 weights, and set parameters for float16 conversion automatically.
        Args:
            onnx_model (OnnxModel): optimized ONNX model
            op_block_list (List[str], optional): operators to compute in fp32. Defaults to ["Add", "LayerNormalization",
                                                 "SkipLayerNormalization", "FastGelu", "EmbedLayerNormalization"]
        Returns:
            parameters(dict): a dictionary of parameters used in float16 conversion
        z	fp32 op: z
 fp16 op: r   FNMatMulz#Found last MatMul node for logits: z3max diff of converting weights in last MatMul node : r   z-Failed to find MatMul node for logits. Found z	 of node )r  r%  node_block_listforce_fp16_initializersz!auto_mixed_precision parameters: r  Trl   )nodesop_typeset
differencer   r   graphoutputr   output_name_to_nodeinputget_initializerr	   r   warningr  )r$  r%  nodeop_full_setfp32_op_setfp16_op_setlogits_output_nameis_weight_fp16_precisionr1  last_matmul_nodeinitializerr2  max_diffr  r)  
parameterss                   r"   r  zGpt2Helper.auto_mixed_precision  s   ( 1;0@0@0BC0Bt||0BC-(!,,[9i}J{mDE (--/66q9>> $) (<<>!%8888"#56<<8##KK=dii[IJK(88?* $ 1=HLLNtyykY[\d[efg'/$$NNJ4<<.Xabfbkbkalmn(/?/K/0M/445O +*.'?	

 	7
|DE+
++XTXZXa Ds   F2inputs
total_runsc                 `   t         j                  d       |j                         j                         }t	        j
                         5   | | }ddd       |dk(  rS g }t	        j
                         5  t        |      D ]A  }t        j                         } | | }|j                  t        j                         |z
         C 	 ddd       t        |      dz  t        |      z  }t         j                  dj                  t        |d                   |fS # 1 sw Y   xY w# 1 sw Y   axY w)zfRun inference of PyTorch model, and returns average latency in ms when total_runs > 0 besides outputs.zstart pytorch_inferenceNr     zPyTorch inference time = {} ms.2f)r   r   rv   ri   rD   r   rB   timerC   sumrA   format)	r  r?  r@  rh   r
  latencyr   startaverage_latencys	            r"   pytorch_inferencezGpt2Helper.pytorch_inferencec  s     	./ ^^%--/
]]_Z(G  ?N]]_:&		,tyy{U23 '  g,-G<5<<VOUZ=[\]''! _ _s   D3AD$D!$D-c                    t         j                  d       dt        j                  |j                  j                         j                               i}|j                  Tt        |j                        D ]<  \  }}t        j                  |j                         j                               |d| <   > |j                  >t        j                  |j                  j                         j                               |d<   |j                  >t        j                  |j                  j                         j                               |d<   | j                  d|      }|dk(  r|S g }t        |      D ]N  }t        j                         }	| j                  d|      }|j                  t        j                         |	z
         P t        |      dz  t        |      z  }
t         j                  d	j!                  t!        |
d
                   ||
fS )zcRun inference of ONNX model, and returns average latency in ms when total_runs > 0 besides outputs.zstart onnxruntime_inferencer*   Nr   rO   rN   r   rB  z"OnnxRuntime Inference time = {} msrC  )r   r   r   ascontiguousarrayr*   r   rT   	enumeraterO   rN   runrB   rD  rC   rE  rA   rF  )ort_sessionr?  r@  
ort_inputsrJ   past_ir   rG  r   rH  rI  s              r"   onnxruntime_inferencez Gpt2Helper.onnxruntime_inference}  s    	23!5#:#:6;K;K;O;O;Q;W;W;Y#Z[
;;"&v{{3	6*/*A*A&**,BTBTBV*W
U1#;' 4   ,+0+B+B6CXCXC\C\C^CdCdCf+gJ'(*).)@)@ATATAXAXAZA`A`Ab)cJ~&!oodJ7?z"AIIKE%//$
;KNN499;./ #
 g,-G<9@@Y^A_`aO++r$   c           	      8    t        j                  | ||||||      S )z)Returnas IO binding object for a session.)r   prepare_io_binding)rO  r*   rN   rO   rT   r   r   s          r"   rT  zGpt2Helper.prepare_io_binding  s,     11
 	
r$   c                 2    t        j                  | |||      S )z3Copy results to cpu. Returns a list of numpy array.)r   "get_outputs_from_io_binding_buffer)rO  r   r   return_numpys       r"   rV  z-Gpt2Helper.get_outputs_from_io_binding_buffer  s      AA
 	
r$   r   r   rW  include_copy_output_latencyc           	      h   t         j                  d       t        j                  | |j                  |j
                  |j                  |j                  ||      }| j                  |       t        j                  | |||      }|dk(  r|S g }	t        |      D ]g  }
t        j                         }| j                  |       |rt        j                  | |||      }
|	j                  t        j                         |z
         i t        |	      dz  t        |	      z  }t         j                  d|       ||fS )zUInference with IO binding. Returns outputs, and optional latency when total_runs > 0.z*start onnxruntime_inference_with_binded_ior   rB  z4OnnxRuntime with IO binding inference time = %.2f ms)r   r   rx   rT  r*   rN   rO   rT   run_with_iobindingrV  rB   rD  rC   rE  rA   )rO  r?  r   r   r@  rW  rX  
io_bindingr   rG  r   rH  rI  s                r"   $onnxruntime_inference_with_binded_ioz/Gpt2Helper.onnxruntime_inference_with_binded_io  s    	AB  22!!KK

 	&&z2 !CC
 ?z"AIIKE**:6*AA NN499;./ # g,-G<K_]O++r$   c                 T   t        d|  dd      5 }t        j                  ||       d d d        t        j	                  d|  d       t        d|  dd      5 }t        j                  ||       d d d        t        j	                  d|  d       y # 1 sw Y   kxY w# 1 sw Y   /xY w)Nort_outputs_.picklewbz$ORT output are saved to ort_outputs_torch_outputs_z(Torch output are saved to torch_outputs_openpickledumpr   r   )rJ   r   r   r   s       r"   save_outputszGpt2Helper.save_outputs  s    L7+T2aKKQ' 3:1#WEFN1#W-t4KKq) 5>qcIJ 32 54s   BBBB'c                     t        d|  dd      5 }t        j                  ||       d d d        t        j	                  d|  d       y # 1 sw Y   #xY w)Ndummy_inputs_r_  r`  z!inputs are saved to dummy_inputs_rb  )rJ   r	  r   r   r   s        r"   save_inputszGpt2Helper.save_inputs  sI    M!G,d3qKKa( 47s'BC 43s   A

Ar   i'  r;   c                    |j                   }t        j                  d| d| d| d| d|	 d| d       d}d	}d
}d}|r0t        j	                  |||||	      }t        j                  |||      }d}d}g }dg|z  }||z  }t        |      D ]E  }t        ||z        }t        j                  d|      }|dk(  rdnt        j                  d|      } t        j                  d|      }!t        j                  d|! d|  d       t        j                  |!| ||j                  |j                  |j                  |j                  |||
||||d      }"t        j!                  ||"      }#|rt        j#                  | |"      }$n1t        j	                  |!| |||	      }%t        j%                  | |"||%      }$t        j'                  |#|$|      \  }&}'}(})}*t)        j*                  |'      s|j-                  |'       |&r|dz  }|*r|dz  }||xx   dz  cc<   |rr|&spt        j                  d| d|! d|  d| d|' 
       t/        |)      D ]>  \  }}+t        j                  d| d| j1                         |   j2                   d|+        @ |st)        j*                  |'      s
|'d|z  kD  st        j5                  ||"       t        j7                  ||$|#       H |r*dD ,ci c]  },d|, t)        j8                  ||,      d  }-},ndD ,ci c]  },d|, d
 }-},|d z  |z  |-d!<   |D .cg c]
  }.|.d z  |z   c}.|-d"<   |d z  |z  |-d#<   |t;        |      z
  d z  |z  |-d$<   t        j                  d%| d&| d'|t;        |      z
   d(|        |d)|z  kD  r)t        j                  d*t        |dz  |z        d+d,       |-S c c},w c c},w c c}.w )-zKGenerate random inputs and compare the results of PyTorch and Onnx Runtime.zRunning parity test (atol=z, test_cases=z, runs=z, use_io_binding=z, model_class=z, is_float16=z) ...      r<   Nr   r;   z#Running parity test for batch_size=z past_sequence_length=z...T)r   r   r   r   )r   z
test_case=z batch_size=z sequence_length=z	 MaxDiff=	z: Name=z, d   )2   Z   _   c   max_diff_percentile_r   nanr   top1_match_ratetop1_match_rate_per_rundiff_pass_ratenan_ratezParity Test Cases=z	; Passed=z; Nan=z; Top1_Matched=gffffff?zParity is good: passed rate=z.0f%)r    r   r   rx   r   r   rB   r   r   r   r   r   r|   r}   rR   r~   rJ  rR  r\  r   r   r   rC   rM  get_outputsr   ri  rf  
percentilerA   )/rO  r  r   r   r   r   test_cases_per_runr@  use_io_bindingr   r   r   r   r   r   r  r   enable_pickle_outputr    max_batch_sizemax_past_seq_lenmax_seq_lenr   max_output_shapespassed_test_casestop1_matched_casesmax_abs_diff_listtop1_matched_cases_per_runtotal_test_casesrJ   run_idr{   rz   ry   r	  r
  r   r   r   r   r   r   r   messageru   rG   xs/                                                  r"   test_parityzGpt2Helper.test_parity  s   . #\\(m<N;OwWaVbbs  uC  tD  DR  S^  R_  _l  mw  lx  x}  ~	
  * < < 0+v{! (::;LfV`aN&'S:%5"-
:'(A//0F$nnQ<O).!1&..L\:] >:JLL5j\AWXlWmmpq &66$**""!! " /#5%9"& 7 L" !225,GG(>>{LY * < <(#! )MM~} --g{-N%;;|,!((6!Q&!"a'"*62a72| <
|;QRfQggx  zI  yJ  JS  T`  Sa  b #,H"5JAwKK"QCw{/F/F/H/K/P/P.QQST[S\ ]^ #6 $\)BlUX[_U_F_&&q,7'';@G )J eueu`a&qc*u/?/?@QST/UVY.Z[eu   BRRAQA,QC0%7AQFR$6$<?O$O !Sm,nSmaQW7I-ISm,n()#4s#:=M#M .5F1GG3NQaaz !1 2)<M;NfUefij{f|U|T}  ~M  N`  Ma  b	
 t&666KK6s;LS;PQa;a7bcf6gghij% S -os   #N9N>"Orn  rk      c                    |j                   }d}|r0t        j                  |||||      }t        j                  |||      }t        j	                  ||||j
                  |j                  |j                  |j                  |||||	|
|      }|rt        j                  | ||      \  }}|S t        j                  | |||      \  }}|S )zCGenerate random inputs and measure average latency of Onnx Runtime.N)r   r   r   )r    rx   r   r   r   r|   r}   rR   r~   rR  r\  )rO  r  r   r   r@  r}  r   r   r   r   r   r   ry   r{   rz   r    r   r   r	  r   rG  s                        r"   test_performancezGpt2Helper.test_performance  s    ( #\\&880/6;M (::=&R\]N!22 &&NN+1!5 3 
" #99+|U_`JAw 	 $HH\>=*JAw r$   c                     t         j                  ddd|j                  |j                  |j                  |j
                  |d||      j                         }t        j                  j                  | |      S )zJIT trace for TorchScript.r;   F)ry   rz   r{   r|   r}   rH   r~   r   rr   r   r   )
rx   r   r|   r}   rR   r~   ri   rD   jittrace)r  r    r   r   r   rh   s         r"   torchscriptzGpt2Helper.torchscript  ss      00!" & : :**nn((-1 1 
 ') 	 yyuj11r$   rawfp32fp16int8c           
         |}t         j                  j                  |      rt        |      j                  d   }n|j                  d      d    |dk7  r|d|z   z  }|r|dz  }|rdddd	d
}d
D ]  }t         j                  j                  | |||   z         }	t         j                  j                  |	      sI||v r/	 t        j                  |	       t        j                  d|	        |t        j                  d| d|	         t         j                  j                  t         j                  j                  | |      |dz         t         j                  j                  t         j                  j                  | |dz         |dz         t         j                  j                  t         j                  j                  | |dz         |dz         t         j                  j                  t         j                  j                  | |d	z         |dz         d
S t         j                  j                  | |dz         t         j                  j                  | |dz         t         j                  j                  | |dz         t         j                  j                  | |dz         d
S # t        $ r0}
t        j                  d|	 d|
j                          Y d}
~
dd}
~
ww xY w)z=Build a  path name for given model based on given attributes.r   /r   r   _past _fp32_fp16_int8r  zRemoved the existed directory: zFailed to remove the directory r(  NzDirectory for z
 existed: z.onnxz
_fp32.onnxz
_fp16.onnxz
_int8.onnx)r  r  isdirr   partssplitr  existsshutilrmtreer   r   OSErrorstrerror)
output_dirmodel_name_or_pathr   has_past
new_folderremove_existing
model_namesuffixr  new_dires              r"   get_onnx_pathszGpt2Helper.get_onnx_paths  s`    (
77==+,0177;JS!"%++#++J'!J'7SF=
'',,z:z@R3RS77>>'*!_4c"MM'2"KK*I'(ST nZL
7)$TU > ww||BGGLLZ$H*W^J^_GGLLZ'-AB- GGLLZ'-AB- GGLLZ'-AB- " 77<<
J,@AGGLLZ,-FGGGLLZ,-FGGGLLZ,-FG	
 	
-  ' c"KK*I'RTUVU_U_T`(abbcs   =-J	K(%KKN)r   )F)MbP?r  )r   )FFr   )r   )T)r   TF)TT)'r,   r-   r.   r/   rU   rD   int32r   r   boolrq   rb   r   r   r   r   r   r   r   r   r   r   r   r  r  r   r  rJ  rR  rT  rV  Tensorr\  rf  ri  r  r  r  r  rl   r$   r"   rx   rx      s   O !%#'',{{*/++,1KK"&>I>I!>I >I !	>I
 >I >I >I >I >I >I !>I >I "KK>I $kk>I  >I  
!>I >I@  -  !    	 
   
c49n	   D 	 	   $ $  4 3
 3
j 
 ).!%#'',{{*/++,1KKu u 	u
 #'u u !u u "KKu $kku un  "'"! !F $
CCCyC CJ ( ( ( (2 ,: ,3 , ,> 
 
( 
 
  !,10,0, S%,,./0, CcN+	0,
 0, 0, &*0, 0,d K K D D
 
  % ;;"[["%E EN 
 % ;;"[[4 4l 2 2"  -7:
 :
 :
r$   rx   )4loggingr  rd  r   r  r   rD  pathlibr   typingr   r   r   r   r   r  rD   benchmark_helperr   rr   r	   fusion_optionsr
   io_binding_helperr   r$  r   	optimizerr   torch_onnx_export_helperr   transformersr   r   r   r   	getLoggerr,   r   PRETRAINED_GPT2_MODELSFLOAT32FLOAT16INT8DEFAULT_TOLERANCEr   r3   r8   rW   r[   r   rb   rx   rl   r$   r"   <module>r     s    	       + +    & - ( -   $ 6 L L			8	$W  vsNNC N9 N8[ 8!E) !EHE E$E/ E" *8T:"=x!O|T2S S>_
 _
r$   