
    g                        d Z ddlZddlZddlZddlZddlmZ ddlZddlZddlm	Z	m
Z
mZmZmZmZmZmZmZmZmZ ddlmZ ddlmZmZ ddlmZmZmZmZ ddlmZ dd	l m!Z!  ejD                  d
      Z# ejH                  d      Z$dejJ                  vr e&e$      ejJ                  d<   ddl'Z'ddl(m)Z)m*Z*m+Z+ d Z,d Z-de.de.fdZ/d Z0d Z1d Z2e3dk(  r e2        yy)a]   Benchmarking the inference of pretrained transformer models.
    PyTorch/TorchScript benchmark is based on https://github.com/huggingface/transformers/blob/master/examples/benchmarks.py.
    One difference is that random input_ids is generated in this benchmark.

    For onnxruntime, this script will convert a pretrained model to ONNX, and optimize it when -o parameter is used.

    Example commands:
        Export all models to ONNX, optimize and validate them:
            python benchmark.py -b 0 -o -v -i 1 2 3
        Run OnnxRuntime on GPU for all models:
            python benchmark.py -g
        Run OnnxRuntime on GPU for all models with fp32 optimization:
            python benchmark.py -g -o
        Run OnnxRuntime on GPU with fp16 optimization:
            python benchmark.py -g -o -p "fp16"
        Run TorchScript on GPU for all models:
            python benchmark.py -e torchscript -g
        Run TorchScript on GPU for all models with fp16:
            python benchmark.py -e torchscript -g -p "fp16"
        Run ONNXRuntime and TorchScript on CPU for all models with quantization:
            python benchmark.py -e torchscript onnxruntime -p "int8" -o
        Run OnnxRuntime with the ROCM provider and graph optimization script:
            python benchmark.py -g -m bert-base-cased --provider rocm --optimizer_info by_script --disable_embed_layer_norm
        Run OnnxRuntime with bfloat16 fastmath mode kernels on aarch64 platforms with bfloat16 support:
            python benchmark.py --enable_arm64_bfloat16_fastmath_mlas_gemm

    It is recommended to use run_benchmark.sh to launch benchmark.
    N)datetime)ConfigModifierOptimizerInfo	Precisioncreate_onnxruntime_sessionget_latency_resultinference_ortinference_ort_with_io_bindingoutput_detailsoutput_fusion_statisticsoutput_summarysetup_logger)FusionOptions)MODEL_CLASSESMODELS)create_onnxruntime_inputexport_onnx_model_from_ptexport_onnx_model_from_tfload_pretrained_model)version)QuantizeHelper F)logicalOMP_NUM_THREADS)
AutoConfigAutoTokenizerLxmertConfigc                    dd l }g }| rMd|j                         vr;d|j                         vr)d|j                         vrt        j                  d       |S d}|dk(  r;t        j
                  }d}d|j                         vrt        j                  d	       |S |t        j
                  k(  rt        j                  d
| d       |D ]  }t        |   d   }|
D ]u  }|t        |      kD  r &|d | }t        |   d   |_	        t        j                  |      }d|v r[t        j                         5  t        |t        |   d   t        |   d   t        |   d   |||||| |||||||      \  }} }!}"d d d        d|v r>t        |t        |   d   t        |   d   t        |   d   |||||| |||||||      \  }} }!}" st!        | |d|||      }#|#|#j#                         D $cg c]  }$|$j$                   }%}$g }&| rdnd}'t'        j(                  ||      }(t+        j,                  t/        |      t/        |      t/        !|(j0                        g      })t+        j,                  t/        |      |(j0                  g      }*|D ]  }+|+dk  r
|D ]  },"|,|"kD  rd|v rt*        j2                  nt*        j4                  }-t7        |!|+|,||(|-      }.d|j8                  ||'||| ||||+|,|j;                         t=        t?        j@                               d}/|(j                  dv r4t        jC                  d| d|+d|(jD                  |(jD                  g        nt        jC                  d| d|+|,g        |rtG        |#|.|/|	|+|      }0n|#jI                  |%|.      }1|)g}2tK        t        |1            D ]9  }3|3dk(  r!t        |   d   dk(  r|2jM                  |*       )|2jM                  |)       ; d|v rt*        jN                  nt*        jP                  }4tS        |#|.|/|	|%|1|&|2|+|'|4|      }0t        jC                  |0       |jM                  |0         x  |S # 1 sw Y   xY wc c}$w )Nr   CUDAExecutionProviderROCMExecutionProviderDmlExecutionProviderzPlease install onnxruntime-gpu or onnxruntime-directml package instead of onnxruntime, and use a machine with GPU for testing gpu performance.tensorrt   TensorrtExecutionProviderzhPlease install onnxruntime-gpu-tensorrt package, and use a machine with GPU for testing gpu performance.zOptimizerInfo is set to zA, graph optimizations specified in FusionOptions are not applied.   pt      tfT)enable_all_optimizationnum_threadsverbose(enable_mlas_gemm_fastmath_arm64_bfloat16cudacpu	cache_dironnxruntimeenginer   	providersdevice	optimizer	precision
io_binding
model_nameinputsthreads
batch_sizesequence_lengthcustom_layer_numr   vitswinzRun onnxruntime on  with input shape gpt)*r2   get_available_providersloggererrorr   NOOPTwarningr   len
model_typer   parsetorchno_gradr   r   r   get_outputsnamer   from_pretrainednumpyprodmaxhidden_sizeint64int32r   __version__get_layer_numstrr   nowinfo
image_sizer	   runrangeappendlonglongintcr
   )5use_gpuprovidermodel_namesmodel_classconfig_modifierr8   r+   batch_sizessequence_lengthsrepeat_timesinput_countsoptimizer_infovalidate_onnxr1   onnx_dirr,   	overwritedisable_ort_io_bindinguse_raw_attention_maskmodel_fusion_statisticsmodel_source(enable_arm64_bfloat16_fastmath_mlas_gemmargsr2   resultswarm_up_repeatr:   all_input_names
num_inputsinput_namesfusion_optionsonnx_model_fileis_valid_onnx_model
vocab_sizemax_sequence_lengthort_sessionnode_argort_output_namesoutput_buffersr6   configmax_last_state_sizemax_pooler_sizer=   r>   input_value_type
ort_inputsresult_templateresultort_outputsoutput_buffer_max_sizesi	data_types5                                                        W/var/www/openai/venv/lib/python3.12/site-packages/onnxruntime/transformers/benchmark.pyrun_onnxruntimer   Y   s    2 G$K,O,O,QQ$K,O,O,QQ#;+N+N+PP ]	
 N:&,,&k.Q.Q.SSLLz N,,,&~&66wx	
 "
 ,Q/&JC00)+:6K$Z03DO*006N|#]]_ 2"z*1-z*1-z*1-#'! #!&%.!/&#'+"+ %2 |# .:&q):&q):&q)#"!*+"##''* '4(,'9aK ">I>U>U>WX>W(>WXN&VEF//
iPF"'**$()
F$6$67# $jj#k*:F<N<N)OPO)
?'7O*6?M`;` 6:l6Ju{{PUP[P[$!9""'#("J #0#.#:#:%-"(%3%.*@&@&0",#.&0+:,;,I,I,K$'$7'O" ((O;1*=OQ[]^`f`q`qsy  tE  tE  QF  PG  H &9*EWYcetXuWv$wx-!.'&+(&*" '2oo6F
&S3F2G/!&s;'7!8A Av&*<Q*?5*H 7 > > O 7 > >?R S "9 7;l6JENNPUPZPZ	!>'&+(,'*3&"%*" KK'NN6*S (8 *o ' "N Ny %_D  Ys   /?Q"Q#Q c                    g }| r5t         j                  j                         st        j	                  d       |S t        j
                  d       |D ]B  }t        j                  ||	|      }|j                  |       t        ||||      }|j                  dv r|d   g}n3t        j                  ||      }|j                  j                  |d      }t        j                  d	|        t        j                  d
|j                                 |t         j"                  k(  r|j%                          t        j&                  | rdnd      }|j)                  |       |t         j*                  k(  rt-        j.                  |      }|D ]  }|dk  r
|D ]  }|j                  dv rt        j1                  d| d|d|j2                  |j2                  g        t        j4                  |d|j2                  |j2                  f|t         j"                  k(  rt         j6                  nt         j8                  |      n\||kD  rt        j1                  d| d||g        t        j:                  d|j<                  dz
  ||ft         j>                  |      	 |	r t         j@                  jC                  |      n|
rt        jD                  |      n|        tG        jH                  fd|d      }|	rdn|
rdndt         jJ                  d| rdndd|d|d||||jM                         tO        tQ        jR                               d}|jU                  tW        ||             t        j1                  |       |jY                  |        	 E |S # tZ        $ r>}t        j]                  |       t         j                  j_                          Y d }~Ed }~ww xY w)NzYPlease install PyTorch with Cuda, and use a machine with GPU for testing gpu performance.F)torchscriptr1   )r   r1   custom_model_classr@   r   r0      zModel zNumber of parameters zcuda:0r/   zRun PyTorch on rC   r%   )sizedtyper6   r'   )lowhighr   r   r6   c                              S N )	inference	input_idss   r   <lambda>zrun_pytorch.<locals>.<lambda>  s
    Yy5I    repeatnumberr   torch2rM   NAr.   r   r3   )0rM   r.   is_availablerF   rG   set_grad_enabledr   rQ   modifyr   rK   r   max_model_input_sizesgetdebugnum_parametersr   FLOAT16halfr6   toINT8r   quantize_torch_modelr\   r]   randnfloat16float32randintr~   longjittracecompiletimeitr   rX   rY   rZ   r   r[   updater   r`   RuntimeError	exceptionempty_cache)rc   re   rf   rg   r8   r+   rh   ri   rj   r   r   r1   r,   rv   r:   r   model	tokenizermax_input_sizer6   r=   r>   runtimesr   er   r   s                            @@r   run_pytorchr   9  sl    Guzz..0pq	5!!
++JK[dev&%*	
 / 0 34%55jIVI&<<@@TRNveW%&,U-A-A-C,DEF	)))JJL'hu=	&"77>E%JQ#3$$7KK)*5GUVX^XiXikqk|k|H}G~ !&(!V->->@Q@QR/8I<M<M/MemmSXS`S`%!I &1o6V KK/*=OQ[]lPmOn op %#..2(/:#jj%!I-=H		y9flemm\aNbrw  i(%}}-IR^ghiH 4?-PVH\c#(#4#4%),3&%'%.&(&0"##.&0+:,;,I,I,K$'$7F  MM"4Xz"JKKK'NN6*a $4	 &= "n N	 $ -$$Q'JJ**,,-s   C4N	O"	$3O	O"	do_eager_modeuse_xlac                 2     ddl m dd l fd}|S )Nr   )wrapsc                               fd       }        j                         fd              }du rdu sJ d       |S |S )Nc                       | i |S r   r   ru   kwargsfuncs     r   run_in_eager_modezFrun_with_tf_optimizations.<locals>.run_func.<locals>.run_in_eager_mode  s    (((r   )experimental_compilec                       | i |S r   r   r   s     r   run_in_graph_modezFrun_with_tf_optimizations.<locals>.run_func.<locals>.run_in_graph_mode  s     (((r   TFzcCannot run model in XLA, if `args.eager_mode` is set to `True`. Please set `args.eager_mode=False`.)function)r   r   r   r   r)   r   r   s   `  r   run_funcz+run_with_tf_optimizations.<locals>.run_func  st    	t	) 
	) 
t	'	2	) 
3 
	) D 5 utu $$$$r   )	functoolsr   
tensorflow)r   r   r   r)   r   s   `` @@r   run_with_tf_optimizationsr     s    %$ Or   c                    !" g }dd l ""j                  j                  j                  |       | s"j                  j	                  g d       | r1"j
                  j                         st        j                  d       |S | r"j                  j                  d      }	 "j                  j	                  |d   d       "j                  j                  j                  |d   d       "j                  j                  d       |t         j"                  k(  s|t         j$                  k(  rt'        d      |D ]*  }t)        j*                  ||	      |j-                         t/        ||	|d	      !t1        j*                  ||	      }|j2                  j5                  |d
      }|D ]  }|dk  r
|D ]  }|||kD  rt        j7                  d| d||g        dd l}|j;                         }t=        ||z        D cg c]!  }|j?                  dj@                  dz
        # }}"jC                  |||f"jD                         	 tG        dd       !fd       }tG        dd       !fd       }tG        dd       !"fd       }|jH                  r|ntK        tL              r|         tO        jP                  fd|d      }d"jR                  d| rdndd|d|d||||jU                         tW        tY        jZ                               d}|j]                  t_        ||             t        j7                  |       |ja                  |         - |S # t        $ r }t        j                  |       Y d }~d }~ww xY wc c}w # t        $ rF}t        j                  |       ddl1m2} |jg                         }|ji                          Y d }~.d }~ww xY w)Nr   GPUzVPlease install Tensorflow-gpu, and use a machine with GPU for testing gpu performance.Tz/gpu:0)r6   z+Mixed precision is currently not supported.r0   )r   r1   r   is_tf_modelr   zRun Tensorflow on rC   r'   )shaper   F)r   r   c                        d      S )NF)trainingr   r   r   s   r   encoder_forwardz'run_tensorflow.<locals>.encoder_forward  s    $Y??r   c                         d      S )NF)decoder_input_idsr   r   r   s   r   encoder_decoder_forwardz/run_tensorflow.<locals>.encoder_decoder_forward  s    $Y)V[\\r   c                      j                   j                  ddj                  g      } j                   j                  ddj                  g      } | |d      S )Nr'   F)visual_feats
visual_posr   )randomnormalvisual_feat_dimvisual_pos_dim)featsposr   r   r   r)   s     r   lxmert_forwardz&run_tensorflow.<locals>.lxmert_forward  s^     "		 0 0!Q8N8N1O P ii..1f6K6K/LM$%).'*%*	  r   c                               S r   r   )r   s   r   r   z run_tensorflow.<locals>.<lambda>&  s    Y[r   r   r   r   r.   r/   r   r3   )r.   )5r   r   	threading set_intra_op_parallelism_threadsset_visible_devicestestis_built_with_cudarF   rG   list_physical_devicesexperimentalset_memory_growth
distributeOneDeviceStrategyr   r   r   r   r   NotImplementedErrorr   rQ   r   r   r   r   r   r\   r   Randomr_   r   r~   constantrW   r   is_encoder_decoder
isinstancer   r   r   rX   rY   rZ   r   r[   r   r   r`   numbar.   get_current_devicereset)#rc   re   rf   rg   r8   r+   rh   ri   rj   r1   r,   rv   physical_devicesr   r:   r   r   r=   r>   r   rngr   valuesr   r   r   r   r   r.   r6   r   r   r   r   r)   s#                                 @@@@@r   run_tensorflowr    s    GII88E
		%%b%0rww113mn99::5A	 II))*:1*=uEII""445Ea5H$OMM++8+< I%%%inn)D!"OPP!
++J)Lv&%*
 "11*	R	"88<<ZN%JQ#3!-/N2R0<NPZ\kOlNmnommoINz\kOkIlmIlA#++a):):Q)>?IlmKKz?6S[][c[cKd	7#.UER@ S@ /UER] S] /UER S !0I00$;	#FL9$2	K%}}-@^_`H #/#%>>%),3&%'%.&(&0"##.&0+:,;,I,I,K$'$7F  MM"4Xz"JKKK'NN6*{ $4	 &! "r N  	 Q	 F nj $ #$$Q'*!446FLLNN#s8    A$M' &N

DN'	N0NN	O'	!;O"	"O'	c                  8   t        j                         } | j                  ddddt        g dt	        t        j                               ddj                  t        j                               z          | j                  d	dd
t        dddgd       | j                  ddt        d t	        t              ddj                  t              z          | j                  ddddt        dgg dd       | j                  dddt        t        j                  j                  dd      d       | j                  ddt        t        j                  j                  dd      d       | j                  dd dd!d"#       | j                  d$dt        d d%       | j                  d&d't        t        j                  t	        t              d()       | j                  d*dd!d+#       | j                  d,dd!d-#       | j                  d.d/t        t        j                  t	        t              d0)       | j                  d1d2dd!d3#       | j                  d4d5dd d67       | j                  d8d9dd d:7       | j                  d;d<dd d=7       | j                  d>d?ddd
gt        g d@dAB       | j                  dCdDddEt        dFG       | j                  dHdIdt        d
gJ       | j                  dKdLdt        g dMJ       | j                  dNdd!dO#       | j!                  dP       | j                  dQdRddt        dSgdTU       | j                  dVdt        d dW       | j                  dXdd!dY#       | j!                  dZ       t#        j$                  |        | j'                         }|S )[Nz-mz--modelsF+)zbert-base-casedzroberta-basegpt2z Pre-trained models in the list: z, )requirednargstypedefaultchoiceshelpz--model_sourcer'   r&   r)   zExport onnx from pt or tfz--model_classz!Model type selected in the list: )r  r  r  r	  r
  z-ez	--enginesr2   )r2   rM   r   r   r   zEngines to benchmarkz-cz--cache_dir.cache_modelsz%Directory to cache pre-trained models)r  r  r  r
  z
--onnx_dironnx_modelszDirectory to store onnx modelsz-gz	--use_gpu
store_truezRun on gpu device)r  actionr
  z
--providerzExecution provider to usez-pz--precisionzfPrecision of model to run. fp32 for full precision, fp16 for half precision, and int8 for quantization)r  r  r	  r
  z	--verbosezPrint more informationz--overwritezOverwrite existing modelsz-oz--optimizer_infozjOptimizer info: Use optimizer.py to optimize onnx model as default. Can also choose from by_ort and no_optz-vz--validate_onnxzValidate ONNX modelz-fz--fusion_csvz:CSV file for saving summary results of graph optimization.)r  r  r
  z-dz--detail_csvz#CSV file for saving detail results.z-rz--result_csvz$CSV file for saving summary results.z-iz--input_counts)r'   r(   r%   zXNumber of ONNX model inputs. Please use 1 for fair comparison with Torch or TorchScript.)r  r  r  r  r	  r
  z-tz--test_timesd   z8Number of repeat times to get average inference latency.)r  r  r  r
  z-bz--batch_sizes)r  r  r  z-sz--sequence_lengths)             @         z--disable_ort_io_bindingz=Disable running ONNX Runtime with binded inputs and outputs. )rp   z-nz--num_threadsr   zThreads to use)r  r  r  r  r
  z--force_num_layersz%Manually set the model's layer numberz*--enable_arm64_bfloat16_fastmath_mlas_gemmzHEnable bfloat16 mlas gemm kernels on aarch64. Supported only for CPU EP )rt   )argparseArgumentParseradd_argumentrZ   listr   keysjoinr   ospathr   FLOAT32r   BYSCRIPTintset_defaultsr   add_arguments
parse_args)parserru   s     r   parse_argumentsr'  E  s0   $$&F
;V[[]#/$))FKKM2JJ  	 t(   ]#0499]3KK   O#  	 S.14   S-0-   kE,Uhi
(   !!Yu   eLOgh
(	   &&]#y   "   I   2   3   g  	 G   oSsQCP
,   "L	   u5
   4   4W	   G'DKr   c                  ,   t               } t        | j                         | j                  t        j
                  k(  r"| j                  st        j                  d       y | j                  t        j                  k(  r0| j                  r$| j                  dvrt        j                  d       y t        | j                        dk(  r#t        | j                  d      d   dv rdg| _        t        | j                   D ch c]  }|dk  rt"        n| c}      | _        t        j%                  d	|         t&        j(                  j+                  | j,                        s 	 t'        j.                  | j,                         d| j2                  v }d| j2                  v }d| j2                  v }d| j2                  v }d| j2                  v }|r`t5        j6                  t8        j:                        t5        j6                  d      k  r't        j                  dt8        j:                          y t=        | j>                        }g }| j                   D ]7  }	t9        j@                  |	       t        jC                  t8        jD                  jG                                |s|s|r| jH                  dgk7  rt        jK                  d       |rt|tM        | j                  | j                  | jN                  || j                  |	| jP                  | j                  | jR                  dd| j,                  | j                        z  }|rt|tM        | j                  | j                  | jN                  || j                  |	| jP                  | j                  | jR                  dd| j,                  | j                        z  }|rt|tM        | j                  | j                  | jN                  || j                  |	| jP                  | j                  | jR                  dd| j,                  | j                        z  }|rr|tU        | j                  | j                  | jN                  || j                  |	| jP                  | j                  | jR                  | j,                  | j                        z  }i }
|sR	 | jV                   }|tY        | j                  | j                  | j                  | jN                  || j                  |	| jP                  | j                  | jR                  | jH                  | jZ                  | j\                  | j,                  | j^                  | j                  | j`                  | jb                  ||
| jd                  | jf                  |       z  }: tm        jn                         jq                  d      }
r | jr                  xs d| d}tu        |
|       t        |      dk(  r&| jP                  dgk7  rt        jK                  d       y | jv                  xs d| d}ty        ||       | jz                  xs d| d}t}        |||        y c c}w # t0        $ r$ t        j                  d
| j,                         Y w xY w# th        $ r t        jk                  d       Y Lw xY w)Nzfp16 is for GPU only)migraphxrocmzint8 is for CPU onlyr'   r   r%   )rA   swimr   zArguments: z#Creation of the directory %s failedrM   r   r   r2   r   z2.0.0z2PyTorch version must be >=2.0.0 and you are using zB--input_counts is not implemented for torch or torchscript engine.TF	Exceptionz%Y%m%d-%H%M%Sbenchmark_fusion_z.csvzNo any result available.benchmark_detail_benchmark_summary_)?r'  r   r,   r8   r   r   rc   rF   rG   r   rd   rJ   modelsr   ri   sortedr+   	cpu_countr\   r  r  existsr1   mkdirOSErrorenginesr   rL   rM   rX   r   force_num_layersset_num_threadsr   
__config__parallel_infork   rI   r   rf   rh   
test_timesr  use_mask_indexr   rl   rm   rn   ro   rp   rs   rt   r,  r   r   r[   strftime
fusion_csvr   
detail_csvr   
result_csvr   )ru   xenable_torchenable_torch2enable_torchscriptenable_onnxruntimeenable_tensorflowrg   rv   r+   rr   rq   
time_stampcsv_filenames                 r   mainrI    so   D~~***4<<+,~~'DLLT]]Rf=f+,
4;;1A!7!:o!M!#4CSCSTCSaAFy9CSTUD
KK+dV$%77>>$..)	PHHT^^$ dll*L,M&$,,6&$,,6$4u'8'89GMM'<RRI%J[J[I\]^$T%:%:;OG''k*U%%3356=,>  QC'cd!;LLKK$$#NN$$))OONNLL   ;LLKK$$#NN$$))OONNLL   ;LLKK$$#NN$$))OONNLL   ~    %% G #%.-1-@-@)@&?LLMMKK$$#NN$$))OO%%''&&NNMMLLNN//*+%%AA/ [ (R ((9JN,=j\*N !8,G
7|qs"NN56??J(9*T&JL7L)??K(::,d&KL7L$/_ U  	PLL>O	Pn  .  -.s+   :X<Y C%Y1)Y.-Y.1ZZ__main__)4__doc__r  loggingr  r   r   rR   psutilbenchmark_helperr   r   r   r   r   r	   r
   r   r   r   r   r{   r   huggingface_modelsr   r   onnx_exporterr   r   r   r   	packagingr   quantize_helperr   	getLoggerrF   r2  environrZ   rM   transformersr   r   r   r   r   boolr   r  r'  rI  __name__r   r   r   <module>rX     s    :   	        ) 4   *			2	FU+	 BJJ&$'	NBJJ !  @ @]@m`T D 4DEP_0D zF r   