
    gGX                     .   d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlmZm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ d dlmZmZmZmZ d dlZd dlZd dlZd dlZd dlmZ d dlZ ej8                  e      Z G d	 d
e      Z G d de      Z  G d d      Z!dejD                  iZ#ddddddi fdZ$d(dZ%d)dZ&d Z'd Z(d Z)d Z*d*dZ+ejX                  d fdZ-d Z.d+dZ/deeee0ef         fdZ1 G d  d!e      Z2 G d" d#e2      Z3 G d$ d%e2      Z4d,d&Z5d' Z6y)-    N)ABCabstractmethod)ThreadPoolExecutor)datetime)Enum)sleep)AnyDictListOptional)versionc                   "    e Zd ZdZdZdZdZd Zy)	Precisionfp32fp16int8int4c                     | j                   S Nvalueselfs    ^/var/www/openai/venv/lib/python3.12/site-packages/onnxruntime/transformers/benchmark_helper.py__str__zPrecision.__str__&       zz    N)__name__
__module____qualname__FLOAT32FLOAT16INT8INT4r    r   r   r   r       s    GGDDr   r   c                       e Zd ZdZdZdZd Zy)OptimizerInfono_optby_ort	by_scriptc                     | j                   S r   r   r   s    r   r   zOptimizerInfo.__str__1   r   r   N)r   r   r    NOOPTBYORTBYSCRIPTr   r%   r   r   r'   r'   *   s     EEHr   r'   c                       e Zd Zd Zd Zd Zy)ConfigModifierc                     || _         y r   
num_layers)r   r3   s     r   __init__zConfigModifier.__init__6   s	    $r   c                    | j                   y t        |d      r3| j                   |_        t        j	                  d| j                           t        |d      r3| j                   |_        t        j	                  d| j                           t        |d      r4| j                   |_        t        j	                  d| j                           y y )Nnum_hidden_layersz6Modifying pytorch model's number of hidden layers to: encoder_layersz7Modifying pytorch model's number of encoder layers to: zdecoder_layers z7Modifying pytorch model's number of decoder layers to: )r3   hasattrr6   loggerinfor7   decoder_layers)r   configs     r   modifyzConfigModifier.modify9   s    ??"6./'+F$KKPQUQ`Q`Pabc6+,$(OOF!KKQRVRaRaQbcd6,-$(OOF!KKQRVRaRaQbcd .r   c                     | j                   S r   r2   r   s    r   get_layer_numzConfigModifier.get_layer_numF   s    r   N)r   r   r    r4   r=   r?   r%   r   r   r0   r0   5   s    %er   r0   float32TFc	                    d }		 t        j                         }
|r t         j                  j                  |
_        nt         j                  j
                  |
_        |rd|
_        |dkD  r)||
_        t        j                  d|
j                          |rd|
_
        nd|
_
        t        j                  d|         |r7|dk(  rddg}n0|d	k(  rd
dg}n&|dk(  rg d}n|dk(  rddg}n|dk(  rg d}nddg}ndg}|r|D cg c]  }||v r|||   fn| }}|r|
j                  dd       t        j                  | |
|      }	|	S c c}w # t        $ r t        j                  dd       Y |	S w xY w)NTr   z%Session option: intra_op_num_threads=   zCreate session for onnx model: dmlDmlExecutionProviderCPUExecutionProviderrocmROCMExecutionProvidermigraphx)MIGraphXExecutionProviderrH   rF   cudaCUDAExecutionProvidertensorrt)TensorrtExecutionProviderrL   rF   z(mlas.enable_gemm_fastmath_arm64_bfloat161)	providers	Exception)exc_info)onnxruntimeSessionOptionsGraphOptimizationLevelORT_ENABLE_ALLgraph_optimization_levelORT_ENABLE_BASICenable_profilingintra_op_num_threadsr9   debuglog_severity_leveladd_session_config_entryInferenceSessionrQ   error)onnx_model_pathuse_gpuproviderenable_all_optimizationnum_threadsrY   verbose(enable_mlas_gemm_fastmath_arm64_bfloat16provider_optionssessionsess_optionsrP   names                r   create_onnxruntime_sessionrk   P   s    G51"113"4?4V4V4e4eL14?4V4V4g4gL1,0L)?0;L-LL@AbAb@cde./L+./L+66GHI5 35KL	V#46LM	Z'	
 V#46LM	Z'	 56LM	/0Ijstjsbf4CS;S$ 0 67Y]]jsIt3112\^ab..Xab N u  1[40N1s$   C7E ;D?.E ?E  E('E(c                     | rt        j                  dd       y t        j                  d       t        j                  d      j	                  t        j
                         y )NDEBUGz8[%(filename)s:%(lineno)s - %(funcName)20s()] %(message)s)levelfmtz%(message)s)ro   transformers)coloredlogsinstalllogging	getLoggersetLevelWARNING)re   s    r   setup_loggerrw      sF    J	

 	..)227??Cr   c                    | r4t         j                  j                  |       st        j                  |        |r4t         j                  j                  |      st        j                  |       |rW|dk(  rdt	        j
                         v s<J d       t        t	        j
                               j                  g d      rJ d       t        j                  dt        j                          t        j                  dt        j                          t        j                  dt        j                          t        j                  t        j                        t        j                  d	      k\  sJ t        j                  t        j                        t        j                  d
      k\  sJ t        j                  t        j                        t        j                  d	      k\  sJ y )NrD   rE   zBPlease install onnxruntime-directml package to test GPU inference.)rL   rH   rJ   zWPlease install onnxruntime-gpu package, or install ROCm support, to test GPU inference.zPyTorch Version:zTransformers Version:zOnnxRuntime Version:z1.10.0z4.12.0)ospathexistsmakedirsrS   get_available_providersset
isdisjointr9   r:   torch__version__rp   r   parse)	cache_dir
output_dirra   rb   s       r   prepare_environmentr      sh   	2
I"''..4
Ju&+*M*M*OOTSTO ;>>@ALL_ ihi  KK"5#4#4"567
KK'(@(@'ABC
KK&{'>'>&?@A ==**+w}}X/FFFF==112gmmH6MMMM==001W]]85LLLLr   c                 p   t        |       t        t        |             z  dz  }t        j                  | t        j
                        dz  }|d|z  z  }t        |       |dt        j                  | d      dz  dt        j                  | d      dz  dt        j                  | d      dz  d|d|ddS )Ng     @@)dtypez.2fZ   _   c   )
test_timeslatency_variancelatency_90_percentilelatency_95_percentilelatency_99_percentileaverage_latency_msQPS)sumfloatlennumpyvarfloat64
percentile)latency_list
batch_size
latency_msr   
throughputs        r   get_latency_resultr      s    \"U3|+<%==FJyyU]]CfLv
23J ,'/4$)$4$4\2$F$OPS#T$)$4$4\2$F$OPS#T$)$4$4\2$F$OPS#T!+C 0S! r   c                    t        |ddd      5 }g d}t        j                  ||      }|j                          | D ]  }|j	                  |        	 d d d        t
        j                  d|        y # 1 sw Y   "xY w)Na asciimodenewlineencoding)enginer   rP   device	precision	optimizer
io_binding
model_nameinputsthreadsr   sequence_lengthcustom_layer_numr   r   r   r   r   r   r   r   
fieldnamesz&Detail results are saved to csv file: )opencsv
DictWriterwriteheaderwriterowr9   r:   )resultscsv_filenamecsv_filecolumn_names
csv_writerresults         r   output_detailsr      su    	lb7	Cx
0 ^^HF
 F' 7 
D< KK8GH= 
D	Cs   AA66A?c                    t        |ddd      5 }g d}g }|j                  D ]O  }|j                  dgk(  r|j                  d|        (|j                  D ]  }|j                  d| d|         Q t	        j
                  |||z         }|j                          |j                  D ]  }	d	D ]  }
|j                  D ]  }d
D ]  }|j                  D ]  }i }| D ]  }|d   |	k(  s|d   |
k(  s|d   |k(  s|d   |k(  s'|d   |k(  s0|j                         D ci c]  \  }}||v s|| }}}|s2|j                  |       |j                  |D ci c]  }|d c}       n|D ]  }||   ||   k(  rJ  |d   }|d   }|r|d   |d| d| <   |d   |d| <    |s|j                  |            	 d d d        t        j                  d|        y c c}}w c c}w # 1 sw Y   -xY w)Nr   r   r   r   )r   r   r   r   r   rP   r   r   r   r   r   b_sr   )         )TFr   r   r   r   r   r   r   r   r   z'Summary results are saved to csv file: )r   batch_sizessequence_lengthsappendr   r   r   modelsenginesrd   itemsupdater   r9   r:   )r   r   argsr   header_names
data_namesr   r   r   r   input_countengine_namer   r   rowr   kvheadersr   ss                        r   output_summaryr      s=   	lb7	Cx
 
**J$$,!!Aj\"23'+'<'<O%%*R7H&IJ (=	 + ^^H
9RS
 ++J(#'<<K&7
'+'7'7G"$C*1$*<$8J$F(.x(8K(G(.x(8K(G(.|(<
(J(.y(9W(D@F.d1RSWcRcq!tG.d+.(+

7(;(+

:3N:aArE:3N(O1=A36q6WQZ3G,G3G 2>(.|(<A(./@(AA'(<BCW<Xas"QCL(97=>R7SasG) +2*  # * 3 3C 81 (8 '8 $0  ) &1 
Dl KK9,HI! /e 4OS 
D	CsZ   CG1(G11G1:G1G1G1G&,G&1%G1
G, G19-G1'G1&G11G:c           
      .   t        |ddd      5 }ddddgt        t        t        | j	                                     j                               }t        j                  ||	      }|j                          | D ]m  }t        t        j                               | |   d<   t        j                  | |   d<   t        j                  | |   d<   || |   d<   |j                  | |          o 	 d d d        t         j#                  d
|        y # 1 sw Y   "xY w)Nr   r   r   r   model_filenamer   rp   r   r   z(Fusion statistics is saved to csv file: )r   listnextitervalueskeysr   r   r   strr   nowrp   r   r   r   r9   r:   )model_fusion_statisticsr   r   r   r   keys         r   output_fusion_statisticsr   )  s   	lb7	Cx	

 $t3::<=>CCEF
 ^^HF
 *C7:8<<>7J#C(4;G;S;S#C(8494E4E#C(1=@#C()9: 7 <= + 
D  KK:<.IJ! 
D	Cs   CDDc                      i }t        j                   fdd|       t        j                   fdd|      }|j                  |       |j                  ddi       |j                  t        ||             |S )Nc                  (    j                  d        S r   run
ort_inputsort_sessions   r   <lambda>zinference_ort.<locals>.<lambda>?  s    +//$
;r   r   numberrepeatc                  (    j                  d        S r   r   r   s   r   r   zinference_ort.<locals>.<lambda>@  s    z)Jr   r   F)timeitr   r   r   )r   r   result_templaterepeat_timesr   warm_up_repeatr   r   s   ``      r   inference_ortr   =  sd    F
MM;An]==!JST]ijL
MM/"
MM<'(
MM$\:>?Mr   c           
      b    i } j                         |D ]  }t        j                  ||         j                  |	      }t        j                  t        ||   j                        |
      }j                  ||j                  j                  d||j                  |j                                 t        |      dk(  rt        |||	       t        |      D ]^  \  }}j!                  |||   j                  j                  dt"        j$                  ||   j                  ||   j                                ` t'        j(                   fdd|       t'        j(                   fdd|      }|j+                  |       |j+                  ddi       |j+                  t-        ||             |S )Nr   c                  &    j                         S r   run_with_iobindingr   r   s   r   r   z/inference_ort_with_io_binding.<locals>.<lambda>t      ..z:r   r   r   c                  &    j                         S r   r   r   s   r   r   z/inference_ort_with_io_binding.<locals>.<lambda>z  r   r   r   T)r   r   
from_numpytoIO_BINDING_DATA_TYPE_MAPgetr   r   
bind_inputr   typeshapedata_ptrr   allocateOutputBuffers	enumeratebind_outputr   r@   r   r   r   r   )r   r   r   r   ort_output_namesort_outputsoutput_buffersoutput_buffer_max_sizesr   r   	data_typer   r   rj   np_input
input_typeiort_output_namer   r   s   `                  @r   inference_ort_with_io_bindingr  G  s    F '')J##Jt$4588@-11#j6F6L6L2MyY
OO  NN	
  >an.EvN'(89?1$$))MMN  1&&(	
 : MM: ==:L
 MM/"
MM<&'
MM$\:>?Mr   c                 |    |D ]7  }| j                  t        j                  |t        j                  |             9 y )N)r   r   )r   r   emptyr@   )r  r  r   r  s       r   r  r    s-     %ekk!5==PQ %r   c                    t        j                  |        t        j                   j                  |        t        j                  |        t        j
                  j	                  |        t        j
                  j                  |        y)z5Set random seed manually to get deterministic resultsN)randomseedr   r   manual_seedrK   manual_seed_all)r  s    r   set_random_seedr    sR    
KK	LLd	d	JJ4 	JJt$r   returnc            	         ddl m} m}m}m}m}m}m} 	  |        g } |       }t        |t              sy t        |      D ]c  }	 | ||	            }
t        |
t              r y |j                  |	 | ||	            |
j                  |
j                  |
j                  d       e  |        |S # | $ r}t!        d|       Y d }~y d }~ww xY w)Nr   	NVMLErrornvmlDeviceGetCountnvmlDeviceGetHandleByIndexnvmlDeviceGetMemoryInfonvmlDeviceGetNamenvmlInitnvmlShutdown)idrj   totalfreeused-Error fetching GPU information using nvml: %s)py3nvml.py3nvmlr  r  r  r  r  r  r   
isinstanceintranger   r   r"  r#  r$  print)r  r  r  r  r  r  r   r   device_countr  r:   r_   s               r   get_gpu_infor,    s      
)+,,|$A*+Ea+HID$$MM-.H.KL!ZZ II II	 % 	 =uEs#    B/ -B/ #AB/ /C
4CC
c                   F    e Zd ZddZd Zedeeee	e
f         fd       Zy)MemoryMonitorc                     || _         y r   )keep_measuring)r   r0  s     r   r4   zMemoryMonitor.__init__  s
    ,r   c                     dd l }d}	 t        ||j                  t        j                               j                         j                  dz        }t        d       | j                  s	 |S c)Nr      {Gzt?)	psutilmaxProcessry   getpidmemory_inforssr   r0  )r   r4  	max_usages      r   measure_cpu_usagezMemoryMonitor.measure_cpu_usage  s[    	Iv~~biik'B'N'N'P'T'TW^'^_I%L&& r   r  c                     t               r   )NotImplementedErrorr   s    r   measure_gpu_usagezMemoryMonitor.measure_gpu_usage  s    !##r   NT)r   r   r    r4   r;  r   r   r   r
   r   r	   r>  r%   r   r   r.  r.    s9    -	 $8Dc3h,@#A $ $r   r.  c                   B     e Zd Zd fd	Zdeeeeef         fdZ	 xZ
S )CudaMemoryMonitorc                 $    t         |   |       y r   )superr4   )r   r0  	__class__s     r   r4   zCudaMemoryMonitor.__init__  s    (r   r  c                    ddl m}m}m}m}m}m}m} g }g }		  |         |       }
t        |
t              st        j                  d|
        y t        |
      D cg c]  }d }}t        |
      D cg c]  } | ||             }	}	 t        |
      D ]Y  } | ||            }t        |t              rt        j                  d|         y t        ||   |j                  dz        ||<   [ t!        d       | j"                  sn |        t        |
      D cg c]  }||	|   ||   d c}S c c}w c c}w c c}w # |$ r }t        j                  d|       Y d }~y d }~ww xY w)	Nr   r  z*nvmlDeviceGetCount result is not integer: z%nvmlDeviceGetMemoryInfo returns str: r2  r3  	device_idrj   max_used_MBr%  )r&  r  r  r  r  r  r  r   r'  r(  r9   r_   r)  r   r5  r$  r   r0  )r   r  r  r  r  r  r  r   max_gpu_usagegpu_namer+  r  r:   r_   s                 r   r>  z#CudaMemoryMonitor.measure_gpu_usage  s   	
 	
 	
 	J-/LlC0I,XY(-l(;<(;1Q(;M<RWXdRefReQ)*DQ*GHReHf|,A23Ma3PQD!$,'LTF%ST#'*=+;TYY=P'QM!$ - e**  N |, -A	 "#$QK#0#3
 -  =f  	LLH%P	sO   6E E 	D6%E 4D;	AE AE  E 3E 6E E*
E%%E*r?  )r   r   r    r4   r   r   r
   r   r	   r>  __classcell__rD  s   @r   rA  rA    s&    )+8Dc3h,@#A +r   rA  c                   ,     e Zd Zd fd	Zd Zd Z xZS )RocmMemoryMonitorc                 @   t         |   |       d}t        j                  j	                  |      r1|t
        j                  vrt
        j                  j                  |       	 dd l}|| _        | j                  j                          y # t        $ r
 d | _        Y y w xY w)Nz/opt/rocm/libexec/rocm_smir   )
rC  r4   ry   rz   r{   sysr   rocm_smiinitializeRsmiImportError)r   r0  rocm_smi_pathrQ  rD  s       r   r4   zRocmMemoryMonitor.__init__  sv    (477>>-(CHH,.	!$DMMM((* 	! DM	!s   $%B
 
BBc                 f    | j                   y| j                   j                  |d      d   dz  dz  S )NrA   VRAMr   i   )rQ  
getMemInfo)r   devs     r   get_used_memoryz!RocmMemoryMonitor.get_used_memory  s5    == }}''V4Q7$>EEr   c                    | j                   y | j                   #t        | j                   j                               nd}t        |      D cg c]  }d }}t        |      D cg c]  }d| 	 }}	 t        |      D ]#  }t	        ||   | j                  |            ||<   % t        j                  d       | j                  snTt        |      D cg c]  }|||   ||   d c}S c c}w c c}w c c}w )Nr   GPUr3  rF  )	rQ  r   listDevicesr)  r5  rY  timer   r0  )r   r+  r  rI  rJ  s        r   r>  z#RocmMemoryMonitor.measure_gpu_usage  s
   == ;?==;Ts4==4467Z[$),$78$7q$78',\':;':!c!I':;<(#&}Q'79M9Ma9P#Qa  )JJu&&  <(
 )	  ,Q/
 )
 	
 9;
s   	C*$C/C4r?  )r   r   r    r4   rY  r>  rK  rL  s   @r   rN  rN    s    !F

r   rN  c                 |   d }|dk(  rt         }nt        } |d      }| r#||}n|j                         }|y ||S t               5 } |       }|j	                  |j                        }	 |j	                  |      }	|	j                         }
d|_        |j                         }|
	 d d d        y t        j                  d| d|        t        |      dk\  rct        |      dk\  rUt        |      t        |      k(  r>d}t        |      D ]#  \  }}|d   }||   d   }||z
  }t        ||      }% |cd d d        S d d d        y ||}n|j                         }||S t               5 } |       }|j	                  |j                        }	 |j	                  |      }	|	j                         }
d|_        |j                         }t        j                  d|d	d
|d	d       ||z
  cd d d        S # d|_        |j                         }w xY w# 1 sw Y   y xY w# d|_        |j                         }w xY w# 1 sw Y   y xY w)NrG   FzGPU memory usage: before=z  peak=r   r   rH  zCPU memory usage: before=z.1fz
 MB, peak=z MB)rN  rA  r>  r   submitr   r0  r9   r:   r   r  r5  r;  )is_gpufuncmonitor_typestart_memorymemory_monitor_typemonitormemory_before_testexecutor
mem_thread	fn_thread_r:  max_usedr  memory_beforebeforeafterr$  s                     r   measure_memoryro  /  sm   v//!%(G#!-!(!:!:!<%<%%!X)+G!)B)BCJ0$OOD1	$$&).&&--/	  "! KK34F3GwykZ[%&!+I!0CL^H_cfgpcqHq(12D(E$A}*=9F%aL7E 6>D"8T2H	 )F
  - "!!.  )$668|!!		%'__W%>%>?
	, -I  "A%*G""))+I/0B3/GzR[\_Q``cde-- 
	7 */&&--/	 ". $ &+G""))+I 
	sO   #H
*!G.H
.BH
&#H2
!H+9H2.HH

HH//H22H;c                  r    g d} d}| D ]+  }t        j                  |      }||r|dz  }|| d| z  }- |S )N)ORT_DISABLE_FUSED_ATTENTION!ORT_ENABLE_FUSED_CAUSAL_ATTENTION!ORT_DISABLE_FUSED_CROSS_ATTENTIONORT_DISABLE_TRT_FLASH_ATTENTION&ORT_DISABLE_MEMORY_EFFICIENT_ATTENTIONORT_TRANSFORMER_OPTIONSORT_CUDA_GEMM_OPTIONSr   ,=)ry   getenv)	env_namesenvrj   r   s       r   get_ort_environment_variablesr}  s  sW    I C		$=3JC$q    Jr   r?  r   )r   ){   )rK   N)7r   rs   ry   r  rP  r]  r   abcr   r   concurrent.futuresr   r   enumr   r   typingr	   r
   r   r   rq   r   r   rp   	packagingr   rS   rt   r   r9   r   r'   r0   r@   r   rk   rw   r   r   r   r   r   r   longlongr  r  r  r   r,  r.  rA  rN  ro  r}  r%   r   r   <module>r     s@     	  
   # 1    , ,      			8	$ D  , u}}   -2CLDM8 ID7JtK(* nn:zR%#htDcN34 #L$C $(/ /d(
 (
VA.Hr   