
    gh                        d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZ	d dl
Z
d dlZd dlZd dlmZmZ d dlmZmZ d dlmZmZmZmZmZmZ d dlmZ d dlmZmZmZ d dl m!Z! d dl"m#Z#m$Z$m%Z% d dl&Z' ejP                  e)      Z*d	 Z+d
e jX                  de-fdZ.d
e jX                  fdZ/d Z0d Z1d Z2d Z3d Z4d Z5ddZ6d Z7e)dk(  r e7        yy)    N)measure_memorysetup_logger)get_rankget_size)add_io_bindings_as_ortvalues%get_merged_sample_with_past_kv_inputsget_msft_sample_inputsget_sample_inputsget_sample_with_past_kv_inputsverify_ort_inputs)ORTModelForCausalLM)ProfilerActivityprofilerecord_function)trange)
AutoConfigAutoModelForCausalLMAutoTokenizerc                     | j                   dv ry| j                   dk(  r	 t        |j                        S t        |j                               S # t        $ r" t        |j                  j
                        cY S w xY w)N   hf-pt-eagerhf-pt-compiler   hf-ort)benchmark_typeleninputs_names	Exceptiondecoderinput_names
get_inputs)argsmodels     d/var/www/openai/venv/lib/python3.12/site-packages/onnxruntime/transformers/models/llama/benchmark.pyget_ort_model_inputs_lenr$   (   st    >>h&	2u))** u!""  	2u}}0011	2s   A (A98A9r!   ort_model_inputs_lenc                    d\  }}| j                   dk(  rdn| j                  j                  }| j                   dv rt        | j                  | j                  | j
                  | j                  d      }t        | j                  | j                  | j
                  | j                  | j                  d      }||fS | j                   dv r*|d	k(  rt        | j                  | j                  | j
                  | j                  d      }t        | j                  | j                  | j
                  | j                  | j                  d      }||fS t        | j                  | j                  | j
                  | j                  d
|| j                  | j                  dd
      }t        | j                  | j                  | j
                  d| j                  || j                  | j                  dd
      }||fS | j                   dk(  rt        | j                  | j                  | j
                  | j                  d
|| j                  | j                  dd| j                        }t        | j                  | j                  | j
                  d| j                  || j                  | j                  dd| j                        }||fS | j                   dk(  r|dkD  }t        | j                  | j
                  d
| j                  || j                  | j                  |      }t        | j                  | j
                  | j                  d|| j                  | j                  |      }||fS t        d      )NNNort-msfti   r   T)return_dict)use_fp16r)   >   r      r   pt)seq_lenpast_seq_lenmax_seq_lenr*   use_buffer_shareenginer)      ort-convert-to-onnxort)r-   r.   r/   r*   r0   r1   r)   
world_size   )r.   r-   r/   r*   r0   split_kvz/Unable to auto-detect inputs for provided model)r   configmax_position_embeddingsr
   target_device
batch_sizesequence_lengthr   r*   r   r0   r5   r	   r   )r!   r%   init_inputsiter_inputsr/   r7   s         r#   r    r    5   s>   )K
 --;$AdAdK>>'KKOO  
 5KKOO  ]]
^ ##M 
		
	*1$+""$$ K 9""$$ Kx ##e @"",,'!%!6!6 K @""!11'!%!6!6 KL ##s 
		 5	5;KKOO((#]]!22
 <KKOO--#]]!22
T ##9 
		
	*'!+,KKOO((#]]!22	
 -KKOO--#]]!22	
 ## IJJ    c                    d\  }}d\  }}| j                   dv r| j                  r| j                  n| j                  }t        j                         }t	        j
                  || j                  rt        j                  nt        j                  | j                  | j                  d| j                        j                  | j                        }t        j                         }| j                   dk(  r|t        j                  |      }nf| j                   dv r@t        j                          }| j"                  |_        | j&                  r'd|_        d|_        nt-        d| j                          | j                   d	k(  rt/        | j0                        t2        u r| j0                  d
   n| j0                  }t/        | j0                        t2        u r| j0                  d   nd }d }d }	t5        j6                  | j8                        D ])  }
d|
vsd|
v sd|
v rd|
v s|
dk(  r|
}d|
v r|
}	d|
v s&|
}|
}	+ t        j                         }t;        j
                  | j8                  ||	| j                  | j                  d|dk(  rdnd |||
      }t        j                         }| j                   dv rt<        j?                  d| j@                  jC                  | jD                                t        j                         }t        jF                  | j@                  jC                  | jD                        || j0                  g      }t        j                         }t<        j?                  d||z
   d       |S )Nr'   r   T)torch_dtypeuse_auth_tokentrust_remote_code	use_cache	cache_dirr   >   r   r(   r3   r2   Cannot recognize r   r   z.onnxz
.onnx_dataz
.onnx.datadecoder_modelz
model.onnxdecoder_with_past_modeldecoder_merged_model)	decoder_file_namedecoder_with_past_file_namerB   rC   use_io_binding
use_mergedproviderprovider_optionssession_options   r(   r3   zLoading model from )	providerszLoaded model in  s)$r   hf_pt_dir_path
model_nametimer   from_pretrainedr*   torchfloat16float32authrE   tor:   compiler4   SessionOptionsr   enable_profilingverboselog_verbosity_levellog_severity_levelr   typeexecution_providertupleoslistdirhf_ort_dir_pathr   loggerinfoort_model_pathformatrankInferenceSession)r!   r"   sess_options
start_timeend_timesourcerN   rO   rJ   rK   filenames              r#   	get_modelrt      s   $E<%J >>(,(;(;$$YY[
$44)-EMM99"iinn
 "T
  	 99;/1MM%(E			 M	M))+(,%<</0L,./L+ +D,?,?+@ABBh&15d6M6M1NRW1W4**1-]a]t]t9=d>U>U9VZ_9_42215ei &*#

4#7#78Hh&,(*BlV^F^(*h,.F$,!(H4.6+%1$,!.6+ 9 YY[
#33  /(C99"ii 1\ At-(
 99;AA)$*=*=*D*DTYY*O)PQRYY[
$$&&tyy1../

 99;
KK"8j#8"9<=Lr?   c                      j                   dv rt         j                        n%t         j                  t        j
                  d      } j                  r ||      }t        j                  |        fd} fd}|D ]  } |         ||        |         d} j                   dv rt         j                        n%t         j                  t        j
                  d      }	|	D ]H  } |        t        j                         }
 ||        |        t        j                         }|||
z
  z  }J  j                   dvrt        j                  d       | j                  z  } j                  |z  } j                  dk(  rvt        j                  d	 j                          t        j                  d
 j                          t        j                  d| d       t        j                  d| d       y )NrQ   zWarm up)filedescc                  |    j                   dk7  r(j                  dv rj                  j                         S fdS )NcpurQ   c                      j                   dk7  r<t        j                  j                         rt        j                  j	                         S d S )Nry   c                       y N kwargss    r#   <lambda>z=time_fn.<locals>.<lambda>.<locals>.<lambda>.<locals>.<lambda>      r?   devicerX   cudais_availablesynchronizer   r!   s    r#   r   z+time_fn.<locals>.<lambda>.<locals>.<lambda>  =    {{e#

(?(?(A JJ""$ &%&r?   )r   r   
io_bindingsynchronize_inputsr   s    r#   r   ztime_fn.<locals>.<lambda>  s>    ;;%D$7$7;^$^ 	**, 	

	
r?   c                  |    j                   dk7  r(j                  dv rj                  j                         S fdS )Nry   rQ   c                      j                   dk7  r<t        j                  j                         rt        j                  j	                         S d S )Nry   c                       y r|   r}   r~   s    r#   r   z=time_fn.<locals>.<lambda>.<locals>.<lambda>.<locals>.<lambda>(  r   r?   r   r   s    r#   r   z+time_fn.<locals>.<lambda>.<locals>.<lambda>%  r   r?   )r   r   r   synchronize_outputsr   s    r#   r   ztime_fn.<locals>.<lambda>"  s>    ;;%D$7$7;^$^ 	++- 	

	
r?   r   	Benchmark zBatch Size: zSequence Length: z	Latency: rS   zThroughput: z tps)r   rangewarmup_runsr   sysstdoutr`   ri   rj   num_runsrV   r;   rm   r<   )r!   fninputswarmup_rangeoutputs
input_syncoutput_sync_
total_timebench_rangerp   rq   latency
throughputs   `             r#   time_fnr     s    "EE 	dD$$3::IF  ||V*GJK 
6
  J "EE 	dmmDMM

E 
 YY[

6
99;h++
  "EEB4==(G7*JyyA~l4??"345'(<(<'=>?iy+,l:,d34
r?   c                 f   d| j                    d| j                   d| j                  j                          d| j                   d| j
                   d|j                  j                  dd       d| dt        j                  j                         d}d }| j                  dv rt        t        j                  t        j                  gdd      5 }t        d	      5   ||       d d d        d d d        j                  d
      j!                  | j"                  | j$                        }t&        j(                  j+                  | j,                  | d      }t/        |d      5 }|j1                  |       d d d        |S  ||       | d}|S # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   |S xY w)Nb_sr   -z%Y-%m-%d_%H:%M:%Sr   T)
activitiesrecord_shapesprofile_memorymodel_inferencer6   )group_by_stack_n)sort_by	row_limitz.logwz.json)r;   r<   r   lower	precisionr   __name__replacedatetimenowr   r   CPUCUDAr   key_averagestablept_filter_bypt_num_rowsrf   pathjoin
log_folderopenwrite)	r!   r   r   inputs_typeprefixrs   prof	prof_datafs	            r#   
profile_fnr   R  s    !D$8$8#94;N;N;T;T;V:WWXY]YgYgXhhijnjujuivvwxz  yD  yD  yL  yL  MP  RU  yV  xW  WX  Yd  Xe  ef  go  gx  gx  g|  g|  g~  P  fQ  RFH>>(,,.>.C.CDTXim
 !236
 4

 %%q%9??HYHYeieueu?v	77<<F84A(C AGGI ! O 	6
 XU#O 43
 
 ! Os0   F	F#F#F&F	FF#&F0c                    t        j                         }t        j                  |      }|j	                  d               | j
                  dk(  r@t        j                  d|j	                  d       t        j                  d      z   d       t        j                          t        j                  j                          t        | j                  dk7  fd	
       t         j"                  j%                          y )Ng?)intervalr   zCPU usage: F)logical%ry   c                              S r|   r}   )r   r   s   r#   r   zmeasure_fn.<locals>.<lambda>|  s	    r&zr?   )is_gpufunc)rf   getpidpsutilProcesscpu_percentrm   ri   rj   	cpu_countgccollectrX   r   empty_cacher   r   r   r   flush)r!   r   r   pidprocesss    ``  r#   
measure_fnr   o  s    
))+CnnS!G%vJyyA~k'"5"5t"5"DvGWGW`eGf"f!gghij JJL	JJ4;;%/7IJ JJr?   c                    fd}|}| j                   dk(  r ||        ||       | j                  r5t        | ||d      }| j                   dk(  r}j                  j                  j                         }t        j                  d| d|        t        j                  |t        j                  j                  | j                  |             t        | ||d      }| j                   dk(  r}j                  j                  j                         }t        j                  d| d|        t        j                  |t        j                  j                  | j                  |             y t        j                  d       t        | ||       t!        | ||       t        j                  d	       t        | ||       t!        | ||       y )
Nc                      di | }|S )Nr}   r}   r   r   r"   s     r#   
get_logitsz$run_hf_inference.<locals>.get_logits  s    /&/r?   r   promptr   	Renaming  to token7
Evaluating `model(inputs)` step to get past_key_values5
Evaluating `model(inputs)` step with past_key_values)r   r   r   r   sessionend_profilingri   warningrf   renamer   r   r   decoder_with_pastrj   r   r   )r!   r=   r>   r"   r   generate_fnnew_lognameold_lognames      `    r#   run_hf_inferencer     se   4 Ko-K K || {KJ(*--//==?KNNY{m4}EFIIk277<<#MN {KI(*1199GGIKNNY{m4}EFIIk277<<#MN KKJKD+{+t[+.
KKHID+{+t[+.r?   c                 |     fd}fd}fd} j                   dk7  r|n|}i } j                  r |||      \  }	}t         ||	d      }
j                         }t        j                  d| d|
        t        j                  |t        j                  j                   j                  |
             t                |||      \  }}t         ||d      }
j                         }t        j                  d| d|
        t        j                  |t        j                  j                   j                  |
             y t        j                  d	        |||      \  }	}t         ||	       t         ||	       t        j                  d
        |||      \  }}t         ||       t         ||       y )Nc                     t        |       } j                  dk7  rKt        | j                  t        j                        j
                  |      \  }}t        d|       ||fS | |fS )Nry   r   )r   r   r   intrm   r0   setattr)r   kv_cache_ortvaluesr   r!   r"   s      r#   prepare_ort_inputsz-run_ort_inference.<locals>.prepare_ort_inputs  ss    "5&1 ;;%-Ivt{{C		ND<Q<QSe.*J* D,
3111)))r?   c                 (    j                  |        y r|   )run_with_iobinding)r   r"   s    r#   with_io_bindingz*run_ort_inference.<locals>.with_io_binding  s      ,r?   c                 ,    j                  d |       }|S r|   )runr   s     r#   without_io_bindingz-run_ort_inference.<locals>.without_io_binding  s    ))D&)r?   ry   r   r   r   r   r   r   )r   r   r   r   ri   r   rf   r   r   r   r   rt   rj   r   r   )r!   r=   r>   r"   r   r   r   r   r   ort_init_inputsr   r   ort_iter_inputss   `  `         r#   run_ort_inferencer     s   *-
 &*[[E%9/?QK||.@N`.a++ {OXN ))+;-tK=AB
		+rww||DOO[IJ $.@N`.a++ {OWM ))+;-tK=AB
		+rww||DOO[IJ KKJK*<[J\*]'O'D+/t[/2
KKHI*<[J\*]'O'D+/t[/2r?   c                     | j                   dv rt        | |||       y | j                   dv rt        | |||       y t        d| j                          )N>   r   r   r   rQ   rF   )r   r   r   r   )r!   r=   r>   r"   s       r#   run_inferencer     sV    HH{K?			 C	C$[%@+D,?,?+@ABBr?   c           	         t        j                         }|j                  ddt        dg d       |j                  ddt        dd	       |j                  d
dddd       |j                  dddt        dg dd       |j                  dt        dd       |j                  dt        dd       |j                  dt        dd       |j                  ddd !       |j                  d"d#d$!       |j                  d%d&t        t        j
                  j                         rd'nd(g d)*       |j                  d+d,t        d-.       |j                  d/d0t        d1.       |j                  d2d3t        d4.       |j                  d5t        d6.       |j                  d7t        d8.       |j                  d9t        d:.       |j                  d;dd<       |j                  d=t        d>d?       |j                  d@t        dAdB       |j                  dCdd<       |j                  dDt        t        j                  j                  dE      dF       |j                  dGt        ddHdIJ       |j                         }t        j                  j                  |j                         t	        j                  |j                         dK|j                   v rxt#        |dL|j$                  j'                          dM       |j(                  dNk(  r|j(                  dO| if|_        n+|j(                  dPk(  r|j(                  dO| if|_        d'|_        |j                   dQk(  r|j*                  sJ dR       |j                   dSv r|j,                  sJ dT       |j.                  j1                  dU      |_        |j2                  j1                  dU      |_        |j4                  dVv s|j4                  dWk(  r|j$                  d(k(  rdndX|_        |j6                  r7t9        |j.                        d:k(  rt9        |j2                        d:k(  sJ dY       |S )ZNz-btz--benchmark-typeT)r   r   r   r(   r3   )rc   requiredchoicesz-mz--model-namez<Hugging Face name of model (e.g. 'meta-llama/Llama-2-7b-hf'))rc   r   helpz-az--authF
store_truez5Use Hugging Face authentication token to access model)defaultactionr   z-pz--precisionfp32)int4int8fp16r  zePrecision for model. For ONNX models, the model's precision should be set before running this script.)r   rc   r  r   r   z--hf-pt-dir-pathr   zNPath to directory containing all PyTorch files (e.g. tokenizer, PyTorch model))rc   r  r   z--hf-ort-dir-pathzhPath to directory containing all ONNX files (e.g. tokenizer, decoder_merged, decoder, decoder_with_past)z--ort-model-pathzPath to ONNX modelz-bz--batch-sizesz1 2)r  z-sz--sequence-lengthsz32 64 128 256 512z-dz--devicer   ry   )ry   r   rocm)rc   r  r   z-idz--device-idr   )rc   r  z-wz--warmup-runsr6   z-nz
--num-runs
   z--seed   z--max-length    z--num-return-sequencesr2   z	--profile)r  r  z--pt-filter-byself_cpu_time_totalz"What to filter PyTorch profiler byz--pt-num-rowsi  z.Number of rows for PyTorch profiler to displayz	--verbosez--log-folder.zFolder to cache log filesz--cache-dirz./model_cachez-Cache dir where Hugging Face files are stored)rc   r   r  r   r4   rd   ExecutionProviderCUDAExecutionProvider	device_idROCMExecutionProviderr   z,Please specify a path to `--hf-ort-dir-path`rQ   z+Please specify a path to `--ort-model-path` >   r  r  r  r  zOPlease provide only one (batch_size, sequence_length) combination for profiling)argparseArgumentParseradd_argumentstrrX   r   r   r   rf   r   r   
parse_argsnprandomseedmanual_seedr   r   r   upperrd   rh   rk   batch_sizessplitsequence_lengthsr   r   r   )rm   parserr!   s      r#   get_argsr     sb   $$&F

   K   hlAx  
 0t   ]	   w	   !	     
 #  
 **113'   }3B
oCC
lbA
sA6 S"=
0sAF U<H
s,AHl   c4Fvw
U<H
S"'',,s:KRmn
<   D IINN499	dii  ###*t{{/@/@/B.CCT,UV""&=='+'>'>d@S&TD#$$(??'+'>'>d@S&TD# DK h&##S%SS#AA""Q$QQ"''--c2D 1177<D ..$4469QVZVaVaejVjqw 	N
 ||  !Q&3t/D/D+E+J	]\	]J Kr?   c                     t               } t               }t        |       }t        |j                         t
        j                  |j                         dt        j                  j                  _        | |_        ||_        t        j                  |j                   |j"                  |j$                  |j$                        }t'        j                  |j                   |j"                  |j$                  |j$                        }|j(                  dk7  rd|j                   n|j(                  }|j*                  dk(  }t-        |d|       t-        |d|       t-        |d|       t-        |d	|       t/        |      }t1        ||      }|j2                  d
v rt5        j6                  |j8                  j;                  |j                        d      }	t=        t?        d |	j@                  jB                              }
|xr tE        |
      dkD  xr |j(                  dk7  }t-        |d|       nt-        |dd       tG        jH                  |jJ                  |jL                        D ]y  \  }}|j                  dk(  rt
        j                  d| d| d       t-        |dtO        |             t-        |dtO        |             tQ        ||      \  }}tS        ||||       { y )NT)rE   rB   rC   ry   zcuda:r  	tokenizerr8   r:   r*   rQ   F)load_external_datac                      | j                   dk(  S )NGroupQueryAttention)op_type)nodes    r#   r   zmain.<locals>.<lambda>  s    T\\=R-Rr?   r   r0   z
Batch size = z and sequence length = z...r;   r<   )*r   r   r   r   r`   ri   rj   __dict__rX   backendscudnn	benchmarkrm   r5   r   rW   rU   rE   r[   r   r   r   r   rt   r$   r   onnx
load_modelrk   rl   listfiltergraphr'  r   	itertoolsproductr  r  r   r    r   )rm   r5   r!   r"  r8   r:   r*   r"   r%   
onnx_model	gqa_nodesr0   r;   r<   r=   r>   s                   r#   mainr5    sQ   :DJD>D
KK%)ENN"DI DO--4>>$))_c_h_hI ''4>>$))_c_h_hF ,0;;%+?eDII;'T[[M~~'HD+y)D(F#D/=1D*h' dOE3D%@ AA__T%8%8%?%?		%J_de
 RT^TdTdTiTijk	#SI(:St{{e?S(*:;(%0 (1'8'89I9I4K`K`'a#
O99>KK/*5L_L]]`ablC
O4'_)=>#-d4H#I [dKe< (br?   __main__)r   )8r  r   r   r1  loggingrf   r   rV   numpyr  r,  r   rX   benchmark_helperr   r   dist_settingsr   r   llama_inputsr   r   r	   r
   r   r   optimum.onnxruntimer   torch.profilerr   r   r   tqdmr   transformersr   r   r   onnxruntimer4   	getLoggerr   ri   r$   	Namespacer   r    rt   r   r   r   r   r   r   r   r5  r}   r?   r#   <module>rC     s      	   	 
      9 ,  4 E E  H H 			8	$
#$X'' $s $DRH&& RjCL:&;/|83vCEP/=d zF r?   