
    ga                       d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlmZ d dlmZmZ d dlmZmZmZmZ d dlZ ej8                  e      ZddZd Z d Z!d	 Z"d
 Z#d Z$d Z%edk(  r e%        yy)    )annotationsN)setup_logger)add_io_bindings_as_tensorsget_initial_inputs_and_outputs)
AutoConfigAutoModelForCausalLMAutoTokenizerBitsAndBytesConfigc                   | j                   dv rd }| j                  dk(  r| j                  dk(  rt        dddt        j
                        }t        j                  | j                  dk7  r| j                  n| j                  | j                  | j                  | j                  | j                  dd|| j                  d	i
	      }n	 t        j                  | j                  dk7  r| j                  n| j                  | j                  | j                  | j                  | j                  d| j                  dk(  rdnd      j                  | j                         }|j'                          | j                   dk(  rt	        j(                  |      }|S t+        j,                         }| j                  dk(  rdd| j                  ifnd}t+        j.                  | j0                  ||g      }|S # t"        $ r}t%        d|       t        j                  | j                  dk7  r| j                  n| j                  | j                  | j                  | j                  | j                  dd      j                  | j                         }Y d }~-d }~ww xY w)N   pt-eager
pt-compileint4cudaTnf4)load_in_4bitbnb_4bit_use_double_quantbnb_4bit_quant_typebnb_4bit_compute_dtype flash_attention_280GB)	cache_dirtorch_dtypeuse_auth_tokentrust_remote_code	use_cacheattn_implementationquantization_config
max_memorysdpa)r   r   r   r   r   r   z&Try to load a model using eager mode: eagerr   CUDAExecutionProvider	device_idCPUExecutionProvider)sess_options	providers)benchmark_typeonnx_precisiondevicer
   torchfloat16r   from_pretrainedhf_dir_path
model_namer   r   authtrustr$   totarget_device	ExceptionprintevalcompileortSessionOptionsInferenceSessiononnx_model_path)argsmodel
bnb_configer&   eps         h/var/www/openai/venv/lib/python3.12/site-packages/onnxruntime/transformers/models/llama/benchmark_e2e.py	get_modelrB   8   s3   88&(T[[F-B+!*.$)',}}	J )88$($4$4$:  .. ,,#yy"&**$7$. NNF3
E),<<(,(8(8B(>D$$DOO"nn $ 0 0#'99&*jj"@Dv@U)<[a "T''( . 	

,.MM%(E L ))+ {{f$ %{DNN&CD' 	
 $$T%9%9ac`deL9  ) >B,<<(,(8(8B(>D$$DOO"nn $ 0 0#'99&*jj"(/ "T''( 	)s   BG) )	J2BJJc                6   | j                   dk(  r%t        j                         5   |di |}d d d        d }| j                   dv r9| j                  dk7  r]t        j                  j                  | j                         n3t        |||| j                  | j                        }|j                          t        j                         }t        |      D ]  }| j                   dv r^t        j                         5   |di |}| j                  dk7  r)t        j                  j                  | j                         d d d        o|j                  |       |j                           t        j                         }||z
  |z  }	|	|fS # 1 sw Y   WxY w# 1 sw Y   xY w)Nr   r   cpu )r(   r+   no_gradr*   r   synchronizer3   r   use_fp16use_buffer_sharesynchronize_inputstimeperf_counterrangerun_with_iobindingsynchronize_outputs)
r<   r=   runsinputsoutputs
io_bindingstart_endavgs
             rA   run_inferencerX   x   sL   l*]]_ofoG  J88;;%JJ""4#5#56/vwW[WlWlm
%%' E4["<</&/;;%'JJ**4+=+=> !
 $$Z0**,  


C;$
C<5 _  !s   	F5AFFF	c           
         t                t        ||||| j                  | j                  | j                  | j
                        \  }}t        | || j                  ||      \  }}||fS N)clear_cacher   r3   rH   rI   enginerX   warmup_runs)	r<   r=   config	tokenizerprompt_lengthpromptrQ   rR   rU   s	            rA   prepare_model_for_inferencerb      sf    M4	=&$2D2DdmmUYUjUjlplwlwOFG tUD,<,<fgNJAw7?    c                 h    t        j                          t        j                  j	                          y rZ   )gccollectr+   r   empty_cacherE   rc   rA   r[   r[      s    JJL	JJrc   c                    t        j                  | ddddddddd	|d
z   dd|d
z   dd	| dd| dddg      }|j                  |d       t        j	                  d| d       y )Nz
Batch SizezPrompt LengthzPrompt Processing Latency (ms)z"Prompt Processing Throughput (tps)zSampling Latency (ms)zSampling Throughput (tps)z"First Token Generated Latency (ms)z&First Token Generated Throughput (tps)Average Latency of First    z Tokens Generated (ms)Average Throughput of First z Tokens Generated (tps)zWall-Clock Latency (s)zWall-Clock Throughput (tps))columnsF)indexzResults saved in !)pd	DataFrameto_csvloggerinfo)resultsfilename
gen_lengthdfs       rA   save_resultsrx      s    	,0#'04'
a'88NO*:?*;;RS'
|3IJ*:,6MN$)

B( IIheI$
KK#H:Q/0rc   c                    t        j                         } | j                  ddt        dg d       | j                  ddt        dd	
       | j                  ddddd       | j                  ddddd       | j                  ddt        t        j
                  j                  dd      d       | j                  dt        dd       | j                  dddd       | j                  d d!dt        j
                  j                  dd"d#d$      d%&       | j                  d'ddd(       | j                  d)ddd*      f | j                  d+d,d-.       | j                  d/d0d1.       | j                  d2d3dt        d4g d5d67       | j                  d8d9t        d:d;       | j                  d<d=t        t        j                  j                         rd>nd?d?d>g@       | j                  dAdBt        dCD       | j                  dEdFt        dGD       | j                  dHdIt        dJD       | j                  dKt        dLD       | j                         }t        j                  j                  |j                         t        j                  |j                         dM|j                   v rVt#        |dN|j$                  j'                          dO       |j(                  dPk(  r|j(                  dQ|j*                  if|_        |j                   dMk(  r|j,                  sJ dR       |j.                  j1                  dS      |_        |j2                  j1                  dS      |_        t#        |dT|j4                         |j4                  dUv s|j4                  dVk(  r|j$                  d?k(  rd4ndW|_        |j$                  d?k7  rdX|j*                   n|j$                  }|j4                  dWk(  rt        j6                  nt        j8                  }|j                   dMk(  rdMndY}t#        |dZ|       t#        |d[|       t#        |d\|       t#        |d]|j4                  dWk(         |j:                  xr |dMk(  |_        |S )^Nz-btz--benchmark-typeT)r   r   r8   )typerequiredchoicesz-mz--model-nameFz<Hugging Face name of model (e.g. 'meta-llama/Llama-2-7b-hf'))rz   r{   helpz-az--auth
store_truez5Use Hugging Face authentication token to access model)defaultactionr}   z-tz--trustzeWhether or not to allow for custom models defined on the Hugging Face Hub in their own modeling filesz-cz--cache-dir.model_cachezPath to directory containing all Hugging Face files (e.g. config, tokenizer, PyTorch model). Use when loading model as `AutoModel.from_pretrained(model_name, cache_dir=cache_dir)`.)rz   r   r}   z--hf-dir-pathr   zPath to directory containing all Hugging Face files (e.g. config, tokenizer, PyTorch model). Use when loading model as `AutoModel.from_pretrained(folder_path)`.z-oz--onnx-model-pathzPath to ONNX model)r{   r}   z-fz--prompts-filemodelsllamazprompts.jsonzsJSON file containing entries in the format 'prompt length: prompt' where prompt length = tokenized length of prompt)r{   r   r}   z--use_buffer_sharez3Use when GroupQueryAttention (GQA) is in ONNX modelz--anomaly-filteringzUse this flag to filter anomaly accelerator times for tokens generated.               This may give more accurate latency and throughput metrics for tokens generated.               Wall-clock metrics are still reported with anomaly times though.z-bz--batch-sizesz1 2)r   z-sz--prompt-lengthsz16 64 256 1024z-pz--precisionfp32)r   int8fp16r   zePrecision for model. For ONNX models, the model's precision should be set before running this script.)r{   rz   r   r|   r}   z-gz--generation-length   z Number of new tokens to generatez-dz--devicer   rD   )rz   r   r|   z-idz--device-idr   )rz   r   z-wz--warmup-runs   z-nz
--num-runsd   z--seedrj   r8   execution_providerExecutionProviderr#   r$   z,Please specify a path to `--onnx-model-path` r)   >   r   r   r   r   zcuda:ptr3   r   r\   rH   )argparseArgumentParseradd_argumentstrospathjoinintr+   r   is_available
parse_argsnprandomseedmanual_seedr(   setattrr*   upperr   r$   r;   batch_sizessplitprompt_lengths	precisionr,   float32rI   )parserr<   r3   r   r\   s        rA   get_argsr      s   $$&F
1   K   D   t   S-0 D    p	   !	   S(G^D C   B	   P	           0t   /   **113   }3B
oCC
lcB
sA6D IINN499	dii  ###*t{{/@/@/B.CCT,UV""&=='+'>'>dnn@]&^D# e###S%SS#''--c2D--33C8D D"DNN3..$4469QVZVaVaejVjqw 	N 15u0DeDNN+,$++M#'>>V#;%--K))U2UFD/=1D--D(F#D*dnn67 11EfoDKrc   c                   9: t               } t        d       t        j                  | j                         d }t        | j                        5 }t        j                  |d       }d d d        t        j                  | j                  dk7  r| j                  n| j                  | j                  | j                  | j                        }t!        j                  | j                  dk7  r| j                  n| j                  | j                  | j                  | j                        }t#        |       }g }t%        j&                  | j(                  | j*                        D ]^  \  }}t-        |      t-        |      }}t        j                  d| d|        t/                || j0                  z   }	||vr[t3        t5        j6                  d| d	| j                   d
| j                   d| d| d| d| d| d| j                   d| d            ||   g|z  }
||g}	 t        j                  d       t9        | |||||
      \  }}t;        | || j<                  ||      \  }}|dz  }|||z  z  }t        j                  d| d       t        j                  d|||z  z   d       |j?                  ||g       t        j                  d       t/                t9        | |||||
      \  }}|d   jA                         }|jB                  d   }|jD                  }tG        |d      r|jH                  n|jJ                  |jL                  z  }tO        jP                  || jR                  tN        jT                        }g }g }tW        jX                         }||	k  ryt;        | |d||      \  }}|j[                  |       tW        jX                         }|d   jB                  d   dkD  r|d    j]                  d      dz
  }|j_                  d!      ja                  d|jb                        je                  |d|jb                        }tO        jf                  |d   d|      ji                         }n|d   d d dd d f   }tO        jj                  |d!      }||z  |jl                  k(  }|jo                  ||jl                        jq                  |dg      }tW        jX                         } |j[                  | |z
         tO        jr                  ||gd!      }|dz  }||d<   tO        jr                  |d    | ju                  tN        jv                        jq                  |d      gd      |d <   d"|v r3tO        jx                  |d"   d!      d#   jq                  |d      dz   |d"<   |d   jB                  d   dk7  r"|d   d d d dd d f   j{                         |d<   |d   j}                          | j~                  d$k(  r
|d%   |d%<   n	| j                  st        |j                        D ]"  }!|d&|! d'   |d(|! d'<   |d&|! d)   |d(|! d)<   $ |d    jB                  d   }"t        |j                        D ]  }!tO        jP                  |||"|| jR                  | j                        }#tO        jP                  |||"|| jR                  | j                        }$|j                  d&|! d'|#j{                         d&|! d)|$j{                         i        ||	k  rytW        jX                         }%|j                  d#       | j                  rad*9t        |      :t        |      }&t        t        9:fd+|            }t        |      }'t        j                  d,|&|'z
   d-9 d.:dz   d/       t]        |      t        |      z  }(|(dz  })|d|(z  z  }*t        j                  d0|) d       t        j                  d1|* d       |d#   }+|+dz  },|d|+z  z  }-t        j                  d2|, d       t        j                  d3|- d       | j0                  d4z  }.t]        |d |.       t        |d |.       z  }/|/dz  }0|d|/z  z  }1t        j                  d5|. d6|0 d       t        j                  d7|. d6|1 d       t]        |      t        |      z  }2|2dz  }3|d|2z  z  }4t        j                  d5| j0                   d6|3 d       t        j                  d7| j0                   d6|4 d       |%|z
  }5||| j0                  z   |5z  z  }6t        j                  d8|5 d9       t        j                  d:||| j0                  z   |5z  z   d       t        j                  d;       |j?                  |)|*|,|-|0|1|3|4|5|6g
       |j[                  |       a d>| j~                   d?t        j                  j                         d@dA}8t        ||8| j0                         y # 1 sw Y   	xY w# t        $ r)}7t        j                  d<| d| d=|7        Y d }7~7d }7~7ww xY w)BNFc                h    | j                         D ci c]  \  }}t        |      | c}}S c c}}w rZ   )itemsr   )dkvs      rA   <lambda>zmain.<locals>.<lambda>h  s.    STSZSZS\<]S\41aSVQYS\<]<]s   .)object_hookr   )r   r   r   zRunning batch size = z, prompt length = z2
                                A prompt of size z was not found in 'zv'. There are a couple of solutions to fix this.
                                1) You can change one of the keys in 'z' to be z).
                                    If za < actual prompt's length, the benchmark E2E tool will repeat the first word in the prompt until zB = actual prompt's length.
                                    If zm > actual prompt's length, the benchmark E2E tool will automatically trim the actual prompt's length so that zd = actual prompt's length.
                                2) You can add a new key-value entry in 'z' of the form 'z,': 'your prompt goes here'.
                zMeasuring prompt processing...i  z&Average Latency of Prompt Processing: z msz)Average Throughput of Prompt Processing: z tpszMeasuring token generation...	input_idshead_dim)r*   dtype   logitsattention_mask)dimposition_idsr   r   past_key_valueszpresent.z.keyzpast_key_values.z.value
   c                    | z  k  S rZ   rE   )acc_timeanomaly_threshold_factor
min_time_ss    rA   r   zmain.<locals>.<lambda>	  s    H7OR\7\,\rc   zFiltered out z$ anomaly accelerator times that are zx greater than z ms...zAverage Latency of Sampling: z Average Throughput of Sampling: z"Latency of First Token Generated: z%Throughput of First Token Generated: rj   ri   z Tokens Generated: rk   zWall-Clock Latency: z szWall-Clock Throughput: zAdding results to CSVz$Could not benchmark at batch size = z - 
benchmark__e2e_z%Y-%m-%d_%H:%M:%Sz.csv)Or   r   rr   rs   __dict__openprompts_filejsonloadr   r-   r.   r/   r   r0   r1   r	   rB   	itertoolsproductr   r   r   r[   generation_lengthNotImplementedErrortextwrapdedentrb   rX   num_runsextendcloneshapenum_key_value_headshasattrr   hidden_sizenum_attention_headsr+   zerosr3   boolrK   rL   appendsum	unsqueezerepeat
vocab_sizeviewgathersqueezeargmaxeos_token_idmasked_fillreshapecatr2   int64max
contiguouszero_r\   rI   rM   num_hidden_layersr   updatepopanomaly_filteringminlenlistfilterr4   datetimenowrx   );r<   size_to_promptfr^   r_   r=   all_csv_metrics
batch_sizer`   
max_lengthra   csv_metricsrQ   rR   accelerator_prompt_latency_saccelerator_prompt_latency_msaccelerator_prompt_thrptall_token_idscurrent_length	num_heads	head_sizehas_eosaccelerator_timessampling_timeswall_clock_start_timeaccelerator_time_latency_ssampling_start_timeprompt_end_indicesidxsnext_token_logitsnext_tokenstokens_to_addsampling_end_timeinew_sequence_lengthpresent_keypresent_valuewall_clock_end_time	orig_sizenew_sizeavg_sampling_latency_savg_sampling_latency_msavg_sampling_thrptfirst_token_latency_sfirst_token_latency_msfirst_token_thrpthalfwayhalfway_token_latency_shalfway_token_latency_mshalfway_token_thrptall_token_latency_sall_token_latency_msall_token_thrptwall_clock_latency_swall_clock_thrptr?   ru   r   r   s;                                                            @@rA   mainr  `  sH   :D
KK N	d	 A12]^ 
! '' ,,2..yy**	F -- ,,2..yy**	I dOEO%.%6%6t7G7GI\I\%]!
M$'
OS5GM
+J<7I-YZ"T%;%;;
.%22?@STXTeTeSf gGGKGXGXFYYaboap q((5  7X  Yf  Xg g((5  7d  er  ds sJJNJ[J[I\\klykz {
 
 !/0:=!=1H	tKK899$vyZgiopOFG4A$t}}^dfm4n1(' -I4,O)'1]Ea5a'b$KK@A^@__bcdKK;J-ZvJv<w;xx|}  =?WXY KK78M9$vyZgiopOFG";/557M*004N22I#*6:#>FDVDVZ`ZtZtDt  kk*T5G5GuzzZG !#N$($5$5$7! J.6CD%QRTZ\c6d3*G!(()CD '+&7&7&9#8$**1-1)/0@)A)E)Ea)H1)L&*444;6#4#45j!V->->? 
 ).WX5F4(P(X(X(Z%(/(9!R((C%#ll+<"E "K/93I3II !, 7 7AWAW X ` `blnoap q$($5$5$7!%%&7:M&MN %		=-*Hb Q!# '4{#+099,-}}U[[/I/Q/QR\^_/`acd,'( "V+-2YYvn7MST-UVW-X-`-`akmn-ors-sF>* 8$**1-2(/(9!RaR((C(N(N(PGH%!'') ;;$&078I0JF,-.."6#;#;<=DxPQsRVEW=X!1!D9:?FRSQTTZG[?\!1!F;< = +11A*B*H*H*K'"6#;#;<&+kk&%/%#'#5#5"&"2"2' ).&%/%#'#5#5"&"2"2)  "*1#T 2K4J4J4L"*1#V 4m6N6N6P# =o !J.^ #'"3"3"5 !!!$%%+-( !23
 12	$(\^op%! 01#I$8#99]^v]w  xG  HR  UY  HY  GZ  Z`  a &)%83~;N%N"&<t&C#!+q3I/I!JKK78O7PPSTUKK:;M:NdST %6a$8!%:T%A" *a2G.G HKK<=S<TTWXYKK?@Q?RRVWX ,,1G&)*;HW*E&FM^_g`gMhIi&i#'>'E$",4K0K"LKK3G9<OPhOiilmnKK6wi?RSfRggklm #&&7"83?P;Q"Q#6#= (A0C,CDOKK+D,B,B+CCVWkVllop KK6t7M7M6NNabqarrvwx $79N#N )md>T>T.TXl-lmKK./C.DBGHKK)*I_I_9_cw8w*x)yy}~
 KK/0+&*%,'(#($ "";/u &^~ DKK=h.?.?.C.C.EFW-XX\]H(D,B,BCg 
!	 ^  	tKK>zlJ\]j\kknopnqrss	ts,   hS2hJ-hh	iii__main__)r<   zargparse.Namespace)&
__future__r   r   r   re   r   r   loggingr   r   rK   numpyr   pandasro   r+   benchmark_helperr   llama_inputsr   r   transformersr   r   r	   r
   onnxruntimer8   	getLogger__name__rr   rB   rX   rb   r[   rx   r   r  rE   rc   rA   <module>r*     s   @ #   	    	      ) S \ \ 			8	$=@>
12_DzDz zF rc   