
    gf^                        d dl Z d dlZd dlZd dlZd dlZd dlmZ 	 g dZddZd Z	d Z
ddZddZd	 Zd
 Zd Zd Z	 	 	 ddZd Zd Zd Zd Zedk(  ru e       Z ede       d dlmZ  eej6                         ej8                  sej:                  sJ d        ee      Znej8                  Z eee      ZeD ]
  Z  ee         yy)    N)TensorProto)ScanLoopIfc                 ^   t        j                         }|j                  dddt        d       |j                  dddt        d       |j                  d	d
dt        dd       |j                  dddt        dd       |j                  ddt        dd       |j                  ddt        dd       |j                  ddt        dd       |j                  ddt
        dd       |j                  ddt        dd       |j                  ddt        d d        |j                  d!dt        d d"       |j                  d#dt        d d$       |j                  d%dd&g d'd()       |j                  d*d+dd,d-.       |j                  d/       |j                  d0dt        d1d2       |j                  d3dd,d4.       |j                  d5       |j                  d6dd,d7.       |j                  d8       |j                  d9d:dd,;       |j                  d<       |j                  |       S )=Nz-iz--inputFz2Set the input file for reading the profile results)requiredtypehelpz-mz--modelzIonnx model path to run profiling. Required when --input is not specified.z-bz--batch_size   zbatch size of input)r   r	   defaultr
   z-sz--sequence_length    zsequence length of inputz--past_sequence_lengthzpast sequence length for gpt2z--global_lengthz&number of global tokens for longformerz	--samplesi  z\number of samples to test. Set it large enough to reduce the variance of performance result.z--thresholdg{Gz?zfThreshold of run time ratio among all nodes. Nodes with larger ratio will show in top expensive nodes.z--thread_numznumber of threads to usez--input_ids_namez"input name for input IDs, for bertz--segment_ids_namez$input name for segment IDs, for bertz--input_mask_namez'input name for attention mask, for bertz--dummy_inputsr   )bertgpt2
longformerr   zEType of model inputs. The default will create dummy inputs with ones.)r   r   choicesr
   z-gz	--use_gpu
store_truezuse GPU)r   actionr
   )use_gpuz
--providercudazExecution provider to usez--basic_optimizationz_Enable only basic graph optimizations. By default, all optimizations are enabled in OnnxRuntime)basic_optimizationz--kernel_time_onlyz.Only include the kernel time and no fence time)kernel_time_onlyz-vz	--verbose)r   r   )verbose)argparseArgumentParseradd_argumentstrintfloatset_defaults
parse_args)argvparsers     V/var/www/openai/venv/lib/python3.12/site-packages/onnxruntime/transformers/profiler.pyparse_argumentsr%      s   $$&F
A   X   "   '    ,   5   k   u   '   1   3   6   9T   kE,U^_
&
(   n	   51
=	   /
kE,O
&T""    c                     ddl m}  || ||| |d      }|D ]  }|j                  d |      }	 |j                         }
|
S )Nr   )create_onnxruntime_sessionT)enable_all_optimizationnum_threadsenable_profiling)benchmark_helperr(   runend_profiling)onnx_model_pathr   providerr   
thread_num
all_inputsr(   sessioninputs_profile_files              r$   run_profiler7      sV    ;($6 6G KKf%  ((*Lr&   c                     t        d|  d       t        |       5 }t        j                  |      }d d d        t	        t
              sJ |S # 1 sw Y   xY w)Nzloading profile output z ...)printopenjsonload
isinstancelist)r6   opened_file	sess_times      r$   load_profile_jsonrA      sR    	#L>
67	l	{IIk*	 
 i&&&	 
	s   AAc                    i }i }i }d}d}| D ]  }|d   dk(  r
|d   dk(  rd}|s|d   dk(  s!d	|v s&d
|v s+d|d
   v s3|d   }|d
   d   }	|	t         v rI|	sd| d}	||v r||xx   |d	   z  cc<   ||xx   dz  cc<   n|d	   ||<   d||<   |	||<   ||d	   z  } |sdgS g }
|
j                  d|dz  dd       |
j                  d       |
j                  d       t        |j                         d d      D ]I  \  }}||z  }||k  r||   }|t	        |      z  }|
j                  |dd|dz  dd|dd|dd| 	       K i }|j                         D ]!  \  }}	||   }|	|v r||	xx   |z  cc<   |||	<   # |
j                  d       |
j                  d       |
j                  d       t        |j                         d  d      D ](  \  }	}||z  }|
j                  |dd|dz  dd|	        * |
S )!  Parse profile data and output nodes in two sections - nodes in the original order, and top expensive nodes.

    Args:
        sess_time (List[Dict]): profile data
        kernel_time_only (bool, optional): Only include items for kernel time. Defaults to False.
        threshold (int, optional): Minimum ratio of duration among all. Defaults to 0.

    Returns:
        List[str]: lines of string for output.
    r   FcatSessionnamesession_initializationTKerneldurargsop_name()r   zNo kernel record found!z%
Top expensive kernels with Time% >= d   .2f:@----------------------------------------------------------------u&   Total(μs)	Time%	Calls	Avg(μs)	Kernelc                     | d   S Nr    xs    r$   <lambda>z&parse_kernel_results.<locals>.<lambda>   s    1Q4r&   keyreverse10d	      Y@5.2f5d8.1fz
Group kernel time by operator:u   Total(μs)	Time%	Operatorc                     | d   S rS   rT   rU   s    r$   rW   z&parse_kernel_results.<locals>.<lambda>  s    1Q4r&   )NODES_TYPE_CONTAINING_SUBGRAPHappendsorteditemsr   )r@   	thresholdkernel_name_to_op_namekernel_timekernel_freqtotalsession_inititemkernel_namerK   linesdurationratiocallsavg_timeop_times                   r$   parse_kernel_resultsrt      s     KKEL;)#V8P(PL;("u}4IY]^dYeLev,K6l9-G88 k]!,k)K(DK7(K(A-(+/;K(+,K(6=&{3T%[ E5 8 )** E	LL9)C-9LANO	LL	LL=>!'(9(9(;Y]!^X5 9K(eEl*~Rd';2eBZr(SWXZ[fZghi "_ G 6 < < >W{+gG('GG !? 
LL34	LL	LL./#GMMOQUV5 ~Rd';2gYGH W Lr&   c                 d   g }i }i }i }d}| D ]  }|d   dk(  sd|v sd|v sd|d   v s|d   j                  dd	      j                  d
d	      j                  dd	      }	d|d   v r=|d   d   dk(  rd}
n|d   d   dk(  rd}
n|d   d   dk(  rd}
|	|vr
||	<   n||	   
k(  sJ |r|d   d   }|t        v r|	|v r||	xx   |d   z  cc<   ||	xx   dz  cc<   n|d   ||	<   d||	<   |j                  |	       ||d   z  } g d}d}|D ]d  }	||	   }||	   }|t        |      z  }||z  dz  }|j	                  |	d	      }||z  }|j                  |dd|dd|dd|dd|dd|dd|	        f |j                  d|dz  dd        |j                  d!       |j                  d"       t        |j                         d# d$%      D ]d  \  }	}||z  }||k  r||	   }|t        |      z  }||z  dz  }|j	                  |	d	      }|j                  |dd|dd|dd|dd|dd|	        f |S )&rC   r   rD   NoderI   rJ   rK   rF   _kernel_time _fence_before_fence_afterr0   CPUExecutionProviderCPUCUDAExecutionProviderCUDADmlExecutionProviderDMLr   )z
Nodes in the original order:rQ   u3   Total(μs)	Time%	Acc %	Avg(μs)	Calls	Provider	Nodeg        r]   r[   r\   r^   r`   r_   8sz#
Top expensive nodes with Time% >= rN   rO   rP   rQ   u-   Total(μs)	Time%	Avg(μs)	Calls	Provider	Nodec                     | d   S rS   rT   rU   s    r$   rW   z$parse_node_results.<locals>.<lambda>V  s    qtr&   TrX   )replacerb   rc   r   getrd   re   )r@   r   rf   node_name_list	node_time	node_freqnode_providerrj   rl   	node_namedevicerK   rn   before_percentagero   rq   rr   
percentager0   rp   s                       r$   parse_node_resultsr     s2    NIIME;& Ud]v~)W[\bWcJcV$$^R8@@RTU]]^lnpq  T&\)<
+/EE"F&\*-1HH#F&\*-1GG"FM1/5M),(3v===!6l9-G88I%)$U3$)$)$'+E{	)$'(	)$%%i0T%[ EC HE
 #	Y')$eEl*&%/
 $$Y3Z'nBz$/r2CD1IHUY?Z\]bce\ffhiqrthuuw  yB  xC  D	
 $ 
LL7	#c7J!LM	LL	LLEF%ioo&7^UYZ	85 9)$eEl*&%/
 $$Y3~R
4'88D/ERT:UWX`acWddfgpfqrs  [ Lr&   c                 2   i }i }d}i }i }i }i }	d}
i }| D ]  }|d   dk(  sd|v sd|v sd|d   v s|d   d   }|t         v r0d|d   vr-d|d	   v r%||	v r|	|xx   |d   z  cc<   n|d   |	|<   |
|d   z  }
d|d   j                  dd
      }||v r||xx   dz  cc<   nd||<   | d| }||v r||xx   |d   z  cc<   ||xx   dz  cc<   n|d   ||<   d||<   ||v r||xx   |d   z  cc<   n|d   ||<   ||v r||xx   |d   z  cc<   ||xx   dz  cc<   n|d   ||<   d||<   ||d   z  } d
dg}|j                  d       |j                  d       t        |j	                         d d      D ]h  \  }}|	j                  |d      }||z  }||z   }|||
z   z  }||   }||z  }|j                  |dd|dz  dd|dd|dz  dd|dd|dd|dd|        j |d
dgz  }|j                  d       |j                  d       t        |j	                         d d      D ]n  \  }}|j                  d      }|d   }|d   }|j                  dd
      }||   }||z  }|||   z  }|j                  |dd|dz  dd|dd|dd|dd|        p |S ) a  Group results by operator name.

    Args:
        sess_time (List[Dict]): profile data
        kernel_time_only (bool): Only include items for kernel time.
        use_gpu (bool): GPU is used in profiling or not.

    Returns:
        List[str]: lines of string for output.
    r   rD   rv   rI   rJ   rK   r0   fencerF   rx   r   rP   zGrouped by operatorrQ   uM   Total(μs)	Time%	Kernel(μs)	Kernel%	Calls	AvgKernel(μs)	Fence(μs)	Operatorc                     | d   S rS   rT   rU   s    r$   rW   z$group_node_results.<locals>.<lambda>  s    QqTr&   TrX   r[   r\   r]   r^   11dr_   z14.1fzGrouped by provider + operatoru<   Kernel(μs)	Provider%	Calls	AvgKernel(μs)	Provider	Operatorc                     | d   S rS   rT   rU   s    r$   rW   z$group_node_results.<locals>.<lambda>  s    RSTURVr&   ExecutionProviderz9.2fr   )rb   r   rc   rd   re   splitr   )r@   r   r   op_kernel_timeop_kernel_recordstotal_kernel_timeprovider_op_kernel_timeprovider_op_kernel_recordsprovider_kernel_timeop_fence_timetotal_fence_timeprovider_counterrl   rK   r0   rY   rn   rh   
fence_timekernel_time_ratio
total_time
time_ratiokernel_callsavg_kernel_timepartsshort_eprq   provider_time_ratios                               r$   group_node_resultsr   d  s*    N !#M;& Ud]v~)W[\bWcJc6l9-G 88f-d6l*-/%g.$u+=.15eg.$U3$F|''
B7H++ *a/*-. *Jay)C--',U;,*3/14//3E{',23*3///$X.$u+=.15e$X..(w'4;6'!'*a/**.u+w'-.!'*e,W Z &'E	LL	LLgh &~';';'=>[_ `"&&w2
'*;; :-
#47G#GH
(1%4#be!3D 9K;LBO`chOhimNnnpq}  A  qB  BD  ET  UZ  D[  []  ^h  il  ]m  mo  pw  ox  y	
 !a 
b233E	LL	LLTU"#:#@#@#B`de[		#8(##$7<*3/%-),@,JJ3r"5"=d!C2eBZrRabgQhhjkstvjwwy  {B  zC  D	
 f Lr&   c                 ~    t        | j                  d            t        k(  rt        | | j                  d            S d S )Nvalue)r	   
WhichOneofr   getattr)dims    r$   get_dim_from_type_protor     s4    489P4QUX4X73w/0b^bbr&   c                 z    | j                   j                  j                  D cg c]  }t        |       c}S c c}w N)tensor_typeshaper   r   )
type_protods     r$   get_shape_from_type_protor     s4    0:0F0F0L0L0P0PQ0P1#A&0PQQQs   8c                     i }| j                         D ]Z  }t        |j                        }g }t        |      D ]'  \  }}	t	        |	t
              s|j                  |       ) t        |      dkD  r yt        |      dkD  r|||d   <   t        |      dkD  r|||d   <   |j                  j                  j                  }
|
t        j                  t        j                  t        j                  fv sJ |
t        j                  k(  rt        j                  n2|
t        j                  k(  rt        j                   nt        j"                  }t        j$                  ||      }|||j&                  <   ] t)        |      D cg c]  }| }}|S c c}w )a  Create dummy inputs for ONNX model.

    Args:
        onnx_model (OnnxModel): ONNX model
        batch_size (int): batch size
        sequence_length (int): sequence length
        samples (int): number of samples

    Returns:
        List[Dict]: list of inputs
       Nr   r   dtype)'get_graph_inputs_excluding_initializersr   r	   	enumerater=   r   rc   lenr   	elem_typer   FLOATINT32INT64numpyfloat32int64int32onesrF   range)
onnx_model
batch_sizesequence_lengthsamplesdummy_inputsgraph_inputr   symbol_dimsir   r   	data_typedatar5   r2   s                  r$   create_dummy_inputsr     sb    L!IIK)+*:*:;&FAs#s#""1% '
 {a{a$.E+a.!{a$3E+a.!$$00::	[..0A0A;CTCTUUUU K--- MM!*k.?.?!?%++U[[ 	
 zz%y1)-[%%&/ L2 ).g71,J7 8s   >	Fc                 V    ddl m}m}  || |||      \  }	}
} ||||dd|	|
|d	      }|S )a-  Create dummy inputs for BERT model.

    Args:
        onnx_model (OnnxModel): ONNX model
        batch_size (int): batch size
        sequence_length (int): sequence length
        samples (int): number of samples
        input_ids_name (str, optional): Name of graph input for input IDs. Defaults to None.
        segment_ids_name (str, optional): Name of graph input for segment IDs. Defaults to None.
        input_mask_name (str, optional): Name of graph input for attention mask. Defaults to None.

    Returns:
        List[Dict]: list of inputs
    r   )find_bert_inputsgenerate_test_data{   F)
test_casesseedr   	input_idssegment_ids
input_maskrandom_mask_length)bert_test_datar   r   )r   r   r   r   input_ids_namesegment_ids_nameinput_mask_namer   r   r   r   r   r2   s                r$   create_bert_inputsr     sN    . D)9*nVfhw)x&I{J# 
J r&   c                    |||||z   d}i }| j                         D ]%  }t        |j                        }t        |      D ]0  \  }	}
t	        |
t
              s|
|vrt        d|
       ||
   ||	<   2 |j                  j                  j                  }|t        j                  t        j                  t        j                  fv sJ |t        j                  k(  rt        j                  n2|t        j                  k(  rt        j                  nt        j                   }t        j"                  ||      }|||j$                  <   ( t'        |      D cg c]  }| }}|S c c}w )a  Create dummy inputs for GPT-2 model.

    Args:
        onnx_model (OnnxModel): ONNX model
        batch_size (int): batch size
        sequence_length (int): sequence length
        past_sequence_length (int): past sequence length
        samples (int): number of samples

    Raises:
        RuntimeError: symbolic is not supported. Use the tool convert_to_onnx.py to export ONNX model instead.

    Returns:
        List[Dict]: list of inputs
    )r   seq_lenpast_seq_lentotal_seq_lensymbol is not supported: r   )r   r   r	   r   r=   r   RuntimeErrorr   r   r   r   r   r   r   r   r   r   r   rF   r   )r   r   r   past_sequence_lengthr   symbolsr   r   r   r   r   r   r   r   r5   r2   s                   r$   create_gpt2_inputsr   #  sL   $ !",(+??	G L!IIK)+*:*:;&FAs#s#g%&)B3%'HII&s|E!H '  $$00::	[..0A0A;CTCTUUUU K--- MM!*k.?.?!?%++U[[ 	
 zz%y1)-[%%&# L& ).g71,J7 8s   	E c                 "   ||d}i }| j                         D ]V  }t        |j                        }t        |      D ]0  \  }	}
t	        |
t
              s|
|vrt        d|
       ||
   ||	<   2 |j                  j                  j                  }|t        j                  t        j                  t        j                  fv sJ |t        j                  k(  rt        j                  n2|t        j                  k(  rt        j                  nt        j                   }d|j"                  v r#t        j$                  ||      }d|ddd|f<   nt        j&                  ||      }|||j"                  <   Y t)        |      D cg c]  }| }}|S c c}w )a  Create dummy inputs for Longformer model.

    Args:
        onnx_model (OnnxModel): ONNX model
        batch_size (int): batch size
        sequence_length (int): sequence length
        global_length (int): number of global tokens
        samples (int): number of samples

    Raises:
        RuntimeError: symbolic is not supported. Use the tool convert_longformer_to_onnx.py to export ONNX model instead.

    Returns:
        List[Dict]: list of inputs
    )r   r   r   globalr   r   N)r   r   r	   r   r=   r   r   r   r   r   r   r   r   r   r   r   r   rF   zerosr   r   )r   r   r   global_lengthr   r   r   r   r   r   r   r   r   r   r5   r2   s                   r$   create_longformer_inputsr   S  sl     (OLGL!IIK)+*:*:;&FAs#s#g%&)B3%'HII&s|E!H '  $$00::	[..0A0A;CTCTUUUU K--- MM!*k.?.?!?%++U[[ 	 {''';;uI6D&'DN]N"#::e95D)-[%%&- L0 ).g71,J7 8s   ?	Fc                     t        |       }t        ||j                        }|t        ||j                  |j                        z  }|t        ||j                  |j                        z  }|S r   )rA   rt   rf   r   r   r   r   )r6   rJ   profile_recordsrn   s       r$   process_resultsr     s]    '5O $..AE	1F1FWWE	1F1FUUELr&   c           	         | j                   dkD  r| j                   nt        j                  d      }dt        j                  vrt        |      t        j                  d<   ddlm} ddlm	}  | || j                              }d }| j                  dk(  rNt        || j                  | j                  | j                  | j                   | j"                  | j$                        }n| j                  dk(  r8t'        || j                  | j                  | j(                  | j                        }ns| j                  d	k(  r8t+        || j                  | j                  | j,                  | j                        }n,t/        || j                  | j                  | j                        }t1        | j                  | j2                  | j4                  | j6                  | j                   |      }|S )
Nr   F)logicalOMP_NUM_THREADS)r<   )	OnnxModelr   r   r   )r1   psutil	cpu_countosenvironr   onnxr<   r   r   modelr   r   r   r   r   r   r   r   r   r   r   r   r   r7   r   r0   r   )rJ   r*   r<   r   r   r2   r6   s          r$   r-   r-     s   %)__q%8$//f>N>NW\>]K 

*(+K(8

$%$4

+,JJF"'OO  LL!!  

 
		f	$'OO  %%LL

 
		l	*-OO  LL

 )T__dFZFZ\`\h\hi


L r&   __main__	Arguments)setup_loggerzMrequires either --model to run profiling or --input to read profiling resultsr   )r   )Fr   )NNN)!r   r;   r   r   r   r   r   rb   r%   r7   rA   rt   r   r   r   r   r   r   r   r   r   r-   __name__	argumentsr9   r,   r   r   inputr   r6   resultslinerT   r&   r$   <module>r     s      	    "8 I#X&M`TnaHcR'^ &R-`,^	3l z!I	+y!-""#??o oo9~ lI6Gd ! r&   