
    g`                     t   d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZm	Z	m
Z
 d dlZd dlmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z, d dl-m.Z.m/Z/ d dl0Z0 ejb                  e2      Z3i deddfdeddfdedd fdedd fdeddfdeddfde ddfd e dd fd!edd fd"e"dd fd#e$ddfd$eddfd%e(ddfd&e&dd'fd(e*ddfd)e,ddfd*eddfZ4dd+dd,g d+d+d-d.f	dd/d0e	e
e5ef      d1e6d2e	e5   d3e	e7   d4ee5   d5e6d6e6d7e5d8e7d9e	e5   d:e5fd;Z8	 	 	 	 dJd<ed=e5d>e7d?e7d@e	e   d:efdAZ9	 	 	 	 	 	 	 	 dKdd/dBe
e5ef   d=e5d>e7d?e7d@e	e   d3e	e7   d1e6dCe6d5e6d9e	e5   d:efdDZ:d2e5d:ee5e7f   fdEZ;dF Z<dG Z=dH Z>e2dIk(  r e>        yy)L    N)Path)DictListOptionalUnion)FusionOptions)
ModelProto
load_model)	OnnxModel)BartOnnxModel)BertOnnxModel)BertOnnxModelKeras)BertOnnxModelTF)ClipOnnxModel)ConformerOnnxModel)Gpt2OnnxModel)PhiOnnxModel)Sam2OnnxModel)T5OnnxModel)TnlrOnnxModel)UnetOnnxModel)VaeOnnxModel)extract_raw_data_from_modelhas_external_databartpytorch   bertbert_tftf2onnx
bert_keras
keras2onnxclip	conformergpt2gpt2_tfgpt_neoxphisam2swintnlrt5   unetvaevitFc    i   )provider
onnx_modeluse_gpuoptimized_model_path	opt_leveldisabled_optimizersverbosesave_as_external_dataexternal_data_filenameexternal_data_file_thresholdr3   returnc	                   |dv sJ ddl m} | |
j                  dd      } | J |rG|	Et        t	        j
                               j                  g d      rt        j                  d       | S t        | t              rt        t        | d	            n
t        |       }|j                         r|st        j                  d
       t	        j                         }|dk(  r t        j                   j"                  |_        nD|dk(  r t        j                   j&                  |_        nt        j                   j(                  |_        |Mt        | t              r$t        t+        |       j-                  d            }nd}dj/                  |||rdnd      }||_        |r]t3        |      dk(  r"t4        j6                  j9                  |      dz   }|j;                  d|       |j;                  dt        |             |rt=        d       d|_        i }|r||d<   |sdg}n|	D|	dk(  rdg}n)|	dk(  rdg}n |	dk(  rddg}n|	dk(  rdg}n|	d k(  rd!dg}ndg}|jA                  d       nBg }|jB                  r#|jA                  d       |jA                  d       n|jA                  d       t        | tD              rHtG        |       rtI        d"      tK        |       \  }}|jM                  tO        |      tO        |             t        | tD              r| jQ                         n| } t	        jR                  | |fd#|i| t4        j6                  jU                  |      rt4        j6                  jW                  |      sJ t        jY                  d$|       |S )%a  
    Use onnxruntime to optimize model.

    Args:
        onnx_model (str | ModelProto): the path of input onnx model or ModelProto.
        use_gpu (bool): whether the optimized model is targeted to run in GPU.
        optimized_model_path (str or None): the path of optimized model.
        opt_level (int): graph optimization level.
        disabled_optimizers (List[str]): a list of names of disabled optimizers
        save_as_external_data (bool): whether to save external data outside of ONNX model
        external_data_filename (str): name of external data file. If not provided, name is automatically created from ONNX model.
        external_data_file_threshold (int): threshold to decide whether to save tensor in ONNX model or in external data file
        provider (str or None): execution provider to use if use_gpu
    Returns:
        optimized_model_path (str): the path of optimized model
    )r   r-   r1   r   )versionNonnx_model_path)CUDAExecutionProviderROCMExecutionProviderMIGraphXExecutionProviderz3There is no gpu for onnxruntime to do optimization.Fload_external_dataab  This model uses float16 in the graph, use_gpu=False might cause extra Cast nodes. Most operators have no float16 implementation in CPU, so Cast nodes are added to compute them in float32. If the model is intended to use in GPU, please set use_gpu=True. Otherwise, consider exporting onnx in float32 and optional int8 quantization for better performance. r   r-   r2   optimized_modelz{}_o{}_{}.onnxgpucpuz.dataz7session.optimized_model_external_initializers_file_namez?session.optimized_model_external_initializers_min_size_in_bytesz@Using onnxruntime to optimize model - Debug level Set to verboser8   CPUExecutionProviderdmlDmlExecutionProviderrocmrB   migraphxrC   cudarA   tensorrtTensorrtExecutionProviderzModelProto has external data not loaded into memory, ORT cannot create session. Please load external data before calling this function. See https://onnx.ai/onnx/repo-docs/ExternalData.html for more information.	providersz)Save optimized model by onnxruntime to %s)-torchr?   popsetonnxruntimeget_available_providers
isdisjointloggererror
isinstancestrr   r
   use_float16warningSessionOptionsGraphOptimizationLevelORT_ENABLE_BASICgraph_optimization_levelORT_ENABLE_EXTENDEDORT_ENABLE_ALLr   with_suffixformatoptimized_model_filepathlenospathbasenameadd_session_config_entryprintlog_severity_levelappendhipr	   r   
ValueErrorr   add_external_initializerslistSerializeToStringInferenceSessionexistsisfiledebug)r4   r5   r6   r7   r8   r9   r:   r;   r<   r3   deprecated_kwargstorch_versionmodelsess_optionspath_prefixkwargsrQ   external_namesexternal_valuess                      W/var/www/openai/venv/lib/python3.12/site-packages/onnxruntime/transformers/optimizer.pyoptimize_by_onnxruntimer   H   su   < 
""".&**+<dC
!!! 	3356AA[
 	JK j#& 	*ZEBCz" 

 7t	
 --/LA~0;0R0R0c0c-	a0;0R0R0f0f-0;0R0R0a0a-#j#&d:.::2>?K+K/66{IX_uejk,@L)%&!+%'WW%5%56J%Kg%U"--EG]	
 	--MsSoOp	
 PQ*+'F(;$%+,			u/0I01I#46MNI01I#46MNI01I/0	894545 *j)Z(] 
 +Fj*Q'..tN/CT/EZ[ 4>j*3U--/[eJ  \YYYRXY77>>./BGGNNCW4XXX
LL<>RS    rz   
model_type	num_headshidden_sizeoptimization_optionsc                    |dvr"|dk(  s|dk(  rt         j                  d|        |t        vr$t         j                  d| d       t        |       S t        |   \  }}}| j                  r5|| j                  k7  r&t         j                  d| d| j                   d       |t        |      } || ||      }|j                  |       |j                          d	|j                  _        dd
l	m
}	 |	|j                  _        |S )ae  Optimize Model by graph fusion logic.

    Note that ONNXRuntime graph optimizations (like constant folding) will not be applied. So it is better to enable
    constant folding during exporting ONNX model, or run optimize_by_onnxruntime on the model first like optimize_model.

    For BERT model, num_heads and hidden_size are optional. For other model types, you need to specify these parameters.

    Args:
        model (ModelProto): model object
        model_type (str, optional): model type - like bert, bert_tf, bert_keras or gpt2. Defaults to 'bert'.
        num_heads (int, optional): number of attention heads. Defaults to 0.
                                   0 allows detect the parameter from graph automatically.
        hidden_size (int, optional): hidden size. Defaults to 0.
                                     0 allows detect the parameter from graph automatically.
        optimization_options (FusionOptions, optional): optimization options that turn on/off some fusions.
                                                        Defaults to None.

     Returns:
        object of an optimizer class.
    )r   r*   r.   r/   r#   r)   r   zFPlease specify parameters of num_heads and hidden_size for model_type Unsupported model type: z) for graph fusion, directly return model.z&Model producer not matched: Expected "z", Got "z0".Please specify correct --model_type parameter.zonnxruntime.transformers)__version__)rX   r]   MODEL_TYPESr   producer_namer   optimizetopological_sortrz   rU   r   producer_version)
rz   r   r   r   r   optimizer_classproducer_	optimizeronnxruntime_versions
             r   optimize_by_fusionr      s   6 HHi[\n`kop`p_`j_klm$1*=fgh%0%<"_hx5+>+>>4XJhuGZGZF[ \= =	

 #,Z8y+>I+, $>IOO!>':IOO$r   inputonly_onnxruntimec	          
      B   ||dv sJ |t         vrHt        j                  d| d       t        | t              rt        t        |             S t        |       S t         |   \  }
}}||}dg}d}t        j                         }dj                  ||rdnd      }t        j                  j                  |j                  |      }d	}t        | t              rt        | d	
      n| }t        |      rd}~|dkD  r||rg ng dz  }t        | ||	|||||      }n|dk(  rt        | ||	|d|||      }|r|st        j                  d       |t        |      }nt        | t              rt        |       }n| }|r |
|||      }nt!        |||||      }|j#                          |S )a
  Optimize Model by OnnxRuntime and/or python fusion logic.

    ONNX Runtime has graph optimizations (https://onnxruntime.ai/docs/performance/model-optimizations/graph-optimizations.html).
    However, the coverage is limited. We also have graph fusions that implemented in Python to improve the coverage.
    They can combined: ONNX Runtime will run first when opt_level > 0, then graph fusions in Python will be applied.

    To use ONNX Runtime only and no Python fusion logic, use only_onnxruntime flag and a positive opt_level like
        optimize_model(input, opt_level=1, use_gpu=False, only_onnxruntime=True)

    When opt_level is None, we will choose default optimization level according to model type.

    When opt_level is 0 and only_onnxruntime is False, only python fusion logic is used and onnxruntime is disabled.

    When opt_level > 1, use_gpu shall set properly
    since the optimized graph might contain operators for GPU or CPU only.

    If your model is intended for GPU inference only (especially float16 or mixed precision model), it is recommended to
    set use_gpu to be True, otherwise the model is not optimized for GPU inference.

    For BERT model, num_heads and hidden_size are optional. For other model types, you need specify these parameters.

    Args:
        input (str | ModelProto): input model path or ModelProto.
        model_type (str, optional): model type - like bert, bert_tf, bert_keras or gpt2. Defaults to 'bert'.
        num_heads (int, optional): number of attention heads. Defaults to 0.
            0 allows detect the parameter from graph automatically.
        hidden_size (int, optional): hidden size. Defaults to 0.
            0 allows detect the parameter from graph automatically.
        optimization_options (FusionOptions, optional): optimization options that turn on/off some fusions.
            Defaults to None.
        opt_level (int, optional): onnxruntime graph optimization level (0, 1, 2 or 99) or None. Defaults to None.
            When the value is None, default value (1 for bert and gpt2, 0 for other model types) will be used.
            When the level > 0, onnxruntime will be used to optimize model first.
        use_gpu (bool, optional): use gpu or not for onnxruntime. Defaults to False.
        only_onnxruntime (bool, optional): only use onnxruntime to optimize model, and no python fusion.
            Defaults to False.
        provider (str, optional): execution provider to use if use_gpu. Defaults to None.

     Returns:
        object of an optimizer class.
    Nr   r   r-   r1   r   z) for optimization, directly return model.ConstantSharingzmodel_o{}_{}.onnxrG   rH   FrD   Tr   )MatMulScaleFusionMatMulAddFusionMatmulTransposeFusionGemmActivationFusionBiasSoftmaxFusion)r5   r3   r6   r7   r8   r9   r:   zKPlease specify a positive value for opt_level when only_onnxruntime is True)r   rX   r]   rZ   r[   r   r
   tempfileTemporaryDirectoryre   rh   ri   joinnamer   r   r   cleanup)r   r   r   r   r   r7   r5   r   r9   r3   r   r   default_opt_levelr8   temp_model_pathtemp_diroptimized_model_namer6   has_external_data_fileoriginal_modelrz   r   s                         r   optimize_modelr     s   l 	] :::$1*=fgh/9%/EyE*+[9UZK[[.9*.E+_a*%	
 --O**,H.55i'W\]77<<7KL #DNuVYDZZ%@`eN(!%1} 
	
 2!5 3"8	
 
a
 2!5 3"8	
 de"?+	E3	5!#E9kB	&uj)[Rfg	 r   c                 T    t        | dd      }t        |      }|j                         S )z
    Get counter of fused operators in optimized model.

    Args:
        optimized_model_path (str): the path of onnx model.

    Returns:
        A dictionary with operator type as key, and count as value
    NT)re   rE   )r
   r   get_fused_operator_statistics)r6   rz   r   s      r   get_fusion_statisticsr     s,     +DTREe$I2244r   c                     t        j                  d      } | j                  ddt        d       | j                  ddt        d       | j                  d	d
t        j                  dt        t        j                               ddj                  t        j                               z          | j                  dd
t        dd       | j                  dd
t        dd       | j                  dd
dd       | j                  d
       | j                  dd
dd       | j                  d
       t        j                  |        | j                  dd
dd       | j                  d
       | j                  d d
dd!       | j                  d
"       | j                  d#d
t        d d$       | j                  d%d
dd&       | j                  d
'       | j                  d(d
t        g d)d d*+       | j                  d,d
dd-       | j                  d
.       | j                  d/d
dd0       | j                  d
1       | j                  d2d
dd3       | j                  d
4       | j                         }|S )5NztGraph optimization tool for ONNX Runtime.It transforms ONNX graph to use optimized operators for Transformer models.)descriptionz--inputTzinput onnx model path)requiredtypehelpz--outputzoptimized onnx model pathz--model_typeFr   z!Model type selected in the list: z, )r   r   defaultchoicesr   z--num_headsr   znumber of attention heads like 12 for bert-base and 16 for bert-large. Default is 0 to detect automatically for BERT.For other model type, this parameter need specify correctly.)r   r   r   r   z--hidden_sizezhidden size like 768 for bert-base and 1024 for bert-large. Default is 0 to detect automatically for BERT. For other model type, this parameter need specify correctly.z--input_int32
store_truezyUse int32 (instead of int64) inputs. It could avoid unnecessary data cast when EmbedLayerNormalization is fused for BERT.)r   actionr   )input_int32z	--float16zConvert all weights and nodes in float32 to float16. It has potential loss in precision compared to mixed precision conversion.)float16z	--verbosezshow debug information.r9   z	--use_gpuzZUse GPU for inference. Set this flag if your model is intended for GPU when opt_level > 1.)r5   z
--providerz$Execution provider to use if use_gpuz--only_onnxruntimez<optimized by onnxruntime only, and no graph fusion in Python)r   z--opt_levelr   zonnxruntime optimization level. 0 will disable onnxruntime graph optimization. The recommended value is 1. When opt_level > 1 is used, optimized model for GPU might not run in CPU. Level 2 and 99 are intended for --only_onnxruntime.)r   r   r   r   r   z--use_external_data_formatz4use external data format to store large model (>2GB))use_external_data_formatz--disable_symbolic_shape_inferzdiable symbolic shape inference)disable_symbolic_shape_inferz--convert_to_packing_modezEconvert the model to packing mode. Only available for BERT like model)convert_to_packing_mode)argparseArgumentParseradd_argumentr[   lowerrr   r   keysr   intset_defaultsr   add_arguments
parse_args)parserargss     r   _parse_argumentsr     s   $$VF 	DsAXY

TB]^
YY[%%'(0499[=M=M=O3PP   G   G   _	   E*
U	   &'
eLOhi
&
i	   &
3   K	   /
>  	 $C	   7
(.	   U;
#T	   6DKr   c                 d    | rt        j                  dd       y t        j                  d       y )NDEBUGz8[%(filename)s:%(lineno)s - %(funcName)20s()] %(message)s)levelfmtz%(funcName)20s: %(message)s)r   )coloredlogsinstallr   s    r   _setup_loggerr   )  s*    J	

 	 =>r   c                  b   t               } t        | j                         t        j	                  d|         t
        j                  j                  | j                        t
        j                  j                  | j                        k(  rt        j                  d       t        j                  |       }t        | j                  | j                  | j                  | j                   | j"                  || j$                  | j&                  | j(                  	      }| j*                  r|j-                  d       | j.                  r|j1                          |j3                          |j5                         }d| j                  v r'|j7                  |      rt        j9                  d       nt        j9                  d       | j:                  rA| j                  dk(  r|j;                  | j<                          nt        j                  d	       |j?                  | j                  | j@                         y )
Nz
arguments:zYSpecified the same input and output path. Note that this may overwrite the original model)r7   r   r5   r3   r   T)keep_io_typesr   z#The model has been fully optimized.zThe model has been optimized.z+Packing mode only supports BERT like models)!r   r   r9   rX   rw   rh   ri   realpathr   outputr]   r   parser   r   r   r   r7   r5   r3   r   r   convert_float_to_float16r   change_graph_inputs_to_int32get_operator_statisticsr   is_fully_optimizedinfor   r   save_model_to_filer   )r   r   r   fused_op_counts       r   mainr   3  s   D$,,
LL:dV$%	ww

#rww'7'7'DDrs(..t4

..1..
I ||***>..0 %%'<<>N Y%A%A.%Q9:34##??f$--$2S2S.STNNHI  d.K.KLr   __main__)r   r   r   N)r   r   r   NNFFF)?r   loggingrh   r   pathlibr   typingr   r   r   r   r   fusion_optionsr   onnxr	   r
   r4   r   onnx_model_bartr   onnx_model_bertr   onnx_model_bert_kerasr   onnx_model_bert_tfr   onnx_model_clipr   onnx_model_conformerr   onnx_model_gpt2r   onnx_model_phir   onnx_model_sam2r   onnx_model_t5r   onnx_model_tnlrr   onnx_model_unetr   onnx_model_vaer   
onnx_utilsr   r   rU   	getLogger__name__rX   r   r[   boolr   r   r   r   r   r   r   r    r   r   <module>r      s  (   	   . .  ( '   ) ) 4 . ) 3 ) ' ) % ) ) ' E 			8	$
]Iq)
]Iq) A. %|Q7	
 ]Iq) $i3 ]Iq) y!, 	1- 
L)Q' ]Iq) ]Iq) ]Iq) 	;	1
% ]Iq)  
L)Q'!" 
M9a(#, 48*.!%'"'"$(,J  #J sJ/0J J  #3-J  }	J 
 cJ  J   J   J  #&J  smJ  	J ^ 48888 8 	8
 #=18 8z 48#"J #Jj!JJ J 	J
 #=1J }J J J J smJ JZ5 5S#X 5zz?-M` zF r   