
    gD:                        d Z ddlZddlZddlZddlZddlZddlmZ ddlm	Z	 ddl
Z
ddlZddlZddlmZ d Zdej                  fdZd#d	ed
e	e   fdZdej                  dedefdZdej                  dedefdZdej                  dedej                  fdZdedej2                  defdZdej                  dededededefdZdej                  dedededef
dZ ej<                         d	ed
e	e   dededef
d        Zd! Z e!d"k(  rF e        Z" ee"jF                  e"jH                  e"jJ                  e"jL                  e"jN                         yy)$z
Export LLM to onnx
    N)Path)Optional)nnc                     d t         j                  j                  _        d t         j                  j                  _        d t         j                  j                  _        d t         j                  j                  _        d t         j                  j                  _        d t         j                  j                  _        d t         j                  j                  _	        d t         j                  j                  _
        y	)
z1do not init model twice as it slow initializationc                     | S N xargskwargss      b/var/www/openai/venv/lib/python3.12/site-packages/onnxruntime/transformers/large_model_exporter.py<lambda>z*disable_huggingface_init.<locals>.<lambda>   s        c                     | S r   r	   r
   s      r   r   z*disable_huggingface_init.<locals>.<lambda>   s    r   c                     | S r   r	   r
   s      r   r   z*disable_huggingface_init.<locals>.<lambda>   s    qr   c                     | S r   r	   r
   s      r   r   z*disable_huggingface_init.<locals>.<lambda>   s    r   c                     | S r   r	   r
   s      r   r   z*disable_huggingface_init.<locals>.<lambda>       qr   c                     | S r   r	   r
   s      r   r   z*disable_huggingface_init.<locals>.<lambda>   s    ar   c                     | S r   r	   r
   s      r   r   z*disable_huggingface_init.<locals>.<lambda>    r   r   c                     | S r   r	   r
   s      r   r   z*disable_huggingface_init.<locals>.<lambda>!   s    1r   N)torchr   initkaiming_uniform_uniform_normal_	constant_xavier_uniform_xavier_normal_kaiming_normal_orthogonal_r	   r   r   disable_huggingface_initr#      s     &BEHHMM"9EHHMM8EHHMM:EHHMM$@EHHMM!#?EHHMM $@EHHMM! <EHHMMr   modelc                 \   d}d}| j                         D ]9  }||j                         |j                         z  z  }||j                         z  }; d}d}| j                         D ]9  }||j                         |j                         z  z  }||j                         z  }; ||z   dz  dz  }|S )z-to calculate how much memory this model needsr      )
parametersnelementelement_sizebuffers)r$   
param_size	param_sumparambuffer_size
buffer_sumbufferall_sizes           r   get_model_parameter_sizer2   $   s    JI!!#enn&););)===
U^^%%	 $ KJ--/v(6+>+>+@@@foo''
 " [(D047HOr   hf_model	cache_dirc                    t                t        j                  j                  | t        j
                  |d      }|| }t        j                  j                  |      }t         |dd      j                               }||fS )zU
    get the pretrained torch model from hugginface,
    and sample model-inputs
    T)torch_dtyper4   trust_remote_codezHello, my dog is cutept)return_tensors)	r#   transformersAutoModelForCausalLMfrom_pretrainedr   float16AutoTokenizertuplevalues)r3   r4   	tokenizerr$   sample_inputss        r   "initialize_model_and_sample_inputsrC   4   s|     --==emmyTX > E 	**::9EI)$;DQXXZ[M-r   gpulistrB   c                    d }fd| j                         } g }|j                  | j                  |d             t        t	        | j                                     d   }| j                         D ]  \  }}|j                         D ]%  \  }}	|j                  |	j                  |d             t        |	      t        j                  j                  fv rt        j                  t        |	      t        |      z        }
t        |	      D ]s  \  }}|j                  |j                  |d             |t        ||
z  t        |               }|j                  |        ||       t!        d| d| d| d|        u |	j                  |d          t!        d| d| d|d           ( t        t#        |j                                     dk(  sg|j                  |d          t!        d| d|d            t        j$                         5   | |d   |d	   
       ddd       | S # 1 sw Y   | S xY w)z/Make the model executable across multiple GPUs.c           	         g }d }|D ]  }t        |      t        j                  ur|j                  |       nFt	        | d      r6|j                  |j                  | j                  j                               nt	        | d      rEt        | j                         |      j                  }|j                  |j                  |             nt	        t        | j                         d       d      rL|j                  |j                  t        | j                               j                  j                               nC|0|j                  |k7  r!|j                  |j                  |             n|j                  |       |||d   j                  } |j                         D ]5  \  }}t        |      t        j                  u s"|j                  |      ||<   7 t        |      |fS )Nweightr'   r   )typer   TensorappendhasattrtorG   devicenextr'   childrenitemsr?   )	modinputsr   modifyed_inputs	first_devlayer_inputrM   keyvalues	            r   input_gpu_device_hookz5auto_pipeline_parallel.<locals>.input_gpu_device_hookJ   sw   	!KK 4&&{3h'&&{~~cjj6G6G'HIl+cnn.<CC&&{~~f'=>cllnd3X>&&{~~d3<<>6J6Q6Q6X6X'YZ&;+=+=+J&&{~~i'@A&&{3 +A.55	 "  !,,.JCE{ell*#hhy1s ) o&//r   c                 j    | j                  |       | j                         D ]  } |d   |        y )N   )rL   named_children)rQ   devlayermove_layer_to_device_rurcs      r   r^   z9auto_pipeline_parallel.<locals>.move_layer_to_device_rurcc   s/    s'')E%eAh4 *r   Twith_kwargsr   zmove .z to rZ   attention_maskN)halfrJ   register_forward_pre_hookrN   iterr[   rH   r   r   
ModuleListmathfloorlen	enumerateminrL   printlistno_grad)r$   rD   rB   rX   	all_hookspre_fixtop_name
top_modulenamemodulenum_layers_on_each_gpuidx
attn_layerto_devr^   s                 @r   auto_pipeline_parallelrz   G   s   025
 JJLEIU445JX\4]^4,,./03G % 4 4 6*&557LD&V==>Sae=fgF| 3 344)-CK#g,4N)O&'0'8OC$$Z%I%IJ_mq%I%rs$S0F)FG%UVFMM&)-j&AE'!D63%tF8DE (9 		'!*%gYavT'!*>? 8 tJ--/01Q6MM'!*%E(4
|45# !7& 
mA}Q/?@ 
L 
Ls   H44H>	with_pastc                    g fd}| j                  |d      }t        j                  | j                        j                  }t        |j                               }|D cg c]  }|j                  |      j                   }} | |d   |d         }	|j                          d   |}
t        d         D ]  \  }}d   |   |
|<    d   j                         D ]  \  }}|j                  |      }||
|<    t        t        ||
            D ]]  \  }\  }}t        |      t        j                   u r|j#                  | j$                         d|v sD||
|<   |r | |d   |d   |      n|	}	_ ||
|	j&                  fS c c}w )	zn
    auto retrieve onnx inputs from torch model as we can't enumlate all possibilities
    for all models
    c                 4    j                  ||f       d   S )Nr   )rJ   )_rR   r   user_inputss      r   hook_for_inputsz-retrieve_onnx_inputs.<locals>.hook_for_inputs   s     FF+,1~r   Tr_   r   rZ   rb   	use_cache)rc   r   )re   inspect	signatureforwardr'   rn   keysgetdefaultremoverk   rP   indexziprH   r   rI   rL   rM   past_key_values)r$   rB   r{   r   hook_handleforward_params
input_keysrV   default_valuesoutonnx_inputsrw   _valrW   r   s                 @r   retrieve_onnx_inputsr      s   
 K 11/t1TK&&u}}5@@Nn))+,JAKL#n((-55NL
a q1A
BCa.K K{1~.	T&q>#.C /!!n**,
Us# C - 's:{'CD\c5;%,,&HHU\\"#(Kcl%a(q9IU^_ruC E {C$7$777# Ms   ""Fsample_inputs_tpreturnc                    t         j                  j                  d      j                  dz  dz  }t	        dt        |       dz   d       t	        d|dz   d       t        |       |dz  kD  rt        t         j                  j                               D cg c]  }t        j                  |       }}t        |      dkD  r&t	        t        |       d       t        | ||      } | S t	        d	       | j                         j                         } | S t	        d
       | j                         j                         } | S c c}w )a	  
    According to the model size, we will upload it to
    CPU if has no GPU or enough GPU memory,
    Single GPU if has only one GPU in local or model size is enough to fit one GPU
    Multiple GPU if there is more than one gpu in local and model is too large
    r   r&   zModel_Size = z GBztotal_mem_per_cpu = g?rZ   zk GPUs are used to export onnx,                    Please set CUDA_VISIBLE_DEVICES to use specific GPU groupz5!!!! convert model to float and export onnx using CPUzExport model on a single GPU)r   cudaget_device_propertiestotal_memoryrm   r2   rangedevice_countrM   rj   rz   cpufloatrd   )r$   r   total_mem_per_cpuidevice_collections        r   move_to_appropriate_devicer      s;    

88;HH4ORVV	M259$>?s
CD	 !24!7 8
<=&):T)AA6;EJJ<S<S<U6VW6VU\\!_6VW !A%()* +M N +52CEUVE L IJIIK%%'E L 	,-

!!#L Xs   D?rM   c                     g }| D ]N  }t        |t        j                        r!|j                  |j	                  |             >|j                  |       P t        |      S )zmove inputs to device)
isinstancer   rI   rJ   rL   r?   )rB   rM   sample_inputs_
sample_ints       r   adapt_inputs_to_devicer      sQ    N#
j%,,/!!*--"78!!*-	 $
   r   r   torch_input_namesr   input_with_pastc           	         d}ddi}|t        |      }t        j                  |d   d   j                        |d   j                  d   k(  j	                         j                  d      }|j                         dk(  sJ dd|j                         di}|s| j                  j                  }t        t        t        |            D 	cg c]%  }	t        ||	   t        j                        s!||	   ' c}	      }
d|
v rd|
v sJ d	       d
}ddddddd}t        |
      D ]>  \  }}||vst        ||   j                               D 	ci c]
  }	|	| d|	  }}	|||<   @ |r4t        |      D ]&  }	|
d|	 dfz  }
|
d|	 dfz  }
|||
d   <   |||
d   <   ( |s|r$t        |      D ]  }	|d|	 dfz  }|d|	 dfz  } t        |      D ]  \  }}|s	|dk(  r|||<   |dk(  r[||   }t        j                   |t        j"                  |j                  d   df|j$                  |j&                        fd      ||<   t|dk(  sz||   }|ddddf   ||<    |
||fS c c}	w c c}	w )z"fetch onnx inputs and outputs namer   
batch_sizeNrZ   seq_len	input_idsrc   z6input_ids and attention_mask must be existed in inputs)logits)r   rZ   )r   rc   __unknown_dims__zpast_key_values.z.keyz.valuezpresent.r   )rM   dtype)dim)rj   r   tensorshapenonzeroviewnumelitemconfignum_hidden_layersr?   r   r   rI   rk   r   catonesrM   r   )r$   r   r   r   r{   r   num_of_past_keykv_cache_axis	seq_indexr   onnx_inp_namesonnx_out_namesonnx_dynamic_axesrw   rt   unknown_dims	attn_maskr   s                     r   fetch_onnx_inputs_outputs_namer      s    O%M"o.\\/!"4Q"7"="=>+a.BVBVWYBZZccejjkmn	 A%%%L)..*:IF,,88 ',S1B-C'Dq'D!
S^_`SachcocoHp	1	'DqN 	~%*:n*L@?@L N%)4*y9
 ~.	T((DI+VYJZJ^J^J`DabDaqA#&6qc::DaLb&2d# / 'A!1!D9;;N!1!F;==N4AnR014AnR01 ( O'A!D133N!F355N ( 01	T((#2C ))',	#(99

IOOA,>+B9K[K[clcrcr st$C  $',	#,QV#4C  2 >+<<<U 	r cs   "I(II onnx_io_tuple	onnx_pathopsetc                 L   |j                   }|\  }}}t        j                         5 }	t        j                  j                  |	d      }
t        j                  j                  | t        |      |
d||||       |j                  d       |j                  | dz  j                  d       t        j                  t        |
            }t        j                  |t        |      t        t        j                   |	            dkD  d| ddd	       d
d
d
       y
# 1 sw Y   y
xY w)z do export with torch.onnx.exportztmp.onnxF)r$   r   fverboseopset_versioninput_namesoutput_namesdynamic_axesT)
missing_okz	_ext.datarZ   r&   )save_as_external_dataall_tensors_to_one_filelocationsize_thresholdconvert_attributeN)rt   tempfileTemporaryDirectoryospathjoinr   onnxexportr?   unlinkparentloadstr
save_modelrj   listdir)r$   r   r   r   r   onnx_model_namer   r   r   
tmpdirnametmp_onnx
onnx_models               r   do_export_internalr     s   nnO8E5NN$5 
	$	$	&*77<<
J7

{#&'* 	 		
 	D)			/y9	9AATARYYs8}-
	N#&rzz*'=#>#B$('(	2#	
% 
'	&	&s   C*DD#onnx_path_strc                    t        | |      \  }}t        ||      }t        |t        |j	                               j
                        }t        |||      \  }}	}
t        ||	||
|d      }d}t        |      j                         }|j                  dk7  r||z  }t        |||	||       |syt        ||	||
|d      }d}|j                  |z  }t        |||	||       y)z
    do export
    model: torch model
    onnx_path: where the onnx model saved to
    sample_inputs_tp: inputs for torch model
    Fz
model.onnxz.onnxNTzmodel_with_past.onnx)rC   r   r   rN   r'   rM   r   r   r   absolutesuffixr   r   )r3   r4   r   r{   r   r$   r   rB   r   r   past_key_valuer   r   r   s                 r   export_onnxr   6  s     A9UE&u.>?E*+;T%BRBRBT=U=\=\]M /C5-Yb.c+J^25+zSaclnstM"O=)224I7"/	um[)UK25+zSaclnrsM,O  ?2Ium[)UKr   c                  J   t        j                         } | j                  dddt        dgd       | j                  ddd	t        d
d       | j                  dd	t        dd       | j                  ddd	d       | j                  dd	t        dd       | j                         S )zarguments parsing.z-mz--modelTzmeta-llama/Llama-2-70b-hfz+Pre-trained models in huggingface model hub)requiredrH   r   helpz-sz--saved_pathFz./onnx_models/z"where the onnx model will be savedz--cache_dirNz[cache directly of huggingface, by setting this to avoid useless downloading if you have onez--with_past
store_truez;The tool will export onnx without past-key-value by default)actionr   r   z--opset   zothe opset to save onnx model,               try to increase it if this opset doens't have new features you want)argparseArgumentParseradd_argumentr   int
parse_args)parsers    r   parse_argumentsr   Z  s    $$&F
,-:    1   k   K	   S  	 r   __main__r   )(__doc__r   r   rh   r   r   pathlibr   typingr   r   r   r:   r   r#   Moduler2   r   rC   rn   r?   rz   boolr   r   rM   r   r   r   r   ro   r   r   __name__r   r$   r4   
saved_pathr{   r   r	   r   r   <module>r     s      	       
=BII     #  &:")) :d :5 :z 8		  8%  8D  8Fbii 5 RYY 8!% ! !% !A=99A=A= A= 	A=
 A= A=H!
bii !
 !
E !
^b !
kn !
H  L#  L(3-  L  LX\  Leh  L  LF+\ zD

DNNDOOT^^TZZX r   