
    g                       d Z ddlZddlZddlZddlZddlZddlmZ ddlm	Z	 ddl
mZmZmZmZmZ ddlZddlZddlZddlmZmZ ddlmZ ddlmZmZmZ dd	lmZ dd
lmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z& ddl'm(Z(m)Z)m*Z*m+Z+ ddl,m-Z. ddl/m0Z0 ddl1m2Z3 ddl4m5Z5m6Z6  ejn                  d      Z8 G d de      Z9dXdeee:      dejv                  fdZ<dejv                  fdZ=dejv                  fdZ>dYde:de?fdZ@dYde:de?de?fdZAde:de?de?de)fd ZBd!ej2                  d"efd#ZCd!ej2                  d"efd$ZDd!ej2                  d"efd%ZE	 	 	 	 dZd&ed'ed(e:d)eFd*eeG   d+eeG   fd,ZHd-ed.efd/ZI	 d[d!ed)eFdee   fd0ZJd1 ZKd2 ZLd3 ZMd4efd5ZNd4ed6e?d7e?de?fd8ZOd4efd9ZP	 d\d:ed;e:d<eFd=eFd>eFf
d?ZQd4efd@ZRd4efdAZSdBefdCZTdYdDe:de?fdEZU	 dYdDe:dFe:de?de?fdGZVdH ZWe9j                  fdejv                  dIe9fdJZYdejv                  d:ee e%f   dKej                  dLej                  dMeFdNeFdOeeeF      dee:ef   fdPZ[dQ Z\d]dejv                  dReee:      dSe?fdTZ]dXdejv                  dReee:      fdUZ^d^deee:      dReee:      fdVZ-e_dWk(  r e-        yy)_a  
This converts GPT2 or T5 model to onnx with beam search operator.

Example 1: convert gpt2 model with beam search:
    python convert_generation.py -m gpt2 --output gpt2_beam_search.onnx

Example 2: convert gpt2 model with beam search containing specific cuda optimizations:
    python convert_generation.py -m gpt2 --output gpt2_beam_search.onnx --use_gpu                       --past_present_share_buffer --use_decoder_masked_attention

Example 3: convert gpt2 model with beam search with mixed precision and enable SkipLayerNorm strict mode:
    python convert_generation.py -m gpt2 --output gpt2_beam_search.onnx --use_gpu -p fp16 --use_sln_strict_mode

Example 4: convert T5 model with beam search in two steps:
    cd ./models/t5
    python convert_to_onnx.py -m t5-small
    cd ../..
    python convert_generation.py -m t5-small --model_type t5                                            --decoder_onnx ./models/t5/onnx_models/t5-small_decoder.onnx                                    --encoder_decoder_init_onnx ./models/t5/onnx_models/t5-small_encoder_decoder_init.onnx          --output ./models/t5/onnx_models/t5_small_beam_search.onnx

Example 5: convert T5 model with beam search. All in one step:
    python convert_generation.py -m t5-small --model_type t5 --output ./models/t5/onnx_models/t5_small_beam_search.onnx

Example 6: convert T5 model with beam search containing specific cuda optimizations. All in one step:
    python convert_generation.py -m t5-small --model_type t5 --output ./models/t5/onnx_models/t5_small_beam_search.onnx           --use_gpu --past_present_share_buffer --use_decoder_masked_attention

Example 7: convert MT5 model with external data file like mt5-base-beamsearch.onnx.data in below example.
    python convert_generation.py -m google/mt5-base --model_type mt5 --output mt5-base-beamsearch.onnx -e

Example 8: convert gpt2 model with greedy search:
    python convert_generation.py -m gpt2 --output gpt2_greedy_search.onnx --num_beams 1 --num_return_sequences 1

Example 9: convert gpt2 model with sampling:
    python convert_generation.py -m gpt2 --output gpt2_sampling.onnx --num_beams 1 --num_return_sequences 1 --top_p 0.6
    N)Enum)Path)AnyDictListOptionalUnion)	Precisionsetup_logger)NumpyHelper)
GraphProto
ModelProtoTensorProto)	OnnxModel)
GPT2ConfigGPT2LMHeadModelGPT2Tokenizer	MT5ConfigMT5ForConditionalGenerationT5ConfigT5ForConditionalGenerationT5Tokenizer)GraphOptimizationLevelInferenceSessionSessionOptionsget_available_providers)main)PRETRAINED_GPT2_MODELS)export_onnx_models)PRETRAINED_MT5_MODELSPRETRAINED_T5_MODELS c                       e Zd ZdZdZdZd Zy)GenerationTypebeam_searchgreedy_searchsamplingc                     | j                   S N)value)selfs    `/var/www/openai/venv/lib/python3.12/site-packages/onnxruntime/transformers/convert_generation.py__str__zGenerationType.__str__U   s    zz    N)__name__
__module____qualname__
BEAMSEARCHGREEDYSEARCHSAMPLINGr-    r.   r,   r$   r$   P   s    J"LHr.   r$   argvreturnc                    t        j                         }|j                  d      }|j                  dddt        ddj                  t        t        z   t        z         z          |j                  dd	t        d
g dddj                  g d      z          |j                  dd	t        t        j                  j                  dd      d       |j                  dd	t        dd       |j                  dd	t        dd       |j                  dd	dd       |j                  d	       |j                  d      }|j                  ddt        d       |j                  d d!d	t        t        j                  t        j                  t        j                  gd"       |j                  d#d$d	d%d&gd'(       |j                  d)d*d	dd+       |j                  d	,       |j                  d-d.d	dd/       |j                  d	0       |j                  d1d2d	dd3       |j                  d	4       |j                  d5d6d	dd7       |j                  d	8       |j                  d9d:d	dd;       |j                  d	<       |j                  d=      }|j                  d>d	dd?       |j                  d	@       |j                  dAd	ddB       |j                  d	C       |j                  dDd	dE       |j                  d	F       |j                  dGt        d	dHdIJ       |j                  dKd	ddL       |j                  d	M       |j                  dNd	ddO       |j                  d	P       |j                  dQd	ddR       |j                  d	S       |j                  dTd	ddU       |j                  d	V       |j                  dWd	ddX       |j                  d	Y       |j                  dZd	dd[       |j                  d	\       |j                  d]d	dd^       |j                  d	_       |j                  d`      }|j                  dat        d	dbdcJ       |j                  ddt        d	dedfJ       |j                  dgt        d	dhdiJ       |j                  djt        d	dbdkJ       |j                  dlt         d	dbdmJ       |j                  dnt         d	dbdoJ       |j                  dpt         d	dqdrJ       |j                  dst         d	dqdtJ       |j                  dut         d	t!        dv       dwJ       |j                  dxt        d	dbdyJ       |j                  dzt         d	d{d|J       |j                  d}t        d	dHd~J       |j                  dt        d	ddJ       |j                  dt        d	ddJ       |j                  dt        d	ddJ       |j                  d      }|j                  dd	dd       |j                  d	       |j                  dd	dd       |j                  d	       |j                  dd	dd       |j                  d	       |j                  dd	dd       |j                  d	       |j                  dd	dd       |j                  d	       |j                  dd	t        dbd       |j                  dd	dd       |j                  d	       |j#                  |       }|S )zParse arguments

    Args:
        argv (Optional[List[str]], optional): _description_. Defaults to None.

    Returns:
        argparse.Namespace: Parsed arguments.
    zInput optionsz-m--model_name_or_pathTzEPytorch model checkpoint path, or pretrained model name in the list: , )requiredtypehelpz--model_typeFgpt2)r>   t5mt5z*Model type (default is gpt2) in the list: )r;   r<   defaultchoicesr=   --cache_dir.cache_modelsz%Directory to cache pre-trained models)r;   r<   rA   r=   z--decoder_onnxr"   zLPath of onnx model for decoder. Specify it when you have exported the model.z--encoder_decoder_init_onnxzgPath of ONNX model for encoder and decoder initialization. Specify it when you have exported the model.z	--verbose
store_truezPrint more information)r;   actionr=   )verbosezOutput options--outputz,Output path for onnx model with beam search.z-p--precisionzTPrecision of model to run. fp32 for full precision, fp16 for half or mixed precisionz-b--op_block_list*autozDisable certain onnx operators when exporting model to onnx format. When using defaultvalue for gpt2 type of model fp16 precision, it will be set to ["Add", "LayerNormalization", "SkipLayerNormalization", "FastGelu"]. Other situation, it will be set to [])r;   nargsrA   r=   z-e--use_external_data_formatz!save external data for model > 2G)use_external_data_formatz-sz--run_shape_inferencezrun shape inference)run_shape_inferencez-dpvsz--disable_pad_vocab_sizezDo not pad logits MatMul weight to be a multiple of 8 along the dimension where dim value is the vocab size. The logits MatMul may hence be of poor performance for fp16 precision.)disable_pad_vocab_sizez-dsgdz,--disable_separate_gpt2_decoder_for_init_runzDo not create separate decoder subgraphs for initial and remaining runs. This does not allow for optimizations based on sequence lengths in each subgraph)*disable_separate_gpt2_decoder_for_init_runz-iz--disable_shared_initializerszdo not share initializers in encoder and decoder for T5 or in the init decoder and decoder for GPT2. It will increase memory usage of t5/mt5/gpt2 models.)disable_shared_initializersz6Beam search parameters that stored in the output modelz--output_sequences_scoreszoutput sequences scores)output_sequences_scoresz--output_token_scoreszoutput token scores)output_token_scoresz--early_stopping)r;   rG   )early_stoppingz--no_repeat_ngram_sizer   zNo repeat ngram size)r<   r;   rA   r=   z--vocab_maskz\Enable vocab_mask. This mask applies only to every generated token to filter some bad words.)
vocab_maskz--past_present_share_bufferzWUse shared buffer for past and present, currently work for gpt2 greedy/sampling search.)past_present_share_bufferz--use_decoder_masked_attentionzUses `DecoderMaskedSelfAttention` or `DecoderMaskedMultiHeadAttention` to optimize the decoding Attention computation. Must be used with `past_present_share_buffer`. Currently, only Attention head sizes of 32, 64 and 128 are supported.)use_decoder_masked_attentionz--prefix_vocab_maskzeEnable prefix_vocab_mask. This mask can be used to filter bad words in the first generated token only)prefix_vocab_maskz--custom_attention_maskz]Enable custom_attention_mask. This mask can be used to replace default encoder attention mask)custom_attention_maskz--presence_maskz!Presence mask for custom sampling)presence_maskz--seedzRandom seed for sampling op)seedzYBeam search parameters not stored in the output model, for testing parity and performancez--min_length   zMin sequence lengthz--max_length2   zMax sequence lengthz--num_beams   z	Beam sizez--num_return_sequencesz&Number of return sequence <= num_beamsz--length_penaltyz<Positive. >1 to penalize and <1 to encourage short sentence.z--repetition_penaltyz-Positive. >1 to penalize and <1 to encourage.z--temperature      ?z6The value used to module the next token probabilities.z--top_pzTop P for samplingz--filter_valueInfzFilter value for Top P samplingz--min_tokens_to_keepzAMinimum number of tokens we keep per batch example in the output.z--presence_penalty        z%presence penalty for custom sampling.z--customz&If 1 customized top P logic is appliedz--vocab_sizezIVocab_size of the underlying model used to decide the shape of vocab maskz--eos_token_idzKcustom eos_token_id for generating model with existing onnx encoder/decoderz--pad_token_idzKcustom pad_token_id for generating model with existing onnx encoder/decoderz0Other options for testing parity and performancez--use_sln_strict_modez_Enable strict mode for SLN in CUDA provider. This ensures a better accuracy but will be slower.)use_sln_strict_mode	--use_gpuz)use GPU for inference. Required for fp16.)use_gpuz--disable_parityzdo not run parity test)disable_parityz--disable_perf_testzdo not run perf test)disable_perf_testz--torch_performanceztest PyTorch performance)torch_performancez--total_runsz4Number of times of inference for latency measurementz--save_test_dataz-save test data for onnxruntime_perf_test tool)save_test_data)argparseArgumentParseradd_argument_groupadd_argumentstrjoinr   r!   r    ospathset_defaultsr
   FLOAT32FLOAT16intfloat
parse_args)r6   parserinput_groupoutput_groupmodel_groupbeam_parameters_group
test_groupargss           r,   parse_argumentsr   Y   s    $$&F++O<KT
))*-AADYY
Z[   %9DIIF[<\\   S.14   [   %v   %	   &,,-=>L;	   !!""I$5$56c   X  	 $0   u=%lQf   %8"b   U;6G   O'E   %@++,deK#&	   U;"	   7/%UE2 #   k	   .%f	   u=(	   %@t	   u5!l	   590	   51*	   %("55c &&~C%YZav&w&&~C%Y[bw&x&&}3XY`k&l&& 5 '  &&K '  &&< '  &&E '  &&! '  &&u. '  &&P '  &&4 '  &&5 '  &&X '  &&Z '  &&Z '  **+]^Jn	   6eL?j   E*%	   51#	   e4'	   e4C   <	   51T"DKr.   r   c                    | j                   }d|d| j                  dd| j                  t        j                  k(  rdndddd	d
dg}| j
                  r|j                  d| j
                  g       | j                  r|j                  d       | j                  r|j                  d       t        | j                        r-|j                  dg       |j                  | j                         | j                  t        j                  k(  r| j                  sJ d       | j                  rt        j                  d|        t!        |       y)zqConvert GPT-2 model to onnx

    Args:
        args (argparse.Namespace): arguments parsed from command line
    r9   rI   z--optimize_onnxrJ   fp32fp16z--test_runs1z--test_cases10z--overwriterC   rg   rO   rK   zEfp16 or mixed precision model cannot run in CPU. Please add --use_gpuzarguments for convert_to_onnx:)r6   N)model_name_or_pathdecoder_onnx	precisionr
   rv   	cache_dirextendrh   appendrP   lenop_block_listrw   rH   loggerinfoconvert_gpt2_to_onnx)r   
model_name	argumentss      r,   gpt2_to_onnxr     s    ((J 	..I$5$556I ~~-89||%$$56
4+,-++,~~***||ddd|
 ||4YK@Ai(r.   c                    t        | j                  | j                  t        | j                        j
                  | j                  | j                  | j                  t        j                  k7  | j                  dddddd| j                        }t        j                  d|d           t        j                  d|d           |d   | _        |d   | _        y)	znConvert T5 model to onnx

    Args:
        args (argparse.Namespace): arguments parsed from command line
    FT)rh   rP   optimize_onnxr   rH   use_decoder_start_tokenmerge_encoder_and_decoder_init	overwritedisable_auto_mixed_precisionuse_int32_inputs
model_typezonnx model for encoder: r   zonnx model for decoder: r_   N)export_t5_onnx_modelsr   r   r   outputparentrh   rP   r   r
   rw   r   r   debugencoder_decoder_init_onnxr   )r   pathss     r,   
t5_to_onnxr     s     "T[[  !%!>!>~~):)::.. %'+%*??E" LL+E!H:67
LL+E!H:67%*1XD"aDr.   	onnx_pathrP   c                     ddl m} t        j                  | d      }|j	                  |dd      }|rt        j                  || |       y	t        j                  d       y	)
zShape inference on an onnx file, which will be overwritten.

    Args:
        onnx_path (str): Path of onnx model
        use_external_data_format(bool): output tensors to external data or not.
    r   )SymbolicShapeInferenceTload_external_dataF)
auto_mergeguess_output_ranksave_as_external_dataz4Failed to run symbolic shape inference on the model.N)	&onnxruntime.tools.symbolic_shape_inferr   onnx
load_modelinfer_shapesr   saver   warning)r   rP   r   modelouts        r,   shape_inferencer   (  sQ     NOOI$?E
 
-
-eX]
-
^C
sI=UVMNr.   c                    t        j                  | d      }|j                  j                  d   j                  }t        |      }|j                         }||v sJ ||   }|j                  dk7  ryd}|j                  |j                  d         }|9|j                  |dd      }	|	y|j                  |	j                  d         }|yd}|j                  t        j                  j                  k7  ryt        |j                         dk7  ry|j                   d   }
|
d	z  dk(  ryt#        j$                  |
d	z        d	z  }||
z
  }|j&                  r|rpt)        j*                  |j                   d   |ft(        j,                  
      }t)        j.                  t1        j2                  |      |fd      }||j                   d<   not)        j*                  ||j                   d   ft(        j,                  
      }t)        j.                  t1        j2                  |      |fd      }||j                   d<   |j5                         |_        nyt        j6                  || |       y)zPad the logits MatMul weight in the provided decoder model, which will be overwritten.

    Args:
        onnx_path (str): Path of onnx model
        use_external_data_format(bool): output tensors to external data or not.
    Tr   r   MatMulFr_   	Transpose      dtypeaxisr   )r   r   graphr   namer   output_name_to_nodeop_typeget_initializerinputmatch_parent	data_typer   DataTyperw   r   dimsmathceilraw_datanpzerosfloat16concatenater   to_arraytobytesr   )r   rP   decoder_model_protologits_output_namedecoder_modelr   matmul_nodepad_along_axis_1logits_weighttranspose_before_matmulactual_vocab_sizepadded_vocab_sizepaddingpadding_dataweight_with_paddings                  r,   pad_weights_of_logits_matmulr   :  sQ    //)M,2299!<AA12M';;=!4444%&89Kh&
 !11+2C2CA2FGM"/"<"<[+WX"Y"*%556M6S6STU6VW   +"6"6">">> =!# &**1-A!#		"3a"781<"33G 88]%7%7%:G$DBJJWL"$..+2F2F}2UWc1dkl"m$5Mq!88Wm.@.@.C$DBJJWL"$..+2F2F}2UWc1dkl"m$5Mq!!4!<!<!> NN&	Iabr.   
model_pathrh   rf   c                     t               }t        j                  |_        |rddgndg}|rPdt	               vrt        d      t        j                  d       |r"ddi}d|i}|D cg c]  }||v r|||   fn| }}t        | ||      }|S c c}w )a  Create OnnxRuntime session.

    Args:
        model_path (str): onnx model path
        use_gpu (bool): use GPU or not
        use_sln_strict_mode (bool): use strict mode for skip layer normalization or not

    Raises:
        RuntimeError: CUDAExecutionProvider is not available when --use_gpu is specified.

    Returns:
        onnxruntime.InferenceSession: The created session.
    CUDAExecutionProviderCPUExecutionProviderz5CUDAExecutionProvider is not available for --use_gpu!zuse CUDAExecutionProvider"enable_skip_layer_norm_strict_modeT)	providers)	r   r   ORT_DISABLE_ALLgraph_optimization_levelr   RuntimeErrorr   r   r   )	r   rh   rf   sess_optionsexecution_providerscuda_provider_optionsprovider_optionsr   ort_sessions	            r,   create_ort_sessionr     s     "#L,B,R,RL)OV24JK]s\t"*A*CCVWWKK34%I4$P! 79NOat#atY]$:J2J'-.PTTat   # #:|GZ[K#s   %Br   r   c           
         |t         j                  k(  }t        | j                        }|dz
  }|dk\  sJ g dt	        |      D cg c]  }d| 	 c}z   }t        | j                        t        |      k7  r-t        dt        |       dt        | j                               t        |      D ]  \  }}| j                  |   j                  |k7  r+t        d| d| d| j                  |   j                         t        j                  }|dk\  r"|rt        j                  nt        j                  }| j                  |   j                  j                  j                  }	|	|k7  st        d| d	| d|	        t        j                  d
       dgt	        |      D cg c]  }d| 	 c}z   }
t        | j                         t        |
      k7  r-t        dt        |
       dt        | j                                t        |
      D ]  \  }}| j                   |   j                  |k7  r+t        d| d| d| j                   |   j                         |rt        j                  nt        j                  }| j                   |   j                  j                  j                  }||k7  st        d| d	| d|        t        j                  d       yc c}w c c}w )a  Verify GPT-2 subgraph

    Args:
        graph (onnx.GraphProto): onnx graph of GPT-2
        precision (Precision): Precision (FLOAT16 or FLOAT32) of the model.

    Raises:
        ValueError: Number of inputs not expected.
        ValueError: Input name is not expected.
        ValueError: Input data type is not expected.
        ValueError: Number of outputs not expected.
        ValueError: Output name is not expected.
        ValueError: Output data type is not expected.
       r_   )	input_idsposition_idsattention_maskpast_ Number of inputs expected to be . Got Input  is expected to be $ is expected to have onnx data type z:Verifying GPT-2 graph inputs: name and data type are good.logitspresent_!Number of outputs expected to be Output z;Verifying GPT-2 graph outputs: name and data type are good.N)r
   rw   r   r   range
ValueError	enumerater   r   INT32FLOATr<   tensor_type	elem_typer   r   r   )r   r   
is_float16input_countlayer_countiexpected_inputsexpected_inputexpected_type
input_typeexpected_outputsexpected_outputoutput_types                r,   verify_gpt2_subgraphr    s    i///Jekk"K/K!E^cdo^pHq^pYZ5QRPS^pHqqO
5;;3//;C<P;QQWX[\a\g\gXhWijkk&7>;;q>.0vaS(;N;K6RWR]R]^_R`ReReQfghh#))63=K//;CTCTM[[^((44>>
&vaS(L]O[ablamnoo 8 KKLM zU;=O$P=Oxs^=O$PP
5<<C 011<SAQ=R<SSYZ]^c^j^jZkYlmnn'(89?<<??2wqc)<_<MVTYT`T`abTcThThSijkk/9++{?P?Pll1o**66@@-'vaS(L]O[abmanopp : KKMN A Ir" %Qs   KKc           
         |t         j                  k(  }|rt        j                  nt        j                  }t	        | j
                        }|dz
  dz  }|dk\  sJ ddg}t        |      D ]*  }|j                  d|        |j                  d|        , t        |      D ]*  }|j                  d|        |j                  d	|        , t	        | j
                        t	        |      k7  r-t        d
t	        |       dt	        | j
                               t        |      D ]  \  }}| j
                  |   j                  |k7  r+t        d| d| d| j
                  |   j                         |dk  rt        j                  n|}	| j
                  |   j                  j                  j                  }
|
|	k7  st        d| d|	 d|
        dg}t        |      D ]*  }|j                  d|        |j                  d|        , t	        | j                        t	        |      k7  r-t        dt	        |       dt	        | j                               t        |      D ]  \  }}| j                  |   j                  |k7  r+t        d| d| d| j                  |   j                         | j                  |   j                  j                  j                  }||k7  st        d| d| d|        y)  Verify T5 decoder subgraph

    Args:
        graph (onnx.GraphProto): onnx graph of T5 decoder
        precision (Precision): Precision (FLOAT16 or FLOAT32) of the model.

    Raises:
        ValueError: Number of inputs not expected.
        ValueError: Input name is not expected.
        ValueError: Input data type is not expected.
        ValueError: Number of outputs not expected.
        ValueError: Output name is not expected.
        ValueError: Output data type is not expected.
    r   ra   r_   r   encoder_attention_maskpast_key_self_past_value_self_past_key_cross_past_value_cross_r   r   r   r   r   r   present_key_self_present_value_self_r   r   N)r
   rw   r   r  r   r   r   r   r   r   r   r  r<   r  r  r   )r   r   r  
float_typer  r  r	  r  r
  r  r  r  r  r  s                 r,   verify_t5_decoder_subgraphr    s    i///J(2$$8I8IJekk"K?q(K! #$<=O;s34!1!56   ;45!21#67   5;;3//;C<P;QQWX[\a\g\gXhWijkk&7>;;q>.0vaS(;N;K6RWR]R]^_R`ReReQfghh-.U))
[[^((44>>
&vaS(L]O[ablamnoo 8 !z;"3A3 78"5aS 9:   5<<C 011<SAQ=R<SSYZ]^c^j^jZkYlmnn'(89?<<??2wqc)<_<MVTYT`T`abTcThThSijkkll1o**66@@*$wqc)Mj\Y_`k_lmnn :r.   c           
         |t         j                  k(  }t        | j                        dz
  dz  }|dk\  sJ g d}t        | j                        t        |      k7  r-t        dt        |       dt        | j                               t        |      D ]  \  }}| j                  |   j                  |k7  r+t        d| d| d| j                  |   j                         t        j                  }| j                  |   j                  j                  j                  }||k7  st        d| d	| d|        d
dg}	t        |      D ]*  }|	j                  d|        |	j                  d|        , t        |      D ]*  }|	j                  d|        |	j                  d|        , t        | j                        t        |	      k7  r-t        dt        |	       dt        | j                               t        |	      D ]  \  }}
| j                  |   j                  |
k7  r+t        d| d|
 d| j                  |   j                         |rt        j                  nt        j                  }| j                  |   j                  j                  j                  }||k7  st        d| d	| d|        t         j#                  d       y)r  r   ra   r_   )encoder_input_idsr  decoder_input_idsr   r   r   r   r   r   encoder_hidden_statesr  r  present_key_cross_present_value_cross_r   r   zMT5 encoder graph verified: name and data type of inputs and outputs are good.N)r
   rw   r   r   r   r   r   r   r   r  r<   r  r  r   r   r  r   r   )r   r   r  r  r	  r  r
  r  r  r  r  r  s               r,   'verify_t5_encoder_decoder_init_subgraphr"  -  s    i///Ju||$q(Q.K! [O
5;;3//;C<P;QQWX[\a\g\gXhWijkk&7>;;q>.0vaS(;N;K6RWR]R]^_R`ReReQfghh#))[[^((44>>
&vaS(L]O[ablamnoo 8$ !"9:;"3A3 78"5aS 9:   ;"4QC 89"6qc :;   5<<C 011<SAQ=R<SSYZ]^c^j^jZkYlmnn'(89?<<??2wqc)<_<MVTYT`T`abTcThThSijkk/9++{?P?Pll1o**66@@-'wqc)Mm_\bcnbopqq : KK_`r.   graph1graph2shared_prefixmin_elementssignature_cache1signature_cache2c                 	   i }i }g }g }	g }
| j                   D ]  }|j                  rt        |j                        |k\  s(|j                   D ]  }|j                  rt        |j                        |k\  s(t        j                  ||||      sA||j
                  z   ||j
                  <   |j                  |       |j
                  |vr@||j
                  z   }|||j
                  <   |	j                  |       |
j                  |          t        j                  d|
        | j                  D ]Q  }t        t        |j                              D ].  }|j                  |   |
v st        d|j                  |           S |j                  D ]Q  }t        t        |j                              D ].  }|j                  |   |
v st        d|j                  |           S |	D ]  }|j                   j                  |        |j                  D ]%  }|j
                  |v s||j
                     |_        ' |j                  D ]  }t        t        |j                              D ]m  }|j                  |   |v s||j                  |      }t        j                  d|j
                   d| d|j                  |    d|        ||j                  |<   o  |D ]  }| j                   j                  |        | j                  D ]%  }|j
                  |v s||j
                     |_        ' | j                  D ]  }t        t        |j                              D ]m  }|j                  |   |v s||j                  |      }t        j                  d|j
                   d| d|j                  |    d|        ||j                  |<   o  |	D ]  }||j
                     |_         |	D ]  }t         j"                  j%                  |      j&                  }t         j(                  j+                  |j
                  |j,                  |      }| j                  j                  |       |j                  j                  |        |	S )	a  Remove initializers with same value from two graphs.

    Args:
        graph1 (GraphProto): the first graph to process
        graph2 (GraphProto): the second graph to process
        shared_prefix (str): add prefix to the shared initializers among two graphs
        min_elements (int, optional): minimal number of elements for initializers to be considered. Defaults to 1024.
        signature_cache1 (dict): Optional dictionary to store data signatures of tensors in graph1 in order to speed up comparison
        signature_cache2 (dict): Optional dictionary to store data signatures of tensors in graph2 in order to speed up comparison
    zshared initializers:zname is found in graph 1: zname is found in graph 2: zgraph 2 rename node z input z from z to zgraph 1 rename node )initializerr   sumr   has_same_valuer   r   r   r   noder   r   r   r   remove
value_infor   numpy_helperr   shapehelpermake_tensor_value_infor   )r#  r$  r%  r&  r'  r(  mapping_initializers_1mapping_initializers_2shared_initializers_1shared_initializers_2shared_initializers_namesinitializer1initializer2shared_namer-  jr*  r/  new_namer1  s                       r,   remove_shared_initializersr>  q  s   &   "**!!c,*;*;&<&L"..L %%#l.?.?*@L*P''lDTVfg<ILL]L]<]&|'8'89%,,\:$$,BB"/,2C2C"CK@K*<+<+<=)00>-44[A /	 +& LL'(A'BCD s4::'Azz!} 99"%?

1#OPP (  s4::'Azz!} 99"%?

1#OPP (  -!!+. - ''
??444Z__EJO (
 s4::'Azz!} 661$**Q-@3DII;gaStzzZ[}o]abjaklm (

1	 (  -!!+. - ''
??444Z__EJO (
 s4::'Azz!} 661$**Q-@3DII;gaStzzZ[}o]abjaklm (

1	 (  -1+2B2BC - -!!**;7==[[778H8H+J_J_afg
  ,  , - ! r.   encoder_modelr   c                 2   t        |       }t        |      }|j                  d       |j                  d       i i }}|j                  |       |j                  |       t        |j                  j
                  |j                  j
                  d||      }|S )Ne_d_s_)r%  r'  r(  )r   add_prefix_to_namesremove_duplicated_initializerr>  r   r   )r?  r   encoderdecoderr'  r(  initializerss          r,   get_shared_initializersrI    s    &G&G%%)+R&))*:;))*:;-))L r.   c                    g }| j                   D ]8  }|j                  rt        |j                        |k\  s(|j                  |       : |D ]  }| j                   j	                  |        |D ]{  }t
        j                  j                  |      j                  }t
        j                  j                  |j                  |j                  |      }| j                  j                  |       } |S )a^  Remove initializers of a graph, when they have number of elements larger than a threshold.

    Args:
        graph (GraphProto): the graph.
        min_elements (int, optional): minimal number of elements for initializers to be considered. Defaults to 1024.

    Returns:
        List[TensorProto]: initializers that are removed from the graph.
    )r*  r   r+  r   r.  r   r0  r   r1  r2  r3  r   r   r/  )r   r&  moved_initializerstensorr*  r1  r/  s          r,   move_initializersrM    s     ##FKK 0L @!!&) $
 *  - * *!!**;7==[[778H8H+J_J_afg

+ *
 r.   c                    | j                   dk(  rt        d| j                   d      | j                   dk(  r| j                  }n#| j                   dk(  r| j                  }n| j                   dk(  r| j
                  }n| j                   dk(  r| j                  }n| j                   dk(  r| j                  }n| j                   d	k(  r| j                  }n| j                   d
k(  r| j                  }nz| j                   dk(  r| j                  }n^| j                   dk(  r| j                  }nB| j                   dk(  r| j                  }n&t        d| j                   d| j                    d      | j                  |fS )z
    Convert attribute to kwarg format for use with onnx.helper.make_node.
        :parameter attribute: attribute in AttributeProto format.
        :return: attribute in {key: value} format.
    r   z
attribute z does not have type specified.r_   r   r   ra            r   	   
   z has unsupported type rD   )r<   r   r   fr  stgfloatsintsstringstensorsgraphs)	attributer*   s     r,   _attribute_to_pairr^    s;    ~~:inn%55STUU ~~	1		1		1		1		1	  	1		1	!!	1	!!	2	  :inn%55KINNK[[\]^^NNE""r.   c                     i }| j                   D ]#  }t        |      \  }}|j                  ||i       % | j                  r|j                  d| j                  i       |S )Ndomain)r]  r^  updater`  )r-  kwargsattrkeyr*   s        r,   	kwargs_ofre  1  sV    F)$/esEl#  {{x-.Mr.   c                     t        | j                  j                  j                  j                  D cg c]&  }|j
                  r|j
                  n|j                  ( c}      S c c}w r)   )tupler<   r  r1  dim	dim_param	dim_value)vids     r,   shape_ofrm  ;  sJ    I\I\IbIbIfIfgIfA!++!++AKK?Ifghhgs   +A$subgc                    d}d}g }t        | j                        D ]  \  }}||k\  rft        |      }t        j                  j                  |j                  |j                  j                  j                  |d   |d   |d   d|d   g      }|j                  |g        |j                  t        j                  j                  dt        j                  j                  dg	      g       | j                  d
       | j                  j                  |       g }t        | j                        D ]  \  }}||k\  rft        |      }t        j                  j                  |j                  |j                  j                  j                  |d   |d   |d   d|d   g      }|j                  |g        | j                  d       | j                  j                  |       g }| j                  D ]  }	|	j                   dk(  rt#        |	      }
|
j%                  ddi       g }|j                  |	j                         t'        |      dk  r!|j                  dg       t'        |      dk  r!t'        |      dk  r|j                  dg       t        j                  j(                  d||	j                  fd|	j                  i|
}	|j                  |	g        | j                  d       | j                  j                  |       | S )Nr   r_   r   r   max_seq_lenra   r  r1  past_sequence_lengthr1  r   r   	AttentionrY   rP  r"   rQ  r   r-  )r   r   rm  r   r2  r3  r   r<   r  r  r   r   r  
ClearFieldr   r-  r   re  ra  r   	make_node)rn  input_past_0output_past_0
new_inputsr  rk  r1  new_outputs	new_nodesr-  rb  niss               r,   1update_decoder_subgraph_past_present_share_bufferr}  ?  s   LMJ4::&2RLE33''--77Qxq58]E!HM 4 B
 	2$ ' t{{99:PRVRbRbRhRhqrps9tuvOOGJJj!K4;;'2RLE33''--77Qxq58]E!HM 4 B
 	B4  ( 	OOHKK{#I		<<;&t_FMM6:;CJJtzz"c(Q,

B4  c(Q,3x!|

234;;((c4;;aTYYaZ`aD$   	OOFIIYKr.   is_beam_searchswitch_attentionc                    |rg }t        | j                        D ]  \  }}|j                  |g        |j                  t        j                  j                  dt        j                  j                  dg      g       |j                  t        j                  j                  dt        j                  j                  g d      g       | j                  d       | j                  j                  |       |r{g d}g }| j                  D ]9  }|j                  dk(  rt        |      }	|	j                         D ]0  }
|
d	k(  r  y
|
|vs|
dk7  rt        j                  d|
 d       |	|
= 2 g }|j                  |j                         |rot        |      dk  r!|j                  dg       t        |      dk  r!t        |      dk  r|j                  dg       t        |      dk  r|j                  dg       t        j                  j                   d||j"                  fd|j$                  i|	}|j                  |g       < | j                  d       | j                  j                  |       y)aS  Update the Attention nodes to DecoderMaskedSelfAttention.

    Args:
        subg (GraphProto): GraphProto of the decoder subgraph
        is_beam_search (bool): Boolean specifying if the sampling algo is BeamSearch
        switch_attention (bool): Boolean specifying if `Attention` is to be switched with `DecoderMaskedSelfAttention`
    
beam_widthr_   rs  cache_indirection
batch_sizer  rp  r   rY   	num_headsscalemask_filter_valuer`  rt  qkv_hidden_sizesFunidirectionalzRemoving attribute: zB from Attention node while switching to DecoderMaskedSelfAttentionrQ  r"   r   rR  DecoderMaskedSelfAttentionr   r-  T)r   r   r   r   r2  r3  r   r  ru  r-  r   re  copyr   r   r   rv  r   r   )rn  r~  r  ry  _irk  'decoder_masked_attention_supported_attrr{  r-  rb  kr|  s               r,   4update_decoder_subgraph_use_decoder_masked_attentionr  o  s3    


+FBrd# , 	4;;==lDL\L\LbLbkljm=nop22')9)9)?)?Gr 3 	
 	 

*%3
/ 	IID||{*"4A ..$ GG  00"NN"6qc9{ | #1I! '$ 

4::& "c(Q,

B4( c(Q,3x!|

L>23x!|

$7#89{{,,0#t{{IMV\ dV$I J 			#r.   c                    t               }g }t        | j                        D ci c]  \  }}|j                  | }}}i }i }| j                  D ]N  }|j                  D ]$  }	|	s|	|vr|g||	<   ||	   j                  |       & |j                  D ]
  }
|
s|||
<    P | j                  D ]  }|j                  dk(  s|j                  d   r|j                  d   s3|j                  d   |j                  d   }}d}| j                  D ]  }|j                  |k(  s|} n |zt        j                  j                  |      }|j                  dk(  s|j                         dk(  s|j                  d   |v s||   }|j                  dk(  s|j                  d   s|j                  d   |v s|j                  d   j                  d      s |j                  d   j                  d      sE|j                  |j                  d          |j                  |       t!        ||j                  d            dk(  s|j                  |        ||fS c c}}w )	az  Correct graph which originally use dim of past_seq_len from input_ids's shape which is fixed to max_seq_len after
       shared past/present buffer

    Args:
        subg (GraphProto): GraphProto of the decoder subgraph
    return:
        tensor_names_to_rename : set of tensor names which is equal to past_sequence_length
        nodes_to_remove : list of node to remove
    Gatherr_   r   Nr   Shaper  r  )setr   r   r   r-  r   r   r   r*  r   r0  r   sizeitem
startswithaddr   )rn  tensor_names_to_renamenodes_to_removeindexinpgraph_input_namesinput_name_to_nodesr   r-  
input_nameoutput_nameshape_tensor_nameshape_index_nameini_gather_indicesrL  gather_indices_arr
shape_nodes                    r,   find_past_seq_len_usager    sV    !UO;DTZZ;PQ;PZUC5;PQ		**J%887;f'
3'
3::4@ %  ;;K37#K0 '  		<<8#::a=

137::a=$**Q-/!%**;;"22)/& + ")!%!2!2!;!;<N!O!&&!+0B0G0G0IQ0NSWS]S]^_S`dwSw01BC
&&'1"((+"((+/@@"((+667GH%++A.99:LM +..t{{1~>#**40.z/@/@/CDEJ'..z:9 : "?22Y Rs   Ir   	attn_maskkv_num_heads
world_sizewindow_sizec                    | j                  t        j                  j                  dt        j
                  dgdg             t        j                  j                  d|dg|dz   g| j                  d            }t        j                  j                  d|dz   dgdg| j                  d            }t        j                  j                  d	dgd
g| j                  d	      t        j                        }t        j                  j                  d|g|dz   g| j                  d            }t        j                  j                  d|dz   dgdg| j                  d      d      }	t        j                  j                  d	dgdg| j                  d	      t        j                        }
| j                  j                  j                  j                  |||||	|
g       t        t        d | j                  j                  j                              }t        |      D ]  \  }}| j!                  |g dg d      }| j!                  |ddgddg      }d\  }}}||\  }}}n||\  }}| j!                  |g dg d      }| j!                  |ddgddg      }d\  }}}||\  }}}n||\  }}| j!                  |ddgddg      }| j!                  |dgdg      }d\  }}||\  }}n||d   }d}|/|-|j"                  D ]  }|j$                  dk(  s|j&                  }  d}|j"                  D ]  }|j$                  dk(  s|j&                  }  |j(                  d   |j(                  d   k(  xr |j(                  d   |j(                  d   k(  }|d uxr
 |d uxr |d u} |d u xr
 |d u xr |d u }!d\  }"}#}$|rJ| s|!rEt+        j,                  | j/                  |j(                  d               }%t+        j,                  | j/                  |j(                  d               }&t+        j,                  | j/                  |j(                  d               }'|%j0                  d    }(t3        j4                  |%|&|'fd!      j7                  |(d"|(z        })t        j8                  j;                  |)d#| $      })| j                  |)       t        j                  j                  d|j(                  d   |)j$                  g|)j$                   d%g| j                  d            }*| j                  j                  j                  j                  |*g       | j                  j                  j                  j=                  |       | j                  j                  j                  j=                  |       | j                  j                  j                  j=                  |       |*j>                  d   }"| rFt+        j,                  | j/                  |j(                  d               }+t+        j,                  | j/                  |j(                  d               },t+        j,                  | j/                  |j(                  d               }-|+j0                  d    }(t3        j4                  |+|,|-fd!      j7                  d"|(z        }.t        j8                  j;                  |.d&| $      }.| j                  |.       t        j                  j                  d|*j>                  d   |.j$                  g|.j$                   d%g'      }/| j                  j                  j                  j                  |/g       | j                  j                  j                  j=                  |       | j                  j                  j                  j=                  |       | j                  j                  j                  j=                  |       |/j>                  d   }"n-|j>                  d   }"|j>                  d   }#|j>                  d   }$t        j                  j                  d(|"|#|$|j(                  d)   |j(                  d*   |j>                  d   |
j>                  d   ||j(                  d   nd+||j(                  d"   nd+g	|j>                  |j$                  jA                  d,d(      d-||z  |dk(  r||z  n||z  |tC        |d uxr |d u      |.
      }0| j                  j                  j                  j=                  |       | j                  j                  j                  j                  |0g       |/| j                  j                  j                  j=                  |       |r| j                  j                  j                  j=                  |        | S )/Noner_   r   r   r   vals	ReduceSum	_row_sumsinputsoutputsr   Subseqlens_k_int64Cast	seqlens_k)r  r  r   tor  _shaper  total_seq_len_int64r   )r  r  r   r   total_seq_lenc                      | j                   dk(  S )NMultiHeadAttention)r   )r-  s    r,   <lambda>z&replace_mha_with_gqa.<locals>.<lambda>V  s    9M)Mr.   )RotaryEmbeddingAddr   )r   r   r   r  r   )NNN)r_   r   r   r  r   NNinterleavedr  )r"   r"   r"   re   r   r   QKV_Weight_r   _output	QKV_Bias_)r  r  GroupQueryAttentionrP  rQ  r"   r  com.microsoft)	r  r  r   r`  r  r  local_window_size	do_rotaryrotary_interleaved)"add_initializerr   r2  make_tensorr   INT64rv  create_node_namer  r   r   r-  r   listfilterr   match_parent_pathr]  r   r  r   r   r   r   r1  r   stackreshaper0  
from_arrayr.  r   replacerx   )1r   r  r  r  r  reduce_sum_nodesub_nodeseqlen_k_cast_noder  gather_nodetotal_seqlen_cast_node	mha_nodesidxr-  q_path_1q_path_2q_rotaryq_addq_matmulk_path_1k_path_2k_rotaryk_addk_matmulv_path_1v_path_2v_addv_matmulr  attr  root_input_is_sameall_paths_have_biasall_paths_have_no_biasq_input_to_attentionk_input_to_attentionv_input_to_attentionqwkwvwrh  
qkv_weightpacked_matmul_nodeqbkbvbqkv_biaspacked_add_nodegqa_nodes1                                                    r,   replace_mha_with_gqar    sE	    
!''	 	  	
 kk++5![()##K0	 , O {{$$K'/"###E*	 % H ..!"##F+ /  &&{X%&##G,	 ' J ++''H$e,&'##H- ( K "[[22%& !##F+ 3  
KK!!	($6
KQgh: VMu{{O`O`OeOefgIy)	T**41UW`a**42CX1NQRTUPVW$4!%(0%HeX!!)Hh **41UW`a**42CX1NQRTUPVW$4!%(0%HeX!!)Hh **4%1BQFK**4(aSA$x&OE8!{H H$8))88},"%%%K *
 	>>Cxx;&EE	 "
 &^^A.(..2CCnWXHY]e]k]klm]nHn $4/[E4E[%W[J[!&$!R5D=!RUd] LVH24H#6:P%%e&;&;HNN1<M&NOB%%e&;&;HNN1<M&NOB%%e&;&;HNN1<M&NOB((2,C2r2,Q7??QWMJ**55jUXTYGZ5[J!!*-!%!6!6 q):??;&OO,G45++H5	 "7 " KK""))+=*>?KK""))(3KK""))(3KK""))(3#5#<#<Q#?  # ))%*?*?A*OP ))%*?*?A*OP ))%*?*?A*OPhhrl88RRLq9AA!c'J,,77SVRWGX7Y%%h/"&++"7"7.55a8(--H (g67 #8 #
 !!&&--.?@!!&&--e4!!&&--e4!!&&--e4'6'='=a'@$ $,??1#5 #+??1#5 #+??1#5  ;;((!$$$

1

1"))!,&--a0%-%9q!r%-%9q!r
 KK""#79NO":-4@A4Ej0<[eKe)($.G843GH*) ) 
, 	%%d+%%xj1KK""))(3KK""))(3E *H Lr.   c           	         d}| j                   D cg c]  }|j                   }}|dk  r3||   j                  d      s|dz  }|dk  r||   j                  d      sd}t        | j                        |z
  dz  }d|z  |z   }t        |      D ci c]"  }| j                   |dz  |z      j                  |$ }}t        d|        t        | j                   |         }	t        d|	        |	d   }
|	d   }|	d   }d}| j                  D ]7  }|j                  dk(  s|j                   d   |v s&t        d	|j                   d
|j                          |dz  }||j                   d      }d| }dgdt        |j                        z
  z  }|j                  |       |j                  j                  |       |j                  j                  t        j                  j                  dd      g       t        j                  j!                  |t"        j$                  |
|d|g      }| j                  j                  |g       : ||k7  rt'        d| d|       y c c}w c c}w )Nr_   r   pastr   z    --past_key_cross_inputs=zpast_key_cross_0_shape is r   DecoderMaskedMultiHeadAttentionz'    -- add cross QK output from: node: z with output: output_cross_qk_r"   	output_qkz#Did not add cross QK for all layersz vs )r   r   r  r   r   r   printrm  r-  r   r   r   r]  r   r2  make_attributer3  r   r  r   )rn  input_self_past_0gir  output_self_present_0
num_layersinput_cross_past_0layerpast_key_cross_inputsinput_past_key_cross_0_shapebatch_size_dimnum_heads_dimcross_seq_len_dimnum_layer_output_qkr-  cross_attention_out_nameappended_namescross_attentions                     r,   .update_decoder_subgraph_output_cross_attentionr    s   +/::6:R:6
a
(9:K(L(W(WX^(_Q a
(9:K(L(W(WX^(_dkk"%::q@JZ*;;afgqarsarX]TZZ	4F(FGLLeSars	()>(?
@A#+DJJ7I,J#K 	&'C&D
EF1!4N03M4Q7		LL==DJJqMUjDj;DII;nUYU`U`Tabc1$)$**Q-8E)9%'A$ TQT[[)9%9:N!!":;KK~.NN!!4;;#=#=k1#M"NO"kk@@(+*;*;nm]^`q=rO KK01  j(>zl$ObNcdee )A 7 ts   I'I!c           
         d}| j                   D cg c]  }|j                   }}|dk  r3||   j                  d      s|dz  }|dk  r||   j                  d      sd}t        t	        | j                         |z
  dz        }d|z  |z   }g }g }| j
                  D ]$  }	|	j                  dk(  s|j                  |	g       & t	        |      |k  ryd }
| j
                  D ]  }	|	j                  dk(  s|	}
 n g d	}d
}t        |       \  }}t	        |      dkD  r|D ]  }t        d| d|         |D ]'  }t        d|j                   d|j                          ) t        j                  j                  ddgdgd      }t        j                  j                  ddg|gdt        j                        }|j                  ||g       | j
                  D ]  }	t	        |	j                        dkD  r|
|	j                  d   |
j                   d   k(  rbt        j                  j                  ddgdgdt        j                        }|j                  d   |	j                   d<   |j                  |g       |	j                  dk(  rt!        |	      }|j#                         D ]
  }||vs||=  |	j                   d   |	j                   d   |	j                   d   g}|j                  t	        |	j                         dkD  r|	j                   d   ndg       |j                  t	        |	j                         dkD  r|	j                   d   ndg       |j                  t	        |	j                         dkD  r|	j                   d   ndg       |j                  t	        |	j                         dkD  r|	j                   d   ndg       |j                  dg       |j                  dg       |j                  dg       |j                  t	        |	j                         dkD  r|	j                   d   ndg       d|d <   t        j                  j                  d!||	j                  fd"|	j                  i|}	|	|vst%        |	j                         D ]  \  }}||v s||	j                   |<    |j                  |	g        | j'                  d#       | j
                  j                  |       | j                   D cg c]  }|j                   }}g }t%        | j                         D ]  \  }}||k\  rg||k  rbt)        |      }t        j                  j+                  |j                  |j,                  j.                  j0                  |d   |d   d$|d   g%      }|j                  |g        d|vrK|j                  t        j                  j+                  dt        j                  j2                  dg&      g       d|vrK|j                  t        j                  j+                  dt        j                  j2                  dg&      g       d|vrL|j                  t        j                  j+                  dt        j                  j2                  g d'&      g       | j'                  d(       | j                   j                  |       g }t%        | j                        D ]~  \  }}||k\  rbt)        |      }t        j                  j+                  |j                  |j,                  j.                  j0                  |d   |d   d$|d   g%      }|j                  |g        | j'                  d)       | j                  j                  |       y*c c}w c c}w )+Nr_   r   r   ra   r   r  FRelativePositionBiasr  #past_sequence_length_squeezed_int64r   zFound tensor name z to be renamed to zFound node to removed: type:z, name:Squeezerr  past_sequence_length_squeezed!node_past_sequence_length_squeezer  r  &node_past_sequence_length_squeeze_cast)r   r  past_sequence_length_int64past_sequence_length_castr"   rO  rP  rQ  r  r  rY   r  r   r-  rp  rq  rs  r  r   r   T)r   r   r  rx   r   r-  r   r   r  r  r   r2  rv  r   r  r   re  r  r   ru  rm  r3  r<   r  r  r  )rn  r  r  r  output_self_past_0r	  r
  r{  	old_nodesr-  rel_pos_bias_noder  target_squeezed_past_seq_namer  r  name_to_renamenrsqueeze_node	cast_noderb  r  r|  r  r   r  orig_input_namesry  r  rk  r1  rz  s                                  r,   ?update_decoder_subgraph_share_buffer_and_use_decoder_masked_mhar(    s   +/::6:R:6
a
(9:K(L(W(WX^(_Q a
(9:K(L(W(WX^(_c$**o(99Q>?JZ*;;II		<<//dV$ 
 9~
" 		<<11 $ 
/+ %J!.Ed.K+O
!"Q&4N&~&66HIfHghi 5!B0GBGG9MN " {{,,#$,-4	 - 
 KK)),-*+9   * 
	 	,	23		t{{a$5$AdkkRSnXiXoXopqXrFr--'(-.0$$ . I &,,Q/DJJqMi[)<<//t_F[[]CCq	 # 

1

1

1C JJTZZ1)<

1"EFJJTZZ1)<

1"EFJJTZZ1)<

1"EFJJTZZ1)<

1"EFJJ./0JJ~&JJ+,-JJTZZ1)<

1"EF23F./;;((13JN))W]D &(4t11(EDJJu%  5 dV$Y \ 	OOFIIY,0JJ7JSJ7J4::&2!!a*<&<RLE33''--77Qxq=%(C 4 B
 	2$ ' %55[[//0FHXHXH^H^ghfi/jk	
 ++4;;==lDL\L\LbLbkljm=nop"2222')9)9)?)?Gr 3 	
 	OOGJJj!K4;;'2""RLE33''--77Qxq=%(C 4 B
 	B4  ( 	OOHKK{#g 7R 8s   ^ /^model_protoc                    t        |       }|j                         }g }g }|j                         D ]  }|j                  dk(  sd|j                  d   v rd|j                  d   v r7||j                  d      }||j                  d      }||j                  d      }|j                  |j                  d         }	|j                  |j                  d         }
|j                  |j                  d         }|	r|
r|s yt        j                  |	      }t        j                  |
      }t        j                  |      }t        j                  |||gd      }|j                  d	d
      }t        j                  j                  |dz   |	j                  dk(  rt        j                   nt        j"                  |j$                  d   |j$                  d   g|j'                         j)                               }| j*                  j,                  j/                  |g       t        j                  j1                  d	|j                  d   |dz   g|dz   g|      }|j2                  d   |j                  d<   d|j                  d<   d|j                  d<   |j/                  |g       |j/                  |||g        |j5                  |       |j7                  |       |j9                          |j;                          y)Nr  past_key_crossr_   past_value_crossr   r   Fr   r   
MatMul_QKV)name_prefix_weightr  _outr  r"   T)r   r   nodesr   r   r   r   r   r   r   r  r   r2  r  r   r   r  rw   r1  flattentolistr   r*  r   rv  r   	add_nodesremove_nodesupdate_graphtopological_sort)r)  
onnx_modelr   nodes_to_addr  r-  r  r  r  q_weightk_weightv_weightr  r  r  r  matmul_node_nameweightr   s                      r,   pack_qkv_for_decoder_masked_mhar?    s   ;'J$88:LO  "<<<<4::a=05G4::VW=5X*4::a=9H*4::a=9H*4::a=9H!11(..2CDH!11(..2CDH!11(..2CDHh%%h/B%%h/B%%h/BR1=J)::8Q]:^[[,,%	1/7/A/AQ/F+++KL_L_ &&q):+;+;A+>?'')002	 - F ))00&:++// q)+;i+GH)F23%	 0 K (..q1DJJqMDJJqMDJJqM.""Hh#ABU #X &O,!r.   decoder_onnx_pathc                 .   t        j                  | d      }t        t        |j                  j
                              D ]  }|j                  j
                  |   j                  dk(  s'|j                  j
                  |   j                  dk(  sP|j                  j
                  |   j                  j                  j                  j                  d   }|j                  d      r|j                          d|_         t        j                  || |       y)aQ  Update the input shapes for the inputs "input_ids" and "position_ids" and make the sequence length dim value 1 for each of them.
       The decoder model will be over-written.

    Args:
        decoder_onnx_path (str): Path of GPT-2 decoder onnx model
        use_external_data_format(bool): output tensors to external data or not.
    Tr   r   r   r_   ri  r   )r   r   r   r   r   r   r   r<   r  r1  rh  HasFieldClearrj  r   r   )r@  rP   r   r  shape_dim_protos        r,   *update_input_shapes_for_gpt2_decoder_modelrE    s     //*;PTU3*006678%%++A.33{B"((..q166.H177==a@EEQQWW[[\]^O ''4%%' )*O% 9 NN&(9Qijr.   init_decoder_onnx_pathc           	         t        j                  | d      }|j                  j                  d   j                  }t        |      }|j                         }||v sJ ||   }|j                  dk7  ry|j                  |g dg d      }||j                  |g dg d	      }|0|j                  |g d
g d      }||j                  |g dg d      }|y|d   }	|	j                  dk(  }
|
sqd}|j                  |	g d|dddg      }|d}|j                  |	g d|dddg      }|d}|j                  |	g d|ddg      }|d}|j                  |	g d|ddg      }nld}|j                  |	g d|ddg      }|d}|j                  |	g d|ddg      }|d}|j                  |	ddg|dg      }|d}|j                  |	ddg|dg      }|y|dk(  rdnd}|
s|j                  |	d|      }n|j                  |	d|      }|y|d   }|d   }t         j                  j                  dt        j                  dgdg      }t         j                  j                  dt        j                  dgdg      }t         j                  j                  dt        j                  dgdg      }t         j                  j                  dt        j                  dgdg      }|j                  |       |j                  |       |j                  |       |j                  |       d|j                  d   z   }t         j                  j                  d|j                  d   ddddg|g|j!                  dd            }|
s|j                  d   n|j                  d    }d|j                  d   z   }t         j                  j                  d|ddddg|g|j!                  dd!            }|j#                  |       |j#                  |       |j%                  ||j                  d   |       |j%                  |	||       |j'                          t        j(                  |||"       y)#a  Generates the initial decoder GPT2 subgraph and saves it for downstream use.
       The initial decoder model will be saved to init_decoder_onnx_path.

    Args:
        decoder_onnx_path (str): Path of GPT-2 decoder onnx model
        init_decoder_onnx_path (str): Path of GPT-2 init decoder onnx model
        use_external_data_format(bool): output tensors to external data or not.
    Tr   r   r   F)r  LayerNormalizationr  r  r  r   r  FastGelur  r   r  rH  r  )r   r   r   r_   r   r   r   r   r   r   r   r   r   )
r  SkipLayerNormalizationr  r   r  rI  r  r   r  rJ  )
r   r   r_   r   r   r   r   r   r   r   )rH  r  r  r   rI  r   rH  r  )r   r   r_   r   r   r   r   r   )rJ  r   rI  r   rJ  )r   r_   r   r   r   re   rJ  )r  r  r   rt  r_   )r  r   rt  )r  r   rt  rt  r  SliceLastTokenStartsr  SliceLastTokenEndsSliceLastTokenAxesSliceLastTokenStepsedge_modified_SliceGatherLastToken_0_r  r   GatherLastToken_1_r   )r   r   r   r   r   r   r   r   r  r   r2  r  r   r  r  rv  r  add_nodereplace_node_inputr7  r   )r@  rF  rP   init_decoder_model_protor   gpt2_init_decoder_modelr   logits_matmul_node"logits_matmul_to_residual_add_pathresidual_add_nodeis_skiplayernorm_path&residual_add_to_attention_parent_indexresidual_add_to_attention_path residual_add_to_add_parent_indexadd_before_residual_add	attentionmatmul_after_attentionslice_starts
slice_ends
slice_axesslice_stepsslice_0_output_nameslice_node_0add_before_residual_add_outputslice_1_output_nameslice_node_1s                             r,   generate_gpt2_init_decoderrk    s     $/@UYZ177>>qAFF'(@A1EEG!4444,-?@ !!X- *A)R)R	
 	0#*&* *1-D-V-V +.
*$ *1-D-V-Vm$.
* .51H1Z1Z"  
2. *1:2> .559QQ !12.)@)R)REHnpqstvwGx*
&
 *1562-D-V-V!67AqA.* *1562-D-V-V!#ADjlmopCq.*
 *1562-D-V-V!#ADjlmopCq.* 23.)@)R)R>Agijlm@n*
&
 *1562-D-V-V!#BEkmnpqDr.*
 *1562-D-V-V!Hk#:=cef<g.*
 *1562-D-V-V!Hk#:=cef<g.*
 &-,RVW,Wq]^$ !"9"F"Fu&F#
 #:"F"F79Y#
 &.r2I;B?;;**###ST	 + L ((!##ST	 ) J ((!##SS	 ) J ++))"##ST	 * K ++L9++J7++J7++K8 +Y-=-=a-@@;;((Q"  !
 %%$55g?ST ) L" 2G&&q)LcLjLjklLm # +-D-K-KA-NN;;((*"  !
 %%$55g?ST ) L $$\2$$\2 ../EyGWGWXYGZ\op../@B`buv ,,. NN+-C[str.   c                    t        d      }t        |j                        }t        |j                        }t        |j                        }| j                  j
                  D ]  }|j                  j                  j                  j                  D ]S  }|j                  d      s|j                  ||||fv s(t        |j                        }|j                          ||_        U  | j                  j                  D ]  }|j                  j                  j                  j                  D ]S  }|j                  d      s|j                  ||||fv s(t        |j                        }|j                          ||_        U  y)zoMake dim_proto numeric.

    Args:
        model: T5 encoder and decoder model.
        config: T5 config.
    r_   ri  N)rq   r  d_modeld_kvr   r   r<   r  r1  rh  rB  ri  rx   rC  rj  r   )	r   configsequence_lengthr  hidden_size	head_sizerL  	dim_protorj  s	            r,   make_dim_proto_numeric_t5rt    sE    !fOF$$%Ifnn%KFKK I++$$0066::I!!+.93F3F	K 4  	 3 34	!&/	# ; % ++##0066::I!!+.93F3F	K 4  	 3 34	!&/	# ; $r.   generation_typec                 B,   | j                   dk(  }|t        j                  k(  }|t        j                  k(  }|t        j                  k(  }| j
                  }t        j                  d|        t        | j                        dk(  ry| j                  d   dk(  rg|r^| j                  t        j                  k(  rAg d| _	        t        j                  d| j                          t        j                  d       ng | _	        |s|r;|st        d	      | j                  rt        d
      | j                  rt        d      |r|r| j                   st#        d      | j                   r|st#        d      | j                   r| j$                  st#        d      |r| j&                  rMt(        j*                  j-                  | j&                        r$t        j                  d| j&                          n5| j&                  swdj/                  | j0                  | j                  t        j                  k(  rdnd      }t3        t3        | j4                        j6                  |      j9                         | _        t        j                  d| j0                   d| j&                   d       t;        |        nv| j&                  r<| j<                  r0t        j                  d| j&                   d| j<                          n.t        j                  d| j0                   d       t?        |        d}| j@                  s| j                  t        j                  k(  rb|r`|s|s|rZt        j                  d| j&                   d       tC        | j&                  | jD                        }|st        jG                  d       d}	d}
| jH                  s|r|s|s|rt        j                  d| j&                   d        d!j/                  | j                  t        j                  k(  rdnd      }t3        t3        | j4                        j6                  |      j9                         }
tK        | j&                  |
| jD                        }	|	st        jG                  d"       |	r+tM        | j&                  | jD                        st#        d#      |s| jN                  s|	rtt        j                  d$| j&                   d       tQ        | j&                  | jD                         |	r/t        j                  d$|
 d       tQ        |
| jD                         |r,tS        jT                  | j0                  | jV                  %      }nf| j                   d&k(  r,tY        jT                  | j0                  | jV                  %      }n+t[        jT                  | j0                  | jV                  %      }| j\                  rt        j                  d'|        |j^                  }|r|j^                  n|j`                  }|jb                  }| jb                  d(k7  r| jb                  }| j^                  d(k7  r| j^                  }| j`                  d(k7  r| j`                  }te        jf                  | j&                  d)*      }| j                    d+|jh                  _5        d}| j                   dk(  rxtm        |jh                  | j                         |	rvte        jf                  |
d)*      }| j                    d,|jh                  _5        tm        |jh                  | j                         n to        |jh                  | j                         d}|rg d-}n|s|rg d.}| jp                  r|js                  d/       n|js                  d0       | jt                  r|js                  d1       n|js                  d0       | jv                  r|js                  d2       n|js                  d0       |rX| jx                  r| jz                  r|js                  d3       n|js                  d0       | j|                  r|js                  d4       d5g}| j                  r|js                  d6       | j                  r$| j                  sJ d7       |js                  d8       d}|r1td        j~                  j                  d9||d:| j                    ;      }ne|r1td        j~                  j                  d<||d=| j                    ;      }n2|r0td        j~                  j                  d>||d?| j                    ;      }d@|_A        d}|rtd        j~                  j                  dA|      td        j~                  j                  dB|      td        j~                  j                  dC| j                        td        j~                  j                  dD| j                  rdnd      td        j~                  j                  dE| j                   dk(  rdnd      g}n/|rtd        j~                  j                  dA|      td        j~                  j                  dB|      td        j~                  j                  dE| j                   dk(  rdnd      td        j~                  j                  dC| j                        g}n|rtd        j~                  j                  dA|      td        j~                  j                  dB|      td        j~                  j                  dE| j                   dk(  rdnd      td        j~                  j                  dC| j                        td        j~                  j                  dF| j                        td        j~                  j                  dG| j                        td        j~                  j                  dH| j                        td        j~                  j                  dI| j                        td        j~                  j                  dJ| jx                        td        j~                  j                  dK| j                        g
}|r0|j                  td        j~                  j                  dL|      g       |j                  j                  |       g }| j                   dMv rz| jN                  rCt        j                  dN| j<                   d       tQ        | j<                  | jD                         te        jf                  | j<                  d)*      }| j                    dO|jh                  _5        t        |jh                  | j                         t        ||       t        ||       |r| j                   st#        dP      t        j                  dQ       t        |jh                        rt        j                  dR       nt        j                  dS       t        |      rt        j                  dT       nt        j                  dU       | j                  sHt        ||      }t        j                  t        |       dV|D cg c]  }|jj                   c} dW       |j                  j                  td        j~                  j                  dX|jh                        td        j~                  j                  dY|jh                        td        j~                  j                  dZt        |jh                  j                        d[k(  r|j                  nd(      g       n|	r| j                  sHt        ||      }t        j                  t        |       dV|D cg c]  }|jj                   c} d\       |r*t        j                  d]       t        |jh                         | j                   r"t        |jh                  |d      st#        d^      |j                  js                  td        j~                  j                  d_|jh                               n6t        |jh                        }t        j                  t        |       d`       |r*t        j                  da       t        |jh                         | j                   r"t        |jh                  |d)      st#        db      |j                  js                  td        j~                  j                  dY|jh                               td        j~                  j                  dct        j                  dddeg      }td        j~                  j                  dft        j                  dg      }td        j~                  j                  dgt        j                  dg      }td        j~                  j                  dht        j                  dg      }td        j~                  j                  dit        j                  dg      }td        j~                  j                  djt        j                  dg      }td        j~                  j                  dkt        j                  dg      }d} |r
|||||||g} n
|s|r||||g} | jp                  rAtd        j~                  j                  d/t        j                  |g      }!| js                  |!       | jt                  rBtd        j~                  j                  d1t        j                  dd|g      }"| js                  |"       | jv                  rBtd        j~                  j                  d2t        j                  dddeg      }#| js                  |#       | jx                  rN| jz                  rBtd        j~                  j                  d3t        j                  dd|g      }$| js                  |$       |rM| j|                  rAtd        j~                  j                  d4t        j                  dg      }%| js                  |%       d}&|r2td        j~                  j                  d5t        j                  g dl      }&n5|s|r1td        j~                  j                  d5t        j                  dddfg      }&|&g}'| j                  rBtd        j~                  j                  d6t        j                  dddig      }(|'js                  |(       | j                  rDtd        j~                  j                  d8t        j                  dmdddh|g      })|'js                  |)       td        j~                  j                  |g|s| j                    dnn| j                    do| |'|      }*td        j~                  j                  |*dp|j                  q      }+| jD                  rpddrl^m_}, |,j                  td        j                        |,j                  ds      k  rt        jG                  dt       t        j                  |+| j4                  d)d)u       n te        j                  |+| j4                         t        j                  dv| j4                          yc c}w c c}w )wzConvert model according to command line arguments.

    Args:
        args (argparse.Namespace): arguments parsed from command line
    r>   z**** past_present_share_buffer=r_   r   rM   )r  rH  rJ  rI  z**** Setting op_block_list to zI**** use --op_block_list if you want to override the block operator list.z<Currently only gpt2 with greedy search/sampling is supportedzLoutput_sequences_scores currently is not supported in greedy search/samplingzHoutput_token_scores currently is not supported in greedy search/samplingzi`use_decoder_masked_attention` MUST be turned on to use `past_present_share_buffer` in case of BeamSearchzS`past_present_share_buffer` MUST be turned on to use `use_decoder_masked_attention`z?`use_decoder_masked_attention` option is only supported on GPUsz)skip convert_to_onnx since path existed: z{}_past_{}.onnxr   r   zConvert GPT model z	 to onnx z ...z,skip convert_to_onnx since paths specified: z and zConvert model z to onnx ...Fz=Pad logits MatMul weights for optimal MatMul perf in fp16 on z. The file will be overwritten.z]Tried and failed to pad logits MatMul weights. Performance may be sub-optimal for this MatMulNz*Creating an initial run GPT2 decoder from z. zgpt2_init_past_{}.onnxzuTried and failed to generate the init decoder GPT2 model. Performance may be sub-optimal for the initial decoding runzGCould not update the input shapes for the non-initial decoder subgraph.z Run symbolic shape inference on r   r?   zConfig=re   Tr   z decoderz init decoderr   
max_length
min_length	num_beamsnum_return_sequenceslength_penaltyrepetition_penaltyr   ry  rz  r~  rX   r"   r[   r   r]   r^   	sequencessequences_scoresz8--output_token_scores requires --output_sequences_scoresscores
BeamSearchBeamSearch_r  GreedySearchGreedySearch_Sampling	Sampling_r  eos_token_idpad_token_idno_repeat_ngram_sizerW   r   temperaturetop_pfilter_valuemin_tokens_to_keepcustompresence_penalty
vocab_sizer?   r@   zSymbolic shape inference on z encoder and decoder initzMpast_present_share_buffer is only supported with use_decoder_masked_attentionzl*****update t5 decoder subgraph to share past/present buffer and use decoder_masked_multihead_attention*****z4*****update t5 decoder subgraph successfully!!!*****zF*****DecoderMaskedMultiHeadAttention is not applied to T5 decoder*****z9*****pack qkv for decoder masked mha successfully!!!*****z3*****pack qkv for decoder masked mha failed!!!*****z shared initializers (z>) in encoder and decoder subgraphs are moved to the main graphrF  rG  decoder_start_token_idr   zC) in decoder and init decoder subgraphs are moved to the main graphzY*****update init decoder subgraph to make past and present share buffer******************zLCould not update the init decoder subgraph to use DecoderMaskedSelfAttentioninit_decoderz: initializers from the decoder are moved to the main graphzT*****update decoder subgraph to make past and present share buffer******************zGCould not update the decoder subgraph to use DecoderMaskedSelfAttentionr   r  rp  ry  rz  r{  r|  r}  r~  )r  r|  ry  zmax_length - sequence_lengthz beam searchz greedy searchzonnxruntime.transformers)producer_nameopset_imports)versionz1.12.0z0Require onnx >= 1.12 to save large (>2GB) model!)r   all_tensors_to_one_filezmodel save to )dr   r$   r2   r3   r4   rY   r   r   r   r   r   r
   rw   NotImplementedErrorrU   rV   rZ   r   rh   r   rs   rt   existsformatr   r   r   r   as_posixr   r   r   rR   r   rP   r   rS   rk  rE  rQ   r   r   from_pretrainedr   r   r   rH   r  r  r  r   r   r   r   r  r  rX   r   r[   r\   r  r]   r^   r2  rv  r`  r  r  rW   r  r  r  r  r  r   r]  r"  rt  r(  r?  rT   rI  r   r  r}  r  rM  r3  r   r  r  
make_graph
make_modelopset_import	packagingr  parse__version__r   r   )-r   ru  is_gpt2is_beamsearchis_greedysearchis_samplingrY   onnx_filenamelogits_matmul_weight_paddedgpt2_init_decoder_generatedgpt2_init_decoder_onnx_pathgpt2_init_decoder_onnx_filenamero  r  r  r  r   rW  r  r  r-  attr_to_extendrH  r?  r  r   ry  rz  r{  r|  r}  r~  graph_inputsrX   r[   r   r]   r^   r  graph_outputsr  r  	new_graph	new_modelr  s-                                                r,   convert_generation_modelr  &  s    OOv-G)^-F-FFM+~/J/JJO'>+B+BBK&*&D&D
KK12K1LMN
4!#(:(:1(=(Gt~~):)::!dDKK89K9K8LMNKKcd!#D+%&dee''%&tuu##%&pqq !]4;\;\w
 	
 ((1Jnoo ((Z[[0A0A!BKKCDDUDUCVWX$$ 1 8 8++t~~IZIZ7ZV`f! %)dkk):)A)A=$Q$Z$Z$\!KK,T-D-D,EYtO`O`Naaefg!?!?KK>t?P?P>QQVW[WuWuVvw KK.)@)@(ANOt #(''NNi///oKDL]L]K^ _, ,	
 'C4CTCTVZVsVs&t#*NNo #("&;;o@ARAR@SSUVW*B*I*Inn	(9(99Fv+
' '+4+<+C+CEd&e&n&n&p#&@:D<Y<Y'
# +NNN '/Yt<<0
 fgg
 #d&>&>B]6t7H7H6IIhij))4+H+HI&KK:;V:WWvwx79V9VW++D,C,Ct~~^	D	 ))$*A*AT^^\**4+B+Bdnn]||gfX&'&&L*16&&v7J7JL""J "__
B((B((OOD$5$5$OM"&//!2(;M"& ]00$..A '&*oo6Qfj&k#48OO3DM1R#)). !8!>!>O"=#6#6GF
 
K
 l#b)*b!!&'b;;4--MM/*MM"99MM&!mG##)*++g-gg+x D{{$$t/0	 % 
 
{{$$  12	 % 
 
{{$$T__-.	 % 
 "DKNKK&&~|DKK&&~|DKK&&'=t?X?XYKK&&'7d>Q>QWXYKK&&|$//V:SQYZ[
 
KK&&~|DKK&&~|DKK&&|$//V:SQYZ[KK&&'=t?X?XY	
 
KK&&~|DKK&&~|DKK&&|$//V:SQYZ[KK&&'=t?X?XYKK&&}d6F6FGKK&&w

;KK&&~t7H7HIKK&&';T=T=TUKK&&x=KK&&'94;P;PQ
 #t{{99,
STUNN.)L-'##KK6t7U7U6VVuvwD::D<Y<YZ(F(F[_`&*oo%66O#P /0C0CT^^T!-8!-8 %44 !pqqKK~ O}ObObcRSde.}=WXQR//2=-PLKK|$%%;\<Z\QVV\<Z;[  \Z  [ 	**9m6I6IJ**9m6I6IJ**,589L9L9R9R5SWX5XF11^`		
 ' 3367NP]^<())?Q]@^Q]AQ]@^?_  `c  d
 )wxABYB_B_` 009m'--}e: !!oppNN!!$++"<"<^MdMjMj"kl -]-@-@ALKK3|,--ghi %KKno=m>Q>QR ,,5i6
 fggdkk88MDWDWXY 22;@Q@QT`bsStuI33L+BSBSVWUXYJ33L+BSBSVWUXYJ22;@Q@QTUSVWI;;==>TVaVgVgjkilm[[778H+J[J[^_]`aN;;<PR]RcRcfgehiL 
 
K	
 [[77kFWFWZdYef
J' KK>>!2!2\:4N
 	-.!!;;k//,@Q1R
 	N+{{t))::[..z0J
 	M*tyy{{11&+:K:KaSQD! IKK66@
	
 
KKK66<(
	 KM##;;== 1 1LBX3Y
 	-.33+\;
S

 	V$&&	0?4??
<(GXXfEgI &&0#00 ' I $$%==))*W]]8-DDNNMNKK"&$(		
 			)T[[)
KK../i =[: A_s   AXAXr   r   r  r  bad_words_idsc                    | j                   r)t        j                  j                         st	        d      | j
                  t        j                  k(  r|j                          t        j                  | j                   rdnd      }|j                  |       t        j                  d       |j                  |      }|j                  |      }g }t        | j                        D ]  }	t        j                         }
|j                  ||| j                   | j"                  | j$                  | j&                  | j(                  ||| j*                  | j,                  | j.                  |r|ndd| j0                  xs | j2                        }	|j5                  t        j                         |
z
          |j6                  d   }dd	lm}  |||      S )
a  Test PyTorch performance of text generation.

    Args:
        args (argparse.Namespace): arguments parsed from command line
        model (Union[GPT2LMHeadModel, T5ForConditionalGeneration]): PyTorch model
        input_ids (torch.Tensor): input_ids
        attention_mask (torch.Tensor): Attention mask
        eos_token_id (int): EOS token ID
        pad_token_id (int): Padding token ID
        bad_words_ids (List[List[int]]): Words shall not be generated.

    Raises:
        RuntimeError: PyTorch with CUDA is not available for --use_gpu

    Returns:
        Dict[str, Any]: A dictionary with string with metric name, and value can be integer or string.
    z=Please install PyTorch with Cuda for testing gpu performance.zcuda:0cpuFNTr   r   ry  rz  r{  rW   r  r  r  r|  r}  r~  r  return_dict_in_generateoutput_scoresr   get_latency_result)rh   torchcudais_availabler   r   r
   rw   halfdevicer  set_grad_enabledr   
total_runstimegeneratery  rz  r{  rW   r  r|  r}  r~  rU   rV   r   r1  benchmark_helperr  )r   r   r   r   r  r  r  r  torch_latency_startr  r  s                r,   test_torch_performancer  
  sl   4 ||EJJ335Z[[~~***

\\dll(>F	HHV	5!V$I#&&v.NM4??#		NN)nn..!%!:!:%%!%!:!:..#66+8-d$(66R$:R:R  
" 	TYY[501' $( #J3mZ88r.   c                    t        j                  | j                  t         j                        }t	        | j                  d         D ]?  }d}t	        | j                  d         D ]   }| |   |   |k(  r|dk(  r	d||   |<   |dz  }" A |S )Nr   r   r_   )r   onesr1  int32r   )r   r  r   r  abs_posr<  s         r,   create_attention_maskr  \
  s    WWY__BHH=N9??1%&yq)*A|A,.7a<'(q!!$1	 + ' r.   	sentences	is_greedyc                    | j                   dk(  sJ t        j                  | j                  | j                        }d|_        |j                  |_        t        j                  | j                  | j                  |j                        }|g d} ||dd	      }|d
   }|d   }d}|j                  |d      }	|	D 
cg c]  }
|
g }	}
| j                  rt        j                  d|	       ng }	|j                  }|j                  }|j                  }|j                  }g }d}| j                   sdt#        d       t#        d       |j%                  ||| j&                  | j(                  | j*                  | j,                  | j.                  ||| j0                  | j2                  | j4                  |	r|	ndd| j6                  xs | j8                        }t#        d
|       t#        d       t#        d|j:                         | j6                  rt#        d|j<                         | j8                  rt#        d|j>                         tA        |j:                        D ]9  \  }}|jC                  |d      }|jE                  |       t#        | d|        ; t#        d       t#        d       |r|jG                         jI                         jK                  tL        jN                        tM        jP                  | j&                  gtL        jN                        tM        jP                  | j(                  gtL        jN                        tM        jP                  | j4                  gtL        jR                        d}nW|jG                         jI                         jK                  tL        jN                        tM        jP                  | j&                  gtL        jN                        tM        jP                  | j(                  gtL        jN                        tM        jP                  | j*                  gtL        jN                        tM        jP                  | j0                  gtL        jN                        tM        jP                  | j2                  gtL        jR                        tM        jP                  | j4                  gtL        jR                        d}| j                  rBtM        jT                  |tL        jN                        }| j                  r|	D ]  }d||<   	 ||d<   | jV                  rtY        ||      |d<   |jZ                  d   }| j\                  rAt        j_                  d       tM        jT                  ||ftL        jN                        }||d<   | j`                  rtc        | jd                        jf                  ji                         }t        j                  d |       dd!l5m6} t        j_                  d"| d#       |g}tA        |      D ]:  \  }}tn        jp                  js                  |d$tu        |      z         } |||       < t        j                  d%|       | jv                  ryt        j                  d&       ty        | jd                  | jz                  | j|                        }t        j                  d'       |j                  d|      }g }t        | j                        D ]N  }t        j                         } |j                  d|      }|jE                  t        j                         | z
         P dd(lCmD}! |jZ                  d   } |!||      }"t#        d)       |d   }#t#        d|#       | j6                  rt#        d|d*          | j8                  rt#        d|d+          |rZ|#jZ                  \  }}$g }%t        |      D ]:  }|jC                  |#|   d      }|%jE                  |       t#        d,| d-|        < np|#jZ                  \  }}&}$g }%t        |      D ]P  }t        |&      D ]@  }'|jC                  |#|   |'   d      }|%jE                  |       t#        d,| d.|' d|        B R |r|j:                  j                  || j0                  d/      }(t        j                  |#      })t#        d       t#        d0       t#        |(       t#        |       t#        d       t#        d1       t#        |)       t#        |%       t#        d       ||%k(  }*t#        d2|*rd3nd4       |*|"d5<   | j                  rt        | ||||||	      }+t#        d6|+       t#        d7|"       |"S c c}
w )8a9  Test GPT-2 model

    Args:
        args (argparse.Namespace): arguments parsed from command line
        sentences (Optional[List[str]], optional): input text. Defaults to None.

    Returns:
        Union[Dict[str, Any], None]: A dictionary with string with metric name, and value can be integer or string.
    r>   rw  left)r   r  N)zThe product is releasedzI enjoy walking in the parkzTest best way to investptTreturn_tensorsr   r   r   walk in park)add_prefix_spacer  2--------------------------------------------------CTest PyTorch model and beam search with huggingface transformers...r  !huggingface transformers outputs:r  r  r  skip_special_tokens: 'Testing beam search with onnxruntime...r   r  rx  r   rX   zYUse prefix vocab mask with all ones in ORT, but no corresponding setting for Torch model.r[   test_data_diroutput_test_datazSaving test_data to z/test_data_set_* ...test_data_set_
ORT inputszCreating ort session......zRun ort session......r  ORT outputs:r_   r   batch z sequence: 
 sequence re   Torch Sequences:ORT Sequences:Torch and ORT result is same	differentparityTorch LatencyORT)Jr   r   r  r   r   padding_side	eos_token	pad_tokenr   r  encoderX   r   r   ro  r  ri   r  r  ry  rz  r{  rW   r  r|  r}  r~  rU   rV   r  r  r  r   decoder   r  numpyastyper   r  arrayfloat32r  r\   r  r1  r[   r   rl   r   r   r   r  bert_test_datar  rs   rt   rr   rq   rj   r   rh   rf   runr   r  r  r  r  r  r  
LongTensorrk   r  ),r   r  r  	tokenizerr   r  r   r   	bad_wordsr  word_idro  r  r  r  torch_decoded_sequencesbeam_outputsr  sequencedecoded_sequencerX   bad_word_idr  r[   r  r  
all_inputsdirr   resultlatencyr  r  r  r   r  ry  ort_decoded_sequencesnum_sequencesr<  torch_sequencesort_sequencesis_sametorch_latency_outputs,                                               r,   test_gpt_modelr  h
  s    ??f$$$--d.E.EQUQ_Q_`I#I#--I++..++E 
	 ytDF{#I,-NI$$Y$FM.;<m7gYmM<_m4\\F&&L&&L""J LhST~~)nn..!%!:!:%%!%!:!:..#66+8-d$(66R$:R:R & 
" 	k9%12k<112''$l&C&CD##(L//0$\%;%;<KAx(//d/S#**+;<QCr*+,- =
 
(O	
34"..077A((DOO#4BHHE((DOO#4BHHE"$((D,C,C+DBJJ"W	
 #..077A((DOO#4BHHE((DOO#4BHHE4>>"2"((C$&HHd.G.G-HPRPXPX$Y hh(;(;'<BJJO"$((D,C,C+DBJJ"W
 WWj:
??,*+
;'  -)|!!#8L#Q #JopGGZ$<BHHM&7"#T[[)0099;_m43*=/9MNOX
":.IAv'',,}.>Q.GHCS&) / LLv&
LL-.$T[[$,,@X@XYK
LL()__T6*F G4??#		OOD&)tyy{U*+ $
 4#J4F	.q	I	+y!## &),hq	"#,?? Z "z"A(//	!RV/W!(()9:F1#[)9(:;< #
 3<///]J "z"A=)#,#3#3IaLOY]#3#^ %,,-=>qcA3b1A0BCD * # &0088TE^E^`bc((3h !o%&hm#$h)-BB(G&M"x5 
 	o34	%MY =s   ?
e.c                    | j                   dv sJ | j                  rt        j                  d       yt	        j
                  | j                  | j                        }d|_        | j                   dk(  r,t        j
                  | j                  | j                        }n+t        j
                  | j                  | j                        }|ddg} ||d	d
      }|d   }|d   }d}|j                  |      dd }|D 	cg c]  }	|	g }}	| j                  rt        j                  d|       ng }|j                  }
|
j                  }|
j                  }|
j                   }t        j                  d| d| d|        g }| j"                  sdt%        d       t%        d       |j'                  ||| j(                  | j*                  | j,                  | j.                  | j0                  ||| j2                  | j4                  | j6                  |r|ndd
| j8                  xs | j:                        }t%        d|       t%        d       t%        d|j<                         | j8                  rt%        d|j>                         | j:                  rt%        d|j@                         tC        |j<                        D ]9  \  }}|jE                  |d
      }|jG                  |       t%        | d|        ; t%        d       t%        d       tI        jJ                  |tH        jL                        }| j                  r|D ]  }d||<   	 |jO                         jQ                         jS                  tH        jL                        tI        jT                  | j(                  gtH        jL                        tI        jT                  | j*                  gtH        jL                        tI        jT                  | j,                  gtH        jL                        tI        jT                  | j2                  gtH        jL                        tI        jT                  | j4                  gtH        jV                        tI        jT                  | j6                  gtH        jV                        d }| j                  r||d!<   | jX                  rt[        ||      |d<   | j\                  rt_        | j`                        jb                  je                         }t        j                  d"|       dd#l3m4} |g}tC        |      D ]:  \  }}tj        jl                  jo                  |d$tq        |      z         } |||       < t        j                  d%|       ts        | j`                  | jt                  | jv                        }g }ty        | jz                        D ]N  }t}        j|                         }|j                  d|      }|jG                  t}        j|                         |z
         P |j                  d   }dd&lAmB}  |||      } t%        d'       d   }!t%        d|!       | j8                  rt%        d|d(          | j:                  rt%        d|d)          |!j                  \  }}"}#g }$ty        |      D ]P  }ty        |"      D ]@  }%|jE                  |!|   |%   d
      }|$jG                  |       t%        d*| d+|% d|        B R | j"                  sj<                  j                  || j2                  d      }&t        j                  |!      }'t%        d       t%        d,       t%        |&       t%        |       t%        d       t%        d-       t%        |'       t%        |$       t%        d       ||$k(  }(t%        d.|(rd/nd0       |(| d1<   | j                  rt        | ||||||      })t%        d2|)       t%        d3|        | S c c}	w )4a=  Test T5 or MT5 model

    Args:
        args (argparse.Namespace): arguments parsed from command line
        sentences (Optional[List[str]], optional): input text. Defaults to None.

    Returns:
        Union[Dict[str, Any], None]: A dictionary with string with metric name, and value can be integer or string.
    r  zLSkipping parity test as prefix vocab mask is not implemented by Hugging FaceNrw  r  r?   z4translate English to French: The product is releasedzsummarize: research continues to show that pets bring real health benefits to their owners. Having a dog around can lead to lower levels of stress for both adults and kids.r  Tr  r   r   r  re   r  zeos_token_id:z, pad_token_id:z, vocab_size:r  r  r  r  r  r  r  r  r  r  r   r   rx  rX   r  r  r  r  r  r  r_   r   r  r  r  r  r  r  r  r  r  r  )Hr   r[   r   r   r   r  r   r   r  r   r   r  rX   ro  r  r  r  ri   r  r  ry  rz  r{  rW   r  r|  r}  r~  rU   rV   r  r  r  r   r  r   r   r  r  r  r  r  r  r  r\   r  rl   r   r   r   r  r  r  rs   rt   rr   rq   r   rh   rf   r   r  r  r  r1  r  r  r  r  r  rk   r  )*r   r  r  r   r  r   r   r  r  r  ro  r  r  r  r  r  r  r   r  rX   r  r  r  r  r  r   r  r  r  r  r  r  r   r  r  ry  r  r<  r	  r
  r  r  s*                                             r,   test_t5_modelr  ;  s    ??m+++cd++D,C,Ct~~^I#I$*::##nn

 ,;;##nn
 B {
	 ytDF{#I,-NI$$Y/4M.;<m7gYmM<_m4\\F&&L&&L""J
LL=ol^=YcXdef hST~~)nn..!%!:!:%%!%!:!:..#66+8-d$(66R$:R:R & 
$ 	k9%12k<112''$l&C&CD##(L//0$\%;%;<KAx(//d/S#**+;<QCr*+,- =
 
(O	
34*RXX6J(K&'J{# ) ]]_**,33BHH=hh0Ahh0AXXt~~.bhh? "$*C*C)DBHH U((D$7$7#8

K hh(?(?'@

SF )|!!#8L#Q T[[)0099;_m43X
":.IAv'',,}.>Q.GHCS&) / LLv&$T[[$,,@X@XYK G4??#		v.tyy{U*+ $ #J34F	.q	I	+y!## &),hq	".7oo+Z
:}%A(//	!QUY/Z!(()9:F1#Zs"-=,>?@ &  &0088TE^E^`bc((3h !o%&hm#$h)-BB(G&M"x5 
 	o34	%M[ =s    
_-c                    t        |       }t        |j                         |j                  dv r|j                  rAt
        j                  j                  |j                        st        d|j                         |j                  rAt
        j                  j                  |j                        st        d|j                         |j                  r|j                  r|j                  r|j                  st        d      |j                  dk(  xr |j                  dk(  }|j                  dk(  r|r|j                  dkD  rf|j                  dk  rWt        |t        j                         t         j#                  d	       |j                  d
kD  s|j$                  s|j&                  r'yt        |t        j(                         nt        |       t         j#                  d       |j                  dv rt+        ||      }nt-        |||      }|r`|j.                  r2t         j#                  d|j0                   d|j0                   d       |S t         j#                  d|j0                          |S )a/  Main entry function

    Args:
        argv (Optional[List[str]], optional): _description_. Defaults to None.
        sentences (Optional[List[str]], optional): input text. Defaults to None.

    Raises:
        ValueError: Path does not exist: --encoder_decoder_init_onnx
        ValueError: Path does not exist: --decoder_onnx
        ValueError: --decoder_onnx and --encoder_decoder_init_onnx are not used together for T5

    Returns:
        Union[Dict[str, Any], None]: A dictionary with string with metric name, and value can be integer or string.
    r  z1Path does not exist: --encoder_decoder_init_onnx z$Path does not exist: --decoder_onnx zB--decoder_onnx shall use together with --encoder_decoder_init_onnxr_   r>   rd   rb   zThe test for gpt2_sampling onnx model is limited to non-custom model with small top_p(e.g <=0.01) value. The result should be the same as gpt2 greedy search.g{Gz?Nzstart testing model...)r  )r  r  zOutput files: r:   z.datazOutput file: )r   r   rH   r   r   rs   rt   r  r   r   r{  r|  r  r  r$   r4   r   r   r  r^   r3   r  r  rP   r   )r6   r  r   r  r  s        r,   r   r     s     4 D-'))"''..A_A_2`PQUQoQoPpqrrRWW^^D4E4E%FCDDUDUCVWXX**43D3Dd&D&Dabb!#F(A(AQ(FI& Y::

S 0$T>+B+BCKK p zzD DKK499$T>+F+FG &
KK()-'ty9	YO((KK.R}EJK M KK-}56Mr.   __main__r)   )T)shared_   NN)r  )r   r_   re   )NFr  )`__doc__rm   loggingr   rs   r  enumr   pathlibr   typingr   r   r   r   r	   r  r   r   r  r  r
   r   fusion_utilsr   r   r   r   r8  r   transformersr   r   r   r   r   r   r   r   onnxruntimer   r   r   r   4onnxruntime.transformers.models.gpt2.convert_to_onnxr   r   0onnxruntime.transformers.models.gpt2.gpt2_helperr   2onnxruntime.transformers.models.t5.convert_to_onnxr   r   ,onnxruntime.transformers.models.t5.t5_helperr    r!   	getLoggerr   r$   rq   	Namespacer   r   r   boolr   r   r   r  r  r"  rx   dictr>  rI  rM  r^  re  rm  r}  r  r  r  r  r(  r?  rE  rk  rt  r2   r  Tensorr  r  r  r  r/   r5   r.   r,   <module>r%     sR  
%N    	    3 3    4 $ 4 4  	 	 	 j i ] S j d			2	T B(49- B9K9K BJ*)x)) *)Z!X'' !:Os Od O$KC K4 K[_ K\3  D Ue B5 5I 5pIodoo Io) IoXAa4?? Aay AaN #'+'+g!g!g! g! 	g!
 tng! tng!T: j (  
+>"#Ji-J -`M
M&*M>BM	M`93* 93z fh``!$`47`IL`_b`F$f $fNV* Vr8 8v# ae : [_II47ISWI	IX"0J ZhYrYr q08#5#5 q0 q0h?9


?9"<<=?9 ||?9 LL	?9
 ?9 ?9 S	??9 
#s(^?9D	P++ Pc8K P_c Pfz** zxS	7J zz8xS	" 8htCy6I 8v zF r.   