
    g                      d dl mZ d dlZd dlmZ d dlmZ d dlmZ d dl	Z
d dlZd dlZd dlmZ d dlmZ dd	lmZmZ dd
lmZ ddlmZmZmZmZmZmZmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* ddl+m,Z,  G d de      Z-e G d d             Z. G d d      Z/e G d d             Z0e G d d             Z1e G d d             Z2e G d d             Z3e G d d             Z4 G d de      Z5y)    )annotationsN)	dataclass)Enum)Any)TensorProto)onnx_pb   )BaseQuantizerQuantizationParams)
TensorData)DEQUANT_OP_NAMEONNX_TYPE_TO_NP_TYPEQUANT_OP_NAMEQuantizedValueQuantizedValueType__producer____version__add_dequant_output_suffixadd_dequant_suffixadd_quant_input_suffixadd_quant_output_suffixadd_quant_suffixcompute_data_quant_paramscompute_scale_zpcompute_scale_zp_float8find_by_nameget_qmin_qmax_for_qType	ms_domainnormalize_axisquantize_onnx_initializertensor_proto_to_array)CreateQDQQuantizerc                      e Zd ZdZdZdZy)QDQQuantTensorTyper   r	      N)__name__
__module____qualname__
ACTIVATIONWEIGHTBIAS     [/var/www/openai/venv/lib/python3.12/site-packages/onnxruntime/quantization/qdq_quantizer.pyr$   r$   /   s    JFDr-   r$   c                  "    e Zd ZU ded<   ded<   y)QDQQuantParamProviderstr
input_name	node_nameNr&   r'   r(   __annotations__r,   r-   r.   r0   r0   8   s    ONr-   r0   c                  0    e Zd Zej                  dddfdZy)QDQTensorQuantInfoNc                V    || _         || _        || _        |d u| _        |J || _        y N)tensor_typequant_para_provideraxis	is_shared	data_type)selfr:   r;   r<   r>   s        r.   __init__zQDQTensorQuantInfo.__init__A   s8    &#6 	,D8$$$"r-   )r&   r'   r(   r$   r)   r@   r,   r-   r.   r7   r7   @   s    #5#@#@VZaequ #r-   r7   c                  6    e Zd ZU ded<   ded<   ded<   ded<   y)QDQBiasQuantInfor1   r3   r2   weight_namefloatbetaNr4   r,   r-   r.   rB   rB   K   s    NO
Kr-   rB   c                  4    e Zd ZU ded<   ded<   ded<   d	dZy)
QDQTensorQuantParamsr   originalzQuantizationParams | None	convertedset[str] | Noneconverted_recv_nodesc                    | j                   | j                  S | j                  | j                   S || j                  v r| j                   S | j                  S r9   rI   rH   rK   r?   consumer_node_names     r.   get_for_consumerz%QDQTensorQuantParams.get_for_consumer\   Q    >>!== $$,>>!
 #58Q8Q"Qt~~eX\XeXeer-   N)returnr   r&   r'   r(   r5   rP   r,   r-   r.   rG   rG   V   s      (())
fr-   rG   c                  "    e Zd ZU ded<   ded<   y)QDQScaleZpInitializersr   scale
zero_pointNr4   r,   r-   r.   rU   rU   j   s    r-   rU   c                  ,    e Zd ZU ded<   ded<   ded<   y)QDQTensorScaleZpInitializersrU   rH   zQDQScaleZpInitializers | NonerI   rJ   rK   Nr4   r,   r-   r.   rY   rY   s   s    $$,,))r-   rY   c                  4    e Zd ZU ded<   ded<   ded<   d	dZy)
QDQTensorQuantizedValuer   rH   zQuantizedValue | NonerI   rJ   rK   c                    | j                   | j                  S | j                  | j                   S || j                  v r| j                   S | j                  S r9   rM   rN   s     r.   rP   z(QDQTensorQuantizedValue.get_for_consumer   rQ   r-   N)rR   r   rS   r,   r-   r.   r[   r[   }   s    $$))
fr-   r[   c                     e Zd Z	 d#dZd Zd Zdej                  fdZd$dZ	d%dZ
d$dZd	 Zd&d
Zd'dZ	 	 	 	 	 	 	 	 	 	 	 	 d(dZd Zd Zd Zd Zd Z	 d#	 	 	 	 	 	 	 	 	 	 	 d)dZ	 d#	 	 	 	 	 	 	 	 	 	 	 d*dZ	 d#dZd+dZd#dZd Zd Zd Zd Zd$dZ	 d#	 	 	 	 	 	 	 d,dZd-dZ 	 d.	 	 	 	 	 	 	 d/dZ!d0dZ"d1d Z#d2d!Z$d3d"Z%y)4QDQQuantizerNc                   t        j                  | |||||||||	|
       i | _        i | _        g | _        |
j                  dg       | _        |
j                  dd      | _        |
j                  dd      | _        |
j                  dd      | _	        i | _
        |
j                  di       | _        |
j                  dd      rt        nd | _        |
j                  d	d      | _        |
j                  d
d      | _        | j                   dk  rt"        j$                  t"        j&                  t"        j(                  t"        j*                  ft-        fd| j.                  D              }| j                  sF| j0                  v s| j2                  v s|r(t5        j6                  dt         d       t        | _        | j9                         | _        i | _        i | _        y )N"OpTypesToExcludeOutputQuantizationAddQDQPairToWeightFQuantizeBiasTDedicatedQDQPair QDQOpTypePerChannelSupportToAxisUseQDQContribOpsQDQKeepRemovableActivations"QDQDisableWeightAdjustForInt32Bias   c              3  :   K   | ]  }|j                   v   y wr9   )r:   ).0topset21_typess     r.   	<genexpr>z(QDQQuantizer.__init__.<locals>.<genexpr>   s      /8Y1.8Ys   zONNX QuantizeLinear and DequantizeLinear operators do not support 16-bit/4-bit integer quantization types prior to opset 21. The domain of QuantizeLinear and DequantizeLinear operators will be set to 'z' to enable support.) r
   r@   tensors_to_quantizebias_to_quantizenodes_to_removeget'op_types_to_exclude_output_quantizationadd_qdq_pair_to_weightquantize_biasdedicated_qdq_pairtensor_to_its_receiving_nodes'qdq_op_type_per_channel_support_to_axisr   qdq_op_domainqdq_keep_removable_activations(qdq_disable_weight_adjust_for_int32_biasopset_versionr   UINT16INT16UINT4INT4anytensor_quant_override_qtypesactivation_qTypeweight_qTypeloggingwarningcalc_graph_quant_paramsquantization_paramsinitializer_quant_paramsquantized_value_map)r?   modelper_channelreduce_ranger   r   tensors_rangenodes_to_quantizenodes_to_excludeop_types_to_quantizeextra_optionsoverrides_have_opset21_typesrl   s               @r.   r@   zQDQQuantizer.__init__   s    	 	
 CE =?! 8E7H7HImoq7r4
 '4&7&78Le&T# +..~tD #0"3"34F"N-/* 8E7H7HIkmo7p4*7*;*;<NPU*VY\` /<.?.?@]_d.e+ 9F8I8IJnpu8v5
 "(//1B1BKDUDUWbWgWghM+. /8<8Y8Y/ ,( %%%%6$$5/cclbm n&& &/"#'#?#?#A GI% $& r-   c                   t        || j                  j                               }||j                  S || j                  v rJ| j                  |   }|j
                  j                  d      r |j
                  j                  j                  S y)2
        Check if tensor can be quantized
        Nr:   )	r   r   initializerr>   value_infostypeHasFieldr:   	elem_typer?   tensor_nameweightvis       r.   _get_tensor_typezQDQQuantizer._get_tensor_type   sx     k4::+A+A+CD###D,,,!!+.Bww.ww**444r-   c                   t        || j                  j                               }|B|j                  t        j
                  j                  t        j
                  j                  fv ryy|| j                  v rl| j                  |   }|j                  j                  d      rA|j                  j                  j                  t
        j                  t
        j                  fv ryyt        j                  d| d       y)r   Tr:   z$failed to infer the type of tensor: z6. Skip to quantize it. Please check if it is expected.F)r   r   r   r>   
onnx_protor   FLOATFLOAT16r   r   r   r:   r   r   r   r   s       r.   _is_tensor_quantizablez#QDQQuantizer._is_tensor_quantizable   s     k4::+A+A+CDJ$:$:$@$@*BXBXB`B`#aa  D,,,!!+.Bww.2773F3F3P3P!!##U 4  	 OO6{mCyz r-   c                J   | j                  |      r|rUt        |t              st        dt	        |       d      | j                  |      }t        |||      | j                  |<   y|| j                  vr,| j                  |      }t        ||      | j                  |<   yyy)a  
        Adds a tensor to the list (actually a dict) of tensors to quantize. Called indirectly by op quantizers that
        want to quantize a tensor (i.e., "mark" a tensor for quantization).

        If quant_sharing_provider is not None, tensor with name tensor_name will be quantized with the same
        quantization parameters as the node input specified in quant_sharing_provider. Ex: A Tranpose node's output
        will typically use the same quantization parameter initializers used at the Transpose node's input.

        Args:
            tensor_name: name of the tensor to quantize
            quant_sharing_provider: name of the tensor and node that provides quantization parameter
            tensor_type: QDQQuantTensorType default ACTIVATION
        zBquant_sharing_provider must be of type QDQQuantParamProvider, not .)r:   r;   r>   )r:   r>   N)r   
isinstancer0   	TypeErrorr   r   r7   rn   )r?   r   quant_sharing_providerr:   r>   s        r.   __quantize_tensorzQDQQuantizer.__quantize_tensor  s     &&{3%!"8:OP#\]abx]y\zz{|  !11+>	8J +AWcl9((5 D$<$<< 11+>	8JWbnw8x((5 = 4r-   c                D    | j                  |dt        j                        S )z
        Adds a tensor to the list of tensors to quantize. Called by op quantizers that
        want to quantize a tensor (i.e., "mark" a tensor for quantization).

        Args:
            tensor_name: name of the tensor to quantize
        N)_QDQQuantizer__quantize_tensorr$   r)   r?   r   s     r.   quantize_activation_tensorz'QDQQuantizer.quantize_activation_tensor/  s      %%k49K9V9VWWr-   c                X    | j                  |t        ||      t        j                        S )a  
        Adds a tensor to the list of tensors to quantize. Called by op quantizers that
        want to quantize an output tensor using the same quantization parameters as one of the node's inputs.

        Ex: A Tranpose node's output will typically use the same quantization parameter initializers used at
        the Transpose node's input.

        Args:
            output_name: name of the node output to quantize so that it uses the same quantization params as an input.
            input_name: name of the node input from which the output tensor will get its quantization params.
            node_name: name of the node that consumes `input_name`.
        )r   r0   r$   r)   )r?   output_namer2   r3   s       r.   quantize_output_same_as_inputz*QDQQuantizer.quantize_output_same_as_input9  s-     %%.z9EGYGdGd
 	
r-   c                D    | j                  |dt        j                        S )z
        Adds a tensor to the list of weight tensors to quantize. Called by op quantizers that
        want to quantize a weight (i.e., "mark" a weight for quantization).

        Args:
            tensor_name: name of the weight to quantize
        N)r   r$   r*   r   s     r.   quantize_weight_tensorz#QDQQuantizer.quantize_weight_tensorJ  s      %%k49K9R9RSSr-   c                l   t        || j                  j                               }|ru|j                  t        j
                  j                  t        j
                  j                  fv r4t        t        j                  ||j                        | j                  |<   y y t        j                  d| d       y )N)r:   r<   r>   z9only support per-channel quantization on weight. Tensor: z is not quantized.)r   r   r   r>   r   r   r   r   r7   r$   r*   rn   r   r   )r?   r   r<   r   s       r.   "quantize_weight_tensor_per_channelz/QDQQuantizer.quantize_weight_tensor_per_channelT  s    k4::+A+A+CDJ$:$:$@$@*BXBXB`B`#aa8J 2 9 9PVP`P`9((5 b
 OOWXcWddvwxr-   c                   | j                   j                  |j                        dz   }|j                   | }t        j                         }|j                  |       ||_        | j                   j                  |       |S )zk
        Duplicates an existing initializer and adds it to the model. Returns the new initializer.
        r	   )r   #get_largest_initializer_name_suffixnameonnxr   CopyFromadd_initializer)r?   r   name_suffixnew_initializer_namenew_initializers        r.   _dup_initializerzQDQQuantizer._dup_initializer^  sv      ::II+JZJZ[^__"-"2"2!3K=A**,  -3

""?3r-   c                   | j                   j                  |      rVt        j                  d| d       | j	                  |d      \  }}|r| j                  ||       y| j                  |       yt        || j                  j                               }|t        j                  d| d       y|j                  t        j                  j                  t        j                  j                  fvrt        j                  d| d       y|}	|| j                   v rW| j#                  |      }
|
j$                  }	| j                  j'                  ||	|h       t        j                  d	| d
|	 d       t)        ||||      | j                   |	<   y)a  
        Adds a bias tensor to the list of bias tensors to quantize. Called by op quantizers that
        want to quantize a bias with bias_zero_point = 0 and bias_scale = input_scale * weight_scale * beta.
        TODO: Explain the reasoning for using this formula.

        Args:
            node_name: name of the node that consumes the bias, input, and weight tensors.
            bias_name: name of the bias tensor to quantize.
            input_name: name of the input tensor whose scale is used to compute the bias's scale.
            weight_name: name of the weight tensor whose scale is used to compute the bias's scale.
            beta: Multiplier used to compute the bias's scale.
        zQuantizing bias tensor 'z=' as a weight due to the presence of user-specified overridesr   )default_axisNzExpected bias 'z' to be an initializerz%' to be an floating-point initializerzCreated a copy of bias input 'z
' called '')tensor_quant_overridesrq   r   infois_tensor_per_channelr   r   r   r   r   r   r>   r   r   r   r   ro   r   r   replace_input_of_nodesrB   )r?   r3   	bias_namer2   rC   rE   is_per_channelr<   bias_initializeractual_bias_namenew_bias_initializers              r.   quantize_bias_tensorz!QDQQuantizer.quantize_bias_tensorj  s    &&**95LL*9+5rs $(#=#=iVW#=#X ND77	4H  ++I6'	4::3I3I3KL#OOoi[8NOP%%j.D.D.J.JJLbLbLjLj-kkLL?9+5Z[\$--- $(#8#89I#J 388 JJ--i9II;WLL9)JO_N``abc 3C9jZegk2l./r-   c                   |j                   syt        |      }t        j                  t        j                        }d}t        j
                  |j                  t        j                        t        j
                  |j                  dz   t        j                        z
  }	|j                  }
d}|st        j                  |j                         t        j
                  dt        j                              }t        j                  |j                         t        j
                  dt        j                              }t        j                  t        j                  |      t        j                  |            }|d|z  z  |	z  }t        j
                  |j                         t        j                        }t        j
                  |j                         t        j                        }||z  }||k  rK|dkD  rF||z  }t        j                  d	| d
| d|j                    d       ||z  }|j#                  |
      }d}||fS |j$                  r!t'        |j$                        dk(  r|j$                  d   }t)        |      D ]  }t        j                  ||         }|d|z  z  |	z  }t        j
                  |j                         t        j                        }t        j
                  ||   j                         t        j                        }||z  }||k  s|dkD  s||z  }t        j                  d| d| d| d|j                    d	       ||z  }|j#                  |
      ||<   d} ||fS )aI  
        Checks if the bias scale (input_scale * weight_scale) that we intend to use is too small.
        A bias scale that is too small leads to quantized bias values that fall outside the range of a int32 and have to
        be clipped, which decreases accuracy. If this function detects such a scenario, the weight_scale value will be
        increased to prevent this from happening.

        Although the adjustment method and amount differs, the idea to adjust the weight's scale came from the following
        reference:
        https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/tools/optimize/quantization_utils.cc#L252

        :param input_scale: The input's scale.
        :param weight_scale: The weight scale to potentially adjust.
        :param weight_name: The weight initializer's name. Used for logging.
        :param bias_tp: The bias ONNX initializer.
        :param is_per_channel: True if the bias and weight are quantized per-channel.
        :return: A tuple with a bool indicating if the weight's scale was adjusted and the new weight scale.
        FNgqh ?dtyper	   Fr   g       @g        zIncreasing scale for weight `z` by the ratio z to ensure bias input `z` has a valid scale.TzIncreased scale[z] for weight `z` by ratio )sizer!   npiinfoint32arraymaxfloat64minr   minimummaximumabsitemr   r   r   astypeshapelenrange)r?   input_scaleweight_scalerC   bias_tpr   bias_float_data
int32_infomultiplicative_epsilonqrangeweight_scale_dtypeupdated_an_elemrminrmaxabsmaxbias_smallest_valid_scaleinput_scale_fp64weight_scale_fp64bias_candidate_scaleratio	new_scale	num_elemsi	bias_rmaxs                           r.   #_adjust_weight_scale_for_int32_biasz0QDQQuantizer._adjust_weight_scale_for_int32_bias  s   2   /8XXbhh'
!'*..

;bhhz~~XYGYacakak>ll)//::o113RXXarzz5RSD::o113RXXarzz5RSDZZtbffTl;F(>#,(ORX(X%!xx(8(8(:"**M "):):)<BJJ O#36G#G $'@@G[^aGa14HH3K=PUw W**1,,7KM .5	(//0BC"&. ,,- C(:(:$;q$@$**1-I9%FF?1#56	,BcIo,VY_,_)#%88K,<,<,>bjj#Q $&HH\!_-A-A-C2::$V!'7:K'K$(+DDK_beKe58LLELL*1#^K=TYSZ [118>RT !2E 9I&/&6&67I&JLO&*O! &$ ,,r-   c                   | j                   ry| j                  j                         D ]  \  }}|j                  | j                  vs0|j                  | j
                  vs|j                  | j                  vrP| j                  |j                     j                  |j                        }| j
                  |j                     }t        j                  |d   t        j                  j                  |j                              }| j                  |j                     }|d   }|t        j                   j"                  t        j                   j$                  fvr2|d   }|j'                         rI|d   }	|j)                  dd      du}
| j+                  ||	|j                  t-        || j.                  j1                               |
      \  }}|s||d<    y)a3  
        Iterates through all bias inputs that should be quantized to int32. If the intended
        bias scale (equal to input_scale * weight_scale) is too small, this function will increase
        the associated weight's scale to ensure the bias does not overflow the int32 range when quantized.
        NrV   r   
quant_typerW   r<   )rz   ro   itemsr2   r   rn   rC   r   rP   r3   r   asarrayr   helpertensor_dtype_to_np_dtyper>   r   INT8r}   r   rq   r   r   r   r   )r?   r   	bias_infoinput_qparams
input_infor   weight_quant_paramsweight_quant_typeweight_zero_pointr   r   did_update_weight_scalenew_weight_scales                r.   ,_adjust_weight_quant_params_for_bias_tensorsz9QDQQuantizer._adjust_weight_quant_params_for_bias_tensors  s    88$($9$9$?$?$A Iy$$D,D,DD''t/G/GG((0M0MM !44Y5I5IJ[[\e\o\opM11)2F2FGJ**g&dkk.R.RS]SgSg.hK #'"?"?	@U@U"V 3L A )9)9)>)>@P@P@V@V(WW,?,M $$&':7'CL044VTB$NN 9=8`8`%%Y

(>(>(@A95#%5 '/?#G,M %Br-   c                :    | j                   j                  |       y r9   )rp   append)r?   nodes     r.   remove_nodezQDQQuantizer.remove_node  s    ##D)r-   c                N    | j                   j                  | j                         y r9   )r   remove_nodesrp   )r?   s    r.   r  zQDQQuantizer.remove_nodes!  s    

 4 45r-   c                p   | j                   j                         D ]|  }| j                  |      st        | |      }|j	                          |j
                  D ]=  }|| j                  vrg | j                  |<   | j                  |   j                  |       ? ~ | j                         | _	        | j                          | j                          | j                          | j                  r| j                          | j                          | j                   s| j                   j#                          t$        | j                   j                   _        t(        | j                   j                   _        | j,                  t.        k(  r | j                   j1                  t.        d       | j                   j                   S )Nr	   )r   nodesshould_quantize_noder"   quantizeinputrv   r  _calc_initializer_quant_paramsr   r  _quantize_normal_tensors_quantize_sharing_param_tensorsrt   _quantize_bias_tensorsr  rs   clean_initializersr   producer_namer   producer_versionrx   r   set_opset_import)r?   r  op_quantizerr   s       r.   quantize_modelzQDQQuantizer.quantize_model$  sK   JJ$$&D((.1$=%%'#'::K"$*L*LLJL::;G66{CJJ4P $. ' )-(K(K(M%99;%%',,.'')**JJ))+)5

&,7

)*JJ''	15zzr-   c                   || j                   v r| j                   |   j                  | j                   |   j                  t        | j                  j	                         |         dk(  rn| j                  j                  |      sS| j                  j                  |      s8| j                  j                  ||       || j                  v r| j                  |= yy)Nr	   TF)	r   rI   r   r   input_name_to_nodesis_graph_outputis_graph_inputreplace_output_of_all_nodesrn   )r?   upstream_output_namer   s      r.   try_replacing_upstream_outputz*QDQQuantizer.try_replacing_upstream_output@  s    4333((5??G(()=>HHPDJJ2245IJKqPJJ../CDJJ--.BCJJ223GU#t'?'??,,-ABr-   c                    t         j                  j                  t        |||g|g||| j                        }| j
                  j                  |g       y)zI
        Creates a QuantizeLinear node and adds it to the model.
        r<   domainN)r   r   	make_noder   rx   r   	add_nodes)r?   q_inputq_outputquant_node_name
scale_namezp_namer<   qlinear_nodes           r.   _create_q_nodezQDQQuantizer._create_q_nodeO  sT     {{,,j'*J%% - 
 	

l^,r-   c                    t         j                  j                  t        |||g|g||| j                        }| j
                  j                  |g       y)zK
        Creates a DequantizeLinear node and adds it to the model.
        r#  N)r   r   r%  r   rx   r   r&  )r?   dq_input	dq_outputdequant_node_namer*  r+  r<   dequant_nodes           r.   _create_dq_nodezQDQQuantizer._create_dq_nodee  sT     {{,,z7+K%% - 
 	

l^,r-   c
                   t         j                  j                  t        |||g|g||	| j                        }
t         j                  j                  t
        |||g|g||	| j                        }| j                  j                  |
|g       y )Nr#  )r   r   r%  r   rx   r   r   r&  )r?   r'  r(  r)  r/  r0  r1  r*  r+  r<   r,  r2  s               r.   _create_qdq_nodeszQDQQuantizer._create_qdq_nodes{  s     {{,,j'*J%% - 
 {{,,z7+K%% - 
 	

lL9:r-   c                    |j                   }|| j                  v ry| j                  |   }|j                  d      }| j	                  ||      }d}t        |      }| j                  j                  ||       | j                  r_t        |      }| j                  ||t        |      ||t        |      |j                  j                   |j                  j                   |	       nt        ||d   |d   |d   |      }	| j                  j!                  |	       |	j                   }t"        j$                  j'                  t(        |	j                   |j                  j                   |j                  j                   g|gt        |      || j*                        }
| j                  j-                  |
       t/        |||j                  j                   |j                  j                   t0        j2                  |      }t5        |dd      | j                  |<   y)a  
        Adds Q/DQ nodes for an initializer. If `self.add_qdq_pair_to_weight` is true, creates
        the sequence (weight_f32 -> Q -> DQ -> ). Otherwise, this function quantizes the initializer
        and adds the sequence (weight_quant -> DQ ->).
        Nr<   r   rW   rV   r#  )r<   )r   r   r   rq   _make_scale_zp_initializersr   r   replace_input_of_all_nodesrs   r   r5  r   r   rV   rW   r    r   r   r   r%  r   rx   add_noder   r   Initializerr[   )r?   weight_protorC   quant_paramsr<   scale_zp_initializersq_weight_nameweight_dequant_outputweight_quant_outputquant_weightr2  quantized_values               r.   _add_qdq_nodes_for_initializerz+QDQQuantizer._add_qdq_nodes_for_initializer  s    #''$222+/+H+H+U $$V, $ @ @l [$( 9+ F

--k;PQ&& #:+"F""# -#%";/%++00%0055
 5\*\*W%L JJ&&|4(--M;;00""$9$?$?$D$DF[FfFfFkFkl&'";/)) 1 L JJ- )!'',,!,,11**
 1HY]_c0d  -r-   c                   | j                   r|| j                  v r
t        | j                  |         dkD  rt        | j                  |         }t        |      D ]  }d|dz    }t	        |      |z   }t        |      |z   }	t        |      |z   }
t        |      |z   }| j                  |||
||	|||       | j                  |   |   }| j                  j                  |||	       |dk(  st        ||	||t        j                  |      }t        |d d       | j                  |<    y |}t        |      }| j                  j!                  |      r*t#        |      }|}| j                  j%                  ||       n| j                  j'                  ||       | j                  |t	        |      t        |      t	        |      |t        |      ||       t        ||||t        j                  |      }t        |d d       | j                  |<   y )Nr	   _r   
scale_type)ru   rv   r   r   r   r   r   r   r5  r   replace_node_inputr   r   Inputr[   r   r  r   r  r8  )r?   r   r*  r+  r>   num_dedicated_qdq_pairr   postfix tensor_name_quant_output_postfix"tensor_name_dequant_output_postfixquant_node_name_postfixdequant_node_name_postfixr  rB  r'  r0  s                   r.   _add_qdq_pair_for_activationz)QDQQuantizer._add_qdq_pair_for_activation  s   ##tAAAD66{CDqH%()K)KK)X%Y"12a!eW+3J;3WZa3a05N{5[^e5e2*:;*G'*Q',>{,Kg,U)&&4+46-	 99+FqI

--dKAcd6&4#:"*00#,'O =TTceiko<pD,,[99 3< "G1+>Izz))+60='	

66{GL

55k9M""'4 -'4";/	 -"(($O 5LO]acg4hD$$[1r-   c                   t        | j                  j                  |g       D cg c]  }|j                   c}      }	| j                  r4|| j                  v r&t        | j                  |         dkD  rt        d      |	}
||	}t               }
n|
|z
  }
t        |      t        |	      k(  }| j                  j                  |      }|}|r't        |      }| j                  j                  ||       t        |      }| j                  ||t        |      ||       t        |      }|r|s|}|
r"||k7  r| j                  j                  |||
       | j!                  ||t#        |      ||       |}|s/t        | d      }| j!                  ||t#        | d      ||       t        | d      }| j                  ||t        | d      ||       t        | d      }|r|r|}|r"||k7  r| j                  j                  |||       | j!                  ||t#        | d      ||       t%        ||||t&        j(                  |      }t%        ||||t&        j(                  |      }t+        |||      | j,                  |<   yc c}w )a  
        Adds Q and DQ ops to a tensor whose quantized data type is converted. That is, some consumers may use the
        original data type from the producer, while other consumers use the converted data type.
        This is generally done by adding a sequence of ops that convert from one data type (e.g., uint8) to another (e.g., uint16).

        T_float ---> Quant(to u8) ---> Convert(to u16) ---> Dequant(to float) ---> T_float'
        where Convert(to u16) is equivalent to: ---> Dequant(to float) ---> Quant(to u16) --->

        This function handles the following scenarios:

        1) Tensor T is not a graph output; all consumers use the converted type

            <Producer> ---> Q1 ---> DQ1 ---> Q2 ---> DQ2 ---> <Consumers>

        2) Tensor T is not a graph output; some consumers use the original type, others use the converted type

            <Producer> ---> Q1 -+-> DQ1 ---> <Consumers of original type>
                                |
                                +-> DQ1' ---> Q2 ---> DQ2 ---> <Consumers of converted type>

        3) Tensor T is a graph output; all consumers use the converted type

            <Producer> ---> Q1 ---> DQ1 ---> Q2 ---> DQ2 -+-> <Consumers>
                                                          |
                                                          +-> <Graph output>

        4) Tensor T is a graph output; some consumers use the original type, others use the converted type

            <Producer> ---> Q1 -+-> DQ1 -+-> <Consumers of original type>
                                |        |
                                |        +-> <Graph output>
                                |
                                +-> DQ1' ---> Q2 ---> DQ2 ---> <Consumers of converted type>

        5) Tensor T is a graph output that is not consumed by any other nodes.

            <Producer> ---> Q1 ---> DQ1 ---> Q2 ---> DQ2 ---> <Graph output>
        r	   z|Do not currently support converted quant_types in TensorQuantOverrides when the `dedicated_qdq_pair` extra_option is enabledN_convert_convert_clonerF  )setrv   rq   r   ru   r   
ValueErrorr   r  r   r  r   r-  r   r   r   r3  r   r   r   rI  r[   r   )r?   r   first_scale_namefirst_zp_namescale_data_typeconvert_scale_nameconvert_zp_nameconvert_recv_nodesr  tensor_recv_nodesoriginal_recv_nodesall_use_convertedr  first_q_inputfirst_q_outputfirst_dq_outputsecond_q_inputsecond_q_outputsecond_dq_outputoriginal_quantized_valueconverted_quantized_values                        r.   %_add_qdq_ops_for_converted_activationz2QDQQuantizer._add_qdq_ops_for_converted_activation  s   `  t7Y7Y7]7]^ikm7n o7nt7n op ##tAAAD66{CDqH  O  0%!2"%%"58J"J 23s;L7MM**44[A $2;?MJJ22;N0=>+;K+HJZ\i	

 4K@#4)O?k#AJJ--k?L_`O-?-LN^`m	
 ) 3{m84LMN  "k].#AB  2[M2JK}H56	
 5}H5MN0*"2k"AJJ--k;KM_`+h78	
 $2$$&$
  %3$$&%
! 1H$&?AS1
  -S !ps   Jc           
        | j                   j                         j                         D ]  \  }}|| j                  v r|j                  r#t        || j                  j                               }|r| j                  |       n\| j                  |      }|st        d| d      |j                  \| j                  ||j                  j                  j                  |j                  j                   j                  |j"                         n|j"                  |j                  j                  j"                  k(  sJ | j%                  ||j                  j                  j                  |j                  j                   j                  |j"                  |j                  j                  j                  |j                  j                   j                  |j&                         | j                   |=  y)z}
        Adds Q/DQ ops to tensors (activations and weights) that have been marked for quantization by op quantizers.
        z4Quantization parameters are not specified for param zb. In static mode quantization params for inputs and outputs of nodes to be quantized are required.N)r>   )rn   copyr   r   r=   r   r   r   rC  "_make_tensor_scale_zp_initializersrU  rI   rP  rH   rV   r   rW   r>   rg  rK   )r?   r   tensor_infor   tensor_qparam_initializerss        r.   r  z%QDQQuantizer._quantize_normal_tensors  s    )-(@(@(E(E(G(M(M(O$Kd666((*;

8N8N8PQ77D151X1XYd1e.5(RS^R_ `  
 2;;C99'6??EEJJ6??JJOO&1&;&;	 :   +448R8[8[8a8a8k8kkkkBB'6??EEJJ6??JJOO'116@@FFKK6@@KKPP6KK ,,[9Q )Pr-   c           
     v   | j                   r| j                   j                         j                         D ]q  \  }}|j                  }|s|j                  | j
                  v s/| j                   |= | j
                  |j                     j                  |j                        }| j                  |      rt        d      d}d}|| j                  v rD| j                  |   }|j                  r)| j                  ||j                  d      }|j                  }|)| j                  ||j                  |j                          | j#                  ||j                  |j                   |j$                  j&                  |j$                  j(                  |j*                  j(                  |       t | j                   ryy)a{  
        Adds Q/DQ ops to tensors that have been marked for quantization by op quantizers.
        Only operates on tensors that want to use the quantization parameter initializers from an upstream tensor.
        For example, a Transpose node's output tensor will typically want to use the same quantization parameter
        initializers as the Transpose node's input.
        zBQuantization parameter shared mode is not supported for weight yetNrR  )rn   ri  r   r;   r2   r   rP   r3   is_input_a_initializerrU  r   rI   r7  rK   rP  r*  r+  rg  rV   r>   r   rW   )r?   r   rk  quant_providerrB  converted_qparam_initsrK   tensor_paramss           r.   r  z,QDQQuantizer._quantize_sharing_param_tensors  s    &&,0,D,D,I,I,K,Q,Q,S([!,!@!@!n&?&?4C[C[&[00=&*&>&>~?X?X&Y&j&j&00'O 22;?()mnn .2*+/("d&>&>>(,(@(@(M(22595U5U +]-D-Dj62 4A3U3U0-599')C)C_E\E\ BB'+66+33288BB288==2==BB0A -T &&r-   c           	     .   | j                   j                         D ]w  \  }}|| j                  v r| j                  ||       t	        || j
                  j                               }| j
                  j                  |       | j                  |   j                  }|j                  dk(  rt        |j                  t              s.t        dt        |j                         d|j                        t!        |      }t"        j$                  j'                  d|j(                  g|g||j                        }n?|j                  dv r|j*                  t"        j,                  j.                  t"        j,                  j0                  t"        j,                  j2                  hv rt5        d|j*                   d      |j(                  |j6                  |j8                  g}t!        |      }|j:                  ;t"        j$                  j'                  d	||g||j:                  | j<                  
      }nIt"        j$                  j'                  d	||g|| j<                        }nt5        d|j                  d      | j
                  j?                  |       z y)zq
        Adds DQ ops (or Cast) for bias tensors that have been marked for quantization by op quantizers.
        CastUnexpected type z for input=)r   to)NDequantizeLinearzUnexpected quantize type z for DequantizeLinear.Nrv  r#  )r$  zUnexpected operator type r   ) ro   r   r   quantize_bias_staticr   r   r   remove_initializerrH   	node_typer   r>   intr   r   r2   r   r   r   r%  q_name
node_qtyper   r   BFLOAT16r   RuntimeErrorr*  r+  r<   rx   r9  )r?   r   r   initquant_valuer3   r2  inputss           r.   r  z#QDQQuantizer._quantize_bias_tensors  sP    %)$9$9$?$?$A IyD444%%i;	4::+A+A+CDDJJ))$/229=FFK$$. "$..#6#&6tDNN7K6LKXaXlXlWo$pqq.y9	#{{44 ''(K"~~  5   &&*DD))$$,,$$--$$**. 
 ')B;CYCYBZZp'qrr%,,k.D.DkFYFYZ.y9	##/#';;#8#8*"!(--#11 $9 $L $(;;#8#8*"!#11 $9 $L #%>{?T?T>WWX#YZZJJ-c %Br-   c                >    || j                   v xs || j                  v S r9   )rn   ro   r   s     r.   is_tensor_quantizedz QDQQuantizer.is_tensor_quantizedJ  s#    d666^+I^I^:^^r-   c                   | j                   j                  |      }|y| j                  j                  |      ry| j                  j	                  |      }| j
                  s|sy|r| j                  j                  ||      n|}|r#| j                  j                  |      }|d   d   }t        |j                        }t        ||      \  }	}|	st        j                  d| d| d|        yd|fS )a  
        Checks if a given tensor is configured to be quantized per-channel. If so, also returns the channel axis.

        ORT only supports per-channel quantization on static weights (i.e., ONNX initializers). If the user did not provide
        tensor quantization overrides for this tensor, then the value of self.per_channel determines if the weight
        is to be quantized per-channel.

        Params:
            tensor_name: The name of the tensor to check.
            default_axis: The default channel axis. This method checks if the normalized axis is within bounds.
                          Can be overridden via the extra_options 'QDQOpTypePerChannelSupportToAxis'
                          and 'TensorQuantOverrides'.
            op_type: Optional, defaults to None. The operator type that is the only consumer of this weight.
                     Used to access the extra option 'QDQOpTypePerChannelSupportToAxis'.
        Returns:
            A tuple (is_per_channel, axis) in which the first element indicates whether the tensor is
            quantized per-channel and the second element is the channel axis.
            The returned axis is only None if the tensor is not per-channel or the axis is out of bounds.
        r   r   r<   zAxis z is out-of-range for weight 'z' with rank T)initializersrq   r   has_per_tensor_overrideshas_per_channel_overridesr   rw   get_per_channel_overridesr   dimsr   r   r   )
r?   r   r   op_typeweight_initializerhas_per_chan_overridesr<   per_chan_overridesweight_rank
axis_valids
             r.   r   z"QDQQuantizer.is_tensor_per_channelM  s    2 "..22;?%&&??L!%!<!<!V!VWb!c(>Zat;;??Vgs!!%!<!<!V!VWb!c%a(0D,112)$<
DOOeD6)F{mS_`k_lmnTzr-   c           
        || j                   v r#| j                   |   j                  j                  S | j                   |j                     j                  j                  }t        || j                  j                               }t        |      }| j                   |j                     j                  |j                        j                  }t        || j                  j                               }t        |      }| j                  ||||j                        \  }	}
}}}}t        ||	|
|t        j                   |j"                  dkD  rdnd||      }t%        |dd      | j                   |<   |	S )z]
        Quantized the bias. Zero Point == 0 and Scale == Input_Scale * Weight_Scale
        r	   r   N)ry  r|  )r   rH   r{  rC   r*  r   r   r   r!   r2   rP   r3   quantize_bias_static_implrE   r   r   r:  r   r[   )r?   r   r   weight_scale_nameweight_scale_initializerr   input_scale_nameinput_scale_initializerr   quantized_bias_namequantized_bias_scale_namequantized_bias_zp_namebias_scale_datary  r|  rB  s                   r.   rw  z!QDQQuantizer.quantize_bias_static~  se    000++I6??FFF !44Y5J5JKTT__#/0A4::CYCYC[#\ ,-EF $$Y%9%9:KKIL_L_`kk 	 #//?AWAWAY"Z+,CD **9k<QZQ_Q_`	
%" )%"** %%)At!	
 /FoW[]a.b  +""r-   c                   |d   }|d   }|d   }|j                  d      }|t        |j                        dk(  s!|t        |j                        dk(  sJ d       t        |j                        t        |j                        k(  sJ d       |d	z   |z   }|d
z   |z   }	t        j                  j                  |||j                  |j                         j                               }
| j                  j                  |
       |j                  t        j                  k(  rt        j                  j                  }nS|j                  t        j                   k(  rt        j                  j"                  }nt%        d|j                   d|      t        j                  j                  |	||j                  |j                         j                               }| j                  j                  |       t'        ||
      S )z
        Creates and returns scale and zero-point initializers for the given quantization params. The initializers are
        named:
            - {param_name}_zero_point{init_name_suffix}
            - {param_name}_scale{init_name_suffix}
        rW   rV   r   r<   r	   r   zWrong scale/zp shapesz,Scale and zero-point must have the same rank_zero_point_scalezUnexpected dtype=z for param_name=)rq   r   r   r   r   make_tensorraveltolistr   r   r   r   float32r   r   r   float16r   rU  rU   )r?   
param_namer<  init_name_suffixrW   rV   zero_point_typer<   zero_point_namer*  init_zprG  
init_scales                r.   r7  z(QDQQuantizer._make_scale_zp_initializers  s    ",/
W%&|4'++F3 S%5%:LS-2	#"	# 
 5;;3z'7'7#88h:hh8$}47GG(*-==
 ++))_j.>.>
@P@P@R@Y@Y@[
 	

""7+;;"**$#//55J[[BJJ&#//77J0=Mj^\]][[,,ZU[[RWR]R]R_RfRfRhi


"":.%j'::r-   c                   | j                   || j                   vrt        j                  d| d       y| j                   |   }t        |t              st        dt        |       d|d      | j                  ||j                        }|j                  r| j                  ||j                  d      nd}t        |||j                        S )a  
        Create and returns all scale/zero_point initializers for a given tensor. If the tensor is converted
        to a different quantization type, this function creates two pairs of zp/scale initializers. Otherwise,
        only one pair of zp/scale initializers is created.
        Nz$Quantization parameters for tensor:"z" not specifiedrt   for r   rR  )r   r   r   r   rG   r   r   r7  rH   rI   rY   rK   )r?   r   rq  original_initsconverted_initss        r.   rj  z/QDQQuantizer._make_tensor_scale_zp_initializers  s     ##+{$BZBZ/ZLL?}O\]00=-)=>.tM/B.C5WXYZZ99+}G]G]^ && ,,[-:Q:QS]^ 	 ,NO]MoMoppr-   c                b   | j                   }d|v r|d   j                  }d|v rd|v r|d   |d   }}n|t        j                  j                  k(  rt        ||j                  d         \  }}n|j                  d|j                  d         }|j                  d|j                  d         }|j                  d| j                        }|j                  d	d
      }	t        ||	|      \  }
}t        |||
||| j                        \  }}t        |j                         |j                         |      S )z
        Calculates quantization parameters (scale/zero-point) given a tensor's min/max range and optional
        user-provided overrides.
        r   rV   rW   r	   r   r   r   	symmetricr   F)r   r  rW   rV   r   )r   r:   r   r   FLOAT8E4M3FNr   avg_stdrq   range_valueis_activation_symmetricr   r   min_real_ranger   squeeze)r?   tensor_dataquant_overridesr   zerorV   r   r   r  r   qminqmaxs               r.   calc_quant_paramszQDQQuantizer.calc_quant_params  s*   
 **
?*(6BBJo%,/*I),79Q%D4++8881*k>Q>QRS>TUKD%"&&v{/F/Fq/IJD"&&v{/F/Fq/IJD'++K9U9UVI*..~uEL0,bklJD$*4tT9dNaNabKD%!T\\^5==?_ijjr-   c                   | j                   i S | j                          i }| j                   D ]  }| j                   |   }t        |t              st	        dt        |       d|d      | j                  j                  |i       }| j                  ||      }d}d}d|v r)| j                  ||d         }|d   j                  d      }t        |||      ||<    |S )z
        Calculates quantization parameters (scale/zero-point) for all tensors in the graph using each tensor's min/max range
        and optional user-provided overrides.
        Nrt  r  r   )default_valconvert
recv_nodes)r   adjust_tensor_rangesr   r   r   r   r   get_per_tensor_overridesr  rq   rG   )r?   r   r   tdr  rH   rI   rK   s           r.   r   z$QDQQuantizer.calc_graph_quant_params  s    
 %I!!# --K##K0Bb*-"248*E+PQ RSS"99RRS^lnRoO--b/BHI#' O+ 222y7QR	'6y'A'E'El'S$/CHiYm/n, .  #"r-   c                   i }| j                   j                         D ]0  \  }}t        || j                  j	                               }|s.t        |      }t        |j                        }|j                  t        j                  u }|r| j                  n| j                  }| j                  j                  |      rb| j                  |   }	d|	d   v r|	d   d   j                  }t        |   }
d|	d   v }|sQt!        t#        j$                  |	d   d   |
      t#        j$                  |	d   d   |j&                        |      ||<   ng }g }|	D ]]  }|j)                  t#        j$                  |d   |
             |j)                  t#        j$                  |d   |j&                               _ |	d   d   }t+        ||      \  }}|st-        d|j.                   d	| d
|       t!        t#        j$                  |      t#        j$                  |      ||      ||<   | j                  j1                  |i g      }	d|	d   v r|	d   d   j                  }|	d   j1                  d|j2                        }|du}|xs |r| j5                  |      n| j6                  }|	d   j1                  d|      }|	d   j1                  d| j8                        }d}d}|sSt;        |j=                         |||| j>                  |	d   j1                  d      |	d   j1                  d            \  }}nt+        ||      \  }}|st-        d|j.                   d	| d
|       |}|j                  |   }g }g }tA        |      D ]  }|jC                  ||      }|	r|t        |	      k  r|	|   ni }t;        |jE                         |||| j>                  |j1                  d      |j1                  d            \  }}|j)                  |       |j)                  |        t#        jF                  |      }t#        jF                  |      }t!        ||||      ||<   3 |S )ze
        Returns quantization parameters (scale/zero_point/quant_type) for all initializers.
        r   r   r<   rW   r   rV   r  zWeight z# has a per-channel axis with value z  that is out-of-bounds for rank )rW   rV   r   r<   Nr  r   r   r   )r   r  rmin_overridermax_override)$rn   r   r   r   r   r!   r   r   r:   r$   r*   r   r   r   overrides_scale_zpr   r   r   r   r   r  r   rU  r   rq   r<   is_weight_symmetricr  r   r   flattenr  r   taker  r   )r?   r   r   rk  r   initializer_datainitializer_rank	is_weightr   	overrideszp_dtyper   zero_points_listscales_listchan_overrideschannel_axisis_axis_validnorm_channel_axisis_symmetric_defaultis_symmetricr   rW   rV   channel_countr   per_channel_datachannel_overrideschannel_zero_pointchannel_scales                                r.   r  z+QDQQuantizer._calc_initializer_quant_params  s   
 >@(,(@(@(F(F(H$K&{DJJ4J4J4LMK4[A"#3#9#9: $//3E3L3LLI.7**T=R=RJ **==kJ 77D	9Q</!*1l!;!G!GJ/
;!'9Q<!7%7I#%88IaL,Fh#W hhy|G'<>N>T>TU#-8'4 (*$"$K*3(//9UW_0`a#**288N74KScSiSi+jk +4 $-Q<#7L7ElTd7e4M#4((%k&6&6%77Z[gZh i66F5GI 
 8J#%88,<#= hh{3#-.	8'4  3377bTJIy|+&q\,7CC
$Q<++FK4D4DEL)5N $2 $8A((4tGcGc ! %Q<++K9MNL$Q<++ND<M<MNL,0J'+E!$=$,,. !-#'#6#6"+A,"2"26":"+A,"2"26":%!
E 4B,P`3a00$$!+"2"2!33VWcVd e22B1CE 
  1 0 6 6| D#%  }-A'7'<'<Q'M$8Aa#i.FX	!^`%8Q(..0"$%1'+':':&7&;&;F&C&7&;&;F&C95& %++,>?&&}5 .  ZZ(89


;//A%%!	0,[ )Ih #"r-   r9   )r   r1   )r   r1   r2   r1   r3   r1   )r   onnx.TensorProtorR   r  )g      ?)r   
np.ndarrayr   r  rC   r1   r   r  r   boolrR   ztuple[bool, np.ndarray | None])r'  r1   r(  r1   r)  r1   r*  r1   r+  r1   r<   
int | None)r/  r1   r0  r1   r1  r1   r*  r1   r+  r1   r<   r  )r;  r  )r   r1   r   rz  r  z
str | NonerR   ztuple[bool, int | None])r   r1   r   rB   rR   r1   ) )r  r1   r<  r   r  r1   rR   rU   )r   r1   rR   z#QDQTensorScaleZpInitializers | None)r  r   r  zdict[str, Any]rR   r   )rR   zdict[str, QDQTensorQuantParams])rR   zdict[str, QuantizationParams])&r&   r'   r(   r@   r   r   r$   r)   r   r   r   r   r   r   r   r   r  r	  r  r  r!  r-  r3  r5  rC  rP  rg  r  r  r  r  r   rw  r7  rj  r  r   r  r,   r-   r.   r^   r^      s    \&|, EIVhVsVs y:X
"Ty
/mbN-N- !N- 	N-
 "N- N- 
(N-`1@f*6 8,  -- - 	-
 - - -:  -- - 	-
 - - -. uy;*AeFAiF[
z,:\0d5.n_ #	// / 	/
 
!/b*#Z Z\$;$;-?$;SV$;	$;Lq.k.#:z#r-   r^   )6
__future__r   r   dataclassesr   enumr   typingr   numpyr   r   onnx.numpy_helperr   r   r   base_quantizerr
   r   	calibrater   quant_utilsr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   registryr"   r$   r0   r7   rB   rG   rU   rY   r[   r^   r,   r-   r.   <module>r     s   #  !       & = !     . )    # #    f f f&    * * * f f f$E#= E#r-   