
    	g                    ~   d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dl	m
Z d dlZd dlmZmZmZmZ d dlmZ d dlmZmZ ddlmZ ddlmZ dd	lmZmZ  ej<                  d
ej>                          ej@                  e!      Z" G d d      Z# G d de#      Z$ G d de#      Z% G d de#      Z& G d de#      Z' G d de#      Z(d Z) G d d      Z*d<dZ+ G d d      Z, G d d      Z- G d  d!      Z.d" Z/d# Z0d$ Z1e!d%k(  r e1       Z2e2jf                  re"ji                  ejj                         e2jl                  Z7e2jp                  Z9ee2jt                     Z:e2jv                  r e<e2jv                        nd&Z;e2jz                  r e<e2jz                        ndZ=ej|                  j                  e9      r!e"j                  d'e9 d(        eAd'e9 d(      e2j                  r'e2j                  d)k(  re"j                  d*       d+e2_B         ej                  e7      ZFe2j                  d)k(  r" e&e2j                  e2j                  e;e=,      ZIne2j                  d-k(  r- e'e2j                  e2j                  e2j                  e:e;e=.      ZIne2j                  d/k(  r
 e$e;0      ZIne2j                  d1k(  r e%e2j                  e;2      ZIne2j                  d3k(  r|e:ej                  k(  re"j                  d4       ej                  Z:e7ZFe2j                  e2j                  d5k(  rd6ZMnd7ZMnd6ZM e(e2j                  e2j                  e2j                  eM8      ZIn eQd9e2j                          e.eFe2j                  e2j                  e2j                  eI:      ZTeTj                          eTj                  j                  e9d;       yy)=    )annotationsN)
GraphProto
ModelProto	NodeProtoTensorProto)version)quantize_matmul_4bitsquantize_qdq_matmul_4bits   )CalibrationDataReader)	ONNXModel)QuantFormatattribute_to_kwargz2%(asctime)s %(name)s [%(levelname)s] - %(message)s)formatlevelc                  (    e Zd Z	 	 d	 	 	 	 	 	 	 ddZy)WeightOnlyQuantConfigNc                    || _         || _        |rt        |      ndh| _        |rt	        |      | _        yddd| _        y)a  This is the Base class for Weight Only blockwise quantization Configuration.

        Args:
            algorithm:
                weight only quantize algorithm name.
            quant_format: QuantFormat{QOperator, QDQ}.
                QOperator format quantizes the model with quantized operators directly.
                QDQ format quantize the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
            op_types_to_quantize (optional):
                set of operator types to quantize. Default {MatMul}
            quant_axes (dict[str, int], optional):
                op:axis, which axis to quantize for an op. Default {MatMul: 0, Gather: 1}
        MatMulr   r   )r   GatherN)	algorithmquant_formatsetop_types_to_quantizedict
quant_axes)selfr   r   r   r   s        d/var/www/openai/venv/lib/python3.12/site-packages/onnxruntime/quantization/matmul_4bits_quantizer.py__init__zWeightOnlyQuantConfig.__init__    sC    ( #(AUC(<$=\d[e!.8$z*VW>X    NN)r   strr   r   r   tuple[str, ...] | Noner   "tuple[tuple[str, int], ...] | None)__name__
__module____qualname__r    r    r   r   r      s:    
 8<9=YY "Y 5	Y
 7Yr    r   c                  >     e Zd Zdej                  df	 d fdZ xZS )RTNWeightOnlyQuantConfigNc                t    |t         j                  k(  sJ d       |i }t        |   d||       || _        y)a  
        This is a class for round-to-nearest (RTN) algorithm Weight Only Quant Configuration.
        RTN is the most straightforward way to quantize weight using scale maps.

        Args:
            ratios:
                percentile of clip. Defaults to {}.
            quant_format (QuantFormat{QOperator, QDQ}, optional):
                QOperator format quantizes the model with quantized operators directly.
                QDQ format quantize the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
                Defaults to QuantFormat.QOperator.
            op_types_to_quantize (optional):
                set of operator types to quantize.
        z"RTN only supports QOperator formatNRTNr   r   r   )r   	QOperatorsuperr   ratios)r   r0   r   r   	__class__s       r   r   z!RTNWeightOnlyQuantConfig.__init__;   sM    ( {444Z6ZZ4>F%!5 	 	

 r    )r   r#   r%   r&   r'   r   r.   r   __classcell__r1   s   @r   r*   r*   :   s(      **7;	 5	 r    r*   c                  L     e Zd Zddddddej                  df	 	 	 d fdZ xZS )GPTQWeightOnlyQuantConfigNg{Gz?   FTc	                    |t         j                  k(  sJ d       t        	|   d||       || _        || _        || _        || _        || _        || _	        y)a  
        This is a class for GPTQ algorithm Weight Only Quant Configuration.
        GPTQ algorithm provides more accurate quantization but requires more computational resources.

        Args:
            calibration_data_reader:
                a calibration data reader. It enumerates calibration data and generates inputs for the original model.
            percdamp:
                percent of the average Hessian diagonal to use for dampening.
            block_size (int, optional):
                channel number in one block to execute a GPTQ quantization iteration.
            actorder (bool, optional):
                whether rearrange Hessian matrix considering the diag's value.
            mse (bool, optional):
                whether get scale and zero point with mse error.
            perchannel (bool, optional):
                whether quantize weight per-channel.
            quant_format (QuantFormat{QOperator, QDQ}, optional):
                QOperator format quantizes the model with quantized operators directly.
                QDQ format quantize the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
                Defaults to QuantFormat.QOperator.
            op_types_to_quantize (optional):
                set of operator types to quantize.
        z#GPTQ only supports QOperator formatGPTQr-   N)
r   r.   r/   r   calibration_data_readerpercdamp
block_sizeactordermse
perchannel)
r   r:   r;   r<   r=   r>   r?   r   r   r1   s
            r   r   z"GPTQWeightOnlyQuantConfig.__init__\   sh    F {444[6[[4%!5 	 	

 (?$ $ $r    )r:   zCalibrationDataReader | Noner   r#   r2   r4   s   @r   r6   r6   [   sA     AE **7;/%!=/% 5/% /%r    r6   c                  H     e Zd Zdddej                  ddf	 	 	 d fdZ xZS )HQQWeightOnlyQuantConfigr7      r   Nc                    |t         j                  k(  sJ d       t        |   d|||       || _        || _        || _        y)a  
        This is a class for HQQ algorithm Weight Only Quant Configuration.
        HQQ algorithm quant weight without needing calibrate data.

        Args:
            block_size (int, optional):
                channel number in one block to execute a HQQ quantization iteration.
            bits (int, optional):
                how many bits to represent weight.
            axis (int, optional):
                0 or 1. which axis to quantize. https://arxiv.org/pdf/2309.15531.pdf
            quant_format (QuantFormat{QOperator, QDQ}, optional):
                QOperator format quantizes the model with quantized operators directly.
                QDQ format quantize the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
                Defaults to QuantFormat.QOperator.
            op_types_to_quantize (optional):
                set of operator types to quantize.
            quant_axes (dict[str, int], optional):
                op:axis, which axis to quantize for an op. Default {MatMul: 0, Gather: 1}
        z"HQQ only supports QOperator formatHQQr   r   r   r   N)r   r.   r/   r   r<   bitsaxis)r   r<   rF   rG   r   r   r   r1   s          r   r   z!HQQWeightOnlyQuantConfig.__init__   sT    : {444Z6ZZ4%!5!	 	 	
 %		r    )r   r#   r   r$   r2   r4   s   @r   rA   rA      s;      **7;9=' 5' 7' 'r    rA   c                  T     e Zd Zdddej                  ddf	 	 	 	 	 	 	 	 	 d fdZ xZS )DefaultWeightOnlyQuantConfigr7   FNc                d    t         |   d|||       || _        || _        d| _        || _        y)a  
        This is a class for weight only affine quantization configuration.

        Args:
            block_size (int, optional):
                channel number in one block to execute an affine quantization iteration.
            is_symmetric (bool, optional):
                whether quantize weight symmetrically.
            accuracy_level (int, optional):
                Accuracy level of the 4-bit quantized MatMul computation.
                Refer to the MatMulNBits contrib op's 'accuracy_level' attribute for details.
                (https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#commicrosoftmatmulnbits)
            quant_format (QuantFormat{QOperator, QDQ}, optional):
                QOperator format quantizes the model with quantized operators directly.
                QDQ format quantize the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
                Defaults to QuantFormat.QOperator.
            op_types_to_quantize (optional):
                set of operator types to quantize.
            quant_axes (dict[str, int], optional):
                op:axis, which axis to quantize for an op. Default {MatMul: 0, Gather: 1}
        DEFAULTrE   rB   N)r/   r   r<   is_symmetricrF   accuracy_level)r   r<   rL   rM   r   r   r   r1   s          r   r   z%DefaultWeightOnlyQuantConfig.__init__   sC    < 	%!5!	 	 	
 %(	,r    )
r<   intrL   boolrM   
int | Noner   r#   r   r$   r2   r4   s   @r   rI   rI      sW     "%) **7;9='-'- '- #	'- 5'- 7'- '-r    rI   c                  2     e Zd Z	 	 	 d fd	Zd Zd Z xZS )NVAWQWeightOnlyQuantConfigc                t   	 ddl }ddlm} || _         || _        	 ddlm} || _        	 dd	lm}m	}	 || _        |	| _	        t        | -  dt        j                  dd       | j                   j                  | j                   j                  j!                         rdnd      }
| j#                  |||ddd|
ddddd      }|| _        || _        y# t        $ r t	        d       t        d      dw xY w# t        $ r t	        d       t        d      dw xY w# t        $ r t	        d
       t        d      dw xY w)a=  
        Configuration for the nvidia_awq quantization method.

        Args:
            tokenizer_dir (str): pathof the tokenizer dir.
            dataset_name (str): Name of the dataset.
            cache_dir (str): Directory for caching.
            calibration_method (str): calib method for nvidia_awq.
        r   N)
DataLoaderzfError: The 'torch' library is required but not installed. Please install it using 'pip install torch'.z torch is not installed. Exiting.)load_datasetzlError: The 'datasets' library is required but not installed. Please install it using 'pip install datasets'.z#datasets is not installed. Exiting.)
AutoConfigAutoTokenizerztError: The 'transformers' library is required but not installed. Please install it using 'pip install transformers'.z'transformers is not installed. Exiting.
nvidia_awqrE   cudacpu    r   i   TFr7   )dataset_name
model_name	cache_dir
calib_size
batch_sizer<   deviceuse_fp16use_buffer_shareadd_past_kv_inputsmax_calib_rows_to_loadadd_position_ids)torchtorch.utils.datarT   ImportErrorprintdatasetsrU   transformersrV   rW   r/   r   r   QDQra   rY   is_availableget_calib_inputsr:   calibration_method)r   tokenizer_dirr\   r^   rp   rg   rT   rU   rV   rW   ra   calib_inputsr1   s               r   r   z#NVAWQWeightOnlyQuantConfig.__init__   sm   "
	L3DJ(DO	O- ,D		S>(DO!.D 	"$!%	 	 	
 ""TZZ__-I-I-K6QVW,,%$"##&! - 
 (4$"4o  	Lx @AtK		L  	O~ CD$N		O  	S G GHdR		Ss!   C C2 D !C/2!D!D7c	           	     b   | j                   }	|}
|}t        |t              r<|	j                  |||	j                        }
|	j                  |||	j                        }|
j                         |j                         d}|rJ|j                         j                  d      dz
  }|j                  |dk(  d       |j                         |d<   |r|r|	j                  n|	j                  }|
j                  \  }}|j                  }|j                  |j                  |j                  z  }}t!        |j"                        D ]q  }|	j%                  |||r|nd|||      }|	j%                  |||r|nd|||      }|j'                  d| d|j                         d| d	|j                         i       s |S )
N)ra   dtype)	input_idsattention_maskr   r   position_idszpast_key_values.z.keyz.value)rg   
isinstancelisttensorint64
contiguouslongcumsummasked_fill_float16float32shapemax_position_embeddingsnum_key_value_headshidden_sizenum_attention_headsrangenum_hidden_layerszerosupdate)r   configinput_ids_argattention_mask_argrd   ra   rb   rc   rf   rg   ru   rv   inputsrx   torch_dtyper`   sequence_lengthmax_sequence_length	num_heads	head_sizeipast_key
past_values                          r   make_model_inputz+NVAWQWeightOnlyQuantConfig.make_model_input5  s    

!	+mT*]6UI"\\*<VSXS^S^\_N #--/,779

 )..077;a?L%%n&91=%1%<%<%>F>"+3%--K*3//'J"("@"@**""f&@&@@ !I 6334 ;;+;'!% '  #[[+;'!% ) 
 *1#T2H4G4G4I*1#V4j6K6K6M# 50 r    c                4   | j                   }| j                  }| j                  }|j                  |d|d      }|j                  |d|d      }|j	                  ddi       |j
                  |_        ||k  sJ d       d|v r& |ddd	
      j                  t        |            }d}n d|v r |dd      }d}nt        d| d      ||   d | }|j                  |ddd|      }|j                  |      }|d   }|d   }| j                  } |||d      } |||d      }t        |j                        t        |j                        k(  sJ t        |      t        |      k(  sJ ||z  }g }t        |      D ]   \  }}|j!                  |       ||dz
  k(  s  n g }t        |      D ]   \  }}|j!                  |       ||dz
  k(  s  n t#        d| dt        |       dt        |       d       g }t        |      D ]t  } ||    }!||    }"| j%                  ||!|"|
|||	|      }#|#j'                         D $%ci c]$  \  }$}%|$|%j)                         j+                         & }#}$}%|j!                  |#       v t#        dt        |       d       |S c c}%}$w )NT)use_auth_tokenr^   trust_remote_code	pad_tokenz[PAD]z8calib size should be no more than max_calib_rows_to_loadcnncnn_dailymailz3.0.0train)namesplitarticlepilezmit-han-lab/pile-val-backup
validation)r   textz	dataset "z" not supportedpt)return_tensorspadding
truncation
max_lengthru   rv   F)r`   shuffler   z/
--Quantize-Script-- number_of_batched_samples=z, batch-input-ids-list-len=z, batched_attention_mask=
z0
--Quantize-Script-- number of batched inputs = )rV   rW   rU   from_pretrainedadd_special_tokens	eos_tokenr   selectr   
ValueErrorbatch_encode_plustorT   lendataset	enumerateappendrj   r   itemsrZ   numpy)&r   r\   r]   r^   r_   r`   r<   ra   rb   rc   rd   re   rf   auto_configauto_tokenizerrU   r   	tokenizerdataset2columnbatch_encodedbatch_encoded_input_idsbatch_encoded_attention_maskdata_loadercalib_dataloader_input_idscalib_dataloader_attention_masknumber_of_batched_samplesbatched_input_idsidxdatabatched_attention_maskbatched_inputs_listr   ru   rv   r   
input_nametorch_tensors&                                         r   ro   z+NVAWQWeightOnlyQuantConfig.get_calib_inputsv  sH     oo++((,,tyTX - 
 #22tyTX 3 
	 	$$k7%;<'11	33o5oo3L #O'QXXY^_uYvwHF|##$AVHFyoFGGF#KZ0!33T4DU_ 4 
 &((0"/"<'45E'F$ oo%01HU_in%o"*5(Z+
' -556#>]>e>e:ffff-.#6U2VVVV$.*$<!"#=>IC$$T*0145 ?
 "$"#BCIC"))$/0145 D
 	>?X>Y Z((+,=(>'??XY\]sYtXuuwy	

 !01A)!,I3A6N**"  	F ^d]i]i]kl]kAY\j,"2"2"4":":"<<]kFl&&v. 2" 	A#FYBZA[[]^_""	 ms   ;)J)r   ./cacheawq_lite)r%   r&   r'   r   r   ro   r3   r4   s   @r   rR   rR      s!     %N5`?Ba#r    rR   c                P    t        |t        j                  | |z        z        | k(  S N)rN   npceil)val1val2s     r   is_divisibler     s$    tbggdTk**+t33r    c                  ^    e Zd Z	 	 ddZe	 	 	 d	 	 	 	 	 d	d       Zed        Z	 d
dZddZy)HQQWeightOnlyQuantizerc                    || _         y r   r   r   r   s     r   r   zHQQWeightOnlyQuantizer.__init__       r    Nc                   dd l |dddddn|}|d   |d   |d	   |d
   f\  }}}	}
| j                  rj                  nj                  }| j	                  |      }|j	                  |      }|j	                  |      }|ffd	}d}t        |
      D ]  }j                  ||z  |z         j                  |d   |d         }||z
  |z  } |||z
  |      }j                  |||z
  |z  z
  |d      }||	z  }t        j                  ||z
        j                               }|r t        |t        j                  |d             ||k  r|} n ~~~~||fS )Nr   gffffff?g      $@g)\(?   )lp_normbetakappaitersr   r   r   r   c           
        |dk(  rLj                  |       j                  j                  j                  j	                  |       d|z  z
        z  S j                  |       j                  j                  j                  j	                  |       d|z  j                  j	                  |       dz   |dz
        z  z
        z  S )Nr         ?g:0yE>)signnn
functionalreluabspow)xr   prg   s      r   	shrink_opz:HQQWeightOnlyQuantizer.optimize_weights.<locals>.shrink_op  s    Avzz!}uxx':':'?'?		!sUYz@Y'ZZZzz!}uxx':':'?'?IIaLC$J%))EIIaL4<OQRUVQV2W#WW(  r    g     @r   TrG   keepdim   )rg   is_cudar   r   r   r   roundclampmeanfloatr   rj   r   )r{   scalezeromin_maxrG   
opt_paramsverboser   r   r   r   rt   w_fr   
best_errorr   w_qw_rw_ecurrent_errorrg   s                       @r   optimize_weightsz'HQQWeightOnlyQuantizer.optimize_weights  s|    	R\RdcD2Njt
y!vww	'
#ue "(U]]iiwwu~!( 	 
uA++cEkD0177
GAJOC:&CC#It,C::cS3Y%$77dD:QDEMD!%))C#I"6";";"=>Ma-34z)*
  c3d{r    c           	        | j                   d   |j                   d   k(  r|j                  }| j                  } |dv rA| j                         dz  |z  }t        |      D ]  }| dd xxx ||d |   ||z  z  z  ccc  y t	        d      )Nr   )   rB      r  zOnly 2,4,8 bits are supported.)r   Telement_sizer   NotImplementedError)pack_tensorori_int_tensorrF   compress_ratiojs        r   pack_on_row_fast_248bitz.HQQWeightOnlyQuantizer.pack_on_row_fast_248bit  s    Q>#7#7#::+--N%--K9(557!;tCN>*AB>!2C^2C#DQR#TT + &&FGGr    c                <   dd l }|j                         }	|	j                  }
||
|   |z  z
  |z  }|dk(  r+|j                  j                  j                  |	d|fdd      }	n,|j                  j                  j                  |	ddd|fdd      }	|	j                  }|-|r+|dk(  r|	j                  d|g      n|	j                  |dg      }	|du r#|	j                         |	j                         }}d}n,|	j                  |d      d   }|	j                  |d      d   }d|z  dz
  }d}||g}|||z
  z  j                  d	
      }||z
  }|dk(  j                         j                         dkD  r|||dk(  <   ||z  j                  d	
      }| |z  }|r|j                  |      }|r| j                  |	||||      \  }}|j                  |	|z  |z         j                  |d   |d         }|j                  |      j                         }d|z  }|dk(  r+|j                  |d   d      }|j                  |d   d      }n*|j                  d|d         }|j                  d|d         }~	~~||j                  |j                         |j                  |j                         fS )Nr   r   constantrw   FTr   r  g     @)max)r{   r   r   r   rG   r   )rg   r   r   r   r   padreshapeminr  r   sumitemr   r   rN   r   rt   )r   r{   rF   channel_wise
group_sizeoptimize
round_zerorG   rg   weight	ori_shapepad_lenr   _min_maxmax_vmin_vr   r   min_max_axisr   r   s                         r   quantize_internalz(HQQWeightOnlyQuantizer.quantize_internal(  s    	LL		$* <<
J19XX((,,Va\:qQFXX((,,VaAw5GUVWF ":>!)V^^R$45&..ZdfhYiJjF 5 vzz|$DH::4:6q9D::4:6q9D4!%. $+&--#-6d{A""$))+a/.3L*+\)00S09Euu};;t$D //vUQU_fmq/rKE4 kk&5.4/066wqz71:Nkk% $$&e19MM%(B/E<<a"-DMM"eBi0E<<E"I.DD$EHHV\\*DGGFLL,AAAr    c                ~	   |j                   dk(  rt        d      ddl}t        j	                  d|j
                   d       |j                  d   }t        ||      \  }}|t        j	                  d       |gS t        j                  j                  |      }t        |j                        d	k7  rt        j	                  d
       |gS |j                  |      }|j                  j                         r|j                         }| j!                  |j"                  | j$                  j&                  | j$                  j(                        \  }	}
}|	j+                         }	|
j+                         }
|j+                         }|j-                  |	j                  d   |	j                  d   d	z  f|j.                  |	j0                        }| j3                  ||	| j$                  j&                         |
j5                         j7                         }|j5                         j7                         }|j9                  d      }|j9                  d      }|j                  \  }}| j$                  j(                  }|d	z  }||z   dz
  |z  }|j9                  |||      }t        j                  j;                  |j5                         j7                               }|j
                  dz   |_        |j                  D ].  }|j
                  |k(  s|j                  j=                  |        n t        j                  j;                  |      }|j
                  dz   |_        |j>                  jA                  ||g       |j                  d   |j
                  |j
                  g}t        j                  j;                  |      }|j
                  dz   |_        |j>                  jA                  |g       |jC                  |j
                         i }|j                  \  }}||d<   ||d<   | j$                  j&                  |d<   | j$                  j(                  |d<   t        jD                  jF                  	 d||jH                  d   g|j
                  r|j
                  dz   nddd|}t        j	                  d|j
                   d       |gS )  
        Target node:        QOperator node:            QDQ nodes:
        MatMul              MatMulNBits                DeQuantizeLinear -> MatMul
        Gather              GatherBlockQuantized       Gather, Gather, Gather (optional) -> DequantizeLinear
        If the node is target node with fp32 or fp16 const weight, quantize the weight to int4 and
        return the new nodes.
        If QOperator format, return the corresponding QOperator nodes.
        If QDQ format, return the corresdponging QDQ nodes.
        Gather (quantized data) + Gather (scales) + Gather (optional, zero points) -> DequantizeLinear is
        not supported yet because Gather does not support int4 data.
        r   z/Gather quantization is not supported yet in HQQr   Nstart to quantize  ...r   2MatMul doesn't have const weight. Skip to quantizer  )MatMul weight is not 2D. Skip to quantize)rF   r  )rt   ra   rw   _Q4_scales_zero_pointsKNrF   r<    com.microsoftr   outputsr   domaincomplete quantization of MatMulNBits)%op_typer  rg   loggerinfor   inputget_initializeronnxnumpy_helperto_arrayr   r   
from_numpyrY   rn   r   r  r   rF   r<   r}   r   uint8ra   r  rZ   r   r  
from_arrayremoveinitializerextendr   helper	make_nodeoutput)r   nodegraph_stackrg   input_bb_pbbs_graphb_arrayb_array_torchquant_weight_torchscales_torchzero_points_torchpacked_torchscaleszero_pointsrowscolsr<   	blob_sizek_blocksb_quantr7  scales_tensorinput_names	zp_tensorkwargsmatmul_q4_nodes                              r   quantizezHQQWeightOnlyQuantizer.quantizei  s    <<8#%&WXX(489**Q-(+>h<KKLM6M##,,T2w}}"KKCD6M((1::""$)..0M>B>T>TOO$++"2"2t{{?U?U ?U ?
;L*; 0::<#..0-88:{{%%a(*<*B*B1*E*JK++%,, # 

 	$$\3Et{{GWGWX!!#))+'++-335#!))"-"((
d[[++
!O	:%)j8#++D(IF##..|/?/?/A/G/G/IJyy5(^^EzzW$%%e, $
 ))44V<!YY2##Wm$<=zz!}gllM4F4FG%%00=	^3	##YK09>>*]]
dss))v#{{55|..
[[^$&*iiU"R"
 
 	/		{$?@r    )r   rA   )r   NF)r   z	list[int]rG   rN   r   zdict | None)rB   T@   TTr   rE  r   rF  list[GraphProto]returnzlist[NodeProto])	r%   r&   r'   r   staticmethodr   r  r   r\  r(   r    r   r   r     sw    (  "&2 	2
 2  2 2h 	H 	H fg?BB[ r    r   c                    t        t        |      dz
  dd      D ]/  }||   }|j                  D ]  }|j                  | k(  s||fc c S  1 y)Nr   rw   r!   )r   r   r@  r   )r   
graph_pathgidgraphr{   s        r   r8  r8    sR    S_q("b13''F{{d"u}$ ( 2
 r    c                      e Zd ZddZddZddZedd       Zedd       Zedd       Z	e	 	 	 	 	 	 	 	 	 	 dd       Z
ddZdd	Zy
)DefaultWeightOnlyQuantizerc                    || _         y r   r   r   s     r   r   z#DefaultWeightOnlyQuantizer.__init__  s	    r    c           
     z   t        |j                        dk7  rt        d      |j                  \  }}| j                  j                  }||z   dz
  |z  }| j                  j
                  t        j                  k(  r|dz  }||z  }||z
  }|dkD  rt        j                  |d|fdfd      }t        j                  |||fd      }	t        j                  ||dz   dz  z  d      }
t        j                  ||z  |j                        }t        |	|||
|||| j                  j                         nt        j                  ||z  dz   dz  d      }	t        j                  ||z  dz   dz  d      }
t        j                  ||f|j                        }t        |	|||
|||| j                  j                         |	||
fS )	z24b quantize fp32 weight to int4 using C++ kernels.r  z9Current int4 block quantization only supports 2D tensors!r   r   )r   r   r  r=  rt   )r   r   r   r   r<   r   r   r.   r   r  r   rt   r	   rL   r
   )r   
fp32weightrR  rS  r<   rU  rT  padded_rowsr  packed
zero_pointrP  s               r   int4_block_quantz+DefaultWeightOnlyQuantizer.int4_block_quant  s    z A%XYY%%
d[[++
:%)j8;;##{'<'<<"aI"Z/K!D(G{VVJ!Wv0F
S
 XXtXy9IF$8a<A*=">gNJXXthz7G7GHF!
FJ
D$PTP[P[PhPh XXtd{Q14GDF4(?Q#61"<GLJXXx.j6F6FGF%
FJ
D$PTP[P[PhPh 
++r    c                
   | j                   j                  rt        j                  nt        j                  }|j
                  d   }t        ||      \  }}|t        j                  d       |gS t        j                  j                  |      }t        |j                        dk7  rt        j                  d       |gS | j                  |      \  }}	}
| j                   j                  t         j"                  k(  r[t        j                  j%                  ||j&                  dz         }t        j                  j%                  |	|j&                  dz         }nut        j(                  j+                  |j&                  dz   ||j                  |j-                         d      }t        j                  j%                  |	|j&                  d	z         }|j
                  D ].  }|j&                  |k(  s|j
                  j/                  |        n |j0                  j3                  ||g       g }| j                   j                  t         j"                  k(  ri|j
                  d
   |j&                  |j&                  g}| j                   j                  sdt        j                  j%                  |
|j&                  dz         }|j5                  |j&                         |j0                  j3                  |g       i }|j                  \  }}||d<   ||d<   d|d<   | j                   j6                  |d<   | j                   j8                  | j                   j8                  |d<   t        j(                  j:                  	 d||j<                  d
   g|j&                  r|j&                  dz   nddd|}|j5                  |       |S |j&                  |j&                  g}|j&                  dz   g}|j
                  d
   |d
   g}|j<                  d
   g}| j                   j                  st        j(                  j+                  |j&                  dz   ||	j                  |
j-                         d      }|j5                  |j&                         |j0                  j3                  |g       d
| j                   j6                  d}t        j(                  j:                  	 d|||j&                  r|j&                  dz   ndd|}t        j(                  j;                  d|||j&                  r|j&                  dz   nd      }|j3                  ||g       |S )z
        Quantize weight B of MatMul node to int4.
        Currently only support 2D constant matrix and axis 0 blockwise quantization.
        r   r%  r  r&  r'  r(  _DQ_Q4T
_DQ_scalesr   r)  r*  r+  rB   rF   r<   rM   r,  r-  r.  _output_DQ_zero_points)rG   r<   )r   r/  r   r   
_matmul_Q4r2  )DequantizeLinear)r   rL   r   INT4UINT4r7  r8  r5  r6  r9  r:  r;  r   r   ro  r   r   r.   r>  r   rB  make_tensortobytesr?  r@  rA  r   r<   rM   rC  rD  )r   rE  rF  qtyperG  b_tensorb_graph	b_ndarrayrm  rP  rQ  rV  rW  r7  output_nodesrX  rY  rZ  rR  rS  r[  dq_input_namesdq_output_namesmatmul_input_namesmatmul_output_names	dq_kwargsdq_nodematmul_nodes                               r   quantize_matmulz*DefaultWeightOnlyQuantizer.quantize_matmul  sk   
 %)KK$<$<  +BSBS**Q-+G[A'KKLM6M%%..x8	y1$KKCD6M&*&;&;I&F#;;##{'<'<<''2268==5;PQG --88QZAZ[Mkk--hmmh.Fy`f`n`n`prvwG --88Q]A]^M]]EzzW$$$U+ #
 	""G]#;<;;##{'<'<<::a=',,8J8JKK;;++ --88hmmVdFde	""9>>2##**I;7F"JD$F3KF3KF6N#';;#9#9F< {{))5+/;;+E+E'(![[22"Q(*.))TYY&& N /8 5 &llM,>,>?N&||i78O"&**Q-1C!D#';;q>"2;;++ KK33MM$55ufllKL_L_Lacg	 %%inn5##**I;7!"$++2H2HIIkk++"%'-1YYTYY)B	
 G ++//)+15TYY-	 0 K + 67r    c                   t        j                  | dd      }t        j                  | dd      }t        j                  t        j                  |      t        j                  |      kD  ||      }|dz  }t        j                  |dk(  d| |z        j                         j                  dd      j                  t         j                        }||fS )Nr   TrG   keepdimsg       r   i   )	r   r  r  wherer   r   clipastypeint8)r   max_valmin_valabs_maxr   quantized_slices         r   quant_slice_symmetricz0DefaultWeightOnlyQuantizer.quant_slice_symmetricO  s    &&A5&&A5((266'?RVVG_<gwO$((5A:q$,?EEGLLRQRSZZ[][b[bc%%r    c                    t        j                  | j                  dd      d      }t        j                  | j	                  dd      d      }||z
  dz  }t        j
                  |dk(  d| |z        j                         j                  dd      j                  t         j                        }t        j
                  |dk(  d| |z  |z         j                         j                  dd      j                  t         j                        }|||fS )Nr   Tr  r   g      .@r     )
r   minimumr  maximumr  r  r   r  r  r=  )r   r  r  r   rn  r  s         r   quant_slice_asymmetricz1DefaultWeightOnlyQuantizer.quant_slice_asymmetricZ  s    **TXX1tX<a@**TXX1tX<a@7"d*XXeqj!gX-=>DDFKKArRYYZ\ZbZbc
((5A:q$,2KLRRTYYZ[]_`gghjhphpqz11r    c                    | j                  d      }t        |      dz  dk7  rt        j                  |d      }|ddd   dz  |ddd   dz  dz  z  }|j	                  d      S )	z2Pack int8 data to int4 and store in uint8 ndarray.rw   r  r   Nr  r   rB   r=  )r  r   r   r   r  )r   	data_flatquant_data_int4s      r   pack_int8_to_int4z,DefaultWeightOnlyQuantizer.pack_int8_to_int4e  so     LL$	y>A"		)Q/I$SqS>C/Yqt!t_s5Jq4PQ%%g..r    c                   d}| j                   |   }d}t        | j                         D ]  \  }}||k  r||z  }||kD  s||z  } ||z   dz
  |z  }	t        | j                         }
|	|
|<   | j                  |||f      }t	        j
                  ||	|f| j                        }|rt	        j
                  |||fd      }n4t	        j
                  |||fd      }t	        j
                  ||	|fd      }t        d||      D ]  }t        ||z   |      }|dd||ddf   }|rt        j                  |      \  }}nt        j                  |      \  }}}||dd||ddf<   ||z  }||dd||dz   ddf<   |r{dd||dz   ddf<    t        j                  |      }d}|st        j                        }|j                  |
      }|||fS )zXQuantize ndarray data to int4 using numpy, return (quantized data, scales, zero points).r   rj  r  r=  r   N)r   r   rz   r  r   r   rt   r   r  rg  r  r  r  )r   quantize_axisr<   rL   mknr   dimrU  scales_shapedata_reshaperP  quant_data_int8zero_point_int8end_idxslicequantized_slice_int8scale_slicezero_point_slice_int8r
  r  zero_point_int4s                          r   quantize_ndarrayz+DefaultWeightOnlyQuantizer.quantize_ndarrayo  s    JJ}%

+FAs= S]"S	 , 
NQ&:5DJJ'&.]#||Q1I.1h*$**= hh1ay?O hh1ay@O hh8Q'7wGO q!Z(A!j.!,G AgIq1E4N4d4dej4k1$k /EEeL I$k3H 0DOAqy!O,ZA(3F1a1q5k1$%5J1A; 12 )$ 5FFW8JJ?[O-77r    c                   | j                   j                  t        j                  k(  sJ d       | j                   j                  rt
        j                  nt
        j                  }|j                  d   }t        ||      \  }}|t        j                  d       |gS t        j                  j                  |      }t        |j                         }| j                   j"                  j%                  dd      }	| j                   j&                  }
|	|k  r|	| k\  sJ d       |
dk\  r|
dz
  |
z  dk(  sJ d       |	|z   |z  }	| j)                  ||	|
| j                   j                        \  }}}|j                  D ].  }|j*                  |k(  s|j                  j-                  |        n t        j.                  j1                  |j*                  d	z   ||j                   |j3                         d
      }t        j                  j5                  ||j*                  dz         }|j*                  |j                  d   |j*                  g}|j6                  j9                  ||g       | j                   j                  st        j.                  j1                  |j*                  dz   ||j                   |j3                         d
      }|j;                  |j*                         |j6                  j9                  |g       	 t        j.                  j=                  |d      }||	|
d}t        j.                  j@                  	 d||jB                  d   g|j*                  r|j*                  d	z   nddd|}|gS # t>        $ r d}Y dw xY w)z,Quantize weight data of Gather node to int4.z0Gather only supports QOperator format currently.r   z4Gather doesn't have const weight. Skip quantization.r   r   z&Invalid quantize axis for Gather node.   z#Invalid block size for Gather node.r'  Tr(  r)  rG   )gather_axisr  r<   r,  r-  r.  )GatherBlockQuantized)"r   r   r   r.   rL   r   rw  rx  r7  r8  r5  r6  r9  r:  r;  r   r   r   getr<   r  r   r?  rB  ry  rz  r>  r@  rA  r   get_node_attr_valuer   rC  rD  )r   rE  rF  r{  data_argdata_tensorprotodata_graphprotodata_ndarray	data_rankr  r<   quantized_datarP  rQ  r7  quantized_data_tensorprotoscales_tensorprotorX  zp_tensorprotor  rZ  gather_q4_nodes                         r   quantize_gatherz*DefaultWeightOnlyQuantizer.quantize_gather  s0   {{'';+@+@@tBtt@$(KK$<$<  +BSBS::a=,;Hk,R)/#KKNO6M((112BC**+	..228Q?[[++
y(]yj-HrJrrHRj1n
%Ba%GoJooH&2i?.2.C.C-T[[5M5M/
+ %**EzzX%%%,,U3 +
 &*[[%<%<!!E)5,2D2DnF\F\F^`d&
" "..99&BRBWBWZcBcd166

1GYG^G^_##**,FHZ+[\{{''![[44 %%6v||[M`M`MbdhN ~223''../?@	++99$GK
 '*$
 .."
[[^$&*iiU"R"
 
 %  	K	s   5 M, ,M:9M:c                f   t         j                  d|j                   d       |j                  dk(  r| j	                  ||      }nH|j                  dk(  r| j                  ||      }n&t         j                  d|j                   d       |g}t         j                  d|j                   d       |S )r"  r#  r$  r   r   zUnsupported operator z1 for weight only quantization. Skip quantization.r1  )r5  r6  r   r4  r  r  error)r   rE  rF  resultss       r   r\  z#DefaultWeightOnlyQuantizer.quantize  s     	(489<<8#**4=G\\X%**4=GLL0>opqfG/		{$?@r    N)r   rI   )rk  znpt.ArrayLiker`  )tuple[np.ndarray, np.ndarray, np.ndarray]r^  )r   
np.ndarrayr`  ztuple[np.ndarray, np.ndarray])r   r  r`  r  )r   r  r`  r  )
r   r  r  rN   r<   rN   rL   rO   r`  z0tuple[np.ndarray, np.ndarray, np.ndarray | None])r%   r&   r'   r   ro  r  ra  r  r  r  r  r  r\  r(   r    r   rg  rg    s     ,DWr & & 2 2 / / 585858 58 	58
 
:58 58n> @r    rg  c                       e Zd Z	 	 ddZddZy)NVAWQWeightOnlyQuantizerc                    || _         y r   r   r   s     r   r   z!NVAWQWeightOnlyQuantizer.__init__  r   r    c                   	 ddl m} t        j                  d       | j                  j                  } ||| j                  j                  |      }t        j                  d       |S # t        $ r t        d       t        d      dw xY w)	z
        Perform nvidia_awq quantization using ModelOpt's int4 quantize function.

        Args:
            model (ModelProto): The ONNX model to quantize.

        Returns:
            ModelProto: The quantized ONNX model.
        r   )r\  zlPlease ensure that the 'modelopt' package is installed. Please install it using pip install nvidia_modelopt.zXmodelopt is not installed. Please install it using pip install nvidia_modelopt. Exiting.Nz#Starting nvidia_awq quantization...)rp   r:   "Completed nvidia_awq quantization.)	modelopt.onnx.quantization.int4r\  ri   rj   r5  r6  r   r:   rp   )r   modelquantize_int4rr   quantized_models        r   quantize_awqz%NVAWQWeightOnlyQuantizer.quantize_awq	  s    	Q 	9: {{:: (#{{==$0
 	89+  	~ j		s   A) )!B
N)r   rR   )r  ModelProto | strr`  r   )r%   r&   r'   r   r  r(   r    r   r  r    s    *!r    r  c            	      x    e Zd ZdZdddddej
                  dddf		 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d
dZddZd Zd Z	d	 Z
y)MatMul4BitsQuantizera4  
    Target node:        QOperator node:            QDQ nodes:
    MatMul              MatMulNBits                DeQuantizeLinear -> MatMul
    Gather              GatherBlockQuantized       Gather, Gather, Gather (optional) -> DequantizeLinear

    Perform 4b quantization of constant weights for target nodes.
    If algo_config.quant_format is QOperator:
      - nodes are replaced by the corresponding QOperator nodes.
      - quantized weights are stored in the contrib ops.
    If algo_config.quant_format is QDQ:
      - the quantized weight is stored in a standard onnx node. For MatMul, it is DequantizeLinear. For Gather,
        it is the three Gathers, one for quantized data, one for scales and one for optional zero points.
      - The nodes are replaced by the corresponding QDQ nodes.
      - currently Gather is not supported in QDQ because Gather does not support int4 yet.
    Note:
      - for quantized gather, the memory usage of "DequantizeLinear + Gather" is the same as the original Gather
        during runtime. Therefor it is not recommended.
    r7   FNc                j   |g }t        |t              rt        t        j                  |            n
t        |      | _        t        |t              r|nd | _        || _        || _        || _	        t        |      | _        |rt        |      nd | _        d | _        |
t        ||||||	      }
|
| _        |
j                   dk(  rt#        | j                        | _        y |
j                   dk(  rt%        | j                        | _        y |
j                   dk(  rt'        | j                        | _        y y )Nr<   rL   rM   r   r   r   rD   rK   rX   )ry   r"   r   r9  loadr  
model_pathr<   rL   rM   r   nodes_to_excludenodes_to_includenode_quantizerrI   algo_configr   r   rg  r  )r   r  r<   rL   rM   r  r  r   r   r   r  s              r   r   zMatMul4BitsQuantizer.__init__B  s    #!4>uc4JYtyy/0PYZ_P`
#-eS#9%t$(, #$4 59I$4 5t"6%)-)%9%K '  E)"89I9I"JD""i/"<T=M=M"ND""l2":4;K;K"LD 3r    c                   g }|d   }|j                   D ]  }|j                  D cg c]R  }|j                  t        j                  j
                  k(  s'|j                  t        j                  j                  k(  r|T }}t        |      rVi }|j                  D ]  }|j                  t        j                  j
                  k(  r9|j                  |j                         |j                  | j                  |      i}n|j                  t        j                  j                  k(  rTg }	|j                  D ]4  }
|j                  |
       |	j                  | j                  |      g       6 |j                  |	i}nt        |      }|j                  |        t        j                   j"                  |j$                  |j&                  |j(                  fd|j                  i|}g }|j                  | j*                  v r't,        j/                  d|j                   d       |g}n| j0                  r|j                  | j0                  v s"|j$                  | j2                  j4                  v r| j6                  j9                  ||      }n&t,        j/                  d|j                   d       |g}|j                  |        |j;                  d       |j                   j                  |       |j=                          |S c c}w )Nrw   r   zexclude to quantize z$ as specified by nodes_to_exclude...zskip to quantize r$  rE  )rE  	attributetyper9  AttributeProtoGRAPHGRAPHSr   r   gr   _process_subgraphgraphsrA  r   r   rB  rC  r4  r7  rD  r  r5  r6  r  r  r   r  r\  
ClearFieldpop)r   rF  	new_nodesre  rE  attrgraph_attrsrZ  kvvaluesubgraph	out_nodess               r   r  z&MatMul4BitsQuantizer._process_subgraphk  sv   	BJJD !NN*D99 3 3 9 99TYY$J]J]JdJd=d *  
 ; NNDyyD$7$7$=$==#**4662"ii)?)?)LMd&9&9&@&@@ "(,H'..x8!LL$*@*@*M)NO )4 #ii//5MM"% + {{,,LL$**dkk@D		MS IyyD1112499+=abc!F	''DII9N9N,N 0 0 E EE //88{K	/		{$?@!F	Y'K N 	 

)$Ss   AK=c           	     h   i }d| j                   | j                  rdndd}| j                  j                  j                  j                  D ]_  }|j
                  dv st        |j                  D cg c]  }| j                  j                  |      du ! c}      rQ|||j                  <   a |S c c}w )z3Generate weight only quant configuration for nodes.rB   symasym)rF   r  schemer   N)
r<   rL   r  re  rE  r4  allr7  r8  r   )r   q4_node_configtemplate_config_q4rE  r   s        r   _generate_q4_node_configz-MatMul4BitsQuantizer._generate_q4_node_config  s    //#00ef

 JJ$$**//D||z)4::V:aDJJ66q9TA:VW0BN499- 0  Ws   0$B/
c                ^     fd}i } j                    j                   |d<    j                         } j                  j                  }t        j                  d| d       |dk(  r\ddlm}  j                  j                  |d	<    |d j                   j                  n j                  j                  |d
| _
        n|dk(  rddlm}  j                  j                  |d<    j                  j                  |d<    j                  j                  |d<    j                  j                  |d<    j                  j                   |d<   d|d<    |       } |d j                   j                  n j                  j                  ||d| _
        t        j                  d| d       y)u  4b quantize a model with RTN or GPTQ algorithm. Please refer to
        https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_weight_only.md
        for more details on weight only quantization using Intel® Neural Compressor.
        c               3  z   K   t        j                  j                  j                        } | D ]  }|d f 
 y wr   )copydeepcopyr  r:   )data_readerr   r   s     r   inc_dataloaderz<MatMul4BitsQuantizer.int4_quant_algo.<locals>.inc_dataloader  s5     --(8(8(P(PQK#Dj  $s   8;NrM   zstart to quantize model with z algorithm...r,   r   )rtn_quantizer0   )r  weight_configr9   )gptq_quantizer;   	blocksizer=   r>   r?   rw   	n_samples)r  r  
dataloaderz$complete quantization of model with z algorithm.r(   )rM   r  r  r   r5  r6  .neural_compressor.adaptor.ox_utils.weight_onlyr  r0   r  r  r  r;   r<   r=   r>   r?   )r   r  rZ  weight_only_node_configr   r  r  r   s   `       r   int4_quant_algoz$MatMul4BitsQuantizer.int4_quant_algo  s   	!
 *'+':':F#$"&"?"?"A$$..	3I;mLMS#//66F8% )-)Ddoo$**JZJZ5 DJ
 & T!%!1!1!:!:F:"&"2"2"="=F;!%!1!1!:!:F: ,,00F5M#'#3#3#>#>F< "$F;')J& )-)Ddoo$**JZJZ5% 	DJ 	:9+[QRr    c                |   | j                   j                  dv rD| j                  j                         g}| j                   j                  t
        j                  k(  r| j                  j                  dd       | j                   j                  t
        j                  k(  sd| j                   j                  v r{| j                  j                         }|D ]\  }|j                  dv s|j                  dk  s"t        j                  d       | j                  j                  |j                  d       ^ | j                  |       | j                  j!                          y | j                   j                  dk(  rt        j#                  d	       | j$                  j'                  | j(                  | j                  j                  n| j(                        | _        t        j#                  d
       t+        | j                        | _        | j                  j!                          y 	 t-        j.                  d       dd l}t        j:                  |j<                        t        j:                  d      k\  sJ d       | j?                          y # t0        $ r)}t3        j4                  | d       t7        d      |d }~ww xY w)N)rD   rK   r-  r   r   )Nzai.onnxr,     zThe opset of the input model is under 21 and doesn't support int4 data type. Force to update it to opset 21, but the generated model may not be a valid model.rX   z%Processing nvidia_awq quantization...r  neural_compressor.zLneural-compressor is not correctly installed. Please check your environment.r   z2.3.2zGRequire neural-compressor >= 2.3.2 to support weight only quantization!) r  r   r  re  r   r   r.   set_opset_importrm   r   opset_importr0  r   r5  warningr  clean_initializersr6  r  r  r  r   	importlibimport_module	Exceptionloggingr  RuntimeErrorr  parse__version__r  )r   rF  r	  opseter  s         r   processzMatMul4BitsQuantizer.process  s'   %%);;::++-.K ,,0E0EE

++OQ?,,?8tO_O_OtOtCt#zz668)E||'<<QSASp 

33ELL"E * "";/JJ))+''<7 KK?@,,99$(OO$;

  DJ KK<="4::.DJJJ))+''(;< %==!2!>!>?7==D  YXY    "  1g&"bs   #J	 		J;$J66J;)r  r  r<   rN   rL   rO   rM   rP   r  zlist[str] | Noner   r#   r   r$   r  zWeightOnlyQuantConfig | None)rF  r_  )r%   r&   r'   __doc__r   r.   r   r  r  r  r  r(   r    r   r  r  .  s    , "%)-1 **7;9=48'M'M 'M 	'M
 #'M +'M 5'M 7'M 2'MR.`-S^/#r    r  c                &    | j                         dv S )N)true1)lower)r  s    r   ort_convert_str_to_boolr  
  s    ;;=M))r    c                D    | j                  d      \  }}|t        |      fS )N:)r   rN   )skeyr  s      r   parse_key_value_pairr     s!    JCE
?r    c            
        t        j                  d      } | j                  ddd       | j                  ddd       | j                  d	d
dt        d       | j                  ddt        g dd       | j                  ddt        d       | j                  dd
dddt
        dd
gd       | j                  dd
t        d       | j                  ddd
d !       | j                  d
"       | j                  d#d$t        d
g d%&       | j                  d'd$t        d
d()       | j                  d*d+t        d+d,gd-       | j                  d.t        d$d/d0gd12       | j                  d3t        d$d
d45       | j                  d6d7      }|j                  d8t        d9d:;       |j                  d<t        d
d=>       |j                  d?t        d
d@dAgdBC       |j                  dDt        dEdF;       | j                         S )GNa
  Blockwise int4 quantization for MatMul 2D weight matrices.

A weight matrix is partitioned into into blocks, where each block is a
continguous subset inside each column. Each block is quantized into a
set of 4b integers with a scaling factor and an optional offset.
)descriptionz--input_modelTzPath to the input model file)requiredhelpz--output_modelzPath to the output model filez--block_sizeFr[   zBlock size for quantization)r#  defaultr  r$  z--quant_methodr%  )r%  hqqrtngptqrX   uW   the algorithm used to quantize weight, 
rtn and gptq leverage Intel® Neural Compressor)r%  r  choicesr$  z--bitsrB   z#the target bits to represent weight)r%  r  r$  z--symmetric?zWIndicate whether to quantize the model symmetrically, symmetric is not supported by hqq)r#  r%  constnargsr  r)  r$  z--accuracy_levelzAccuracy level of the 4-bit quantized MatMul computation. Refer to the MatMulNBits contrib op's 'accuracy_level' attribute for details (https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#commicrosoftmatmulnbits).)r#  r  r$  z-vz	--verbose
store_true)r#  action)r   z--nodes_to_exclude+zBSpecify the nodes to be excluded from quantization with node names)r,  r  r#  r%  r$  z--nodes_to_includezKSpecify the specific nodes to be included from quantization with node names)r,  r  r#  r$  z--quant_formatr.   rm   zQuantFormat {QOperator, QDQ}QOperator format quantizes the model with quantized operators directly.QDQ format quantize the model by inserting DeQuantizeLinear before the MatMul.z--op_types_to_quantizer   r   zPop_types_to_quantize {MatMul, Gather}. Operators to quantize. Default is MatMul.)r  r,  r)  r$  z--quant_axeszKey-value pairs in op_type:axis_to_quantize separated by space.Specify the axis to quantize for an op. Default {MatMul:0, Gather:1}Example: --quant_axes MatMul:0 Gather:1)r  r,  r#  r$  rX   z-Arguments specific to nvidia_awq quantizationz--calib_dataset_namer   z/Name of the calibration dataset for nvidia_awq.)r  r%  r$  z--tokenizer_dirzPath of the tokenizer dir.)r  r#  r$  z--calibration_methodawqawq_clipz<Support two options, awq implementation and weight clipping.)r  r#  r)  r$  z--cache_dirr   z%Cache directory for calibration data.)
argparseArgumentParseradd_argumentrN   r"   r  set_defaultsr   add_argument_group
parse_args)parsernv_awq_configs     r   r7  r7    s   $$F $=[\
(4>]^
Spq
?g   !#<ab
$uf  	 q	   kE,O
&
Q   Z   e$Y    8$_   !2   --l<klM>	   )	   
#K   4	   r    __main__r  zfile z already existsr&  zAsymmetric is not supportted by hqq, will force to symmetric=FalseF)r<   rF   r   r   r%  r  r'  )r   r(  )r<   r   rX   zFQOperator is not applicable to nvidia_awq. overriding the value to QDQr0  r   r1  )r\   rq   r^   rp   z!Unsupported quantization method: )r  rM   r  r  r  T)rc  r_  r`  ztuple[TensorProto, GraphProto])W
__future__r   r2  r  r  r  osr   r   numpy.typingtypingnptr9  onnx.onnx_pbr   r   r   r   	packagingr   onnxruntime.capi._pybind_stater	   r
   	calibrater   
onnx_modelr   quant_utilsr   r   basicConfigINFO	getLoggerr%   r5  r   r*   r6   rA   rI   rR   r   r   r8  rg  r  r  r  r   r7  argsr   setLevelDEBUGinput_modelinput_model_pathoutput_modeloutput_model_pathr   r   tupler   pathexistsr  r  	symmetricquant_methodr
  r  r  r<   rF   quant_configrM   r.   rm   rp   calib_dataset_namerq   r^   r   r  r  quantr  save_model_to_filer(   r    r   <module>rY     s   #     	    G G  [ , ! 8   OW^WcWc d			8	$Y Y64 B0% 5 0%f(4 (V(-#8 (-Vs#!6 s#l4f  f Ro od	( (XY# Y#x*

l^ z<D||&''))t001L?C?X?X5!:!:;^i+/??t'J	ww~~'(u./?@% 12/BCC~~$++u4Z[DII&'EE!/TYYManx
 
		i	'3..%!5!
 
		e	#/EYZ			f	$0DOObvw			l	*;000NNcd&??L "".&&%/%/"%/"!+100,,nn1	
 <T=N=N<OPQQ **.... E 
MMO	KK""#4d;K r    