
    gC                         d dl Zd dlZd dlmZ d dlZej                  ej                  ej                  ej                  dZd Z G d d      Z	 	 	 	 	 ddZy)	    N)AutoTokenizer)ztorch.int32ztorch.int64ztorch.float32ztorch.float16c                     ddl m} |j                  | j                         |j                         |j	                         |j                         z  |j                  j                         y )Nr   )cudart)cudar   
cudaMemcpydata_ptrelement_sizenelementcudaMemcpyKindcudaMemcpyDeviceToDevice)dstsrcr   s      k/var/www/openai/venv/lib/python3.12/site-packages/onnxruntime/transformers/models/phi2/inference_example.pycuda_memcpyr      sK    
S\\^+66	    c                   d    e Zd Zd Zd Zd Zdej                  dedefdZ		 ddZ
dd	Zd
 Zd Zy)ORTGeneratorc                     || _         d| _        d| _        d| _        d| _        d| _        d| _        d| _        i | _        y )N    P   i   r   F)	onnx_decoder_path	num_heads	head_size
num_layersmax_sequence_length	device_iduse_cuda_graphuse_traced_inputsstatic_inputs_map)selfdecoder_paths     r   __init__zORTGenerator.__init__    sF    !-#' #!&!#r   c                    || j                   v ry t        j                  d      }t        j                  d| j                        }i }t        j                  |dft        j
                  |      |d<   t        j                  dgt        j                  |      |d<   t        j                  |dgz  t        j
                  |      |d<   t        j                  dgt        j
                  |      |d	<   || j                  | j                  | j                  f}t        | j                        D ]m  }t        j                  ||t        j                  
      }|j                  d| |j                         d| |j!                         j                         i       o t        j                  |ddft        j                  |      |d<   || j                   |<   y )Ncpur      )dtypedevice	input_idsr   step	seqlens_ktotal_sequence_lengthr'   r&   	past_key_past_value_   logits)r   torchr'   r   zerosint32tensorint64r   r   r   ranger   float16update
contiguousclone)r    
batch_size
cpu_devicecuda_device	static_iocache_shapeicaches           r   append_static_inputsz!ORTGenerator.append_static_inputs+   su   ///\\%(
ll64>>:	!&j!_EKKXc!d	+!LL!EKKT	&!&jA3.>ekkZe!f	+-2\\1#U[[Yc-d	)*!4>>43K3KT^^\t'AKKKu}}UE	!ou/?/?/A[QRPSCTV[VaVaVcVnVnVpqr ( $kk:q%*@^ij	(-6z*r   c           	      
   | j                   rt        j                  nt        j                  | _        t        j
                  |d   | j                  t        j                        }t        j
                  |d   | j                  t        j                        }|j                  \  }}| j                  xr+ || j                  v xr | j                  xr | j                   | _        | j                  s1t        j
                  dg| j                  t        j                        n| j                  |   d   }| j                  s4t        j
                  |dgz  | j                  t        j                        n| j                  |   d   }t        ||j!                  d      j#                  d      j%                  t        j                               | j                  s:t        j
                  dgt        j                  d      t        j                        n| j                  |   d	   }||d<   |j'                         |j'                         d
}	| j(                  r|j'                         |	d<   | j                  r)|j'                         |	d<   |j'                         |	d	<   |	d= | j                  r| j*                  nd}
| j                  rd|| j,                  |
| j.                  fn|| j,                  |
| j.                  f}| j                  st1        | j2                        D ]  }t        j4                  || j                  | j                        }| j                  sE|	j7                  d| |j'                         d| |j9                         j'                         i      n#|	j7                  d| |j'                         i        nwt1        | j2                        D ]_  }|	j7                  d| | j                  |   d|    j'                         d| | j                  |   d|    j'                         i       a t        j4                  ||d| j                  | j                        }d|j'                         i}| j                  s| j                  rd|| j,                  || j.                  fn|| j,                  || j.                  f}t1        | j2                        D ]  }t        j4                  || j                  | j                        }| j                  s7|j7                  d| |j'                         d| |j'                         i      n#|j7                  d| |j'                         i        |	|fS )Nr(   r,   attention_maskr   r)   r*   r%   r$   r+   )r(   rD      r-   r.   past_r/   r0   present_key_present_value_present_)use_fp16r1   r7   float32torch_dtyper4   r'   r3   shaper   r   use_buffer_share	packed_kvr   r5   r   sumsubtor9   use_stepr   r   r   r6   r   r2   r8   r:   )r    encodings_dictr(   rD   r;   sequence_lengthr)   r*   total_seq_lengthinputspast_seq_length
past_shaper@   pastr0   outputspresent_shapepresents                     r   get_initial_inputs_and_outputsz+ORTGenerator.get_initial_inputs_and_outputsB   s   ,0MM5==u}}LL!<T[[X]XcXcd	n5E&Ft{{bgbmbmn&/oo#
O  #t555#%%# NN"	 	 )) LL!T[[D''
3F; 	 )) LLqc)$++U[[Q''
3K@ 	
 	I~11!488;>>u{{KL )) LL!U\\%%8L''
34KL 	
 . #--/,779

 ==!__.F6N"+"6"6"8F;.>.I.I.KF*+'(6:6K6K$22QR ~~ 
DNNOT^^Ldnnot~~N 	 %%4??+{{:dkkIYIYZ  >> MMYqc?DOO4EUVTWGXZ^ZdZdZfZqZqZs"tu%sT__5F'GH , 4??+#A3)?)?
)KiXYWZO)\)g)g)i%aS)4+A+A*+MP[\][^N_+`+k+k+m , Z%[_[k[klV..01$$ >> JP $../4>>R 
 4??+++mDKKtO_O_`
  >> NN's+W-?-?-A^TUSVCWY`YkYkYmn !8A39K9K9M(NO , wr   modelrW   r[   c           
      Z   |j                         }d }|j                         D ]  \  }}|j                  ||j                  j                  |j                  j                  dk(  rdn|j                  j
                  t        t        |j                           t        |j                        |j                                |j                  } |j                         D ]K  }|j                  }	| j                  rd|	v r||	j                  dd         }|j!                  |	|j                  j                  |j                  j
                  | j"                  rt$        j&                  nt$        j(                  t        |j                        |j                                ||	   }|j!                  |	|j                  |j                  dk(  rdn|j
                  | j"                  rt$        j&                  nt$        j(                  t        |j                        |j                                N |S )Nr$   r   )namedevice_typer   element_typerM   
buffer_ptrr]   rZ   )
io_bindingitems
bind_inputr'   typeindexpt_to_npreprr&   tuplerM   r   get_outputsra   rN   replacebind_outputrJ   npr7   rK   )
r    r_   rW   r[   re   r'   kvoutputra   s
             r   apply_io_bindingzORTGenerator.apply_io_binding   s   %%'
LLNDAq!!HHMM xx}}5!188>>%d177m4AGGn::< "  XXF # '')F;;D$$d):4<<	6:;&& !hhnn04"**2::. zz| '  DM&& &#);;%#7aV\\04"**2::. zz| '  *. r   c                    || _         t        j                         }d|_        d|_        || _        | j                   dk\  rd| j                   | j
                  dfnd}t        j                  | j                  ||g      | _        t        j                         | _
        t        j                  j                         r t        j                  d| j                         nt        j                  d      | _        || _        || _        || _        || _        t'        j(                  d	d
      | _        d| j*                  _        y )N   r   CUDAExecutionProvider)r   enable_cuda_graphCPUExecutionProvider)sess_options	providersr   r$   zmicrosoft/phi-2T)trust_remote_codez[PAD])r   ortSessionOptionslog_verbosity_levellog_severity_levelr   InferenceSessionr   sess
RunOptionsror1   r   is_availabler'   rJ   rN   rO   rS   r   from_pretrained	tokenizer	pad_token)	r    r   rJ   rN   rO   rS   r   rz   eps	            r   create_sessionzORTGenerator.create_session   s    #))++,(*+', ~~" %DNNY]YlYl&mn' 	
 (()?)?lgifjk	..">Cjj>U>U>Well64>>:]b]i]ijo]p  0" &667H\`a#* r   c                    | j                  |      \  }}|d   j                         }|j                  \  }}	|	}
t        j                  || j
                  t        j                        }|rdd l}g }d}|
|k  r| j                  | j                  ||      }|rj                         }|j                          |r| j                  r| j                  j                  dd       | j                  j                  || j                         | j                  r3| j                  j                  d| j                  rt!        |      nd       d}n&| j                  j                  || j                         |j#                          |r$j                         }j%                  |z
         |d   d d d	d d f   }t        j&                  |d	
      }||z  | j(                  j*                  k(  }|j-                  || j(                  j*                        j/                  |dg      }t        j0                  ||gd	
      }t        j2                  |      rn|
dz  }
|j5                  t        j6                        |d<   | j                  r4t9        | j:                  |   d   |d          | j:                  |   d   |d<   | j<                  rwt        j>                  |
dz
  g| j
                  t        j@                        |d<   | j                  r4t9        | j:                  |   d   |d          | j:                  |   d   |d<   | j                  r|d   }|| j/                  |d      z   j5                  t        j6                        |d<   |
|d   d<   | j                  rt9        | j:                  |   d   |d          | j:                  |   d   |d<   |d   d   | j:                  |   d   d<   | j:                  |   d   |d<   nLt        j0                  |d   | j/                  |d      gd      j5                  t        j6                        |d<   |d   j                  d   dk7  rC|d   d d d dd d f   jC                         |d<   | j                  r| j:                  |   d   |d<   |d   jE                          | jF                  s_tI        | jJ                        D ]9  }| jL                  s|d|    |d| <   |d|    |d| <   ,|d|    |d| <   ; |d   j                  d   }| jL                  rd|| jN                  || jP                  fn|| jN                  || jP                  f}tI        | jJ                        D ]  }t        j                  || j
                  | jR                        }| jL                  sE|jU                  d| |jC                         d| |j                         jC                         i      n#|jU                  d| |jC                         i        |
|k  r|rItW        d| d|	 d||	z
          tW        ddd   z   ddtY        jZ                  |dd        z   d       y | j(                  j]                  |d      }|S )Nr(   r,   r   Tgpu_graph_idz-1Fr0   )dimr%   r)   r*   r+   rD   rG   r-   rH   r.   rI   rF   rE   zBatch size: z, Sequence length: z, Token num: zPrompt letency: i  zms, Token latency: ms)skip_special_tokens)/r^   r:   rM   r1   r2   r'   booltimert   r   synchronize_inputsr   r   add_run_config_entryrun_with_iobindingr   strsynchronize_outputsappendargmaxr   eos_token_idmasked_fillreshapecatallrR   r3   r   r   rS   r4   r5   r9   zero_rN   r6   r   rO   r   r   rL   r8   printrp   meanbatch_decode)r    rT   
max_lengthcuda_graph_annotation	benchmarkrW   r[   all_token_idsr;   rU   current_lengthhas_eosr   latency
prompt_runre   startendnext_token_logitsnext_tokenstokens_to_addprevious_seqlens_kr@   new_sequence_lengthr\   r]   textss                              r   generate_implzORTGenerator.generate_impl   sf   ==nM{+113&3&9&9#
O(++jEJJOG
z)..tyy&'JJ		))+&&GG00F		,,ZA&&GG00&dF\F\,A(Bbf #
		,,ZA**,iiksU{+ !( 1!R( ;,,'8bAK +t~~/J/JJG (33GT^^=X=XYaacmopbqrM!II}m&D"MM yy! aN"/"2"25;;"?F;%%D22:>{KVT_M`a&*&<&<Z&H&U{#}}!&~/A.B4;;^c^i^i!jv)) 6 6z B6 JFSYN[%)%;%;J%G%OF6N""%+K%8"'9gX<N<Nz[\<]']&a&abgbmbm&n{#5C./2)) 6 6z B; OQWXcQde*.*@*@*L[*YF;'U[\sUtuvUwD**:67NOPQR6:6L6LZ6XYp6qF23+099,-/A/A*a/PQST,"U[[/ '(
 x &&q)Q.$+H$5a!Qh$?$J$J$L!))(,(>(>z(J8(TGH%H##%((t/A>>29L:L2M1#/4;nQC<P4QQC01.5n.Eqc{+ 0 '--=&>&D&DQ&G# ~~ 
DNN4GX$dnn6I4>>Z 
 t/A#kk-SWScScdG  $~~  ".qc 2G4F4F4H"0 4gmmo6P6P6R %^^xs^W=O=O=Q,RS 0w z)P zl*=o=Nm\fix\x[yz $TGAJ%6$77J4RTRYRYZabcbdZeRfKfJggijk++Mt+Tr   c                 b    | j                   j                  |d      }| j                  |||      S )NT)padding)r   batch_encode_plusr   )r    promptr   r   rT   s        r   generatezORTGenerator.generatea  s1    99&$9O!!.*>STTr   c                 T   |\  }}||z   }i }t        j                  dd||ft         j                        j                         |d<   t        j                  ||ft         j                        j                         |d<   | j                  |||d       | j                  |||d       y )	Nr   iX  )r&   r(   rD   F)r   T)r1   randintr3   tolistonesr   )r    prompt_shape	token_numr   r;   rU   r   rT   s           r   generate_benchmarkzORTGenerator.generate_benchmarkf  s    &2#
O$y0
&+mmAuz?>[chcncn&o&v&v&x{#+0::z?6S[`[f[f+g+n+n+p'( 	>:7LX]^ 	>:7LX\]r   N)TTFFF)F)__name__
__module____qualname__r"   rB   r^   r}   r   dictrt   r   r   r   r    r   r   r   r      sT    	$7.[z&c&:&: &D &SW &R pu+2BU
^r   r   c                     t        |       j                  |||||       fd}dg}	|s ||	       |r:d}
dD ]2  }j                  |       dD ]  }||f}j                  ||
|        4 y y )Nc                     t        |       }rj                  |       j                  | d|      }t        t        |            D ]   }t	        d| |          t	        d||          " y )N)r;      )r   r   zPrompt: zTexts: )lenrB   r   r6   r   )r   example_batch_sizer   r@   	generatorr   s       r   
simple_runzrun_phi2.<locals>.simple_run  si     [**6H*I""6cQc"ds5z"A*fQi()U1X& #r   zV```python
    def print_prime(n):
    """
    Print all primes between 1 and n
    """r   )r%   rE   rv      )   i   )r   )r   r   rB   r   )onnx_model_pathrN   r   rO   rJ   rS   r   run_benchmarkr   r   r   r;   rU   r   r   s         `       @r   run_phi2r   u  s     _-IY2BIxYgh'	F 6 	&J**:6#, *O<,,\9\f,g $- ' r   )FTFFF)numpyrp   r1   transformersr   onnxruntimer}   r3   r5   rK   r7   rj   r   r   r   r   r   r   <module>r      sc      &  8888ZZZZ	S^ S^t
 )hr   