
    gL                     "   d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	m
Z
 dddddZdd	d
ddZd Zd5dZdededefdZdedededefdZdedededefdZdedefdZdedefdZdedededededededed ed!ed"efd#Z	 	 d6dededefd$Zdedefd%Zdedededededededed ed!efd&Z	 	 d7d'ed(edededededed ed!ed)ed*ed+efd,Z	 	 d7d'ed(ededededededed ed!ed)ed*ed+efd-Z	 	 d7d'ed(edededededed ed!ed)ed*efd.Z	 	 d7d'ed(edededededed ed!ed)ed*efd/Zdedededededededed ed!efd0Zd1 Z d8d2Z!d3 Z"e#d4k(  rd dl$Z$	  e"        yy# e%$ r!  e$jL                   ejN                           Y yw xY w)9    Nmeasure_memoryzrunwayml/stable-diffusion-v1-5zstabilityai/stable-diffusion-2z stabilityai/stable-diffusion-2-1z+stabilityai/stable-diffusion-xl-refiner-1.0)1.5z2.02.1zxl-1.0CUDAExecutionProviderROCMExecutionProviderMIGraphXExecutionProviderTensorrtExecutionProvider)cudarocmmigraphxtensorrtc                      g d} d}| |fS )N)
z.a photo of an astronaut riding a horse on marsz@cute grey cat with blue eyes, wearing a bowtie, acrylic paintingzia cute magical flying dog, fantasy art drawn by disney concept artists, highly detailed, digital paintingzdan illustration of a house with large barn with many cute flower pots and beautiful blue sky sceneryzgone apple sitting on a table, still life, reflective, full color photograph, centered, close-up productzWbackground texture of stones, masterpiece, artistic, stunning photo, award winner photozSnew international organic style house, tropical surroundings, architecture, 8k, hdrznbeautiful Renaissance Revival Estate, Hobbit-House, detailed painting, warm colors, 8k, trending on Artstationzcblue owl, big green eyes, portrait, intricate metal design, unreal engine, octane render, realisticzldelicate elvish moonstone necklace on a velvet background, symmetrical intricate motifs, leaves, flowers, 8kz*bad composition, ugly, abnormal, malformed )promptsnegative_prompts     o/var/www/openai/venv/lib/python3.12/site-packages/onnxruntime/transformers/models/stable_diffusion/benchmark.pyexample_promptsr   #   s    G COO##    c                      t        d|| |      S )NT)is_gpufuncmonitor_typestart_memoryr   )r   r   r   s      r   measure_gpu_memoryr   6   s    D|Zfggr   
model_name	directorydisable_safety_checkerc                 p   ddl m}m} dd l}|Ft        j
                  j                  |      sJ |j                         }|j                  |||      }n|j                  | d|d      }|j                  |j                  j                        |_
        |j                  d       |rd |_        d |_        |S )Nr   )DDIMSchedulerOnnxStableDiffusionPipeline)providersess_optionsonnxT)revisionr"   use_auth_tokendisable)	diffusersr    r!   onnxruntimeospathexistsSessionOptionsfrom_pretrainedfrom_config	schedulerconfigset_progress_bar_configsafety_checkerfeature_extractor)	r   r   r"   r   r    r!   r*   session_optionspipes	            r   get_ort_pipeliner8   :   s    Dww~~i(((%446*::( ; 
 +::	 ; 
 #..t~~/D/DEDN   ."!%Kr   enable_torch_compileuse_xformersc                 P   ddl m}m} ddlm}m} |j                  | |      j                  d      }|j                  j                  |       |r|j                          |rwt        j                  |j                        |_        t        j                  |j                        |_        t        j                  |j                        |_        t        d       |j                  |j                  j                         |_        |j#                  d	       |rd |_        d |_        |S )
Nr   )r    StableDiffusionPipeline)channels_lastfloat16)torch_dtyper   )memory_formatz)Torch compiled unet, vae and text_encoderTr'   )r)   r    r<   torchr=   r>   r/   tounet*enable_xformers_memory_efficient_attentioncompilevaetext_encoderprintr0   r1   r2   r3   r4   r5   )	r   r   r9   r:   r    r<   r=   r>   r7   s	            r   get_torch_pipelinerI   X   s    @,"22:72SVVW]^DIILL}L-779MM$)),	==*!MM$*;*;<9:"..t~~/D/DEDN   ."!%Kr   engine
batch_sizec                 p    |j                  d      d   j                  dd      }|  d| d| |rdz   S dz   S )	N/zstable-diffusion-sd__b _safe)splitreplace)rJ   r   rK   r   short_model_names        r   get_image_filename_prefixrW   s   sP    !'',R0889LdSXQ'(:,7AW2ee]deer   image_filename_prefixc
                     ddl m}
 t         |
      sJ t               \  }} fd}t	        |	||      }t	        |	||      } |        g }t        |      D ]  \  }}||k\  r nt        |      D ]  }t        j                         }  |gz  |gz  d      j                  }t        j                         }||z
  }|j                  |       t        d|dd       t        |      D ]"  \  }}|j                  | d	| d	| d	| d
       $   ddlm} d|||t        |      t        |      z  t!        j"                  |      ||dS )Nr   )r!   c                        d        y Nwarm up)num_inference_stepsnum_images_per_promptr   rK   heightr7   stepswidths   r   warmupz run_ort_pipeline.<locals>.warmup       Y5Xbcr         @)r]   r   guidance_scaleInference took .3f secondsrP   .jpg__version__r*   rJ   versionr`   rb   ra   rK   batch_countnum_promptsaverage_latencymedian_latencyfirst_run_memory_MBsecond_run_memory_MB)r)   r!   
isinstancer   r   	enumeraterangetimeimagesappendrH   saver*   rl   sumlen
statisticsmedian)r7   rK   rX   r`   rb   ra   rp   ro   r   memory_monitor_typer!   r   r   rc   first_run_memorysecond_run_memorylatency_listipromptjinference_startry   inference_endlatencykimageort_versions   `` ```                     r   run_ort_pipeliner   x   s    6d7888.0G_d d
 **=v|T*+>U
HLw'	6{#A"iikO:%$)!0 1J >" f  !IIKM#o5G(OGC=9:%f-5

34AaS!AaSEF . $ (( 7   ""|,s</@@$++L9/ 1 r   c
                 <    t               \  }
} fd}t        |	||      }t        |	||      } |        t        j                  d       g }t	        |
      D ]  \  }}||k\  r nt        j
                  j                          t        |      D ]  }t        j                         }  |gz  d|gz  d       j                  }t        j
                  j                          t        j                         }||z
  }|j                  |       t        d|dd       t	        |      D ]"  \  }}|j                  | d| d| d| d	       $   d
t        j                  ||t        |      t        |      z  t!        j"                  |      ||dS )Nc                        d        y r[   r   r_   s   r   rc   z"run_torch_pipeline.<locals>.warmup   rd   r   Fre   )r   r`   rb   r]   rf   r   	generatorrg   rh   ri   rP   rj   rA   rm   )r   r   rA   set_grad_enabledrv   r   synchronizerw   rx   ry   rz   rH   r{   rl   r|   r}   r~   r   )r7   rK   rX   r`   rb   ra   rp   ro   r   r   r   r   rc   r   r   r   r   r   r   r   ry   r   r   r   r   s   `` ```                   r   run_torch_pipeliner      s     /0G_d d **=v|T*+>U
H	5!Lw'	6

 {#A"iikOx*,$)"!0 1J > f  JJ""$ IIKM#o5G(OGC=9:%f-5

34AaS!AaSEF .# $	 (2 $$ ""|,s</@@$++L9/ 1 r   r"   r`   rb   ra   rp   ro   tuningc                 F   |}|r|dv r|dddf}t        j                          }t        | |||      }t        j                          }t        d||z
   d       t        d| ||      }t	        ||||||||	|
|
      }|j                  | ||j                  dd      |d	d
       |S )N)r   r      )tunable_op_enabletunable_op_tuning_enableModel loading took ri   ortExecutionProviderrR   Fr   r   r"   r   enable_cuda_graph)rx   r8   rH   rW   r   updaterU   )r   r   r"   rK   r   r`   rb   ra   rp   ro   r   r   r   provider_and_options
load_startr7   load_endrX   results                      r   run_ortr      s     $(PP (_`*abJJ	3GI_`Dyy{H	: 56h
?@5eZUklF MM$" (()<bA&<!&	
 Mr   c                 z   ddl m}m} |Nt        j                  j                  |      r/d| v r|j                  ||d d      }ng|j                  ||d      }nRd| v r(|j                  | d|d d      }|j                  |       n&|j                  | d|d	      }|j                  |       |rd |_        d |_	        |S )
Nr   ORTStableDiffusionPipelineORTStableDiffusionXLPipelinexlF)r"   r6   use_io_binding)r"   r   T)exportr"   r6   r   )r   r"   r   )
optimum.onnxruntimer   r   r+   r,   r-   r/   save_pretrainedr4   r5   )r   r   r"   r   r   r   pipelines          r   get_optimum_ort_pipeliner   0  s     ]	!::3CC! $$	 D H 2AA!$ B H
 
	/??   @ 
 	  +-== 	 > 
 	  +"&%)"Or   c
                     ddl m}
m} t         |
|f      sJ t	               } fd}t        |	||      }t        |	||      } |        g }t        |      D ]  \  }}||k\  r nt        |      D ]  }t        j                         }  |d d      j                  }t        j                         }||z
  }|j                  |       t        d|dd       t        |      D ]"  \  }}|j                  | d	| d	| d	| d
       $   ddlm} d|||t        |      t!        |      z  t#        j$                  |      ||dS )Nr   r   c                        d        y r[   r   r_   s   r   rc   z(run_optimum_ort_pipeline.<locals>.warmupq  rd   r   g        )r]   r   rf   r^   rg   rh   ri   rP   rj   rk   optimum_ortrm   )r   r   r   ru   r   r   rv   rw   rx   ry   rz   rH   r{   r*   rl   r|   r}   r~   r   )r7   rK   rX   r`   rb   ra   rp   ro   r   r   r   r   r   rc   r   r   r   r   r   r   r   ry   r   r   r   r   r   s   `` ```                     r   run_optimum_ort_pipeliner   _  s    ]d79UVWWWGd d
 **=v|T*+>U
HLw'	6{#A"iikO$) $"&0 f  !IIKM#o5G(OGC=9:%f-5

34AaS!AaSEF . $ (* 7   ""|,s</@@$++L9/ 1 r   c                 (   t        j                          }t        | |||      }t        j                          }t        d||z
   d       t        d| ||      }t	        ||||||||	|
|
      }|j                  | ||j                  dd      |dd       |S )Nr   ri   optimumr   rR   Fr   )rx   r   rH   rW   r   r   rU   )r   r   r"   rK   r   r`   rb   ra   rp   ro   r   r   r   r7   r   rX   r   s                    r   run_optimum_ortr     s     J#J	8E[\Dyy{H	: 56h
?@5iZYop%F MM$" (()<bA&<!&	
 Mr   work_dirrn   max_batch_sizenvtx_profileuse_cuda_graphc                   . t        d       ddlm}  |        |k  sJ ddlm}  ||      }|j                         }ddlm}m} ddl	m
} |j                  } || ||      \  }}}}} ||d|d|||||		      ..j                  j                  |||d
dddt        j                  j!                                .j#                         .fd}t%        |
||	      }t%        |
||	      } |        t'        d||      }g }t)               \  } }!t+        |       D ]  \  }"}#|"|k\  r nt-        |      D ]  }$t/        j.                         }%.j1                  |#gz  |!gz  dd      \  }&}'t/        j.                         }(|(|%z
  })|j3                  |)       t        d|)dd|'        t+        |&      D ]"  \  }*}+|+j5                  | d|" d|$ d|* d       $   .j7                          ddlm}, ddlm}- i d|j?                         ddd|-dd|, dd|d d!d"d#d$|d%|d&tA        |      tC        |      z  d'tE        jF                  |      d(|d)|d*|d+|S ),Nzd[I] Initializing ORT TensorRT EP accelerated StableDiffusionXL txt2img pipeline (static input shape)r   init_trt_pluginsPipelineInfo
EngineTypeget_engine_pathsr<   DDIMFr1   
output_dirverboser   r   r   framework_model_direngine_type   T)opt_image_heightopt_image_widthopt_batch_sizestatic_batchstatic_image_shapemax_workspace_size	device_idc                  D    j                  dg z  dg z  d       y Nr\   negativeT)denoising_stepsrc   runrK   r`   r   ra   rb   s   r   rc   z"run_ort_trt_static.<locals>.warmup  1    K*$zlZ&?`enr 	 	
r   ort_trtre   {   r   guidanceseedEnd2End took rh    seconds. Inference latency: rP   rj   rk   r   rJ   r*   rn   r"   z	tensorrt()r   r`   rb   ra   rK   ro   rp   rq   rr   rs   rt   r   r   )$rH   trt_utilitiesr   diffusion_modelsr   
short_nameengine_builderr   r   pipeline_stable_diffusionr<   ORT_TRTbackendbuild_enginesrA   r   current_deviceload_resourcesr   rW   r   rv   rw   rx   r   rz   r{   teardownr   rl   r*   namer|   r}   r~   r   )/r   rn   rK   r   r`   rb   ra   rp   ro   r   r   r   r   r   r   r   pipeline_infor   r   r   r<   r   onnx_dir
engine_dirr   r   rP   rc   r   r   rX   r   r   r   r   r   r   r   ry   pipeline_timer   r   r   r   trt_versionr   r   s/     ` ```                                       @r   run_ort_trt_staticr     sI     

pq /'''- )M))+J;A$$K?OPXZgit?u<Hj*&91 '!%%/
H ""
!**++- #   FE:6
 
 **=v|T*+>U
H5iZYopL.0G_w'	6{#A"iikO$,LL:% !J. % %1 %!FM !IIKM#o5G(M'#.KM?[\%f-5

34AaS!AaSEF .! $ (, 36m((*- 	; 	i}A.	
 	Z 	& 	 	 	j 	{ 	{ 	3|,s</@@ 	*++L9 	/ 	 1  	!"8!" 	^# r   c                   1 t        d       ddlm} ddlm}  |        |k  sJ ddlm}  ||      }ddlm}m	} ddl
m} |j                  } || ||      \  }}}}} ||d|d	||d
|      11j                  j                  |||dd
d
d	|       t        1j                  j!                         1j                  j!                               }|j#                  |      \  }}1j                  j%                  |       1j'                         1fd}t)        |||
      } t)        |||
      }! |        t+        d||      }"g }#t-               \  }$}%t/        |$      D ]  \  }&}'|&|k\  r nt1        |	      D ]  }(t3        j2                         })1j5                  |'gz  |%gz  dd      \  }*}+t3        j2                         },|,|)z
  }-|#j7                  |-       t        d|-dd|+        t/        |*      D ]"  \  }.}/|/j9                  |" d|& d|( d|. d       $   1j;                          dd l}0d|0j>                  d|	|tA        |#      tC        |#      z  tE        jF                  |#      | |!|dS )N][I] Initializing TensorRT accelerated StableDiffusionXL txt2img pipeline (static input shape)r   cudartr   r   r   r   r   FT)r1   r   r   r   r   r   r   r   r   r   r   
onnx_opsetr   r   r   r   static_shapeenable_all_tacticstiming_cachec                  D    j                  dg z  dg z  d       y r   r   r   s   r   rc   z#run_tensorrt_static.<locals>.warmup  r   r   trtre   r   r   r   rh   r   rP   rj   r   default)rJ   rn   r"   r`   rb   ra   rK   ro   rp   rq   rr   rs   rt   r   )$rH   r   r   r   r   r   r   r   r   r   r   r<   TRTr   load_enginesmaxmax_device_memory
cudaMallocactivate_enginesr   r   rW   r   rv   rw   rx   r   rz   r{   r   r   rl   r|   r}   r~   r   )2r   rn   r   rK   r   r`   rb   ra   rp   ro   r   r   r   r   r   r   r   r   r   r   r   r<   r   r   r   r   r   r   r  rP   shared_device_memoryrc   r   r   rX   r   r   r   r   r   r   r   ry   r   r   r   r   r   r   r   s2      ` ```                                         @r   run_tensorrt_staticr  U  s   " 

ij /'''- )M;A..KJZ-KGHj*&9<
 '!%	H !!/! ! "  H,,>>@(BRBRBdBdBfg$//0ABA%%&:; FE:6
 
 **=v|T*+>U
H5eZUklL.0G_w'	6{#A"iikO$,LL:% !J. % %1 %!FM !IIKM#o5G(M'#.KM?[\%f-5

34AaS!AaSEF .! $ (,  ?? ""|,s</@@$++L9/ 1+ r   c                 F   *+,-./ t        d       dd l}ddlm} ddlm} ,-,dz  dk7  s-dz  dk7  rt        d, d- d       |        k  sJ dd	lm} dd
l	m
*m+ *+ f	d}ddlm}  ||      } |||      .t        .j                  j!                         .j                  j!                               }|j#                  |      \  }}.j                  j%                  |       .j'                  ,-       d,-.fd	//fd}t)        |
||	      }t)        |
||	      } |        |j+                         }t-        d||      }g }t/               \  }}t1        |      D ]  \  } }!| |k\  r nt3        |      D ]  }"t5        j4                         }#r|j7                           /|!gz  |gz  d      \  }$}%r|j9                          t5        j4                         }&|&|#z
  }'|j;                  |'       t        d|'dd|%        t1        |$      D ]"  \  }(})|)j=                  | d|  d|" d|( d       $   .j?                          |d|j@                  d||tC        |      tE        |      z  tG        jH                  |      ||dS )Nr   r   r   r      zCImage height and width have to be divisible by 8 but specified as: z and .r   r   c                    	 	j                   } ||      \  }}}}} | |d|d||	      }|j                  j                  |||d
ddd|       |S )Nr   Fr   r   Tr   )r   r   r   )pipeline_classr   r   r   r   r   r   r   r   r   rK   r   r`   r   r   r   rb   r   s            r   init_pipelinez-run_tensorrt_static_xl.<locals>.init_pipeline  s     nnN^m[O
K*j*=|
 "!%)) 3#

 	%%! 3%#!$% 	& 	
 r   r   c           	      4    j                  | |d|      S Ng      @r   r   )r   r   r   image_heightimage_widthr   ra   s      r   run_sd_xl_inferencez3run_tensorrt_static_xl.<locals>.run_sd_xl_inference3  s.    ||!  
 	
r   c                  (     dg z  dg z         y Nr\   r   r   rK   r  s   r   rc   z&run_tensorrt_static_xl.<locals>.warmup>      YK*4zlZ6OPr   r   r   r   r   rh   r   rP   .pngr   r   r   rJ   rn   r"   r`   rb   ra   rK   ro   rp   rq   rr   rs   rt   r   N)%rH   r   r   r   r   r   
ValueErrorr   r   r   r   r   r   r<   r  r   r  r  r  r   r   r   rW   r   rv   rw   rx   cudaProfilerStartcudaProfilerStoprz   r{   r   rl   r|   r}   r~   r   )0r   rn   rK   r   r`   rb   ra   rp   ro   r   r   r   r   r   r   r   r   r   r  r<   r   r  rP   r  rc   r   r   r   rX   r   r   r   r   r   r   r   ry   r   r   r   r   r   r   r   r  r  r   r  s0   ` ` ```    ```                            @@@@@@r   run_tensorrt_static_xlr    s     

ij. LKa1a1 4QR^Q__depdqqrs
 	

 '''-;! !F B )M4mDHH,,>>@(BRBRBdBdBfg$//0ABA%%&:; L+zB	
 	
Q
 **=v|T*+>U
H##%J5eZUklL.0G_w'	6{#A"iikO((*$7:8MP_O`cmOmtw$x!FM'') IIKM#o5G(M'#.KM?[\%f-5

34AaS!AaSEF . $ ($  !?? ""|,s</@@$++L9/ 1+ r   c                    &' ddl m} ddlm}  |||j                  | ||      &ddlm} |k  sJ &j                         d&fd	''fd}t        |
||	      }t        |
||	      } |        &j                  j                         }t        d||      }g }t               \  }}t        |      D ]  \  }}||k\  r nt        |      D ]  }t        j                         }|r|j!                           '|gz  |gz  d	
      \  }}|r|j#                          t        j                         }||z
  } |j%                  |        t'        d| dd|        t        |      D ]0  \  }!}"| d| d| d|! d}#|"j)                  |#       t'        d|#       2   &j+                          ddlm}$ ddlm}% |d|%d|$ d||t3        |      t5        |      z  t7        j8                  |      |||dS )Nr   )initialize_pipeline)r   )rn   r   r   r`   rb   r   r   r   r   c           	      4    j                  | |d|      S r  r   )r   r   r   r`   r   ra   rb   s      r   r  z+run_ort_trt_xl.<locals>.run_sd_xl_inference  s.    ||!  
 	
r   c                  (     dg z  dg z         y r  r   r  s   r   rc   zrun_ort_trt_xl.<locals>.warmup  r  r   r   r   r  r   rh   r   rP   r  zImage saved tork   r*   r   r   r  r  )
demo_utilsr  r   r   r   r   r   r   r   r   r   rW   r   rv   rw   rx   r  r  rz   rH   r{   r   r   rl   r*   r|   r}   r~   r   )(r   rn   rK   r   r`   rb   ra   rp   ro   r   r   r   r   r   r  r   r   rc   r   r   r   rX   r   r   r   r   r   r   r   ry   r   r   r   r   r   filenamer   r   r   r  s(     ` ```                               @@r   run_ort_trt_xlr$  t  s6     /)"&&%%!	H '''FE:6	
 	
Q
 **=v|T*+>U
H'',,.J5iZYopL.0G_w'	6{#A"iikO((*$7:8MP_O`cmOmtw$x!FM'') IIKM#o5G(M'#.KM?[\%f-534AaS!AaSE

8$&1 . $ (( 36 !{m1- ""|,s</@@$++L9/ 1+ r   c                 >   dt         j                  j                  _        dt         j                  j                  _        t        j
                  d       t        j                         }t        | |||      }t        j                         }t        d||z
   d       t        d| ||      }|s2t        j                         5  t        ||||||||	|
|
      }d d d        nt        ||||||||	|
|
      }j                  | d |rdn|rdnd|dd	       |S # 1 sw Y   *xY w)
NTFr   ri   rA   rE   xformersr   r   )rA   backendscudnnenabled	benchmarkr   rx   rI   rH   rW   inference_moder   r   )r   rK   r   r9   r:   r`   rb   ra   rp   ro   r   r   r   r7   r   rX   r   s                    r   	run_torchr,    s.    $(ENN %)ENN"	5!Jj*@BVXdeDyy{H	: 56h
?@5gz:Wmn!!#'%#F $# $!
 MM$%9	\z_h&<!&	
 MI $#s    DDc                     t        j                         } | j                  dddt        dg dd       | j                  dd	dt        d
t	        t
        j                               d       | j                  dddd       | j                  dddt        t	        t        j                               dd       | j                  dddt        d d       | j                  dddt        dd       | j                  dddd        | j                  d!       | j                  d"ddd#        | j                  d$       | j                  d%ddd&        | j                  d'       | j                  d(d)t        d*g d+d,-       | j                  d.dt        d/d0       | j                  d1dt        d/d2       | j                  d3d4dt        d5d6       | j                  d7d8dt        d*d9       | j                  d:d;dt        t        d*d<      d=d>       | j                  d?d@dt        t        d*dA      dBdC       | j                  dDdEdddF        | j                  dG       | j                         }|S )HNz-ez--engineFr*   )r*   r   rA   r   z-Engines to benchmark. Default is onnxruntime.)requiredtyper   choiceshelpz-rz
--providerr   z8Provider to benchmark. Default is CUDAExecutionProvider.z-tz--tuning
store_truezsEnable TunableOp and tuning. This will incur longer warmup latency, and is mandatory for some operators of ROCm EP.)actionr1  z-vz	--versionr   z>Stable diffusion version like 1.5, 2.0 or 2.1. Default is 1.5.)r.  r/  r0  r   r1  z-pz
--pipelinez[Directory of saved onnx pipeline. It could be the output directory of optimize_pipeline.py.)r.  r/  r   r1  z-wz
--work_dirr	  z?Root directory to save exported onnx models, built engines etc.z--enable_safety_checkerzEnable safety checker)r.  r3  r1  )enable_safety_checkerz--enable_torch_compilez#Enable compile unet for PyTorch 2.0)r9   z--use_xformerszUse xformers for PyTorch)r:   z-bz--batch_sizer   )r            r  
          z)Number of images per batch. Default is 1.)r/  r   r0  r1  z--heighti   z$Output image height. Default is 512.z--widthz#Output image width. Default is 512.z-sz--steps2   zNumber of steps. Default is 50.z-nz--num_promptsz Number of prompts. Default is 1.z-cz--batch_count      z(Number of batches to test. Default is 5.z-mz--max_trt_batch_sizer9  r7  zdMaximum batch size for TensorRT. Change the value may trigger TensorRT engine rebuild. Default is 4.z-gz--enable_cuda_graphz/Enable Cuda Graph. Requires onnxruntime >= 1.16)r   )argparseArgumentParseradd_argumentstrlist	PROVIDERSkeys	SD_MODELSset_defaultsintrw   
parse_args)parserargss     r   parse_argumentsrK  !  s   $$&F
?<   Y^^%&G   a	   Y^^%&M   j   N   !$	   e4
 2	   U3
'	   U+
+8   3   2   .   /   a7   as   >   %0DKr   c                     dd l }|j                  t        j                               }|j	                         D ].  | rt        fddD              st        j                         0 y )Nr   c              3   :   K   | ]  }|j                   v   y wr  )r,   ).0xlibs     r   	<genexpr>z)print_loaded_libraries.<locals>.<genexpr>  s     )`A_A!sxx-A_s   )libculibnvr   )psutilProcessr+   getpidmemory_mapsanyrH   r,   )cuda_related_onlyrT  prP  s      @r   print_loaded_librariesr[    sF    ryy{#A}}!c)`A_)`&`#((O r   c                     t               } t        |        | j                  dk(  r| j                  dv rdt        j
                  d<   ddlm} ddlm} |j                  |      |j                  d      k(  rdt        j
                  d	<   | j                  rb| j                  dk(  r| j                  d
v r| j                  t        d      |j                  |      |j                  d      k  rt        d      t        j                  d       | j                  dk(  rdnd}t!        |d       }t        d|       t"        | j                     }t$        | j                     }| j                  dk(  r=| j                  dk(  r-d| j                  v rt        d       t'        | j(                  | j                  | j*                  d| j,                  | j.                  | j0                  | j2                  | j4                  ||| j6                  d| j                        }nt        d       t9        | j(                  | j                  | j*                  | j:                   | j,                  | j.                  | j0                  | j2                  | j4                  ||| j6                  d| j                        }nR| j                  dk(  r|dk(  rd| j                  v rdt        j
                  d	<   t=        || j                  || j*                  | j:                   | j,                  | j.                  | j0                  | j2                  | j4                  ||      }n| j                  dk(  r| j                  r)t        j>                  jA                  | j                        sJ d       t        d| d| jB                          tE        || j                  || j*                  | j:                   | j,                  | j.                  | j0                  | j2                  | j4                  ||| jB                         }n| j                  dk(  rd| j                  v rt        d!       tG        | j(                  | j                  | j*                  d| j,                  | j.                  | j0                  | j2                  | j4                  ||| j6                  d| j                        }n1| j                  dk(  rt        d"       tI        | j(                  | j                  || j*                  d| j,                  | j.                  | j0                  | j2                  | j4                  ||| j6                  d| j                  #      }nt        d$| jJ                   d%| jL                   d&       tO        || j*                  | j:                   | jJ                  | jL                  | j,                  | j.                  | j0                  | j2                  | j4                  ||'      }t        |       tQ        d(d)d*+      5 }g d,}	tS        jT                  ||	-      }
|
jW                          |
jY                  |       d d d        | j0                  d.k(  rt[        | j                  d
v        y y # 1 sw Y   1xY w)/Nr*   )r   1ORT_DISABLE_TRT_FLASH_ATTENTIONr   )rn   rk   z1.16.0!ORT_ENABLE_FUSED_CAUSAL_ATTENTION)r   r   z:The stable diffusion pipeline does not support CUDA graph.z1.16z.CUDA graph requires ONNX Runtime 1.16 or laterz%(funcName)20s: %(message)s)fmtr   r   z&GPU memory used before loading models:r   r   zNTesting Txt2ImgXLPipeline with static input shape. Backend is ORT TensorRT EP.TF)r   rn   rK   r   r`   rb   ra   rp   ro   r   r   r   r   r   zLTesting Txt2ImgPipeline with static input shape. Backend is ORT TensorRT EP.r   r   )r   r   r"   rK   r   r`   rb   ra   rp   ro   r   r   z?--pipeline should be specified for the directory of ONNX modelsz/Testing diffusers StableDiffusionPipeline with z provider and tuning=)r   r   r"   rK   r   r`   rb   ra   rp   ro   r   r   r   zGTesting Txt2ImgXLPipeline with static input shape. Backend is TensorRT.zETesting Txt2ImgPipeline with static input shape. Backend is TensorRT.)r   rn   r   rK   r   r`   rb   ra   rp   ro   r   r   r   r   r   zNTesting Txt2ImgPipeline with dynamic input shape. Backend is PyTorch: compile=z, xformers=r	  )r   rK   r   r9   r:   r`   rb   ra   rp   ro   r   r   zbenchmark_result.csvarR   )modenewline)r   r   rJ   rn   r"   r   r`   rb   ra   rK   ro   rp   rq   rr   rs   rt   r   )
fieldnamesr   ).rK  rH   rJ   rn   r+   environ	packagingr*   rl   parser   r"   r   r  coloredlogsinstallr   rE  rC  r$  r   rK   r`   rb   ra   rp   ro   max_trt_batch_sizer   r4  r   r,   isdirr   r   r  r  r9   r:   r,  opencsv
DictWriterwriteheaderwriterowr[  )rJ  rn   r   r   r   sd_modelr"   r   csv_filecolumn_names
csv_writers              r   mainru    s   D	$K{{m#<<7" =@BJJ89%:==%x)@@ ?BBJJ:;!!KK=0T]]FZ5Z_c_l_l_t !]^^}}[)GMM&,AA !QRR9:$(MMV$;&%&94@L	
2LA&H'H{{m#(C4<<bc#??'+{{jjjj ,, ,,)$7#66"#55F" `a'??+/+E+E'E{{jjjj ,, ,,)$7#66"#55F  
		!h2I&I4<<>ABJJ:; mm'+'A'A#A;;****((((% 3
 
	%}}MM"
 	ML	M 
 	?zI^_c_j_j^klmmm'+'A'A#A;;****((((% 3;;
 

	"tt||';WX']]LL#';;****((((% 32211
  

	"UV$]]LL#';;****((((% 32211
$ 	\]a]v]v\w  xC  DH  DU  DU  CV  VW  X	
 '+'A'A#A!%!:!:**;;****((((% 3
 
&M	$3	;x
& ^^HF
 F#- 
<2 zzQt}}0DDE 3 
<	;s   =ZZ__main__r  )r   T)FT)T)(r>  rm  r+   r~   sysrx   __init__rh  rA   benchmark_helperr   rE  rC  r   r   rA  boolr8   rI   rG  rW   r   r   r   r   r   r   r   r  r  r$  r,  rK  r[  ru  __name__	traceback	Exceptionprint_exceptionexc_infor   r   r   <module>r     s     
 	  
     + ,+-;		 $#++		$&h  X\ <3  \` pt 6fc fs f fei f
@@ @F@@ @F/// / 	/
 !/ / / / / / /j %#'	,,, !	,^AA AH*** * 	*
 !* * * * * *t BBB B !	B
 B B B B B B B Bf EEE E 	E
 !E E E E E E E E Ej TTT T !	T
 T T T T T T TH hhh h !	h
 h h h h h h hV??? !? 	?
 ? ? ? ? ? ?D]@BFJ z3	 
  3!	!!<3<<>23s   E( (#FF