o
    i	                    @  s`  U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZmZmZmZmZmZ d dl m Z  d dl	m!Z! d dl"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z- d dl.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4 d dlm5Z5 d dl6Z6d dl7Z7d dl8m9  m:Z; d d	l<m=Z= d d
l>m?Z? d dl@mAZA d dlBmCZC d dl8mDZDmEZE ddgZFd dlGmHZHmIZImJZJmKZK e+rId dlmLZLmMZMmNZN d dl7mOZOmPZPmQZQ d dlRmSZS d dlTmUZU d dlVmWZW ddlXmYZY ddlZm[Z[ ddl\m]Z] ddl^m_Z_m`Z`maZambZbmcZcmdZd ddlemfZf ddlgmhZhmiZi g dZje,dZkejldd!d"Zmd d#lnmoZo d d$lpmqZq d d%lrmsZs d d&ltmuZu d d'lvmwZw d d(lxmyZy d d)lzm{Z{m|Z|m}Z}m~Z~mZ d d*lmZmZ d d+lmZmZ dd,lmZ dd-lmZ ejd.kZeeZe7jed/Ze,d0Zee6je6jf Ze)e-e7jee7jQf  Zd1d2d3Zd4Zd4Zd4Zd5Zd6Zeed @ d kred7ksJ d8dd;d<Zdd@dAZG dBdC dCe6jZejdDdEG dFdG dGZdȐddOdPZ	IdȐddQdRZejlddSdTZddXdYZdd\d]ZddadbZddedfZddjdkZddndoZddsdtZddwdxZdd{d|ZdddZdd fdddZdddZdאdddZ		dِdddZ					dېdddZdddZdddZdddZdddZdddZe1dZe,ddDdZG dd de*e&eef ZdddZdddǄZddd̄ZdddфZ	dddd؄Zddd܄ZÐddd߄ZĐdddZŐdddZƐdddZǐdddZȐdddZɐdddZʐdddZːdddZ̐dddZ͐dd dZΐdddZd dlZАdddZg ZdeӐd< ddd	ZԐdd
dZej			DddddZeZeZeZڐdddZېdddZed7ddd ZG d!d" d"e(ZejG d#d$ d$ZG d%d& d&ZG d'd( d(eZej֐dd)d*ZG d+d, d,ZG d-d. d.eZejlddd1d2Zejݐdd3d4Zejݐdʐd5d6Zdd7d8Z	dd d=d>ZddCdDZddFdGZddHdIZdJdJdDdKddOdPZdJdQddUdVZdJdQddWdXZdd\d]Zdd_d`Ze-ee6jf ZdaeӐdb< ejlddcddZejlddedfZejlddgdhZejld	didjZejld
dldmZddndoZddpdqZddrdsZddtduZddvdwZdd{d|Z	J	D	J	ddddZdʐddZ G dd dZdddZdddZdddZdddZdddZdddZdddZej֐dddZ		ddddZ
dddZdddZdddZdddZdddZdddZej֐dddZdddZejldddZejldddZejldddZdddZddĐdńZddƐdǄZdʐdȐdɄZdʐdʐd˄ZddΐdτZddАdфZG dҐdӄ dejZd dאd؄Zd!dېd܄Z d!dݐdބZ!	dd"ddZ"d#ddZ#d$ddZ$d$ddZ%d%ddZ&d&ddZ'dd fd'ddZ(dd fd'd dZ)d(ddZ*d)ddZ+ejG d	d
 d
Z,ej֐d*ddZ-d+ddZ.d,ddZ/d-ddZ0d.ddZ1d/ddZ2d0ddZ3d1d!d"Z4d2d$d%Z5d3d'd(Z6d4d*d+Z7d5d.d/Z8d6d4d5Z9d7d6d7Z:	dd8d>d?Z;d9dAdBZ<d:dDdEZ=d;dHdIZ>dʐdJdKZ?d+dLdMZ@dNdOdPdQdRdSdSdTZAdUdV eAB D ZCeDdWZEd<dXdYZFd=dZd[ZGd>d^d_ZHd>d`daZIejld?dcddZJejG dedf dfZKi ZLdgeӐdh< d@dldmZMeC ZNdneӐdo< dAdpdqZOdŐdrdsZPdBdtduZQe,dvZRe,dwZSG dxdy dyeeReSf ZTe0dDdzddDdEdCd~dZUdDddZVG dd dejZWejldEddZXdʐddZYdFddZZdŐddZ[dGddZ\dʐddZ]dHddZ^dZ_dIddZ`dIddZadJddZb		dKdLddZcdMddZddʐddZedNddZf		dOdPddZgdQddZhejdDdEG dd dZie$de#f Zje$ejeigejf ZkG dd dZlel ZmdRddZndSdÐdĄZodS (T      )annotationsN)
Collection	GeneratorIteratorMappingMutableMapping
MutableSet)datetime)StringIO)AnyCallablecastGenericLiteral
NamedTupleOptionalProtocolTYPE_CHECKINGTypeVarUnion)Concatenatedataclass_transform	ParamSpecSelf	TypeAlias	TypeGuard)mock)datasheet_tops)DeviceProperties)dtype_abbrs)
OrderedSet)tree_flattentree_map_only!activation_quantization_aten_passinductor_autotune_lookup_table)free_symbolsfree_unbacked_symbolsIterateExprsShapeEnv)IterableSequence
ValuesView)SymBoolSymFloatSymInt)ELEMENTWISE_TYPE_PROMOTION_KIND)GraphModule)Node   )WorkspaceArgPythonWrapperCodegenGraphLowering)BufferExternKernelIRNodeLayout	OperationReinterpretViewCompiledFxGraph)BaseSchedulerNodeSchedulerBuffer)cudampsxpumtiaTreturnstrc                  C  s>   dd t D } t| dksJ t| dkrd}|S |  }|S )Nc                 S  s   g | ]}t t| r|qS  )getattrtorchis_available.0xrI   rI   K/mnt/sdb/aimis/docanh/lib/python3.10/site-packages/torch/_inductor/utils.py
<listcomp>k   s    z get_gpu_type.<locals>.<listcomp>r2   r   rB   )	GPU_TYPESlenpop)
avail_gpusgpu_typerI   rI   rP   get_gpu_typei   s   rW   )get_interface_for_device)detect_fake_mode)
DeviceType)	EventList)GraphTransformObserver)	ShapeProp)CeilDivCleanDivFloorDivIdentityModularIndexing)make_symbolSymT)bound_sympyValueRangesconfig)ceildivwin32
perf_hints_Tz.cubinz.spv)rB   rD         @      zmust be power of 2nbytesintc                 C  s   | t  d t  @ S )z/Round up to the nearest multiple of ALIGN_BYTESr2   )ALIGN_BYTES)rq   rI   rI   rP   _align   s   rt   v
sympy.Exprboolc                 C  s<   t | tjtjfrttt| jS t | tpt	| t
t
kS )z:v can be statically proven to be a multiple of ALIGN_BYTES)
isinstancesympyAddMaxallmap_is_alignedargsaligngcdrs   )ru   rI   rI   rP   r~      s   r~   c                   @  s&   e Zd ZdZdZdZeddd	Zd
S )r   z<Symbolically round up to the nearest multiple of ALIGN_BYTESr2   Tvaluerv   rG   Optional[sympy.Expr]c                 C  s,   t |ttjfrtt|S t|r|S d S N)rx   rr   ry   Integerrt   r~   )clsr   rI   rI   rP   eval   s
   z
align.evalN)r   rv   rG   r   )__name__
__module____qualname____doc__nargs
is_integerclassmethodr   rI   rI   rI   rP   r      s    r   Tfrozenc                   @  s2   e Zd ZU dZded< ded< ded< ded< d	S )
GraphPartitionMapzP
    Mapping from the partition info (e.g., input/output) to the graph info
    rr   idzlist[Optional[int]]input_index_mappingoutput_index_mapping	list[str]constant_namesNr   r   r   r   __annotations__rI   rI   rI   rP   r      s   
 r      d   fnCallable[[], Any]warmuprepfloatc              
   C  s   |   t j  t jtdt jdd}t jjdd}t jjdd}|  tdD ]	}|	  |   q)|  t j  |
|d }tdt|| }tdt|| }	t|D ]}|   qYdd	 t|	D }d
d	 t|	D }t jjt jjjgdP}
t j  t|	D ],}|	  ||   t jjd |   W d   n1 sw   Y  ||   qt j  t dd	 t||D }W d   n1 sw   Y  t | }td t|
 jddd tdd	 |
 D }|r|tdd |D d 8 }td| |S )R  
    Returns benchmark results by examining torch profiler events.
    This could be more accurate as it doesn't count CPU side overhead.
    However, this also requires manually excluding irrelevant event, e.g.
    vectorized_elementwise_kernel which is used to fill L2 cache,
    various CUDA events, etc, so could also be fragile.
        ArB   dtypedeviceTenable_timing   r2   c                 S     g | ]	}t jjd dqS Tr   rK   rB   EventrN   _rI   rI   rP   rQ          zfp8_bench.<locals>.<listcomp>c                 S  r   r   r   r   rI   rI   rP   rQ      r   
activitiesRunCudaModuleNc                 S  s   g | ]	\}}| |qS rI   )elapsed_time)rN   serI   rI   rP   rQ      r   
raw eventsself_device_time_totalsort_by	row_limitc                 S  s.   g | ]}|j tjkrtd |jdur|qS )zfused_abs_max_\dN)device_typerZ   CUDArematchnamerN   eventrI   rI   rP   rQ      s    c                 s      | ]}|j V  qd S r   device_time_totalr   rI   rI   rP   	<genexpr>	      zfp8_bench.<locals>.<genexpr>     @@profiling results: %s ms)rK   rB   synchronizeemptyrr   float16r   recordrangezero_r   maxprofilerprofileProfilerActivityr   nvtxtensorzipmeanitemlogdebugkey_averagestabler[   events
statistics)r   r   r   cachestart_event	end_eventr   estimate_msn_warmupn_repeatpitimesresfiltered_eventsrI   rI   rP   	fp8_bench   sh   	





r   c                   s  |   t j  t jtdt jdd}t jjdd}t jjdd}|  tdD ]	}|  |   q)|  t j  |	|d }t
dt|| }t
dt|| }	t|D ]}|   qYt j  t jjt jjjgd}
t|	D ]	}|  |   qtt j  W d	   n1 sw   Y  td
 t|
 jddd tdd |
 D }t||	 dkrtdt||	t||	  t fddt|D }|  | }td t|jdd tdd |D d |	 }td| |S )r   r   rB   r   Tr   r   r2   r   Nr   r   r   r   c                 S  s&   g | ]}|j tjkr|jd kr|qS )zContext Sync)r   rZ   r   r   r   rI   rI   rP   rQ   G  s
    z,do_bench_using_profiling.<locals>.<listcomp>r   zYFailed to divide all profiling events into #repeat groups. #CUDA events: %d, #repeats: %sc                   s    g | ]\}}|  d kr|qS r   rI   )rN   r   r   num_event_per_grouprI   rP   rQ   V  s
    zprofiling time breakdown)r   c                 s  r   r   r   r   rI   rI   rP   r   b  r   z+do_bench_using_profiling.<locals>.<genexpr>r   r   )rK   rB   r   r   rr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r[   r   rS   RuntimeError	enumerate_build_treesum)r   r   r   r   r   r   r   r   r   r   r   r   r   actual_eventsr   rI   r   rP   do_bench_using_profiling  sj   





r   c               
   C  s   zddl m}  tjdd | d uotttjdd dW S  ty&   Y dS  t	y@ } zdt
|v s5J W Y d }~dS d }~ww )	Nr   )	roi_alignztorchvision::nmsMetatorchvisionr   Fztorchvision::nms does not exist)torchvision.opsr   rK   _C%_dispatch_has_kernel_for_dispatch_keyhasattrrJ   opsImportErrorr   rH   )r   r   rI   rI   rP   has_torchvision_roi_aligng  s   
r   r   "Union[Optional[torch.device], str]torch.devicec                 C  s`   | d u r
t djS t| trt | } | jdvr.| jd u r.t| j}t j| j|j	 dS | S )Ng        )cpumeta)index)
rK   r   r   rx   rH   typer  rX   Workercurrent_devicer   device_interfacerI   rI   rP   decode_devicew  s   


r  itIterable[sympy.Expr]c                 C  s   t tj| tjjS r   )	functoolsreduceoperatormulry   SOner	  rI   rI   rP   sympy_product     r  seq1Sequence[sympy.Expr]seq2c                 C  s2   t | t |ks
J ttdd t| |D S )Nc                 s  s    | ]	\}}|| V  qd S r   rI   )rN   abrI   rI   rP   r     s    zsympy_dot.<locals>.<genexpr>)rS   ry   expandr   r   )r  r  rI   rI   rP   	sympy_dot  s   r  Iterable[_T]ValuesView[_T]c                 C  s   dd | D   S )Nc                 S  s   i | ]}t ||qS rI   )r   rM   rI   rI   rP   
<dictcomp>      zunique.<locals>.<dictcomp>)valuesr  rI   rI   rP   unique     r   numberUnion[int, sympy.Expr]denomc              	   C  sr   t | tjst |tjrtt| t|S t | tr!t |ts4J |  dt|  d| dt| t| |S )Nz: , )rx   ry   Exprr^   sympifyrr   r  runtime_ceildiv)r"  r$  rI   rI   rP   ri     s    
ri   keyOptional[torch.dtype]c                 C  s   | d u rdS t | dd }i dddddd	d
ddddddd	dddddddddddddddddd d!d"dd#d$d%d&}|d'd( t| D  t| t r`| S d)||  S )*Nz*i8.r   rw   i1
float8e4nvfp8e4nvfloat8e5fp8e5float8e4b15fp8e4b15float8e4b15x4
fp8e4b15x4float8_e4m3fnfloat8_e5m2float8_e8m0fnuu8float4_e2m1fn_x2r   fp16bfloat16bf16float32fp32float64fp64int8i8int16i16int32i32int64i64u16u32u64)uint8uint16uint32uint64c                 S  s   i | ]}||qS rI   rI   )rN   ru   rI   rI   rP   r    s    z_type_of.<locals>.<dictcomp>*)rH   splitupdatelistr  rx   )r)  	dtype_strtysrI   rI   rP   _type_of  sZ   
rV  lst"Iterable[Union[int, torch.SymInt]]list[sympy.Expr]c                 C     dd | D S )z
    Gets the shape and stride of a tensor. For non-symbolic tensors, this is
    trivial. But for symbolic tensors, we need to map from SymIntNode into
    sympy.Expr.
    c                 S  s   g | ]}t |qS rI   )ry   r'  rN   r   rI   rI   rP   rQ     r  z-convert_shape_to_inductor.<locals>.<listcomp>rI   rW  rI   rI   rP   convert_shape_to_inductor  s   r]  r   Union[int, torch.SymInt]c                 C  sB   ddl m} t| tr| S t| tjrt| S |jjjj	| ddS )zL
    Like convert_shape_to_symint, but operates on a single expression.
    r2   VN)hint)
virtualizedr`  rx   rr   ry   r   graphsizevars	shape_envcreate_symintnode)r   r`  rI   rI   rP   convert_to_symint  s   
rg   Iterable[Union[int, sympy.Expr]]list[Union[int, torch.SymInt]]c                 C  rZ  )zz
    Takes a list of shapes from Inductor and converts them into symints (or just
    ints if all shapes are static).
    c                 S     g | ]}t |qS rI   )rg  r[  rI   rI   rP   rQ         z+convert_shape_to_symint.<locals>.<listcomp>rI   r\  rI   rI   rP   convert_shape_to_symint  s   rl  optorch._ops.OpOverloadc                 C  s   t dd | jjD S )z-
    Does this op overload have aliasing
    c                 s  s    | ]}|j d uV  qd S r   )
alias_inforN   r  rI   rI   rP   r         zis_view.<locals>.<genexpr>)any_schema	argumentsrm  rI   rI   rP   is_view  s   rv  c                 C     dS NFrI   )r   rI   rI   rP   <lambda>      ry  user1   is_pointwise_fn'Callable[[torch._ops.OpOverload], bool]c                   s~   | j dksdS t| jtjjs| jtju sdS ttjj| j}|tju s(t	|r4t
 fdd| jD S tjj|jv p> |S )z
    Do all uses of this op have torch.Tag.pointwise or return True for optional `is_pointwise_fn`

    Uses in views ops will follow the views uses
    call_functionFc                 3  s    | ]}t | V  qd S r   )is_pointwise_use)rN   ur|  rI   rP   r     rq  z#is_pointwise_use.<locals>.<genexpr>)rm  rx   targetrK   _ops
OpOverloadr  getitemr   rv  r|   usersTag	pointwisetags)r{  r|  r  rI   r  rP   r    s   

r  r  r   r   	list[Any]kwargsdict[str, Any]&tuple[GraphModule, list[torch.Tensor]]c                   s   t j  g d
 fdd} j| gtt j|||fR  }t| jjdkr5t	| jjd j
d	kr5|f} | t ji  }|fS )Nargtorch.TensorrG   r1   c                   s    |   dt S )Nr  )appendplaceholderrS   )r  g
graph_argsrI   rP   add_tensor_arg  s   
z)gen_gm_and_inputs.<locals>.add_tensor_argr2   r   Tensor)r  r  rG   r1   )rK   fxGraphr~  r"   r  rS   rs  returnsrH   r  outputr0   )r  r   r  r  nodegmrI   r  rP   gen_gm_and_inputs  s   

r  rB   Nonec                 C  s,   | dkrd S t | }| r|  d S d S Nr   )rX   rL   r   r  rI   rI   rP   r      s   r   modelCallable[..., Any]example_inputsSequence[Any]r   c                 C  sT   t | td t }t|D ]
}| | }t | qt }|d us&J || S )Ni9  )r   rK   manual_seedtimeperf_counterr   )r  r  r   r   t0r   resultt1rI   rI   rP   timed(  s   

r  rI   
         ?repeatbaselinec                   sH   t  fddt|D }t | }t|| d | S )Nc                   s   g | ]	}t  qS rI   )r  r   r   r  r  r   rI   rP   rQ   C  r   z%print_performance.<locals>.<listcomp>z.6f)rK   r   r   medianprintr   )r  r  r   r  r  r   timingstookrI   r  rP   print_performance:  s   r  objmethodc                   s$   t | |  t| | fdd dS )zKReplace obj.method() with a new method that returns a precomputed constant.c                     s    S r   rI   rI   r  rI   rP   ry  M  rz  z#precompute_method.<locals>.<lambda>N)rJ   setattr)r  r  rI   r  rP   precompute_methodJ  s   r  methodsr   c                 C  s   |D ]}t | | qdS )zFReplace methods with new methods that returns a precomputed constants.N)r  )r  r  r  rI   rI   rP   precompute_methodsP  s   r  r  r  c                 C  s   t | |kt | |k  S r   )rr   )r  r  rI   rI   rP   cmpV     r  rO   Union[int, Sequence[int]]sizeSequence[int]c                 C  s:   t | tr
| g| S t| dkrt| | d g| S | S )Nr2   r   )rx   rr   rS   r  )rO   r  rI   rI   rP   pad_listlikeZ  s
   

r  tuple[_T, ...]list[_T]c                 C  s&   t | dkrg S d	dd}t| |dS )
Nr   elemrl   rG   rH   c                 S  s0   t | tr| S ddlm} t | |sJ |  S )Nr2   )r@   )rx   rH   	schedulerr@   get_name)r  r@   rI   rI   rP   	sort_funcg  s
   
ztuple_sorted.<locals>.sort_funcr)  )r  rl   rG   rH   )rS   sorted)rO   r  rI   rI   rP   tuple_sortedc  s   
	r  PRV)	covariantc                   @  s$   e Zd ZedddZdddZdS )CachedMethodr   r   rG   r  c                 C     d S r   rI   )r   rI   rI   rP   clear_cachex     zCachedMethod.clear_cacher   P.argsr  P.kwargsr  c                 O  r  r   rI   selfr   r  rI   rI   rP   __call__{  rz  zCachedMethod.__call__N)r   r   rG   r  )r   r  r  r  rG   r  )r   r   r   staticmethodr  r  rI   rI   rI   rP   r  w  s    r  !Callable[Concatenate[Any, P], RV]CachedMethod[P, RV]c                   sl   | j }d| d d| i}td| d  d  d | t| || d }d fdd}||_|S )N___cacher   z        def zC_cache_on_self(self):
            try:
                return self.zy
            except AttributeError:
                pass
            rv = fn(self)
            object.__setattr__(self, "z%", rv)
            return rv
        _cache_on_selfr  r   rG   r  c                   s   t |  rt|   d S d S r   )r   delattrr  r  rI   rP   r    s   
z"cache_on_self.<locals>.clear_cache)r  r   rG   r  )r   execlstripr  wrapsr  )r   r   ctxwrapperr  rI   r  rP   cache_on_self  s$   	r  node_schedule0Union[Sequence[BaseSchedulerNode], ExternKernel]OrderedSet[Node]c                 C  sJ   ddl m} t| trttjdd | D t S t| |j	r"| j
S t S )Nr2   irc                 S  s$   g | ]}t |d r|jr|jjqS )r  )r   r  originsrN   r  rI   rI   rP   rQ     s    z%aggregate_origins.<locals>.<listcomp>) r  rx   rS  r  r  r  or_r    r9   r  )r  r  rI   rI   rP   aggregate_origins  s   
	r  Sequence[BaseSchedulerNode]descriptive_names8Literal[True, 'torch', 'original_aten', 'inductor_node']c                 C  s   t | }|dkrdd |D }tt|}nH|dkrPg }|D ]*}|jdkrHd|jv rH|jd d }t|d tr@||d  q||d j qtt|}n|d	kr\d
d |D }nt	|}d
dg| S )Noriginal_atenc                 S  s<   g | ]}|j d krd|jv r|jd dur|jd jjqS )r~  r  N)rm  r  _overloadpacketr   rN   originrI   rI   rP   rQ     s    

z)get_fused_kernel_name.<locals>.<listcomp>rK   r~  source_fn_stackr   r2   inductor_nodec                 S  s   g | ]
}|j d kr|jqS r~  )rm  r   r  rI   rI   rP   rQ     s    r   fused)r  r  r    rm  r  rx   rH   r  r   NotImplementedErrorjoin)r  r  all_originssourcesr  	source_fnrI   rI   rP   get_fused_kernel_name  s.   r  r  r5   tuple[str, str]c                   s~  t | }dd |D }tt}tt}dt|rKtdd |D }t|dkrK|d jtdsAd	d
 tj	D }|_
|jfddd |D ]3}d|jv rk|jd durkt|jd j}	||	 |j d|jv r|jd d j}	||	 |j qMdurdnd}
|j d|
 dd|  dd|  d}|j dg}t| D ]\}}||j d| ddt|  qdurddlm  ||j d t }g }t|  jsddlm} d9 fd#d$}d:d'd(d;fd+d,}| D ]}t|d-r|jdu rqt|jd.r`|jjdur`|jjD ];}|j|v r.q$||j |j|j}|du rBq$|||j\}}||j d/| d0|| d1| d q$t|jd2r|jj dur|jj D ] }|j|j}|du rqr|||j\}}|d3|  qrq|D ]}||j d|j!d4d5  q||j d6d7|  |d8|fS )<aH  
    Retrieves metadata information for a kernel.
    Args:
        node_schedule (Union[Sequence[BaseSchedulerNode], ExternKernel]):
            Either a sequence of BaseSchedulerNode objects or an ExternKernel instance.
        wrapper (PythonWrapperCodegen):
            An instance of PythonWrapperCodegen, used to define the code comment format.
    Returns:
        tuple[str, str]:
            A tuple containing two strings:
                - The first string represents the kernel's metadata.
                - The second string represent the kernel's detailed metadata.
    c                 S  s   g | ]	}|j d kr|qS r  ru  r  rI   rI   rP   rQ     r   z'get_kernel_metadata.<locals>.<listcomp>Nc                 s  r   r   )rc  )rN   nrI   rI   rP   r     r   z&get_kernel_metadata.<locals>.<genexpr>r2   r   )_inductor_kernel_metadata_node_to_idx_mapc                 S     i | ]\}}||qS rI   rI   )rN   idxr  rI   rI   rP   r    r  z'get_kernel_metadata.<locals>.<dictcomp>c                   s
    j |  S r   )r  r  )single_graphrI   rP   ry    s   
 z%get_kernel_metadata.<locals>.<lambda>r  r  	from_nodezTopologically SortedUnsorted z Source Nodes: [r%  z], Original ATen: []z" Source node to ATen node mapping:z   z => r  z Graph fragment:r_  buffer2Union[ir.TensorBox, ir.Buffer, ir.TorchBindObject]rw_namerH   rG   tuple[str, ir.Layout | None]c                   sp   t |  jrt | j jr| jjj}n| j}|d u r|}n|j}z	|  }W ||fS  ty7   d }Y ||fS w r   )rx   	TensorBoxdata
StorageBoxorigin_noder   
get_layoutr  )r  r  r  r   layoutr  rI   rP   get_buffer_info  s   
z,get_kernel_metadata.<locals>.get_buffer_infoshapeIterable[int]c                 S  s   dd dd | D  dS )N[r%  c                 S  rj  rI   )rH   rM   rI   rI   rP   rQ   0  rk  z@get_kernel_metadata.<locals>.stringify_shape.<locals>.<listcomp>r
  )r  )r  rI   rI   rP   stringify_shape/  s   z,get_kernel_metadata.<locals>.stringify_shaper  ir.Layout | Nonec                   sJ   | d u rdS  | j  } | j }| j }dt| j  | | | dS )Nr  ")r  strider   r   r   )r  shape_annotationstride_annotationdevice_annotation)r  rI   rP   stringfy_layout2  s   z,get_kernel_metadata.<locals>.stringfy_layoutread_writesreadsz   %z
 : Tensor z = PlaceHolder[target=writes%T)include_tensor_metadataz
   return ,
)r  r  r  rH   rG   r  )r  r  rG   rH   )r  r  rG   rH   )"r  collectionsdefaultdictrS  rS   r    rc  r   r   nodesr  sortr  rH   r  r  r   commentr  keysr  itemsr  r  rx   r9   rb  r`  r!  r"  addtry_get_bufferr#  format_node)r  r  r  inductor_nodesfrom_node_dictoriginal_aten_dictunique_graphsnode_to_idx_mapr  r)  sort_strmetadatadetailed_metadataoriginal_noder*  	all_reads
all_writesr`  r  r   r  rr  
input_namer  woutput_namer   rI   )r  r  r  rP   get_kernel_metadata  s   










rA  initial_queueIterable[torch.fx.Node]skip_filterOptional[Callable[[Any], bool]]OrderedSet[torch.fx.Node]c                 C  sZ   t | } t| }| r+|  }|jD ]}|r||rq||vr(|| | | q| s
|S )zJReturns the set of nodes whose values depend on those within initial_queue)rS  r    rT   r  r/  r  )rB  rD  dominated_setr  userrI   rI   rP   dominated_nodesf  s   


	rI  Sequence[IRNode]dict[str, IRNode]c                   sp   ddl m  d fddt|\}}fd	d
|D }t| \}}fdd
|D }ttjg ||R  S )Nr2   r  r  r:   rG   rw   c                   sT   t |  jr| jS t |  jr| jS t |  jo)t |  j j j jf S r   )	rx   r  r  r  r:   ComputedBufferInputsKernelInputBufferTemplateBufferr  r  is_unrealized_noderI   rP   rQ    s   

z*gather_origins.<locals>.is_unrealized_nodec                      g | ]	} |r|j qS rI   r  rN   valrQ  rI   rP   rQ     r   z"gather_origins.<locals>.<listcomp>c                   rR  rI   rS  rT  rV  rI   rP   rQ     r   )r  r:   rG   rw   )r  r  r!   r    	itertoolschain)r   r  kwargs_flattenr   kwargs_originsargs_flattenargs_originsrI   rP  rP   gather_originsz  s   r]  exprc                   s@   ddd d fdd	d fd
ddfdd| S )z
    Normal sympy str is very slow, this is a lot faster.  The result are
    somewhat worse, as it doesn't do as much simplification.  So don't
    use this for final codegen.
    r^  rv   rG   rw   c                 S  s(   t | tjot| jdko| jd dkS )N   r   r   )rx   ry   MulrS   r   r^  rI   rI   rP   is_neg_lead  s   &zsympy_str.<locals>.is_neg_leadrH   c                   sj   t | tjr1t| jdkr( | jd r(| jd  d| jd jd  S dt| jS | S )Nr_  r2   r   z - z + )rx   ry   rz   rS   r   r  r}   ra  )rb  sympy_str_mulrI   rP   sympy_str_add  s
   (z sympy_str.<locals>.sympy_str_addc                   sB   t | tjr | rd| jd  S dt| jS | S )N-r2   z * )rx   ry   r`  r   r  r}   ra  )rb  sympy_str_atomrI   rP   rc    s
   z sympy_str.<locals>.sympy_str_mulc                   sp   t | tjr	| jS t | tjtjfrd |  dS t | tttt	fr4| j
j ddtt| j dS t| S )N()r%  )rx   ry   Symbolr   rz   r`  rb   r_   r`   ra   funcr   r  r}   	sympy_strr   rH   ra  )rd  rI   rP   rf    s   "z!sympy_str.<locals>.sympy_str_atomN)r^  rv   rG   rw   r^  rv   rG   rH   rI   ra  rI   )rb  rd  rf  rc  rP   rk    s
   

rk  r  ValueRanges[Any]c                 C  s>   ddl m} tjrt|jdd  }r|jdkrt| S t	 S )Nr2   r_  current_node
index_expr)
rb  r`  rh   compute_all_boundsrJ   interpreterr  re   rf   unknown)r  r`  fx_noderI   rI   rP   get_bounds_index_expr  s   
rt  prefixc                 C  s   | d dkS )Nr   r=  rI   )ru  rI   rI   rP   prefix_is_reduction     rv  rd   r  sympy.Symbolc                 C  s   | t jksJ t| |dddS )9
    Used to generate an integer-nonnegative symbol.
    Tintegernonnegative)rd   SIZErc   )ru  r  rI   rI   rP   sympy_index_symbol_with_prefix  s   r~  checkc                 C  s   | st jot jS r   )rh   debug_index_assertsassert_indirect_indexing)r  rI   rI   rP   generate_assert     r  r   c                 C  s    | d dksJ t j| dddS )ry  r   r   Trz  )ry   ri  r   rI   rI   rP   sympy_index_symbol  s   r  replacementsdict[sympy.Expr, Any]c                   s,   ddd t |  fd	d
| D S )z
    When the passed replacement symbol v is a string, it is converted to a symbol with name v that
    have the same replaced expression integer and nonnegative properties.
    replacedrv   replacementUnion[sympy.Expr, str]rG   rx  c                 S  s2   t | tjsJ t |trtj|| j| jdS |S )Nrz  )rx   ry   r&  rH   ri  r   is_nonnegative)r  r  rI   rI   rP   	to_symbol  s   
zsympy_subs.<locals>.to_symbolc                   s   i | ]
\}}| ||qS rI   rI   rN   kru   r  rI   rP   r  	      zsympy_subs.<locals>.<dictcomp>N)r  rv   r  r  rG   rx  )ry   r'  xreplacer.  )r^  r  rI   r  rP   
sympy_subs  s   

r  ,TypeGuard[Union[torch.SymInt, torch.Tensor]]c                 C  s:   t | tjpt | tjotdd t|  |  D S )Nc                 s      | ]}t |V  qd S r   is_symbolicrM   rI   rI   rP   r         zis_symbolic.<locals>.<genexpr>)	rx   rK   r.   r  rr  rW  rX  r  r  )r  rI   rI   rP   r    s    r  c                  G     t dd | D S )Nc                 s  r  r   r  rp  rI   rI   rP   r     r  z"any_is_symbolic.<locals>.<genexpr>rr  )r   rI   rI   rP   any_is_symbolic  r!  r  r  torch.fx.GraphModuleOptional[torch.fx.Node]c                 C  s   ddl m} tg d}t r|d | jjD ]9}t|j	|v r&|  S tj
jjs@t|j	tjjr@tjjj|j	jv r@|  S |jd }d urR||rR|  S qd S )Nr   )r&   )z,aten._fused_moving_avg_obs_fq_helper.defaultz7aten._fused_moving_avg_obs_fq_helper_functional.defaultzfbgemm.dense_to_jagged.defaultz%fbgemm.jagged_to_padded_dense.defaultrun_and_save_rng_staterun_with_rng_statezaten._local_scalar_densezaten._assert_scalar)zaten._unsafe_index_put.defaultz0aten._unsafe_masked_index_put_accumulate.defaultzaten.index_put.defaultzaten.index_put_.defaultzaten.scatter.srczaten.scatter.reducezaten.scatter.value_reducezaten.scatter_add_zaten.scatter_add.defaultzaten.scatter_reduce.twozaten.scatter_reduce_.twozaten.scatter_reduce.two_outrU  )%torch.fx.experimental.symbolic_shapesr&   r    rK   $are_deterministic_algorithms_enabledrR  rc  r*  rH   r  	_inductorrh   graph_partitionrx   r  r  r   r  cudagraph_unsafer  r  get)r  r&   forbidden_setr  rU  rI   rI   rP   %get_first_incompatible_cudagraph_node  s*   r  c                 C  s&   t tt| jj}|jdksJ |S )z$Get the output node from an FX graphr  )nextiterreversedrc  r*  rm  )r  	last_noderI   rI   rP   output_nodeS  s   r  OrderedSet[torch.device]c                 C  s\   | j jdd}tdd |D }t| jd }t|tr|n|f}tdd |D }||B S )Nr  ru  c                 s  s0    | ]}t |jd tjr|jd  jV  qdS rU  N)rx   r  r  rK   r  r   r  rI   rI   rP   r   \  s    

z"get_all_devices.<locals>.<genexpr>r   c                 s  s>    | ]}t |tjjrt |jd tjr|jd  jV  qdS r  )rx   rK   r  r1   r  r  r  r   )rN   r  rI   rI   rP   r   d  s    

)rc  
find_nodesr    r  r   rx   tuple)r  placeholder_nodesinput_devicesout_argout_argsout_devicesrI   rI   rP   get_all_devicesZ  s   r  c                  C  s   t tj D ]B} | dsqtj|  }|j D ]+}|drDt||}t|tj	j
jjrD|jD ]}t|tj	j
jjrC|jjj  q1qtj| = qdtjv r_tjd }t|jjj`|jj`t  d S )Nz&torch._inductor.runtime.compile_tasks.triton_ztriton.runtime.driver)rS  sysmodulesr-  
startswith__dict__rJ   rx   rK   r  runtimetriton_heuristicsCachingAutotunercompile_resultsTritonCompileResultkernelrunmod__del__r  driveractiveutilsinstancegccollect)module_namem	attr_namer  r  r  rI   rI   rP   unload_xpu_triton_pydsp  s.   








r  _registered_cachesc                 C  s0   t | dr
t| jst|  dt|  | S )zh
    Use this decorator to register any caches that should be cache_clear'd
    with fresh_cache().
    cache_clearz# does not have a cache_clear method)r   callabler  AttributeErrorr  r  r  rI   rI   rP   clear_on_fresh_cache  s   
r  c                  C  s   t D ]} |   qdS )z&
    Clear all registered caches.
    N)r  r  r  rI   rI   rP   clear_caches  s   
r  cache_entriesOptional[dict[str, Any]]dirOptional[str]deleteIterator[None]c              	   #  sh   t   ddlm} |tj|d zztjtj	d iZ t
d  |tj dtjtj	di1 dV  t| trbt| dksKJ d	tjrbt}| fd
d|D  W d   n1 slw   Y  W d   n1 s{w   Y  |rt rtj rt  tj t  fddd W n ty   t
d   w W t   dS t   w )z
    Contextmanager that provides a clean tmp cachedir for pt2 caches.

    Optionally, pass a dict as 'cache_entries' to get a list of filenames and sizes
    generated with this cache instance.
    r   )normalize_path_separator)r  TORCHINDUCTOR_CACHE_DIRzUsing inductor cache dir %stritonTRITON_CACHE_DIRNz!expected empty cache_entries dictc              	     s,   i | ]}d |vr|t jt j |qS )z.lock)ospathgetsizer  )rN   f)triton_cache_dirrI   rP   r    s
    zfresh_cache.<locals>.<dictcomp>c                   s   t jd |dS )Nz*Failed to remove temporary cache dir at %s)exc_info)r   warning)rj  r  r  )inductor_cache_dirrI   rP   ry    s
    zfresh_cache.<locals>.<lambda>)ignore_errorsonerrorz(on error, temporary cache dir kept at %s)r  torch._inductor.cpp_builderr  tempfilemkdtempr   patchdictr  environr   r   r  r  rx   rS   existslistdirrR  
is_windowsrK   rD   rL   r  shutilrmtree	Exceptionr  )r  r  r  r  filesrI   )r  r  rP   fresh_cache  sT   




r  seq	list[int]c                 C  s(   | j }tt| }ttt||ddS )NT)r)  reverse)__getitem__r   rS   rS  r  r  )r  gettera_rrI   rI   rP   argsort  s   r  re  r(   .Sequence[Union[int, torch.SymInt, sympy.Expr]]c                   sD   d fdd}dd	 t |D }t|t|d
}dd	 |D }|S )Nr  tuple[int, sympy.Expr]r  rG   rr   c                   sZ   | \}}|\}}d
 fdd}|||k rdS |||krdS ||k r%dS ||kr+dS d	S )Nr^  %Union[bool, torch.SymInt, sympy.Expr]rG   rw   c                   s   t | tr| S  j| ddS )NT)size_oblivious)rx   rw   evaluate_exprra  re  rI   rP   evaluate  s   
z*argsort_sym.<locals>.cmp.<locals>.evaluater   r2   r   )r^  r   rG   rw   rI   )r  r  a_idxa_valb_idxb_valr  r  rI   rP   r    s   zargsort_sym.<locals>.cmpc                 S  s,   g | ]\}}|t |tjr|jjn|fqS rI   )rx   rK   r.   r  r^  )rN   r  r   rI   rI   rP   rQ   	  s    zargsort_sym.<locals>.<listcomp>r  c                 S  s   g | ]\}}|qS rI   rI   )rN   r  r   rI   rI   rP   rQ     rk  )r  r  r  r  rG   rr   )r   r  r  
cmp_to_key)re  r  r  exprsr  rI   r  rP   argsort_sym  s   r  r   torch.dtypec                 C  s    | t jkrdS t jd| d S )Nrp   rI   r   )rK   rO  r   element_sizer  rI   rI   rP   get_dtype_size  s   
r  c                   @  s   e Zd ZU ded< dS )LineContextr   contextNr   r   r   r   rI   rI   rI   rP   r    s   
 r  c                   @     e Zd ZU ded< ded< dS )ValueWithLineMaprH   r   zlist[tuple[int, LineContext]]line_mapNr  rI   rI   rI   rP   r       
 r  c                   @  s   e Zd ZdZdDdEddZejdFddZdGddZdHddZ	dHddZ
dIddZdJddZdHddZdIddZdKd d!ZdLd$d%ZdMdNd)d*ZdMdOd+d,ZdMdOd-d.Z	/dPdQd3d4ZdRd7d8ZdHd9d:ZdSd=d>ZdTdAdBZdCS )UIndentedBuffer   r   initial_indentrr   rG   r  c                 C  s   g | _ || _d S r   )_lines_indent)r  r  rI   rI   rP   __init__(     
zIndentedBuffer.__init__tabwidthr  c                 c  s*    | j }z|| _ d V  W || _ d S || _ w r   )r  )r  r  prevrI   rI   rP   set_tabwidth,  s   zIndentedBuffer.set_tabwidthr  c                 C  s   t  }d}g }| jD ]:}t|tr| }|d u rq
nt|tr(|||jf q
|}t|ts1J || |d |d|	d 7 }q
t
| |S )Nr2   r'  )r
   r  rx   DeferredLineBaser  r  r  rH   writecountr  getvalue)r  bufr   linemaplilinerI   rI   rP   getvaluewithlinemap5  s$   




z"IndentedBuffer.getvaluewithlinemaprH   c                 C  s
   |   jS r   )r)  r   r  rI   rI   rP   r$  I     
zIndentedBuffer.getvaluec                 C  s   t  }| jD ]8}t|tr| }|d u rqnt|trq|}t|ts%J |dr4||d d  q|| |d q| S )N\r   r'  )	r
   r  rx   r!  r  rH   endswithr"  r$  )r  r%  r'  r(  rI   rI   rP   getrawvalueL  s    




zIndentedBuffer.getrawvaluec                 C  s   | j   d S r   )r  clearr  rI   rI   rP   r.  `     zIndentedBuffer.clearrw   c                 C  
   t | jS r   )rw   r  r  rI   rI   rP   __bool__c  r*  zIndentedBuffer.__bool__c                 C  s   d| j | j  S )Nr	  )r  r  r  rI   rI   rP   ru  f  r  zIndentedBuffer.prefixc                 C  s   |  d d S )Nr'  	writeliner  rI   rI   rP   newlinei  r/  zIndentedBuffer.newliner(  )Union[LineContext, DeferredLineBase, str]c                 C  sr   t |tr| j| d S t |tr| j||   d S | r1| j|   |  d S | jd d S Nr  )rx   r  r  r  r!  with_prefixru  stripr  r(  rI   rI   rP   r3  l  s   

zIndentedBuffer.writelinelines3Sequence[Union[LineContext, DeferredLineBase, str]]c                 C  s   |D ]}|  | qd S r   r2  )r  r:  r(  rI   rI   rP   
writelinesv  s   zIndentedBuffer.writelinesr2   offset'contextlib.AbstractContextManager[None]c                   s   t jd fdd}| S )NrG   r  c                	   3  s<     j  7  _ zd V  W  j  8  _ d S  j  8  _ w r   r  rI   r=  r  rI   rP   r  }  
   "z"IndentedBuffer.indent.<locals>.ctxrG   r  )
contextlibcontextmanager)r  r=  r  rI   r@  rP   indent|  s   zIndentedBuffer.indentc                 C  s   |  j |7  _ d S r   r?  r  r=  rI   rI   rP   	do_indent  r!  zIndentedBuffer.do_indentc                 C  s   |  j |8  _ d S r   r?  rF  rI   rI   rP   do_unindent  r!  zIndentedBuffer.do_unindentF
other_codeUnion[IndentedBuffer, str]r8  c                 C  s   t |trJtd}|jD ]}t |ts"|r"t|t|t|  }qt	|r*d}|jD ]}t |tr;| j
| q-t| |t|d   q-d S t|}|rU| }|sYd S | }|dD ]}| | qbd S )Ninfr   r'  )rx   r  r   r  r  minrS   r  mathisinfr  r3  rr   textwrapdedentrstriprQ  )r  rI  r8  rP  r(  r   rI   rI   rP   splice  s,   





zIndentedBuffer.splicerj  Callable[[Any], Any]c                   s&   t | jd} fdd| jD |_|S )Nr  c                      g | ]} |qS rI   rI   )rN   r(  rj  rI   rP   rQ     rk  z&IndentedBuffer.map.<locals>.<listcomp>)r  r  r  )r  rj  r   rI   rV  rP   r}     s   zIndentedBuffer.mapc                 C  s   t |  d|   dS )Nrg  rh  )r  r$  r  rI   rI   rP   __repr__  r  zIndentedBuffer.__repr__otherr   c                 C  s8   | j |j ksJ t| j d}|| j ||j |S )NrT  )r  r  r<  r  )r  rX  r   rI   rI   rP   __add__  s
   zIndentedBuffer.__add__new_line)Union[DeferredLineBase, LineContext, str]c                 C  s
   || j v S r   )r  )r  rZ  rI   rI   rP   contains  r*  zIndentedBuffer.containsNr   )r  rr   rG   r  )r  rr   rG   r  )rG   r  rG   rH   rG   r  rG   rw   )r(  r5  rG   r  )r:  r;  rG   r  r   )r=  rr   rG   r>  )r=  rr   rG   r  )F)rI  rJ  r8  rw   rG   r  )rj  rS  rG   r  )rX  r   rG   r  )rZ  r[  rG   rw   )r   r   r   r  r  rC  rD  r   r)  r$  r-  r.  r1  ru  r4  r3  r<  rE  rG  rH  rR  r}   rW  rY  r\  rI   rI   rI   rP   r  %  s.    












r  c                      s(   e Zd Zd
 fddZddd	Z  ZS )FakeIndentedBufferrG   r  c                   s   t    d S r   )superr  r  	__class__rI   rP   r    r/  zFakeIndentedBuffer.__init__r   rH   r   c                 C  s$   |dkr
t | |S td| d)Nrc  zTried to call self.z on FakeIndentedBuffer. This bufferis currently used on TritonTemplateKernel to prevent actualwrites to the body without explicitly specifying the body with`TritonTemplateKernel.set_subgraph_body(name)`)object__getattribute__r   )r  r   rI   rI   rP   re    s
   
z#FakeIndentedBuffer.__getattribute__r^  )r   rH   rG   r   )r   r   r   r  re  __classcell__rI   rI   rb  rP   r`    s    r`  c               	   c  s<    t jt j} }zd V  W | |t _t _d S | |t _t _w r   )r  stdoutstderr)initial_stdoutinitial_stderrrI   rI   rP   restore_stdout_stderr  rA  rk  c                   @  s`   e Zd ZdZdddZddd	ZdddZd ddZd!ddZd"ddZ	d#ddZ
d$ddZdS )%r!  z.A line that can be 'unwritten' at a later timer(  rH   c                 C  s   |  sd}|| _d S r6  )r8  r(  r9  rI   rI   rP   r    s   
zDeferredLineBase.__init__rG   Union[str, None]c                 C     t )zJReturns either self.line or None to indicate the line has been 'unwritten'r  r  rI   rI   rP   r       zDeferredLineBase.__call__r   c                 C  rm  )z3Returns a new deferred line with the same conditionrn  r9  rI   rI   rP   	_new_line  ro  zDeferredLineBase._new_lineru  c                 C  s   |  | | j S r   rp  r(  )r  ru  rI   rI   rP   r7    r  zDeferredLineBase.with_prefixc                 C  s   |  | j S r   )rp  r(  r  r  rI   rI   rP   r    r  zDeferredLineBase.lstripr  Union[int, slice]c                 C  s   |  | j| S r   rq  )r  r  rI   rI   rP   r    r  zDeferredLineBase.__getitem__rw   c                 C  r0  r   )rw   r(  r  rI   rI   rP   r1    r*  zDeferredLineBase.__bool__rr   c                 C  r0  r   )rS   r(  r  rI   rI   rP   __len__  r*  zDeferredLineBase.__len__N)r(  rH   )rG   rl  )r(  rH   rG   r   )ru  rH   rG   r   )rG   r   )r  rr  rG   r   r_  rG   rr   )r   r   r   r   r  r  rp  r7  r  r  r1  rs  rI   rI   rI   rP   r!    s    






r!  c                      s6   e Zd ZdZd fddZdd
dZdddZ  ZS )DelayReplaceLinez6At end of codegen call `line.replace(key, value_fn())`r)  rH   value_fnCallable[[], str]r(  c                   s   t  | || _|| _d S r   )ra  r  r)  rv  )r  r)  rv  r(  rb  rI   rP   r    s   
zDelayReplaceLine.__init__rG   c                 C  s   | j | j|  S r   )r(  replacer)  rv  r  rI   rI   rP   r    r  zDelayReplaceLine.__call__c                 C  s   t | j| j|S r   )ru  r)  rv  r9  rI   rI   rP   rp    r  zDelayReplaceLine._new_line)r)  rH   rv  rw  r(  rH   r]  )r(  rH   rG   ru  )r   r   r   r   r  r  rp  rf  rI   rI   rb  rP   ru    s
    
ru  index_or_deviceUnion[int, torch.device]c                 C  s   t | tjr	| }ntt | }t|}tjjr3|jd us J |jdk s*|jdkr1t	
d dS dS |jdkr:dnd}|j}||k rOt	j
d	||d
d dS dS )N	   r  z6GPU arch does not support max_autotune_gemm mode usageFTrD   rm   D   z,Not enough SMs to use max_autotune_gemm mode)min_sms	avail_sms)extra)rx   rK   r   rW   r   createversionhipmajorr   r  r  multi_processor_count)ry  r   propr}  r~  rI   rI   rP   
is_big_gpu  s&   

r  c                   C  s$   t j rt j jS t jdjS )NrB   )rK   rD   rL   get_device_propertiesgpu_subslice_countrB   r  rI   rI   rI   rP   get_max_num_sms  s   
r  c                  C  s*   t j sdS t jt j } | jdkS )zEReturns true if the device is a NVIDIA B200, otherwise returns false.Fr  )rK   rB   rL   r  r  r  )device_propertiesrI   rI   rP   
using_b200%  s   

r  c                  C  s2   t j rt S t j } t | dur|  S d S )zFHandle experimental carveout if set otherwise return hardware SM countNr   )rK   rD   rL   r  r   _get_sm_carveout_experimental)carveoutrI   rI   rP   get_num_sms/  s   

r  num_tma_descriptorsnum_programsOptional[int]r3   c                 C  sH   ddl m}m} |du rt }|d}||  t }||||| dS )zKBuilds and returns a WorkspaceArg for the device side TMA workspace buffer.r2   )r3   WorkspaceZeroModeNF)r#  	zero_moder   
outer_name)codegen.commonr3   r  r  	from_boolTMA_DESCRIPTOR_SIZEunique_name)r  r   r  r3   r  r  r  rI   rI   rP   get_tma_workspace_arg8  s   
r  r  r;   allowed_layout_dtypeslist[torch.dtype]c                 C  s:   | j |vrtd| j | t| jjo| j |v ot| jS )NzDNot using template since dtype %s is not in allowed layout dtypes %s)r   r   r   is_gpur   r  r  )r  r  rI   rI   rP   _use_template_for_gpuL  s   
r  backendc                 C  "   |   dd tj  dD v S )Nc                 S     g | ]}|  qS rI   r8  rM   rI   rI   rP   rQ   ]      z)_use_autotune_backend.<locals>.<listcomp>r&  )upperrh   max_autotune_gemm_backendsrQ  r  rI   rI   rP   _use_autotune_backend\     r  c                 C  r  )Nc                 S  r  rI   r  rM   rI   rI   rP   rQ   c  r  z._use_conv_autotune_backend.<locals>.<listcomp>r&  )r  rh   max_autotune_conv_backendsrQ  r  rI   rI   rP   _use_conv_autotune_backendb  r  r  F)enable_int32enable_float8check_max_autotuner  r  r  c                C  s   ddl m}m} tjtjtjg}|rtjtjtjtjg}|r'|tj	tj
g t| jjo1t| |p<| jjdko<| j|v oPtjpEtjpE| oPtdoP|| j|jS )Nr2   )BackendFeaturehas_backend_featurer   TRITON)r  r  r  rK   r   r;  r=  rE  extendr5  r6  r  r   r  r  r   rh   max_autotunemax_autotune_gemmr  TRITON_TEMPLATES)r  r  r  r  r  r  layout_dtypesrI   rI   rP   use_triton_templateh  s"   	
r  )
add_guardsmatricesr:   r  c                   sf   ddl m} ddlm  d fd	d
d fddd fdd| o2tfdd|D S )u  
    Return True iff *all* supplied tensors satisfy the CUDA-12.9 TMA constraints
    that Triton relies on today.
    * https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html

    A tensor is accepted when:
      * 2 ≤ rank ≤ 5
      * dtype ∈ {FP16, BF16, FP8-E4M3FN}
      * Every logical size ≥ 2
      * Base pointer 16-byte aligned
      * All "outer" dims have 16-byte aligned strides
      * The “inner” dim has stride 1 (contiguous)
      * For FP8 tensors, inner dim ≥ 32
    r   )has_triton_tma_devicer2   r_  
expr_bytesr#  rG   rw   c                   s    j j| tS r   )rc  rd  statically_known_multiple_ofTMA_ALIGNMENT)r  r_  rI   rP   _aligned  r  zcan_use_tma.<locals>._alignedrO   r:   c                   s\  |   }|  }t|}|  }|j}|dk s|dkrdS |tjtjtjfvr)dS | 	  j
jv r3dS rD j
j|} j
j|}n fdd|D } fdd|D }t fdd|D rcdS  fd	dt|D }t|d
krvdS |d }	t|D ]\}
}|
|	krq~|| s dS q~||	 }|| sdS |tjkr j
j|dsdS dS )Nr_  r   Fc                      g | ]	} j j|qS rI   rc  rd  symbolic_hintrN   r   r_  rI   rP   rQ     r   zCcan_use_tma.<locals>._is_tma_compatible_default.<locals>.<listcomp>c                   r  rI   r  rN   str_  rI   rP   rQ     r   c                 3  s"    | ]} j j|d  V  qdS r_  N)rc  rd  statically_known_geqr  r_  rI   rP   r     s     zBcan_use_tma.<locals>._is_tma_compatible_default.<locals>.<genexpr>c                   $   g | ]\}} j j|d r|qS r   rc  rd  statically_known_equalsrN   r   r  r_  rI   rP   rQ         r2   r       T)get_size
get_striderS   	get_dtypeitemsizerK   r   r;  r5  r  rc  unaligned_buffersrd  guard_int_seqrr  r   r  )rO   sizesstridesrankr   r  sizes_i	strides_iinner	inner_idxr   r  	inner_dim)r`  r  r  rI   rP   _is_tma_compatible_default  sL   
z/can_use_tma.<locals>._is_tma_compatible_defaultc                   sD   |   } fdd|D } fddt|D }t|dkr dS dS )Nc                   r  rI   r  r  r_  rI   rP   rQ     r   z?can_use_tma.<locals>._is_tma_compatible_xpu.<locals>.<listcomp>c                   r  r   r  r  r_  rI   rP   rQ     r  r2   FT)r  r   rS   )rO   r  r  r  r_  rI   rP   _is_tma_compatible_xpu  s   
z+can_use_tma.<locals>._is_tma_compatible_xpuc                 3  s:    | ]}|   d u sjdkr |n|V  qd S )NrD   )
get_devicer  rN   r  )r  r  m_devicerI   rP   r     s    
zcan_use_tma.<locals>.<genexpr>N)r  r#  rG   rw   rO   r:   rG   rw   )torch.utils._tritonr  rb  r`  r|   )r  r  r  rI   )r`  r  r  r  r  r  rP   can_use_tma  s   <r  c                 G  s(   t dd |D ot|d| iotjjS )Nc                 s  s     | ]}t | d kV  qdS r  )rS   r  r  rI   rI   rP   r         z*use_triton_tma_template.<locals>.<genexpr>r  )r|   r  rh   r  enable_persistent_tma_matmul)r  r  rI   rI   rP   use_triton_tma_template  s
   r  r  r  r  c           	      C  s   ddl m} |jjj|| | dd}|dks|tjjk rdS ddlm	} t
jjr+dS t
jt
jt
jg}t| |oAtjp=tjoAtd}|rQ| sQtd	tjj dS |S )
Nr2   r_  r   fallbackr   F)try_import_cutlassCUTLASSzFailed to import CUTLASS lib. Please check whether _inductor.config.cuda.cutlass_dir %s is set correctly. Skipping CUTLASS backend for now.)rb  r`  rc  rd  	size_hintrh   rB   cutlass_backend_min_gemm_sizecodegen.cuda.cutlass_utilsr  rK   r  r  r   r;  rE  r  r  r  r  r   r  cutlass_dir)	r  r  r  r  r`  	gemm_sizer  r  r   rI   rI   rP   use_cutlass_template  s*   

r  op_namec                 C  s4   t jj }|dkrdS |  dd |dD v S )z8Check if CUTLASS should be used for the given operation.ALLTc                 S  r  rI   r  rM   rI   rI   rP   rQ     rk  z'_use_cutlass_for_op.<locals>.<listcomp>r&  )rh   rB   cutlass_enabled_opsr  rQ  )r  enabled_opsrI   rI   rP   _use_cutlass_for_op  s   r  r   _IntLikec              
   C  s`   ddl m} tjj}tjj o/|jj	
tt|||  t||| o/|jj o/|jj S )Nr   r_  )torch._inductor.virtualizedr`  rh   r  decompose_k_thresholdrK   r  r  rc  rd  statically_known_truery   AndGeaot_modecpp_wrapper)r  r  r  r`  r  rI   rI   rP   use_decompose_k_choice   s   
r  c              
   C  sb   t jj}ddlm} ttjjo0|j	j
tt|||  t||| o0|j	j o0|j	j S )z
    Check if we should use the contiguous subgraph transform.
    This transform makes the second matrix contiguous before the matmul.
    r   r_  )rh   rocmcontiguous_thresholdr  r`  rw   rK   r  r  rc  rd  r  ry   r  r  r  r  )r  r  r  r  r`  rI   rI   rP   use_contiguous3  s   r  c                   s0  t jj}g d}t|tjr|js|S |dkrg S t| tjr"| jr+t|tjr.|js.d n	t||  ||  dt|} fdd|D }g g g }}}|D ].}	||	 }
|
dk r]qR|
|
d @ dkro|
dkro|	|	 qR|
d	 dkr{|	|	 qR|	|	 qRt j
d
kr|| | S || | }|d | S )N)rm   r  ro   rn      r   r  r_  c                   s    g | ]}| kr|kr|qS rI   rI   )rN   divisormax_k_splitmin_k_splitrI   rP   rQ   c  s
    z get_k_splits.<locals>.<listcomp>rn   r2   r  
EXHAUSTIVE)rh   r  num_decompose_k_splitsrx   ry   r&  	is_numberrL  divisorsr  max_autotune_gemm_search_space)r  r  r  k_splits_limitdefault_k_splitsr  pow_of_2_divisorsmul_of_32_divisorsrest_of_splitsdkPartbest_splitsrI   r  rP   get_k_splitsK  s@   


r  c                 C  s   t j| jS r   )rK   rB   r  gcnArchNamer   rI   rI   rP   _rocm_native_device_arch_name  s   r  Qtuple[Optional[str], Callable[[], list[Any]], Callable[[], list[Any]], type[Any]]c                  C  s|   zdd l } ddlm}m} ddlm} tj| j	}W n t
y7   ddd}ddd	}G d
d d}d }Y nw ||||fS )Nr   )gen_ops_librarygen_ops_preselected)CKGemmOperationrG   r  c                   S     g S r   rI   rI   rI   rI   rP   r    r  z*try_import_ck_lib.<locals>.gen_ops_libraryc                   S  r  r   rI   rI   rI   rI   rP   r    r  z.try_import_ck_lib.<locals>.gen_ops_preselectedc                   @  s   e Zd ZdS )z*try_import_ck_lib.<locals>.CKGemmOperationN)r   r   r   rI   rI   rI   rP   r    s    r  )rG   r  )ck4inductor(ck4inductor.universal_gemm.gen_instancesr  r  ck4inductor.universal_gemm.opr  r  r  dirname__file__r   )r  r  r  r  package_dirnamerI   rI   rP   try_import_ck_lib  s   

r   c                   s   t jst jsdS tjjsdS | jjdksdS t| j}dd t j	j
D p,|dd |i  fdd  t j	j@ D }|s@dS | jtjtjtjfvrMdS t \}}}}|s]td	 dS t  re|t j	_t j	jsptd
 dS |t j	jkr}td dS dS )NFrB   c                 S  s   i | ]
}| d d |qS ):r   )rQ  rN   r  rI   rI   rP   r    r  z#use_ck_template.<locals>.<dictcomp>r!  r   c                   s   g | ]} | qS rI   rI   r"  requested_archsrI   rP   rQ     s    z#use_ck_template.<locals>.<listcomp>z,Please pip install Composable Kernel packagez,Please set TORCHINDUCTOR_CK_DIR env variablezInvalid path to CK libraryT)rh   r  r  rK   r  r  r   r  r  r  archrQ  r-  ck_supported_archr   r   r;  r=  r   r   r  	is_fbcodeck_dir)r  native_archrequested_supported_archsck_package_dirnamer   rI   r#  rP   use_ck_template  s<   




r,  c                 C  :   ddl m} tdot| o|jjj|| | dddkS )Nr2   r_  CKr   r  r   rb  r`  r  r,  rc  rd  r  r  r  r  r  r`  rI   rI   rP   use_ck_gemm_template     r1  c                 C  r-  )Nr2   r_  CKTILEr   r  r   r/  r0  rI   rI   rP   use_ck_tile_gemm_template  r2  r4  c                 C  s   t dot| S )Nr.  )r  r,  r  rI   rI   rP   use_ck_conv_template  r  r6  c                 C  s   t jpt jo| jjdkS r  )rh   r  r  r   r  r5  rI   rI   rP   _use_template_for_cpu  s   

r7  mat1Union[ReinterpretView, Buffer]mat2c                 C  s6   ddl m} t|j|sJ t| ||ddo|j S )Nr2   )r;   F)require_constant_mat2)r  r;   rx   r  use_cpp_gemm_templateis_contiguous)r  r8  r:  r;   rI   rI   rP   use_cpp_bmm_template  s
   r>  mat2_transposedr;  is_woq_int4q_group_sizec                 C  s:  ddl m} ddlm} ddlm}	 ddlm}
 t| r t	ds"dS t
jjs(dS | tjtjfv }tjtjtjtjg}|
|||rD| jnd ||d\}}}} }}t||frXdS t||jrb| }|	| \}}|d	|||| | |t | |d

}ddd}| j|v o|d uo||ot||jo| p| S )Nr2   r  )create_micro_gemm)*get_gemm_template_output_and_compute_dtype)mm_argsCPPF)	out_dtyper?  use_4x2_dim
micro_gemm)input_dtypeinput2_dtypeoutput_dtypenum_threadsuse_refrA  rO   r:   rG   rw   c                 S  s   |    |  d dkS )Nr   r2   )freeze_layoutr  rO   rI   rI   rP   is_last_dim_stride13  s   z2use_cpp_gemm_template.<locals>.is_last_dim_stride1r  )r  r  codegen.cpp_micro_gemmrB  codegen.cpp_utilsrC  kernel.mm_commonrD  r7  r  rh   cppweight_prepackr  rK   rL  rA  r=  r;  halfr   has_free_symbolsrx   BaseViewunwrap_viewparallel_num_threadsr  is_module_buffer)r  r8  r:  r?  r;  r@  rA  r  rB  rC  rD  	int8_gemmr  r  r  r  rK  r   rH  rP  rI   rI   rP   r<     sX   		


r<  c                   C  s   t jpt j p
tdS )NATEN)rh   r  r  r  rI   rI   rI   rP   use_aten_gemm_kernels@  s   
r^  c                   @  s>   e Zd ZU edZded< dddZddd	ZdddZ	dS )DebugDirManagerr   rH   prev_debug_namerG   r  c                 C  s   t tj| _d S r   )r  r_  counterr   r  rI   rI   rP   r  J  r  zDebugDirManager.__init__c                 C  s0   t jjj| _| j d| j | _| jt jj_d S )N_tmp_)rK   _dynamorh   debug_dir_rootr`  r   new_namer  rI   rI   rP   	__enter__M  s   zDebugDirManager.__enter__r   r   c                 G  s   t | j | jtjj_d S r   )r  r  re  r`  rK   rc  rh   rd  )r  r   rI   rI   rP   __exit__R  s   zDebugDirManager.__exit__Nr^  )r   r   rG   r  )
r   r   r   rW  r#  ra  r   r  rf  rg  rI   rI   rI   rP   r_  F  s   
 


r_  Callable[P, _T]r  r  tuple[_T, list[str]]c                   st   ddl m} g  d
 fdd}tj|d	| tj  | |i |}W d    | fS 1 s1w   Y  | fS )Nr2   r6   coderH   rG   r  c                        |  d S r   r  rj  source_codesrI   rP   save_output_code`  r/  z*run_and_get_code.<locals>.save_output_coderp  rj  rH   rG   r  rc  r7   r   r  rd  rK   rc  reset)r   r   r  r7   rp  r  rI   rn  rP   run_and_get_codeW  s   

rt  c                 O  sF   t | g|R i |\}}g }|D ]}|td|tj q||fS )Nz	'''.*?''')rt  r  r   findallDOTALL)r   r   r  r  ro  kernelsrj  rI   rI   rP   run_and_get_kernelsi  s
   rx  tuple[Any, list[str]]c                   s   d fdd}t |S )NrG   r   c                    s     } |     | S r   )r   backwardr  r   rI   rP   run_with_backwardt  s   z1run_fw_bw_and_get_code.<locals>.run_with_backward)rG   r   )rt  )r   r|  rI   r{  rP   run_fw_bw_and_get_codes  s   r}  c              	     s   ddl m} g dfdd d fdd}tj|d|5 tj|d  tj  | |i |}W d   n1 s>w   Y  W d   S W d   S 1 sVw   Y  S )zLGet the inductor-generated code, but skip any actual compilation or running.r2   r6   rj  rH   rG   r  c                   rk  r   rl  rm  rn  rI   rP   rp    r/  z"get_code.<locals>.save_output_coder  r7   r   c                   sF   G dd d}| j r|  n|  \}} |j |r  |j | S )Nc                   @  s$   e Zd ZdZdddZdd	d
ZdS )z@get_code.<locals>.patched_compile_to_module.<locals>.DummyModulez4This is empty to replace the generated triton modulerG   r  c                 S  r  r   rI   r  rI   rI   rP   r    r  zIget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.__init__r   r   r  c                 _  r  r   rI   r  rI   rI   rP   call  ro  zEget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.callNr^  r   r   r  r   rG   r  )r   r   r   r   r  r~  rI   rI   rI   rP   DummyModule  s    
r  )r  codegen_with_cpp_wrappercodegenr   )r  r  wrapper_codekernel_code)rp  rI   rP   patched_compile_to_module  s   

z+get_code.<locals>.patched_compile_to_modulecompile_to_modulerp  Nrq  )r  r7   rG   r   rr  )r   r   r  r7   r  r   rI   )rp  ro  rP   get_code|  s$   
(


r  c                 O  sJ   t | g|R i |}dt|  krdks!n J dt| |d S Nr2   r_  z%expected one or two code outputs got r   )r  rS   )r   r   r  ro  rI   rI   rP   get_triton_code  s
   r  c                 O  sN   t | g|R i |\}}dt|  krdks#n J dt| |d S r  )rt  rS   )r   r   r  r   ro  rI   rI   rP   run_and_get_triton_code  s
   r  tuple[Any, list[GraphLowering]]c                   s   ddl m  ddlm} |jg d fd	d
}tj|d| | |i |}W d    |fS 1 s7w   Y  |fS )Nr   r6   r>   r   r   r  rG   r  c                    s2   | i | | d }t | sJ | d S )Nr_  )rx   r  )r   r  rc  r7   graph_lowerings	real_initrI   rP   	fake_init  s   z-run_and_get_graph_lowering.<locals>.fake_initr  r  )torch._inductor.graphr7   torch._inductor.output_coder?   r  r   r  rd  )r   r   r  r?   r  r  rI   r  rP   run_and_get_graph_lowering  s   
r  aten_opoverride_fnc              	   c  sN    ddl m} |j|  }zt|||j| < dV  W ||j| < dS ||j| < w )z
    Override the lowering of aten_op with override_fn.
    The first argument of override_fn is the original lowering fn.
    r   )loweringN)torch._inductorr  	loweringsr  partial)r  r  r  orig_fnrI   rI   rP   override_lowering  s   
r  pre_fnpost_fnOptional[Callable[..., Any]]c                   s6   ddl m} |j d fdd}tjj|d	|S )zr
    Add hook functions to be called at the beginning and end of Scheduler.__init__.
    Used for unit tests.
    r   )	Schedulerr  r   r*  rG   c                   s&   | |  | |}r| | |S r   rI   )r  r*  outr  r  r  rI   rP   r    s
   


z(add_scheduler_init_hook.<locals>.wrapperr  N)r  r   r*  r   rG   r   )torch._inductor.schedulerr  r  unittestr   r  rd  )r  r  r  r  rI   r  rP   add_scheduler_init_hook  s   r  msgc                 C  s"   t jr
t|  dS t|  dS )z
    Warnings that will be actionable for PyTorch developers, but not
    end users.  Allows us to easily disable them in stable releases but
    keep them on for nightly builds.
    N)rh   developer_warningsr   r  info)r  rI   rI   rP   developer_warning  s   r  c                  C  s   z/t jd} | d tt jk r.tt j| d  dkr.t j| d  d dkr.t j| d  W S W n	 ty8   Y nw t jD ]}|drM|tdd   S q<dS )a  
    An experimental API used only when config.benchmark_kernel is true.

    The benchmark name is only available at codegen time. So we can not
    directly call it in benchmark_all_kernels which is run after codegen.

    The function assumes the argument after --only is the benchmark name.
    It works for torchbench.py/hugginface.py/timm_models.py. But for ad-hoc
    scripts, this function may return None.

    There are 2 flavors of --only argument we need handle:
    1. --only model_name
    2. --only=model_name
    z--onlyr2   r   re  z--only=N)r  argvr  rS   
ValueErrorr  )r  r  rI   rI   rP   get_benchmark_name	  s   

r  r.  c                 C  r  )Nc                 s      | ]}|d kV  qdS r2   NrI   rM   rI   rI   rP   r   %	  r  zis_ones.<locals>.<genexpr>r|   r.  rI   rI   rP   is_ones$	  r!  r  c                 C  r  )Nc                 s  r  )r   NrI   rM   rI   rI   rP   r   )	  r  zis_zeros.<locals>.<genexpr>r  r  rI   rI   rP   is_zeros(	  r!  r  inputsSequence[torch.Tensor]c                 C  r  )Nc                 s  s,    | ]}t |tjr|jtd kV  qdS )r   N)rx   rK   r  r   )rN   r   rI   rI   rP   r   -	  s    

z is_cpu_device.<locals>.<genexpr>r  )r  rI   rI   rP   is_cpu_device,	  s   r  rU  c                 C  s&   t | tjs
J d| jrtjS tjS )Nz8only support sympy.Expr as input to get_sympy_Expr_dtype)rx   ry   r&  r   rK   rG  r?  )rU  rI   rI   rP   get_sympy_Expr_dtype4	  s   r  should_profileIterator[Any]c                 o  sN    | r"t jj|i |}|V  W d    d S 1 sw   Y  d S d V  d S r   )rK   r   r   )r  r   r  r   rI   rI   rP   maybe_profile>	  s   "
r  c                  C  s   t jj} | dk rt } | S Nr2   )rh   rT  threadsrK   get_num_threads)r  rI   rI   rP   rZ  G	  s   rZ  c                  C  s,   ddl m}  |  }|dtjjrdS dS )Nr2   )get_backend_options
num_stagesr_     )runtime.triton_helpersr  r  rK   r  r  )r  optionsrI   rI   rP   get_backend_num_stagesN	  s   r  c                 C  s  t | tjjjjd}|dur|S ddlm}m} tj	 o#tj
 dk}| tjtjtjfv s0J t|jdrcddlm} | }| tjtjfv rQ|rQ|| |S tjjjjr]|tj|S |tj|S | tjtjfv rq|rq|| S tjjjjr||tjS |tjS )z
    We don't want to throw errors in this function. First check to see if the device is in device_info.py,
    then fall back to the inaccurate triton estimation.
    )is_tf32Nr   )get_max_simd_tflopsget_max_tensorcore_tflops)rp   r   
clock_rate)max_clock_rate)r   rK   backendsrB   matmul
allow_tf32triton.testingr  r  rL   get_device_capabilityr   r;  r=  inspect	signature
parametersr  torch._utils_internalr  )r   ds_topsr  r  SM80OrLaterr  sm_clockrI   rI   rP   get_device_tflopsV	  s&   


r  c                  C  s   ddl m}  |  S )Nr   get_dram_gbps)r  r  r  rI   rI   rP   get_gpu_dram_gbps	  s   r  c                  C  s"   ddl m}  | jjdddS )Nr   r  max_shared_mem)triton.runtimer  r  r  r  r  r  rI   rI   rP   get_gpu_shared_memory	  s   r  reduction_typec                 C  s
   |  dS )Nwelford)r  r  rI   rI   rP   is_welford_reduction	  r*  r  c                 C  s   t | rdS | dkrdS dS )Nr  online_softmax_reducer_  r2   )r  r  rI   rI   rP   reduction_num_outputs	  s
   r  c                   C  s   t  dkS )NLinux)platformsystemrI   rI   rI   rP   is_linux	  rw  r  c                   C  s
   t jdkS )Nrj   )r  r  rI   rI   rI   rP   r  	  r*  r  itrIterable[Any]c                 C  r  )Nc                 s  s$    | ]}t |tjo|j V  qd S r   )rx   ry   r&  r  rM   rI   rI   rP   r   	     " z#has_free_symbols.<locals>.<genexpr>r  )r  rI   rI   rP   rW  	  r!  rW  c                  G  s~   ddl m} | D ]4}t||j|j|j|j|jfr-t|	 pds)t|
 p'dr, dS qt||js4qtdt| dS )Nr2   r  rI   Tzunexpected type for is_dynamic F)r  r  rx   r  r  rX  rL  r8   rW  maybe_get_sizemaybe_get_strider:   	TypeErrorr  )r   r  trI   rI   rP   
is_dynamic	  s   
r  c                   @  s   e Zd ZdZdZdS )PlaceholderKERNEL_NAMEDESCRIPTIVE_NAMEN)r   r   r   r  r  rI   rI   rI   rP   r  	  s    r  rj  r0   inpc              	   C  s4  ddl m} tjdddd}t }t }t|t|dj|  t	d|j
 |d	 t	|j
|d	 t }t|| | |j
 W d    n1 sLw   Y  t | }	||j
 |j
  |  t	d
|j
 |d	 t	|j
|d	 | | k}
td||j|
|	 W d    d S 1 sw   Y  d S )Nr2   )stable_topological_sortr?  zutf-8F)modeencodingr  )r  	fake_modezBefore:
)filezAfter:
zZ%s, save before/after graph to %s, graph before/after are the same = %s, time elapsed = %s)pattern_matcherr  r  NamedTemporaryFileior
   r]   rY   	propagater  rc  r	   nowr\   lint	recompiler$  r   r  r   )rj  r  r  r  r  r  	before_ioafter_io
start_timetime_elapsedr  rI   rI   rP   pass_execution_and_save	  s>   

"r   	input_buf"Optional[Union[Buffer, Operation]]c                 C  s&   ddl m} t| |jot| j|jS )zB
    Check if input buffer is a multi-outputs template buffer
    r2   r  )r  r  rx   CppTemplateBufferr  MultiOutputLayoutr  r  rI   rI   rP   is_multi_outputs_template	  s   r  c                 C  s4   ddl m} t| |jot| jdkot| jd S )zL
    Check if input buffer is a output of multi-outputs template buffer
    r2   r  r   )r  r  rx   MultiOutputrS   r  r  r  rI   rI   rP   #is_output_of_multi_outputs_template	  s   r  r   Optional[Union[Node, Operation]]!Optional[torch._ops.OperatorBase]c                 C  s   | d u rdS ddl m} t| |jo!t| |j o!|d u p!| j|u pXt| |jkoXtt	j
jdo8| jt	j
jjjkpXtt	j
jdoH| jt	j
jjjkpXtt	j
jdoX| jt	j
jjjkS )NFr2   r  all_to_all_singleall_gather_into_tensorreduce_scatter_tensor)r  r  rx   _CollectiveKernel_WaitKernelop_overloadr  FallbackKernelr   rK   r   torchrecr  defaultr  r  r  rm  r  rI   rI   rP   is_collective
  s(   

r  "Optional[Union[IRNode, Operation]]c                 C  s   ddl m} t| |jkS Nr2   r  )r  r  r  r  r  r  rI   rI   rP   is_wait'
  s   r  snoder@   c                 C  4   ddl m} t| |rtdd | jD S t| jS )Nr   GroupedSchedulerNodec                 s  r  r   )contains_collectiverM   rI   rI   rP   r   1
  r  z&contains_collective.<locals>.<genexpr>)r  r  rx   rr  snodesr  r  r  r  rI   rI   rP   r  -
     

r  c                 C  r  )Nr   r  c                 s  r  r   )contains_waitrM   rI   rI   rP   r   :
  r  z contains_wait.<locals>.<genexpr>)r  r  rx   rr  r  r  r  r   rI   rI   rP   r"  6
  r!  r"  Optional[Operation]?Union[torch._ops.OpOverload, Collection[torch._ops.OpOverload]]c                 C  s6   ddl m} t|tjjr|g}t| |jo| j|v S r  )r  r  rx   rK   r  r  r  r  r  rI   rI   rP   is_fallback_op?
  s   r%  buf_namename_to_bufname_to_fused_nodec                 C  s   |||  j   S r   )defining_opr  )r&  r'  r(  rI   rI   rP   buf_name_to_fused_snodeJ
  s   r*  c                 C  rw  rx  rI   r  rI   rI   rP   ry  U
  rz  collected_node_setMutableSet[BaseSchedulerNode]dict[str, SchedulerBuffer]dict[str, BaseSchedulerNode]criteria_cbCallable[[Any], bool]c                 C  sP   || rd S | |  | jD ]}t|j||}||v rqt|||||d qd S )Nr0  )r/  unmet_dependenciesr*  r   find_recursive_deps_of_node)r  r,  r'  r(  r0  depdefining_op_for_deprI   rI   rP   r4  P
  s"   

r4  c                 C  rw  rx  rI   r+  rI   rI   rP   ry  n
  rz  c              	   C  s   || rd S | |  |  D ]4}|jD ].}|jd usJ |j dkr%q|j |vr-q||j  }||v r9qt|||||d qqd S )NOUTPUTr2  )r/  get_outputsr  r  r  find_recursive_users_of_node)r  r,  r'  r(  r0  orH  user_oprI   rI   rP   r9  i
  s,   

r9  dynamo_gm_num_inputsaot_fw_gm_num_inputsc                 C  s   t jjjrdnd}||  | S )zaComputes the number of inputs to the aot fw graph which have fixed addresses (params and buffers)r_  r   )rK   
_functorchrh   functionalize_rng_ops)r<  r=  num_rng_seed_offset_inputsrI   rI   rP   num_fw_fixed_arguments
  s   rA  fx_gc                 C  sd   ddd}d}g }| j jD ]}|jdkr!||r|| |d	7 }q|ttt|ks.J t|S )z>
    Infers which inputs are static for a backwards graph
    rO   r1   rG   rw   c                 S  s(   d| j vod| j vod| j vod| j vS )Ntangentsbwd_seedbwd_base_offsetbwd_rng_stater  rO  rI   rI   rP   is_saved_tensor
  s   
z'count_tangents.<locals>.is_saved_tensorr   r  r2   N)rO   r1   rG   rw   )rc  r*  rm  r  rS  r   rS   )rB  rG  	arg_countstatic_arg_idxsr  rI   rI   rP   count_tangents
  s   


rJ  c                   @  s.   e Zd ZU ded< dddZedd	d
ZdS )	BoxedBoolrw   r   rG   c                 C  s   | j S r   )r   r  rI   rI   rP   r1  
  s   zBoxedBool.__bool__r  r   Union[BoxedBool, bool]c                 C  s   t | tr
d| _| S dS rx  )rx   rK  r   r  rI   rI   rP   disable
  s   
zBoxedBool.disableNr_  )r  r   rG   rL  )r   r   r   r   r1  r  rM  rI   rI   rI   rP   rK  
  s
   
 
rK  kernel_listc                 #  sh    ddl m} |j	 		 dd fdd}tj|d| d V  W d    d S 1 s-w   Y  d S )Nr2   r4   Tr  r5   kernel_namerH   r  r8  r  gpurw   cpp_definitionrG   r   c                   s     | | |||||S r   rl  )r  rO  r  r8  rP  rQ  rN  orig_define_kernelrI   rP   define_kernel
  s   
z.collect_defined_kernels.<locals>.define_kernelrT  )NTN)r  r5   rO  rH   r  rH   r8  r  rP  rw   rQ  r  rG   r   )codegen.wrapperr5   rT  r   r  rd  )rN  r5   rT  rI   rR  rP   collect_defined_kernels
  s   "rV  c                 C  s   | d S )N__original__rI   r  rI   rI   rP    get_cloned_parameter_buffer_name
     rX  c                 C  s   | t v S r   )rR   r  rI   rI   rP   r  
  rY  r  c                 C  s   | dkot | S )NrC   )r  r  rI   rI   rP   device_need_guard
  r  rZ  c                 C  sL   t  r| tjkrtj rtj dkrt jrdS | ttj	tj
tjgv S )N)r{  r   F)rh   r'  rK   r;  rB   rL   r  bfloat16_atomic_adds_enabledr    rG  rw   r  rI   rI   rP   ,needs_fallback_due_to_atomic_add_limitations
  s   
r\  r  
self_dtype	src_dtypesrc_device_typesrc_is_tensorc                 C  s   | j tjjjtjjjfv r|d u rdS | j tjjjkrdnd}|d |fvp]|o.t|o.t|p]| j tjjjkoM|dkoM|oM|dkoMt	j
joMt	j
jpMt dkp]||koY|tjtjfv p]t S )NFr/  r   r   r2   )overloadpacketrK   r   atenscatter_reduce_scatter_reducescatter_r  r\  rh   rT  fallback_scatter_reduce_sumdynamic_threadsrZ  rw   rG  r  )r  r  r]  r^  r_  r`  	reduce_tyrI   rI   rP   use_scatter_fallback
  s8   	ri  c                 C  s  ddl m}m} ddlm} tdt|  d t| D ]m\}}td|dd ||u r2td	 q||u r;td
 qt||r|	 }t|rIdnd d |rb|j
dusXJ td|j
jj  td |jjD ]}t| qjtd |jjD ]}t| qyqtdt| dS )z
    An API that can be used in pdb to dump a node_schedule.
    Right mainly dump the read/write dependencies but can add more as needed.
    r   )DisableReductionEnableReduction)SchedulerNodezNode schedule with z nodesr	  3r!  zenable reductionzdisable reductionredpwz scheduler nodeNzoriginal reduction hint zReadDep:z	WriteDep:zUnrecognized node type: )torch._inductor.codegen.simdrj  rk  r  rl  r  rS   r   rx   is_reductionr  r  reduction_hintr!  r"  r#  r   r  )r  rj  rk  rl  r  r  is_redr5  rI   rI   rP   dump_node_schedule  s0   




rt  r   r  c                 C  s*   ddl m} ||  t| j t dkS )Nr   )r  )r  r  storage_offsetr  r   GPU_ALIGN_BYTES)r   r  rI   rI   rP   tensor_is_aligned2  s   rw  example_inputc                 C  s   t | jjsdS tjpt| S rx  )r  r   r  rh   assume_aligned_inputsrw  )rx  rI   rI   rP   should_assume_input_aligned@  s   rz  r>  c                  C  s>   t jj } | st S | jr| jjst S | jj}| S r   )	rK   _guardsTracingContexttry_getrC  nullcontextr  re  suppress_guards)tracing_contextre  rI   rI   rP   #maybe_get_suppress_shape_guards_ctxI  s   r  tuple[_T, str]c                 O  s   t jjtddJ tj  dd l}dd l	}|
 }||}ddlm} || |j}||j | |i |}	| }
|| || W d    |	|
fS 1 sVw   Y  |	|
fS )Nr   Tr   )output_code_log)r  r   r  rd  rh   rK   rc  rs  r  loggingr
   StreamHandlertorch._inductor.codecacher  
addHandlerlevelsetLevelDEBUGr$  removeHandler)r   r   r  r  r  log_capture_stringchr  
prev_levelr  r   rI   rI   rP   run_and_get_cpp_codeY  s$   




r  Sequence[InputType]Optional[ShapeEnv]c                 C  s<   t | }|d ur|jS | D ]}t|tjr|jj  S qd S r   )rY   re  rx   rK   r.   r  )r  r  inputrI   rI   rP   shape_env_from_inputsr  s   r  Callable[[list[InputType]], _T]inputs_to_checkmutated_input_idxsOrderedSet[int]c                   s&   t  dkrS d fdd}|S )	Nr   
new_inputslist[InputType]rG   r   c                   s0   t |  \}}| }t|rt|| |S r   )copy_misaligned_inputsrS   rK   _foreach_copy_)r  old_tensorsnew_tensorsr  r  r  r  rI   rP   r    s   z)align_inputs_from_check_idxs.<locals>.run)r  r  rG   r   )rS   )r  r  r  r  rI   r  rP   align_inputs_from_check_idxs  s   r  c                 C  s`   d|   v r	d}ntdd t|   |  D d }t| |fd }t||   |  S )Nr   c                 s  s     | ]\}}|d  | V  qdS r  rI   )rN   r  r  rI   rI   rP   r     r  z)clone_preserve_strides.<locals>.<genexpr>r2   r   )r  r   r   r  rK   
as_stridedclone)rO   needed_sizer  rI   rI   rP   clone_preserve_strides  s   "r  r  r  check_inputs_idxsreturn_pair_idxsOptional[OrderedSet[int]]-tuple[list[torch.Tensor], list[torch.Tensor]]c                 C  s   g }g }|du}|D ]3}| | }t |tjsJ dt| | t r=t|| |< |r=||v r=|| || |  q
||fS )z
    Clones misaligned tensors which we inferred were aligned. Returns a tuple of [old_tensors], [new_tensors] for every
    cloned tensor which is in `return_pair_idxs`.
    Nz Expected tensors only, but got: )rx   rK   r  r  data_ptr	ALIGNMENTr  r  )r  r  r  r  r  ret_pair_definedr   _inprI   rI   rP   r    s   

r  static_input_idxsc                 C  sT   g }|D ]}| | }t |tjr| t dkr|| qt|t|kr(|S |S )z[
    We require all inputs to be aligned, so introduce a copy for any
    that aren't.
    r   )rx   rK   r  r  r  r  rS   )r  r  aligned_static_input_idxsr  r  rI   rI   rP   remove_unaligned_input_idxs  s   
r  r   c                 C  sv   ddl m} ttjj}|jjj}|jjj	j
}|jj| |kr#dS |jr1|jj| dk r1dS || o:|| |kS )Nr2   r_  Tg@xDF)rb  r`  rK   iinforE  r   rc  rd  r  re  has_hintr  aot_compilation)r   r`  int_maxr  r  rI   rI   rP   expr_fits_within_32bit  s   
r  compiled_graphr?   c                   s   t jj }|d urX|jd urZt|jdksJ t| |jd us#J |jD ]5}|d u r3|jd  q&d t jj  }r@|j d fdd|jt	fd	d
|D  q&d S d S d S )Nr   Fr   r   rG   ,Union[float, int, SymInt, SymFloat, SymBool]c                   s(   d u rt | S  r| S | S r   )rr   deserialize_symexprevaluate_symexpr)r   )fakify_first_callre  rI   rP   map_expr  s
   

z4set_tracing_context_output_strides.<locals>.map_exprc                 3  s    | ]} |V  qd S r   rI   rN   r   )r  rI   rP   r     r  z5set_tracing_context_output_strides.<locals>.<genexpr>)r   r   rG   r  )
rK   r{  r|  r}  output_stridesrS   r  r  r  r  )r  r  r  r
  r  rI   )r  r  re  rP   "set_tracing_context_output_strides  s"   
r  c                  C  s`   t jd urt jS t  sdS tj rdS zddlm}  W n
 ty'   Y dS w | tj	dkS )NFr   REMOTE_CACHE_VERSIONz.pytorch/remote_cache:fx_graph_memcache_version)
rh   fx_graph_remote_cacher'  rK   _utils_internalis_fb_unit_testtorch._inductor.fb.remote_cacher  ModuleNotFoundErrorjustknobs_getval_intr  rI   rI   rP    should_use_remote_fx_graph_cache  s   

r  c                 C  s   t dd| S )Nz[^a-zA-Z0-9_]r   )r   subr  rI   rI   rP   normalize_name-  r/  r  ztl.int1ztl.float8e4nvztl.float8e5ztl.float8e4b8ztl.float8e5b16ztl.uint8)ztl.boolztl.float8_e4m3fnztl.float8_e5m2ztl.float8_e4m3fnuzztl.float8_e5m2fnuzztl.float8_e8m0fnuztl.float4_e2m1fn_x2c                 C  r  rI   rI   r  rI   rI   rP   r  =  r  r  z^.*[.]c                 C  s   t dt| }t||S )z"Convert torch.dtype to triton typetl.)_triton_type_rer  rH   _triton_type_mappingr  )r   triton_type_namerI   rI   rP   triton_typeC  s   r  c                 C  s6   t | | }|dd}tt|}t|tjsJ |S )Nr  r  )_torch_triton_mappingr  rx  rJ   rK   rx   r   )r   adjusted_type	type_namerF  rI   rI   rP   triton_type_to_torchI  s
   
r  r  r   c                 C  sh   | j  o3|  | ko3|  | ko3| j|jko3| j|jko3|   |  ko3|  | kS r   )	is_mkldnnr  r  r   r   untyped_storager  ru  r  r   rI   rI   rP   is_same_tensorQ  s   

r  c                 C  sJ   | j o$|  | ko$| j|jko$| j|jko$tjj| tjj|kS r   )r  r  r   r   rK   r   mkldnnr  r  rI   rI   rP   is_same_mkldnn_tensor]  s   

r  tuple[str, ...]c                   C  rw  )N)rN  isnanlogical_notlogical_andsignbitand_leltgegteqner  xorrI   rI   rI   rI   rP   boolean_opsg  ro  r  c                   @  r  )OpDtypeRuler/   type_promotion_kindr*  override_return_dtypeNr  rI   rI   rI   rP   r  {  r  r  zdict[str, OpDtypeRule]op_dtype_propagation_rulesr  r/   r  c                 C  s   t ||t| < d S r   )r  r  )r   r  r  rI   rI   rP   #register_op_dtype_propagation_rules  s   r  zOrderedSet[str]op_requires_libdevice_fp64c                 C  s   t |  d S r   )r  r/  r  rI   rI   rP   #register_op_requires_libdevice_fp64  r/  r  c                  C  s8   ddl m}  | j j}|dkrtjS |dkrdS tjS )Nr   r_  r   rC   )r  r`  rc  get_current_device_or_throwr  rh   cpu_backendcuda_backend)r`  
device_strrI   rI   rP   get_current_backend  s   r  c                 C  s,   | t jt jfv rtjjrt dkrt jS | S )z"Maybe upcast [b]float16 to float32r  )rK   r   r;  rh   r  codegen_upcast_to_fp32r  r=  r  rI   rI   rP   upcast_compute_type  s   
r  KeyTypeValTypec                   @  sl   e Zd ZdZd#ddZd$d
dZd%ddZd&ddZd'd(ddZd)ddZ	d*ddZ
d+dd Zd,d!d"ZdS )-
ScopedDictz
    A dictionary-like object that allows for scoped updates. It maintains
    an original dictionary and a set of new items that can override
    the original items within the scope.  The original dictionary is
    unmodified.
    original_dictMapping[KeyType, ValType]c                 C  s   || _ i | _d S r   r  	new_items)r  r  rI   rI   rP   r    r  zScopedDict.__init__r)  r  rG   r  c                 C  s   || j v r
| j | S | j| S r   r  r  r  r)  rI   rI   rP   r    s   


zScopedDict.__getitem__r   r  c                 C  s   || j |< d S r   )r  )r  r)  r   rI   rI   rP   __setitem__  r/  zScopedDict.__setitem__rd  rw   c                 C  s   || j v p	|| jv S r   r  r  rI   rI   rP   __contains__  r  zScopedDict.__contains__Nr  Optional[ValType]c                 C  s"   || j v r
| j | S | j||S r   )r  r  r  )r  r)  r  rI   rI   rP   r    s   

zScopedDict.getrr   c                 C  s,   t | j}| jD ]}|| jvr|d7 }q|S r  )rS   r  r  )r  r  r  rI   rI   rP   rs    s   


zScopedDict.__len__Iterator[KeyType]c                 c  s.    | j E d H  | jD ]
}|| j vr|V  q
d S r   r  )r  r  rI   rI   rP   __iter__  s   

zScopedDict.__iter__c                 C  s   t | jp| jS r   )rw   r  r  r  rI   rI   rP   r1    r  zScopedDict.__bool__c                 C  rm  r   rn  r  rI   rI   rP   __delitem__  r  zScopedDict.__delitem__)r  r  )r)  r  rG   r  )r)  r  r   r  rG   r  )r)  rd  rG   rw   r   )r)  r  r  r  rG   r  rt  )rG   r  r_  )r)  r  rG   r  )r   r   r   r   r  r  r  r  r  rs  r   r1  r  rI   rI   rI   rP   r    s    






r  )frozen_defaultr   Optional[type[Any]]r   c                 s"   d fdd}| d u r|S || S )Nr   rl   rG   c                   s(   t jdkrtj| d dS tj|  dS )N)r  r  T)kw_onlyr   r   )r  version_infodataclasses	dataclass)r   r   rI   rP   wrap  s   
zir_dataclass.<locals>.wrap)r   rl   rG   rl   rI   )r   r   r  rI   r   rP   ir_dataclass  s   r	  Optional[list[int]]c                  C  s&   t jj } | d ur| jr| jjS d S r   )rK   r{  r|  r}  fw_metadatabw_donated_idxs)r  rI   rI   rP   get_donated_idxs  s   r  c                   @  s    e Zd ZdZdZdZdZdZdS )TritonAttrsDescriptorVersionr   r2   r_  r  r  N)r   r   r   V0_NO_TRITONV1_COMPILERV2_BACKENDSV3_BACKENDS_TUPLEV4_DICTrI   rI   rI   rP   r    s    r  c                  C  sT   t jdd u rtjS dd l} dd l} t| jj	drtj
S t| j	j	dr'tjS tjS )Nr  r   AttrsDescriptor)	importlibutil	find_specr  r  triton.backends.compilertriton.compiler.compilerr   r  compilerr  r  r  )r  rI   rI   rP   #get_triton_attrs_descriptor_version   s   r  c                   C  s   t  tjkS r   )r  r  r  rI   rI   rI   rP   triton_version_uses_attrs_dict  rw  r  r<   c                 C  sF   ddl m} t| |jsdS t| jtjjr!tjj	j
| jjv r!dS dS )zq
    Returns True if the node is an op that is not cudagraphable.
    Usually only custom ops have this tag.
    r2   r  FT)r  r  rx   r  r  rK   r  r  r   r  r  r  r  rI   rI   rP   is_cudagraph_unsafe_op  s   r  c                  C  sX   t jdd} t r*ddlm} | }|r*t j|dd}| r(t j	|| gn|} | S )NLD_LIBRARY_PATHr  r   )get_runtime_pathr  lib)
r  r  r  rh   r'  libfb.py.parutilr  r  r  pathsep)r  r  runtime_pathlib_pathrI   rI   rP   get_ld_library_path1  s   r%  c                 C  s    ddl m} t| |o| jd uS )Nr   )SubgraphPythonWrapperCodegen)torch._inductor.codegen.wrapperr&  rx   partition_signatures)r  r&  rI   rI   rP   #is_codegen_graph_partition_subgraph>  s   
r)  c                   C  s    t jjjjp
tjd uot jjjS r   )rK   r  rh   r  
cudagraphs&_unstable_customized_partition_wrapperr  r  rI   rI   rI   rP   is_using_cudagraph_partitionG  s
   r,  c                 C  s8   ddl m} |jj| dr|jj| drtjS tjS )Nr2   r_  l        i   )	rb  r`  rc  rd  statically_known_ltr  rK   rE  rG  )r  r`  rI   rI   rP   dtype_from_sizeN  s   r.  )r   rD   r   c                 C  $   | dkr
t jj S d| v rdS dS )z;
    Returns True if the device supports MKL-DNN BF16.
    r   rD   TF)rK   r   r  _is_mkldnn_bf16_supportedr   rI   rI   rP   is_mkldnn_bf16_supported\  
   r2  c                 C  r/  )z;
    Returns True if the device supports MKL-DNN FP16.
    r   rD   TF)rK   r   r  _is_mkldnn_fp16_supportedr1  rI   rI   rP   is_mkldnn_fp16_supportedh  r3  r5  elementsSequence[Sequence[T]]headersSequence[T]c              	   C  s   dd |D }| D ]"}t |t |ksJ t|D ]\}}t|| t t|||< qq	g }|ddd t||D  t|t |d  t |d  }|d|  | D ]}|dd	d t||D  qWd
|S )Nc                 S  s   g | ]}t t|qS rI   )rS   rH   r  rI   rI   rP   rQ   u  s    ztabulate_2d.<locals>.<listcomp>|c                 s  $    | ]\}}d || d V  qdS r	  NrI   )rN   hr?  rI   rI   rP   r   {  r  ztabulate_2d.<locals>.<genexpr>r_  r2   re  c                 s  r;  r<  rI   )rN   r   r?  rI   rI   rP   r     r  r'  )rS   r   r   rH   r  r  r   r   )r6  r8  widthsrowr   r   r:  total_widthrI   rI   rP   tabulate_2dt  s     "
rA  dict1r  dict2
d1_defaultValType | None
d2_defaultEGenerator[tuple[KeyType, ValType | None, ValType | None], None, None]c                 c  s`    t |  t | B }|D ]}| |}||}||dur"|n||dur)|n|fV  qdS )a  
    Zip two dictionaries together, replacing missing keys with default values.

    Args:
        dict1 (dict): The first dictionary.
        dict2 (dict): The second dictionary.
        d1_default (Any): the default value for the first dictionary
        d2_default (Any): the default value for the second dictionary

    Yields:
        tuple: A tuple containing the key, the value from dict1 (or d1_default if missing),
               and the value from dict2 (or d2_default if missing).
    N)r    r-  r  )rB  rC  rD  rF  all_keysr)  value1value2rI   rI   rP   	zip_dicts  s   

rK  config_patchesc                 C  s`   dd	d
}|  dtjj}|  } |r.|| dd || dd || dtjj  || dd | S )a1  
    Ensures the configuration is internally consistent for standalone AOTInductor.

    If `aot_inductor.compile_standalone` is set to True in the provided
    `config_patches` (or falls back to the global config), this function ensures
    that the following configs are also enabled:
        - `aot_inductor.package_cpp_only`

    Args:
        config_patches (dict[str, Any]): A dictionary of user-provided config
            overrides for AOTInductor compilation.

    Returns:
        dict[str, Any]: The possibly-updated `config_patches` dictionary.
    rL  r  config_namerH   config_valuer   rG   r  c                 S  sP   |  |tt|}|d u r|| |< d S |s$||kr&td| d| dd S d S )NzInvalid config: =z. when aot_inductor.compile_standalone is True.)r  rJ   rh   r   )rL  rM  rN  r   rI   rI   rP   patch_config  s   z2maybe_aoti_standalone_config.<locals>.patch_configzaot_inductor.compile_standalonezaot_inductor.package_cpp_onlyTz aot_inductor.embed_kernel_binaryz#aot_inductor.emit_multi_arch_kernelz+aot_inductor.model_name_for_generated_files
aoti_modelN)rL  r  rM  rH   rN  r   rG   r  )r  rh   aot_inductorcompile_standalonecopyrK   r  r  )rL  rP  rS  rI   rI   rP   maybe_aoti_standalone_config  s   
rU  c                  C  sV   ddl m}  | jj}|du rdS t|tstd|dkrdS td|s)tddS )	zL
    Validates if a model name is suitable for use in code generation.

    r   rg   NTz4Invalid AOTI model name: Model name must be a stringr  z^[a-zA-Z_][a-zA-Z0-9_]*$zVInvalid AOTI model name: Model name can only contain letters, numbers, and underscores)	r  rh   rR  model_name_for_generated_filesrx   rH   r  r   r   )rh   
model_namerI   rI   rP   is_valid_aoti_model_name  s   
rX  r'   unbacked_onlyOrderedSet[sympy.Symbol]c                 C  s   |rt | S t| S r   )r&   r%   )rO   rY  rI   rI   rP   get_free_symbols  s   r[  cudagraph partition due to Optional[BaseSchedulerNode]c                 C  s`   t jjsdS | |  }|r)|j }r)|  }r)|jdd }r)| d| }t| dS )z
    Cudagraph partition may lead to extra memory overhead so we
    log partition reasons to help users understand the overhead.
    Nstack_tracez. Found from : 
 )	rh   r  r*  r  get_origin_noder  r  perf_hint_logr  )r  ru  r  warning_msgir_noders  r^  rI   rI   rP   maybe_log_cudagraph_partition  s   	
rc  dict[str, str]c                  C  s@   i t jdt jdt jtji} t rt	
d| d< | S )zA
    Get a base environment for running Python subprocesses.
    
PYTHONPATHTORCH_CUSTOM_PYTHONPATHr  
PYTHONHOME)r  r  r  r"  r  r  r  rh   r'  	sysconfigget_path)envrI   rI   rP   python_subprocess_env  s   rk  c                   @  s"   e Zd ZU dZded< ded< dS )CUDAGraphWrapperMetadataz
    Metadata for Customized CUDAGraphWrapper.

    Currently assumes there is 1 dynamo graph and will extend to
    multiple graphs in the future.
    rr   num_partitionspartition_indexNr   rI   rI   rI   rP   rl  .  s   
 rl  .c                   @  s   e Zd ZU dZded< dS )CUDAGraphWrapperNzOptional[CUDAGraphWrapperType]r  )r   r   r   r  r   rI   rI   rI   rP   ro  E  s   
 ro  CUDAGraphWrapperTypec                 C  s
   | t _d S r   )r+  r  )r  rI   rI   rP   !set_customized_partition_wrappersW  r*  rq   tuple[list[Any], dict[str, Any]]c                   s   | j j}| j g || j j| j j}| j j}t||f\}}ddd  fdd|D }ddd	dfddfdd|D }t||\}}||fS )NrG   rw   c                 S  s"   t | tjjjot | tjjj S r   )rx   rK   r  r  r:   GeneratorStaterO  rI   rI   rP   _is_tensor_ird  s   
z(snode_args_kwargs.<locals>._is_tensor_irc                   s*   g | ]} |rt jjj|d dn|qS )F)guard_shape)rK   r  r  ir_node_to_tensorrp  )rt  rI   rP   rQ   i  s    z%snode_args_kwargs.<locals>.<listcomp>r  c                 S  s   t j| ||dS )Nr   )rK   r   )r  r   r   rI   rI   rP   _tensorp  r  z"snode_args_kwargs.<locals>._tensorr   r   c                   s(   t | tjs| S  |  | j| j}|S r   )rx   rK   r  r  r   r   )r   r  )rw  rI   rP   to_real_tensors  s   z)snode_args_kwargs.<locals>.to_real_tensorc                   rU  rI   rI   rp  )rx  rI   rP   rQ   y  rk  r_  )rG   r  )r   r   rG   r   )r  r  fill_non_provided_argsconstant_argsr  pytreer!   tree_unflatten)r  r   r  	flat_argsflat_args_pytree_specrI   )rt  rw  rx  rP   snode_args_kwargs[  s    


r  r]  )rq   rr   rG   rr   )ru   rv   rG   rw   )r   r   )r   r   r   rr   r   rr   rG   r   r_  )r   r   rG   r   )r	  r
  rG   rv   )r  r  r  r  rG   rv   )r	  r  rG   r  )r"  r#  r$  r#  rG   r#  )r)  r*  rG   rH   )rW  rX  rG   rY  )r   r#  rG   r^  )rW  rh  rG   ri  )rm  rn  rG   rw   )r{  r1   r|  r}  rG   rw   )r  r   r   r  r  r  rG   r  )rB   )r   rH   rG   r  )r2   rB   )
r  r  r  r  r   rr   r   rH   rG   r   )rI   r  r  r  rB   )r  r  r  r  r   rr   r  rr   r  r   r   rH   rG   r   )r  r   r  rH   rG   r  )r  r   r  r   rG   r  )r  rr   r  rr   rG   rr   )rO   r  r  rr   rG   r  )rO   r  rG   r  )r   r  rG   r  )r  r  rG   r  )r  r  r  r  rG   rH   )r  r  r  r5   rG   r   r   )rB  rC  rD  rE  rG   rF  )r   rJ  r  rK  rG   rF  rl  )r  rv   rG   rm  )ru  rH   rG   rw   )ru  rd   r  rr   rG   rx  )r  rw   rG   rw   )r   rH   rG   rx  )r^  rv   r  r  rG   rv   )r  r   rG   r  )r   r   rG   rw   )r  r  rG   r  )r  r  rG   r1   )r  r  rG   r  r^  )r  r   rG   r   )NNT)r  r  r  r  r  rw   rG   r  )r  r  rG   r  )re  r(   r  r  rG   r  )r   r  rG   rr   rB  r   )ry  rz  rG   rw   rt  )r  rr   r   r   r  r  rG   r3   )r  r;   r  r  rG   rw   )r  rH   rG   rw   )
r  r;   r  rw   r  rw   r  rw   rG   rw   )r  r:   r  rw   rG   rw   )
r  r;   r  rr   r  rr   r  rr   rG   rw   )r  rH   rG   rw   )r  r  r  r  r  r  rG   rw   )r  r  r  r  r  r  rG   r  )r   rH   rG   rH   )rG   r  )r  r;   rG   rw   )r  r;   r8  r9  r:  r:   rG   rw   )FTFN)r  r;   r8  r:   r:  r:   r?  rw   r;  rw   r@  rw   rA  r  rG   rw   )r   rh  r   r  r  r  rG   ri  )r   r  rG   ry  )r   rh  r   r  r  r  rG   r   )r   rh  r   r  r  r  rG   rH   )r   rh  r   r  r  r  rG   r  )r  r  r  r  rG   r  )r  r  r  r  rG   r   )r  rH   rG   r  )rG   r  )r.  r  rG   rw   )r  r  rG   rw   )rU  rv   rG   r  )r  rw   r   r   r  r   rG   r  )r   r  rG   r   )r  rH   rG   rw   )r  rH   rG   rr   )r  r  rG   rw   )
rj  r  r  r0   r  r  r  rH   rG   r  )r  r  rG   rw   )r  r	  rm  r
  rG   rw   )r  r  rG   rw   )r  r@   rG   rw   )r  r#  rm  r$  rG   rw   )r&  rH   r'  r  r(  r  rG   r   )r  r@   r,  r-  r'  r.  r(  r/  r0  r1  rG   r  )r<  rr   r=  rr   rG   rr   )rB  r  rG   rr   )rN  r   rG   r  )r   rH   rG   rH   )r   r  rG   rw   )r   rH   rG   rw   )r   r  rG   rw   )r  rn  r  r  r]  r  r^  r  r_  rH   r`  rw   rG   rw   )r  r  rG   r  )r   r  rG   rw   )rx  r  rG   rw   )rG   r>  )r   rh  r   r  r  r  rG   r  )r  r  rG   r  )r  r  r  r  r  r  rG   r  )rO   r  rG   r  )r  r  r  r  r  r  rG   r  )r  r  r  r  rG   r  )r   rv   rG   rw   )r  r  r  r?   rG   r  )r   r  rG   rH   )r   rH   rG   r  )r  r  r   r  rG   rw   )rG   r  )r   rH   r  r/   r  r*  rG   r  )r   rH   rG   r  )r   r  rG   r  )r   r  r   rw   rG   r   )rG   r
  )rG   r  )r  r<   rG   rw   )r  r5   rG   rw   )r  rr   rG   r  )r   rH   rG   rw   )r6  r7  r8  r9  rG   rH   )NN)
rB  r  rC  r  rD  rE  rF  rE  rG   rG  )rL  r  rG   r  )rO   r'   rY  rw   rG   rZ  )r\  N)r  rH   ru  r  r  r]  rG   r  )rG   rd  )r  rp  rG   r  )r  r@   rG   rr  (p  
__future__r   r(  rC  r  enumr  r  r  r  rW  r  rM  r  r  r  r   r  r   r  rh  r  rO  r  r  collections.abcr   r   r   r   r   r   r	   r
   typingr   r   r   r   r   r   r   r   r   r   r   typing_extensionsr   r   r   r   r   r   r   ry   rK   torch.utils._pytreer  _pytreer{  $torch._inductor.analysis.device_infor   torch._inductor.runtime.hintsr   torch.utils._dtype_abbrsr   torch.utils._ordered_setr    r!   r"   OPTIMUS_EXCLUDE_POST_GRADr  r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   torch._prims_commonr/   torch.fxr0   torch.fx.noder1   r  r3   rU  r5   rc  r7   r  r8   r9   r:   r;   r<   r=   output_coder?   r  r@   rA   rR   rF   r   rW   torch._dynamo.device_interfacerX   torch._dynamo.utilsrY   torch.autogradrZ   torch.autograd.profiler_utilr[   (torch.fx.passes.graph_transform_observerr\   torch.fx.passes.shape_propr]   torch.utils._sympy.functionsr^   r_   r`   ra   rb   torch.utils._sympy.symbolrc   rd   torch.utils._sympy.value_rangesre   rf   r  rh   runtime.runtime_utilsri   r(  _IS_WINDOWS	getLoggerr   r   _logginggetArtifactLoggerr`  rl   r  r&  	VarRangesr  rr   	InputTypeGPU_KERNEL_BIN_EXTSrv  r  r  r  rs   rt   r~   Functionr   r  r   r   r   r   r  r  r  r   rV  r]  rg  rl  rv  r  r  r   r  r  r  r  r  r  r  r  r  r  r  r  r  rA  rI  r]  rk  rt  rv  r~  r  r  r  r  r  r  r  r  r  r  r  r   r  r  rD  r  clear_on_fresh_inductor_cacheclear_inductor_cachesfresh_inductor_cacher  r  	lru_cacher  r  r  r  r`  rk  r!  ru  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r,  r1  r4  r6  r7  r>  r<  r^  r_  rt  rx  r}  r  r  r  r  r  r  r  r  r  r  r  r  r  rZ  r  r  r  r  r  r  r  r  rW  r  Enumr  r   r  r  r  r  r  r"  r%  r*  r4  r9  rA  rJ  rK  rV  rX  r  rZ  r\  ri  rt  rw  rz  r  r  r  r  r  r  r  r  r  r  r  r  r.  r  compiler  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r	  r  r  r  r  r  r%  r)  r,  r.  SUPPORTED_MKLDNN_DEVICESr2  r5  rA  rK  rU  rX  r[  rc  rk  rl  PartitionFnTyperp  ro  r+  rq  r  rI   rI   rI   rP   <module>   s    4  


$
KV&
		$ /;=$  		g!8.

@
	+	!
(	
$&		'	#


$
0
	#1	

