o
    iî€  ã                   @   sú  d dl Z d dlmZ d dlmZ d dlmZ d dlmZm	Z	m
Z
 d dlZd dlZd dlmZ ddlmZmZ ddlmZ dd	lmZ d
dlmZmZmZ d
dlmZ d
dlmZmZ g d¢ZeG dd„ dƒƒZ dee! de!fdd„Z"dej#de!de!de$ej#e!f fdd„Z%dej#de!de!de!dej#f
dd„Z&ej' (d¡ ej' (d¡ G dd„ dej)ƒZ*dej#d e!dej#fd!d"„Z+d#ej#d$ej#d%e$e!e!e!f d&e$e!e!e!f d'ej#d(ej#d)ej#dej#fd*d+„Z,dej#d,ej#d-e-fd.d/„Z.ej' (d+¡ ej' (d/¡ G d0d1„ d1ej)ƒZ/G d2d3„ d3ej)ƒZ0G d4d5„ d5ej)ƒZ1G d6d7„ d7ej)ƒZ2d8e3e  d9e4d:e
e d;e-d<ede2fd=d>„Z5G d?d@„ d@eƒZ6G dAdB„ dBeƒZ7eƒ edCe6j8fdDddEdFœd:e
e6 d;e-d<ede2fdGdH„ƒƒZ9eƒ edCe7j8fdDddEdFœd:e
e7 d;e-d<ede2fdIdJ„ƒƒZ:dS )Ké    N)ÚSequence)Ú	dataclass)Úpartial)ÚAnyÚCallableÚOptionalé   )ÚMLPÚStochasticDepth)ÚVideoClassification)Ú_log_api_usage_onceé   )Úregister_modelÚWeightsÚWeightsEnum)Ú_KINETICS400_CATEGORIES)Ú_ovewrite_named_paramÚhandle_legacy_interface)ÚMViTÚMViT_V1_B_WeightsÚ	mvit_v1_bÚMViT_V2_S_WeightsÚ	mvit_v2_sc                   @   sV   e Zd ZU eed< eed< eed< ee ed< ee ed< ee ed< ee ed< dS )	ÚMSBlockConfigÚ	num_headsÚinput_channelsÚoutput_channelsÚkernel_qÚ	kernel_kvÚstride_qÚ	stride_kvN)Ú__name__Ú
__module__Ú__qualname__ÚintÚ__annotations__Úlist© r'   r'   úS/mnt/sdb/aimis/docanh/lib/python3.10/site-packages/torchvision/models/video/mvit.pyr      s   
 r   ÚsÚreturnc                 C   s   d}| D ]}||9 }q|S ©Né   r'   )r)   ÚproductÚvr'   r'   r(   Ú_prod'   s   
r/   ÚxÚ
target_dimÚ
expand_dimc                 C   sF   |   ¡ }||d kr|  |¡} | |fS ||krtd| j› ƒ‚| |fS )Nr,   zUnsupported input dimension )ÚdimÚ	unsqueezeÚ
ValueErrorÚshape©r0   r1   r2   Ú
tensor_dimr'   r'   r(   Ú
_unsqueeze.   s   
þr9   r8   c                 C   s   ||d kr|   |¡} | S r+   )Úsqueezer7   r'   r'   r(   Ú_squeeze7   s   
r;   c                       s|   e Zd Z		ddejdeej deej deddf
‡ fdd	„Zd
ej	de
eeef de
ej	e
eeef f fdd„Z‡  ZS )ÚPoolNFÚpoolÚnormÚ
activationÚnorm_before_poolr*   c                    sV   t ƒ  ¡  || _g }|d ur| |¡ |d ur| |¡ |r#tj|Ž nd | _|| _d S )N)ÚsuperÚ__init__r=   ÚappendÚnnÚ
SequentialÚnorm_actr@   )Úselfr=   r>   r?   r@   Úlayers©Ú	__class__r'   r(   rB   B   s   



zPool.__init__r0   Úthwc                 C   sö   t |ddƒ\}}tj|ddd\}}| dd¡}|jd d… \}}}| || |f| ¡ ¡ }| jr<| jd ur<|  |¡}|  	|¡}|jdd … \}}	}
| |||d¡ dd¡}tj
||fdd}| jsm| jd urm|  |¡}t|dd|ƒ}|||	|
ffS )	Né   r,   ©r,   r   )Úindicesr3   r   éÿÿÿÿ©r3   )r9   ÚtorchÚtensor_splitÚ	transposer6   ÚreshapeÚ
contiguousr@   rF   r=   Úcatr;   )rG   r0   rK   r8   Úclass_tokenÚBÚNÚCÚTÚHÚWr'   r'   r(   ÚforwardS   s   


zPool.forward)NF)r!   r"   r#   rD   ÚModuler   ÚboolrB   rQ   ÚTensorÚtupler$   r^   Ú__classcell__r'   r'   rI   r(   r<   A   s    ûþýüûú>r<   Ú	embeddingÚdc                 C   s@   | j d |kr	| S tjj|  dd¡ d¡|dd d¡ dd¡S )Nr   r,   Úlinear)ÚsizeÚmode)r6   rD   Ú
functionalÚinterpolateÚpermuter4   r:   )rd   re   r'   r'   r(   Ú_interpolatem   s   ýùrl   ÚattnÚqÚq_thwÚk_thwÚ	rel_pos_hÚ	rel_pos_wÚ	rel_pos_tc           %      C   s6  |\}}}	|\}
}}t dt||ƒ d ƒ}t dt|	|ƒ d ƒ}t dt||
ƒ d ƒ}t|| dƒ}t|| dƒ}t |¡d d …d f | t |¡d d d …f d|  |  }t||	 dƒ}t|	| dƒ}t |	¡d d …d f | t |¡d d d …f d|  |  }t|
| dƒ}t||
 dƒ}t |¡d d …d f | t |
¡d d d …f d|
  |  }t||ƒ}t||ƒ}t||ƒ}|| ¡  }|| ¡  }|| ¡  }|j\}}}}|d d …d d …dd …f  |||||	|¡} t d| |¡}!t d| |¡}"|  	dddddd	¡ ||| | |	 |¡} t 
| | dd¡¡ dd¡}#|# ||||	||
¡ 	dddddd	¡}#|!d d …d d …d d …d d …d d …d d d …d f |"d d …d d …d d …d d …d d …d d d d …f  |#d d …d d …d d …d d …d d …d d …d d f   |||| |	 |
| | ¡}$| d d …d d …dd …dd …f  |$7  < | S )
Nr   r,   ç      ð?zbythwc,hkc->bythwkzbythwc,wkc->bythwkr   r   rL   é   )r$   ÚmaxrQ   Úarangerl   Úlongr6   rT   Úeinsumrk   ÚmatmulrS   Úview)%rm   rn   ro   rp   rq   rr   rs   Úq_tÚq_hÚq_wÚk_tÚk_hÚk_wÚdhÚdwÚdtÚ	q_h_ratioÚ	k_h_ratioÚdist_hÚ	q_w_ratioÚ	k_w_ratioÚdist_wÚ	q_t_ratioÚ	k_t_ratioÚdist_tÚRhÚRwÚRtrX   Ún_headÚ_r3   Úr_qÚrel_h_qÚrel_w_qÚrel_q_tÚrel_posr'   r'   r(   Ú_add_rel_pos|   sH   


<<<


**$..ÿ.þü(r˜   ÚshortcutÚresidual_with_cls_embedc              	   C   sZ   |r	|   |¡ | S | d d …d d …dd …d d …f  |d d …d d …dd …d d …f 7  < | S r+   )Úadd_)r0   r™   rš   r'   r'   r(   Ú_add_shortcut¸   s
   
Dÿrœ   c                       s²   e Zd Zdejfdee dedededee dee dee d	ee d
edededede	dej
f ddf‡ fdd„Zdejdeeeef deejeeeef f fdd„Z‡  ZS )ÚMultiscaleAttentionç        Ú
input_sizeÚ	embed_dimÚ
output_dimr   r   r   r   r    Úresidual_poolrš   Úrel_pos_embedÚdropoutÚ
norm_layer.r*   Nc              
      sp  t ƒ  ¡  || _|| _|| _|| | _dt | j¡ | _|	| _	|
| _
t |d| ¡| _t ||¡g}|dkr@| tj|dd¡ tj|Ž | _d | _t|ƒdksUt|ƒdkrrdd„ |D ƒ}ttj| j| j|||| jd	d
|| jƒƒ| _d | _d | _t|ƒdks„t|ƒdkr·dd„ |D ƒ}ttj| j| j|||| jd	d
|| jƒƒ| _ttj| j| j|||| jd	d
|| jƒƒ| _d | _d | _d | _|r6t|dd … ƒ}t|ƒdkr×||d  n|}t|ƒdkrå||d  n|}dt||ƒ d }d|d  d }t t || j¡¡| _t t || j¡¡| _t t || j¡¡| _tj j!| jdd tj j!| jdd tj j!| jdd d S d S )Nrt   r   rž   T©Úinplacer,   c                 S   ó   g | ]}t |d  ƒ‘qS ©r   ©r$   )Ú.0rn   r'   r'   r(   Ú
<listcomp>æ   ó    z0MultiscaleAttention.__init__.<locals>.<listcomp>F)ÚstrideÚpaddingÚgroupsÚbiasc                 S   r¨   r©   rª   )r«   Úkvr'   r'   r(   r¬   ÷   r­   r   r   ç{®Gáz”?©Ústd)"rA   rB   r    r¡   r   Úhead_dimÚmathÚsqrtÚscalerr¢   rš   rD   ÚLinearÚqkvrC   ÚDropoutrE   ÚprojectÚpool_qr/   r<   ÚConv3dÚpool_kÚpool_vrq   rr   rs   rv   ÚlenÚ	ParameterrQ   ÚzerosÚinitÚtrunc_normal_)rG   rŸ   r    r¡   r   r   r   r   r    r¢   rš   r£   r¤   r¥   rH   Ú	padding_qÚ
padding_kvrg   Úq_sizeÚkv_sizeÚspatial_dimÚtemporal_dimrI   r'   r(   rB   Å   s   

ù	öù	öù	öõzMultiscaleAttention.__init__r0   rK   c                 C   s:  |j \}}}|  |¡ ||d| j| j¡ dd¡jdd\}}}| jd ur-|  ||¡\}}	n|}	| jd ur<|  ||¡d }| j	d urI|  	||¡\}}t
 | j| | dd¡¡}
| jd urr| jd urr| jd urrt|
|||	| j| j| jƒ}
|
jdd}
t
 |
|¡}| jrˆt||| jƒ | dd¡ |d| j¡}|  |¡}||fS )Nr   r,   r   rP   r   rO   )r6   r»   rT   r   r¶   rS   ÚunbindrÀ   rÁ   r¾   rQ   rz   r¹   rq   rr   rs   r˜   Úsoftmaxr¢   rœ   rš   r¡   r½   )rG   r0   rK   rX   rY   rZ   rn   Úkr.   rp   rm   r'   r'   r(   r^   !  s6   2


ù	
zMultiscaleAttention.forward)r!   r"   r#   rD   Ú	LayerNormr&   r$   r`   Úfloatr   r_   rB   rQ   ra   rb   r^   rc   r'   r'   rI   r(   r   Ä   sB    òþýüûúùø	÷
öõôóòñ>\r   c                       s”   e Zd Zddejfdee dededededede	d	e	d
e
dejf ddf‡ fdd„Zdejdeeeef deejeeeef f fdd„Z‡  ZS )ÚMultiscaleBlockrž   rŸ   Úcnfr¢   rš   r£   Úproj_after_attnr¤   Ústochastic_depth_probr¥   .r*   Nc
                    s  t ƒ  ¡  || _d | _t|jƒdkr.dd„ |jD ƒ}
dd„ |
D ƒ}ttj|
|j|dd ƒ| _|r3|j	n|j
}|	|j
ƒ| _|	|ƒ| _t| jtjƒ| _t||j
||j|j|j|j|j|||||	d| _t|d| |j	gtj|d d| _t|d	ƒ| _d | _|j
|j	kr‹t |j
|j	¡| _d S d S )
Nr,   c                 S   s    g | ]}|d kr|d  n|‘qS rM   r'   )r«   r)   r'   r'   r(   r¬   V  s     z,MultiscaleBlock.__init__.<locals>.<listcomp>c                 S   r¨   r©   rª   )r«   rÏ   r'   r'   r(   r¬   W  r­   )r®   r¯   )	r   r   r   r    r£   r¢   rš   r¤   r¥   rL   )Úactivation_layerr¤   r§   Úrow)rA   rB   rÔ   Ú	pool_skipr/   r   r<   rD   Ú	MaxPool3dr   r   Únorm1Únorm2Ú
isinstanceÚBatchNorm1dÚneeds_transposalr   r   r   r   r    rm   r	   ÚGELUÚmlpr
   Ústochastic_depthr½   rº   )rG   rŸ   rÓ   r¢   rš   r£   rÔ   r¤   rÕ   r¥   Úkernel_skipÚpadding_skipÚattn_dimrI   r'   r(   rB   E  sP   
ÿ
óûÿzMultiscaleBlock.__init__r0   rK   c           	      C   sä   | j r|  | dd¡¡ dd¡n|  |¡}|  ||¡\}}| jd u s%| js'|n|  |¡}| jd u r3|n|  ||¡d }||  |¡ }| j rR|  | dd¡¡ dd¡n|  |¡}| jd u s_| jra|n|  |¡}||  |  	|¡¡ |fS )Nr,   r   r   )
rÞ   rÚ   rS   rm   r½   rÔ   rØ   rá   rÛ   rà   )	rG   r0   rK   Úx_norm1Úx_attnÚthw_newÚx_skipÚx_norm2Úx_projr'   r'   r(   r^     s   **zMultiscaleBlock.forward)r!   r"   r#   rD   rÐ   r&   r$   r   r`   rÑ   r   r_   rB   rQ   ra   rb   r^   rc   r'   r'   rI   r(   rÒ   D  s4    	öþýüûúùø	÷
öõ>:rÒ   c                
       sP   e Zd Zdedeeef dededdf
‡ fdd„Zd	ejdejfd
d„Z	‡  Z
S )ÚPositionalEncodingÚ
embed_sizeÚspatial_sizeÚtemporal_sizer£   r*   Nc                    s’   t ƒ  ¡  || _|| _t t |¡¡| _d | _	d | _
d | _|sGt t | jd | jd  |¡¡| _	t t | j|¡¡| _
t t |¡¡| _d S d S )Nr   r,   )rA   rB   rí   rî   rD   rÃ   rQ   rÄ   rW   Úspatial_posÚtemporal_posÚ	class_pos)rG   rì   rí   rî   r£   rI   r'   r(   rB     s   
$ýzPositionalEncoding.__init__r0   c                 C   s¼   | j  | d¡d¡ d¡}tj||fdd}| jd ur\| jd ur\| jd ur\| jj	\}}tj
| j|dd}| | j d¡ | jdd¡ d|¡¡ tj| j d¡|fdd d¡}| |¡ |S )Nr   rO   r,   rP   )rW   Úexpandrg   r4   rQ   rV   rï   rð   rñ   r6   Úrepeat_interleaver›   rî   rT   )rG   r0   rW   Úhw_sizerì   Úpos_embeddingr'   r'   r(   r^   ›  s   & 
zPositionalEncoding.forward)r!   r"   r#   r$   rb   r`   rB   rQ   ra   r^   rc   r'   r'   rI   r(   rë   Œ  s    *rë   c                $       sÒ   e Zd Z									ddeeef d	ed
ee dedededededededede	e
dejf  de	e
dejf  deeeef deeeef deeeef ddf"‡ fdd„Zdejdejfdd„Z‡  ZS ) r   ç      à?rž   é  N©r   é   rù   ©r   rL   rL   ©r,   r   r   rí   rî   Úblock_settingr¢   rš   r£   rÔ   r¤   Úattention_dropoutrÕ   Únum_classesÚblock.r¥   Úpatch_embed_kernelÚpatch_embed_strideÚpatch_embed_paddingr*   c                    s  t ƒ  ¡  t| ƒ t|ƒ}|dkrtdƒ‚|du rt}|du r&ttjdd}tj	d|d j
|||d| _dd	„ t|f| | jjƒD ƒ}t|d j
|d
 |d f|d |d| _t ¡ | _t|ƒD ]/\}}|
| |d  }| j ||||||||	||d	¡ t|jƒdkrdd	„ t||jƒD ƒ}q`||d jƒ| _t tj|ddt |d j|¡¡| _|  ¡ D ][}t|tjƒrÔtjj|jdd t|tjƒrÓ|j durÓtj !|j d¡ q¯t|tjƒrõ|jdurçtj !|jd¡ |j durôtj !|j d¡ q¯t|tƒr
| "¡ D ]
}tjj|dd qÿq¯dS )aÄ  
        MViT main class.

        Args:
            spatial_size (tuple of ints): The spacial size of the input as ``(H, W)``.
            temporal_size (int): The temporal size ``T`` of the input.
            block_setting (sequence of MSBlockConfig): The Network structure.
            residual_pool (bool): If True, use MViTv2 pooling residual connection.
            residual_with_cls_embed (bool): If True, the addition on the residual connection will include
                the class embedding.
            rel_pos_embed (bool): If True, use MViTv2's relative positional embeddings.
            proj_after_attn (bool): If True, apply the projection after the attention.
            dropout (float): Dropout rate. Default: 0.0.
            attention_dropout (float): Attention dropout rate. Default: 0.0.
            stochastic_depth_prob: (float): Stochastic depth rate. Default: 0.0.
            num_classes (int): The number of classes.
            block (callable, optional): Module specifying the layer which consists of the attention and mlp.
            norm_layer (callable, optional): Module specifying the normalization layer to use.
            patch_embed_kernel (tuple of ints): The kernel of the convolution that patchifies the input.
            patch_embed_stride (tuple of ints): The stride of the convolution that patchifies the input.
            patch_embed_padding (tuple of ints): The padding of the convolution that patchifies the input.
        r   z+The configuration parameter can't be empty.Ngíµ ÷Æ°>)Úepsr   )Úin_channelsÚout_channelsÚkernel_sizer®   r¯   c                 S   ó   g | ]\}}|| ‘qS r'   r'   ©r«   rg   r®   r'   r'   r(   r¬   ì  r­   z!MViT.__init__.<locals>.<listcomp>r,   r   )rì   rí   rî   r£   rt   )	rŸ   rÓ   r¢   rš   r£   rÔ   r¤   rÕ   r¥   c                 S   r  r'   r'   r  r'   r'   r(   r¬     r­   rO   Tr¦   r³   r´   rž   )#rA   rB   r   rÂ   r5   rÒ   r   rD   rÐ   r¿   r   Ú	conv_projÚzipr®   rë   Úpos_encodingÚ
ModuleListÚblocksÚ	enumeraterC   r   r   r>   rE   r¼   rº   ÚheadÚmodulesrÜ   rÅ   rÆ   Úweightr±   Ú	constant_Ú
parameters)rG   rí   rî   rü   r¢   rš   r£   rÔ   r¤   rý   rÕ   rþ   rÿ   r¥   r   r  r  Útotal_stage_blocksrŸ   Ústage_block_idrÓ   Úsd_probÚmÚweightsrI   r'   r(   rB   ª  s€   
)ûü
÷ÿ€þ€

€€ôzMViT.__init__r0   c                 C   sŠ   t |ddƒd }|  |¡}| d¡ dd¡}|  |¡}| jjf| jj }| jD ]	}|||ƒ\}}q'|  |¡}|d d …df }|  	|¡}|S )Nru   r   r   r,   )
r9   r	  ÚflattenrS   r  rî   rí   r  r>   r  )rG   r0   rK   rÿ   r'   r'   r(   r^   "  s   




zMViT.forward)	rö   rž   rž   r÷   NNrø   rú   rû   )r!   r"   r#   rb   r$   r   r   r`   rÑ   r   r   rD   r_   rB   rQ   ra   r^   rc   r'   r'   rI   r(   r   ©  s\    
ï
þýüûúùø	÷
öõôóòñðïîxr   rü   rÕ   r  ÚprogressÚkwargsc                 K   sÚ   |d ur1t |dt|jd ƒƒ |jd d |jd d ksJ ‚t |d|jd ƒ t |d|jd ƒ | dd	¡}| dd
¡}td||| | dd¡| dd¡| dd¡| dd¡|dœ|¤Ž}|d urk| |j|dd¡ |S )Nrþ   Ú
categoriesÚmin_sizer   r,   rí   rî   Úmin_temporal_size©éà   r   é   r¢   Frš   Tr£   rÔ   )rí   rî   rü   r¢   rš   r£   rÔ   rÕ   )r  Ú
check_hashr'   )r   rÂ   ÚmetaÚpopr   Úload_state_dictÚget_state_dict)rü   rÕ   r  r  r  rí   rî   Úmodelr'   r'   r(   Ú_mvit9  s,    



ø	÷r(  c                   @   óJ   e Zd Zedeedddddddedd	d
ddddœidddœ	dZeZdS )r   z:https://download.pytorch.org/models/mvit_v1_b-dbeb1030.pthr  ©é   ©çÍÌÌÌÌÌÜ?r-  r-  ©çÍÌÌÌÌÌÌ?r/  r/  ©Ú	crop_sizeÚresize_sizeÚmeanrµ   r!  zShttps://github.com/facebookresearch/pytorchvideo/blob/main/docs/source/model_zoo.mdúœThe weights were ported from the paper. The accuracies are estimated on video-level with parameters `frame_rate=7.5`, `clips_per_video=5`, and `clip_len=16`ip¢.úKinetics-400gJ+‡žS@gh‘í|?eW@©zacc@1zacc@5gu“V¦Q@gœÄ °rxa@©	r  r  r  ÚrecipeÚ_docsÚ
num_paramsÚ_metricsÚ_opsÚ
_file_size©ÚurlÚ
transformsr#  N©	r!   r"   r#   r   r   r   r   ÚKINETICS400_V1ÚDEFAULTr'   r'   r'   r(   r   Z  ó4    ûþÿï÷r   c                   @   r)  )r   z:https://download.pytorch.org/models/mvit_v2_s-ae3be167.pthr  r*  r,  r.  r0  r!  zChttps://github.com/facebookresearch/SlowFast/blob/main/MODEL_ZOO.mdr4  ir5  gœÄ °r0T@gÃõ(\ªW@r6  gu“VP@g?5^ºI|`@r7  r>  NrA  r'   r'   r'   r(   r   {  rD  r   Ú
pretrained)r  T)r  r  c                 K   sØ  t  | ¡} g d¢g d¢g d¢g g d¢g g d¢g g g g g g g g g g g d¢g gg d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢gg g d¢g g d¢g g g g g g g g g g g d¢g gg d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢gd	œ}g }tt|d
 ƒƒD ],}| t|d
 | |d | |d | |d | |d | |d | |d | d	¡ q«tddd|dd| dd¡| |dœ|¤ŽS )a¿  
    Constructs a base MViTV1 architecture from
    `Multiscale Vision Transformers <https://arxiv.org/abs/2104.11227>`__.

    .. betastatus:: video module

    Args:
        weights (:class:`~torchvision.models.video.MViT_V1_B_Weights`, optional): The
            pretrained weights to use. See
            :class:`~torchvision.models.video.MViT_V1_B_Weights` below for
            more details, and possible values. By default, no pre-trained
            weights are used.
        progress (bool, optional): If True, displays a progress bar of the
            download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.video.MViT``
            base class. Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/video/mvit.py>`_
            for more details about this class.

    .. autoclass:: torchvision.models.video.MViT_V1_B_Weights
        :members:
    ©r,   r   r   rL   rL   rL   rL   rL   rL   rL   rL   rL   rL   rL   é   rG  ©é`   éÀ   rJ  é€  rK  rK  rK  rK  rK  rK  rK  rK  rK  rK  é   rL  )rJ  rJ  rK  rK  rK  rK  rK  rK  rK  rK  rK  rK  rK  rL  rL  rL  ©r   r   r   ©r,   r   r   ©r,   rG  rG  ©r,   rL   rL   ©r,   r,   r,   ©r   r   r   r   r   r   r    r   r   r   r   r   r   r    r  r!  FrÕ   çš™™™™™É?)rí   rî   rü   r¢   rš   rÕ   r  r  Nr'   )r   ÚverifyÚrangerÂ   rC   r   r(  r$  ©r  r  r  Úconfigrü   Úir'   r'   r(   r   œ  s„   
.ð.ðè,






ùÿ
ø	÷r   c                 K   sD  t  | ¡} g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢gg d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢gg d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢gg d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢g d¢gd	œ}g }tt|d
 ƒƒD ],}| t|d
 | |d | |d | |d | |d | |d | |d | d	¡ qßtddd|dddd| dd¡| |dœ
|¤ŽS )aC  Constructs a small MViTV2 architecture from
    `Multiscale Vision Transformers <https://arxiv.org/abs/2104.11227>`__ and
    `MViTv2: Improved Multiscale Vision Transformers for Classification
    and Detection <https://arxiv.org/abs/2112.01526>`__.

    .. betastatus:: video module

    Args:
        weights (:class:`~torchvision.models.video.MViT_V2_S_Weights`, optional): The
            pretrained weights to use. See
            :class:`~torchvision.models.video.MViT_V2_S_Weights` below for
            more details, and possible values. By default, no pre-trained
            weights are used.
        progress (bool, optional): If True, displays a progress bar of the
            download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.video.MViT``
            base class. Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/video/mvit.py>`_
            for more details about this class.

    .. autoclass:: torchvision.models.video.MViT_V2_S_Weights
            :members:
    rF  )rI  rI  rJ  rJ  rK  rK  rK  rK  rK  rK  rK  rK  rK  rK  rK  rL  rH  rM  rQ  rN  rO  rP  rR  r   r   r   r   r   r   r    r  r!  TFrÕ   rS  )
rí   rî   rü   r¢   rš   r£   rÔ   rÕ   r  r  Nr'   )r   rT  rU  rÂ   rC   r   r(  r$  rV  r'   r'   r(   r   þ  sÈ   
ððððÆN






ùÿ
öõr   );r·   Úcollections.abcr   Údataclassesr   Ú	functoolsr   Útypingr   r   r   rQ   Útorch.fxÚtorch.nnrD   Úopsr	   r
   Útransforms._presetsr   Úutilsr   Ú_apir   r   r   Ú_metar   Ú_utilsr   r   Ú__all__r   r$   r/   ra   rb   r9   r;   ÚfxÚwrapr_   r<   rl   r˜   r`   rœ   r   rÒ   rë   r   r&   rÑ   r(  r   r   rB  r   r   r'   r'   r'   r(   Ú<module>   sŽ    	
&"	,ÿþýüûúù
ø< H ÿþýüû
ú!!!*`.