"""
Partially inspired by torchtune's flex attention implementation

Citation:
@software{torchtune,
  title = {torchtune: PyTorch's finetuning library},
  author = {torchtune maintainers and contributors},
  url = {https://github.com/pytorch/torchtune},
  license = {BSD-3-Clause},
  month = apr,
  year = {2024}
}
"""

from typing import Optional, Tuple, Union

import torch

from ..utils import is_torch_flex_attn_available


if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import BlockMask, flex_attention
    from torch.nn.attention.flex_attention import create_block_mask as create_block_causal_mask_flex


class WrappedFlexAttention:
    """
    We are doing a singleton class so that flex attention is compiled once when it's first called.
    """

    _instance = None
    _is_flex_compiled = False
    _compiled_flex_attention = None

    def __new__(cls, *args, **kwargs):
        if cls._instance is None:
            # Create a new instance if one doesn't already exist
            cls._instance = super().__new__(cls)
        return cls._instance

    @torch.compiler.disable(recursive=False)
    def __init__(self):
        """
        Initialize or update the singleton instance.
        """
        if self._is_flex_compiled is False:
            self._compiled_flex_attention = torch.compile(flex_attention, dynamic=False)
            self._is_flex_compiled = True

    def __call__(self):
        return self._compiled_flex_attention


def make_flex_block_causal_mask(attention_mask_2d: torch.Tensor) -> "BlockMask":
    """
    Create a block causal document mask for a batch of sequences, both packed and unpacked.
    The block causal logic is passed into :func:`torch.nn.attention.flex_attention.create_block_mask`.
    The resulting BlockMask is a compressed representation of the full block causal
    mask and is essential for performant computation of flex attention.
    See: https://pytorch.org/blog/flexattention/

    Args:
        attention_mask_2d (torch.Tensor): Attention mask for packed and padded sequences
            of shape (batch_size, total_seq_len), e.g.

            for an unpacked sequence:
            [[1, 1, 1, 1, 0, 0, 0],
             [1, 1, 1, 1, 1, 0, 0]]

            for a packed sequence:
            [[1, 1, 1, 2, 2, 2, 0],
             [1, 1, 2, 2, 2, 3, 3]]

    Returns:
        BlockMask
    """
    device = attention_mask_2d.device
    document_ids = attention_mask_2d
    batch_size, total_seq_len = document_ids.shape

    def causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx):
        """
        Defines the logic of a block causal mask by combining both a standard causal mask
        and a block diagonal document mask.

        See :func:`~torchtune.modules.attention_utils.create_block_causal_mask`
        for an illustration.
        """
        causal_mask = q_idx >= kv_idx
        document_mask = document_ids[batch_idx, q_idx] == document_ids[batch_idx, kv_idx]
        padding_mask = document_ids[batch_idx, q_idx] > 0
        return causal_mask & document_mask & padding_mask

    return create_block_causal_mask_flex(
        mask_mod=causal_mask_mod,
        B=batch_size,
        H=None,  # the mask is head-independent, so it is broadcast over heads
        Q_LEN=total_seq_len,
        KV_LEN=total_seq_len,
        device=device,
    )


@torch.compiler.disable(recursive=False)
def compile_friendly_flex_attention(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    **kwargs,
) -> torch.Tensor:
    # The first call instantiates the singleton wrapper (compiling flex attention once);
    # calling the instance returns the compiled kernel.
    flex_attention_compiled = WrappedFlexAttention()()
    return flex_attention_compiled(query, key, value, **kwargs)


def flex_attention_forward(
    module: torch.nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Union[torch.Tensor, "BlockMask"],
    scaling: Optional[float] = None,
    softcap: Optional[float] = None,
    head_mask: Optional[torch.Tensor] = None,
    **kwargs,
) -> Tuple[torch.Tensor, torch.Tensor]:
    block_mask = None
    causal_mask = None
    if isinstance(attention_mask, BlockMask):
        block_mask = attention_mask
    else:
        causal_mask = attention_mask

    if causal_mask is not None:
        # Trim the 4D mask to the key length.
        causal_mask = causal_mask[:, :, :, : key.shape[-2]]

    def score_mod(score, batch_idx, head_idx, q_idx, kv_idx):
        if softcap is not None:
            score = softcap * torch.tanh(score / softcap)
        if causal_mask is not None:
            score = score + causal_mask[batch_idx][0][q_idx][kv_idx]
        if head_mask is not None:
            score = score + head_mask[batch_idx][head_idx][0][0]
        return score

    attn_output, attention_weights = compile_friendly_flex_attention(
        query,
        key,
        value,
        score_mod=score_mod,
        block_mask=block_mask,
        enable_gqa=True,
        scale=scaling,
        # Flex attention computes the log-sum-exp regardless, so returning it adds no extra cost.
        return_lse=True,
    )
    # The log-sum-exp is returned in float32; cast it back to the value dtype.
    attention_weights = attention_weights.to(value.dtype)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attention_weights
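
# --- Usage sketch (an added illustration, not part of the upstream module) ----------------
# A minimal example of how a caller would combine the helpers above: build a BlockMask for
# two packed/padded rows, then run `flex_attention_forward` on random tensors. The shapes
# (batch=2, heads=4, seq_len=128, head_dim=16), the document boundaries, and passing `None`
# for the unused `module` argument are assumptions made for this sketch only; it presumes a
# PyTorch build with flex attention support (>= 2.5, ideally on a CUDA device).
def _flex_attention_usage_sketch():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    seq_len = 128  # a multiple of the default block size keeps the BlockMask simple

    mask_2d = torch.ones(2, seq_len, dtype=torch.long, device=device)
    mask_2d[0, 64:] = 2  # row 0 packs a second document starting at position 64
    mask_2d[1, 96:] = 0  # row 1 is padded after position 96
    block_mask = make_flex_block_causal_mask(mask_2d)

    query = torch.randn(2, 4, seq_len, 16, device=device)
    key = torch.randn(2, 4, seq_len, 16, device=device)
    value = torch.randn(2, 4, seq_len, 16, device=device)

    # `module` is not referenced inside flex_attention_forward, so None suffices here.
    attn_output, lse = flex_attention_forward(None, query, key, value, attention_mask=block_mask)
    return attn_output.shape, lse.shape  # (2, 128, 4, 16) and (2, 4, 128)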