o
    ?"g                     @   s   d dl mZmZ d dlZddlmZ ddlmZ e  Z				ddej	j
dejd	ejd
ejdeej dedee dee dee deejdf fddZdS )    )OptionalTupleN   )_flash_attention_forward)#is_flash_attn_greater_or_equal_2_10        modulequerykeyvalueattention_maskdropoutscalingsliding_windowsoftcapreturnc	                 K   s   |j d }
|dd}|dd}|dd}d }|jtjkr@t r(t }nt| jdr3| jj	}nt
dd |  D jj}|	dd  t||||f|
| j||||t|d|	}|d fS )Nr      _pre_quantization_dtypec                 s   s"    | ]}t |tjjr|V  qd S )N)
isinstancetorchnnLinear).0layer r   k/mnt/skqttb/ctump_chatbot/chatbot/lib/python3.10/site-packages/transformers/integrations/flash_attention.py	<genexpr>-   s     z*flash_attention_forward.<locals>.<genexpr>	is_causal)query_lengthr   r   softmax_scaler   r   use_top_left_masktarget_dtype)shape	transposedtyper   float32is_autocast_enabledget_autocast_gpu_dtypehasattrconfigr   nextmodulesweightpopr   r   _use_top_left_mask)r   r	   r
   r   r   r   r   r   r   kwargsseq_lenr!   attn_outputr   r   r   flash_attention_forward   s<   


r2   )r   NNN)typingr   r   r   modeling_flash_attention_utilsr   utilsr   r.   r   ModuleTensorfloatintr2   r   r   r   r   <module>   s<    		