o
    S"g                     @  sn   d dl mZ d dlmZ d dlmZ d dlZd dlmZmZ d dl	m
Z
mZ d dlmZ G dd	 d	ejZdS )
    )annotations)Iterable)AnyN)Tensornn)StaticEmbeddingTransformer)SentenceTransformerc                      sN   e Zd Z	dd fd	d
ZdddZdddZdddZed ddZ  Z	S )!GISTEmbedLoss{Gz?modelr	   guidetemperaturefloatreturnNonec                   s   t    || _|| _|| _tjdd| _t|d t	r#t|d t	s't
d|j |j kp6|j|jk | _| jrL| jj| _t| jd trNt
ddS dS )a
  
        This loss is used to train a SentenceTransformer model using the GISTEmbed algorithm.
        It takes a model and a guide model as input, and uses the guide model to guide the
        in-batch negative sample selection. The cosine similarity is used to compute the loss
        and the temperature parameter is used to scale the cosine similarities.

        Args:
            model: SentenceTransformer model based on a `transformers`
                model.
            guide: SentenceTransformer model to guide the in-batch
                negative sample selection.
            temperature: Temperature parameter to scale the cosine
                similarities.

        References:
            - For further details, see: https://arxiv.org/abs/2402.16829

        Requirements:
            1. (anchor, positive, negative) triplets
            2. (anchor, positive) pairs

        Inputs:
            +---------------------------------------+--------+
            | Texts                                 | Labels |
            +=======================================+========+
            | (anchor, positive, negative) triplets | none   |
            +---------------------------------------+--------+
            | (anchor, positive) pairs              | none   |
            +---------------------------------------+--------+

        Recommendations:
            - Use ``BatchSamplers.NO_DUPLICATES`` (:class:`docs <sentence_transformers.training_args.BatchSamplers>`) to
              ensure that no in-batch negatives are duplicates of the anchor or positive samples.

        Relations:
            - :class:`MultipleNegativesRankingLoss` is similar to this loss, but it does not use
              a guide model to guide the in-batch negative sample selection. `GISTEmbedLoss` yields
              a stronger training signal at the cost of some training overhead.

        Example:
            ::

                from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, losses
                from datasets import Dataset

                model = SentenceTransformer("microsoft/mpnet-base")
                guide = SentenceTransformer("all-MiniLM-L6-v2")
                train_dataset = Dataset.from_dict({
                    "anchor": ["It's nice weather outside today.", "He drove to work."],
                    "positive": ["It's so sunny.", "He took the car to the office."],
                })
                loss = losses.GISTEmbedLoss(model, guide)

                trainer = SentenceTransformerTrainer(
                    model=model,
                    train_dataset=train_dataset,
                    loss=loss,
                )
                trainer.train()
        dimr   z_Both the training model and the guiding model must be based on the `transformers` architecture.zIf we must retokenize because the guide model has a different tokenizer, then the Sentence Transformer model must not be based on a StaticEmbedding.N)super__init__r   r   r   r   CosineSimilaritysimilarity_fct
isinstancer   
ValueError	tokenizer	get_vocabmax_seq_lengthmust_retokenizer   )selfr   r   r   	__class__ l/mnt/skqttb/ctump_chatbot/chatbot/lib/python3.10/site-packages/sentence_transformers/losses/GISTEmbedLoss.pyr      s&   
B
zGISTEmbedLoss.__init__embed1r   embed2c                 C  s   |  |d|dS )N   r   )r   	unsqueeze)r   r$   r%   r"   r"   r#   
sim_matrixe   s   zGISTEmbedLoss.sim_matrixsentence_featuresIterable[dict[str, Tensor]]labelsc                   s   fdd|D }t  /  jr, fdd|D } fdd|D } fdd|D } fdd|D }W d    n1 s?w   Y  d }d }t|dkrW|\}}	|\}
}nt|dkrh|\}}	}|\}
}}n	td	t|  ||	} ||} |	|	} |
|} |
|
} ||}| d
d}t j |||k< t j |||k< t j |||k< |||g}|d ur׈ ||} |
|}t j |||k< |	| t j
|dd j }t |d |j}t ||S )Nc                      g | ]	}  |d  qS sentence_embedding)r   .0sentence_featurer   r"   r#   
<listcomp>i   s    z)GISTEmbedLoss.forward.<locals>.<listcomp>c                   s    g | ]} j j|d  ddqS )	input_idsT)skip_special_tokens)r   batch_decoder/   r2   r"   r#   r3   l   s    c                   s   g | ]} j |qS r"   )r   tokenize)r0   	sentencesr2   r"   r#   r3   p   s    c                   s"   g | ]} fd d|  D qS )c                   s    i | ]\}}||  jjqS r"   )tor   device)r0   keyvaluer2   r"   r#   
<dictcomp>r   s     z4GISTEmbedLoss.forward.<locals>.<listcomp>.<dictcomp>)itemsr/   r2   r"   r#   r3   q   s    c                   r,   r-   )r   r/   r2   r"   r#   r3   v   s          z Expected 2 or 3 embeddings, got r   r&   r   r   )torchno_gradr   lenr   r(   diagonalviewinfappendcatr   arangesizelongr9   r:   r   CrossEntropyLoss)r   r)   r+   
embeddingsdecodedguide_embeddingsnegativenegative_guideanchorpositiveanchor_guidepositive_guideap_simaa_simpp_simguided_ap_simguided_aa_simguided_pp_sim
guided_simscoresan_simguided_an_simr"   r2   r#   forwardh   sT   







zGISTEmbedLoss.forwarddict[str, Any]c                 C  s   | j | jdS )Nr   r   rb   r2   r"   r"   r#   get_config_dict   s   zGISTEmbedLoss.get_config_dictstrc                 C  s   dS )Na  
@misc{solatorio2024gistembed,
    title={GISTEmbed: Guided In-sample Selection of Training Negatives for Text Embedding Fine-tuning},
    author={Aivin V. Solatorio},
    year={2024},
    eprint={2402.16829},
    archivePrefix={arXiv},
    primaryClass={cs.LG}
}
r"   r2   r"   r"   r#   citation   s   zGISTEmbedLoss.citation)r   )r   r	   r   r	   r   r   r   r   )r$   r   r%   r   r   r   )r)   r*   r+   r   r   r   )r   ra   )r   rd   )
__name__
__module____qualname__r   r(   r`   rc   propertyre   __classcell__r"   r"   r    r#   r
      s    
W

Er
   )
__future__r   collections.abcr   typingr   rA   r   r   sentence_transformers.modelsr   r   )sentence_transformers.SentenceTransformerr	   Moduler
   r"   r"   r"   r#   <module>   s    