o
    S"go&                     @  s   d dl mZ d dlZd dlZd dlmZ d dlZd dlZd dl	m
Z d dl	mZ d dlmZ d dlmZ d dlmZ d d	lmZ G d
d dejZdS )    )annotationsN)Path)	load_file)	save_file)	Tokenizer)nn)PreTrainedTokenizerFast)get_device_namec                      s   e Zd Z		d4d5 fd
dZd6ddZd7ddZd8ddZed9ddZd9ddZ	d:d;d!d"Z
d<d$d%Ze			&		d=d>d/d0Zed?d2d3Z  ZS )@StaticEmbeddingN	tokenizer#Tokenizer | PreTrainedTokenizerFastembedding_weights np.ndarray | torch.Tensor | Noneembedding_dim
int | NonereturnNonec                   s   t    t|tr|j}n	t|tstd|dur0t|tjr&t	
|}tjj|dd| _n|dur>t| || _ntd| jj| _| jj| _|| _| j  |dd| _dS )a  
        Initializes the StaticEmbedding model given a tokenizer. The model is a simple embedding bag model that
        takes the mean of trained per-token embeddings to compute text embeddings.

        Args:
            tokenizer (Tokenizer | PreTrainedTokenizerFast): The tokenizer to be used. Must be a fast tokenizer
                from ``transformers`` or ``tokenizers``.
            embedding_weights (np.ndarray | torch.Tensor | None, optional): Pre-trained embedding weights.
                Defaults to None.
            embedding_dim (int | None, optional): Dimension of the embeddings. Required if embedding_weights
                is not provided. Defaults to None.

        Example::

            from sentence_transformers import SentenceTransformer
            from sentence_transformers.models import StaticEmbedding
            from tokenizers import Tokenizer

            # Pre-distilled embeddings:
            static_embedding = StaticEmbedding.from_model2vec("minishlab/potion-base-8M")
            # or distill your own embeddings:
            static_embedding = StaticEmbedding.from_distillation("BAAI/bge-base-en-v1.5", device="cuda")
            # or start with randomized embeddings:
            tokenizer = Tokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
            static_embedding = StaticEmbedding(tokenizer, embedding_dim=512)

            model = SentenceTransformer(modules=[static_embedding])

            embeddings = model.encode(["What are Pandas?", "The giant panda, also known as the panda bear or simply the panda, is a bear native to south central China."])
            similarity = model.similarity(embeddings[0], embeddings[1])
            # tensor([[0.8093]]) (If you use potion-base-8M)
            # tensor([[0.6234]]) (If you use the distillation method)
            # tensor([[-0.0693]]) (For example, if you use randomized embeddings)

        Raises:
            ValueError: If the tokenizer is not a fast tokenizer.
            ValueError: If neither `embedding_weights` nor `embedding_dim` is provided.
        zThe tokenizer must be fast (i.e. Rust-backed) to use this class. Use Tokenizer.from_pretrained() from `tokenizers` to load a fast tokenizer.NF)freezez?Either `embedding_weights` or `embedding_dim` must be provided.
base_model)super__init__
isinstancer   
_tokenizerr   
ValueErrornpndarraytorch
from_numpyr   EmbeddingBagfrom_pretrained	embeddingget_vocab_sizenum_embeddingsr   r   
no_paddinggetr   )selfr   r   r   kwargs	__class__ n/mnt/skqttb/ctump_chatbot/chatbot/lib/python3.10/site-packages/sentence_transformers/models/StaticEmbedding.pyr      s&   
-





zStaticEmbedding.__init__texts	list[str]dict[str, torch.Tensor]c              	   K  sj   | j j|dd}dd |D }ttdgdd |d d D  }tjdd |D tjd	}||d
S )NF)add_special_tokensc                 S  s   g | ]}|j qS r)   )ids).0encodingr)   r)   r*   
<listcomp>_   s    z,StaticEmbedding.tokenize.<locals>.<listcomp>r   c                 S  s   g | ]}t |qS r)   )len)r0   	token_idsr)   r)   r*   r2   a   s    c                 S  s   g | ]	}|D ]}|qqS r)   r)   )r0   r4   token_idr)   r)   r*   r2   b   s    )dtype)	input_idsoffsets)r   encode_batchr   r   r   cumsumtensorlong)r%   r+   r&   	encodingsencodings_idsr9   r8   r)   r)   r*   tokenize]   s
   (
zStaticEmbedding.tokenizefeaturesc                 K  s   |  |d |d |d< |S )Nr8   r9   sentence_embedding)r    )r%   rA   r&   r)   r)   r*   forwarde   s   zStaticEmbedding.forwarddict[str, float]c                 C  s   i S Nr)   r%   r)   r)   r*   get_config_dicti   s   zStaticEmbedding.get_config_dictintc                 C  s   t jS rE   )mathinfrF   r)   r)   r*   max_seq_lengthl   s   zStaticEmbedding.max_seq_lengthc                 C  s   | j S rE   )r   rF   r)   r)   r*    get_sentence_embedding_dimensionp   s   z0StaticEmbedding.get_sentence_embedding_dimensionTsave_dirstrsafe_serializationboolc                 K  sT   |rt |  tj|d nt|  tj|d | jtt	|d  d S )Nmodel.safetensorspytorch_model.bintokenizer.json)
save_safetensors_file
state_dictospathjoinr   saver   rN   r   )r%   rM   rO   r&   r)   r)   r*   rY   s   s   zStaticEmbedding.saveload_dirc                 K  s   t tt| d }tjtj| dr ttj| d}nt	j
tj| dt	ddd}z|d }W n tyC   |d }Y nw t||d	S )
NrS   rQ   rR   cpuT)map_locationweights_onlyzembedding.weight
embeddings)r   )r   	from_filerN   r   rV   rW   existsrX   load_safetensors_filer   loaddeviceKeyErrorr
   )rZ   r&   r   weightsr)   r)   r*   rb   z   s   zStaticEmbedding.load   
model_name
vocabularylist[str] | Nonerc   
str | Nonepca_dims
apply_zipfuse_subwordc                 C  sz   zddl m} W n ty   tdw t }|||||||d}t|jtjr/t	|j}	n|jj
}	|j}
| |
|	|dS )a  
        Creates a StaticEmbedding instance from a distillation process using the `model2vec` package.

        Args:
            model_name (str): The name of the model to distill.
            vocabulary (list[str] | None, optional): A list of vocabulary words to use. Defaults to None.
            device (str): The device to run the distillation on (e.g., 'cpu', 'cuda'). If not specified,
                the strongest device is automatically detected. Defaults to None.
            pca_dims (int | None, optional): The number of dimensions for PCA reduction. Defaults to 256.
            apply_zipf (bool): Whether to apply Zipf's law during distillation. Defaults to True.
            use_subword (bool): Whether to use subword tokenization. Defaults to True.

        Returns:
            StaticEmbedding: An instance of StaticEmbedding initialized with the distilled model's
                tokenizer and embedding weights.

        Raises:
            ImportError: If the `model2vec` package is not installed.
        r   )distillz\To use this method, please install the `model2vec` package: `pip install model2vec[distill]`)rh   rc   rk   rl   rm   r   r   )model2vec.distillrn   ImportErrorr	   r   r    r   r   r   r   weightr   )clsrg   rh   rc   rk   rl   rm   rn   static_modelr   r   r)   r)   r*   from_distillation   s*   z!StaticEmbedding.from_distillationmodel_id_or_pathc                 C  sj   zddl m} W n ty   tdw ||}t|jtjr't	|j}n|jj
}|j}| |||dS )aH  
        Create a StaticEmbedding instance from a model2vec model. This method loads a pre-trained model2vec model
        and extracts the embedding weights and tokenizer to create a StaticEmbedding instance.

        Args:
            model_id_or_path (str): The identifier or path to the pre-trained model2vec model.

        Returns:
            StaticEmbedding: An instance of StaticEmbedding initialized with the tokenizer and embedding weights
                 the model2vec model.

        Raises:
            ImportError: If the `model2vec` package is not installed.
        r   )StaticModelzSTo use this method, please install the `model2vec` package: `pip install model2vec`ro   )	model2vecrw   rq   r   r   r    r   r   r   r   rr   r   )rs   rv   rw   rt   r   r   r)   r)   r*   from_model2vec   s   
zStaticEmbedding.from_model2vec)NN)r   r   r   r   r   r   r   r   )r+   r,   r   r-   )rA   r-   r   r-   )r   rD   )r   rH   )T)rM   rN   rO   rP   r   r   )rZ   rN   r   r
   )NNrf   TT)rg   rN   rh   ri   rc   rj   rk   r   rl   rP   rm   rP   r   r
   )rv   rN   r   r
   )__name__
__module____qualname__r   r@   rC   rG   propertyrK   rL   rY   rb   classmethodru   ry   __classcell__r)   r)   r'   r*   r
      s*    
J



5r
   )
__future__r   rI   rV   pathlibr   numpyr   r   safetensors.torchr   ra   r   rT   
tokenizersr   r   transformersr   sentence_transformers.utilr	   Moduler
   r)   r)   r)   r*   <module>   s    