o
    S"gE                     @  st   d dl mZ d dlZd dlZd dlZd dlmZ d dlZd dlmZm	Z	 ddl
mZ eeZG dd de	jZdS )	    )annotationsN)Literal)Tensornn   )WhitespaceTokenizerc                      sp   e Zd ZdZi ddfd' fddZd(ddZd)ddZdd Z	d*d+dd Zd!d" Z	d#d$ Z
ed%d& Z  ZS ),BoWzImplements a Bag-of-Words (BoW) model to derive sentence embeddings.

    A weighting can be added to allow the generation of tf-idf vectors. The output vector has the size of the vocab.
    r   Tvocab	list[str]word_weightsdict[str, float]unknown_word_weightfloatcumulative_term_frequencyboolc                   s   t    tt|}g d| _|| _|| _|| _|| _g | _	d}|D ]$}|}||v r0|| }n|
 |v r=||
  }n|d7 }| j	| q#t| dt| d|  t|t dd| _t|| _d S )N)r	   r   r   r   r   r   z out of z0 words without a weighting value. Set weight to F)
stop_wordsdo_lower_case)super__init__listsetconfig_keysr	   r   r   r   weightslowerappendloggerinfolenr   	tokenizersentence_embedding_dimension)selfr	   r   r   r   num_unknown_wordswordweight	__class__ b/mnt/skqttb/ctump_chatbot/chatbot/lib/python3.10/site-packages/sentence_transformers/models/BoW.pyr      s,   


zBoW.__init__featuresdict[str, Tensor]c                 C  s   |S Nr&   )r    r(   r&   r&   r'   forward9   s   zBoW.forwardtextsreturn	list[int]c                   s    fdd|D } |S )Nc                   s    g | ]}j j|fi  qS r&   )r   tokenize).0textkwargsr    r&   r'   
<listcomp>>   s     z BoW.tokenize.<locals>.<listcomp>)get_sentence_features)r    r,   r3   	tokenizedr&   r2   r'   r/   =   s   
zBoW.tokenizec                 C  s   | j S r*   )r   r    r&   r&   r'    get_sentence_embedding_dimensionA   s   z$BoW.get_sentence_embedding_dimensionr   tokenized_textslist[list[int]]pad_seq_lengthint1dict[Literal['sentence_embedding'], torch.Tensor]c                 C  sp   g }|D ],}t j|  t jd}|D ]}| jr#||  | j| 7  < q| j| ||< q|| qdt |iS )N)dtypesentence_embedding)torchzerosr8   float32r   r   r   stack)r    r9   r;   vectorstokensvectortokenr&   r&   r'   r5   D   s   zBoW.get_sentence_featuresc                   s    fdd j D S )Nc                   s   i | ]}| j | qS r&   )__dict__)r0   keyr7   r&   r'   
<dictcomp>U   s    z'BoW.get_config_dict.<locals>.<dictcomp>)r   r7   r&   r7   r'   get_config_dictT   s   zBoW.get_config_dictc                 C  sN   t tj|dd}tj|  |dd W d    d S 1 s w   Y  d S )Nconfig.jsonw   )indent)openospathjoinjsondumprK   )r    output_pathfOutr&   r&   r'   saveW   s   "zBoW.savec                 C  sJ   t tj| d}t|}W d    n1 sw   Y  tdi |S )NrL   r&   )rP   rQ   rR   rS   rT   loadr   )
input_pathfInconfigr&   r&   r'   rY   [   s   zBoW.load)r	   r
   r   r   r   r   r   r   )r(   r)   )r,   r
   r-   r.   )r   )r9   r:   r;   r<   r-   r=   )__name__
__module____qualname____doc__r   r+   r/   r8   r5   rK   rX   staticmethodrY   __classcell__r&   r&   r$   r'   r      s    
#
r   )
__future__r   rT   loggingrQ   typingr   r@   r   r   r   r   	getLoggerr]   r   Moduler   r&   r&   r&   r'   <module>   s    
