Prathmesh48 committed on
Commit
45b151c
·
verified ·
1 Parent(s): 3e578ec

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -0
app.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import torch
3
+ from transformers import AutoTokenizer, AutoModel
4
+
5
+ # Load the tokenizer and model
6
@st.cache_resource
def load_model():
    """Build the tokenizer/model pair for the ``dwzhu/e5-base-4k`` checkpoint.

    The function is wrapped in ``st.cache_resource``, so the returned objects
    are meant to be created once and reused rather than rebuilt per call.

    Returns:
        A ``(tokenizer, model)`` tuple; the model has been moved to the CPU.
    """
    checkpoint = 'dwzhu/e5-base-4k'
    # NOTE(review): trust_remote_code=True allows Python code shipped in the
    # hub repository to run locally -- confirm the repo is trusted, and
    # consider pinning a revision.
    hub_options = {"trust_remote_code": True}

    tok = AutoTokenizer.from_pretrained(checkpoint, **hub_options)
    net = AutoModel.from_pretrained(checkpoint, **hub_options)
    net.to('cpu')
    return tok, net
12
+
13
# Module-level handles used by the UI code below. load_model() is decorated
# with st.cache_resource, so this call is expected to be cheap after the
# first run of the script.
tokenizer, model = load_model()
14
+
15
def extract_embeddings(text, tokenizer, model):
    """Embed ``text`` by mean-pooling the model's last hidden state.

    Padding positions are excluded from the average by weighting each token
    with the tokenizer's attention mask. Without this, encoding several texts
    of different lengths in one batch would mix pad-token states into the
    shorter texts' embeddings. For a single text (no padding) the result is
    the plain mean over all tokens.

    Args:
        text: Input passed straight to ``tokenizer`` (a string, or a list of
            strings to encode as one padded batch).
        tokenizer: Callable that, given ``return_tensors="pt"``, returns a
            mapping of tensors; an ``"attention_mask"`` entry is used when
            present.
        model: Callable that accepts those tensors as keyword arguments and
            returns an object exposing ``last_hidden_state``, pooled here
            over dimension 1 (assumed layout: batch, tokens, hidden).

    Returns:
        A NumPy array of the pooled embeddings with size-1 dimensions
        squeezed out, so a single text yields a 1-D vector.
    """
    # Tokenize; padding/truncation keep batched inputs rectangular.
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    # Inputs are placed on the CPU, matching where load_model() puts the model.
    inputs = {k: v.to('cpu') for k, v in encoded.items()}

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    mask = inputs.get("attention_mask")
    if mask is None:
        # No mask available: fall back to averaging every position.
        embeddings = hidden.mean(dim=1)
    else:
        # Zero out padded positions, then divide by the real token count.
        weights = mask.unsqueeze(-1).to(hidden.dtype)
        # clamp guards against division by zero for an all-masked row.
        token_counts = weights.sum(dim=1).clamp(min=1)
        embeddings = (hidden * weights).sum(dim=1) / token_counts

    return embeddings.squeeze().cpu().numpy()
28
+
29
+ # Streamlit app
30
st.title("Text Embeddings Extractor")

# Free-form input, pre-filled with a sample sentence.
text = st.text_area(
    label="Enter text to extract embeddings:",
    value="This is an example sentence.",
)

# Embeddings are computed only when the button is pressed.
extract_requested = st.button("Extract Embeddings")
if extract_requested:
    embeddings = extract_embeddings(text, tokenizer, model)
    st.write("Embeddings:")
    st.write(embeddings)