from llama_index.core import VectorStoreIndex, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.llms.huggingface import HuggingFaceInferenceAPI
from qdrant_client import QdrantClient
from typing import Any, List, Tuple

import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
import streamlit as st
import os

HUGGINGFACEHUB_API_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
Q_END_POINT = os.environ.get("Q_END_POINT")
Q_API_KEY = os.environ.get("Q_API_KEY")

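# SPLADE sparse encoders: a document-side model for indexing and a query-side
# model for search. Their term weights drive the sparse half of the Qdrant
# hybrid (dense + sparse) retrieval configured below.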
doc_tokenizer = AutoTokenizer.from_pretrained(
    "naver/efficient-splade-VI-BT-large-doc"
)
doc_model = AutoModelForMaskedLM.from_pretrained(
    "naver/efficient-splade-VI-BT-large-doc"
)

query_tokenizer = AutoTokenizer.from_pretrained(
    "naver/efficient-splade-VI-BT-large-query"
)
query_model = AutoModelForMaskedLM.from_pretrained(
    "naver/efficient-splade-VI-BT-large-query"
)

device = "cuda:0" if torch.cuda.is_available() else "cpu"

doc_model = doc_model.to(device)
query_model = query_model.to(device)

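# Both helpers below implement SPLADE pooling: log(1 + ReLU(logits)), masked by
# the attention mask and max-pooled over the sequence, returned per input text
# as (vocabulary indices, weights) pairs — the format QdrantVectorStore expects
# from sparse_doc_fn / sparse_query_fn.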
def sparse_doc_vectors(
    texts: List[str],
) -> Tuple[List[List[int]], List[List[float]]]:
    """
    Computes sparse document vectors (token indices and weights) from the
    SPLADE logits and attention mask using ReLU, log, and max operations.
    """
    tokens = doc_tokenizer(
        texts, truncation=True, padding=True, return_tensors="pt"
    )
    # Keep the inputs on the same device as doc_model ("cuda:0" or "cpu").
    tokens = tokens.to(device)

    output = doc_model(**tokens)
    logits, attention_mask = output.logits, tokens.attention_mask
    relu_log = torch.log(1 + torch.relu(logits))
    weighted_log = relu_log * attention_mask.unsqueeze(-1)
    tvecs, _ = torch.max(weighted_log, dim=1)

    # Keep only the non-zero vocabulary entries for each text.
    indices = []
    vecs = []
    for batch in tvecs:
        indices.append(batch.nonzero(as_tuple=True)[0].tolist())
        vecs.append(batch[indices[-1]].tolist())

    return indices, vecs

def sparse_query_vectors(
    texts: List[str],
) -> Tuple[List[List[int]], List[List[float]]]:
    """
    Computes sparse query vectors (token indices and weights) from the
    SPLADE logits and attention mask using ReLU, log, and max operations.
    """
    tokens = query_tokenizer(
        texts, truncation=True, padding=True, return_tensors="pt"
    )
    # Keep the inputs on the same device as query_model ("cuda:0" or "cpu").
    tokens = tokens.to(device)

    output = query_model(**tokens)
    logits, attention_mask = output.logits, tokens.attention_mask
    relu_log = torch.log(1 + torch.relu(logits))
    weighted_log = relu_log * attention_mask.unsqueeze(-1)
    tvecs, _ = torch.max(weighted_log, dim=1)

    # Keep only the non-zero vocabulary entries for each query.
    indices = []
    vecs = []
    for batch in tvecs:
        indices.append(batch.nonzero(as_tuple=True)[0].tolist())
        vecs.append(batch[indices[-1]].tolist())

    return indices, vecs

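# Streamlit UI: page header plus an initial assistant greeting kept in session state.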
st.header("Chat with the Bhagavad Gita docs π¬ π") |
|
|
|
if "messages" not in st.session_state.keys(): |
|
st.session_state.messages = [ |
|
{"role": "assistant", "content": "Ask me a question about Gita!"} |
|
] |
|
|
|
|
|
|
|
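# Connect to the remote Qdrant instance and open the existing "bhagavad_gita"
# collection with hybrid (dense + sparse) search enabled.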
client = QdrantClient(
    Q_END_POINT,
    api_key=Q_API_KEY,
)

vector_store = QdrantVectorStore(
    "bhagavad_gita",
    client=client,
    enable_hybrid=True,
    batch_size=20,
    force_disable_check_same_thread=True,
    sparse_doc_fn=sparse_doc_vectors,
    sparse_query_fn=sparse_query_vectors,
)

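# LLM and embedding settings: Meta-Llama-3-8B-Instruct served via the Hugging
# Face Inference API for generation, and BAAI/bge-base-en-v1.5 (on CPU) for
# dense query embeddings.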
llm = HuggingFaceInferenceAPI(
    model_name="meta-llama/Meta-Llama-3-8B-Instruct",
    token=HUGGINGFACEHUB_API_TOKEN,
    context_window=8192,
)
Settings.llm = llm
Settings.tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct"
)

embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5", device="cpu")
Settings.embed_model = embed_model

index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store, embed_model=embed_model
)

from llama_index.core.memory import ChatMemoryBuffer

memory = ChatMemoryBuffer.from_defaults(token_limit=1500)

chat_engine = index.as_chat_engine(
    chat_mode="condense_question",
    verbose=True,
    memory=memory,
    sparse_top_k=10,
    vector_store_query_mode="hybrid",
    similarity_top_k=3,
)

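# Streamlit chat loop: append the user's question, replay the history, then
# generate and store the assistant's answer.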
if prompt := st.chat_input("Your question"):
    st.session_state.messages.append({"role": "user", "content": prompt})

for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.write(message["content"])

if st.session_state.messages[-1]["role"] != "assistant":
    with st.chat_message("assistant"):
        with st.spinner("Thinking..."):
            response = chat_engine.chat(prompt)
            st.write(response.response)
            message = {"role": "assistant", "content": response.response}
            st.session_state.messages.append(message)