manjunathshiva committed on
Commit
e072b8c
1 Parent(s): 6f2bfa6

Create app.py

Files changed (1)
  1. app.py +167 -0
app.py ADDED
@@ -0,0 +1,167 @@
from llama_index.core import VectorStoreIndex, Settings
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceInferenceAPI
from llama_index.vector_stores.qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from transformers import AutoTokenizer, AutoModelForMaskedLM
from typing import List, Tuple
import os
import torch
import streamlit as st

HUGGINGFACEHUB_API_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
Q_END_POINT = os.environ.get("Q_END_POINT")
Q_API_KEY = os.environ.get("Q_API_KEY")


# Reference:
# https://docs.llamaindex.ai/en/stable/examples/vector_stores/qdrant_hybrid.html

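# HUGGINGFACEHUB_API_TOKEN, Q_END_POINT, and Q_API_KEY are read from the environment:
# the Hugging Face Inference API token plus the Qdrant endpoint URL and API key.
# They are presumably configured as secrets in the hosting environment (for example a
# Hugging Face Space); nothing below works without them.
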
doc_tokenizer = AutoTokenizer.from_pretrained(
    "naver/efficient-splade-VI-BT-large-doc"
)
doc_model = AutoModelForMaskedLM.from_pretrained(
    "naver/efficient-splade-VI-BT-large-doc"
)

query_tokenizer = AutoTokenizer.from_pretrained(
    "naver/efficient-splade-VI-BT-large-query"
)
query_model = AutoModelForMaskedLM.from_pretrained(
    "naver/efficient-splade-VI-BT-large-query"
)

device = "cuda:0" if torch.cuda.is_available() else "cpu"

doc_model = doc_model.to(device)
query_model = query_model.to(device)

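# The two SPLADE checkpoints loaded above expand text into sparse term-weight vectors
# over the BERT vocabulary: the "doc" model encodes stored passages, the "query" model
# encodes user questions. The two helpers below reduce each model's masked-LM logits
# (ReLU, log(1 + x) scaling, max-pool over tokens) to the index/value lists that
# QdrantVectorStore expects for the keyword half of hybrid search.
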
def sparse_doc_vectors(
    texts: List[str],
) -> Tuple[List[List[int]], List[List[float]]]:
    """
    Computes vectors from logits and attention mask using ReLU, log, and max operations.
    """
    tokens = doc_tokenizer(
        texts, truncation=True, padding=True, return_tensors="pt"
    )
    # keep the tokens on the same device as the SPLADE doc model
    tokens = tokens.to(device)

    output = doc_model(**tokens)
    logits, attention_mask = output.logits, tokens.attention_mask
    relu_log = torch.log(1 + torch.relu(logits))
    weighted_log = relu_log * attention_mask.unsqueeze(-1)
    tvecs, _ = torch.max(weighted_log, dim=1)

    # extract the vectors that are non-zero and their indices
    indices = []
    vecs = []
    for batch in tvecs:
        indices.append(batch.nonzero(as_tuple=True)[0].tolist())
        vecs.append(batch[indices[-1]].tolist())

    return indices, vecs

def sparse_query_vectors(
    texts: List[str],
) -> Tuple[List[List[int]], List[List[float]]]:
    """
    Computes vectors from logits and attention mask using ReLU, log, and max operations.
    """
    # TODO: compute sparse vectors in batches if max length is exceeded
    tokens = query_tokenizer(
        texts, truncation=True, padding=True, return_tensors="pt"
    )
    # keep the tokens on the same device as the SPLADE query model
    tokens = tokens.to(device)

    output = query_model(**tokens)
    logits, attention_mask = output.logits, tokens.attention_mask
    relu_log = torch.log(1 + torch.relu(logits))
    weighted_log = relu_log * attention_mask.unsqueeze(-1)
    tvecs, _ = torch.max(weighted_log, dim=1)

    # extract the vectors that are non-zero and their indices
    indices = []
    vecs = []
    for batch in tvecs:
        indices.append(batch.nonzero(as_tuple=True)[0].tolist())
        vecs.append(batch[indices[-1]].tolist())

    return indices, vecs

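# Illustrative example (the ids and weights shown are made up): calling
# sparse_doc_vectors(["In the beginning God created the heaven and the earth"])
# returns one list of vocabulary indices and one list of matching weights per input
# text, e.g. ([[1999, 2043, 2332, ...]], [[1.31, 0.84, 0.52, ...]]).
# sparse_query_vectors is the query-side mirror of the same computation.
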
st.header("Chat with the Bible docs 💬 📚")

if "messages" not in st.session_state.keys():  # Initialize the chat message history
    st.session_state.messages = [
        {"role": "assistant", "content": "Ask me a question about the Bible!"}
    ]

# connect to the remote Qdrant instance that holds the persisted index
client = QdrantClient(
    Q_END_POINT,
    api_key=Q_API_KEY,
)
# create our vector store with hybrid indexing enabled
# batch_size controls how many nodes are encoded with sparse vectors at once
vector_store = QdrantVectorStore(
    "bible",
    client=client,
    enable_hybrid=True,
    batch_size=20,
    force_disable_check_same_thread=True,
    sparse_doc_fn=sparse_doc_vectors,
    sparse_query_fn=sparse_query_vectors,
)

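# Hybrid mode keeps two representations per chunk in the "bible" collection: a dense
# BGE embedding and a SPLADE sparse vector. The collection itself is assumed to have
# been created and populated by a separate ingestion step, since this app only reads
# from it via VectorStoreIndex.from_vector_store below.
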
llm = HuggingFaceInferenceAPI(
    model_name="mistralai/Mistral-7B-Instruct-v0.2",
    token=HUGGINGFACEHUB_API_TOKEN,
    context_window=8096,
)
Settings.llm = llm
Settings.tokenizer = AutoTokenizer.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2"
)

embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5", device="cpu")
Settings.embed_model = embed_model

index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store, embed_model=embed_model
)

memory = ChatMemoryBuffer.from_defaults(token_limit=1500)

chat_engine = index.as_chat_engine(
    chat_mode="condense_question",
    verbose=True,
    memory=memory,
    sparse_top_k=10,
    vector_store_query_mode="hybrid",
    similarity_top_k=3,
)

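# In "condense_question" mode the engine first rewrites the incoming message, together
# with the ChatMemoryBuffer history, into a standalone question and runs that against
# the index. With vector_store_query_mode="hybrid", Qdrant is queried on both the
# sparse (sparse_top_k=10) and dense sides, and the fused results are trimmed
# (similarity_top_k=3) before being handed to Mistral.
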
if prompt := st.chat_input("Your question"):  # Prompt for user input and save to chat history
    st.session_state.messages.append({"role": "user", "content": prompt})

for message in st.session_state.messages:  # Display the prior chat messages
    with st.chat_message(message["role"]):
        st.write(message["content"])

# If last message is not from assistant, generate a new response
if st.session_state.messages[-1]["role"] != "assistant":
    with st.chat_message("assistant"):
        with st.spinner("Thinking..."):
            response = chat_engine.chat(prompt)
            st.write(response.response)
            message = {"role": "assistant", "content": response.response}
            st.session_state.messages.append(message)  # Add response to message history
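
With HUGGINGFACEHUB_API_TOKEN, Q_END_POINT, and Q_API_KEY set, and a populated "bible" collection in Qdrant, the app can be started locally with streamlit run app.py; on a Hugging Face Space the same values would typically be provided as repository secrets.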