import os
import streamlit as st
from dotenv import load_dotenv
import itertools
from pinecone import Pinecone
from langchain_community.llms import HuggingFaceHub
from langchain.chains import LLMChain
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from sentence_transformers import SentenceTransformer
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import time
# Set up environment; Pinecone is the vector database
cache_dir = None # Directory for cache
Huggingface_token = st.secrets["HUGGINGFACEHUB_API_TOKEN"] # Huggingface API key
pc = Pinecone(api_key=st.secrets["PINECONE_API_KEY"]) # Database API key
index = pc.Index(st.secrets["Index_Name"]) # Database index name
# Initialize the embedding model (it is downloaded to cache_dir if one is set)
embedding_model = "all-mpnet-base-v2" # See https://www.sbert.net/docs/pretrained_models.html
if cache_dir:
    embedding = SentenceTransformer(embedding_model, cache_folder=cache_dir)
else:
    embedding = SentenceTransformer(embedding_model)
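# Note: all-mpnet-base-v2 produces 768-dimensional embeddings, so the Pinecone index
# referenced above is assumed to have been created with dimension 768 (and a metric
# such as cosine) for the upserts and queries below to work.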
# Read the PDF files and split them into chunks for embedding
def read_doc(file_path):
    file_loader = PyPDFDirectoryLoader(file_path)
    documents = file_loader.load()
    return documents

def chunk_data(docs, chunk_size=300, chunk_overlap=50):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    doc = text_splitter.split_documents(docs)
    return doc
# Save embeddings to the database (in batches)
def chunks(iterable, batch_size=100):
    """A helper function to break an iterable into chunks of size batch_size."""
    it = iter(iterable)
    chunk = tuple(itertools.islice(it, batch_size))
    while chunk:
        yield chunk
        chunk = tuple(itertools.islice(it, batch_size))
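# Note: upserting in batches (100 vectors per request here) keeps each Pinecone request
# small; the exact per-request limit depends on vector size, so 100 is a conservative,
# commonly used batch size.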
# Streamlit interface: upload a file
st.title("RAG-Anwendung (RAG Application)")
st.caption("Diese Anwendung kann Ihnen helfen, kostenlos Fragen zu PDF-Dateien zu stellen. (This application can help you ask questions about PDF files for free.)")
uploaded_file = st.file_uploader("Wählen Sie eine PDF-Datei, das Laden kann eine Weile dauern. (Choose a PDF file, loading might take a while.)", type="pdf")
if uploaded_file is not None:
    # Ensure the temp directory exists and is empty
    temp_dir = "tempDir"
    if os.path.exists(temp_dir):
        for file in os.listdir(temp_dir):
            file_path = os.path.join(temp_dir, file)
            if os.path.isfile(file_path):
                os.remove(file_path)
            elif os.path.isdir(file_path):
                os.rmdir(file_path)  # Only removes empty directories
    os.makedirs(temp_dir, exist_ok=True)
    # Save the uploaded file temporarily
    temp_file_path = os.path.join(temp_dir, uploaded_file.name)
    with open(temp_file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    doc = read_doc(temp_dir + "/")
    documents = chunk_data(docs=doc)
    texts = [document.page_content for document in documents]
    pdf_vectors = embedding.encode(texts)
    vector_count = len(documents)
    # Build (id, vector, metadata) tuples; vectors are converted to plain Python lists for Pinecone
    example_data_generator = map(lambda i: (f'id-{i}', pdf_vectors[i].tolist(), {"text": texts[i]}), range(vector_count))
    # Update the Pinecone index with the new vectors
    for ids_vectors_chunk in chunks(example_data_generator, batch_size=100):  # Iterate through batches of vectors
        index.upsert(vectors=ids_vectors_chunk, namespace='ns1')  # Upsert (update or insert) vectors
        time.sleep(0.05)  # Pause to avoid overwhelming the server
    # Remove stale vectors left over from a previously uploaded (larger) PDF
    ns_count = index.describe_index_stats()['namespaces']['ns1']['vector_count']  # Current vector count in namespace 'ns1'
    if vector_count < ns_count:
        ids_to_delete = [f'id-{i}' for i in range(vector_count, ns_count)]  # IDs beyond the new document's range
        index.delete(ids=ids_to_delete, namespace='ns1')  # Delete the leftover vectors
        time.sleep(0.05)  # Pause to avoid overwhelming the server
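    # Note: an alternative (not used here) would be to clear the whole namespace before
    # upserting, e.g. something like index.delete(delete_all=True, namespace='ns1'),
    # rather than deleting the leftover IDs individually.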
# Input for the search query
with st.form(key='my_form'):
    sample_query = st.text_input("Stellen Sie eine Frage zu dem PDF: (Ask a question related to the PDF:)")  # User query input
    submit_button = st.form_submit_button(label='Abschicken (Submit)')  # Submit button
if submit_button:
    if uploaded_file is not None and sample_query:  # Check that a file was uploaded and a query was provided
        query_vector = embedding.encode(sample_query).tolist()  # Encode the query to a vector
        query_search = index.query(vector=query_vector, top_k=5, include_metadata=True, namespace='ns1')  # Search the index
        time.sleep(0.1)  # Pause to avoid overwhelming the server
        matched_contents = [match["metadata"]["text"] for match in query_search["matches"]]  # Extract the text metadata from the matches
        # Rerank the retrieved passages with a cross-encoder
        rerank_model = "BAAI/bge-reranker-v2-m3"
        if cache_dir:
            tokenizer = AutoTokenizer.from_pretrained(rerank_model, cache_dir=cache_dir)
            model = AutoModelForSequenceClassification.from_pretrained(rerank_model, cache_dir=cache_dir)
        else:
            tokenizer = AutoTokenizer.from_pretrained(rerank_model)
            model = AutoModelForSequenceClassification.from_pretrained(rerank_model)
        model.eval()
        pairs = [[sample_query, content] for content in matched_contents]
        with torch.no_grad():
            inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=300)
            scores = model(**inputs, return_dict=True).logits.view(-1).float()  # One relevance score per (query, passage) pair
        matched_contents = [content for _, content in sorted(zip(scores, matched_contents), key=lambda x: x[0], reverse=True)]  # Sort passages by score, best first
        matched_contents = matched_contents[0]  # Keep only the top-ranked passage as context
        del model
        torch.cuda.empty_cache()  # Free GPU memory used by the reranker (no-op on CPU)
        # Display the matched content after reranking
        st.markdown("### Möglicherweise relevante Abschnitte aus dem PDF (Potentially relevant sections from the PDF):")
        st.write(matched_contents)
        # Generate the answer with an instruction-tuned LLM via the Hugging Face Hub
        query_model = "meta-llama/Meta-Llama-3-8B-Instruct"
        llm_huggingface = HuggingFaceHub(repo_id=query_model, huggingfacehub_api_token=Huggingface_token, model_kwargs={"temperature": 0.7, "max_length": 500})
        prompt_template = PromptTemplate(input_variables=['query', 'context'], template="{query}, Beim Beantworten der Frage bitte mit dem Wort 'Antwort:' beginnen, unter Berücksichtigung des folgenden Kontexts: \n\n{context}")
        prompt = prompt_template.format(query=sample_query, context=matched_contents)  # Full prompt text, used below to strip the echoed prompt from the output
        chain = LLMChain(llm=llm_huggingface, prompt=prompt_template)
        result = chain.run(query=sample_query, context=matched_contents)
        # Clean up the answer: the raw output echoes the prompt, so strip it and everything before 'Antwort:'
        result = result.replace(prompt, "")
        special_start = "Antwort:"
        start_index = result.find(special_start)
        if start_index != -1:
            result = result[start_index + len(special_start):].lstrip()
        else:
            result = result.lstrip()
        # Display the final answer with a note about its limitations
        st.markdown("### Antwort (Answer):")
        st.write(result)
        st.markdown("**Hinweis:** Aufgrund begrenzter Rechenleistung kann das große Sprachmodell möglicherweise keine vollständige Antwort liefern. (Note: Due to limited computational power, the large language model might not be able to provide a complete response.)")