# Streamlit question-answering app over uploaded PDF documents.
# -*- coding: utf-8 -*-
# Standard library
import os
import random
import re
import shutil
import string
import tempfile

# Third-party
import pandas as pd
import streamlit as st
import transformers
from langchain import HuggingFaceHub, HuggingFacePipeline
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    SpacyTextSplitter,
)
from langchain.vectorstores import Chroma
from PyPDF2 import PdfReader
# Page header ("π΅" is a mojibake emoji inherited from the original source).
st.markdown(
    """
# π΅ Homo Inspectus
(Question Answering Engine)
## Suat ATAN
""")

# Sentence-transformer embedding model; the same model name is used again
# inside create_vdb() when the index is built.
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
def process_string(input_string):
    """Sanitize text into an alphanumeric token with a 2-digit random suffix.

    Strips every character that is not A-Z, a-z, or 0-9 (spaces included)
    and appends two random digits, producing a quasi-unique identifier.

    Args:
        input_string: Arbitrary text to sanitize.

    Returns:
        The alphanumeric characters of ``input_string`` followed by two
        random digit characters.
    """
    # The regex already removes spaces along with all other
    # non-alphanumerics, so no separate replace(' ', '') pass is needed.
    alphanumeric_string = re.sub(r'[^a-zA-Z0-9]', '', input_string)
    random_numbers = ''.join(random.choices(string.digits, k=2))
    return alphanumeric_string + random_numbers
def show_long_repeating_sentences(text, min_words=4):
    """Find sentences of at least ``min_words`` words that occur more than once.

    Intended as a diagnostic for PDF boilerplate (repeated headers/footers)
    that would pollute retrieval. Sentences are naively split on '.'.

    Args:
        text: The full document text.
        min_words: Minimum word count for a repeat to be reported; shorter
            repeats (e.g. abbreviations) are ignored.

    Returns:
        A list with one entry per *extra* occurrence of a qualifying
        sentence (a sentence seen three times is reported twice).
    """
    sentences = text.split('.')
    unique_sentences = set()
    repeating_sentences = []
    for s in sentences:
        stripped_sentence = s.strip()
        # Report only if we've seen it before AND it is long enough.
        if stripped_sentence in unique_sentences and len(stripped_sentence.split()) >= min_words:
            repeating_sentences.append(stripped_sentence)
        else:
            unique_sentences.add(stripped_sentence)
    return repeating_sentences
def create_vdb(fileobj, chunk_size=200, overlap=100):
    """Build a Chroma vector index from an uploaded PDF file object.

    Args:
        fileobj: Binary file-like object containing the PDF (e.g. the
            Streamlit UploadedFile).
        chunk_size: Target character length of each text chunk.
        overlap: Character overlap between consecutive chunks.

    Returns:
        A Chroma vector store persisted to a randomly named directory.
    """
    # Guard against an inconsistent configuration: overlap must not exceed
    # the chunk size, otherwise fall back to sane defaults.
    if chunk_size < overlap:
        chunk_size, overlap = 500, 100
    print(f'Chunk size: {chunk_size}, overlap: {overlap}')

    # Concatenate the text of every page. extract_text() may return None
    # for pages with no text layer, hence the `or ""` guard.
    pdf_reader = PdfReader(fileobj)
    all_text = ""
    for page in pdf_reader.pages:
        all_text += page.extract_text() or ""

    # Diagnostic only: surface boilerplate that repeats across pages.
    repeating_sentences = show_long_repeating_sentences(all_text)
    print('Repeating sentences')
    print(repeating_sentences)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
    )
    chunks = text_splitter.split_text(text=all_text)

    embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    # Random directory name so repeated uploads in one session do not
    # collide with a previously persisted collection.
    cname = str(random.randint(1000, 9999))
    db = Chroma.from_texts(chunks, embedding_function, persist_directory=cname)
    print(''' Index is created βΉοΈ''')
    return db
""" | |
# Information Retrieval (IR) Tool | |
This tool is your small local and sweet π¬ Google for the document | |
""" | |
db = None | |
uploaded_file = st.file_uploader('Choose your .pdf file', type="pdf") | |
if uploaded_file is not None: | |
db = create_vdb(uploaded_file, 500, 100) | |
st.write("filename:", uploaded_file.name) | |
st.write(uploaded_file) | |
else: | |
st.info("Please upload a PDF file.") | |
"""π You did. Now write your keyword, or question or anything to search in the document""" | |
# q = 'leak' # @param {type:"string"} | |
q = st.text_input('Write your question or keywords', 'HVAC', key='q1') | |
st.write(f'Your question is: {q}') | |
if db: | |
answers = [] | |
pgnos = [] | |
for d in db.similarity_search(q, k=10): | |
answers.append(d.page_content) | |
#pgnos.append(d.metadata['pdf_page_number']) | |
results = pd.DataFrame({'Cite':answers}) | |
st.markdown('## Results') | |
st.dataframe(results) | |
st.markdown( | |
"""The order of result is based the most relevant ones to less. Like Google | |
# Chat Tool (Humble ) π π | |
This tool is allow your direct question to answer like my rival πͺ ChatGPT | |
""") | |
def ask_to_alpaca(question, vector_store, qa_chain):
    """Answer a question with retrieval-augmented generation.

    Args:
        question: The user's natural-language question.
        vector_store: Object exposing ``similarity_search(question)``
            (e.g. a Chroma store) that returns relevant documents.
        qa_chain: A loaded langchain QA chain exposing ``run``.

    Returns:
        The chain's generated answer.
    """
    similar_docs = vector_store.similarity_search(question)
    return qa_chain.run(input_documents=similar_docs, question=question)
explicit_question = st.text_input('Write your question', 'Roof problems', key='q2')

if db:
    # Build the local text2text pipeline only once a document has been
    # indexed; the model is downloaded on first use.
    max_length = 1024
    model_name = "declare-lab/flan-alpaca-large"
    llm = HuggingFacePipeline.from_model_id(
        model_id=model_name,
        task="text2text-generation",
        model_kwargs={"max_length": max_length},
    )
    qa_chain = load_qa_chain(llm, chain_type="stuff")
    # explicit_question = 'Is there any problem of the roofs?' # @param {type:"string"}
    st.markdown("Answer:")
    st.write(ask_to_alpaca(explicit_question, db, qa_chain))
else:
    st.markdown('Please upload a PDF file.')