# -*- coding: utf-8 -*-
import os
import shutil
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import Chroma
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.text_splitter import SpacyTextSplitter, CharacterTextSplitter, RecursiveCharacterTextSplitter
import transformers
import re
import string
import random
import pandas as pd
import streamlit as st
from langchain import HuggingFaceHub
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains.question_answering import load_qa_chain
from langchain import HuggingFacePipeline
import tempfile
from PyPDF2 import PdfReader
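# App flow: upload a PDF, split its text into overlapping chunks, embed the chunks into a
# Chroma vector store, then serve similarity search (IR tool) and an LLM QA chain (chat tool)
# over that store.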
st.markdown(
"""
# 🐡 Homo Inspectus
(Question Answering Engine)
## Suat ATAN
""")
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
def process_string(input_string):
    """Strip non-alphanumeric characters and append two random digits."""
    alphanumeric_string = re.sub(r'[^a-zA-Z0-9]', '', input_string)
    no_blank_string = alphanumeric_string.replace(' ', '')
    random_numbers = ''.join(random.choices(string.digits, k=2))
    final_string = no_blank_string + random_numbers
    return final_string
def show_long_repeating_sentences(text, min_words=4):
    """Return sentences of at least `min_words` words that occur more than once
    (e.g. boilerplate such as repeated page headers or footers)."""
    sentences = text.split('.')
    unique_sentences = set()
    repeating_sentences = []
    for s in sentences:
        stripped_sentence = s.strip()
        if stripped_sentence in unique_sentences and len(stripped_sentence.split()) >= min_words:
            repeating_sentences.append(stripped_sentence)
        else:
            unique_sentences.add(stripped_sentence)
    return repeating_sentences
def create_vdb(fileobj, chunk_size=200, overlap=100):
    """Read a PDF, split its text into overlapping chunks, and index them in a Chroma vector store."""
    if chunk_size < overlap:
        # Overlap must not exceed the chunk size; fall back to sane defaults.
        chunk_size, overlap = 500, 100
    print(f'Chunk size: {chunk_size}, overlap: {overlap}')
    # loader = PyPDFLoader(fileobj)
    pdf_reader = PdfReader(fileobj)
    all_text = ""
    for page in pdf_reader.pages:
        # extract_text() can return None or an empty string for image-only pages
        all_text += page.extract_text() or ""
    repeating_sentences = show_long_repeating_sentences(all_text)
    print('Repeating sentences')
    print(repeating_sentences)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
    )
    chunks = text_splitter.split_text(text=all_text)
    embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    # Random collection directory so repeated uploads do not collide.
    cname = str(random.randint(1000, 9999))
    db = Chroma.from_texts(chunks, embedding_function, persist_directory=cname)
    print('''Index is created ⛹️''')
    return db
"""
# Information Retrieval (IR) Tool
This tool is your small local and sweet 🍬 Google for the document
"""
db = None
uploaded_file = st.file_uploader('Choose your .pdf file', type="pdf")
if uploaded_file is not None:
    db = create_vdb(uploaded_file, 500, 100)
    st.write("filename:", uploaded_file.name)
    st.write(uploaded_file)
else:
    st.info("Please upload a PDF file.")
"""πŸ‘Œ You did. Now write your keyword, or question or anything to search in the document"""
# q = 'leak' # @param {type:"string"}
q = st.text_input('Write your question or keywords', 'HVAC', key='q1')
st.write(f'Your question is: {q}')
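# Retrieve the 10 chunks whose embeddings are closest to the query, most relevant first.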
if db:
    answers = []
    pgnos = []
    for d in db.similarity_search(q, k=10):
        answers.append(d.page_content)
        # pgnos.append(d.metadata['pdf_page_number'])
    results = pd.DataFrame({'Cite': answers})
    st.markdown('## Results')
    st.dataframe(results)
st.markdown(
"""Results are ordered from most to least relevant, like Google.
# Chat Tool (Humble) 🆓 🚗
This tool lets you ask a question directly and get an answer, like my rival 🔪 ChatGPT
""")
def ask_to_alpaca(question, vector_store, qa_chain):
    """Retrieve the chunks most similar to the question and let the QA chain answer from them."""
    similar_docs = vector_store.similarity_search(question)
    response = qa_chain.run(input_documents=similar_docs, question=question)
    return response
explicit_question = st.text_input('Write your question', 'Roof problems', key='q2')
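# declare-lab/flan-alpaca-large is an instruction-tuned seq2seq model, loaded locally through
# HuggingFacePipeline (downloaded from the Hub on first use).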
if db:
    max_length = 1024
    model_name = "declare-lab/flan-alpaca-large"
    llm = HuggingFacePipeline.from_model_id(
        model_id=model_name,
        task="text2text-generation",
        model_kwargs={"max_length": max_length},
    )
    qa_chain = load_qa_chain(llm, chain_type="stuff")
    # explicit_question = 'Is there any problem of the roofs?' # @param {type:"string"}
    st.markdown("Answer:")
    st.write(ask_to_alpaca(explicit_question, db, qa_chain))
else:
    st.markdown('Please upload a PDF file.')