homoinspectus / app.py
suatatan's picture
Update app.py
9b868d6
raw
history blame
5.22 kB
# -*- coding: utf-8 -*-
"""homo-inspectus.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1Syuv1HSvRJe5Lf_Y9PRPjB3lupei-O5b
# 🐡 Homo Inspectus (Question Answering Engine for Home Inspection Reports)
No need to be a data scientist to run this code 😀 — just click the left-pointing triangles in the circles a couple of times until you see the 🛑 sign.
- The first run may take 5-10 minutes.
- Once the engine is ready, it will answer your questions quickly
"""
import os
import shutil
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import Chroma
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.text_splitter import SpacyTextSplitter, CharacterTextSplitter, RecursiveCharacterTextSplitter
import transformers
import re
import string
import random
import pandas as pd
from google.colab import data_table
# Sentence-transformer embedding model for the vector store.
# NOTE(review): create_vdb() builds its own local SentenceTransformerEmbeddings,
# so this module-level instance looks unused — confirm before removing.
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
def process_string(input_string):
    """Derive a Chroma persist-directory name from a filename.

    Strips every non-alphanumeric character (this already removes spaces,
    so the original's extra ``.replace(' ', '')`` pass was dead code) and
    appends two random digits so repeated runs on the same file get
    distinct persist directories.

    Args:
        input_string: Arbitrary filename, e.g. ``'hir.pdf'``.

    Returns:
        Alphanumeric-only string with a two-digit random suffix.
    """
    alphanumeric_string = re.sub(r'[^a-zA-Z0-9]', '', input_string)
    # Two random digits make the directory name unique-ish across runs.
    random_suffix = ''.join(random.choices(string.digits, k=2))
    return alphanumeric_string + random_suffix
def show_long_repeating_sentences(text, min_words=4):
    """Find boilerplate sentences (headers/footers) repeated in *text*.

    Sentences are naive '.'-splits, whitespace-stripped. A sentence is
    reported when it has at least *min_words* words and occurs more than
    once. Each repeated sentence is reported exactly once, in order of
    first repeat (the original appended one extra copy per additional
    occurrence, producing duplicates in the result).

    Args:
        text: Full document text.
        min_words: Minimum word count for a sentence to be reported.

    Returns:
        List of repeated sentences, deduplicated, in first-repeat order.
    """
    seen = set()
    repeats = []
    for raw in text.split('.'):
        sentence = raw.strip()
        if sentence in seen:
            # Only report sufficiently long sentences, and each only once.
            if len(sentence.split()) >= min_words and sentence not in repeats:
                repeats.append(sentence)
        else:
            seen.add(sentence)
    return repeats
def create_vdb(filename, chunk_size=200, overlap=100):
    """Build a Chroma vector store from a PDF home-inspection report.

    Loads the PDF, strips boilerplate sentences that repeat across pages
    (headers/footers), splits each page into overlapping chunks, and
    indexes them with sentence-transformer embeddings.

    Args:
        filename: Path to the PDF file to index.
        chunk_size: Target chunk size in characters.
        overlap: Character overlap between consecutive chunks.

    Returns:
        A ``Chroma`` vector store persisted under a name derived from
        ``filename``.
    """
    # Guard: an overlap larger than the chunk size is invalid for the
    # splitter, so fall back to sane defaults.
    if chunk_size < overlap:
        chunk_size, overlap = 500, 100
    print(f'Chunk size: {chunk_size}, overlap: {overlap}')

    # BUG FIX: the path was hard-coded instead of using the `filename`
    # parameter, so the argument was silently ignored.
    path = os.path.abspath(filename)
    loader = PyPDFLoader(path)
    pages = loader.load_and_split()

    # Detect boilerplate sentences that repeat across the whole report.
    all_text = ' '.join(page.page_content for page in pages)
    repeating_sentences = show_long_repeating_sentences(all_text)
    print('Repeating sentences')
    print(repeating_sentences)

    # Single splitter (the original constructed a second splitter that
    # immediately shadowed the first — dead code removed).
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)

    corpus = []
    for index, page in enumerate(pages):
        metadatas = [{"pdf_page_number": index}]
        content = page.page_content
        # Replace repeating boilerplate with a placeholder token so it
        # does not dominate similarity-search results.
        for sentence in repeating_sentences:
            content = content.replace(sentence, '#RRC')
        texts = text_splitter.create_documents([content], metadatas=metadatas)
        print(f'==================Page {page.metadata["page"]} len(texts) = {len(texts)} =================')
        # Preserve the original's prepend order (chunks end up reversed).
        for text in texts:
            corpus.insert(0, text)

    embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    cname = process_string(filename)
    db = Chroma.from_documents(corpus, embedding_function, persist_directory=cname)
    print(''' Index is created ⛹️''')
    return db
"""πŸ›‘ Stop. Now do in one more time for next code block. Then it will show you the upload button. Click it and upload the **home inspection** file Then you will be able to ask any question without running previous codes
# Information Retrieval (IR) Tool
This tool is your small local and sweet 🍬 Google for the document
"""
# Build the vector index from the home-inspection report PDF.
db = create_vdb('hir.pdf', 500, 100)
"""👌 You did. Now write your keyword, or question or anything to search in the document"""
# Colab form field: the search query entered by the user.
q = 'leak' # @param {type:"string"}
print(f'Your question is: {q}')
# Collect the 10 most similar chunks and their page numbers.
answers = []
pgnos = []
for d in db.similarity_search(q, k=10):
    answers.append(d.page_content)
    pgnos.append(d.metadata['pdf_page_number'])
# Display results as an interactive Colab table, most relevant first.
results = pd.DataFrame({'Page': pgnos,'Cite':answers,})
data_table.DataTable(results, include_index=False, num_rows_per_page=5)
"""The order of result is based the most relevant ones to less. Like Google
# Chat Tool (Humble ) πŸ†“ πŸš—
This tool is allow your direct question to answer like my rival πŸ”ͺ ChatGPT
"""
from langchain import HuggingFaceHub
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains.question_answering import load_qa_chain
from langchain import HuggingFacePipeline
# Maximum generation length for the local text2text model.
max_length=1024
model_name = "declare-lab/flan-alpaca-large"
# Load the instruction-tuned model as a local HuggingFace pipeline
# (no API key needed, unlike HuggingFaceHub).
llm = HuggingFacePipeline.from_model_id(
    model_id=model_name,
    task="text2text-generation",
    model_kwargs={"max_length": max_length},
)
# "stuff" chain: all retrieved documents are stuffed into a single prompt.
qa_chain = load_qa_chain(llm, chain_type="stuff")
def ask_to_alpaca(question, vector_store, qa_chain):
    """Answer *question* from documents retrieved out of *vector_store*.

    Retrieves the most similar chunks for the question and runs them,
    together with the question, through the QA chain.

    Args:
        question: Natural-language question about the document.
        vector_store: Object exposing ``similarity_search(question)``.
        qa_chain: Chain exposing ``run(input_documents=..., question=...)``.

    Returns:
        The chain's generated answer.
    """
    docs = vector_store.similarity_search(question)
    return qa_chain.run(input_documents=docs, question=question)
# Colab form field: example question answered directly from the indexed report.
explicit_question = 'Is there any problem of the roofs?' # @param {type:"string"}
ask_to_alpaca(explicit_question, db, qa_chain)