# -*- coding: utf-8 -*-
"""homo-inspectus.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1Syuv1HSvRJe5Lf_Y9PRPjB3lupei-O5b

# Homo Inspectus (Question Answering Engine for Home Inspection Reports)

No need to be a data scientist to run this code. Just click the play buttons (the triangles in the circles) a couple of times until every cell has finished.

- The first run may take 5-10 minutes.
- Once the engine is ready, it will answer your questions quickly.
"""
import os
import re
import string
import random

import pandas as pd
from google.colab import data_table
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import Chroma
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
def process_string(input_string):
    """Turn a filename into a Chroma-safe directory name: keep alphanumerics, append two random digits."""
    alphanumeric_string = re.sub(r'[^a-zA-Z0-9]', '', input_string)  # this also removes all blanks
    random_suffix = ''.join(random.choices(string.digits, k=2))
    return alphanumeric_string + random_suffix
def show_long_repeating_sentences(text, min_words=4):
    """Find sentences of at least `min_words` words that occur more than once (typically boilerplate)."""
    sentences = text.split('.')
    unique_sentences = set()
    repeating_sentences = []
    for s in sentences:
        stripped_sentence = s.strip()
        if stripped_sentence in unique_sentences and len(stripped_sentence.split()) >= min_words:
            if stripped_sentence not in repeating_sentences:  # report each repeated sentence only once
                repeating_sentences.append(stripped_sentence)
        else:
            unique_sentences.add(stripped_sentence)
    return repeating_sentences
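"""A quick sanity check for `show_long_repeating_sentences`, using a made-up snippet (the repeated disclaimer below is hypothetical, not from a real report):"""

sample_text = (
    'Roof shingles show moderate wear. '
    'This report is for the exclusive use of the client named herein. '
    'Gutters are clogged with debris. '
    'This report is for the exclusive use of the client named herein.'
)
print(show_long_repeating_sentences(sample_text))
# Expected: ['This report is for the exclusive use of the client named herein']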
def create_vdb(filename, chunk_size=200, overlap=100):
    """Load a PDF, strip repeated boilerplate sentences, chunk the text, and index it in Chroma."""
    if chunk_size < overlap:
        # An overlap larger than the chunk size makes no sense; fall back to sane defaults
        chunk_size, overlap = 500, 100
    print(f'Chunk size: {chunk_size}, overlap: {overlap}')
    path = os.path.abspath(filename)
    loader = PyPDFLoader(path)
    pages = loader.load_and_split()

    # Collect boilerplate sentences that repeat across the whole document
    all_text = ' '.join(page.page_content for page in pages)
    repeating_sentences = show_long_repeating_sentences(all_text)
    print('Repeating sentences:')
    print(repeating_sentences)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
    )
    corpus = []
    for index, page in enumerate(pages):
        metadatas = [{"pdf_page_number": index}]
        content = page.page_content
        # Repeating-sentence removal: replace boilerplate with a placeholder token
        for sentence in repeating_sentences:
            content = content.replace(sentence, '#RRC')
        texts = text_splitter.create_documents([content], metadatas=metadatas)
        print(f'================== Page {page.metadata["page"]}: len(texts) = {len(texts)} ==================')
        for text in texts:
            corpus.append(text)  # append keeps chunks in document order (insert(0, ...) reversed them)
    # --------------------------------
    embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    cname = process_string(filename)
    db = Chroma.from_documents(corpus, embedding_function, persist_directory=cname)
    print('Index is created')
    return db
"""π Stop. Now do in one more time for next code block. Then it will show you the upload button. Click it and upload the **home inspection** file Then you will be able to ask any question without running previous codes | |
# Information Retrieval (IR) Tool | |
This tool is your small local and sweet π¬ Google for the document | |
""" | |
db = create_vdb('hir.pdf', 500, 100)
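"""If the index was already built in an earlier session, it can be reloaded from its persist directory instead of being rebuilt. A minimal, commented-out sketch; 'hir42' is a hypothetical directory name (`process_string` appends two random digits, so check the folder that was actually created on disk):"""

# emb = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
# db = Chroma(persist_directory='hir42', embedding_function=emb)  # 'hir42' is hypothetical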
"""π You did. Now write your keyword, or question or anything to search in the document""" | |
q = 'leak' # @param {type:"string"}
print(f'Your question is: {q}')

answers = []
pgnos = []
for d in db.similarity_search(q, k=10):
    answers.append(d.page_content)
    pgnos.append(d.metadata['pdf_page_number'])

results = pd.DataFrame({'Page': pgnos, 'Cite': answers})
data_table.DataTable(results, include_index=False, num_rows_per_page=5)
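"""The ranking above is driven by vector distance. To see the numbers behind the ordering, Chroma also exposes `similarity_search_with_score`; a minimal sketch (lower distance means more relevant):"""

for doc, score in db.similarity_search_with_score(q, k=3):
    print(f"Page {doc.metadata['pdf_page_number']}, distance {score:.3f}: {doc.page_content[:60]}...")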
"""The order of result is based the most relevant ones to less. Like Google | |
# Chat Tool (Humble ) π π | |
This tool is allow your direct question to answer like my rival πͺ ChatGPT | |
""" | |
from langchain import HuggingFacePipeline
from langchain.chains.question_answering import load_qa_chain
max_length = 1024
model_name = "declare-lab/flan-alpaca-large"

llm = HuggingFacePipeline.from_model_id(
    model_id=model_name,
    task="text2text-generation",
    model_kwargs={"max_length": max_length},
)

# "stuff" chain type: all retrieved chunks are stuffed into a single prompt
qa_chain = load_qa_chain(llm, chain_type="stuff")
def ask_to_alpaca(question, vector_store, qa_chain):
    """Retrieve the chunks most similar to the question and let the LLM answer from them."""
    similar_docs = vector_store.similarity_search(question)
    response = qa_chain.run(input_documents=similar_docs, question=question)
    return response
explicit_question = 'Is there any problem with the roof?' # @param {type:"string"}
ask_to_alpaca(explicit_question, db, qa_chain)
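"""To show the answer alongside the pages it was likely drawn from, the retrieval and answering steps can be combined; a minimal sketch reusing the objects defined above:"""

answer = ask_to_alpaca(explicit_question, db, qa_chain)
source_pages = sorted({d.metadata['pdf_page_number'] for d in db.similarity_search(explicit_question)})
print(f'Answer: {answer}')
print(f'Supporting pages: {source_pages}')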