Spaces:
Runtime error
Runtime error
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""homo-inspectus.ipynb
|
3 |
+
|
4 |
+
Automatically generated by Colaboratory.
|
5 |
+
|
6 |
+
Original file is located at
|
7 |
+
https://colab.research.google.com/drive/1Syuv1HSvRJe5Lf_Y9PRPjB3lupei-O5b
|
8 |
+
|
9 |
+
# 🕵 Homo Inspectus (Question Answering Engine for Home Inspection Reports)
|
10 |
+
|
11 |
+
You do not need to be a data scientist to run my code. Just click the triangles in the circles on the left, one after another, until you see the 🛑 sign.
|
12 |
+
|
13 |
+
- The first run may take 5-10 minutes.
|
14 |
+
- Once the engine is ready, it will answer your questions quickly.
|
15 |
+
"""
|
16 |
+
|
17 |
+
|
18 |
+
|
19 |
+
import os
|
20 |
+
import shutil
|
21 |
+
from langchain.document_loaders import PyPDFLoader
|
22 |
+
from langchain.vectorstores import Chroma
|
23 |
+
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
|
24 |
+
from langchain.text_splitter import SpacyTextSplitter, CharacterTextSplitter, RecursiveCharacterTextSplitter
|
25 |
+
import transformers
|
26 |
+
import re
|
27 |
+
import string
|
28 |
+
import random
|
29 |
+
import pandas as pd
|
30 |
+
from google.colab import data_table
|
31 |
+
|
32 |
+
|
33 |
+
|
34 |
+
# Module-level embedding function backed by the "all-MiniLM-L6-v2" model.
# NOTE(review): create_vdb builds its own instance, so this one appears unused
# in the visible code - confirm before relying on it.
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
|
35 |
+
|
36 |
+
def process_string(input_string):
    """Turn an arbitrary string into an ASCII alphanumeric name with a random suffix.

    Every character other than ``a-z``, ``A-Z`` and ``0-9`` (spaces included)
    is removed, then two random decimal digits are appended.

    Args:
        input_string: Text to sanitise, e.g. an uploaded file name.

    Returns:
        The stripped string followed by two random digits.
    """
    # The character class already drops blanks, so no separate space removal
    # is needed.
    alphanumeric = re.sub(r'[^a-zA-Z0-9]', '', input_string)
    suffix = ''.join(random.choices(string.digits, k=2))
    return alphanumeric + suffix
|
42 |
+
|
43 |
+
def show_long_repeating_sentences(text, min_words=4):
    """Return the long sentences that occur more than once in *text*.

    The text is split on ``'.'`` and each piece is stripped of surrounding
    whitespace. A piece is reported when it has been seen before and contains
    at least ``min_words`` whitespace-separated words.

    Args:
        text: The text to scan.
        min_words: Minimum number of words a sentence needs to be reported.

    Returns:
        A list of the repeated sentences, each listed once, in the order in
        which their first repetition was encountered.
    """
    seen = set()
    # A dict is used as an ordered set so a sentence occurring three or more
    # times is still reported only once.
    repeated = {}
    for piece in text.split('.'):
        sentence = piece.strip()
        if sentence not in seen:
            seen.add(sentence)
        elif len(sentence.split()) >= min_words:
            repeated[sentence] = None
    return list(repeated)
|
55 |
+
|
56 |
+
def create_vdb(filename, chunk_size=200, overlap=100):
    """Build a persisted Chroma index from the text of a PDF file.

    The PDF is loaded with ``PyPDFLoader``. Long sentences that repeat in the
    document (as reported by ``show_long_repeating_sentences``) are replaced
    by the marker ``#RRC``. Each loaded document is then split into
    overlapping chunks, embedded with the "all-MiniLM-L6-v2" model and stored
    in a Chroma collection persisted in the directory named by
    ``process_string(filename)``.

    Args:
        filename: Path of the PDF file to index.
        chunk_size: Chunk size passed to the text splitter.
        overlap: Chunk overlap passed to the text splitter. When it is larger
            than ``chunk_size`` both values fall back to 500 and 100.

    Returns:
        The Chroma vector store. Every stored chunk carries a
        ``pdf_page_number`` metadata entry.
    """
    if chunk_size < overlap:
        chunk_size, overlap = 500, 100
    print(f'Chunk size: {chunk_size}, overlap: {overlap}')

    path = os.path.abspath(filename)
    pages = PyPDFLoader(path).load_and_split()

    # Repetitions are detected over the whole document text.
    all_text = ' '.join(page.page_content for page in pages)
    repeating_sentences = show_long_repeating_sentences(all_text)
    print('Repeating sentences')
    print(repeating_sentences)

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)

    corpus = []
    for index, page in enumerate(pages):
        # load_and_split() may yield more than one document per PDF page, so
        # the loop index can drift away from the real page; prefer the page
        # number recorded by the loader and fall back to the index.
        page_number = page.metadata.get("page", index)
        content = page.page_content
        # Repeating sentences removal.
        for sentence in repeating_sentences:
            content = content.replace(sentence, '#RRC')
        texts = text_splitter.create_documents(
            [content], metadatas=[{"pdf_page_number": page_number}]
        )
        print(f'==================Page {page_number} len(texts) = {len(texts)} =================')
        corpus.extend(texts)
    # Chunks were historically stored last-to-first; one reverse keeps that
    # order without the quadratic cost of repeated insert(0, ...).
    corpus.reverse()

    embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    cname = process_string(filename)
    db = Chroma.from_documents(corpus, embedding_function, persist_directory=cname)
    print(''' Index is created ℹ️''')
    return db
|
92 |
+
|
93 |
+
"""🛑 Stop. Now run the next code block once more. It will show you the upload button. Click it and upload the **home inspection** file. Then you will be able to ask any question without re-running the previous code.
|
94 |
+
|
95 |
+
# Information Retrieval (IR) Tool
|
96 |
+
|
97 |
+
This tool is your small, local, and sweet 🍬 Google for the document.
|
98 |
+
"""
|
99 |
+
|
100 |
+
# Ask for the report through google.colab's upload helper and index it.
from google.colab import files

uploaded = files.upload()
fnames = []
for filename in uploaded:
    print(f"Processing file: {filename}")
    fnames.append(filename)

# Fail with a clear message instead of an IndexError when nothing was uploaded.
if not fnames:
    raise RuntimeError("No file was uploaded; upload a home inspection PDF to build the index.")

# Only the first uploaded file is indexed.
db = create_vdb(fnames[0], 500, 100)
|
107 |
+
|
108 |
+
"""You did it. Now write a keyword, a question, or anything else to search for in the document."""
|
109 |
+
|
110 |
+
# Search the index for the query and show the ten closest chunks with the
# page number stored alongside each one.
q = 'leak' # @param {type:"string"}
print(f'Your question is: {q}')

hits = db.similarity_search(q, k=10)
answers = [hit.page_content for hit in hits]
pgnos = [hit.metadata['pdf_page_number'] for hit in hits]

results = pd.DataFrame({'Page': pgnos, 'Cite': answers})
data_table.DataTable(results, include_index=False, num_rows_per_page=5)
|
119 |
+
|
120 |
+
"""The results are ordered from most to least relevant, like Google.
|
121 |
+
|
122 |
+
# Chat Tool (Humble ) π π
|
123 |
+
|
124 |
+
This tool lets you ask a direct question and get an answer, like my rival ChatGPT.
|
125 |
+
"""
|
126 |
+
|
127 |
+
from langchain import HuggingFaceHub
|
128 |
+
from langchain.embeddings import HuggingFaceEmbeddings
|
129 |
+
from langchain.chains.question_answering import load_qa_chain
|
130 |
+
from langchain import HuggingFacePipeline
|
131 |
+
|
132 |
+
# Load the text2text-generation model as a LangChain LLM and wrap it in a
# "stuff" question-answering chain.
max_length = 1024
model_name = "declare-lab/flan-alpaca-large"
llm = HuggingFacePipeline.from_model_id(
    model_id=model_name,
    task="text2text-generation",
    model_kwargs={"max_length": max_length},
)
qa_chain = load_qa_chain(llm, chain_type="stuff")
|
140 |
+
|
141 |
+
def ask_to_alpaca(question, vector_store, qa_chain):
    """Answer *question* from the documents most similar to it.

    Args:
        question: The question text.
        vector_store: Object exposing ``similarity_search(question)`` that
            returns the documents to use as context.
        qa_chain: Object exposing ``run(input_documents=..., question=...)``.

    Returns:
        Whatever ``qa_chain.run`` returns for the retrieved documents and the
        question.
    """
    similar_docs = vector_store.similarity_search(question)
    return qa_chain.run(input_documents=similar_docs, question=question)
|
145 |
+
|
146 |
+
# Ask the chat tool one explicit question about the indexed report. In a
# notebook the returned answer is displayed as the cell's last expression.
explicit_question = 'Is there any problem of the roofs?' # @param {type:"string"}
ask_to_alpaca(explicit_question, db, qa_chain)
|