# -*- coding: utf-8 -*-
import random
import re
import string

import pandas as pd
import streamlit as st
from langchain import HuggingFacePipeline
from langchain.chains.question_answering import load_qa_chain
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from PyPDF2 import PdfReader
st.markdown(
    """
# 🕵 Homo Inspectus
(Question Answering Engine)
## Suat ATAN
""")
# all-MiniLM-L6-v2 is a small sentence-transformers model that maps text to 384-dimensional vectors
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
def process_string(input_string):
    """Strip a string down to alphanumerics and append two random digits."""
    alphanumeric_string = re.sub(r'[^a-zA-Z0-9]', '', input_string)
    # The regex above already removed blanks; this replace is a no-op kept for safety
    no_blank_string = alphanumeric_string.replace(' ', '')
    random_numbers = ''.join(random.choices(string.digits, k=2))
    final_string = no_blank_string + random_numbers
    return final_string
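# For illustration only (this helper is not called elsewhere in the app): it could
# turn a file name into a collection-safe identifier, e.g. process_string("My File.pdf")
# might return "MyFilepdf07" -- the two trailing digits are random.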
def show_long_repeating_sentences(text, min_words=4):
    """Return sentences of at least min_words that occur more than once in text."""
    sentences = text.split('.')
    unique_sentences = set()
    repeating_sentences = []
    for s in sentences:
        stripped_sentence = s.strip()
        # Flag a sentence only on a repeat occurrence, and only if it is long enough
        if stripped_sentence in unique_sentences and len(stripped_sentence.split()) >= min_words:
            repeating_sentences.append(stripped_sentence)
        else:
            unique_sentences.add(stripped_sentence)
    return repeating_sentences
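# A quick sanity check on hypothetical input:
#   show_long_repeating_sentences("The roof leaks near the vent. Ok. The roof leaks near the vent.")
# returns ["The roof leaks near the vent"]: that sentence occurs twice and has at
# least four words, so it is flagged on its second occurrence.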
def create_vdb(fileobj, chunk_size=200, overlap=100):
    """Read a PDF file object, split its text into chunks, and index them in Chroma."""
    # Guard against an invalid configuration: the overlap must not exceed the chunk size
    if chunk_size < overlap:
        chunk_size, overlap = 500, 100
    print(f'Chunk size: {chunk_size}, overlap: {overlap}')
    # Extract the raw text of every page
    pdf_reader = PdfReader(fileobj)
    all_text = ""
    for page in pdf_reader.pages:
        all_text += page.extract_text()
    # Log sentences that repeat verbatim (often headers or footers in reports)
    repeating_sentences = show_long_repeating_sentences(all_text)
    print('Repeating sentences')
    print(repeating_sentences)
    # Split on progressively finer separators until each chunk fits chunk_size
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
    )
    chunks = text_splitter.split_text(text=all_text)
    embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    # Persist each upload under a random four-digit collection directory
    cname = str(random.randint(1000, 9999))
    db = Chroma.from_texts(chunks, embedding_function, persist_directory=cname)
    print('''Index is created ℹ️''')
    return db
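# A minimal sketch of exercising the index outside Streamlit (the file name
# "report.pdf" is hypothetical):
#   with open("report.pdf", "rb") as f:
#       db = create_vdb(f, chunk_size=500, overlap=100)
#   for doc in db.similarity_search("roof", k=3):
#       print(doc.page_content)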
"""
# Information Retrieval (IR) Tool
This tool is your small local and sweet π¬ Google for the document
"""
db = None
uploaded_file = st.file_uploader('Choose your .pdf file', type="pdf")
if uploaded_file is not None:
    # Build the vector index with a 500-character chunk size and 100-character overlap
    db = create_vdb(uploaded_file, 500, 100)
    st.write("filename:", uploaded_file.name)
    st.write(uploaded_file)
else:
    st.info("Please upload a PDF file.")
"""π You did. Now write your keyword, or question or anything to search in the document"""
q = st.text_input('Write your question or keywords', 'HVAC', key='q1')
st.write(f'Your question is: {q}')
if db:
    # Collect the ten chunks whose embeddings are closest to the query
    answers = []
    for d in db.similarity_search(q, k=10):
        answers.append(d.page_content)
    results = pd.DataFrame({'Cite': answers})
    st.markdown('## Results')
    st.dataframe(results)
st.markdown(
    """Results are ordered from most relevant to least, just like Google.
# Chat Tool (Humble)
This tool lets you put a question directly to the document and get an answer, like my rival 💪 ChatGPT.
""")
def ask_to_alpaca(question, vector_store, qa_chain):
    """Retrieve the chunks most similar to the question and let the QA chain answer from them."""
    similar_docs = vector_store.similarity_search(question)
    response = qa_chain.run(input_documents=similar_docs, question=question)
    return response
explicit_question = st.text_input('Write your question', 'Roof problems', key='q2')
if db:
    max_length = 1024
    model_name = "declare-lab/flan-alpaca-large"
    # Run the model locally through a transformers pipeline wrapped for LangChain
    llm = HuggingFacePipeline.from_model_id(
        model_id=model_name,
        task="text2text-generation",
        model_kwargs={"max_length": max_length},
    )
    # A "stuff" chain concatenates all retrieved chunks into a single prompt
    qa_chain = load_qa_chain(llm, chain_type="stuff")
    st.markdown("Answer:")
    st.write(ask_to_alpaca(explicit_question, db, qa_chain))
else:
    st.markdown('Please upload a PDF file.')
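# To try the app locally (assuming this file is saved as app.py and the
# dependencies imported above are installed):
#   streamlit run app.py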