data_anilsys / app.py
mehdi364's picture
Update app.py
c49b90d verified
raw
history blame
6.2 kB
import base64
import io
import os

import pandas as pd
from dash import Dash, dcc, html, Input, Output, State
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.llms import HuggingFaceHub
from langchain_community.vectorstores import FAISS
from PyPDF2 import PdfReader
# Guarantee the HuggingFace token variable exists in the environment: an
# already-set value is kept, otherwise it becomes the empty string.
os.environ.setdefault("HUGGINGFACEHUB_API_TOKEN", "")

# The Dash application object that the layout and callbacks attach to.
app = Dash(__name__)
# Extract text from PDF files
def get_pdf_text(pdf_file):
    """Extract the text of every page of a PDF as one string.

    Args:
        pdf_file: The PDF source, handed unchanged to ``PdfReader``.

    Returns:
        The page texts concatenated in page order. A page whose
        ``extract_text()`` result is falsy contributes an empty string.

    Raises:
        ValueError: If opening the PDF or extracting text fails. The
            underlying exception is attached as ``__cause__``.
    """
    try:
        pdf_reader = PdfReader(pdf_file)
        # join() avoids rebuilding the accumulated string once per page.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)
    except Exception as e:
        # Chain explicitly so the original failure stays visible in tracebacks.
        raise ValueError(f"Error processing PDF file: {e}") from e
# Split text into smaller chunks
def get_text_chunks(text):
    """Split ``text`` into chunks with a ``CharacterTextSplitter``.

    The splitter is configured with a newline separator, a chunk size of
    1000 and an overlap of 100, both measured with ``len``.

    Args:
        text: The text to split.

    Returns:
        Whatever ``CharacterTextSplitter.split_text`` returns for ``text``.
    """
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=100,
        length_function=len,
    )
    chunks = splitter.split_text(text)
    return chunks
# Create a vector store from text chunks
def get_vectorstore(text_chunks):
    """Embed text chunks and index them in a FAISS vector store.

    Args:
        text_chunks: The texts to embed and index.

    Returns:
        The store produced by ``FAISS.from_texts`` for the given chunks.

    Raises:
        ValueError: If ``text_chunks`` is empty or otherwise falsy.
    """
    if not text_chunks:
        raise ValueError("No text chunks provided for vectorstore creation.")

    # The module's top-level imports never bring this class into scope, so
    # without this import the constructor call below raises NameError.
    from langchain_community.embeddings import HuggingFaceBgeEmbeddings

    # NOTE(review): the model below is a sentence-transformers paraphrase
    # model rather than a BGE model; confirm the BGE wrapper is intended.
    model = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    embeddings = HuggingFaceBgeEmbeddings(
        model_name=model,
        encode_kwargs={"normalize_embeddings": True},
        model_kwargs={"device": "cpu"},
    )
    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)
# Create a conversational retrieval chain
def get_conversation_chain(vectorstore):
    """Build the LLM and retriever used to answer questions.

    Despite the name, no chain object is created: the caller receives the
    two pieces as a tuple.

    Args:
        vectorstore: The vector store to retrieve from.

    Returns:
        A ``(llm, retriever)`` tuple, where ``llm`` is a ``HuggingFaceHub``
        instance for ``google/gemma-7b`` and ``retriever`` comes from
        ``vectorstore.as_retriever()``.

    Raises:
        ValueError: If ``vectorstore`` is falsy.
    """
    if vectorstore:
        hub_llm = HuggingFaceHub(
            repo_id="google/gemma-7b",
            model_kwargs={"temperature": 0.1, "max_length": 2048},
        )
        return hub_llm, vectorstore.as_retriever()
    raise ValueError("Vectorstore is not initialized.")
# Process CSV data
def process_csv_data(csv_file):
    """Flatten the cell values of a CSV into one space-separated string.

    Every cell is converted to ``str``; the cells of a row are joined with a
    space, and the rows are then joined with a space. Column names are not
    part of the result.

    Args:
        csv_file: The CSV source, handed unchanged to ``pd.read_csv``.

    Returns:
        The flattened text, or ``""`` when the CSV has no data rows.

    Raises:
        ValueError: If reading or flattening fails. The underlying exception
            is attached as ``__cause__``.
    """
    try:
        df = pd.read_csv(csv_file)
        # A header-only CSV has nothing to join; report it as empty text
        # instead of failing inside the row-wise apply below.
        if df.empty:
            return ""
        row_texts = df.astype(str).apply(" ".join, axis=1)
        return row_texts.str.cat(sep=" ")
    except Exception as e:
        # Chain explicitly so the original failure stays visible in tracebacks.
        raise ValueError(f"Error processing CSV file: {e}") from e
# Layout
# Both upload drop zones are styled identically, so they share one style dict.
_UPLOAD_STYLE = {
    'width': '100%', 'height': '60px', 'lineHeight': '60px',
    'borderWidth': '1px', 'borderStyle': 'dashed', 'borderRadius': '5px',
    'textAlign': 'center', 'margin': '10px',
}

# Page layout: title, one upload zone per file type, the "process" button,
# the question box with its "answer" button, and the shared output area.
app.layout = html.Div([
    html.H1("Chat Bot برای فایل‌های PDF و CSV 📚"),
    dcc.Upload(
        id='upload-pdf',
        children=html.Div(['Drag and Drop or ', html.A('Select PDF Files')]),
        style=_UPLOAD_STYLE,
        multiple=True,
    ),
    dcc.Upload(
        id='upload-csv',
        children=html.Div(['Drag and Drop or ', html.A('Select CSV Files')]),
        style=_UPLOAD_STYLE,
        # process_files loops over the CSV contents item by item; without
        # multiple=True the component delivers one string instead of a list,
        # so that loop would walk the string character by character.
        multiple=True,
    ),
    html.Button('پردازش', id='process-button', n_clicks=0),
    dcc.Input(id='question-input', type='text', placeholder='سوال خود را وارد کنید'),
    html.Button('پاسخ', id='answer-button', n_clicks=0),
    html.Div(id='output-answer'),
])
# Callbacks
@app.callback(
    # Another callback in this file also writes to output-answer. Dash
    # rejects two callbacks with the same output unless the output is flagged
    # allow_duplicate, which in turn requires prevent_initial_call.
    Output('output-answer', 'children', allow_duplicate=True),
    [Input('answer-button', 'n_clicks')],
    [State('question-input', 'value'), State('process-button', 'n_clicks')],
    prevent_initial_call=True,
)
def handle_question(n_clicks_answer, question, n_clicks_process):
    """Answer the user's question from the processed documents.

    Reads the ``(llm, retriever)`` pair stored under
    ``app.server.config['conversation_chain']``, retrieves documents for the
    question, and asks the LLM to answer from them.

    Args:
        n_clicks_answer: Click count of the answer button (the trigger).
        question: Current value of the question input; may be ``None``.
        n_clicks_process: Click count of the process button.

    Returns:
        A ``html.Div`` holding the question and the answer, or a plain
        message string when the files have not been processed, the question
        is empty, or answering fails.
    """
    not_ready = "لطفاً ابتدا فایل‌ها را پردازش کنید."
    if not n_clicks_answer or not n_clicks_process:
        return not_ready

    llm, retriever = app.server.config.get('conversation_chain', (None, None))
    if llm is None or retriever is None:
        return not_ready

    # The input's value is None until the user types something.
    if not question or not question.strip():
        return "لطفاً سوال خود را وارد کنید."

    try:
        documents = retriever.invoke(question)
        context = "\n\n".join(doc.page_content for doc in documents)
        # The LLM takes a plain prompt string, so the retrieved passages and
        # the question are folded into one prompt; passing a dict to
        # llm.generate() is rejected because it expects a list of prompts.
        prompt = (
            "Answer the question using only the context below.\n\n"
            f"Context:\n{context}\n\n"
            f"Question: {question}\n"
            "Answer:"
        )
        answer = llm.invoke(prompt)
    except Exception as e:
        # Callback boundary: show the failure to the user instead of raising.
        return f"خطا در پردازش سوال: {str(e)}"

    return html.Div([
        html.P(f"سوال: {question}"),
        html.P(f"پاسخ: {answer}"),
    ])
def _as_content_list(contents):
    """Normalize a dcc.Upload ``contents`` value to a list of items.

    The value is ``None`` before any upload, a single item when the
    component does not allow multiple files, and a list otherwise.
    """
    if not contents:
        return []
    if isinstance(contents, (list, tuple)):
        return list(contents)
    return [contents]


def _decode_upload(content):
    """Return the raw bytes carried by one dcc.Upload ``contents`` string.

    The string has the form ``<header>,<base64 payload>``; only the payload
    after the first comma is decoded.
    """
    _header, separator, payload = content.partition(",")
    if not separator:
        raise ValueError("upload content is not a 'header,base64-data' string")
    return base64.b64decode(payload)


@app.callback(
    # Another callback in this file also writes to output-answer. Dash
    # rejects two callbacks with the same output unless the output is flagged
    # allow_duplicate, which in turn requires prevent_initial_call.
    Output('output-answer', 'children', allow_duplicate=True),
    [Input('process-button', 'n_clicks')],
    [State('upload-pdf', 'contents'), State('upload-csv', 'contents')],
    prevent_initial_call=True,
)
def process_files(n_clicks, pdf_contents, csv_contents):
    """Index the uploaded PDF and CSV files for question answering.

    Decodes every uploaded file, extracts its text, splits the combined text
    into chunks, builds the vector store, and stores the resulting
    ``(llm, retriever)`` pair under
    ``app.server.config['conversation_chain']``.

    Args:
        n_clicks: Click count of the process button (the trigger).
        pdf_contents: ``contents`` of the PDF upload component.
        csv_contents: ``contents`` of the CSV upload component.

    Returns:
        A status or error message string for the output area.
    """
    if not n_clicks:
        return "لطفاً فایل‌های مناسب را آپلود کنید."

    pdf_items = _as_content_list(pdf_contents)
    if not all(isinstance(item, str) for item in pdf_items):
        return "فرمت فایل PDF صحیح نیست. لطفاً دوباره تلاش کنید."

    texts = []
    for item in pdf_items:
        try:
            # The upload is base64 text, not a path: decode it and give the
            # reader an in-memory binary stream.
            texts.append(get_pdf_text(io.BytesIO(_decode_upload(item))))
        except Exception as e:
            return f"خطا در پردازش فایل PDF: {str(e)}"

    for item in _as_content_list(csv_contents):
        try:
            texts.append(process_csv_data(io.BytesIO(_decode_upload(item))))
        except Exception as e:
            return f"خطا در پردازش فایل CSV: {str(e)}"

    # A newline between files keeps the last word of one file from fusing
    # with the first word of the next.
    combined_text = "\n".join(texts)
    if not combined_text.strip():
        return "هیچ متنی برای پردازش یافت نشد."

    try:
        text_chunks = get_text_chunks(combined_text)
        vectorstore = get_vectorstore(text_chunks)
        conversation_chain = get_conversation_chain(vectorstore)
    except Exception as e:
        # Callback boundary: model loading and indexing can fail with more
        # than ValueError, and the user should see a message either way.
        return f"خطا در پردازش داده‌ها: {str(e)}"

    # NOTE(review): this is stored on the server config object rather than
    # per session, so it looks shared by every visitor of the app; confirm.
    app.server.config['conversation_chain'] = conversation_chain
    return "پردازش تکمیل شد! اکنون می‌توانید سوالات خود را بپرسید."
if __name__ == '__main__':
    # Run the application with waitress, listening on every interface at
    # port 7860. waitress is imported only when the file runs as a script.
    import waitress

    waitress.serve(app.server, host="0.0.0.0", port=7860)