import os
import base64
import io

from dash import Dash, dcc, html, Input, Output, State
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.llms import HuggingFaceHub
import pandas as pd

# Set API token for HuggingFace
os.environ["HUGGINGFACEHUB_API_TOKEN"] = os.getenv('HUGGINGFACEHUB_API_TOKEN', "")
# Initialize Dash app
app = Dash(__name__)
# Extract text from PDF files
def get_pdf_text(pdf_file):
    try:
        pdf_reader = PdfReader(pdf_file)
        text = ""
        for page in pdf_reader.pages:
            text += page.extract_text() or ""
        return text
    except Exception as e:
        raise ValueError(f"Error processing PDF file: {e}")
# Split text into smaller chunks
def get_text_chunks(text):
    text_splitter = CharacterTextSplitter(
        separator="\n", chunk_size=1000, chunk_overlap=100, length_function=len
    )
    return text_splitter.split_text(text)
# Create a vector store from text chunks
def get_vectorstore(text_chunks):
    if not text_chunks:
        raise ValueError("No text chunks provided for vectorstore creation.")
    model = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    embeddings = HuggingFaceBgeEmbeddings(
        model_name=model, encode_kwargs={"normalize_embeddings": True}, model_kwargs={"device": "cpu"}
    )
    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)
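# Illustrative usage only (not part of the app flow; the query string and k=4 are
# arbitrary assumptions):
#   store = get_vectorstore(get_text_chunks(some_long_text))
#   top_docs = store.similarity_search("query", k=4)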
# Set up the LLM and retriever used to answer questions over the vector store
def get_conversation_chain(vectorstore):
    if not vectorstore:
        raise ValueError("Vectorstore is not initialized.")
    llm = HuggingFaceHub(
        repo_id="google/gemma-7b",
        model_kwargs={"temperature": 0.1, "max_length": 2048},
    )
    return llm, vectorstore.as_retriever()
# Process CSV data
def process_csv_data(csv_file):
    try:
        df = pd.read_csv(csv_file)
        combined_text = df.astype(str).apply(" ".join, axis=1).str.cat(sep=" ")
        return combined_text
    except Exception as e:
        raise ValueError(f"Error processing CSV file: {e}")
# Layout
app.layout = html.Div([
    html.H1("Chat Bot for PDF and CSV files 📚"),
    dcc.Upload(
        id='upload-pdf',
        children=html.Div(['Drag and Drop or ', html.A('Select PDF Files')]),
        style={
            'width': '100%', 'height': '60px', 'lineHeight': '60px',
            'borderWidth': '1px', 'borderStyle': 'dashed', 'borderRadius': '5px',
            'textAlign': 'center', 'margin': '10px'
        },
        multiple=True
    ),
    dcc.Upload(
        id='upload-csv',
        children=html.Div(['Drag and Drop or ', html.A('Select CSV Files')]),
        style={
            'width': '100%', 'height': '60px', 'lineHeight': '60px',
            'borderWidth': '1px', 'borderStyle': 'dashed', 'borderRadius': '5px',
            'textAlign': 'center', 'margin': '10px'
        },
        multiple=True  # deliver a list of files, matching how the callback iterates csv_contents
    ),
    html.Button('Process', id='process-button', n_clicks=0),
    html.Div(id='process-status'),  # target for the file-processing callback below
    dcc.Input(id='question-input', type='text', placeholder='Enter your question'),
    html.Button('Answer', id='answer-button', n_clicks=0),
    html.Div(id='output-answer')
])
# Callbacks
@app.callback(
    Output('output-answer', 'children'),
    Input('answer-button', 'n_clicks'),
    State('question-input', 'value'),
    State('process-button', 'n_clicks'),
)
def handle_question(n_clicks_answer, question, n_clicks_process):
    if n_clicks_answer > 0 and n_clicks_process > 0:
        llm, retriever = app.server.config.get('conversation_chain', (None, None))
        if not llm or not retriever:
            return "Please process the files first."
        try:
            docs = retriever.get_relevant_documents(question)
            context = "\n".join(doc.page_content for doc in docs)
            # Build a plain-text prompt from the retrieved context and the user question
            answer = llm.invoke(f"Context:\n{context}\n\nQuestion: {question}\nAnswer:")
            return html.Div([
                html.P(f"Question: {question}"),
                html.P(f"Answer: {answer}")
            ])
        except Exception as e:
            return f"Error while answering the question: {str(e)}"
    return "Please process the files first."
@app.callback(
    Output('process-status', 'children'),
    Input('process-button', 'n_clicks'),
    State('upload-pdf', 'contents'),
    State('upload-csv', 'contents'),
)
def process_files(n_clicks, pdf_contents, csv_contents):
    if n_clicks > 0:
        combined_text = ""
        if pdf_contents:
            if not isinstance(pdf_contents, list) or not all(isinstance(content, str) for content in pdf_contents):
                return "The PDF file format is not valid. Please try again."
            for content in pdf_contents:
                try:
                    # dcc.Upload delivers "data:<mime>;base64,<payload>"; decode the payload
                    pdf_bytes = base64.b64decode(content.split(",")[1])
                    pdf_text = get_pdf_text(io.BytesIO(pdf_bytes))
                    combined_text += pdf_text
                except Exception as e:
                    return f"Error processing PDF file: {str(e)}"
        if csv_contents:
            for content in csv_contents:
                try:
                    csv_bytes = base64.b64decode(content.split(",")[1])
                    csv_text = process_csv_data(io.StringIO(csv_bytes.decode("utf-8")))
                    combined_text += csv_text
                except Exception as e:
                    return f"Error processing CSV file: {str(e)}"
        if not combined_text.strip():
            return "No text was found to process."
        try:
            text_chunks = get_text_chunks(combined_text)
            vectorstore = get_vectorstore(text_chunks)
            conversation_chain = get_conversation_chain(vectorstore)
            app.server.config['conversation_chain'] = conversation_chain
            return "Processing complete! You can now ask your questions."
        except ValueError as e:
            return f"Error processing the data: {str(e)}"
    return "Please upload valid files."
if __name__ == '__main__':
    from waitress import serve
    serve(app.server, host="0.0.0.0", port=7860)