import os
import base64
import io

from dash import Dash, dcc, html, Input, Output, State
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.llms import HuggingFaceHub
import pandas as pd

# Set API token for HuggingFace
os.environ["HUGGINGFACEHUB_API_TOKEN"] = os.getenv('HUGGINGFACEHUB_API_TOKEN', "")
# Initialize Dash app
app = Dash(__name__)
# Extract text from PDF files
def get_pdf_text(pdf_file):
    try:
        pdf_reader = PdfReader(pdf_file)
        text = ""
        for page in pdf_reader.pages:
            text += page.extract_text() or ""
        return text
    except Exception as e:
        raise ValueError(f"Error processing PDF file: {e}")
# Split text into smaller chunks
def get_text_chunks(text):
    text_splitter = CharacterTextSplitter(
        separator="\n", chunk_size=1000, chunk_overlap=100, length_function=len
    )
    return text_splitter.split_text(text)
# Create a vector store from text chunks
def get_vectorstore(text_chunks):
    if not text_chunks:
        raise ValueError("No text chunks provided for vectorstore creation.")
    model = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    embeddings = HuggingFaceBgeEmbeddings(
        model_name=model, encode_kwargs={"normalize_embeddings": True}, model_kwargs={"device": "cpu"}
    )
    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)
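# Illustrative usage only (not part of the app flow; the query string and k=4 are
# arbitrary assumptions):
#   store = get_vectorstore(get_text_chunks(some_long_text))
#   top_docs = store.similarity_search("query", k=4)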
# Set up the LLM and retriever used to answer questions over the vector store
def get_conversation_chain(vectorstore):
    if not vectorstore:
        raise ValueError("Vectorstore is not initialized.")
    llm = HuggingFaceHub(
        repo_id="google/gemma-7b",
        model_kwargs={"temperature": 0.1, "max_length": 2048},
    )
    return llm, vectorstore.as_retriever()
# Process CSV data
def process_csv_data(csv_file):
    try:
        df = pd.read_csv(csv_file)
        combined_text = df.astype(str).apply(" ".join, axis=1).str.cat(sep=" ")
        return combined_text
    except Exception as e:
        raise ValueError(f"Error processing CSV file: {e}")
# Layout
app.layout = html.Div([
    html.H1("Chat Bot for PDF and CSV files 📚"),
    dcc.Upload(
        id='upload-pdf',
        children=html.Div(['Drag and Drop or ', html.A('Select PDF Files')]),
        style={
            'width': '100%', 'height': '60px', 'lineHeight': '60px',
            'borderWidth': '1px', 'borderStyle': 'dashed', 'borderRadius': '5px',
            'textAlign': 'center', 'margin': '10px'
        },
        multiple=True
    ),
    dcc.Upload(
        id='upload-csv',
        children=html.Div(['Drag and Drop or ', html.A('Select CSV Files')]),
        style={
            'width': '100%', 'height': '60px', 'lineHeight': '60px',
            'borderWidth': '1px', 'borderStyle': 'dashed', 'borderRadius': '5px',
            'textAlign': 'center', 'margin': '10px'
        },
        multiple=True  # deliver a list of files, matching how the callback iterates csv_contents
    ),
    html.Button('Process', id='process-button', n_clicks=0),
    html.Div(id='process-status'),  # target for the file-processing callback below
    dcc.Input(id='question-input', type='text', placeholder='Enter your question'),
    html.Button('Answer', id='answer-button', n_clicks=0),
    html.Div(id='output-answer')
])
# Callbacks
@app.callback(
    Output('output-answer', 'children'),
    Input('answer-button', 'n_clicks'),
    State('question-input', 'value'),
    State('process-button', 'n_clicks'),
)
def handle_question(n_clicks_answer, question, n_clicks_process):
    if n_clicks_answer > 0 and n_clicks_process > 0:
        llm, retriever = app.server.config.get('conversation_chain', (None, None))
        if not llm or not retriever:
            return "Please process the files first."
        try:
            docs = retriever.get_relevant_documents(question)
            context = "\n".join(doc.page_content for doc in docs)
            # Build a plain-text prompt from the retrieved context and the user question
            answer = llm.invoke(f"Context:\n{context}\n\nQuestion: {question}\nAnswer:")
            return html.Div([
                html.P(f"Question: {question}"),
                html.P(f"Answer: {answer}")
            ])
        except Exception as e:
            return f"Error while answering the question: {str(e)}"
    return "Please process the files first."
@app.callback(
    Output('process-status', 'children'),
    Input('process-button', 'n_clicks'),
    State('upload-pdf', 'contents'),
    State('upload-csv', 'contents'),
)
def process_files(n_clicks, pdf_contents, csv_contents):
    if n_clicks > 0:
        combined_text = ""
        if pdf_contents:
            if not isinstance(pdf_contents, list) or not all(isinstance(content, str) for content in pdf_contents):
                return "The PDF file format is not valid. Please try again."
            for content in pdf_contents:
                try:
                    # dcc.Upload delivers "data:<mime>;base64,<payload>"; decode the payload
                    pdf_bytes = base64.b64decode(content.split(",")[1])
                    pdf_text = get_pdf_text(io.BytesIO(pdf_bytes))
                    combined_text += pdf_text
                except Exception as e:
                    return f"Error processing PDF file: {str(e)}"
        if csv_contents:
            for content in csv_contents:
                try:
                    csv_bytes = base64.b64decode(content.split(",")[1])
                    csv_text = process_csv_data(io.StringIO(csv_bytes.decode("utf-8")))
                    combined_text += csv_text
                except Exception as e:
                    return f"Error processing CSV file: {str(e)}"
        if not combined_text.strip():
            return "No text was found to process."
        try:
            text_chunks = get_text_chunks(combined_text)
            vectorstore = get_vectorstore(text_chunks)
            conversation_chain = get_conversation_chain(vectorstore)
            app.server.config['conversation_chain'] = conversation_chain
            return "Processing complete! You can now ask your questions."
        except ValueError as e:
            return f"Error processing the data: {str(e)}"
    return "Please upload valid files."
if __name__ == '__main__':
    from waitress import serve
    serve(app.server, host="0.0.0.0", port=7860)