"""Dash chat bot that answers questions about uploaded PDF and CSV files.

Uploaded files are decoded, converted to plain text, split into chunks and
indexed in a FAISS vector store. Questions are answered by retrieving the most
relevant chunks and passing them, together with the question, to a
HuggingFace Hub language model.
"""

import base64
import io
import os

import pandas as pd
from dash import Dash, Input, Output, State, dcc, html
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.llms import HuggingFaceHub
from langchain_community.vectorstores import FAISS
from PyPDF2 import PdfReader

# Set API token for HuggingFace (left as an empty string when not provided).
os.environ.setdefault("HUGGINGFACEHUB_API_TOKEN", "")

# Initialize Dash app
app = Dash(__name__)

# Shared look of both upload drop zones.
UPLOAD_STYLE = {
    'width': '100%',
    'height': '60px',
    'lineHeight': '60px',
    'borderWidth': '1px',
    'borderStyle': 'dashed',
    'borderRadius': '5px',
    'textAlign': 'center',
    'margin': '10px',
}


def _decode_upload(contents):
    """Return the raw bytes carried by one ``dcc.Upload`` ``contents`` string.

    Dash delivers uploads as ``"data:<mime>;base64,<payload>"``; only the part
    after the first comma is the file itself.

    Raises:
        ValueError: if there is no comma-separated payload or it is not
            valid base64.
    """
    _, separator, payload = contents.partition(",")
    if not separator:
        raise ValueError("Upload contents is not a base64 data URL.")
    try:
        return base64.b64decode(payload)
    except ValueError as exc:  # binascii.Error is a ValueError subclass
        raise ValueError(f"Upload payload is not valid base64: {exc}") from exc


def _as_list(contents):
    """Normalize an upload ``contents`` value to a list.

    ``dcc.Upload`` yields ``None`` when nothing was uploaded, a single string
    without ``multiple=True`` and a list of strings with it.
    """
    if contents is None:
        return []
    if isinstance(contents, list):
        return contents
    return [contents]


# Extract text from PDF files
def get_pdf_text(pdf_file):
    """Return the concatenated text of every page of a PDF.

    Args:
        pdf_file: a path or binary file-like object accepted by ``PdfReader``.

    Raises:
        ValueError: if the PDF cannot be read or parsed.
    """
    try:
        pdf_reader = PdfReader(pdf_file)
        # extract_text() may return None for pages without a text layer.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)
    except Exception as e:
        raise ValueError(f"Error processing PDF file: {e}") from e


# Split text into smaller chunks
def get_text_chunks(text):
    """Split ``text`` on newlines into chunks of about 1000 characters."""
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=100,
        length_function=len,
    )
    return text_splitter.split_text(text)


# Create a vector store from text chunks
def get_vectorstore(text_chunks):
    """Embed ``text_chunks`` and index them in a FAISS vector store.

    Raises:
        ValueError: if ``text_chunks`` is empty.
    """
    if not text_chunks:
        raise ValueError("No text chunks provided for vectorstore creation.")
    model = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    embeddings = HuggingFaceBgeEmbeddings(
        model_name=model,
        encode_kwargs={"normalize_embeddings": True},
        model_kwargs={"device": "cpu"},
    )
    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)


# Create a conversational retrieval chain
def get_conversation_chain(vectorstore):
    """Return the ``(llm, retriever)`` pair used to answer questions.

    Raises:
        ValueError: if ``vectorstore`` is ``None``.
    """
    if vectorstore is None:
        raise ValueError("Vectorstore is not initialized.")
    llm = HuggingFaceHub(
        repo_id="google/gemma-7b",
        model_kwargs={"temperature": 0.1, "max_length": 2048},
    )
    return llm, vectorstore.as_retriever()


# Process CSV data
def process_csv_data(csv_file):
    """Flatten a CSV into one space-separated string of all its cells.

    Args:
        csv_file: a path or file-like object accepted by ``pandas.read_csv``.

    Raises:
        ValueError: if the CSV cannot be read or parsed.
    """
    try:
        df = pd.read_csv(csv_file)
        rows = df.astype(str).to_numpy().tolist()
        # Joining plain lists also works for a CSV with a header but no rows.
        return " ".join(" ".join(row) for row in rows)
    except Exception as e:
        raise ValueError(f"Error processing CSV file: {e}") from e


def _build_prompt(question, documents):
    """Compose the LLM prompt from the question and the retrieved documents."""
    context = "\n\n".join(doc.page_content for doc in documents)
    return (
        "Answer the question using only the context below.\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {question}\n"
        "Answer:"
    )


# Layout
app.layout = html.Div([
    html.H1("Chat Bot برای فایل‌های PDF و CSV 📚"),
    dcc.Upload(
        id='upload-pdf',
        children=html.Div(['Drag and Drop or ', html.A('Select PDF Files')]),
        style=UPLOAD_STYLE,
        multiple=True,
    ),
    dcc.Upload(
        id='upload-csv',
        children=html.Div(['Drag and Drop or ', html.A('Select CSV Files')]),
        style=UPLOAD_STYLE,
    ),
    html.Button('پردازش', id='process-button', n_clicks=0),
    # Dedicated target for process_files: Dash rejects two callbacks that
    # write to the same Output, so processing status gets its own element.
    html.Div(id='process-status'),
    dcc.Input(id='question-input', type='text', placeholder='سوال خود را وارد کنید'),
    html.Button('پاسخ', id='answer-button', n_clicks=0),
    html.Div(id='output-answer'),
])


# Callbacks
@app.callback(
    Output('output-answer', 'children'),
    [Input('answer-button', 'n_clicks')],
    [State('question-input', 'value'), State('process-button', 'n_clicks')],
)
def handle_question(n_clicks_answer, question, n_clicks_process):
    """Answer ``question`` from the indexed documents.

    Returns a user-facing message when the files have not been processed yet,
    when the question is blank, or when retrieval/generation fails; otherwise
    a ``Div`` holding the question and the model's answer.
    """
    if not n_clicks_answer or not n_clicks_process:
        return "لطفاً ابتدا فایل‌ها را پردازش کنید."

    llm, retriever = app.server.config.get('conversation_chain', (None, None))
    if llm is None or retriever is None:
        return "لطفاً ابتدا فایل‌ها را پردازش کنید."

    if not question or not question.strip():
        return "سوال خود را وارد کنید"

    try:
        documents = retriever.invoke(question)
        # The LLM takes a single prompt string, so the retrieved context is
        # folded into the prompt text.
        answer = llm.invoke(_build_prompt(question, documents))
    except Exception as e:
        return f"خطا در پردازش سوال: {str(e)}"

    return html.Div([
        html.P(f"سوال: {question}"),
        html.P(f"پاسخ: {answer}"),
    ])


@app.callback(
    Output('process-status', 'children'),
    [Input('process-button', 'n_clicks')],
    [State('upload-pdf', 'contents'), State('upload-csv', 'contents')],
)
def process_files(n_clicks, pdf_contents, csv_contents):
    """Index the uploaded PDF/CSV files and store the resulting chain.

    On success the ``(llm, retriever)`` pair is saved under
    ``app.server.config['conversation_chain']``, so it is shared by every
    request served by this process. Always returns a user-facing status
    message.
    """
    if not n_clicks:
        return "لطفاً فایل‌های مناسب را آپلود کنید."

    pdf_items = _as_list(pdf_contents)
    # upload-csv is not ``multiple``, so its contents arrive as one string.
    csv_items = _as_list(csv_contents)

    if not all(isinstance(item, str) for item in pdf_items):
        return "فرمت فایل PDF صحیح نیست. لطفاً دوباره تلاش کنید."

    texts = []
    for item in pdf_items:
        try:
            texts.append(get_pdf_text(io.BytesIO(_decode_upload(item))))
        except Exception as e:
            return f"خطا در پردازش فایل PDF: {str(e)}"

    for item in csv_items:
        try:
            texts.append(process_csv_data(io.BytesIO(_decode_upload(item))))
        except Exception as e:
            return f"خطا در پردازش فایل CSV: {str(e)}"

    # Newline-separated so the end of one file is not glued to the next.
    combined_text = "\n".join(texts)
    if not combined_text.strip():
        return "هیچ متنی برای پردازش یافت نشد."

    try:
        text_chunks = get_text_chunks(combined_text)
        vectorstore = get_vectorstore(text_chunks)
        conversation_chain = get_conversation_chain(vectorstore)
    except Exception as e:
        # Embedding/model failures are not ValueErrors; report them to the
        # user instead of letting the callback crash.
        return f"خطا در پردازش داده‌ها: {str(e)}"

    app.server.config['conversation_chain'] = conversation_chain
    return "پردازش تکمیل شد! اکنون می‌توانید سوالات خود را بپرسید."


if __name__ == '__main__':
    from waitress import serve

    serve(app.server, host="0.0.0.0", port=7860)