Spaces:

mehdi364
/

data_anilsys

Sleeping

File size: 6,201 Bytes

import os
from dash import Dash, dcc, html, Input, Output, State
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.llms import HuggingFaceHub
import pandas as pd

# Guarantee that the HuggingFace token variable exists in the process
# environment: an already exported value is kept, otherwise it becomes "".
os.environ.setdefault("HUGGINGFACEHUB_API_TOKEN", "")

# The single Dash application object; the layout and both callbacks in this
# file are registered on it, and its Flask server is what gets served.
app = Dash(name=__name__)

# Extract text from PDF files
def get_pdf_text(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: The PDF source handed straight to ``PdfReader``
            (presumably a path or a binary file-like object -- confirm
            against the PyPDF2 version in use).

    Returns:
        A single string with the extracted text of all pages. Pages for
        which ``extract_text()`` returns nothing contribute an empty string.

    Raises:
        ValueError: If opening the PDF or extracting any page fails. The
            original exception is attached as ``__cause__``.
    """
    try:
        pdf_reader = PdfReader(pdf_file)
        # "or ''" guards against extract_text() returning None/empty.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)
    except Exception as e:
        # Chain the cause so the real failure stays visible in tracebacks.
        raise ValueError(f"Error processing PDF file: {e}") from e

# Split text into smaller chunks
def get_text_chunks(text):
    """Split ``text`` into a list of chunks with ``CharacterTextSplitter``.

    The splitter is configured with a newline separator, a chunk size of
    1000, an overlap of 100 and ``len`` as the length function.

    Args:
        text: The full text to split.

    Returns:
        Whatever ``CharacterTextSplitter.split_text`` returns for ``text``.
    """
    splitter_settings = {
        "separator": "\n",
        "chunk_size": 1000,
        "chunk_overlap": 100,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_settings).split_text(text)

# Create a vector store from text chunks
def get_vectorstore(text_chunks):
    """Embed the text chunks and index them in a FAISS vector store.

    Args:
        text_chunks: Non-empty sequence of strings to embed.

    Returns:
        The object returned by ``FAISS.from_texts`` for the given chunks.

    Raises:
        ValueError: If ``text_chunks`` is empty or ``None``.
    """
    if not text_chunks:
        raise ValueError("No text chunks provided for vectorstore creation.")

    # The embeddings class is not imported at the top of the file, so it is
    # imported here; without this the call below raises NameError. It sits
    # after the guard so invalid input is rejected before the import runs.
    from langchain_community.embeddings import HuggingFaceBgeEmbeddings

    # NOTE(review): this is a sentence-transformers model, not a BGE model;
    # the plain HuggingFaceEmbeddings class may be the better fit -- confirm.
    model = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    embeddings = HuggingFaceBgeEmbeddings(
        model_name=model,
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": True},
    )
    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)

# Create a conversational retrieval chain
def get_conversation_chain(vectorstore):
    """Build the LLM / retriever pair used to answer questions.

    Args:
        vectorstore: An initialised vector store exposing ``as_retriever()``.

    Returns:
        A ``(llm, retriever)`` tuple: a ``HuggingFaceHub`` LLM for the
        ``google/gemma-7b`` repo and ``vectorstore.as_retriever()``.

    Raises:
        ValueError: If ``vectorstore`` is falsy.
    """
    if vectorstore:
        generation_settings = {"temperature": 0.1, "max_length": 2048}
        hub_llm = HuggingFaceHub(
            repo_id="google/gemma-7b",
            model_kwargs=generation_settings,
        )
        return hub_llm, vectorstore.as_retriever()

    raise ValueError("Vectorstore is not initialized.")

# Process CSV data
def process_csv_data(csv_file):
    """Flatten a CSV into one space-separated string of its cell values.

    Every cell is converted with ``astype(str)``; cells are joined with a
    single space inside a row, and rows are joined with a single space.
    Column headers are not part of the result.

    Args:
        csv_file: The CSV source handed straight to ``pandas.read_csv``.

    Returns:
        The flattened text, or ``""`` when the CSV has a header but no rows.

    Raises:
        ValueError: If reading or flattening the CSV fails. The original
            exception is attached as ``__cause__``.
    """
    try:
        df = pd.read_csv(csv_file)
        # Plain tuples of strings, one per row. Joining in Python (instead of
        # DataFrame.apply(axis=1)) also works for a frame with zero rows,
        # where apply() hands back a DataFrame that has no ``.str`` accessor.
        rows = df.astype(str).itertuples(index=False, name=None)
        return " ".join(" ".join(row) for row in rows)
    except Exception as e:
        # Chain the cause so the real failure stays visible in tracebacks.
        raise ValueError(f"Error processing CSV file: {e}") from e

# Layout
# Appearance shared by the two dashed drag-and-drop upload boxes.
_upload_box_style = {
    'width': '100%', 'height': '60px', 'lineHeight': '60px',
    'borderWidth': '1px', 'borderStyle': 'dashed', 'borderRadius': '5px',
    'textAlign': 'center', 'margin': '10px'
}

# Page content, top to bottom: title, PDF upload (multiple files allowed),
# CSV upload, the "process" button, the question box, the "answer" button
# and the area that both callbacks write their result into.
_page_children = [
    html.H1("Chat Bot برای فایل‌های PDF و CSV 📚"),
    dcc.Upload(
        id='upload-pdf',
        children=html.Div(['Drag and Drop or ', html.A('Select PDF Files')]),
        style=dict(_upload_box_style),
        multiple=True
    ),
    dcc.Upload(
        id='upload-csv',
        children=html.Div(['Drag and Drop or ', html.A('Select CSV Files')]),
        style=dict(_upload_box_style),
    ),
    html.Button('پردازش', id='process-button', n_clicks=0),
    dcc.Input(id='question-input', type='text', placeholder='سوال خود را وارد کنید'),
    html.Button('پاسخ', id='answer-button', n_clicks=0),
    html.Div(id='output-answer'),
]

app.layout = html.Div(_page_children)

# Callbacks
@app.callback(
    # 'output-answer.children' is also written by the file-processing
    # callback. Dash rejects two plain Outputs on the same property, so this
    # one is declared as an allowed duplicate (requires Dash >= 2.9).
    Output('output-answer', 'children', allow_duplicate=True),
    [Input('answer-button', 'n_clicks')],
    [State('question-input', 'value'), State('process-button', 'n_clicks')],
    # Dash requires this whenever allow_duplicate=True is used.
    prevent_initial_call=True,
)
def handle_question(n_clicks_answer, question, n_clicks_process):
    """Answer the user's question from the previously processed files.

    Reads the ``(llm, retriever)`` pair stored under
    ``app.server.config['conversation_chain']``, retrieves the documents
    relevant to the question, and asks the LLM to answer from them.

    Args:
        n_clicks_answer: Click count of the answer button (callback trigger).
        question: Current text of the question input; may be ``None``.
        n_clicks_process: Click count of the process button.

    Returns:
        A ``html.Div`` with the question and the answer on success, otherwise
        a user-facing message string (files not processed yet, empty
        question, or an error description).
    """
    not_ready_message = "لطفاً ابتدا فایل‌ها را پردازش کنید."

    # Neither button pressed yet (0 or None) -> nothing to answer from.
    if not n_clicks_answer or not n_clicks_process:
        return not_ready_message

    llm, retriever = app.server.config.get('conversation_chain', (None, None))
    if not llm or not retriever:
        return not_ready_message

    # An untouched input arrives as None; do not send that to the retriever.
    if not question or not question.strip():
        return "لطفاً سوال خود را وارد کنید."

    try:
        documents = retriever.invoke(question)
        context = "\n\n".join(doc.page_content for doc in documents)
        # The LLM takes a plain prompt string, so the retrieved passages and
        # the question are rendered into one.
        prompt = (
            "Answer the question using only the context below.\n\n"
            f"Context:\n{context}\n\n"
            f"Question: {question}\n"
            "Answer:"
        )
        answer = llm.invoke(prompt)
    except Exception as e:
        # Callback boundary: report the failure to the user instead of a 500.
        return f"خطا در پردازش سوال: {str(e)}"

    return html.Div([
        html.P(f"سوال: {question}"),
        html.P(f"پاسخ: {answer}")
    ])

def _decode_upload(content):
    """Turn one ``dcc.Upload`` contents string into an in-memory binary file.

    The code expects the form ``"<header>,<base64 payload>"``; everything
    after the first comma is base64-decoded.

    Raises:
        ValueError: If there is no comma-separated payload or the payload is
            not valid base64 (``binascii.Error`` is a ``ValueError``).
    """
    import base64
    import io

    _header, separator, payload = content.partition(",")
    if not separator or not payload:
        raise ValueError("Upload content has no base64 payload.")
    return io.BytesIO(base64.b64decode(payload))


def _as_content_list(contents):
    """Normalise an Upload ``contents`` value to a list of strings.

    Returns ``[]`` for no upload, a one-element list for a single string (an
    Upload without ``multiple=True``), the list itself when every item is a
    string, and ``None`` when the value has an unexpected shape.
    """
    if not contents:
        return []
    if isinstance(contents, str):
        return [contents]
    if isinstance(contents, list) and all(isinstance(item, str) for item in contents):
        return contents
    return None


@app.callback(
    Output('output-answer', 'children'),
    [Input('process-button', 'n_clicks')],
    [State('upload-pdf', 'contents'), State('upload-csv', 'contents')]
)
def process_files(n_clicks, pdf_contents, csv_contents):
    """Index the uploaded PDF/CSV files so questions can be answered.

    Decodes every upload, extracts its text, splits the combined text into
    chunks, builds the vector store and stores the resulting
    ``(llm, retriever)`` pair in ``app.server.config['conversation_chain']``.

    Args:
        n_clicks: Click count of the process button (callback trigger).
        pdf_contents: ``contents`` of the PDF upload (list of strings, a
            single string, or ``None``).
        csv_contents: ``contents`` of the CSV upload (same shapes).

    Returns:
        A user-facing status or error message string.
    """
    if not n_clicks:
        return "لطفاً فایل‌های مناسب را آپلود کنید."

    pdf_items = _as_content_list(pdf_contents)
    if pdf_items is None:
        return "فرمت فایل PDF صحیح نیست. لطفاً دوباره تلاش کنید."

    # The CSV upload is not ``multiple``, so its contents is one string;
    # iterating that directly would walk it character by character.
    csv_items = _as_content_list(csv_contents)
    if csv_items is None:
        return "فرمت فایل CSV صحیح نیست. لطفاً دوباره تلاش کنید."

    texts = []

    for content in pdf_items:
        try:
            texts.append(get_pdf_text(_decode_upload(content)))
        except Exception as e:
            return f"خطا در پردازش فایل PDF: {str(e)}"

    for content in csv_items:
        try:
            texts.append(process_csv_data(_decode_upload(content)))
        except Exception as e:
            return f"خطا در پردازش فایل CSV: {str(e)}"

    # A newline between files keeps the last word of one file from fusing
    # with the first word of the next.
    combined_text = "\n".join(texts)
    if not combined_text.strip():
        return "هیچ متنی برای پردازش یافت نشد."

    try:
        text_chunks = get_text_chunks(combined_text)
        vectorstore = get_vectorstore(text_chunks)
        conversation_chain = get_conversation_chain(vectorstore)
    except Exception as e:
        # Callback boundary: model/embedding failures are not only
        # ValueError, and should reach the user as a message, not a 500.
        return f"خطا در پردازش داده‌ها: {str(e)}"

    app.server.config['conversation_chain'] = conversation_chain
    return "پردازش تکمیل شد! اکنون می‌توانید سوالات خود را بپرسید."

if __name__ == '__main__':
    # Only when run as a script: serve the Dash app's underlying server with
    # waitress on all interfaces, port 7860.
    import waitress

    waitress.serve(app.server, host="0.0.0.0", port=7860)