import os # Dynamo 완전 비활성화 os.environ["TORCH_DYNAMO_DISABLE"] = "1" import torch import torch._dynamo import gradio as gr import spaces from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer from threading import Thread import random from datasets import load_dataset import numpy as np from sklearn.feature_extraction.text import TfidfVectorizer import pandas as pd from typing import List, Tuple import json from datetime import datetime import pyarrow.parquet as pq import pypdf import io import pyarrow.parquet as pq from tabulate import tabulate import platform import subprocess import pytesseract from pdf2image import convert_from_path # -------------------- 추가: PDF to Markdown 변환 관련 import -------------------- import re import requests from bs4 import BeautifulSoup import urllib.request import ocrmypdf import pytz import urllib.parse from pypdf import PdfReader # --------------------------------------------------------------------------- # -------------------- # 1) Dynamo suppress_errors 옵션 사용 (오류 시 eager로 fallback) # -------------------- torch._dynamo.config.suppress_errors = True # 전역 변수 current_file_context = None # 환경 변수 설정 HF_TOKEN = os.environ.get("HF_TOKEN", None) MODEL_ID = "CohereForAI/c4ai-command-r7b-12-2024" MODELS = os.environ.get("MODELS") MODEL_NAME = MODEL_ID.split("/")[-1] model = None # 전역 변수로 선언 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) # 위키피디아 데이터셋 로드 wiki_dataset = load_dataset("lcw99/wikipedia-korean-20240501-1million-qna") print("Wikipedia dataset loaded:", wiki_dataset) # TF-IDF 벡터라이저 초기화 및 학습 print("TF-IDF 벡터화 시작...") questions = wiki_dataset['train']['question'][:10000] # 처음 10000개만 사용 vectorizer = TfidfVectorizer(max_features=1000) question_vectors = vectorizer.fit_transform(questions) print("TF-IDF 벡터화 완료") class ChatHistory: def __init__(self): self.history = [] self.history_file = "/tmp/chat_history.json" self.load_history() def add_conversation(self, user_msg: str, assistant_msg: str): conversation = { "timestamp": datetime.now().isoformat(), "messages": [ {"role": "user", "content": user_msg}, {"role": "assistant", "content": assistant_msg} ] } self.history.append(conversation) self.save_history() def format_for_display(self): formatted = [] for conv in self.history: formatted.append([ conv["messages"][0]["content"], conv["messages"][1]["content"] ]) return formatted def get_messages_for_api(self): messages = [] for conv in self.history: messages.extend([ {"role": "user", "content": conv["messages"][0]["content"]}, {"role": "assistant", "content": conv["messages"][1]["content"]} ]) return messages def clear_history(self): self.history = [] self.save_history() def save_history(self): try: with open(self.history_file, 'w', encoding='utf-8') as f: json.dump(self.history, f, ensure_ascii=False, indent=2) except Exception as e: print(f"히스토리 저장 실패: {e}") def load_history(self): try: if os.path.exists(self.history_file): with open(self.history_file, 'r', encoding='utf-8') as f: self.history = json.load(f) except Exception as e: print(f"히스토리 로드 실패: {e}") self.history = [] # 전역 ChatHistory 인스턴스 생성 chat_history = ChatHistory() def find_relevant_context(query, top_k=3): # 쿼리 벡터화 query_vector = vectorizer.transform([query]) # 코사인 유사도 계산 similarities = (query_vector * question_vectors.T).toarray()[0] # 가장 유사한 질문들의 인덱스 top_indices = np.argsort(similarities)[-top_k:][::-1] # 관련 컨텍스트 추출 relevant_contexts = [] for idx in top_indices: if similarities[idx] > 0: relevant_contexts.append({ 'question': questions[idx], 'answer': wiki_dataset['train']['answer'][idx], 'similarity': similarities[idx] }) return relevant_contexts def init_msg(): return "파일을 분석하고 있습니다..." # -------------------- PDF 파일을 Markdown으로 변환하는 유틸 함수들 -------------------- def extract_text_from_pdf(reader: PdfReader) -> str: """ PyPDF를 사용해 모든 페이지 텍스트를 추출. 만약 텍스트가 없으면 빈 문자열 반환. """ full_text = "" for idx, page in enumerate(reader.pages): text = page.extract_text() or "" if len(text) > 0: full_text += f"---- Page {idx+1} ----\n" + text + "\n\n" return full_text.strip() def convert_pdf_to_markdown(pdf_file: str): """ PDF 파일을 읽고 텍스트를 추출한 뒤, 이미지가 많고 텍스트가 적은 경우에는 OCR을 시도한다. 최종적으로 Markdown 형식으로 변환 가능한 텍스트를 반환한다. 메타데이터도 함께 반환. """ try: reader = PdfReader(pdf_file) except Exception as e: return f"PDF 파일을 읽는 중 오류 발생: {e}", None, None # Extract metadata raw_meta = reader.metadata metadata = { "author": raw_meta.author if raw_meta else None, "creator": raw_meta.creator if raw_meta else None, "producer": raw_meta.producer if raw_meta else None, "subject": raw_meta.subject if raw_meta else None, "title": raw_meta.title if raw_meta else None, } # Extract text full_text = extract_text_from_pdf(reader) # 이미지가 많고 텍스트가 너무 짧으면 OCR 시도 image_count = 0 for page in reader.pages: image_count += len(page.images) if image_count > 0 and len(full_text) < 1000: try: out_pdf_file = pdf_file.replace(".pdf", "_ocr.pdf") ocrmypdf.ocr(pdf_file, out_pdf_file, force_ocr=True) # Re-extract text from OCR-processed PDF reader_ocr = PdfReader(out_pdf_file) full_text = extract_text_from_pdf(reader_ocr) except Exception as e: full_text = f"OCR 처리 중 오류 발생: {e}\n\n원본 PDF 텍스트:\n\n" + full_text return full_text, metadata, pdf_file # --------------------------------------------------------------------------- def analyze_file_content(content, file_type): """파일 내용을 간단히 분석한 후 구조 요약을 반환.""" if file_type in ['parquet', 'csv']: try: lines = content.split('\n') header = lines[0] columns = header.count('|') - 1 rows = len(lines) - 3 return f"📊 Dataset Structure: {columns} columns, {rows} rows" except: return "❌ Failed to analyze dataset structure" lines = content.split('\n') total_lines = len(lines) non_empty_lines = len([line for line in lines if line.strip()]) if any(keyword in content.lower() for keyword in ['def ', 'class ', 'import ', 'function']): functions = len([line for line in lines if 'def ' in line]) classes = len([line for line in lines if 'class ' in line]) imports = len([line for line in lines if 'import ' in line or 'from ' in line]) return f"💻 Code Structure: {total_lines} lines (Functions: {functions}, Classes: {classes}, Imports: {imports})" paragraphs = content.count('\n\n') + 1 words = len(content.split()) return f"📝 Document Structure: {total_lines} lines, {paragraphs} paragraphs, approximately {words} words" def read_uploaded_file(file): """ 업로드된 파일을 처리하여 1) 파일 타입별로 내용을 읽고 2) 분석 결과와 함께 반환 """ if file is None: return "", "" try: file_ext = os.path.splitext(file.name)[1].lower() # Parquet if file_ext == '.parquet': try: table = pq.read_table(file.name) df = table.to_pandas() content = f"📊 Parquet File Analysis:\n\n" content += f"1. Basic Information:\n" content += f"- Total Rows: {len(df):,}\n" content += f"- Total Columns: {len(df.columns)}\n" content += f"- Memory Usage: {df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB\n\n" content += f"2. Column Information:\n" for col in df.columns: content += f"- {col} ({df[col].dtype})\n" content += f"\n3. Data Preview:\n" content += tabulate(df.head(5), headers='keys', tablefmt='pipe', showindex=False) content += f"\n\n4. Missing Values:\n" null_counts = df.isnull().sum() for col, count in null_counts[null_counts > 0].items(): content += f"- {col}: {count:,} ({count/len(df)*100:.1f}%)\n" numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns if len(numeric_cols) > 0: content += f"\n5. Numeric Column Statistics:\n" stats_df = df[numeric_cols].describe() content += tabulate(stats_df, headers='keys', tablefmt='pipe') return content, "parquet" except Exception as e: return f"Error reading Parquet file: {str(e)}", "error" # PDF (Markdown 변환) if file_ext == '.pdf': try: markdown_text, metadata, processed_pdf_path = convert_pdf_to_markdown(file.name) if metadata is None: return f"PDF 파일 변환 오류 또는 읽기 실패.\n\n원본 메시지:\n{markdown_text}", "error" content = "# PDF to Markdown Conversion\n\n" content += "## Metadata\n" for k, v in metadata.items(): content += f"**{k.capitalize()}**: {v}\n\n" content += "## Extracted Text\n\n" content += markdown_text return content, "pdf" except Exception as e: return f"Error reading PDF file: {str(e)}", "error" # CSV elif file_ext == '.csv': encodings = ['utf-8', 'cp949', 'euc-kr', 'latin1'] for encoding in encodings: try: df = pd.read_csv(file.name, encoding=encoding) content = f"📊 CSV File Analysis:\n\n" content += f"1. Basic Information:\n" content += f"- Total Rows: {len(df):,}\n" content += f"- Total Columns: {len(df.columns)}\n" content += f"- Memory Usage: {df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB\n\n" content += f"2. Column Information:\n" for col in df.columns: content += f"- {col} ({df[col].dtype})\n" content += f"\n3. Data Preview:\n" content += df.head(5).to_markdown(index=False) content += f"\n\n4. Missing Values:\n" null_counts = df.isnull().sum() for col, count in null_counts[null_counts > 0].items(): content += f"- {col}: {count:,} ({count/len(df)*100:.1f}%)\n" return content, "csv" except UnicodeDecodeError: continue raise UnicodeDecodeError(f"Unable to read file with supported encodings ({', '.join(encodings)})") # 일반 텍스트 파일 else: encodings = ['utf-8', 'cp949', 'euc-kr', 'latin1'] for encoding in encodings: try: with open(file.name, 'r', encoding=encoding) as f: content = f.read() lines = content.split('\n') total_lines = len(lines) non_empty_lines = len([line for line in lines if line.strip()]) is_code = any(keyword in content.lower() for keyword in ['def ', 'class ', 'import ', 'function']) analysis = f"\n📝 File Analysis:\n" if is_code: functions = len([line for line in lines if 'def ' in line]) classes = len([line for line in lines if 'class ' in line]) imports = len([line for line in lines if 'import ' in line or 'from ' in line]) analysis += f"- File Type: Code\n" analysis += f"- Total Lines: {total_lines:,}\n" analysis += f"- Functions: {functions}\n" analysis += f"- Classes: {classes}\n" analysis += f"- Import Statements: {imports}\n" else: words = len(content.split()) chars = len(content) analysis += f"- File Type: Text\n" analysis += f"- Total Lines: {total_lines:,}\n" analysis += f"- Non-empty Lines: {non_empty_lines:,}\n" analysis += f"- Word Count: {words:,}\n" analysis += f"- Character Count: {chars:,}\n" return content + analysis, "text" except UnicodeDecodeError: continue raise UnicodeDecodeError(f"Unable to read file with supported encodings ({', '.join(encodings)})") except Exception as e: return f"Error reading file: {str(e)}", "error" CSS = """ /* 3D 스타일 CSS */ :root { --primary-color: #2196f3; --secondary-color: #1976d2; --background-color: #f0f2f5; --card-background: #ffffff; --text-color: #333333; --shadow-color: rgba(0, 0, 0, 0.1); } body { background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%); min-height: 100vh; font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; } .container { transform-style: preserve-3d; perspective: 1000px; } .chatbot { background: var(--card-background); border-radius: 20px; box-shadow: 0 10px 20px var(--shadow-color), 0 6px 6px var(--shadow-color); transform: translateZ(0); transition: transform 0.3s ease; backdrop-filter: blur(10px); } .chatbot:hover { transform: translateZ(10px); } /* 메시지 입력 영역 */ .input-area { background: var(--card-background); border-radius: 15px; padding: 15px; margin-top: 20px; box-shadow: 0 5px 15px var(--shadow-color), 0 3px 3px var(--shadow-color); transform: translateZ(0); transition: all 0.3s ease; display: flex; align-items: center; gap: 10px; } .input-area:hover { transform: translateZ(5px); } /* 버튼 스타일 */ .custom-button { background: linear-gradient(145deg, var(--primary-color), var(--secondary-color)); color: white; border: none; border-radius: 10px; padding: 10px 20px; font-weight: 600; cursor: pointer; transform: translateZ(0); transition: all 0.3s ease; box-shadow: 0 4px 6px var(--shadow-color), 0 1px 3px var(--shadow-color); } .custom-button:hover { transform: translateZ(5px) translateY(-2px); box-shadow: 0 7px 14px var(--shadow-color), 0 3px 6px var(--shadow-color); } /* 파일 업로드 버튼 */ .file-upload-icon { background: linear-gradient(145deg, #64b5f6, #42a5f5); color: white; border-radius: 8px; font-size: 2em; cursor: pointer; display: flex; align-items: center; justify-content: center; height: 70px; width: 70px; transition: all 0.3s ease; box-shadow: 0 2px 5px rgba(0,0,0,0.1); } .file-upload-icon:hover { transform: translateY(-2px); box-shadow: 0 4px 8px rgba(0,0,0,0.2); } /* 파일 업로드 버튼 내부 요소 스타일링 */ .file-upload-icon > .wrap { display: flex !important; align-items: center; justify-content: center; width: 100%; height: 100%; } .file-upload-icon > .wrap > p { display: none !important; } .file-upload-icon > .wrap::before { content: "📁"; font-size: 2em; display: block; } /* 메시지 스타일 */ .message { background: var(--card-background); border-radius: 15px; padding: 15px; margin: 10px 0; box-shadow: 0 4px 6px var(--shadow-color), 0 1px 3px var(--shadow-color); transform: translateZ(0); transition: all 0.3s ease; } .message:hover { transform: translateZ(5px); } .chat-container { height: 600px !important; margin-bottom: 10px; } .input-container { height: 70px !important; display: flex; align-items: center; gap: 10px; margin-top: 5px; } .input-textbox { height: 70px !important; border-radius: 8px !important; font-size: 1.1em !important; padding: 10px 15px !important; display: flex !important; align-items: flex-start !important; } .input-textbox textarea { padding-top: 5px !important; } .send-button { height: 70px !important; min-width: 70px !important; font-size: 1.1em !important; } /* 설정 패널 기본 스타일 */ .settings-panel { padding: 20px; margin-top: 20px; } """ def clear_cuda_memory(): if hasattr(torch.cuda, 'empty_cache'): with torch.cuda.device('cuda'): torch.cuda.empty_cache() @spaces.GPU def load_model(): try: loaded_model = AutoModelForCausalLM.from_pretrained( MODEL_ID, torch_dtype=torch.bfloat16, device_map="auto", ) return loaded_model except Exception as e: print(f"모델 로드 오류: {str(e)}") raise def _truncate_tokens_for_context(input_ids_str: str, desired_input_length: int) -> str: """ 입력 문자열이 desired_input_length 토큰을 넘으면, 앞부분(오래된 컨텍스트)을 잘라내는 함수. """ tokens = input_ids_str.split() if len(tokens) > desired_input_length: # 가장 오래된 부분을 버리고, 뒤에서 desired_input_length만 남김 tokens = tokens[-desired_input_length:] return " ".join(tokens) # build_prompt 함수: 대화 내역을 문자열로 변환 def build_prompt(conversation: list) -> str: """ conversation은 각 항목이 {"role": "user" 또는 "assistant", "content": ...} 형태의 딕셔너리 목록입니다. 이를 단순 텍스트 프롬프트로 변환합니다. """ prompt = "" for msg in conversation: if msg["role"] == "user": prompt += "User: " + msg["content"] + "\n" elif msg["role"] == "assistant": prompt += "Assistant: " + msg["content"] + "\n" # 마지막에 어시스턴트 응답을 기대하도록 추가 prompt += "Assistant: " return prompt @spaces.GPU def stream_chat( message: str, history: list, uploaded_file, temperature: float, max_new_tokens: int, top_p: float, top_k: int, penalty: float ): global model, current_file_context try: if model is None: model = load_model() print(f'message is - {message}') print(f'history is - {history}') # 파일 업로드 처리 file_context = "" if uploaded_file and message == "파일을 분석하고 있습니다...": # 새로운 파일 업로드 시에는 기존 메모리 컨텍스트 초기화 current_file_context = None try: content, file_type = read_uploaded_file(uploaded_file) if content: file_analysis = analyze_file_content(content, file_type) file_context = ( f"\n\n📄 파일 분석 결과:\n{file_analysis}" f"\n\n파일 내용:\n```\n{content}\n```" ) current_file_context = file_context # 파일 컨텍스트 저장 message = "업로드된 파일을 분석해주세요." except Exception as e: print(f"파일 분석 오류: {str(e)}") file_context = f"\n\n❌ 파일 분석 중 오류가 발생했습니다: {str(e)}" elif current_file_context: # 이미 업로드된 파일 컨텍스트가 있다면 사용 file_context = current_file_context # 메모리 사용량 모니터링 if torch.cuda.is_available(): print(f"CUDA 메모리 사용량: {torch.cuda.memory_allocated() / 1024**2:.2f} MB") # 대화 히스토리가 너무 길면 잘라내기 max_history_length = 10 if len(history) > max_history_length: history = history[-max_history_length:] # 위키 컨텍스트 찾기 try: relevant_contexts = find_relevant_context(message) wiki_context = "\n\n관련 위키피디아 정보:\n" for ctx in relevant_contexts: wiki_context += ( f"Q: {ctx['question']}\n" f"A: {ctx['answer']}\n" f"유사도: {ctx['similarity']:.3f}\n\n" ) except Exception as e: print(f"컨텍스트 검색 오류: {str(e)}") wiki_context = "" # 대화 히스토리 구성 conversation = [] for prompt, answer in history: conversation.extend([ {"role": "user", "content": prompt}, {"role": "assistant", "content": answer} ]) # 최종 프롬프트 구성 final_message = file_context + wiki_context + "\n현재 질문: " + message conversation.append({"role": "user", "content": final_message}) # build_prompt 사용 (기존 tokenizer.apply_chat_template 대신) input_ids_str = build_prompt(conversation) # 먼저 6000 토큰 이내로 잘라주기 (임의의 수치, 필요에 따라 조정 가능) input_ids_str = _truncate_tokens_for_context(input_ids_str, 6000) inputs = tokenizer(input_ids_str, return_tensors="pt").to("cuda") # 최대 컨텍스트 8192 고려하여, 남은 자리가 적으면 max_new_tokens 줄이기 max_context = 8192 input_length = inputs["input_ids"].shape[1] remaining = max_context - input_length # 최소 128 토큰 정도는 생성할 수 있게 만들고 싶다면, # remaining이 128 미만이면, 추가로 input을 더 잘라낸다. min_generation = 128 if remaining < min_generation: # 더 잘라서 충분한 출력 토큰 확보 must_cut = min_generation - remaining # 몇 토큰만큼 더 자를지 new_desired_input_length = max(1, input_length - must_cut) print(f"[주의] 입력이 너무 길어 {must_cut}토큰 더 제거하여, input_length={input_length} -> {new_desired_input_length} 재조정") # 문자열 다시 만들어서 tokenizer input_ids_str = _truncate_tokens_for_context(input_ids_str, new_desired_input_length) inputs = tokenizer(input_ids_str, return_tensors="pt").to("cuda") input_length = inputs["input_ids"].shape[1] remaining = max_context - input_length # 최종적으로 (input + max_new_tokens) <= 8192 되도록 if remaining < max_new_tokens: print(f"[주의] 입력 토큰이 많아 max_new_tokens={max_new_tokens} -> {remaining}로 조정합니다.") max_new_tokens = remaining if max_new_tokens < 1: # 그래도 1 미만이면 1 토큰만 생성 max_new_tokens = 1 if torch.cuda.is_available(): print(f"입력 텐서 생성 후 CUDA 메모리: {torch.cuda.memory_allocated() / 1024**2:.2f} MB") streamer = TextIteratorStreamer( tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True ) generate_kwargs = dict( **inputs, streamer=streamer, top_k=top_k, top_p=top_p, repetition_penalty=penalty, max_new_tokens=max_new_tokens, do_sample=True, temperature=temperature, eos_token_id=255001, # 수정: 리스트 대신 정수형 사용 ) # 생성 시작 전 메모리 정리 clear_cuda_memory() thread = Thread(target=model.generate, kwargs=generate_kwargs) thread.start() buffer = "" for new_text in streamer: buffer += new_text yield "", history + [[message, buffer]] # 생성 완료 후 메모리 정리 clear_cuda_memory() except Exception as e: error_message = f"오류가 발생했습니다: {str(e)}" print(f"Stream chat 오류: {error_message}") clear_cuda_memory() yield "", history + [[message, error_message]] def create_demo(): with gr.Blocks(css=CSS) as demo: with gr.Column(elem_classes="markdown-style"): gr.Markdown(""" # 🤖 RAGOndevice #### 📊 RAG: Upload and Analyze Files (TXT, CSV, PDF, Parquet files) Upload your files for data analysis and learning """) chatbot = gr.Chatbot( value=[], height=600, label="GiniGEN AI Assistant", elem_classes="chat-container" ) with gr.Row(elem_classes="input-container"): with gr.Column(scale=1, min_width=70): file_upload = gr.File( type="filepath", elem_classes="file-upload-icon", scale=1, container=True, interactive=True, show_label=False ) with gr.Column(scale=3): msg = gr.Textbox( show_label=False, placeholder="Type your message here... 💭", container=False, elem_classes="input-textbox", scale=1 ) with gr.Column(scale=1, min_width=70): send = gr.Button( "Send", elem_classes="send-button custom-button", scale=1 ) with gr.Column(scale=1, min_width=70): clear = gr.Button( "Clear", elem_classes="clear-button custom-button", scale=1 ) with gr.Accordion("🎮 Advanced Settings", open=False): with gr.Row(): with gr.Column(scale=1): temperature = gr.Slider( minimum=0, maximum=1, step=0.1, value=0.8, label="Creativity Level 🎨" ) max_new_tokens = gr.Slider( minimum=128, maximum=8000, step=1, value=4000, label="Maximum Token Count 📝" ) with gr.Column(scale=1): top_p = gr.Slider( minimum=0.0, maximum=1.0, step=0.1, value=0.8, label="Diversity Control 🎯" ) top_k = gr.Slider( minimum=1, maximum=20, step=1, value=20, label="Selection Range 📊" ) penalty = gr.Slider( minimum=0.0, maximum=2.0, step=0.1, value=1.0, label="Repetition Penalty 🔄" ) gr.Examples( examples=[ ["Please analyze this code and suggest improvements:\ndef fibonacci(n):\n if n <= 1: return n\n return fibonacci(n-1) + fibonacci(n-2)"], ["Please analyze this data and provide insights:\nAnnual Revenue (Million)\n2019: 1200\n2020: 980\n2021: 1450\n2022: 2100\n2023: 1890"], ["Please solve this math problem step by step: 'When a circle's area is twice that of its inscribed square, find the relationship between the circle's radius and the square's side length.'"], ["Please analyze this marketing campaign's ROI and suggest improvements:\nTotal Cost: $50,000\nReach: 1M users\nClick Rate: 2.3%\nConversion Rate: 0.8%\nAverage Purchase: $35"], ], inputs=msg ) def clear_conversation(): global current_file_context current_file_context = None return [], None, "Start a new conversation..." # 이벤트 연결 msg.submit( stream_chat, inputs=[msg, chatbot, file_upload, temperature, max_new_tokens, top_p, top_k, penalty], outputs=[msg, chatbot] ) send.click( stream_chat, inputs=[msg, chatbot, file_upload, temperature, max_new_tokens, top_p, top_k, penalty], outputs=[msg, chatbot] ) file_upload.change( fn=init_msg, outputs=msg, queue=False ).then( fn=stream_chat, inputs=[msg, chatbot, file_upload, temperature, max_new_tokens, top_p, top_k, penalty], outputs=[msg, chatbot], queue=True ) clear.click( fn=clear_conversation, outputs=[chatbot, file_upload, msg], queue=False ) return demo if __name__ == "__main__": demo = create_demo() demo.launch()