Spaces:

siyux1927
/

slides-comprehension

Sleeping

App Files Files Community

SonyaX20 committed on Feb 4

Commit

cc4ded4

1 Parent(s): 581381b

new

Browse files

Files changed (4) hide show

.gitignore +4 -1
README.md +11 -8
app.py +101 -17
requirements.txt +3 -2

.gitignore CHANGED Viewed

@@ -19,4 +19,7 @@ temp_image.png
 # Distribution / packaging
 dist/
 build/
-*.egg-info/

 # Distribution / packaging
 dist/
 build/
+*.egg-info/
+# Model cache
+models/

README.md CHANGED Viewed

@@ -43,11 +43,14 @@ MIT License
 ## Hugging Face Spaces 部署说明
 1. Fork 这个项目到你的 Hugging Face Space
-2. 在 Space 设置中添加 Secret：
-   - 进入 Space 设置页面
-   - 找到 "Repository Secrets" 部分
-   - 添加新的 secret：
-     - 名称：`OPENAI_API_KEY`
-     - 值：你的 OpenAI API Key
-注意：请勿在代码中直接硬编码 API Key，务必使用 Secrets 功能进行配置。

 ## Hugging Face Spaces 部署说明
 1. Fork 这个项目到你的 Hugging Face Space
+2. 在 Space 设置中：
+   - Hardware: 选择 CPU (免费) 或 GPU (付费)
+   - Python packages: 确保所有依赖都已列在 requirements.txt 中
+3. 添加 Repository Secrets：
+   - 名称：`OPENAI_API_KEY`
+   - 值：你的 OpenAI API Key
+注意：
+- 首次运行时会下载必要的模型文件，可能需要几分钟
+- CPU 模式下识别速度较慢，但功能完整
+- 如果需要更快的识别速度，建议使用 GPU 环境

app.py CHANGED Viewed

@@ -10,24 +10,89 @@ import torch
 # 加载环境变量
 load_dotenv()
-# 修改初始化 OpenAI 客户端的方式
 try:
     openai_api_key = os.getenv('OPENAI_API_KEY')
     if not openai_api_key:
-        raise ValueError("No OpenAI API key found")
     client = OpenAI(api_key=openai_api_key)
 except Exception as e:
     print(f"Error initializing OpenAI client: {str(e)}")
     raise
-# 检查是否有 GPU
-device = 'cuda' if torch.cuda.is_available() else 'cpu'
 print(f"Running on device: {device}")
-# 初始化 EasyOCR（添加进度提示）
-print("Initializing EasyOCR and loading models...")
-reader = easyocr.Reader(['ch_sim', 'en'], gpu=(device=='cuda'), download_enabled=True, verbose=True)
-print("EasyOCR initialization completed!")
 def process_image(image):
     """处理上传的图片并返回识别结果和分析"""
@@ -59,11 +124,15 @@ def extract_text_from_image(image):
             image.save(image_path)
         print("开始识别文字...")
         # 使用 EasyOCR 识别文字
         result = reader.readtext(
             image_path,
             detail=1,
-            paragraph=True  # 尝试将相近的文本组合成段落
         )
         print("文字识别完成")
@@ -71,18 +140,32 @@ def extract_text_from_image(image):
         if image_path == "temp_image.png" and os.path.exists(image_path):
             os.remove(image_path)
-        # 按照位置排序并组织文字
         sorted_text = []
-        for (bbox, text, prob) in result:
-            if prob > 0.5:  # 只保留置信度大于 0.5 的结果
-                sorted_text.append(text)
         final_text = ' '.join(sorted_text)
         if not final_text.strip():
             return "未能识别到清晰的文字，请尝试上传更清晰的图片"
         return final_text
     except Exception as e:
         print(f"文字识别出错: {str(e)}")
         return f"图片处理出错: {str(e)}"
 def analyze_slide(text):
@@ -169,7 +252,8 @@ with gr.Blocks(title="课程幻灯片理解助手") as demo:
     if api_key_error:
         gr.Markdown(api_key_error)
     else:
-        gr.Markdown(f"# 📚 课程幻灯片理解助手 ({device.upper()} 模式)")
         gr.Markdown("上传幻灯片图片，AI 将自动识别内容并提供详细讲解")
         # 存储当前识别的文字，用于对话上下文
@@ -200,8 +284,7 @@ with gr.Blocks(title="课程幻灯片理解助手") as demo:
         gr.Markdown("### 💬 与 AI 助手对话")
         chatbot = gr.Chatbot(
             label="对话历史",
-            height=400,
-            placeholder="在这里可以看到对话历史..."
         )
         with gr.Row():
             msg = gr.Textbox(
@@ -250,4 +333,5 @@ with gr.Blocks(title="课程幻灯片理解助手") as demo:
 # 启动应用
 if __name__ == "__main__":
-    demo.launch(share=True)

 # 加载环境变量
 load_dotenv()
+# Initialize the OpenAI client
 try:
+    # First try to read the API key from the environment
     openai_api_key = os.getenv('OPENAI_API_KEY')
+    if not openai_api_key:
+        # Not set in the environment: try loading a .env file from the current directory
+        if os.path.exists('.env'):
+            load_dotenv('.env')
+            openai_api_key = os.getenv('OPENAI_API_KEY')
     if not openai_api_key:
+        raise ValueError("No OpenAI API key found in environment variables or .env file")
     client = OpenAI(api_key=openai_api_key)
+    print("Successfully initialized OpenAI client")
 except Exception as e:
     print(f"Error initializing OpenAI client: {str(e)}")
     raise  # log the failure, then re-raise the original error
+# Model download directory: a "models" folder next to this file, created if missing
+MODEL_CACHE_DIR = os.path.join(os.path.dirname(__file__), 'models')
+os.makedirs(MODEL_CACHE_DIR, exist_ok=True)
+# 检查 GPU 环境
+def check_gpu():
+    try:
+        if torch.cuda.is_available():
+            # 获取 GPU 信息
+            gpu_name = torch.cuda.get_device_name(0)
+            gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1024**3  # 转换为 GB
+            print(f"Found GPU: {gpu_name} with {gpu_memory:.2f}GB memory")
+            return 'cuda'
+        else:
+            print("No CUDA GPU available")
+            return 'cpu'
+    except Exception as e:
+        print(f"Error checking GPU: {str(e)}")
+        return 'cpu'
+# Initialize the device setting ('cuda' or 'cpu') from the GPU check
+device = check_gpu()
 print(f"Running on device: {device}")
+# 初始化 EasyOCR（针对 T4 GPU 优化）
+def initialize_easyocr():
+    try:
+        print("Initializing EasyOCR and loading models...")
+        if device == 'cuda':
+            # 为 T4 GPU 设置较小的批处理大小和内存限制
+            torch.cuda.empty_cache()  # 清理 GPU 内存
+            reader = easyocr.Reader(
+                ['ch_sim', 'en'],
+                gpu=True,
+                download_enabled=True,
+                verbose=True,
+                model_storage_directory=MODEL_CACHE_DIR,
+                recog_batch_size=8,  # 减小批处理大小
+                detector_batch_size=2
+            )
+        else:
+            reader = easyocr.Reader(
+                ['ch_sim', 'en'],
+                gpu=False,
+                download_enabled=True,
+                verbose=True,
+                model_storage_directory=MODEL_CACHE_DIR
+            )
+        print("EasyOCR initialization completed!")
+        return reader
+    except Exception as e:
+        print(f"Error initializing EasyOCR: {str(e)}")
+        print("Falling back to CPU mode...")
+        return easyocr.Reader(
+            ['ch_sim', 'en'],
+            gpu=False,
+            download_enabled=True,
+            verbose=True,
+            model_storage_directory=MODEL_CACHE_DIR
+        )
+# Initialize the OCR reader
+reader = initialize_easyocr()
 def process_image(image):
     """处理上传的图片并返回识别结果和分析"""
             image.save(image_path)
         print("开始识别文字...")
+        if device == 'cuda':
+            torch.cuda.empty_cache()  # 清理 GPU 内存
         # 使用 EasyOCR 识别文字
         result = reader.readtext(
             image_path,
             detail=1,
+            paragraph=True,
+            batch_size=8  # 控制批处理大小
         )
         print("文字识别完成")
         if image_path == "temp_image.png" and os.path.exists(image_path):
             os.remove(image_path)
+        # 修改文字提取逻辑
         sorted_text = []
+        for item in result:
+            # 检查返回结果的格式
+            if isinstance(item, (list, tuple)):
+                if len(item) >= 2:  # 确保至少有 bbox 和 text
+                    text = item[1] if len(item) >= 2 else ""
+                    prob = item[2] if len(item) >= 3 else 1.0
+                    if prob > 0.5:  # 只保留置信度大于 0.5 的结果
+                        sorted_text.append(text)
+            elif isinstance(item, dict):  # 处理可能的字典格式
+                text = item.get('text', '')
+                prob = item.get('confidence', 1.0)
+                if prob > 0.5:
+                    sorted_text.append(text)
         final_text = ' '.join(sorted_text)
         if not final_text.strip():
             return "未能识别到清晰的文字，请尝试上传更清晰的图片"
+        print(f"识别到的文字: {final_text[:100]}...")  # 打印前100个字符用于调试
         return final_text
     except Exception as e:
         print(f"文字识别出错: {str(e)}")
+        import traceback
+        traceback.print_exc()  # 打印详细错误信息
         return f"图片处理出错: {str(e)}"
 def analyze_slide(text):
     if api_key_error:
         gr.Markdown(api_key_error)
     else:
+        gpu_info = f"GPU (T4)" if device == 'cuda' else "CPU"
+        gr.Markdown(f"# 📚 课程幻灯片理解助手 ({gpu_info} 模式)")
         gr.Markdown("上传幻灯片图片，AI 将自动识别内容并提供详细讲解")
         # 存储当前识别的文字，用于对话上下文
         gr.Markdown("### 💬 与 AI 助手对话")
         chatbot = gr.Chatbot(
             label="对话历史",
+            height=400
         )
         with gr.Row():
             msg = gr.Textbox(
 # 启动应用
 if __name__ == "__main__":
+    # Use a smaller number of parallel worker threads
+    demo.launch(share=True, max_threads=4)  # NOTE(review): share=True is likely unsupported when hosted on Hugging Face Spaces — confirm

requirements.txt CHANGED Viewed

@@ -1,8 +1,9 @@
 huggingface_hub==0.25.2
-gradio==4.19.2
 easyocr>=1.7.1
 python-dotenv>=1.0.0
 openai>=1.0.0
 Pillow>=10.0.0
 numpy>=1.24.0
-torch>=2.0.0

 huggingface_hub==0.25.2
+gradio==4.44.1
 easyocr>=1.7.1
 python-dotenv>=1.0.0
 openai>=1.0.0
 Pillow>=10.0.0
 numpy>=1.24.0
+torch>=2.0.0
+torchvision>=0.15.0