Spaces:

nslaughter
/

flashcard-studio

Sleeping

App Files Files Community

Nathan Slaughter committed on 17 days ago

Commit

4d17caa

•

1 Parent(s): b8a0d78

add pytorch manual method

Browse files

Files changed (14) hide show

.github/workflows/python-app.yaml +29 -0
.gitignore +1 -0
app.py +8 -0
app/__init__.py +0 -0
app/interface.py +113 -0
app/models.py +31 -0
app/processing.py +95 -0
environment.yml +19 -0
pytest.ini +5 -0
requirements.txt +7 -0
tests/__init__.py +0 -0
tests/conftest.py +14 -0
tests/test_models.py +20 -0
tests/test_processing.py +73 -0

.github/workflows/python-app.yaml ADDED Viewed

	@@ -0,0 +1,29 @@

+# .github/workflows/python-app.yaml
+name: Python application
+on:
+ push:
+ branches: [ main ]
+ pull_request:
+ branches: [ main ]
+jobs:
+ build:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v2
+ - name: Set up Python
+ uses: actions/setup-python@v2
+ with:
+ python-version: '3.8'
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install -r requirements.txt
+ pip install pytest pytest-mock
+ - name: Run tests
+ run: |
+ pytest

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ __pycache__

app.py ADDED Viewed

	@@ -0,0 +1,8 @@

+from app.interface import create_interface
def main():
    """Build the Gradio interface and start serving it."""
    create_interface().launch()


# Only launch the UI when executed as a script, not when imported.
if __name__ == "__main__":
    main()

app/__init__.py ADDED Viewed

File without changes

app/interface.py ADDED Viewed

	@@ -0,0 +1,113 @@

+import gradio as gr
+from .models import LanguageModel
+from .processing import process_file, process_text_input
def create_interface():
    """Build the Gradio Blocks UI for flashcard extraction.

    The UI has two tabs: one that accepts an uploaded ``.pdf``/``.txt``/``.md``
    file and one that accepts pasted text. Each tab has its own output-format
    selector (CSV or JSON) and its own output textbox.

    Returns:
        The assembled ``gr.Blocks`` object; the caller is responsible for
        calling ``launch()`` on it.
    """
    # A single model instance is shared by both handlers below.
    language_model = LanguageModel()

    def handle_file_upload(file_obj, output_format):
        """Run extraction on an uploaded file; show ValueErrors as text."""
        try:
            return process_file(file_obj, output_format, language_model)
        except ValueError as ve:
            return str(ve)

    def handle_text_input(input_text, output_format):
        """Run extraction on pasted text; show ValueErrors as text."""
        try:
            return process_text_input(input_text, output_format, language_model)
        except ValueError as ve:
            return str(ve)

    # Component factories. They must be called inside the Blocks context so
    # the components are attached to the layout where they are created.
    def make_format_selector():
        return gr.Radio(
            choices=["CSV", "JSON"],
            label="Select Output Format",
            value="JSON",
            type="value",
        )

    def make_flashcard_output():
        return gr.Textbox(
            label="Flashcards",
            lines=20,
            placeholder="Extracted flashcards will appear here...",
        )

    with gr.Blocks() as interface:
        gr.Markdown("# Flashcard Extraction Tool")
        gr.Markdown(
            "Extract flashcards from uploaded files or directly input text. Choose your preferred output format."
        )

        with gr.Tab("Upload File"):
            with gr.Row():
                with gr.Column():
                    file_input = gr.File(
                        label="Upload a File",
                        file_types=['.pdf', '.txt', '.md'],
                    )
                    format_selector = make_format_selector()
                    submit_file = gr.Button("Extract Flashcards")
                with gr.Column():
                    flashcard_output_file = make_flashcard_output()
            submit_file.click(
                fn=handle_file_upload,
                inputs=[file_input, format_selector],
                outputs=flashcard_output_file,
            )

        with gr.Tab("Input Text"):
            with gr.Row():
                with gr.Column():
                    text_input = gr.Textbox(
                        label="Enter Text",
                        lines=20,
                        placeholder="Type or paste your text here...",
                    )
                    format_selector_text = make_format_selector()
                    submit_text = gr.Button("Extract Flashcards")
                with gr.Column():
                    flashcard_output_text = make_flashcard_output()
            submit_text.click(
                fn=handle_text_input,
                inputs=[text_input, format_selector_text],
                outputs=flashcard_output_text,
            )

        gr.Markdown(
            "\n"
            "---\n"
            "**Notes:**\n"
            "- Supported file types: `.pdf`, `.txt`, `.md`.\n"
            "- Ensure that the input text is clear and well-structured for optimal flashcard extraction.\n"
        )

    return interface

app/models.py ADDED Viewed

	@@ -0,0 +1,31 @@

+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
class LanguageModel:
    """Thin wrapper around a Hugging Face causal LM used to write flashcards."""

    def __init__(self, model_name: str = "Qwen/Qwen2.5-7B-Instruct"):
        """Load the model and tokenizer identified by ``model_name``.

        Model placement is delegated to ``device_map="auto"``; ``self.device``
        only records the preferred accelerator detected on this host.
        """
        self.device = self._determine_device()
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype="auto",
            device_map="auto",
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def _determine_device(self):
        """Return the preferred ``torch.device``: CUDA, then MPS, then CPU."""
        if torch.cuda.is_available():
            return torch.device("cuda")
        # Older torch builds have no ``torch.backends.mps`` attribute at all.
        mps_backend = getattr(torch.backends, "mps", None)
        if mps_backend is not None and mps_backend.is_available():
            return torch.device("mps")
        return torch.device("cpu")

    def generate_flashcards(self, prompt: str, max_new_tokens: int = 1024) -> str:
        """Generate text for ``prompt`` and return only the newly generated part.

        Args:
            prompt: Full instruction text sent to the model.
            max_new_tokens: Upper bound on the number of generated tokens.

        Returns:
            The decoded continuation, with special tokens removed and without
            the echoed prompt.
        """
        inputs = self.tokenizer(prompt, return_tensors='pt').to(self.model.device)
        prompt_length = inputs.input_ids.shape[-1]
        with torch.no_grad():
            # Unpacking forwards the attention mask together with input_ids.
            output_ids = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
            )
        # The returned sequence starts with the prompt tokens; drop them so the
        # caller receives only the flashcards.
        generated_ids = output_ids[0][prompt_length:]
        return self.tokenizer.decode(generated_ids, skip_special_tokens=True)

app/processing.py ADDED Viewed

	@@ -0,0 +1,95 @@

+import os
+import pymupdf4llm
def process_pdf(pdf_path: str) -> str:
    """Extract the text of a PDF as Markdown using pymupdf4llm.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The document content as a Markdown string.

    Raises:
        ValueError: If the PDF cannot be opened or converted. The original
            exception is attached as the cause.
    """
    try:
        # ``to_markdown`` is the package's documented extraction entry point.
        return pymupdf4llm.to_markdown(pdf_path)
    except Exception as exc:
        raise ValueError(f"Error processing PDF: {exc}") from exc
def read_text_file(file_path: str) -> str:
    """Return the full UTF-8 decoded contents of a ``.txt`` or ``.md`` file.

    Any failure while opening or reading is re-raised as ``ValueError``.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            contents = handle.read()
    except Exception as err:
        raise ValueError(f"Error reading text file: {str(err)}")
    return contents
def format_prompt(output_format: str) -> str:
    """Return the format-specific instruction block for the LLM prompt.

    Args:
        output_format: ``"JSON"`` or ``"CSV"``; matching ignores case and
            surrounding whitespace.

    Returns:
        Instruction text containing a worked example in the requested format.

    Raises:
        ValueError: If ``output_format`` is neither JSON nor CSV. Without this
            check the function would return ``None``, which would end up in
            the prompt as the literal text "None".
    """
    normalized = (output_format or "").strip().lower()
    if normalized == "json":
        return (
            "You only respond with cards in JSON format. Follow the example below.\n"
            "EXAMPLE:\n"
            "[\n"
            '{"question": "What is AI?", "answer": "Artificial Intelligence."},\n'
            '{"question": "What is ML?", "answer": "Machine Learning."}\n'
            "...\n"
            "]\n"
        )
    if normalized == "csv":
        return (
            "You only respond with cards in CSV format. Follow the example below.\n"
            "EXAMPLE:\n"
            '"What is AI?", "Artificial Intelligence."\n'
            '"What is ML?", "Machine Learning."\n'
            "...\n"
        )
    raise ValueError(
        f"Unsupported output format: {output_format!r}. Expected 'JSON' or 'CSV'."
    )
def extract_flashcards(text: str, output_format: str, language_model) -> str:
    """Ask the language model to turn ``text`` into flashcards.

    Args:
        text: Source material to extract flashcards from.
        output_format: ``"CSV"`` or ``"JSON"``; selects the example shown to
            the model via ``format_prompt``.
        language_model: Object exposing ``generate_flashcards(prompt) -> str``.

    Returns:
        The model's raw response string.
    """
    prompt = (
        "You are an expert flashcard creator. You always include a single knowledge item per flashcard.\n"
        f"{format_prompt(output_format)}\n"
        "Extract flashcards from the user's text:\n"
        f"{text}\n"
        "Do not include the prompt or any other unnecessary information in the flashcards.\n"
        "Do not include triple ticks (```) or any other code blocks in the flashcards.\n"
    )
    # TODO:
    # see https://qwen.readthedocs.io/en/latest/inference/chat.html
    # e.g. pipeline = pipeline("text-generation", model="Qwen/Qwen2.5-7B-Instruct")
    return language_model.generate_flashcards(prompt)
def process_file(file_obj, output_format: str, language_model) -> str:
    """Read an uploaded file and extract flashcards from its text.

    Args:
        file_obj: The upload, given as a path string, an ``os.PathLike``, or
            an object whose ``name`` attribute holds the file path (for
            example a temporary-file wrapper).
        output_format: ``"CSV"`` or ``"JSON"``.
        language_model: Object exposing ``generate_flashcards(prompt) -> str``.

    Returns:
        The flashcards produced by the language model.

    Raises:
        ValueError: If no file was supplied, the extension is not one of
            ``.pdf``/``.txt``/``.md``, or the file cannot be read.
    """
    if file_obj is None:
        raise ValueError("No file provided.")

    # Resolve path-like inputs with fspath: ``Path.name`` is only the final
    # component, which would drop the directory and make the file unreadable.
    if isinstance(file_obj, (str, os.PathLike)):
        file_path = os.fspath(file_obj)
    else:
        file_path = file_obj.name

    file_ext = os.path.splitext(file_path)[1].lower()
    if file_ext == '.pdf':
        text = process_pdf(file_path)
    elif file_ext in ('.txt', '.md'):
        text = read_text_file(file_path)
    else:
        raise ValueError("Unsupported file type.")

    return extract_flashcards(text, output_format, language_model)
def process_text_input(input_text: str, output_format: str, language_model) -> str:
    """Extract flashcards from text typed or pasted by the user.

    Args:
        input_text: Raw user text.
        output_format: ``"CSV"`` or ``"JSON"``.
        language_model: Object exposing ``generate_flashcards(prompt) -> str``.

    Returns:
        The flashcards produced by the language model.

    Raises:
        ValueError: If ``input_text`` is ``None``, empty, or whitespace only.
    """
    # Check for None first so a missing value raises the same ValueError as
    # blank text instead of an AttributeError from ``.strip()``.
    if input_text is None or not input_text.strip():
        raise ValueError("No text provided.")
    return extract_flashcards(input_text, output_format, language_model)

environment.yml ADDED Viewed

	@@ -0,0 +1,19 @@

+name: flashcard-maker
+channels:
+ - conda-forge
+ - pytorch
+ - defaults
+dependencies:
+ - python=3.12
+ - pytorch
+ - torchvision
+ - torchaudio
+ - cudatoolkit=11.7 # Remove or adjust if installing CPU-only
+ - transformers
+ - gradio
+ - librosa
+ - pytest
+ - pytest-mock
+ - pip
+ - pip:
+ - pymupdf4llm

pytest.ini ADDED Viewed

	@@ -0,0 +1,5 @@

+# pytest.ini
+[pytest]
+filterwarnings =
+ ignore::DeprecationWarning

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+torch
+transformers
+gradio
+librosa
+pymupdf4llm
+pytest
+pytest-mock # Added for mocking capabilities

tests/__init__.py ADDED Viewed

File without changes

tests/conftest.py ADDED Viewed

	@@ -0,0 +1,14 @@

+import pytest
+from unittest.mock import Mock
+from app.models import LanguageModel
@pytest.fixture
def language_model():
    """Return a ``Mock`` restricted to the ``LanguageModel`` interface.

    Its ``generate_flashcards`` method is preset to return an empty
    flashcard payload, so tests never load a real model.
    """
    stub = Mock(spec=LanguageModel)
    stub.configure_mock(**{"generate_flashcards.return_value": '{"flashcards": []}'})
    return stub

tests/test_models.py ADDED Viewed

	@@ -0,0 +1,20 @@

+# tests/test_models.py
+import pytest
def test_generate_flashcards(language_model, mocker):
    """Check that the mocked ``generate_flashcards`` returns its configured value.

    This exercises the fixture's mock only; no real model is involved.
    """
    sample_prompt = "Sample prompt for flashcard generation."
    canned_reply = '{"flashcards": [{"Question": "What is AI?", "Answer": "Artificial Intelligence."}]}'

    # Override the fixture's default payload for this test.
    language_model.generate_flashcards.return_value = canned_reply

    actual_reply = language_model.generate_flashcards(sample_prompt)

    language_model.generate_flashcards.assert_called_once_with(sample_prompt)
    assert actual_reply == canned_reply

tests/test_processing.py ADDED Viewed

	@@ -0,0 +1,73 @@

+# tests/test_processing.py
+import pytest
+from app.processing import process_text_input, process_file
def test_process_text_input_success(language_model):
    """Non-empty text is forwarded to the model and its reply is returned."""
    outcome = process_text_input(
        "This is a sample text for flashcard extraction.",
        "JSON",
        language_model,
    )

    # The fixture's mock always answers with an empty flashcard payload.
    assert outcome == '{"flashcards": []}'
    language_model.generate_flashcards.assert_called_once()
def test_process_text_input_empty(language_model):
    """Whitespace-only input is rejected with a ``ValueError``."""
    blank_text = " "

    with pytest.raises(ValueError) as raised:
        process_text_input(blank_text, "JSON", language_model)

    assert "No text provided." in str(raised.value)
def test_process_file_unsupported_type(language_model, tmp_path):
    """A file with an unrecognised extension is rejected with a ``ValueError``."""
    # Write a real file so only the extension can cause the failure.
    unsupported_path = tmp_path / "dummy.unsupported"
    unsupported_path.write_text("Unsupported content")

    with pytest.raises(ValueError) as raised:
        process_file(unsupported_path, "JSON", language_model)

    assert "Unsupported file type." in str(raised.value)
def test_process_file_pdf(language_model, tmp_path, mocker):
    """A ``.pdf`` upload goes through ``process_pdf`` and then the model."""
    # Stub the PDF reader so the placeholder file below need not be a real PDF.
    mocker.patch('app.processing.process_pdf', return_value="Extracted PDF text.")

    pdf_path = tmp_path / "test.pdf"
    pdf_path.write_text("PDF content")

    outcome = process_file(pdf_path, "JSON", language_model)

    assert outcome == '{"flashcards": []}'
    language_model.generate_flashcards.assert_called_once()
def test_process_file_txt(language_model, tmp_path, mocker):
    """A ``.txt`` upload goes through ``read_text_file`` and then the model."""
    # Stub the reader so the test does not depend on the file's contents.
    mocker.patch('app.processing.read_text_file', return_value="Extracted TXT text.")

    txt_path = tmp_path / "test.txt"
    txt_path.write_text("TXT content")

    outcome = process_file(txt_path, "JSON", language_model)

    assert outcome == '{"flashcards": []}'
    language_model.generate_flashcards.assert_called_once()