# Spaces: Sleeping  (page-header residue from the source listing; not part of the program)
import os
import io
import re

import streamlit as st

# Must be the first Streamlit command, so it is deliberately kept ahead of
# the remaining imports and of every other `st.*` call in this file.
st.set_page_config(layout="wide", initial_sidebar_state="collapsed")

from PIL import Image
import fitz  # PyMuPDF
from reportlab.lib.pagesizes import A4
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib import colors
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
# --------------------------------------------------------------- | |
# Available NotoEmoji fonts (local files), keyed by the display name that is
# shown in the sidebar and later used as the ReportLab font name.
# NOTE(review): the fonts were described as "one at the root and others in the
# 'static' subdirectory", yet every path below is a bare filename that is
# resolved against the working directory -- confirm where the files live.
available_fonts = {
    "NotoEmoji Variable": "NotoEmoji-VariableFont_wght.ttf",
    "NotoEmoji Bold": "NotoEmoji-Bold.ttf",
    "NotoEmoji Light": "NotoEmoji-Light.ttf",
    "NotoEmoji Medium": "NotoEmoji-Medium.ttf",
    "NotoEmoji Regular": "NotoEmoji-Regular.ttf",
    "NotoEmoji SemiBold": "NotoEmoji-SemiBold.ttf",
}

# Sidebar: let the user choose the desired NotoEmoji font.
selected_font_name = st.sidebar.selectbox(
    "Select NotoEmoji Font",
    options=list(available_fonts.keys()),
)
selected_font_path = available_fonts[selected_font_name]

# Register the chosen font with ReportLab under its display name, so the same
# string can be used wherever a font name is required.
pdfmetrics.registerFont(TTFont(selected_font_name, selected_font_path))
# --------------------------------------------------------------- | |
# Default markdown content with emojis.
# NOTE(review): the symbols in front of each item title ('π', 'β³', ...) look
# like emoji that were mis-decoded somewhere upstream. The original glyphs
# cannot be determined from this text, so they are reproduced unchanged here;
# restore them from the original file if it is available.
default_markdown = """# Cutting-Edge ML Outline
## Core ML Techniques
1. π **Mixture of Experts (MoE)**
- Conditional computation techniques
- Sparse gating mechanisms
- Training specialized sub-models
2. π₯ **Supervised Fine-Tuning (SFT) using PyTorch**
- Loss function customization
- Gradient accumulation strategies
- Learning rate schedulers
3. π€ **Large Language Models (LLM) using Transformers**
- Attention mechanisms
- Tokenization strategies
- Position encodings
## Training Methods
4. π **Self-Rewarding Learning using NPS 0-10 and Verbatims**
- Custom reward functions
- Feedback categorization
- Signal extraction from text
5. π **Reinforcement Learning from Human Feedback (RLHF)**
- Preference datasets
- PPO implementation
- KL divergence constraints
6. π **MergeKit: Merging Models to Same Embedding Space**
- TIES merging
- Task arithmetic
- SLERP interpolation
## Optimization & Deployment
7. π **DistillKit: Model Size Reduction with Spectrum Analysis**
- Knowledge distillation
- Quantization techniques
- Model pruning strategies
8. π§ **Agentic RAG Agents using Document Inputs**
- Vector database integration
- Query planning
- Self-reflection mechanisms
9. β³ **Longitudinal Data Summarization from Multiple Docs**
- Multi-document compression
- Timeline extraction
- Entity tracking
## Knowledge Representation
10. π **Knowledge Extraction using Markdown Knowledge Graphs**
- Entity recognition
- Relationship mapping
- Hierarchical structuring
11. πΊοΈ **Knowledge Mapping with Mermaid Diagrams**
- Flowchart generation
- Sequence diagram creation
- State diagrams
12. π» **ML Code Generation with Streamlit/Gradio/HTML5+JS**
- Code completion
- Unit test generation
- Documentation synthesis
"""
# --------------------------------------------------------------- | |
# Process markdown into PDF content.
def markdown_to_pdf_content(markdown_text):
    """Parse a markdown outline into two columns of PDF-ready entries.

    Each line is stripped of surrounding whitespace and classified:

    * ``# title``    -- skipped (the main title is not emitted).
    * ``## section`` -- emitted as the string ``"<b>section</b>"``.
    * ``N. item``    -- opens a numbered item, emitted as
      ``[item_line, sub_item_lines]`` once the item is closed.
    * ``- bullet``   -- collected as a sub-item while a numbered item is open.
    * anything else  -- emitted verbatim when no numbered item is open,
      otherwise ignored.

    A numbered item is closed by the next numbered item, the next section
    heading, or the end of the input. It is emitted even when it has no
    sub-items (as ``[item_line, []]``) so that it is never silently lost.

    Args:
        markdown_text: The markdown source as a single string.

    Returns:
        A ``(left_column, right_column)`` tuple of lists. The parsed entries
        are split at ``len(entries) // 2``, so the right column receives the
        extra entry when the count is odd.
    """
    pdf_content = []
    current_item = None
    sub_items = []
    in_list_item = False

    def flush_item():
        """Emit the open numbered item, if any, and reset the accumulators."""
        nonlocal current_item, sub_items
        if current_item is not None:
            pdf_content.append([current_item, sub_items])
        current_item = None
        sub_items = []

    for raw_line in markdown_text.strip().split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        if line.startswith('# '):
            # The main title is intentionally left out of the PDF content.
            continue
        if line.startswith('## '):
            flush_item()
            # Drop only the leading marker; a later "## " inside the heading
            # text is part of the heading and must survive.
            section = line[3:].strip()
            pdf_content.append(f"<b>{section}</b>")
            in_list_item = False
        elif re.match(r'^\d+\.', line):
            flush_item()
            current_item = line
            in_list_item = True
        elif line.startswith('- ') and in_list_item:
            sub_items.append(line)
        elif not in_list_item:
            pdf_content.append(line)

    flush_item()

    mid_point = len(pdf_content) // 2
    left_column = pdf_content[:mid_point]
    right_column = pdf_content[mid_point:]
    return left_column, right_column
# --------------------------------------------------------------- | |
# Create PDF using ReportLab. | |
def create_main_pdf(markdown_text, base_font_size=10, auto_size=False): | |
buffer = io.BytesIO() | |
doc = SimpleDocTemplate( | |
buffer, | |
pagesize=(A4[1], A4[0]), | |
leftMargin=36, | |
rightMargin=36, | |
topMargin=36, | |
bottomMargin=36 | |
) | |
styles = getSampleStyleSheet() | |
story = [] | |
spacer_height = 10 | |
left_column, right_column = markdown_to_pdf_content(markdown_text) | |
# Count total items to possibly adjust font size. | |
total_items = 0 | |
for col in (left_column, right_column): | |
for item in col: | |
if isinstance(item, list): | |
main_item, sub_items | |