awacke1's picture
Update app.py
db7d8ff verified
raw
history blame
12.5 kB
import os
import io
import re
import streamlit as st
# Must be the very first Streamlit command.
st.set_page_config(layout="wide", initial_sidebar_state="collapsed")
from PIL import Image
import fitz # PyMuPDF
from reportlab.lib.pagesizes import A4
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib import colors
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
# ---------------------------------------------------------------
# Define available NotoEmoji fonts (all in the base directory now)
available_fonts = {
"NotoEmoji Variable": "NotoEmoji-VariableFont_wght.ttf",
"NotoEmoji Bold": "NotoEmoji-Bold.ttf",
"NotoEmoji Light": "NotoEmoji-Light.ttf",
"NotoEmoji Medium": "NotoEmoji-Medium.ttf",
"NotoEmoji Regular": "NotoEmoji-Regular.ttf",
"NotoEmoji SemiBold": "NotoEmoji-SemiBold.ttf"
}
# Sidebar: Let the user choose the desired NotoEmoji font.
selected_font_name = st.sidebar.selectbox(
"Select NotoEmoji Font",
options=list(available_fonts.keys())
)
selected_font_path = available_fonts[selected_font_name]
# Register the chosen emoji font with ReportLab.
pdfmetrics.registerFont(TTFont(selected_font_name, selected_font_path))
# ---------------------------------------------------------------
# Helper function to wrap emoji characters with a font tag.
def apply_emoji_font(text, emoji_font):
# This regex attempts to capture many common emoji ranges.
emoji_pattern = re.compile(
r"([\U0001F300-\U0001F5FF"
r"\U0001F600-\U0001F64F"
r"\U0001F680-\U0001F6FF"
r"\U0001F700-\U0001F77F"
r"\U0001F780-\U0001F7FF"
r"\U0001F800-\U0001F8FF"
r"\U0001F900-\U0001F9FF"
r"\U0001FA00-\U0001FA6F"
r"\U0001FA70-\U0001FAFF"
r"\u2600-\u26FF"
r"\u2700-\u27BF]+)"
)
# Wrap found emoji with a font tag using the selected emoji font.
return emoji_pattern.sub(r'<font face="{}">\1</font>'.format(emoji_font), text)
# ---------------------------------------------------------------
# Default markdown content with emojis.
default_markdown = """# Cutting-Edge ML Outline
## Core ML Techniques
1. 🌟 **Mixture of Experts (MoE)**
- Conditional computation techniques
- Sparse gating mechanisms
- Training specialized sub-models
2. πŸ”₯ **Supervised Fine-Tuning (SFT) using PyTorch**
- Loss function customization
- Gradient accumulation strategies
- Learning rate schedulers
3. πŸ€– **Large Language Models (LLM) using Transformers**
- Attention mechanisms
- Tokenization strategies
- Position encodings
## Training Methods
4. πŸ“Š **Self-Rewarding Learning using NPS 0-10 and Verbatims**
- Custom reward functions
- Feedback categorization
- Signal extraction from text
5. πŸ‘ **Reinforcement Learning from Human Feedback (RLHF)**
- Preference datasets
- PPO implementation
- KL divergence constraints
6. πŸ”— **MergeKit: Merging Models to Same Embedding Space**
- TIES merging
- Task arithmetic
- SLERP interpolation
## Optimization & Deployment
7. πŸ“ **DistillKit: Model Size Reduction with Spectrum Analysis**
- Knowledge distillation
- Quantization techniques
- Model pruning strategies
8. 🧠 **Agentic RAG Agents using Document Inputs**
- Vector database integration
- Query planning
- Self-reflection mechanisms
9. ⏳ **Longitudinal Data Summarization from Multiple Docs**
- Multi-document compression
- Timeline extraction
- Entity tracking
## Knowledge Representation
10. πŸ“‘ **Knowledge Extraction using Markdown Knowledge Graphs**
- Entity recognition
- Relationship mapping
- Hierarchical structuring
11. πŸ—ΊοΈ **Knowledge Mapping with Mermaid Diagrams**
- Flowchart generation
- Sequence diagram creation
- State diagrams
12. πŸ’» **ML Code Generation with Streamlit/Gradio/HTML5+JS**
- Code completion
- Unit test generation
- Documentation synthesis
"""
# ---------------------------------------------------------------
# Process markdown into a two-column layout for the PDF.
def markdown_to_pdf_content(markdown_text):
lines = markdown_text.strip().split('\n')
pdf_content = []
in_list_item = False
current_item = None
sub_items = []
for line in lines:
line = line.strip()
if not line:
continue
if line.startswith('# '):
# Optionally skip the main title.
pass
elif line.startswith('## '):
if current_item and sub_items:
pdf_content.append([current_item, sub_items])
sub_items = []
current_item = None
section = line.replace('## ', '').strip()
pdf_content.append(f"<b>{section}</b>")
in_list_item = False
elif re.match(r'^\d+\.', line):
if current_item and sub_items:
pdf_content.append([current_item, sub_items])
sub_items = []
current_item = line.strip()
in_list_item = True
elif line.startswith('- ') and in_list_item:
sub_items.append(line.strip())
else:
if not in_list_item:
pdf_content.append(line.strip())
if current_item and sub_items:
pdf_content.append([current_item, sub_items])
mid_point = len(pdf_content) // 2
left_column = pdf_content[:mid_point]
right_column = pdf_content[mid_point:]
return left_column, right_column
# ---------------------------------------------------------------
# Create the PDF using ReportLab.
def create_main_pdf(markdown_text, base_font_size=10, auto_size=False):
buffer = io.BytesIO()
doc = SimpleDocTemplate(
buffer,
pagesize=(A4[1], A4[0]),
leftMargin=36,
rightMargin=36,
topMargin=36,
bottomMargin=36
)
styles = getSampleStyleSheet()
story = []
spacer_height = 10
left_column, right_column = markdown_to_pdf_content(markdown_text)
total_items = 0
for col in (left_column, right_column):
for item in col:
if isinstance(item, list):
main_item, sub_items = item
total_items += 1 + len(sub_items)
else:
total_items += 1
if auto_size:
base_font_size = max(6, min(12, 200 / total_items))
item_font_size = base_font_size
subitem_font_size = base_font_size * 0.9
section_font_size = base_font_size * 1.2
title_font_size = min(16, base_font_size * 1.5)
# Define ParagraphStyles using Helvetica for normal text.
title_style = ParagraphStyle(
'Heading1',
parent=styles['Heading1'],
fontName="Helvetica-Bold",
textColor=colors.darkblue,
alignment=1,
fontSize=title_font_size
)
section_style = ParagraphStyle(
'SectionStyle',
parent=styles['Heading2'],
fontName="Helvetica-Bold",
textColor=colors.darkblue,
fontSize=section_font_size,
leading=section_font_size * 1.2,
spaceAfter=2
)
item_style = ParagraphStyle(
'ItemStyle',
parent=styles['Normal'],
fontName="Helvetica",
fontSize=item_font_size,
leading=item_font_size * 1.2,
spaceAfter=1
)
subitem_style = ParagraphStyle(
'SubItemStyle',
parent=styles['Normal'],
fontName="Helvetica",
fontSize=subitem_font_size,
leading=subitem_font_size * 1.2,
leftIndent=10,
spaceAfter=1
)
story.append(Paragraph(apply_emoji_font("Cutting-Edge ML Outline (ReportLab)", selected_font_name), title_style))
story.append(Spacer(1, spacer_height))
left_cells = []
for item in left_column:
if isinstance(item, str) and item.startswith('<b>'):
# Process section headings.
text = item.replace('<b>', '').replace('</b>', '')
left_cells.append(Paragraph(apply_emoji_font(text, selected_font_name), section_style))
elif isinstance(item, list):
main_item, sub_items = item
left_cells.append(Paragraph(apply_emoji_font(main_item, selected_font_name), item_style))
for sub_item in sub_items:
left_cells.append(Paragraph(apply_emoji_font(sub_item, selected_font_name), subitem_style))
else:
left_cells.append(Paragraph(apply_emoji_font(item, selected_font_name), item_style))
right_cells = []
for item in right_column:
if isinstance(item, str) and item.startswith('<b>'):
text = item.replace('<b>', '').replace('</b>', '')
right_cells.append(Paragraph(apply_emoji_font(text, selected_font_name), section_style))
elif isinstance(item, list):
main_item, sub_items = item
right_cells.append(Paragraph(apply_emoji_font(main_item, selected_font_name), item_style))
for sub_item in sub_items:
right_cells.append(Paragraph(apply_emoji_font(sub_item, selected_font_name), subitem_style))
else:
right_cells.append(Paragraph(apply_emoji_font(item, selected_font_name), item_style))
max_cells = max(len(left_cells), len(right_cells))
left_cells.extend([""] * (max_cells - len(left_cells)))
right_cells.extend([""] * (max_cells - len(right_cells)))
table_data = list(zip(left_cells, right_cells))
col_width = (A4[1] - 72) / 2.0
table = Table(table_data, colWidths=[col_width, col_width], hAlign='CENTER')
table.setStyle(TableStyle([
('VALIGN', (0, 0), (-1, -1), 'TOP'),
('ALIGN', (0, 0), (-1, -1), 'LEFT'),
('BACKGROUND', (0, 0), (-1, -1), colors.white),
('GRID', (0, 0), (-1, -1), 0, colors.white),
('LINEAFTER', (0, 0), (0, -1), 0.5, colors.grey),
('LEFTPADDING', (0, 0), (-1, -1), 2),
('RIGHTPADDING', (0, 0), (-1, -1), 2),
('TOPPADDING', (0, 0), (-1, -1), 1),
('BOTTOMPADDING', (0, 0), (-1, -1), 1),
]))
story.append(table)
doc.build(story)
buffer.seek(0)
return buffer.getvalue()
# ---------------------------------------------------------------
# Convert PDF bytes to an image for preview using PyMuPDF.
def pdf_to_image(pdf_bytes):
try:
doc = fitz.open(stream=pdf_bytes, filetype="pdf")
page = doc[0]
pix = page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
doc.close()
return img
except Exception as e:
st.error(f"Failed to render PDF preview: {e}")
return None
# ---------------------------------------------------------------
# Sidebar options for text size.
with st.sidebar:
auto_size = st.checkbox("Auto-size text", value=True)
if not auto_size:
base_font_size = st.slider("Base Font Size (points)", min_value=6, max_value=16, value=10, step=1)
else:
base_font_size = 10
st.info("Font size will auto-adjust between 6-12 points based on content length.")
# Persist markdown content in session state.
if 'markdown_content' not in st.session_state:
st.session_state.markdown_content = default_markdown
# ---------------------------------------------------------------
# Generate the PDF.
with st.spinner("Generating PDF..."):
pdf_bytes = create_main_pdf(st.session_state.markdown_content, base_font_size, auto_size)
# Display PDF preview.
with st.container():
pdf_image = pdf_to_image(pdf_bytes)
if pdf_image:
st.image(pdf_image, use_container_width=True)
else:
st.info("Download the PDF to view it locally.")
# PDF Download button.
st.download_button(
label="Download PDF",
data=pdf_bytes,
file_name="ml_outline.pdf",
mime="application/pdf"
)
# Markdown editor.
edited_markdown = st.text_area(
"Modify the markdown content below:",
value=st.session_state.markdown_content,
height=300
)
# Update PDF on button click.
if st.button("Update PDF"):
st.session_state.markdown_content = edited_markdown
st.experimental_rerun()
# Markdown Download button.
st.download_button(
label="Save Markdown",
data=st.session_state.markdown_content,
file_name="ml_outline.md",
mime="text/markdown"
)