Spaces:

awacke1
/

Pillow-PyMuPDF-ReportLab

Running

App Files Files Community

Pillow-PyMuPDF-ReportLab / app.py

awacke1

Update app.py

9f3cf94 verified about 1 month ago

raw

history blame

11.4 kB

	import os
	import urllib.request
	import io
	import re
	import streamlit as st

	# Set the page configuration as the very first Streamlit command.
	st.set_page_config(layout="wide", initial_sidebar_state="collapsed")

	from PIL import Image
	import fitz # PyMuPDF

	from reportlab.lib.pagesizes import A4
	from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
	from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
	from reportlab.lib import colors
	from reportlab.pdfbase import pdfmetrics
	from reportlab.pdfbase.ttfonts import TTFont

	# --- Step 1: Define and Download Available Emoji Fonts ---
	# Font files fetched from the noto-emoji repository; the user picks one of
	# them in the sidebar further down.
	font_files = [
	    "Noto-COLRv1-emojicompat.ttf",
	    "Noto-COLRv1-noflags.ttf",
	    "Noto-COLRv1.ttf",
	    "NotoColorEmoji-emojicompat.ttf",
	    "NotoColorEmoji-flagsonly.ttf",
	    "NotoColorEmoji-noflags.ttf",
	    "NotoColorEmoji.ttf",
	    "NotoColorEmoji_WindowsCompatible.ttf",
	]

	base_font_url = "https://github.com/googlefonts/noto-emoji/raw/main/fonts/"

	# Download every font that is not already present in the working directory.
	# A failed download is reported in the UI but does not stop the script.
	for font_file in font_files:
	    if os.path.exists(font_file):
	        continue
	    st.info(f"Downloading {font_file}...")
	    # Download to a temporary name and rename only on success. Writing
	    # straight to the final path would leave a truncated .ttf behind when
	    # the transfer is interrupted, and the existence check above would then
	    # skip it on every later run.
	    partial_path = font_file + ".part"
	    try:
	        urllib.request.urlretrieve(base_font_url + font_file, partial_path)
	        os.replace(partial_path, font_file)
	    except Exception as e:
	        # Best-effort cleanup of the incomplete download.
	        try:
	            os.remove(partial_path)
	        except OSError:
	            pass
	        st.error(f"Failed to download {font_file}: {e}")
	    else:
	        st.success(f"Downloaded {font_file}")

	# --- Step 2: Allow User to Select the Emoji Font ---
	# Display name for each font file is the file name without its extension.
	font_display_names = {f: f.replace(".ttf", "") for f in font_files}
	selected_font_file = st.sidebar.selectbox(
	    "Select Emoji Font",
	    options=font_files,
	    format_func=lambda f: font_display_names[f],
	)

	# Register the chosen font with ReportLab under its display name; the PDF
	# paragraph styles refer to it through `registered_font_name`.
	registered_font_name = font_display_names[selected_font_file]
	try:
	    pdfmetrics.registerFont(TTFont(registered_font_name, selected_font_file))
	except Exception as e:
	    # The download step tolerates failures, so the file may be missing or
	    # unusable here. Keep the app running with a built-in font instead of
	    # aborting the whole script.
	    st.error(
	        f"Could not load font {selected_font_file}: {e}. "
	        "Falling back to Helvetica."
	    )
	    registered_font_name = "Helvetica"

	# --- Default Markdown Content with Emojis ---
	# Initial outline text used to seed st.session_state.markdown_content.
	# markdown_to_pdf_content() recognizes '## ' section headings, 'N.' numbered
	# items and '- ' sub-items, and skips the '# ' title line.
	default_markdown = """# Cutting-Edge ML Outline

	## Core ML Techniques
	1. 🌟 Mixture of Experts (MoE)
	- Conditional computation techniques
	- Sparse gating mechanisms
	- Training specialized sub-models

	2. 🔥 Supervised Fine-Tuning (SFT) using PyTorch
	- Loss function customization
	- Gradient accumulation strategies
	- Learning rate schedulers

	3. 🤖 Large Language Models (LLM) using Transformers
	- Attention mechanisms
	- Tokenization strategies
	- Position encodings

	## Training Methods
	4. 📊 Self-Rewarding Learning using NPS 0-10 and Verbatims
	- Custom reward functions
	- Feedback categorization
	- Signal extraction from text

	5. 👍 Reinforcement Learning from Human Feedback (RLHF)
	- Preference datasets
	- PPO implementation
	- KL divergence constraints

	6. 🔗 MergeKit: Merging Models to Same Embedding Space
	- TIES merging
	- Task arithmetic
	- SLERP interpolation

	## Optimization & Deployment
	7. 📏 DistillKit: Model Size Reduction with Spectrum Analysis
	- Knowledge distillation
	- Quantization techniques
	- Model pruning strategies

	8. 🧠 Agentic RAG Agents using Document Inputs
	- Vector database integration
	- Query planning
	- Self-reflection mechanisms

	9. ⏳ Longitudinal Data Summarization from Multiple Docs
	- Multi-document compression
	- Timeline extraction
	- Entity tracking

	## Knowledge Representation
	10. 📑 Knowledge Extraction using Markdown Knowledge Graphs
	- Entity recognition
	- Relationship mapping
	- Hierarchical structuring

	11. 🗺️ Knowledge Mapping with Mermaid Diagrams
	- Flowchart generation
	- Sequence diagram creation
	- State diagrams

	12. 💻 ML Code Generation with Streamlit/Gradio/HTML5+JS
	- Code completion
	- Unit test generation
	- Documentation synthesis
	"""

	# --- Markdown to PDF Content Processing ---
	def markdown_to_pdf_content(markdown_text):
	    """Parse the outline markdown into two columns of PDF content.

	    Recognized line types (after stripping surrounding whitespace):

	    * ``# title``    - skipped.
	    * ``## section`` - emitted as the string ``"<b>section</b>"``.
	    * ``N. item``    - starts a numbered item, emitted as
	      ``[item_text, [sub_item_text, ...]]``.
	    * ``- sub item`` - attached to the current numbered item.
	    * anything else  - emitted as a plain string, but only while no numbered
	      item is open; other lines inside an item are ignored.

	    Args:
	        markdown_text: The markdown source as a single string.

	    Returns:
	        A ``(left_column, right_column)`` tuple: the parsed entries split at
	        the midpoint, with the right column receiving the extra entry when
	        the count is odd.
	    """
	    pdf_content = []
	    current_item = None
	    sub_items = []
	    in_list_item = False

	    def flush_item():
	        # Emit the open numbered item, if any. Items without sub-items are
	        # emitted too (with an empty sub-item list); previously they were
	        # silently dropped from the output.
	        nonlocal current_item, sub_items
	        if current_item is not None:
	            pdf_content.append([current_item, sub_items])
	        current_item = None
	        sub_items = []

	    for raw_line in markdown_text.strip().split('\n'):
	        line = raw_line.strip()
	        if not line:
	            continue

	        if line.startswith('# '):
	            # The main title is not part of the column content.
	            continue

	        if line.startswith('## '):
	            flush_item()
	            # Strip only the leading marker so a '## ' later in the heading
	            # text is preserved.
	            pdf_content.append(f"<b>{line[3:].strip()}</b>")
	            in_list_item = False
	        elif re.match(r'^\d+\.', line):
	            flush_item()
	            current_item = line
	            in_list_item = True
	        elif line.startswith('- ') and in_list_item:
	            sub_items.append(line)
	        elif not in_list_item:
	            pdf_content.append(line)

	    flush_item()

	    mid_point = len(pdf_content) // 2
	    return pdf_content[:mid_point], pdf_content[mid_point:]

	# --- Main PDF Creation ---
	def create_main_pdf(markdown_text, base_font_size=10, auto_size=False):
	    """Render the markdown outline as a two-column, landscape A4 PDF.

	    The text is parsed with ``markdown_to_pdf_content`` and laid out as a
	    two-column table under a fixed title, using the module-level
	    ``registered_font_name`` for every paragraph style.

	    Args:
	        markdown_text: Outline markdown to render.
	        base_font_size: Point size of numbered items. Sub-items use 0.9x,
	            section headings 1.2x and the title 1.5x (capped at 16) of this
	            value. Replaced by the computed size when ``auto_size`` is true.
	        auto_size: When true, the base size is ``200 / item count`` clamped
	            to the range 6-12.

	    Returns:
	        The generated PDF document as ``bytes``.
	    """
	    buffer = io.BytesIO()
	    doc = SimpleDocTemplate(
	        buffer,
	        pagesize=(A4[1], A4[0]),  # A4 with width/height swapped: landscape
	        leftMargin=36,
	        rightMargin=36,
	        topMargin=36,
	        bottomMargin=36
	    )

	    styles = getSampleStyleSheet()
	    story = []

	    spacer_height = 10
	    left_column, right_column = markdown_to_pdf_content(markdown_text)

	    # Every section, numbered item and sub-item counts as one line of content.
	    total_items = sum(
	        1 + len(item[1]) if isinstance(item, list) else 1
	        for item in left_column + right_column
	    )

	    if auto_size:
	        # max(..., 1) avoids a ZeroDivisionError when the markdown produced
	        # no content at all; the result is then clamped to the 12pt maximum.
	        base_font_size = max(6, min(12, 200 / max(total_items, 1)))

	    item_font_size = base_font_size
	    subitem_font_size = base_font_size * 0.9
	    section_font_size = base_font_size * 1.2
	    title_font_size = min(16, base_font_size * 1.5)

	    title_style = ParagraphStyle(
	        'Heading1',
	        parent=styles['Heading1'],
	        fontName=registered_font_name,
	        textColor=colors.darkblue,
	        alignment=1,
	        fontSize=title_font_size
	    )

	    section_style = ParagraphStyle(
	        'SectionStyle',
	        parent=styles['Heading2'],
	        fontName=registered_font_name,
	        textColor=colors.darkblue,
	        fontSize=section_font_size,
	        leading=section_font_size * 1.2,
	        spaceAfter=2
	    )

	    item_style = ParagraphStyle(
	        'ItemStyle',
	        parent=styles['Normal'],
	        fontName=registered_font_name,
	        fontSize=item_font_size,
	        leading=item_font_size * 1.2,
	        spaceAfter=1
	    )

	    subitem_style = ParagraphStyle(
	        'SubItemStyle',
	        parent=styles['Normal'],
	        fontName=registered_font_name,
	        fontSize=subitem_font_size,
	        leading=subitem_font_size * 1.2,
	        leftIndent=10,
	        spaceAfter=1
	    )

	    def build_cells(column):
	        # Turn one column of parsed entries into a flat list of Paragraphs.
	        cells = []
	        for item in column:
	            if isinstance(item, list):
	                main_item, sub_items = item
	                cells.append(Paragraph(main_item, item_style))
	                cells.extend(
	                    Paragraph(sub_item, subitem_style) for sub_item in sub_items
	                )
	            elif item.startswith('<b>'):
	                # Section headings arrive wrapped in <b> markers; the
	                # section style replaces them.
	                text = item.replace('<b>', '').replace('</b>', '')
	                cells.append(Paragraph(text, section_style))
	            else:
	                cells.append(Paragraph(item, item_style))
	        return cells

	    story.append(Paragraph("Cutting-Edge ML Outline (ReportLab)", title_style))
	    story.append(Spacer(1, spacer_height))

	    left_cells = build_cells(left_column)
	    right_cells = build_cells(right_column)

	    # Pad the shorter column so both have the same number of table rows.
	    max_cells = max(len(left_cells), len(right_cells))
	    left_cells.extend([""] * (max_cells - len(left_cells)))
	    right_cells.extend([""] * (max_cells - len(right_cells)))

	    table_data = list(zip(left_cells, right_cells))
	    # Usable page width: landscape width minus the two 36pt margins.
	    col_width = (A4[1] - 72) / 2.0

	    # With no content there are no rows; by default ReportLab rejects an
	    # empty Table, so only the title is emitted in that case.
	    if table_data:
	        table = Table(table_data, colWidths=[col_width, col_width], hAlign='CENTER')
	        table.setStyle(TableStyle([
	            ('VALIGN', (0, 0), (-1, -1), 'TOP'),
	            ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
	            ('BACKGROUND', (0, 0), (-1, -1), colors.white),
	            ('GRID', (0, 0), (-1, -1), 0, colors.white),
	            ('LINEAFTER', (0, 0), (0, -1), 0.5, colors.grey),
	            ('LEFTPADDING', (0, 0), (-1, -1), 2),
	            ('RIGHTPADDING', (0, 0), (-1, -1), 2),
	            ('TOPPADDING', (0, 0), (-1, -1), 1),
	            ('BOTTOMPADDING', (0, 0), (-1, -1), 1),
	        ]))
	        story.append(table)

	    doc.build(story)
	    buffer.seek(0)
	    return buffer.getvalue()

	# --- Function to Convert PDF Bytes to Image (for Preview) ---
	def pdf_to_image(pdf_bytes):
	    """Render the first page of a PDF as a PIL image for the preview.

	    Args:
	        pdf_bytes: The PDF document as bytes.

	    Returns:
	        An RGB ``PIL.Image`` of page 0 rendered with a 2x scaling matrix, or
	        ``None`` if opening or rendering fails (the error is shown in the
	        Streamlit UI).
	    """
	    try:
	        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
	        try:
	            page = doc[0]
	            pix = page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
	            return Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
	        finally:
	            # Close the document even when rendering raises, so a failed
	            # preview does not leave it open.
	            doc.close()
	    except Exception as e:
	        st.error(f"Failed to render PDF preview: {e}")
	        return None

	# --- Sidebar UI for Additional Settings ---
	# Either let the PDF generator choose the font size, or expose a slider for
	# a manual base size.
	with st.sidebar:
	    auto_size = st.checkbox("Auto-size text", value=True)
	    if auto_size:
	        # Placeholder value; the generator computes its own size in auto mode.
	        base_font_size = 10
	        st.info("Font size will auto-adjust between 6-12 points based on content length.")
	    else:
	        base_font_size = st.slider(
	            "Base Font Size (points)",
	            min_value=6,
	            max_value=16,
	            value=10,
	            step=1,
	        )

	# --- Persist Markdown Content in Session State ---
	# Seed the stored markdown with the default outline the first time through.
	if "markdown_content" not in st.session_state:
	    st.session_state["markdown_content"] = default_markdown

	# --- Generate PDF ---
	# Build the PDF from the stored markdown using the sidebar font settings.
	with st.spinner("Generating PDF..."):
	    pdf_bytes = create_main_pdf(
	        st.session_state["markdown_content"],
	        base_font_size=base_font_size,
	        auto_size=auto_size,
	    )

	# --- Display PDF Preview in UI ---
	# Show the rendered first page, or a hint to download when no preview image
	# could be produced.
	with st.container():
	    pdf_image = pdf_to_image(pdf_bytes)
	    if not pdf_image:
	        st.info("Download the PDF to view it locally.")
	    else:
	        st.image(pdf_image, use_container_width=True)

	# --- PDF Download Button ---
	st.download_button(
	    "Download PDF",
	    data=pdf_bytes,
	    file_name="ml_outline.pdf",
	    mime="application/pdf",
	)

	# --- Markdown Editor ---
	# Text area pre-filled with the stored markdown; edits take effect only
	# when the "Update PDF" button is pressed.
	edited_markdown = st.text_area(
	    "Modify the markdown content below:",
	    value=st.session_state.markdown_content,
	    height=300
	)

	# --- Update PDF on Button Click ---
	if st.button("Update PDF"):
	    st.session_state.markdown_content = edited_markdown
	    # Rerun so the PDF above is regenerated from the new text.
	    # st.experimental_rerun() has been removed from current Streamlit
	    # releases; st.rerun() is its replacement.
	    st.rerun()

	# --- Markdown Download Button ---
	# Offers the stored markdown (not unsaved editor text) as a .md file.
	st.download_button(
	    label="Save Markdown",
	    data=st.session_state.markdown_content,
	    file_name="ml_outline.md",
	    mime="text/markdown"
	)