Spaces:

awacke1
/

Pillow-PyMuPDF-ReportLab

Running

App Files Files Community

Pillow-PyMuPDF-ReportLab / app.py

awacke1

Update app.py

db7d8ff verified 3 months ago

raw

history blame

12.5 kB

	import os
	import io
	import re
	import streamlit as st

	# Must be the very first Streamlit command.
	st.set_page_config(layout="wide", initial_sidebar_state="collapsed")

	from PIL import Image
	import fitz # PyMuPDF

	from reportlab.lib.pagesizes import A4
	from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
	from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
	from reportlab.lib import colors
	from reportlab.pdfbase import pdfmetrics
	from reportlab.pdfbase.ttfonts import TTFont

	# ---------------------------------------------------------------
	# Define available NotoEmoji fonts (all in the base directory now)
	# Emoji font choices: sidebar label -> TTF file name (a relative path,
	# so it is resolved against the app's working directory).
	available_fonts = {
	    "NotoEmoji Variable": "NotoEmoji-VariableFont_wght.ttf",
	    "NotoEmoji Bold": "NotoEmoji-Bold.ttf",
	    "NotoEmoji Light": "NotoEmoji-Light.ttf",
	    "NotoEmoji Medium": "NotoEmoji-Medium.ttf",
	    "NotoEmoji Regular": "NotoEmoji-Regular.ttf",
	    "NotoEmoji SemiBold": "NotoEmoji-SemiBold.ttf",
	}

	# The sidebar picker decides which emoji font the PDF uses.
	selected_font_name = st.sidebar.selectbox(
	    "Select NotoEmoji Font",
	    options=list(available_fonts),
	)
	selected_font_path = available_fonts[selected_font_name]

	# Make the chosen face known to ReportLab under its sidebar label; the
	# same label is later used as the face name when wrapping emoji.
	_selected_emoji_ttf = TTFont(selected_font_name, selected_font_path)
	pdfmetrics.registerFont(_selected_emoji_ttf)

	# ---------------------------------------------------------------
	# Helper function to wrap emoji characters with a font tag.
	# The pattern is compiled once at import time. One match is a run of emoji
	# code points, each optionally followed by variation selectors (U+FE0F) or
	# zero-width joiners (U+200D), so those modifiers stay in the same font run
	# as the emoji they follow instead of being left to the body font.
	_EMOJI_RUN_RE = re.compile(
	    r"((?:["
	    r"\U0001F300-\U0001F5FF"
	    r"\U0001F600-\U0001F64F"
	    r"\U0001F680-\U0001F6FF"
	    r"\U0001F700-\U0001F77F"
	    r"\U0001F780-\U0001F7FF"
	    r"\U0001F800-\U0001F8FF"
	    r"\U0001F900-\U0001F9FF"
	    r"\U0001FA00-\U0001FA6F"
	    r"\U0001FA70-\U0001FAFF"
	    r"\u2300-\u23FF"  # Miscellaneous Technical, e.g. the hourglass U+23F3
	    r"\u2600-\u26FF"
	    r"\u2700-\u27BF"
	    r"\u2B00-\u2BFF"  # Miscellaneous Symbols and Arrows, e.g. the star U+2B50
	    r"][\uFE0F\u200D]*)+)"
	)


	def apply_emoji_font(text, emoji_font):
	    """Wrap every run of emoji in ``text`` in a ``<font face="...">`` tag.

	    Args:
	        text: The string to scan. Text without emoji is returned unchanged.
	        emoji_font: Font face name to put in the ``face`` attribute.

	    Returns:
	        ``text`` with each emoji run replaced by
	        ``<font face="EMOJI_FONT">RUN</font>``.
	    """
	    opening_tag = '<font face="{}">'.format(emoji_font)

	    def _wrap(match):
	        # A callable replacement inserts the font name literally; a string
	        # template would interpret backslashes in it as regex escapes.
	        return opening_tag + match.group(0) + '</font>'

	    return _EMOJI_RUN_RE.sub(_wrap, text)

	# ---------------------------------------------------------------
	# Default markdown content with emojis.
	# It seeds st.session_state.markdown_content the first time the script runs.
	# Only "## " headings, "N." numbered items and "- " bullets are turned into
	# PDF content by markdown_to_pdf_content(); the "# " title line is skipped.
	# Each line is stripped before parsing, so leading whitespace inside the
	# literal does not affect how the outline is parsed.
	default_markdown = """# Cutting-Edge ML Outline

	## Core ML Techniques
	1. 🌟 Mixture of Experts (MoE)
	- Conditional computation techniques
	- Sparse gating mechanisms
	- Training specialized sub-models

	2. 🔥 Supervised Fine-Tuning (SFT) using PyTorch
	- Loss function customization
	- Gradient accumulation strategies
	- Learning rate schedulers

	3. 🤖 Large Language Models (LLM) using Transformers
	- Attention mechanisms
	- Tokenization strategies
	- Position encodings

	## Training Methods
	4. 📊 Self-Rewarding Learning using NPS 0-10 and Verbatims
	- Custom reward functions
	- Feedback categorization
	- Signal extraction from text

	5. 👍 Reinforcement Learning from Human Feedback (RLHF)
	- Preference datasets
	- PPO implementation
	- KL divergence constraints

	6. 🔗 MergeKit: Merging Models to Same Embedding Space
	- TIES merging
	- Task arithmetic
	- SLERP interpolation

	## Optimization & Deployment
	7. 📏 DistillKit: Model Size Reduction with Spectrum Analysis
	- Knowledge distillation
	- Quantization techniques
	- Model pruning strategies

	8. 🧠 Agentic RAG Agents using Document Inputs
	- Vector database integration
	- Query planning
	- Self-reflection mechanisms

	9. ⏳ Longitudinal Data Summarization from Multiple Docs
	- Multi-document compression
	- Timeline extraction
	- Entity tracking

	## Knowledge Representation
	10. 📑 Knowledge Extraction using Markdown Knowledge Graphs
	- Entity recognition
	- Relationship mapping
	- Hierarchical structuring

	11. 🗺️ Knowledge Mapping with Mermaid Diagrams
	- Flowchart generation
	- Sequence diagram creation
	- State diagrams

	12. 💻 ML Code Generation with Streamlit/Gradio/HTML5+JS
	- Code completion
	- Unit test generation
	- Documentation synthesis
	"""

	# ---------------------------------------------------------------
	# Process markdown into a two-column layout for the PDF.
	_NUMBERED_ITEM_RE = re.compile(r'^\d+\.')


	def markdown_to_pdf_content(markdown_text):
	    """Parse outline-style markdown into two columns of PDF entries.

	    Each line is stripped and classified:

	    * ``# title``    - skipped;
	    * ``## section`` - emitted as the string ``"<b>section</b>"`` and ends
	      the current numbered item;
	    * ``N. item``    - starts a new numbered item;
	    * ``- bullet``   - attached to the current numbered item;
	    * anything else  - emitted as a plain string when no numbered item is
	      open, and ignored while one is open.

	    Args:
	        markdown_text: The markdown source as a single string.

	    Returns:
	        A ``(left_column, right_column)`` tuple. Each column is a list whose
	        entries are either a string (section marker or plain line) or a
	        two-element list ``[item_text, sub_items]``. The entries are split at
	        ``len(entries) // 2``, so the right column gets the extra entry when
	        the count is odd.
	    """
	    entries = []
	    current_item = None
	    sub_items = []
	    in_list_item = False

	    for raw_line in markdown_text.strip().split('\n'):
	        line = raw_line.strip()
	        if not line:
	            continue

	        if line.startswith('# '):
	            # The main title is not part of the column content.
	            continue

	        if line.startswith('## '):
	            # Flush the open item even when it has no bullets; requiring
	            # bullets here would silently drop bullet-less items.
	            if current_item is not None:
	                entries.append([current_item, sub_items])
	            current_item, sub_items = None, []
	            # Slice off only the leading marker so a later "## " inside the
	            # heading text is preserved.
	            entries.append(f"<b>{line[3:].strip()}</b>")
	            in_list_item = False
	        elif _NUMBERED_ITEM_RE.match(line):
	            if current_item is not None:
	                entries.append([current_item, sub_items])
	            current_item, sub_items = line, []
	            in_list_item = True
	        elif in_list_item and line.startswith('- '):
	            sub_items.append(line)
	        elif not in_list_item:
	            entries.append(line)

	    # The last numbered item is still open when the input ends.
	    if current_item is not None:
	        entries.append([current_item, sub_items])

	    mid_point = len(entries) // 2
	    return entries[:mid_point], entries[mid_point:]

	# ---------------------------------------------------------------
	# Create the PDF using ReportLab.
	def _count_outline_rows(columns):
	    """Return the number of table rows the parsed columns will produce."""
	    total = 0
	    for column in columns:
	        for entry in column:
	            if isinstance(entry, list):
	                # [item, sub_items]: one row for the item plus one per sub-item.
	                total += 1 + len(entry[1])
	            else:
	                total += 1
	    return total


	def _emoji_paragraph(text, style):
	    """Build a Paragraph whose emoji runs use the selected emoji font."""
	    return Paragraph(apply_emoji_font(text, selected_font_name), style)


	def _column_cells(column, section_style, item_style, subitem_style):
	    """Flatten one parsed column into a list of Paragraph cells."""
	    cells = []
	    for entry in column:
	        if isinstance(entry, list):
	            main_item, sub_items = entry
	            cells.append(_emoji_paragraph(main_item, item_style))
	            cells.extend(_emoji_paragraph(sub_item, subitem_style) for sub_item in sub_items)
	        elif entry.startswith('<b>'):
	            # Section markers arrive as "<b>name</b>"; the section style
	            # already supplies the bold face, so the tags are removed.
	            heading = entry.replace('<b>', '').replace('</b>', '')
	            cells.append(_emoji_paragraph(heading, section_style))
	        else:
	            cells.append(_emoji_paragraph(entry, item_style))
	    return cells


	def create_main_pdf(markdown_text, base_font_size=10, auto_size=False):
	    """Render the outline markdown as a two-column PDF and return its bytes.

	    The page is A4 with width and height swapped and 36pt margins. The
	    content produced by ``markdown_to_pdf_content`` is laid out as one
	    two-column table under a fixed title. Emoji are wrapped with the
	    module-level ``selected_font_name``.

	    Args:
	        markdown_text: Markdown source to render.
	        base_font_size: Point size for numbered items; the other sizes are
	            derived from it. Ignored when ``auto_size`` is true.
	        auto_size: When true, the base size is ``200 / row_count`` clamped
	            to the range 6-12.

	    Returns:
	        The generated PDF as ``bytes``.
	    """
	    margin = 36
	    page_width, page_height = A4[1], A4[0]

	    buffer = io.BytesIO()
	    doc = SimpleDocTemplate(
	        buffer,
	        pagesize=(page_width, page_height),
	        leftMargin=margin,
	        rightMargin=margin,
	        topMargin=margin,
	        bottomMargin=margin,
	    )

	    styles = getSampleStyleSheet()
	    spacer_height = 10
	    left_column, right_column = markdown_to_pdf_content(markdown_text)

	    if auto_size:
	        total_items = _count_outline_rows((left_column, right_column))
	        # max(..., 1) keeps empty content from raising ZeroDivisionError;
	        # the clamp then yields the 12pt upper bound.
	        base_font_size = max(6, min(12, 200 / max(total_items, 1)))

	    item_font_size = base_font_size
	    subitem_font_size = base_font_size * 0.9
	    section_font_size = base_font_size * 1.2
	    title_font_size = min(16, base_font_size * 1.5)

	    # Define ParagraphStyles using Helvetica for normal text.
	    title_style = ParagraphStyle(
	        'Heading1',
	        parent=styles['Heading1'],
	        fontName="Helvetica-Bold",
	        textColor=colors.darkblue,
	        alignment=1,
	        fontSize=title_font_size,
	    )
	    section_style = ParagraphStyle(
	        'SectionStyle',
	        parent=styles['Heading2'],
	        fontName="Helvetica-Bold",
	        textColor=colors.darkblue,
	        fontSize=section_font_size,
	        leading=section_font_size * 1.2,
	        spaceAfter=2,
	    )
	    item_style = ParagraphStyle(
	        'ItemStyle',
	        parent=styles['Normal'],
	        fontName="Helvetica",
	        fontSize=item_font_size,
	        leading=item_font_size * 1.2,
	        spaceAfter=1,
	    )
	    subitem_style = ParagraphStyle(
	        'SubItemStyle',
	        parent=styles['Normal'],
	        fontName="Helvetica",
	        fontSize=subitem_font_size,
	        leading=subitem_font_size * 1.2,
	        leftIndent=10,
	        spaceAfter=1,
	    )

	    story = [
	        _emoji_paragraph("Cutting-Edge ML Outline (ReportLab)", title_style),
	        Spacer(1, spacer_height),
	    ]

	    left_cells = _column_cells(left_column, section_style, item_style, subitem_style)
	    right_cells = _column_cells(right_column, section_style, item_style, subitem_style)

	    # Pad the shorter column with empty cells so both have the same row count.
	    max_cells = max(len(left_cells), len(right_cells))
	    left_cells.extend([""] * (max_cells - len(left_cells)))
	    right_cells.extend([""] * (max_cells - len(right_cells)))

	    table_data = list(zip(left_cells, right_cells))
	    # NOTE(review): ReportLab's default handling of a Table with no rows is
	    # to raise, so the table is only added when there is content - confirm
	    # against the installed ReportLab version.
	    if table_data:
	        col_width = (page_width - 2 * margin) / 2.0
	        table = Table(table_data, colWidths=[col_width, col_width], hAlign='CENTER')
	        table.setStyle(TableStyle([
	            ('VALIGN', (0, 0), (-1, -1), 'TOP'),
	            ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
	            ('BACKGROUND', (0, 0), (-1, -1), colors.white),
	            ('GRID', (0, 0), (-1, -1), 0, colors.white),
	            ('LINEAFTER', (0, 0), (0, -1), 0.5, colors.grey),
	            ('LEFTPADDING', (0, 0), (-1, -1), 2),
	            ('RIGHTPADDING', (0, 0), (-1, -1), 2),
	            ('TOPPADDING', (0, 0), (-1, -1), 1),
	            ('BOTTOMPADDING', (0, 0), (-1, -1), 1),
	        ]))
	        story.append(table)

	    doc.build(story)
	    return buffer.getvalue()

	# ---------------------------------------------------------------
	# Convert PDF bytes to an image for preview using PyMuPDF.
	def pdf_to_image(pdf_bytes):
	    """Render the first page of a PDF as a PIL image.

	    Args:
	        pdf_bytes: The PDF document as bytes.

	    Returns:
	        An RGB ``PIL.Image`` of page 0 rendered with a 2x zoom matrix, or
	        ``None`` if opening or rendering fails. Failures are reported to the
	        user through ``st.error`` rather than raised.
	    """
	    try:
	        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
	        try:
	            page = doc[0]
	            pix = page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
	            return Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
	        finally:
	            # Close the document even when rendering raises, so a failed
	            # preview does not leave it open.
	            doc.close()
	    except Exception as e:
	        # Deliberately broad: the preview is best-effort and the caller
	        # falls back to a download hint when None is returned.
	        st.error(f"Failed to render PDF preview: {e}")
	        return None

	# ---------------------------------------------------------------
	# Sidebar controls for the text size.
	with st.sidebar:
	    auto_size = st.checkbox("Auto-size text", value=True)
	    if auto_size:
	        # Fixed placeholder value; the sidebar note says the size is
	        # auto-adjusted in this mode.
	        base_font_size = 10
	        st.info("Font size will auto-adjust between 6-12 points based on content length.")
	    else:
	        base_font_size = st.slider(
	            "Base Font Size (points)",
	            min_value=6,
	            max_value=16,
	            value=10,
	            step=1,
	        )

	# Seed the editable markdown once; later reruns keep what is stored.
	if 'markdown_content' not in st.session_state:
	    st.session_state['markdown_content'] = default_markdown

	# ---------------------------------------------------------------
	# Build the PDF from the markdown currently held in session state.
	with st.spinner("Generating PDF..."):
	    pdf_bytes = create_main_pdf(
	        st.session_state.markdown_content,
	        base_font_size,
	        auto_size,
	    )

	# Show a rendered preview, or a hint when no preview image is available.
	with st.container():
	    pdf_image = pdf_to_image(pdf_bytes)
	    if not pdf_image:
	        st.info("Download the PDF to view it locally.")
	    else:
	        st.image(pdf_image, use_container_width=True)

	# Offer the generated PDF for download.
	st.download_button(
	    label="Download PDF",
	    data=pdf_bytes,
	    file_name="ml_outline.pdf",
	    mime="application/pdf",
	)

	# Markdown editor, pre-filled with the content the current PDF was built from.
	edited_markdown = st.text_area(
	    "Modify the markdown content below:",
	    value=st.session_state.markdown_content,
	    height=300,
	)

	# Update PDF on button click: store the edited text and rerun the script so
	# the PDF above is regenerated from it.
	if st.button("Update PDF"):
	    st.session_state.markdown_content = edited_markdown
	    # st.experimental_rerun() is no longer available in current Streamlit
	    # releases; prefer st.rerun() and fall back only where it is missing.
	    _rerun = getattr(st, "rerun", None) or st.experimental_rerun
	    _rerun()

	# Markdown Download button. It saves the content stored in session state,
	# i.e. the text last applied with "Update PDF", not unsaved editor changes.
	st.download_button(
	    label="Save Markdown",
	    data=st.session_state.markdown_content,
	    file_name="ml_outline.md",
	    mime="text/markdown",
	)