Files changed (1) hide show
  1. app.py +844 -0
app.py ADDED
@@ -0,0 +1,844 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MarkdownMuse: A Python application for converting Markdown to beautifully formatted PDFs
3
+
4
+ This module implements the core functionality needed for the MarkdownMuse application.
5
+ """
6
+
7
+ import os
8
+ import re
9
+ import sys
10
+ import glob
11
+ import logging
12
+ from typing import List, Dict, Any, Optional, Tuple
13
+ from bs4 import BeautifulSoup
14
+ import markdown
15
+ from markdown.extensions.tables import TableExtension
16
+ from markdown.extensions.fenced_code import FencedCodeExtension
17
+ from markdown.extensions.toc import TocExtension
18
+ from reportlab.lib.pagesizes import letter, A4
19
+ from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
20
+ from reportlab.lib.units import inch
21
+ from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, Image, PageBreak, Preformatted, ListFlowable, ListItem
22
+ from reportlab.lib.colors import HexColor, black, grey
23
+ from reportlab.lib import colors
24
+ from reportlab.lib.enums import TA_LEFT, TA_CENTER, TA_RIGHT
25
+ import html
26
+ import base64
27
+ import requests
28
+ from PIL import Image as PilImage
29
+ import io
30
+ import tempfile
31
+
32
# Configure logging at import time: INFO and above, timestamped records.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
38
+
39
+ class MarkdownToPDFConverter:
40
+ """
41
+ Class to convert Markdown content to PDF using ReportLab.
42
+ """
43
def __init__(
    self,
    output_path: str = "output.pdf",
    page_size: str = "A4",
    margins: Tuple[float, float, float, float] = (0.75, 0.75, 0.75, 0.75),
    font_name: str = "Helvetica",
    base_font_size: int = 10,
    heading_scale: Optional[Dict[int, float]] = None,
    include_toc: bool = True,
    code_style: str = "github"
):
    """
    Initialize the converter with configuration options.

    Args:
        output_path: Path to save the PDF
        page_size: Page size name ("A4", case-insensitive; any other string
            selects letter). An already-resolved (width, height) pair is
            also accepted and used as-is.
        margins: Tuple of margins (left, right, top, bottom) in inches
        font_name: Base font name to use
        base_font_size: Base font size in points
        heading_scale: Dictionary of heading levels to font size multipliers
        include_toc: Whether to include a table of contents
        code_style: Style to use for code blocks
    """
    self.output_path = output_path

    if isinstance(page_size, str):
        self.page_size = A4 if page_size.upper() == "A4" else letter
    else:
        # A (width, height) pair, e.g. another converter's resolved
        # page_size; calling .upper() on it would raise AttributeError.
        self.page_size = tuple(page_size)

    self.margins = margins
    self.font_name = font_name
    self.base_font_size = base_font_size

    if heading_scale:
        # Copy so later changes to the caller's dict do not leak in.
        self.heading_scale = dict(heading_scale)
    else:
        self.heading_scale = {
            1: 2.0,  # H1 is 2.0x base font size
            2: 1.7,  # H2 is 1.7x base font size
            3: 1.4,  # H3 is 1.4x base font size
            4: 1.2,  # H4 is 1.2x base font size
            5: 1.1,  # H5 is 1.1x base font size
            6: 1.0,  # H6 is 1.0x base font size
        }

    self.include_toc = include_toc
    self.code_style = code_style

    # Initialize styles
    self.styles = getSampleStyleSheet()
    self._setup_styles()

    # Initialize document elements
    self.elements = []
    self.toc_entries = []
90
+
91
def _setup_styles(self) -> None:
    """Configure the paragraph styles used to build the document."""
    base = self.base_font_size
    bold_font = f'{self.font_name}-Bold'

    # Body text: retune the sample sheet's Normal style in place.
    normal = self.styles['Normal']
    normal.fontName = self.font_name
    normal.fontSize = base
    normal.leading = base * 1.2
    normal.spaceAfter = base * 0.8

    # Headings H1..H6: update the style if the sheet has it, else add it.
    for level in range(1, 7):
        scaled = base * self.heading_scale.get(level, 1.0)
        heading_name = f'Heading{level}'
        heading_attrs = {
            'parent': normal,
            'fontName': bold_font,
            'fontSize': int(scaled),
            'leading': int(scaled * 1.2),
            'spaceAfter': base,
            'spaceBefore': base * (1 + (0.2 * (7 - level))),
        }

        if heading_name in self.styles:
            existing = self.styles[heading_name]
            for attr_name, attr_value in heading_attrs.items():
                setattr(existing, attr_name, attr_value)
        else:
            self.styles.add(ParagraphStyle(name=heading_name, **heading_attrs))

    # Fenced / preformatted code
    code_block = ParagraphStyle(
        name='CodeBlock',
        fontName='Courier',
        fontSize=base * 0.9,
        leading=base * 1.1,
        spaceAfter=base,
        spaceBefore=base,
        leftIndent=base,
        backgroundColor=HexColor('#EEEEEE'),
        borderWidth=0,
        borderPadding=base * 0.5,
    )
    self.styles.add(code_block)

    # List entries: hanging indent relative to the body text
    list_item = ParagraphStyle(
        name='ListItem',
        parent=normal,
        leftIndent=base * 2,
        firstLineIndent=-base,
    )
    self.styles.add(list_item)

    # Table of contents title
    toc_heading = ParagraphStyle(
        name='TOCHeading',
        parent=self.styles['Heading1'],
        fontSize=int(base * 1.5),
        spaceAfter=base * 1.5,
    )
    self.styles.add(toc_heading)

    # Table of contents entries: deeper levels are indented and smaller
    for level in (1, 2, 3):
        depth = level - 1
        self.styles.add(
            ParagraphStyle(
                name=f'TOC{level}',
                parent=normal,
                leftIndent=base * depth * 2,
                fontSize=base - depth,
                leading=base * 1.4,
            )
        )
173
+
174
def convert_file(self, md_file_path: str) -> None:
    """
    Read one markdown file and render it to PDF.

    Args:
        md_file_path: Path to the markdown file (read as UTF-8)
    """
    with open(md_file_path, mode='r', encoding='utf-8') as source:
        markdown_text = source.read()

    # Hand the text to the string-based conversion pipeline
    self.convert_content(markdown_text)
187
+
188
def convert_content(self, md_content: str) -> None:
    """
    Convert markdown content string to PDF.

    The PDF is written to ``self.output_path``.

    Args:
        md_content: Markdown content as a string
    """
    # Start from a clean slate: without this, calling the method twice on
    # the same converter would repeat the previous document's flowables
    # and TOC entries in the new PDF.
    self.elements = []
    self.toc_entries = []

    # Markdown -> HTML -> ReportLab flowables -> PDF
    html_content = self._md_to_html(md_content)
    self._html_to_elements(html_content)
    self._generate_pdf()

    logger.info(f"PDF created at {self.output_path}")
205
+
206
def convert_multiple_files(self, md_file_paths: List[str],
                           merge: bool = True,
                           separate_toc: bool = False) -> None:
    """
    Convert multiple markdown files to PDF.

    Args:
        md_file_paths: List of paths to markdown files
        merge: Whether to merge all files into a single PDF written to
            this converter's output path. When False, each file is
            converted to a PDF next to it (same name, ``.pdf`` extension).
        separate_toc: Whether to include a separate TOC for each file
            (only used when ``merge`` is False)
    """
    if merge:
        sections = []
        add_file_headings = len(md_file_paths) > 1

        for file_path in md_file_paths:
            logger.info(f"Processing {file_path}")
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Add file name as heading if more than one file
            if add_file_headings:
                file_name = os.path.splitext(os.path.basename(file_path))[0]
                content = f"# {file_name}\n\n{content}"

            # Add page break between files
            if sections:
                sections.append("\n\n<div class='page-break'></div>\n\n")

            sections.append(content)

        self.convert_content("\n".join(sections))
        return

    # Process each file separately with a fresh converter per file.
    for file_path in md_file_paths:
        converter = MarkdownToPDFConverter(
            output_path=f"{os.path.splitext(file_path)[0]}.pdf",
            # The constructor expects a size *name*; self.page_size is the
            # resolved (width, height) tuple, and passing it would fail on
            # page_size.upper(). Use a placeholder and copy the size below.
            page_size="A4",
            margins=self.margins,
            font_name=self.font_name,
            base_font_size=self.base_font_size,
            heading_scale=self.heading_scale,
            include_toc=separate_toc,
            code_style=self.code_style
        )
        converter.page_size = self.page_size
        converter.convert_file(file_path)
252
+
253
def _md_to_html(self, md_content: str) -> str:
    """
    Render markdown text to an HTML string.

    Args:
        md_content: Markdown content

    Returns:
        HTML content
    """
    active_extensions = [
        'markdown.extensions.extra',
        'markdown.extensions.smarty',
        TableExtension(),
        FencedCodeExtension(),
    ]

    # The TOC extension is only enabled when a table of contents is wanted
    if self.include_toc:
        active_extensions.append(TocExtension(toc_depth=3))

    return markdown.markdown(md_content, extensions=active_extensions)
278
+
279
def _html_to_elements(self, html_content: str) -> None:
    """
    Parse HTML and feed each top-level tag to the element processor.

    Args:
        html_content: HTML content
    """
    parsed = BeautifulSoup(html_content, 'html.parser')

    for node in parsed.children:
        # Nodes without a tag name (e.g. bare text) are skipped
        if not node.name:
            continue
        self._process_element(node)
292
+
293
def _process_element(self, element: BeautifulSoup) -> None:
    """
    Process an HTML element and convert it to ReportLab elements.

    Handles headings, paragraphs (including images inside them), code
    blocks, images, lists, tables, page-break divs, horizontal rules and
    generic containers. Results are appended to ``self.elements``; other
    tags are ignored.

    Args:
        element: BeautifulSoup element
    """
    name = element.name

    if name in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
        level = int(name[1])
        text = element.get_text()

        # Add to TOC
        if self.include_toc:
            self.toc_entries.append((level, text))

        # Paragraph parses its text as markup, so a literal '&' or '<'
        # in the heading must be escaped first.
        self.elements.append(
            Paragraph(html.escape(text, quote=False), self.styles[f'Heading{level}'])
        )

    elif name == 'p':
        # Markdown wraps images in <p>, so they have to be picked up
        # here or they would never be rendered.
        images = element.find_all('img')
        text = self._process_inline_elements(element)
        if text.strip() or not images:
            self.elements.append(Paragraph(text, self.styles['Normal']))
        for img in images:
            self._append_image(img)

    elif name == 'pre':
        self.elements.append(
            Preformatted(element.get_text(), self.styles['CodeBlock'])
        )

    elif name == 'img':
        self._append_image(element)

    elif name in ('ul', 'ol'):
        self._append_list(element)

    elif name == 'table':
        self._process_table(element)

    elif name == 'div' and 'page-break' in element.get('class', []):
        self.elements.append(PageBreak())

    elif name == 'hr':
        self.elements.append(Spacer(1, 0.25*inch))

    # Process children for complex elements
    elif name in ('div', 'blockquote', 'section', 'article'):
        for child in element.children:
            if getattr(child, 'name', None):
                self._process_element(child)

def _append_image(self, element) -> None:
    """Append the image referenced by an <img> tag, or a text placeholder."""
    src = element.get('src', '')
    alt = element.get('alt', 'Image')

    source = None
    try:
        if src.startswith(('http://', 'https://')):
            # Remote image: bounded wait, and HTTP errors count as failure
            response = requests.get(src, timeout=10)
            response.raise_for_status()
            source = io.BytesIO(response.content)
        elif src.startswith('data:image'):
            # Base64 data URI: payload follows the first comma
            source = io.BytesIO(base64.b64decode(src.split(',', 1)[1]))
        elif os.path.exists(src):
            # Local image
            source = src
    except (requests.RequestException, ValueError, IndexError) as exc:
        logger.warning("Could not load image %r: %s", src[:80], exc)
        source = None

    image = self._sized_image(source) if source is not None else None
    if image is None:
        # If the image can't be retrieved or read, add a placeholder
        self.elements.append(
            Paragraph(f"[Image: {html.escape(alt, quote=False)}]", self.styles['Normal'])
        )
        return

    self.elements.append(image)

def _sized_image(self, source):
    """Build an Image scaled to fit the page with its aspect ratio kept; None if unreadable."""
    max_width = 6 * inch
    max_height = 8 * inch

    try:
        pil_img = PilImage.open(source)
        px_width, px_height = pil_img.size
    except (OSError, ValueError) as exc:
        logger.warning("Unreadable image data: %s", exc)
        return None

    if isinstance(source, str):
        # PIL opened the file itself; release the handle.
        pil_img.close()
    else:
        # PIL consumed part of the stream; rewind it for ReportLab.
        source.seek(0)

    if px_width <= 0 or px_height <= 0:
        return None

    # Pixel dimensions are used as points, then shrunk to fit.
    scale = min(1.0, max_width / px_width, max_height / px_height)
    return Image(source, width=px_width * scale, height=px_height * scale)

def _append_list(self, element) -> None:
    """Append a <ul>/<ol> element as a ReportLab ListFlowable."""
    items = [
        ListItem(
            Paragraph(self._process_inline_elements(li), self.styles['ListItem']),
            leftIndent=20
        )
        for li in element.find_all('li', recursive=False)
    ]
    if not items:
        return

    if element.name == 'ol':
        # '1' selects arabic numbering; 'numbered' is not a bullet type
        # ReportLab accepts.
        flowable = ListFlowable(items, bulletType='1', start=1, bulletFormat='%s.')
    else:
        flowable = ListFlowable(items, bulletType='bullet', start=None, bulletFormat='%s')

    self.elements.append(flowable)
419
+
420
+ def _process_inline_elements(self, element: BeautifulSoup) -> str:
421
+ """
422
+ Process inline HTML elements like bold, italic, etc.
423
+
424
+ Args:
425
+ element: BeautifulSoup element
426
+
427
+ Returns:
428
+ Formatted text with ReportLab markup
429
+ """
430
+ html_str = str(element)
431
+
432
+ # Convert common HTML tags to ReportLab paragraph markup
433
+ replacements = [
434
+ (r'<strong>(.*?)</strong>', r'<b>\1</b>'),
435
+ (r'<b>(.*?)</b>', r'<b>\1</b>'),
436
+ (r'<em>(.*?)</em>', r'<i>\1</i>'),
437
+ (r'<i>(.*?)</i>', r'<i>\1</i>'),
438
+ (r'<code>(.*?)</code>', r'<font name="Courier">\1</font>'),
439
+ (r'<a href="(.*?)">(.*?)</a>', r'<link href="\1">\2</link>'),
440
+ (r'<u>(.*?)</u>', r'<u>\1</u>'),
441
+ (r'<strike>(.*?)</strike>', r'<strike>\1</strike>'),
442
+ (r'<del>(.*?)</del>', r'<strike>\1</strike>'),
443
+ ]
444
+
445
+ for pattern, replacement in replacements:
446
+ html_str = re.sub(pattern, replacement, html_str, flags=re.DOTALL)
447
+
448
+ # Extract text with our ReportLab markup from the modified HTML
449
+ soup = BeautifulSoup(html_str, 'html.parser')
450
+ return soup.get_text()
451
+
452
def _process_table(self, table_element: BeautifulSoup) -> None:
    """
    Turn an HTML table into a styled ReportLab Table followed by a spacer.

    Args:
        table_element: BeautifulSoup table element
    """
    def plain_cell(node):
        return Paragraph(self._process_inline_elements(node), self.styles['Normal'])

    def bold_cell(node):
        # Header cells are wrapped in <b> markup
        return Paragraph(f"<b>{self._process_inline_elements(node)}</b>", self.styles['Normal'])

    table_rows = []

    # Header row from <thead>, when present
    header = table_element.find('thead')
    if header:
        table_rows.append([bold_cell(th) for th in header.find_all(['th'])])

    # Body rows; fall back to the whole table when there is no <tbody>
    body = table_element.find('tbody') or table_element
    for tr in body.find_all('tr'):
        if tr.parent.name == 'thead':
            continue  # header rows were handled above

        cells = [
            bold_cell(cell) if cell.name == 'th' else plain_cell(cell)
            for cell in tr.find_all(['td', 'th'])
        ]
        if cells:  # rows without cells are dropped
            table_rows.append(cells)

    if not table_rows:
        return

    # One auto-width entry per column of the first row
    auto_widths = [None] * len(table_rows[0])
    pdf_table = Table(table_rows, colWidths=auto_widths)

    first_row = ((0, 0), (-1, 0))
    other_rows = ((0, 1), (-1, -1))
    pdf_table.setStyle(TableStyle([
        ('GRID', (0, 0), (-1, -1), 0.5, colors.Color(0.7, 0.7, 0.7)),
        ('BACKGROUND', *first_row, colors.Color(0.8, 0.8, 0.8)),
        ('TEXTCOLOR', *first_row, colors.black),
        ('ALIGN', *first_row, 'CENTER'),
        ('FONTNAME', *first_row, f'{self.font_name}-Bold'),
        ('BOTTOMPADDING', *first_row, 8),
        ('TOPPADDING', *first_row, 8),
        ('BOTTOMPADDING', *other_rows, 6),
        ('TOPPADDING', *other_rows, 6),
    ]))

    self.elements.append(pdf_table)
    # Small gap after the table
    self.elements.append(Spacer(1, 0.1*inch))
512
+
513
def _generate_toc(self) -> None:
    """
    Generate a table of contents and place it at the front of the document.

    Builds a title, one entry per collected heading up to level 3, and a
    trailing page break, then inserts them before the existing elements.
    Does nothing when no headings were collected.
    """
    if not self.toc_entries:
        return

    toc = [
        Paragraph("Table of Contents", self.styles['TOCHeading']),
        Spacer(1, 0.2*inch),
    ]
    toc.extend(
        # Heading text is raw text; escape it for Paragraph's markup parser
        Paragraph(html.escape(text, quote=False), self.styles[f'TOC{level}'])
        for level, text in self.toc_entries
        if level <= 3  # Only include headings up to level 3
    )
    toc.append(PageBreak())

    # Insert ahead of the content: appending would put the TOC at the
    # very end of the PDF, because the body is already in self.elements.
    self.elements[:0] = toc
528
+
529
def _generate_pdf(self) -> None:
    """Build the PDF at the configured output path from the collected elements."""
    # Margins are stored in inches as (left, right, top, bottom)
    margin_kwargs = {
        'leftMargin': self.margins[0]*inch,
        'rightMargin': self.margins[1]*inch,
        'topMargin': self.margins[2]*inch,
        'bottomMargin': self.margins[3]*inch,
    }
    document = SimpleDocTemplate(
        self.output_path,
        pagesize=self.page_size,
        **margin_kwargs
    )

    # Table of contents only when requested and headings were collected
    wants_toc = self.include_toc and self.toc_entries
    if wants_toc:
        self._generate_toc()

    document.build(self.elements)
547
+
548
+
549
+ class MarkdownToPDFAgent:
550
+ """
551
+ AI Agent to convert Markdown files to PDF with enhanced formatting.
552
+ """
553
+
554
def __init__(self, llm=None):
    """
    Create an agent, optionally backed by a language model.

    Args:
        llm: Language model used for content enhancement, or None to run
            without one
    """
    # The model is stored as given, alongside a default-configured converter.
    self.llm, self.converter = llm, MarkdownToPDFConverter()
563
+
564
def setup_from_openai(self, api_key=None, model: str = "gpt-4") -> bool:
    """
    Setup agent with OpenAI LLM.

    Args:
        api_key: OpenAI API key (will use env var OPENAI_API_KEY if not provided)
        model: OpenAI chat model name

    Returns:
        True if the LLM was configured, False if the package or key is
        missing or the client could not be created
    """
    try:
        from langchain_openai import ChatOpenAI
    except ImportError:
        logger.warning("LangChain OpenAI package not found. Install with 'pip install langchain-openai'")
        return False

    api_key = api_key or os.getenv("OPENAI_API_KEY")
    if not api_key:
        logger.warning("No OpenAI API key provided. Agent will run without LLM enhancement.")
        return False

    # Mirror setup_from_gemini: a failing client constructor is reported
    # and turned into False instead of propagating to the caller.
    try:
        self.llm = ChatOpenAI(
            model=model,
            temperature=0.1,
            api_key=api_key
        )
    except Exception as e:
        logger.error(f"Error setting up OpenAI LLM: {str(e)}")
        return False

    logger.info("Successfully set up OpenAI LLM")
    return True
588
+
589
def setup_from_gemini(self, api_key=None):
    """
    Configure the agent to use a Google Gemini chat model.

    Args:
        api_key: Google Gemini API key (falls back to the GOOGLE_API_KEY
            environment variable)

    Returns:
        True when the model was set up, otherwise False
    """
    try:
        from langchain_google_genai import ChatGoogleGenerativeAI
    except ImportError:
        logger.warning("LangChain Google Generative AI package not found. Install with 'pip install langchain-google-genai'")
        return False

    resolved_key = api_key or os.getenv("GOOGLE_API_KEY")
    if not resolved_key:
        logger.warning("No Google API key provided. Agent will run without LLM enhancement.")
        return False

    try:
        # Model is pinned to gemini-1.5-flash
        self.llm = ChatGoogleGenerativeAI(
            model="gemini-1.5-flash",
            temperature=0.1,
            google_api_key=resolved_key,
            convert_system_message_to_human=True
        )
        logger.info("Successfully set up Google Gemini LLM")
        return True
    except Exception as e:
        logger.error(f"Error setting up Google Gemini LLM: {str(e)}")
        return False
620
+
621
def enhance_markdown(self, content: str, instructions: Optional[str] = None) -> str:
    """
    Enhance markdown content using LLM if available.

    Any failure (no LLM, invocation error, unusable or empty response)
    falls back to returning the original content unchanged.

    Args:
        content: Original markdown content
        instructions: Specific enhancement instructions (a default set
            is used when omitted)

    Returns:
        Enhanced markdown content, or the original content on failure
    """
    if not self.llm:
        logger.warning("No LLM available for enhancement. Returning original content.")
        return content

    default_instructions = """
    Enhance this markdown content while preserving its structure and meaning.
    Make the following improvements:
    1. Fix any grammar or spelling issues
    2. Improve formatting for better readability
    3. Ensure proper markdown syntax is used
    4. Add appropriate section headings if missing
    5. Keep the content factually identical to the original
    """

    instructions = instructions or default_instructions

    # Create a prompt for the LLM
    prompt = f"{instructions}\n\nOriginal content:\n\n{content}\n\nPlease provide the enhanced markdown content:"

    try:
        logger.info(f"Using LLM type: {type(self.llm).__name__}")
        # The prompt is passed as a plain string, so the separate
        # `langchain` package is not needed just to build a message.
        response = self.llm.invoke(prompt)
        logger.info("Successfully received response from LLM")
    except Exception as e:
        logger.error(f"Error invoking LLM: {str(e)}")
        return content

    # Use the response's .content when it has one, else the response itself.
    raw = getattr(response, "content", response)
    if not isinstance(raw, str):
        logger.error(f"Error enhancing markdown: unexpected response type {type(raw).__name__}")
        return content

    try:
        # Clean up the result (extract just the markdown part)
        result = self._clean_agent_output(raw)
    except Exception as e:
        logger.error(f"Error enhancing markdown: {str(e)}")
        return content  # Return original content if enhancement fails

    if not result.strip():
        # An empty answer must not replace the document.
        logger.warning("LLM returned empty content. Returning original content.")
        return content

    return result
670
+
671
+ def _clean_agent_output(self, output: str) -> str:
672
+ """
673
+ Clean up agent output to extract just the markdown content.
674
+
675
+ Args:
676
+ output: Raw agent output
677
+
678
+ Returns:
679
+ Cleaned markdown content
680
+ """
681
+ # Check if the output is wrapped in markdown code blocks
682
+ md_pattern = r"```(?:markdown|md)?\s*([\s\S]*?)```"
683
+ match = re.search(md_pattern, output)
684
+
685
+ if match:
686
+ return match.group(1).strip()
687
+
688
+ # If no markdown blocks found, remove any agent commentary
689
+ lines = output.split('\n')
690
+ result_lines = []
691
+ capture = False
692
+
693
+ for line in lines:
694
+ if capture or not (line.startswith("I") or line.startswith("Here") or line.startswith("The")):
695
+ capture = True
696
+ result_lines.append(line)
697
+
698
+ return '\n'.join(result_lines)
699
+
700
def process_file(self, input_path: str, output_path: str = None, enhance: bool = False,
                 enhancement_instructions: str = None, page_size: str = "A4") -> str:
    """
    Convert one markdown file to PDF, optionally enhancing it first.

    Args:
        input_path: Path to input markdown file
        output_path: Path for output PDF (defaults to input path with .pdf extension)
        enhance: Whether to enhance the content with LLM
        enhancement_instructions: Specific instructions for enhancement
        page_size: Page size for the PDF ("A4" or "letter")

    Returns:
        Path to the generated PDF, or None when the input file does not exist
    """
    # Guard: nothing to do without an input file
    if not os.path.exists(input_path):
        logger.error(f"Input file not found: {input_path}")
        return None

    # Default destination sits next to the input, with a .pdf extension
    pdf_path = output_path if output_path else os.path.splitext(input_path)[0] + ".pdf"

    with open(input_path, mode='r', encoding='utf-8') as source:
        markdown_text = source.read()

    # Enhancement needs both the flag and a configured LLM
    should_enhance = enhance and self.llm
    if should_enhance:
        logger.info(f"Enhancing content for {input_path}")
        markdown_text = self.enhance_markdown(markdown_text, enhancement_instructions)

    # A fresh converter per file, bound to this output path
    self.converter = MarkdownToPDFConverter(
        output_path=pdf_path,
        page_size=page_size
    )

    logger.info(f"Converting {input_path} to PDF")
    self.converter.convert_content(markdown_text)

    return pdf_path
744
+
745
def process_directory(self, input_dir: str, output_dir: str = None, pattern: str = "*.md",
                      enhance: bool = False, merge: bool = False,
                      output_filename: str = "merged_document.pdf",
                      page_size: str = "A4") -> List[str]:
    """
    Convert every matching markdown file in a directory.

    Args:
        input_dir: Path to input directory
        output_dir: Path to output directory (defaults to input directory)
        pattern: Glob pattern for markdown files
        enhance: Whether to enhance content with LLM
        merge: Whether to merge all files into a single PDF
        output_filename: Filename for merged PDF
        page_size: Page size for the PDF ("A4" or "letter")

    Returns:
        List of paths to generated PDFs (empty when the directory is
        missing or nothing matches the pattern)
    """
    if not os.path.isdir(input_dir):
        logger.error(f"Input directory not found: {input_dir}")
        return []

    # Output goes next to the inputs unless a directory was given
    if not output_dir:
        output_dir = input_dir
    elif not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Sorted so that ordering is consistent between runs
    md_files = sorted(glob.glob(os.path.join(input_dir, pattern)))
    if not md_files:
        logger.warning(f"No markdown files found in {input_dir} with pattern {pattern}")
        return []

    if merge:
        logger.info(f"Merging {len(md_files)} markdown files into a single PDF")

        # With enhancement, the merged text is assembled here; otherwise
        # the converter reads and merges the files itself.
        merged_content = None
        if enhance and self.llm:
            enhanced_parts = []
            for md_file in md_files:
                logger.info(f"Enhancing content for {md_file}")
                with open(md_file, 'r', encoding='utf-8') as f:
                    text = f.read()

                # Each file is introduced by its name as a heading
                title = os.path.splitext(os.path.basename(md_file))[0]
                enhanced_parts.append(self.enhance_markdown(f"# {title}\n\n{text}"))

            # Files are separated by page breaks
            merged_content = "\n\n<div class='page-break'></div>\n\n".join(enhanced_parts)

        merged_path = os.path.join(output_dir, output_filename)
        self.converter = MarkdownToPDFConverter(
            output_path=merged_path,
            page_size=page_size
        )
        if merged_content is None:
            self.converter.convert_multiple_files(md_files, merge=True)
        else:
            self.converter.convert_content(merged_content)

        return [merged_path]

    # One PDF per markdown file
    generated = []
    for md_file in md_files:
        stem = os.path.splitext(os.path.basename(md_file))[0]
        pdf_path = os.path.join(output_dir, stem + ".pdf")

        result = self.process_file(
            md_file,
            pdf_path,
            enhance=enhance,
            page_size=page_size
        )
        if result:
            generated.append(result)

    return generated