# Streamlit app: upload a PDF, convert it to Markdown with PDF2Markdown4LLM,
# then offer the result as a download and show formatted/raw previews.
import os
import tempfile

import streamlit as st
from pdf2markdown4llm import PDF2Markdown4LLM

st.set_page_config(page_title="PDF to Markdown Converter", layout="wide")

st.title("PDF to Markdown Converter")
st.write("Convert your PDF files to Markdown format")


def progress_callback(progress) -> None:
    """Update the progress bar and status line from a converter progress event.

    Reads ``percentage``, ``phase.value``, ``current_page``, ``total_pages``
    and ``message`` from *progress*. Relies on the module-level
    ``progress_bar`` and ``status_text`` widgets, which are created just
    before the conversion starts.
    """
    # st.progress rejects floats outside [0.0, 1.0]; clamp so a slightly
    # out-of-range percentage cannot abort the conversion.
    fraction = min(max(progress.percentage / 100, 0.0), 1.0)
    progress_bar.progress(fraction)
    status_text.text(
        f"Phase: {progress.phase.value}, "
        f"Page {progress.current_page}/{progress.total_pages}\n"
        f"Progress: {progress.percentage:.1f}%, Message: {progress.message}"
    )


def format_markdown_for_preview(markdown_text: str) -> str:
    """Return *markdown_text* with a hard line break after each non-blank line.

    Markdown only honours a single newline as a line break when the line
    ends with two (or more) spaces, so two spaces are appended to every line
    that contains non-whitespace text. Blank and whitespace-only lines are
    left untouched.
    """
    return "\n".join(
        line + "  " if line.strip() else line
        for line in markdown_text.split("\n")
    )


# File upload
uploaded_file = st.file_uploader("Select a PDF file", type=["pdf"])

if uploaded_file is not None:
    # Configuration options
    with st.expander("Conversion Settings"):
        remove_headers = st.checkbox("Remove Headers", value=False)
        skip_empty_tables = st.checkbox("Skip Empty Tables", value=True)
        table_header = st.text_input("Table Header Format", value="### Table")

    if st.button("Start Conversion"):
        # Initialize progress bar and status text (used by progress_callback)
        progress_bar = st.progress(0)
        status_text = st.empty()

        try:
            # The converter is given a filesystem path, so the upload is
            # spooled to a named temporary file. delete=False lets the file
            # be closed before the converter opens it.
            tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
            tmp_file_path = tmp_file.name
            try:
                with tmp_file:
                    tmp_file.write(uploaded_file.getvalue())

                # Initialize converter
                converter = PDF2Markdown4LLM(
                    remove_headers=remove_headers,
                    skip_empty_tables=skip_empty_tables,
                    table_header=table_header,
                    progress_callback=progress_callback,
                )

                # Perform conversion
                markdown_content = converter.convert(tmp_file_path)
            finally:
                # Always remove the temporary file, including when the write
                # or the conversion fails.
                os.unlink(tmp_file_path)

            # Display results
            st.success("Conversion completed successfully!")

            # Raw markdown download
            st.download_button(
                label="Download Markdown File",
                data=markdown_content,
                file_name="converted.md",
                mime="text/markdown",
            )

            # Preview with proper formatting
            st.subheader("Preview")

            # Create tabs for different preview modes
            preview_tab, raw_tab = st.tabs(["Formatted Preview", "Raw Markdown"])

            with preview_tab:
                formatted_content = format_markdown_for_preview(markdown_content)
                st.markdown(formatted_content)

            with raw_tab:
                st.code(markdown_content, language="markdown")

        except Exception as e:
            # Top-level UI boundary: report the failure and clear the
            # progress widgets (both are always defined at this point).
            st.error(f"An error occurred: {str(e)}")
            progress_bar.empty()
            status_text.empty()