# repo-to-md — app.py
# Author: VOIDER
# Last update: commit 6085d45 (verified)
import gradio as gr
import os
import subprocess
import tempfile
import zipfile
import pathlib
import shutil
from pathspec import PathSpec
from pathspec.patterns import GitWildMatchPattern
import traceback # Import traceback for better error logging
# --- Configuration --- (Keep as before)
# Gitignore-syntax patterns that are always excluded; merged with any
# user-supplied patterns before matching (see generate_markdown_structure).
DEFAULT_IGNORE_PATTERNS = """
# Default Ignore Patterns (Gitignore Syntax)
/.git/
/.hg/
/.svn/
/.vscode/
/.idea/
/node_modules/
/vendor/
/build/
/dist/
/target/
*.pyc
*.log
*.swp
*~
__pycache__/
.DS_Store
"""
# Hard cap on how many content lines of one file appear in the Markdown output.
MAX_OUTPUT_LINES = 10000  # Limit potential output size in display
# FIX: restored to four literal spaces — the trailing comment documents the
# intent, but the stored file had collapsed the literal to a single space.
INDENT_CHAR = "    "  # 4 spaces for indentation
# FIX: the stored file contained mojibake ("πŸ“" / "πŸ“„") from a UTF-8 file
# being re-read as latin-1; restored the intended folder/file emoji.
FOLDER_ICON = "📁"
FILE_ICON = "📄"
# --- Core Logic --- (Keep get_repo_path and generate_markdown_structure as before)
def get_repo_path(source_type, repo_url, branch_tag, zip_file_obj, progress=gr.Progress()):
    """Clone a git repository or extract an uploaded ZIP into a temp directory.

    Args:
        source_type: Either "URL" (shallow git clone) or "Upload ZIP".
        repo_url: Git repository URL (required when source_type == "URL").
        branch_tag: Optional branch or tag name to clone.
        zip_file_obj: Gradio file object for the uploaded ZIP (its ``.name``
            is a temp-file path on disk).
        progress: Gradio progress callback.

    Returns:
        ``(repo_path, temp_dir)``: the path containing the repository content,
        and the parent temp directory the caller must clean up.

    Raises:
        ValueError: on bad input, clone failure, or unsafe/unusable ZIP content.
    """
    temp_dir = tempfile.mkdtemp()
    try:
        if source_type == "URL":
            repo_path = _clone_repo(repo_url, branch_tag, temp_dir, progress)
        elif source_type == "Upload ZIP":
            repo_path = _extract_zip(zip_file_obj, temp_dir, progress)
        else:
            raise ValueError("Invalid source type selected.")
        if not repo_path or not repo_path.is_dir():
            print(f"Debug Info: Temp dir content: {list(os.listdir(temp_dir))}")
            raise ValueError(f"Could not determine repository root directory within: {temp_dir}")
        # Return both the repo content path and the parent temp dir for cleanup.
        return repo_path, temp_dir
    except Exception as e:
        # Clean up before propagating so callers never see a half-built temp dir.
        shutil.rmtree(temp_dir, ignore_errors=True)
        print(f"Error in get_repo_path: {e}")
        traceback.print_exc()
        raise  # FIX: bare raise preserves the original traceback (was `raise e`)


def _clone_repo(repo_url, branch_tag, temp_dir, progress):
    """Shallow-clone repo_url into temp_dir; retry on the default branch if branch_tag fails."""
    if not repo_url:
        raise ValueError("GitHub Repository URL is required.")
    progress(0.1, desc="Cloning repository...")
    git_command = ["git", "clone", "--depth", "1"]  # Shallow clone for speed
    if branch_tag:
        git_command.extend(["--branch", branch_tag])
    git_command.extend([repo_url, temp_dir])
    print(f"Running command: {' '.join(git_command)}")  # For debugging
    result = subprocess.run(git_command, capture_output=True, text=True, check=False)
    if result.returncode != 0 and branch_tag:
        # The named branch/tag may simply not exist; retry on the default branch.
        progress(0.2, desc=f"Branch '{branch_tag}' not found or clone failed, trying default branch...")
        git_command = ["git", "clone", "--depth", "1", repo_url, temp_dir]
        print(f"Running command: {' '.join(git_command)}")  # For debugging
        result = subprocess.run(git_command, capture_output=True, text=True, check=False)
    if result.returncode != 0:
        error_message = f"Git clone failed:\nSTDOUT:\n{result.stdout}\nSTDERR:\n{result.stderr}"
        print(error_message)  # Log detailed error
        # Try to extract a user-friendly message from git's stderr.
        if "Authentication failed" in result.stderr:
            raise ValueError("Authentication failed. Private repositories require different handling (e.g., tokens) which is not supported here.")
        elif "not found" in result.stderr:
            raise ValueError(f"Repository or branch '{branch_tag or 'default'}' not found at URL: {repo_url}")
        else:
            raise ValueError(f"Git clone failed. Check URL and branch/tag. Error: {result.stderr.splitlines()[-1] if result.stderr else 'Unknown error'}")
    progress(0.5, desc="Repository cloned.")
    print(f"Cloned repo to: {temp_dir}")  # Debugging
    return pathlib.Path(temp_dir)


def _extract_zip(zip_file_obj, temp_dir, progress):
    """Safely extract the uploaded ZIP into temp_dir and locate the repo root inside it."""
    if zip_file_obj is None:
        raise ValueError("ZIP file upload is required.")
    progress(0.1, desc="Extracting ZIP file...")
    zip_path = zip_file_obj.name  # Gradio provides a temp file path
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        members = [m for m in zip_ref.namelist() if m.strip('/')]
        # FIX (zip-slip): the archive is untrusted input — reject any member
        # whose resolved destination escapes temp_dir (e.g. "../../etc/passwd").
        root_real = os.path.realpath(temp_dir)
        for member in members:
            dest = os.path.realpath(os.path.join(temp_dir, member))
            if dest != root_real and not dest.startswith(root_real + os.sep):
                raise ValueError(f"Unsafe path in ZIP archive: {member}")
        zip_ref.extractall(temp_dir)
        # FIX: only treat the archive as "single top-level directory" (the
        # usual GitHub "repo-main/" layout) when EVERY entry lives under that
        # directory. The previous check ignored root-level files, which could
        # silently drop them from the output.
        first_parts = {m.split('/', 1)[0] for m in members}
        repo_root = temp_dir
        if len(first_parts) == 1:
            candidate = os.path.join(temp_dir, first_parts.pop())
            if os.path.isdir(candidate):
                repo_root = candidate
                print(f"ZIP has single top-level dir. Potential root: {repo_root}")
        else:
            print(f"ZIP structure seems flat or multi-root. Using extract target as root: {temp_dir}")
    progress(0.5, desc="ZIP extracted.")
    print(f"Extracted ZIP to: {repo_root}")  # Debugging
    return pathlib.Path(repo_root)
def generate_markdown_structure(
    repo_root_path: pathlib.Path,
    include_content: bool,
    max_size_kb: int,
    ignore_patterns_str: str,
    progress=gr.Progress()
):
    """Generate a Markdown tree (optionally with embedded file contents) for a repo.

    Args:
        repo_root_path: Directory whose tree is rendered.
        include_content: When True, embed file contents in fenced code blocks.
        max_size_kb: Per-file content size limit in KB; 0 disables content
            embedding even when ``include_content`` is True.
        ignore_patterns_str: User-supplied gitignore-style patterns, merged
            with DEFAULT_IGNORE_PATTERNS.
        progress: Gradio progress callback.

    Returns:
        The complete Markdown document as a single string.
    """
    repo_root_path = pathlib.Path(repo_root_path)  # Ensure it's a Path object
    markdown_lines = []
    max_file_size_bytes = max_size_kb * 1024 if max_size_kb > 0 else 0
    spec = _build_ignore_spec(ignore_patterns_str)
    # --- Header ---
    markdown_lines.append(f"# {FOLDER_ICON} {repo_root_path.name}")
    markdown_lines.append("")
    # --- Walk through the directory ---
    progress(0.6, desc="Scanning repository structure...")
    files_processed = 0
    # rglob yields every descendant; sorting gives a stable, depth-friendly order.
    all_items = sorted(repo_root_path.rglob('*'))
    total_items_estimate = max(1, len(all_items))
    for items_scanned, item_path in enumerate(all_items, start=1):
        if items_scanned % 50 == 0:  # Update progress periodically
            progress(0.6 + (0.3 * (items_scanned / total_items_estimate)), desc=f"Scanning: {item_path.name}")
        relative_path = item_path.relative_to(repo_root_path)
        # FIX: gitignore directory patterns (e.g. "node_modules/") only match
        # paths explicitly marked as directories, so append '/' when the item
        # is a dir. Without this, an ignored directory's children were skipped
        # but the directory line itself still appeared in the listing.
        path_str_for_match = str(relative_path) + ('/' if item_path.is_dir() else '')
        if spec.match_file(path_str_for_match):
            print(f"Ignoring: {relative_path}")  # Debugging
            # rglob already enumerated every path, so skipping this entry is
            # enough — no manual recursion pruning needed (unlike os.walk).
            continue
        depth = len(relative_path.parts) - 1  # 0-based depth relative to root content
        indent = INDENT_CHAR * depth
        if item_path.is_dir():
            # Empty-dir detection under ignore rules is complex with rglob;
            # directories are always listed.
            markdown_lines.append(f"{indent}{FOLDER_ICON} **{item_path.name}/**")
        elif item_path.is_file():
            markdown_lines.append(f"{indent}{FILE_ICON} {item_path.name}")
            files_processed += 1
            if include_content and max_size_kb > 0:  # Check > 0 explicitly
                _append_file_content(markdown_lines, item_path, indent, max_file_size_bytes)
            elif include_content and max_size_kb == 0:  # Content checked, but 0 size limit
                markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Max file size set to 0 KB]")
        # Blank line between entries for readability.
        markdown_lines.append("")
    progress(0.95, desc="Formatting output...")
    print(f"Processed {files_processed} files.")  # Debugging
    return "\n".join(markdown_lines)


def _build_ignore_spec(ignore_patterns_str: str):
    """Merge default + user ignore patterns into a de-duplicated PathSpec."""
    combined = DEFAULT_IGNORE_PATTERNS.strip() + "\n" + ignore_patterns_str.strip()
    # Drop blank lines and comments.
    patterns = [line for line in combined.splitlines() if line.strip() and not line.strip().startswith('#')]
    # De-duplicate while preserving order (later patterns may override earlier ones).
    seen = set()
    unique_patterns = [p for p in patterns if not (p in seen or seen.add(p))]
    print(f"Using unique ignore patterns: {unique_patterns}")  # Debugging
    return PathSpec.from_lines(GitWildMatchPattern, unique_patterns)


def _append_file_content(markdown_lines, item_path, indent, max_file_size_bytes):
    """Append one file's fenced content — or an omission notice — to markdown_lines."""
    prefix = f"{indent}{INDENT_CHAR}"
    try:
        file_size = item_path.stat().st_size
    except OSError as stat_err:
        markdown_lines.append(f"{prefix}[Content omitted: Error accessing file stats - {stat_err}]")
        return
    if file_size == 0:
        markdown_lines.append(f"{prefix}```")
        markdown_lines.append(f"{prefix}[Empty File]")
        markdown_lines.append(f"{prefix}```")
        return
    if file_size > max_file_size_bytes:
        markdown_lines.append(f"{prefix}[Content omitted: File size ({file_size:,} bytes) exceeds limit ({max_file_size_bytes:,} bytes)]")
        return
    try:
        # Heuristic binary check: a NUL byte in the first KB means binary —
        # avoids reading/garbling large binary blobs as text.
        is_binary = False
        try:
            with open(item_path, 'rb') as bf:
                if b'\x00' in bf.read(1024):
                    is_binary = True
        except Exception:
            pass  # probe failed; fall through and treat as text
        if is_binary:
            markdown_lines.append(f"{prefix}[Content omitted: Likely a binary file (Size: {file_size} bytes)]")
            return
        content = item_path.read_text(encoding='utf-8', errors='replace')  # Replace errors instead of failing
        lang = item_path.suffix.lstrip('.') or "text"  # Simple lang detection
        markdown_lines.append(f"{prefix}```{lang}")
        content_lines = content.splitlines()
        # Preview is truncated; the downloaded file still gets full content.
        for line in content_lines[:MAX_OUTPUT_LINES]:
            markdown_lines.append(f"{prefix}{line}")
        if len(content_lines) > MAX_OUTPUT_LINES:
            markdown_lines.append(f"{prefix}[... content truncated in preview ...]")
        markdown_lines.append(f"{prefix}```")
    except UnicodeDecodeError:
        # Rare with errors='replace', but kept as a safety net.
        markdown_lines.append(f"{prefix}[Content omitted: Error decoding file as UTF-8 (Size: {file_size} bytes)]")
    except Exception as read_err:
        markdown_lines.append(f"{prefix}[Content omitted: Error reading file - {read_err}]")
# --- Gradio Interface ---
def process_repo(
    source_type, repo_url, branch_tag, zip_file_obj,
    include_content, max_size_kb, ignore_patterns,
    progress=gr.Progress(track_tqdm=True)
):
    """Main function called by Gradio button.

    Generator: every ``yield`` emits a ``(status_text, markdown_preview,
    file_component_update)`` triple matching the three outputs wired in
    ``generate_button.click`` — status textbox, preview textbox, download file.

    Args:
        source_type: "URL" or "Upload ZIP" (radio value).
        repo_url: Git URL textbox value.
        branch_tag: Optional branch/tag textbox value.
        zip_file_obj: Uploaded file object, or None.
        include_content: Checkbox — embed file contents in the output.
        max_size_kb: Numeric per-file content size limit (KB).
        ignore_patterns: User-editable gitignore-style pattern text.
        progress: Gradio progress tracker (track_tqdm mirrors tqdm bars).
    """
    status = ""
    output_markdown = ""
    output_file_path = None
    repo_root_path = None
    temp_dir_to_clean = None
    # Ensure max_size_kb is treated as a number (gr.Number may hand back float/None).
    try:
        max_size_kb_int = int(max_size_kb) if max_size_kb is not None else 0
    except ValueError:
        yield "Error: Max File Size must be a number.", "", gr.update(value=None, visible=False)
        return
    try:
        progress(0, desc="Starting...")
        # Initial state update for all outputs (hides any stale download link).
        yield "Preparing...", "", gr.update(value=None, visible=False)
        # 1. Get Repository Path
        yield "Fetching repository...", "", gr.update(value=None, visible=False)
        repo_root_path, temp_dir_to_clean = get_repo_path(
            source_type, repo_url, branch_tag, zip_file_obj, progress=progress
        )
        # Check if path finding was successful before proceeding
        if not repo_root_path:
            # Error should have been raised in get_repo_path, but double-check
            raise ValueError("Failed to obtain repository path.")
        yield f"Repository ready at: {repo_root_path.name}", "", gr.update(value=None, visible=False)
        # 2. Generate Markdown
        yield "Generating Markdown structure...", "", gr.update(value=None, visible=False)
        markdown_content = generate_markdown_structure(
            repo_root_path, include_content, max_size_kb_int, ignore_patterns, progress=progress
        )
        # Limit preview size robustly; the downloadable file keeps full content.
        preview_limit = 3000
        markdown_preview = markdown_content[:preview_limit]
        if len(markdown_content) > preview_limit:
            markdown_preview += "\n\n[... Output truncated in preview ...]"
        # 3. Prepare Output File
        yield "Saving Markdown to file...", markdown_preview, gr.update(value=None, visible=False)
        output_filename = f"{repo_root_path.name}_structure.md"
        # Sanitize filename slightly (replace spaces, etc.) - less critical in temp file context
        output_filename = "".join(c if c.isalnum() or c in ('_', '-', '.') else '_' for c in output_filename)
        # Save the file in a place Gradio can access (it manages temp files).
        # delete=False: Gradio serves the path after this generator finishes.
        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix=".md", encoding='utf-8') as temp_file:
            temp_file.write(markdown_content)
            output_file_path = temp_file.name  # Gradio needs the path to this file
        # Final yield: reveal the download component pointing at the temp file.
        yield f"Done. Output file '{output_filename}' ready for download.", markdown_preview, gr.update(value=output_file_path, visible=True, label=f"Download {output_filename}")
    except ValueError as ve:
        # User-facing validation/clone errors raised by the helpers above.
        print(f"Value Error: {ve}")  # Log error
        traceback.print_exc()
        yield f"Error: {ve}", "", gr.update(value=None, visible=False)
    except subprocess.CalledProcessError as cpe:
        error_detail = cpe.stderr or cpe.stdout or "Unknown git error"
        print(f"Git Error: {error_detail}")  # Log error
        traceback.print_exc()
        yield f"Git command failed: {error_detail}", "", gr.update(value=None, visible=False)
    except Exception as e:
        # Boundary handler: surface anything unexpected to the UI rather than crash.
        print(f"Unexpected Error: {e}")  # Log error
        traceback.print_exc()  # Print full traceback to logs
        yield f"An unexpected error occurred: {e}", "", gr.update(value=None, visible=False)
    finally:
        # 4. Cleanup — runs on success, error, and generator close alike.
        if temp_dir_to_clean:
            print(f"Cleaning up temporary directory: {temp_dir_to_clean}")
            shutil.rmtree(temp_dir_to_clean, ignore_errors=True)
            print("Cleanup complete.")
# --- Build Gradio UI --- (Keep as before)
# Layout: inputs (left column) and status/preview/download (right column).
with gr.Blocks(theme=gr.themes.Default(primary_hue="blue", secondary_hue="cyan")) as demo:
    gr.Markdown("# GitHub Repository to Markdown Converter")
    gr.Markdown("Convert a GitHub repository's structure (and optionally content) into a single Markdown file.")
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("## Input Source")
            input_source = gr.Radio(
                ["URL", "Upload ZIP"], label="Select Source Type", value="URL"
            )
            url_input_group = gr.Group(visible=True)  # Show URL by default
            with url_input_group:
                repo_url_input = gr.Textbox(label="Git Repository URL", placeholder="https://github.com/user/repo.git")
                branch_tag_input = gr.Textbox(label="Branch / Tag (Optional)", placeholder="main")
            zip_input_group = gr.Group(visible=False)  # Hide ZIP by default
            with zip_input_group:
                zip_file_input = gr.File(label="Upload Repository ZIP", file_types=[".zip"])
            # --- Configuration Options ---
            gr.Markdown("## Configuration")
            include_content_checkbox = gr.Checkbox(label="Include File Content in Output", value=False)
            max_size_input = gr.Number(label="Max File Size for Content (KB)", value=100, minimum=0, step=10,
                info="Files larger than this won't have content included. Set to 0 to disable content inclusion entirely, even if checked above.")
            ignore_patterns_input = gr.Textbox(
                label="Ignore Patterns (comma-separated or newline, gitignore style)",
                info="Uses .gitignore syntax. Add / for directories. Default patterns provided.",
                lines=5,
                value=DEFAULT_IGNORE_PATTERNS.strip()
            )
            generate_button = gr.Button("Generate Markdown", variant="primary")
        with gr.Column(scale=2):
            gr.Markdown("## Status & Output")
            status_output = gr.Textbox(label="Current Status", interactive=False, lines=2)
            # Use a Textbox for preview initially, as Markdown rendering can be slow/heavy
            markdown_preview_output = gr.Textbox(label="Markdown Preview (Truncated)", interactive=False, lines=20)
            # Use gr.File for the final download link
            download_output = gr.File(label="Download Markdown File", visible=False, interactive=False)  # Set interactive=False

    # --- Event Handlers ---
    def toggle_input_visibility(choice):
        # Show exactly one of the two input groups depending on the radio choice.
        if choice == "URL":
            return gr.update(visible=True), gr.update(visible=False)
        else:  # ZIP
            return gr.update(visible=False), gr.update(visible=True)
    input_source.change(
        fn=toggle_input_visibility,
        inputs=input_source,
        outputs=[url_input_group, zip_input_group],
    )
    # Outputs must stay aligned with the 3-tuples yielded by process_repo:
    # (status text, markdown preview, download-file update).
    generate_button.click(
        fn=process_repo,
        inputs=[
            input_source, repo_url_input, branch_tag_input, zip_file_input,
            include_content_checkbox, max_size_input, ignore_patterns_input
        ],
        outputs=[status_output, markdown_preview_output, download_output],
        # api_name="generate_markdown"  # Optional: for API access
    )
# --- Launch the App --- (Keep as before)
if __name__ == "__main__":
    # queue() is required for generator (streaming) outputs, e.g. on HF Spaces;
    # debug=True helps local testing and may be disabled for a production space.
    queued_app = demo.queue()
    queued_app.launch(debug=True)