import gradio as gr
import os
import subprocess
import tempfile
import zipfile
import pathlib
import shutil
from pathspec import PathSpec
from pathspec.patterns import GitWildMatchPattern
import traceback  # Import traceback for better error logging
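
# Note: gradio and pathspec are third-party packages and, on a Hugging Face Space, are
# assumed to be listed in requirements.txt; the clone path below also assumes the `git`
# CLI is available on the host image.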
# --- Configuration ---
DEFAULT_IGNORE_PATTERNS = """
# Default Ignore Patterns (Gitignore Syntax)
/.git/
/.hg/
/.svn/
/.vscode/
/.idea/
/node_modules/
/vendor/
/build/
/dist/
/target/
*.pyc
*.log
*.swp
*~
__pycache__/
.DS_Store
"""

MAX_OUTPUT_LINES = 10000  # Limit how many lines of a file are shown in the preview output
INDENT_CHAR = "    "  # 4 spaces for indentation
FOLDER_ICON = "📁"
FILE_ICON = "📄"
# --- Core Logic ---
def get_repo_path(source_type, repo_url, branch_tag, zip_file_obj, progress=gr.Progress()):
    """Clones or extracts the repository, returning the local path."""
    temp_dir = tempfile.mkdtemp()
    repo_path = None
    try:
        if source_type == "URL":
            if not repo_url:
                raise ValueError("GitHub Repository URL is required.")
            progress(0.1, desc="Cloning repository...")
            git_command = ["git", "clone", "--depth", "1"]  # Shallow clone for speed
            if branch_tag:
                git_command.extend(["--branch", branch_tag])
            git_command.extend([repo_url, temp_dir])
            print(f"Running command: {' '.join(git_command)}")  # For debugging
            result = subprocess.run(git_command, capture_output=True, text=True, check=False)
            if result.returncode != 0:
                # If the specific branch/tag failed, retry on the default branch
                if branch_tag:
                    progress(0.2, desc=f"Branch '{branch_tag}' not found or clone failed, trying default branch...")
                    git_command = ["git", "clone", "--depth", "1", repo_url, temp_dir]
                    print(f"Running command: {' '.join(git_command)}")  # For debugging
                    result = subprocess.run(git_command, capture_output=True, text=True, check=False)
                if result.returncode != 0:
                    error_message = f"Git clone failed:\nSTDOUT:\n{result.stdout}\nSTDERR:\n{result.stderr}"
                    print(error_message)  # Log the detailed error
                    # Try to surface a user-friendly message
                    if "Authentication failed" in result.stderr:
                        raise ValueError("Authentication failed. Private repositories require different handling (e.g., tokens), which is not supported here.")
                    elif "not found" in result.stderr:
                        raise ValueError(f"Repository or branch '{branch_tag or 'default'}' not found at URL: {repo_url}")
                    else:
                        raise ValueError(f"Git clone failed. Check the URL and branch/tag. Error: {result.stderr.splitlines()[-1] if result.stderr else 'Unknown error'}")
            repo_path = pathlib.Path(temp_dir)
            progress(0.5, desc="Repository cloned.")
            print(f"Cloned repo to: {repo_path}")  # Debugging

        elif source_type == "Upload ZIP":
            if zip_file_obj is None:
                raise ValueError("ZIP file upload is required.")
            progress(0.1, desc="Extracting ZIP file...")
            zip_path = zip_file_obj.name  # Gradio provides a temp file path
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                # Check for the common GitHub ZIP layout (a single top-level directory)
                top_level_dirs = list(set(p.split('/')[0] for p in zip_ref.namelist() if '/' in p and p.split('/')[0]))
                extract_target = temp_dir
                potential_repo_root = temp_dir
                if len(top_level_dirs) == 1:
                    # The ZIP contains a repo-main/... structure: extract *into* temp_dir;
                    # the actual repo content then lives inside temp_dir/repo-main/
                    zip_ref.extractall(extract_target)
                    potential_repo_root = os.path.join(temp_dir, top_level_dirs[0])
                    print(f"ZIP has single top-level dir: {top_level_dirs[0]}. Potential root: {potential_repo_root}")
                else:
                    # Otherwise, extract directly into temp_dir
                    zip_ref.extractall(extract_target)
                    print(f"ZIP structure seems flat or multi-root. Using extract target as root: {extract_target}")
            # Basic check that potential_repo_root is actually a directory
            if os.path.isdir(potential_repo_root):
                repo_path = pathlib.Path(potential_repo_root)
            else:
                # Fallback if the single-dir logic failed or wasn't applicable
                repo_path = pathlib.Path(extract_target)
            progress(0.5, desc="ZIP extracted.")
            print(f"Extracted ZIP to: {repo_path}")  # Debugging

        else:
            raise ValueError("Invalid source type selected.")

        if not repo_path or not repo_path.is_dir():
            # More specific debugging info
            print(f"Debug Info: Temp dir content: {os.listdir(temp_dir)}")
            if 'potential_repo_root' in locals() and potential_repo_root != temp_dir:
                print(f"Debug Info: Potential repo root '{potential_repo_root}' exists: {os.path.exists(potential_repo_root)}, is dir: {os.path.isdir(potential_repo_root)}")
            raise ValueError(f"Could not determine repository root directory within: {temp_dir}")

        return repo_path, temp_dir  # Return the repo content path and the parent temp dir (for cleanup)

    except Exception as e:
        # Clean up the temporary directory on error before re-raising
        shutil.rmtree(temp_dir, ignore_errors=True)
        print(f"Error in get_repo_path: {e}")  # Log error
        traceback.print_exc()  # Full traceback for debugging get_repo_path issues
        raise  # Re-raise so the caller can report the error
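
# Illustrative (hypothetical paths, not executed): for a URL clone the two returned values
# point at the same directory, e.g. (PosixPath('/tmp/tmpabc123'), '/tmp/tmpabc123'),
# while a GitHub-style ZIP typically yields (PosixPath('<temp_dir>/repo-main'), '<temp_dir>').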
def generate_markdown_structure(
    repo_root_path: pathlib.Path,
    include_content: bool,
    max_size_kb: int,
    ignore_patterns_str: str,
    progress=gr.Progress()
):
    """Generates the Markdown string from the repository structure."""
    repo_root_path = pathlib.Path(repo_root_path)  # Ensure it's a Path object
    markdown_lines = []
    max_file_size_bytes = max_size_kb * 1024 if max_size_kb > 0 else 0

    # --- Prepare ignore patterns ---
    # Combine default and user patterns
    full_ignore_patterns = DEFAULT_IGNORE_PATTERNS.strip() + "\n" + ignore_patterns_str.strip()
    # Filter out empty lines and comments
    patterns = [line for line in full_ignore_patterns.splitlines() if line.strip() and not line.strip().startswith('#')]
    # Create a unique list while preserving order (important if later patterns override earlier ones)
    seen = set()
    unique_patterns = [x for x in patterns if not (x in seen or seen.add(x))]
    spec = PathSpec.from_lines(GitWildMatchPattern, unique_patterns)
    print(f"Using unique ignore patterns: {unique_patterns}")  # Debugging
    # --- Add header ---
    repo_name = repo_root_path.name
    markdown_lines.append(f"# {FOLDER_ICON} {repo_name}")
    markdown_lines.append("")

    # --- Walk through the directory ---
    progress(0.6, desc="Scanning repository structure...")
    files_processed = 0
    # Iterate over items relative to the root so pathspec matching works like a .gitignore
    all_items = sorted(repo_root_path.rglob('*'))
    total_items_estimate = len(all_items)
    items_scanned = 0

    for item_path in all_items:
        items_scanned += 1
        if items_scanned % 50 == 0:  # Update progress periodically
            progress(0.6 + (0.3 * (items_scanned / max(1, total_items_estimate))), desc=f"Scanning: {item_path.name}")

        relative_path = item_path.relative_to(repo_root_path)
        # Pathspec matches against the path string relative to the root where a .gitignore would live.
        # Patterns with a leading '/' (e.g. '/node_modules/') only match at that root.
        path_str_for_match = str(relative_path)

        # Check whether the path should be ignored. Pathspec handles directory patterns
        # (e.g., node_modules/ matches the directory and everything inside it).
        if spec.match_file(path_str_for_match):
            print(f"Ignoring: {relative_path}")  # Debugging
            # rglob already yielded every path, so there is no recursion to prune here;
            # we simply skip this path. (With os.walk we would modify the dirs list instead.)
            continue

        # Calculate depth and indentation
        depth = len(relative_path.parts) - 1  # 0-based depth relative to the root content
        indent = INDENT_CHAR * depth

        # Add the entry to the Markdown
        if item_path.is_dir():
            # Detecting directories that are empty *after* ignores is awkward with rglob,
            # so every non-ignored directory is simply listed.
            markdown_lines.append(f"{indent}{FOLDER_ICON} **{item_path.name}/**")
        elif item_path.is_file():
            markdown_lines.append(f"{indent}{FILE_ICON} {item_path.name}")
            files_processed += 1

            # Include file content if requested and within limits
            if include_content and max_size_kb > 0:
                try:
                    file_size = item_path.stat().st_size
                    if file_size == 0:
                        markdown_lines.append(f"{indent}{INDENT_CHAR}```")
                        markdown_lines.append(f"{indent}{INDENT_CHAR}[Empty File]")
                        markdown_lines.append(f"{indent}{INDENT_CHAR}```")
                    elif file_size <= max_file_size_bytes:
                        try:
                            # Heuristically detect binary files before reading them as text
                            is_binary = False
                            try:
                                # Read a small chunk and look for null bytes
                                with open(item_path, 'rb') as bf:
                                    chunk = bf.read(1024)
                                    if b'\x00' in chunk:
                                        is_binary = True
                            except Exception:
                                # Ignore errors during the binary check and proceed as text
                                pass

                            if is_binary:
                                markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Likely a binary file (Size: {file_size} bytes)]")
                            else:
                                content = item_path.read_text(encoding='utf-8', errors='replace')  # Replace errors instead of failing
                                lang = item_path.suffix.lstrip('.')
                                # Simple language detection from the extension; can be expanded
                                if not lang:
                                    lang = "text"
                                markdown_lines.append(f"{indent}{INDENT_CHAR}```{lang}")
                                # Indent the content lines
                                content_lines = content.splitlines()
                                # Limit the lines shown in the Markdown preview if necessary;
                                # the downloaded file still contains the full content
                                display_lines = content_lines[:MAX_OUTPUT_LINES]
                                for line in display_lines:
                                    markdown_lines.append(f"{indent}{INDENT_CHAR}{line}")
                                if len(content_lines) > MAX_OUTPUT_LINES:
                                    markdown_lines.append(f"{indent}{INDENT_CHAR}[... content truncated in preview ...]")
                                markdown_lines.append(f"{indent}{INDENT_CHAR}```")
                        except UnicodeDecodeError:
                            # Less common now that errors='replace' is used
                            markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Error decoding file as UTF-8 (Size: {file_size} bytes)]")
                        except Exception as read_err:
                            markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Error reading file - {read_err}]")
                    else:
                        markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: File size ({file_size:,} bytes) exceeds limit ({max_file_size_bytes:,} bytes)]")
                except OSError as stat_err:
                    markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Error accessing file stats - {stat_err}]")
            elif include_content and max_size_kb == 0:
                # Content inclusion is checked, but the size limit is 0 KB
                markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Max file size set to 0 KB]")

        # Blank line after each entry (file or directory) for readability
        markdown_lines.append("")

    progress(0.95, desc="Formatting output...")
    final_markdown = "\n".join(markdown_lines)
    print(f"Processed {files_processed} files.")  # Debugging
    return final_markdown
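
# Sketch of the resulting layout (illustrative, using the icons configured above):
#   # 📁 repo
#
#   📄 README.md
#
#   📁 **src/**
#
#       📄 main.py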
# --- Gradio Interface ---
def process_repo(
    source_type, repo_url, branch_tag, zip_file_obj,
    include_content, max_size_kb, ignore_patterns,
    progress=gr.Progress(track_tqdm=True)
):
    """Main generator function called by the Gradio button."""
    repo_root_path = None
    temp_dir_to_clean = None

    # Ensure max_size_kb is treated as a number
    try:
        max_size_kb_int = int(max_size_kb) if max_size_kb is not None else 0
    except ValueError:
        yield "Error: Max File Size must be a number.", "", gr.update(value=None, visible=False)
        return

    try:
        progress(0, desc="Starting...")
        # Initial state update for all outputs
        yield "Preparing...", "", gr.update(value=None, visible=False)

        # 1. Get the repository path
        yield "Fetching repository...", "", gr.update(value=None, visible=False)
        repo_root_path, temp_dir_to_clean = get_repo_path(
            source_type, repo_url, branch_tag, zip_file_obj, progress=progress
        )
        # get_repo_path should already have raised on failure, but guard anyway
        if not repo_root_path:
            raise ValueError("Failed to obtain repository path.")
        yield f"Repository ready at: {repo_root_path.name}", "", gr.update(value=None, visible=False)

        # 2. Generate the Markdown
        yield "Generating Markdown structure...", "", gr.update(value=None, visible=False)
        markdown_content = generate_markdown_structure(
            repo_root_path, include_content, max_size_kb_int, ignore_patterns, progress=progress
        )

        # Limit the preview size
        preview_limit = 3000
        markdown_preview = markdown_content[:preview_limit]
        if len(markdown_content) > preview_limit:
            markdown_preview += "\n\n[... Output truncated in preview ...]"

        # 3. Prepare the output file
        yield "Saving Markdown to file...", markdown_preview, gr.update(value=None, visible=False)
        output_filename = f"{repo_root_path.name}_structure.md"
        # Sanitize the filename slightly (replace spaces, etc.)
        output_filename = "".join(c if c.isalnum() or c in ('_', '-', '.') else '_' for c in output_filename)
        # Save the file in a place Gradio can access (it manages temp files)
        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix=".md", encoding='utf-8') as temp_file:
            temp_file.write(markdown_content)
            output_file_path = temp_file.name  # Gradio needs the path to this file

        # Use gr.update so the File component becomes visible with the new path
        yield f"Done. Output file '{output_filename}' ready for download.", markdown_preview, gr.update(value=output_file_path, visible=True, label=f"Download {output_filename}")

    except ValueError as ve:
        print(f"Value Error: {ve}")  # Log error
        traceback.print_exc()
        yield f"Error: {ve}", "", gr.update(value=None, visible=False)
    except subprocess.CalledProcessError as cpe:
        error_detail = cpe.stderr or cpe.stdout or "Unknown git error"
        print(f"Git Error: {error_detail}")  # Log error
        traceback.print_exc()
        yield f"Git command failed: {error_detail}", "", gr.update(value=None, visible=False)
    except Exception as e:
        print(f"Unexpected Error: {e}")  # Log error
        traceback.print_exc()  # Full traceback to the logs
        yield f"An unexpected error occurred: {e}", "", gr.update(value=None, visible=False)
    finally:
        # 4. Cleanup
        if temp_dir_to_clean:
            print(f"Cleaning up temporary directory: {temp_dir_to_clean}")
            shutil.rmtree(temp_dir_to_clean, ignore_errors=True)
            print("Cleanup complete.")
# --- Build Gradio UI ---
with gr.Blocks(theme=gr.themes.Default(primary_hue="blue", secondary_hue="cyan")) as demo:
    gr.Markdown("# GitHub Repository to Markdown Converter")
    gr.Markdown("Convert a GitHub repository's structure (and optionally content) into a single Markdown file.")

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("## Input Source")
            input_source = gr.Radio(
                ["URL", "Upload ZIP"], label="Select Source Type", value="URL"
            )
            url_input_group = gr.Group(visible=True)  # URL inputs shown by default
            with url_input_group:
                repo_url_input = gr.Textbox(label="Git Repository URL", placeholder="https://github.com/user/repo.git")
                branch_tag_input = gr.Textbox(label="Branch / Tag (Optional)", placeholder="main")
            zip_input_group = gr.Group(visible=False)  # ZIP input hidden by default
            with zip_input_group:
                zip_file_input = gr.File(label="Upload Repository ZIP", file_types=[".zip"])

            # --- Configuration Options ---
            gr.Markdown("## Configuration")
            include_content_checkbox = gr.Checkbox(label="Include File Content in Output", value=False)
            max_size_input = gr.Number(
                label="Max File Size for Content (KB)", value=100, minimum=0, step=10,
                info="Files larger than this won't have content included. Set to 0 to disable content inclusion entirely, even if checked above."
            )
            ignore_patterns_input = gr.Textbox(
                label="Ignore Patterns (one per line, gitignore style)",
                info="Uses .gitignore syntax. Add a trailing / for directories. Default patterns provided.",
                lines=5,
                value=DEFAULT_IGNORE_PATTERNS.strip()
            )
            generate_button = gr.Button("Generate Markdown", variant="primary")

        with gr.Column(scale=2):
            gr.Markdown("## Status & Output")
            status_output = gr.Textbox(label="Current Status", interactive=False, lines=2)
            # Use a Textbox for the preview, as full Markdown rendering can be slow/heavy
            markdown_preview_output = gr.Textbox(label="Markdown Preview (Truncated)", interactive=False, lines=20)
            # Use gr.File for the final download link
            download_output = gr.File(label="Download Markdown File", visible=False, interactive=False)

    # --- Event Handlers ---
    def toggle_input_visibility(choice):
        if choice == "URL":
            return gr.update(visible=True), gr.update(visible=False)
        else:  # Upload ZIP
            return gr.update(visible=False), gr.update(visible=True)

    input_source.change(
        fn=toggle_input_visibility,
        inputs=input_source,
        outputs=[url_input_group, zip_input_group],
    )

    generate_button.click(
        fn=process_repo,
        inputs=[
            input_source, repo_url_input, branch_tag_input, zip_file_input,
            include_content_checkbox, max_size_input, ignore_patterns_input
        ],
        outputs=[status_output, markdown_preview_output, download_output],
        # api_name="generate_markdown"  # Optional: for API access
    )

# --- Launch the App ---
if __name__ == "__main__":
    # Keep the queue enabled for HF Spaces deployment;
    # debug=True is useful for local testing and can be set to False for a production Space
    demo.queue().launch(debug=True)