#!/usr/bin/env python3
"""
GitHub README Downloader for Julien Serbanescu RAG System

This script downloads README files from your GitHub repositories and saves them
to the docs/readmes/ directory for indexing.

Usage:
    python download_readmes.py --username DaJulster
    python download_readmes.py --username DaJulster --token YOUR_GITHUB_TOKEN
"""
import argparse
import os
import re
import sys
import time
from pathlib import Path

import requests
def clean_filename(repo_name):
    """Build a safe README filename from a repository name.

    Any character outside [A-Za-z0-9_.-] becomes an underscore, runs of
    underscores collapse to a single one, and leading/trailing underscores
    are trimmed before the "-README.md" suffix is appended.
    """
    safe = re.sub(r'[^\w\-_.]', '_', repo_name)
    safe = re.sub(r'_+', '_', safe).strip('_')
    return f"{safe}-README.md"
def fetch_repos(url, headers):
    """Fetch every page of repositories from a GitHub list or search endpoint.

    Handles both plain list endpoints (/users/:u/repos, /orgs/:o/repos), which
    return a JSON array, and the search API, which wraps results in
    {"items": [...]}.

    Args:
        url: GitHub API endpoint to page through.
        headers: HTTP headers dict (may carry an Authorization token).

    Returns:
        List of repository dicts; empty (or partial) on error.
    """
    repos = []
    page = 1
    per_page = 30  # Smaller page size to reduce rate limiting
    while True:
        params = {"page": page, "per_page": per_page, "sort": "updated"}
        try:
            # Timeout prevents a single hung connection from stalling the run.
            response = requests.get(url, headers=headers, params=params, timeout=15)
            if response.status_code == 403:
                # Only retry when the 403 is actually an exhausted rate limit;
                # other 403s (bad token, abuse detection) would loop forever.
                if response.headers.get("X-RateLimit-Remaining") == "0":
                    print("Rate limit hit, waiting 60 seconds...")
                    time.sleep(60)
                    continue
                print(f"Access forbidden for {url}, skipping")
                break
            response.raise_for_status()
            data = response.json()
            # Search API wraps results in {"items": [...]}; list endpoints
            # return a bare array.
            page_repos = data["items"] if isinstance(data, dict) and "items" in data else data
            if not page_repos:
                break
            repos.extend(page_repos)
            page += 1
            time.sleep(0.5)  # Be polite between pages to avoid rate limits
        except requests.exceptions.RequestException as e:
            print(f"Error fetching repositories: {e}")
            break
    return repos
def is_user_involved_in_repo(repo, username, headers):
    """Return True if *username* has commits, issues, or PRs in *repo*.

    Probes three GitHub endpoints (commits, issues, pulls) with per_page=1;
    the first non-empty 200 response counts as involvement. Any error is
    reported and treated as "not involved".
    """
    base = f"https://api.github.com/repos/{repo['full_name']}"
    probes = (
        f"{base}/commits?author={username}&per_page=1",
        f"{base}/issues?creator={username}&per_page=1",
        f"{base}/pulls?author={username}&per_page=1",
    )
    try:
        for probe_url in probes:
            resp = requests.get(probe_url, headers=headers, timeout=5)
            if resp.status_code == 200 and resp.json():
                return True
    except Exception as e:
        print(f" Error checking involvement in {repo['full_name']}: {e}")
    return False
def get_user_repos(username, token=None):
    """Collect all public repositories *username* owns or contributes to.

    Combines several discovery strategies — owned repos, search queries,
    organization membership, and a hard-coded organization fallback — tags
    each repo with a "contribution_type" of "owner" or "contributor", and
    returns a de-duplicated list.

    Args:
        username: GitHub login to search for.
        token: Optional personal access token (raises API rate limits).

    Returns:
        List of unique repository dicts (unique by "full_name").
    """
    headers = {}
    if token:
        headers["Authorization"] = f"token {token}"
    all_repos = []

    def tag_ownership(repos):
        # Mark each repo "owner" when its owner login matches username
        # (case-insensitively), otherwise "contributor".
        for repo in repos:
            owner_login = repo.get("owner", {}).get("login", "")
            repo["contribution_type"] = (
                "owner" if owner_login.lower() == username.lower() else "contributor"
            )

    # 1. Repos the user owns directly.
    print("Fetching owned repositories...")
    owned_repos = fetch_repos(f"https://api.github.com/users/{username}/repos", headers)
    for repo in owned_repos:
        repo["contribution_type"] = "owner"
    all_repos.extend(owned_repos)
    print(f"Found {len(owned_repos)} owned repositories")

    # 2. Repos found via commit-based search.
    # NOTE(review): "committer:" is a commit-search qualifier; the repository
    # search API may ignore or reject it — verify this query returns results.
    print("Fetching repositories with commits...")
    commit_repos = fetch_repos(
        f"https://api.github.com/search/repositories?q=committer:{username}+is:public", headers)
    tag_ownership(commit_repos)
    all_repos.extend(commit_repos)
    print(f"Found {len(commit_repos)} repositories with commits")

    # 3. Repos found via authored-activity search.
    print("Fetching repositories with authored activity...")
    activity_repos = fetch_repos(
        f"https://api.github.com/search/repositories?q=author:{username}+is:public", headers)
    tag_ownership(activity_repos)
    all_repos.extend(activity_repos)
    print(f"Found {len(activity_repos)} repositories with activity")

    # 4. Broad involvement searches.
    # NOTE(review): "mentions:"/"assignee:" are issue-search qualifiers —
    # confirm they behave as intended against the repository search API.
    print("Fetching ALL repositories user is involved in...")
    search_queries = [
        f"https://api.github.com/search/repositories?q=involves:{username}+is:public",
        f"https://api.github.com/search/repositories?q=user:{username}+is:public",
        f"https://api.github.com/search/repositories?q=mentions:{username}+is:public",
        f"https://api.github.com/search/repositories?q=assignee:{username}+is:public",
    ]
    for i, query_url in enumerate(search_queries, 1):
        print(f" Search query {i}/{len(search_queries)}")
        search_repos = fetch_repos(query_url, headers)
        tag_ownership(search_repos)
        all_repos.extend(search_repos)
        print(f" Found {len(search_repos)} repositories")

    # 5. Every organization the user is a member of.
    print("Fetching ALL organizations...")
    try:
        orgs_response = requests.get(
            f"https://api.github.com/users/{username}/orgs", headers=headers, timeout=10)
        if orgs_response.status_code == 200:
            orgs = orgs_response.json()
            print(f"Found {len(orgs)} organizations you're a member of")
            for org in orgs:
                org_name = org["login"]
                print(f" Processing organization: {org_name}")
                try:
                    org_repos = fetch_repos(
                        f"https://api.github.com/orgs/{org_name}/repos", headers)
                    for repo in org_repos:
                        # Only keep org repos the user actually touched.
                        if is_user_involved_in_repo(repo, username, headers):
                            repo["contribution_type"] = "contributor"
                            all_repos.append(repo)
                            print(f" Found contribution: {repo['full_name']}")
                    print(f" Total repos in {org_name}: {len(org_repos)}")
                except Exception as e:
                    print(f" Error processing {org_name}: {e}")
        else:
            print(f"Could not fetch organizations (status: {orgs_response.status_code})")
    except Exception as e:
        print(f"Error fetching organizations: {e}")

    # 6. Fallback: known organizations that may not be visible via the API.
    known_orgs = ["CyberScienceLab", "Guelph-Cyber-Security-Society",
                  "Guelph-Artificial-Intelligence-Club", "ugrt"]
    print("Checking additional known organizations...")
    for org_name in known_orgs:
        try:
            print(f" Checking: {org_name}")
            org_repos = fetch_repos(f"https://api.github.com/orgs/{org_name}/repos", headers)
            for repo in org_repos:
                if is_user_involved_in_repo(repo, username, headers):
                    # Skip repos already captured by an earlier step.
                    if not any(r["full_name"] == repo["full_name"] for r in all_repos):
                        repo["contribution_type"] = "contributor"
                        all_repos.append(repo)
                        print(f" Found additional contribution: {repo['full_name']}")
        except Exception as e:
            print(f" Error checking {org_name}: {e}")

    # Deduplicate by full_name, keeping the first (highest-priority) entry.
    seen = set()
    unique_repos = []
    for repo in all_repos:
        if repo["full_name"] not in seen:
            seen.add(repo["full_name"])
            unique_repos.append(repo)
    print(f"Total unique repositories: {len(unique_repos)}")
    return unique_repos
def download_readme(repo, token=None):
    """Fetch a repository's README text via the GitHub API.

    Uses the /readme endpoint with the raw media type, so it works
    regardless of default branch or README filename/casing.

    Returns:
        Tuple of (content, "README") on success, (None, None) otherwise.
    """
    request_headers = {"Accept": "application/vnd.github.v3.raw"}
    if token:
        request_headers["Authorization"] = f"token {token}"
    endpoint = f"https://api.github.com/repos/{repo['full_name']}/readme"
    try:
        resp = requests.get(endpoint, headers=request_headers, timeout=10)
    except requests.exceptions.RequestException:
        return None, None
    if resp.status_code != 200:
        return None, None
    return resp.text, "README"
def main():
    """Parse CLI arguments, discover repositories, and download their READMEs.

    Writes one Markdown file per repository into --output-dir, each prefixed
    with a metadata header (repo, contribution type, stars, description, ...).
    """
    parser = argparse.ArgumentParser(description="Download README files from GitHub repositories")
    parser.add_argument("--username", required=True, help="GitHub username")
    parser.add_argument("--token", help="GitHub personal access token (optional, increases rate limit)")
    parser.add_argument("--output-dir", default="docs/readmes", help="Output directory for README files")
    parser.add_argument("--exclude-forks", action="store_true", help="Exclude forked repositories")
    parser.add_argument("--min-stars", type=int, default=0, help="Minimum number of stars required")
    parser.add_argument("--exclude-tpoze", action="store_true", help="Exclude repositories containing 'tpoze' in name")
    parser.add_argument("--contribution-type", choices=["owner", "contributor", "committer", "all"], default="all", help="Filter by contribution type")
    parser.add_argument("--show-available", action="store_true", help="Show available repositories without downloading")
    args = parser.parse_args()

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"=== Downloading README files for user: {args.username} ===")
    repos = get_user_repos(args.username, args.token)
    if not repos:
        print("No repositories found.")
        return

    # Apply CLI filters.
    filtered_repos = []
    for repo in repos:
        if args.exclude_forks and repo.get("fork", False):
            continue
        if args.exclude_tpoze and "tpoze" in repo["name"].lower():
            continue
        if repo.get("stargazers_count", 0) < args.min_stars:
            continue
        if args.contribution_type != "all" and repo.get("contribution_type") != args.contribution_type:
            continue
        filtered_repos.append(repo)
    print(f"After filtering: {len(filtered_repos)} repositories")

    if args.show_available:
        # List-only mode: show what would be downloaded, then exit.
        print("\n=== Available Repositories ===")
        for i, repo in enumerate(filtered_repos, 1):
            print(f"{i:2d}. {repo['full_name']} ({repo.get('contribution_type','?')}) β{repo.get('stargazers_count',0)}")
        return

    # Download each README, prefixed with a metadata header.
    downloaded_count = 0
    skipped_count = 0
    for i, repo in enumerate(filtered_repos, 1):
        print(f"\n[{i}/{len(filtered_repos)}] {repo['full_name']} (β{repo.get('stargazers_count',0)})")
        readme_content, _ = download_readme(repo, args.token)
        if readme_content:
            filename = clean_filename(repo["name"])
            filepath = output_dir / filename
            contribution_type = repo.get("contribution_type", "unknown")
            # NOTE(review): these marker strings look mojibake-damaged
            # (likely once emoji) — confirm intended characters.
            contribution_emoji = {"owner": "π", "contributor": "π€", "committer": "π»"}.get(contribution_type, "β")
            # Use `or` fallbacks: the GitHub API returns explicit nulls for
            # description/language, so dict.get's default never applies.
            metadata_header = f"""# {repo['name']}
**Repository:** {repo['full_name']}
**Contribution:** {contribution_emoji} {contribution_type.title()}
**Stars:** {repo.get('stargazers_count', 0)}
**Description:** {repo.get('description') or 'No description'}
**Language:** {repo.get('language') or 'Unknown'}
**Last Updated:** {repo.get('updated_at') or 'Unknown'}
---
"""
            with open(filepath, "w", encoding="utf-8") as f:
                f.write(metadata_header)
                f.write(readme_content)
            # Report the actual saved path (was a "(unknown)" placeholder).
            print(f" β Downloaded: {filepath}")
            downloaded_count += 1
        else:
            print(" β No README found")
            skipped_count += 1
        time.sleep(0.2)  # small pause between repos to stay under rate limits

    print("\n=== Download Complete ===")
    print(f"Downloaded: {downloaded_count}, Skipped: {skipped_count}")
    print(f"Files saved to: {output_dir}")


if __name__ == "__main__":
    main()