# julienserbanescu-rag / download_readmes.py
# (Hugging Face file-page header residue: uploader DaJulster, commit 3bddddb, "updated")
#!/usr/bin/env python3
"""
GitHub README Downloader for Julien Serbanescu RAG System
This script downloads README files from your GitHub repositories and saves them
to the docs/readmes/ directory for indexing.
Usage:
python download_readmes.py --username DaJulster
python download_readmes.py --username DaJulster --token YOUR_GITHUB_TOKEN
"""
import os
import sys
import requests
import argparse
from pathlib import Path
import time
import re
def clean_filename(repo_name):
    """Sanitize a repository name into a safe README filename.

    Any character outside [A-Za-z0-9_.-] becomes an underscore, runs of
    underscores collapse to one, and leading/trailing underscores are
    trimmed before appending the "-README.md" suffix.
    """
    safe = re.sub(r'_+', '_', re.sub(r'[^\w\-_.]', '_', repo_name)).strip('_')
    return f"{safe}-README.md"
def fetch_repos(url, headers):
    """Fetch all repositories from a paginated GitHub API endpoint.

    Handles both plain list responses (e.g. /users/{user}/repos) and
    search responses (a dict wrapping results in an "items" key).

    A 403 is retried with a 60s backoff, but only a bounded number of
    times: GitHub also returns 403 for genuinely forbidden resources,
    and the original unconditional retry could loop forever.

    Args:
        url: Base API URL without pagination parameters.
        headers: Request headers (may include an Authorization token).

    Returns:
        list of repository dicts; empty on error.
    """
    repos = []
    page = 1
    per_page = 30  # smaller page size to reduce rate limiting
    retries_403 = 0
    max_retries_403 = 3  # bound retries: 403 may mean "forbidden", not "rate limited"
    while True:
        params = {"page": page, "per_page": per_page, "sort": "updated"}
        try:
            # Explicit timeout so a stalled connection cannot hang the script.
            response = requests.get(url, headers=headers, params=params, timeout=30)
            # Handle rate limiting (with a retry cap — see docstring).
            if response.status_code == 403:
                if retries_403 >= max_retries_403:
                    print("Repeated 403 responses; giving up on this endpoint.")
                    break
                retries_403 += 1
                print(f"Rate limit hit, waiting 60 seconds...")
                time.sleep(60)
                continue
            response.raise_for_status()
            data = response.json()
            # Search endpoints wrap results in {"items": [...]}; list
            # endpoints return the list directly.
            page_repos = data["items"] if isinstance(data, dict) and "items" in data else data
            if not page_repos:
                break
            repos.extend(page_repos)
            page += 1
            time.sleep(0.5)  # polite delay between pages
        except requests.exceptions.RequestException as e:
            print(f"Error fetching repositories: {e}")
            break
    return repos
def is_user_involved_in_repo(repo, username, headers):
    """Check if user is involved in a repository by commits/issues/PRs.

    Probes three endpoints in order (commits first, as the most common
    form of involvement) and returns True on the first non-empty hit.
    Any request failure is logged and treated as "not involved".
    """
    base = f"https://api.github.com/repos/{repo['full_name']}"
    probe_urls = (
        f"{base}/commits?author={username}&per_page=1",
        f"{base}/issues?creator={username}&per_page=1",
        f"{base}/pulls?author={username}&per_page=1",
    )
    try:
        for probe_url in probe_urls:
            resp = requests.get(probe_url, headers=headers, timeout=5)
            if resp.status_code == 200 and resp.json():
                return True
    except Exception as e:
        print(f" Error checking involvement in {repo['full_name']}: {e}")
    return False
def get_user_repos(username, token=None):
    """Collect all public repositories a user owns or has contributed to.

    Aggregates results from several GitHub endpoints (owned repos, search
    queries, organization repos), tags each repo dict with a
    "contribution_type" of "owner" or "contributor", and deduplicates by
    full_name before returning.

    Args:
        username: GitHub login to look up.
        token: optional personal access token (raises the API rate limit).

    Returns:
        list of unique repository dicts, each with "contribution_type" set.
    """
    headers = {}
    if token:
        headers["Authorization"] = f"token {token}"
    all_repos = []
    # 1. Repositories the user owns directly.
    print("Fetching owned repositories...")
    owned_repos = fetch_repos(f"https://api.github.com/users/{username}/repos", headers)
    for repo in owned_repos:
        repo["contribution_type"] = "owner"
    all_repos.extend(owned_repos)
    print(f"Found {len(owned_repos)} owned repositories")
    # 2. Repos with commits
    # NOTE(review): "committer:"/"author:" are commit-search qualifiers; the
    # repository-search endpoint may ignore them or return nothing — confirm
    # against the GitHub search docs.
    print("Fetching repositories with commits...")
    commit_repos = fetch_repos(f"https://api.github.com/search/repositories?q=committer:{username}+is:public", headers)
    for repo in commit_repos:
        repo["contribution_type"] = "owner" if repo.get("owner", {}).get("login", "").lower() == username.lower() else "contributor"
    all_repos.extend(commit_repos)
    print(f"Found {len(commit_repos)} repositories with commits")
    # 3. Repos with authored activity
    print("Fetching repositories with authored activity...")
    activity_repos = fetch_repos(f"https://api.github.com/search/repositories?q=author:{username}+is:public", headers)
    for repo in activity_repos:
        repo["contribution_type"] = "owner" if repo.get("owner", {}).get("login", "").lower() == username.lower() else "contributor"
    all_repos.extend(activity_repos)
    print(f"Found {len(activity_repos)} repositories with activity")
    # 4. Broad search queries (involves/user/mentions/assignee) to catch
    # anything the targeted queries above missed. Duplicates are fine —
    # they are removed at the end.
    print("Fetching ALL repositories user is involved in...")
    search_queries = [
        f"https://api.github.com/search/repositories?q=involves:{username}+is:public",
        f"https://api.github.com/search/repositories?q=user:{username}+is:public",
        f"https://api.github.com/search/repositories?q=mentions:{username}+is:public",
        f"https://api.github.com/search/repositories?q=assignee:{username}+is:public"
    ]
    for i, query_url in enumerate(search_queries, 1):
        print(f" Search query {i}/{len(search_queries)}")
        search_repos = fetch_repos(query_url, headers)
        for repo in search_repos:
            repo["contribution_type"] = "owner" if repo.get("owner", {}).get("login", "").lower() == username.lower() else "contributor"
        all_repos.extend(search_repos)
        print(f" Found {len(search_repos)} repositories")
    # 5. Every organization the user is a (public) member of: scan each
    # org's repos and keep only those the user actually touched.
    print("Fetching ALL organizations...")
    try:
        orgs_response = requests.get(f"https://api.github.com/users/{username}/orgs", headers=headers)
        if orgs_response.status_code == 200:
            orgs = orgs_response.json()
            print(f"Found {len(orgs)} organizations you're a member of")
            for org in orgs:
                org_name = org["login"]
                print(f" Processing organization: {org_name}")
                try:
                    org_repos = fetch_repos(f"https://api.github.com/orgs/{org_name}/repos", headers)
                    for repo in org_repos:
                        # Check if user is involved in this repo
                        if is_user_involved_in_repo(repo, username, headers):
                            repo["contribution_type"] = "contributor"
                            all_repos.append(repo)
                            print(f" Found contribution: {repo['full_name']}")
                    print(f" Total repos in {org_name}: {len(org_repos)}")
                except Exception as e:
                    print(f" Error processing {org_name}: {e}")
        else:
            print(f"Could not fetch organizations (status: {orgs_response.status_code})")
    except Exception as e:
        print(f"Error fetching organizations: {e}")
    # 6. Hard-coded fallback organizations, presumably ones where the
    # user's membership is not public — TODO confirm the list is current.
    known_orgs = ["CyberScienceLab", "Guelph-Cyber-Security-Society", "Guelph-Artificial-Intelligence-Club", "ugrt"]
    print("Checking additional known organizations...")
    for org_name in known_orgs:
        try:
            print(f" Checking: {org_name}")
            org_repos = fetch_repos(f"https://api.github.com/orgs/{org_name}/repos", headers)
            for repo in org_repos:
                if is_user_involved_in_repo(repo, username, headers):
                    # Check if already added
                    if not any(r["full_name"] == repo["full_name"] for r in all_repos):
                        repo["contribution_type"] = "contributor"
                        all_repos.append(repo)
                        print(f" Found additional contribution: {repo['full_name']}")
        except Exception as e:
            print(f" Error checking {org_name}: {e}")
    # Deduplicate by full_name, keeping the first occurrence — owned repos
    # were added first, so an "owner" tag wins over a later "contributor".
    seen = set()
    unique_repos = []
    for repo in all_repos:
        if repo["full_name"] not in seen:
            seen.add(repo["full_name"])
            unique_repos.append(repo)
    print(f"Total unique repositories: {len(unique_repos)}")
    return unique_repos
def download_readme(repo, token=None):
    """Fetch a repo's README text via the GitHub API (branch-agnostic).

    The /readme endpoint resolves the default branch server-side, so no
    branch name is needed. Returns (content, "README") on success and
    (None, None) on any HTTP error or request failure.
    """
    request_headers = {"Accept": "application/vnd.github.v3.raw"}
    if token:
        request_headers["Authorization"] = f"token {token}"
    api_url = f"https://api.github.com/repos/{repo['full_name']}/readme"
    try:
        resp = requests.get(api_url, headers=request_headers, timeout=10)
    except requests.exceptions.RequestException:
        return None, None
    if resp.status_code != 200:
        return None, None
    return resp.text, "README"
def main():
    """Parse CLI arguments, discover repositories, and download their READMEs.

    Each README is saved to --output-dir as <name>-README.md with a
    metadata header prepended for the RAG index.
    """
    parser = argparse.ArgumentParser(description="Download README files from GitHub repositories")
    parser.add_argument("--username", required=True, help="GitHub username")
    parser.add_argument("--token", help="GitHub personal access token (optional, increases rate limit)")
    parser.add_argument("--output-dir", default="docs/readmes", help="Output directory for README files")
    parser.add_argument("--exclude-forks", action="store_true", help="Exclude forked repositories")
    parser.add_argument("--min-stars", type=int, default=0, help="Minimum number of stars required")
    parser.add_argument("--exclude-tpoze", action="store_true", help="Exclude repositories containing 'tpoze' in name")
    parser.add_argument("--contribution-type", choices=["owner", "contributor", "committer", "all"], default="all", help="Filter by contribution type")
    parser.add_argument("--show-available", action="store_true", help="Show available repositories without downloading")
    args = parser.parse_args()

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"=== Downloading README files for user: {args.username} ===")
    repos = get_user_repos(args.username, args.token)
    if not repos:
        print("No repositories found.")
        return

    # Apply the CLI filters.
    filtered_repos = []
    for repo in repos:
        if args.exclude_forks and repo.get("fork", False):
            continue
        if args.exclude_tpoze and "tpoze" in repo["name"].lower():
            continue
        if repo.get("stargazers_count", 0) < args.min_stars:
            continue
        if args.contribution_type != "all" and repo.get("contribution_type") != args.contribution_type:
            continue
        filtered_repos.append(repo)
    print(f"After filtering: {len(filtered_repos)} repositories")

    if args.show_available:
        print("\n=== Available Repositories ===")
        for i, repo in enumerate(filtered_repos, 1):
            print(f"{i:2d}. {repo['full_name']} ({repo.get('contribution_type','?')}) ⭐{repo.get('stargazers_count',0)}")
        return

    # Download each README, prefixing a metadata header for the index.
    downloaded_count = 0
    skipped_count = 0
    for i, repo in enumerate(filtered_repos, 1):
        print(f"\n[{i}/{len(filtered_repos)}] {repo['full_name']} (⭐{repo.get('stargazers_count',0)})")
        readme_content, _ = download_readme(repo, args.token)
        if readme_content:
            filename = clean_filename(repo["name"])
            filepath = output_dir / filename
            contribution_type = repo.get("contribution_type", "unknown")
            contribution_emoji = {"owner": "πŸ‘‘", "contributor": "🀝", "committer": "πŸ’»"}.get(contribution_type, "❓")
            # FIX: use `or` fallbacks for description/language — the API
            # returns an explicit null for these, which a .get() default
            # would not catch (the header would read "None").
            metadata_header = f"""# {repo['name']}
**Repository:** {repo['full_name']}
**Contribution:** {contribution_emoji} {contribution_type.title()}
**Stars:** {repo.get('stargazers_count', 0)}
**Description:** {repo.get('description') or 'No description'}
**Language:** {repo.get('language') or 'Unknown'}
**Last Updated:** {repo.get('updated_at', 'Unknown')}
---
"""
            with open(filepath, "w", encoding="utf-8") as f:
                f.write(metadata_header)
                f.write(readme_content)
            # FIX: the success message printed a literal "(unknown)"
            # instead of the saved filename.
            print(f" βœ… Downloaded: {filename}")
            downloaded_count += 1
        else:
            print(" ❌ No README found")
            skipped_count += 1
        time.sleep(0.2)  # polite delay between downloads

    print("\n=== Download Complete ===")
    print(f"Downloaded: {downloaded_count}, Skipped: {skipped_count}")
    print(f"Files saved to: {output_dir}")
if __name__ == "__main__":
main()