# julienserbanescu-rag / download_readmes.py
# (Hugging Face file-page header residue: uploader DaJulster, commit 3bddddb, "updated")
#!/usr/bin/env python3
"""
GitHub README Downloader for Julien Serbanescu RAG System
This script downloads README files from your GitHub repositories and saves them
to the docs/readmes/ directory for indexing.
Usage:
python download_readmes.py --username DaJulster
python download_readmes.py --username DaJulster --token YOUR_GITHUB_TOKEN
"""
import os
import sys
import requests
import argparse
from pathlib import Path
import time
import re
def clean_filename(repo_name):
    """Sanitize a repository name into a safe README filename.

    Any character outside [A-Za-z0-9_.-] becomes an underscore, runs of
    underscores collapse to one, and leading/trailing underscores are
    trimmed before appending the "-README.md" suffix.
    """
    safe = re.sub(r'_+', '_', re.sub(r'[^\w\-_.]', '_', repo_name)).strip('_')
    return f"{safe}-README.md"
def fetch_repos(url, headers):
    """Fetch all repositories from a paginated GitHub API endpoint.

    Handles both plain list responses (e.g. /users/{user}/repos) and
    search responses (a dict wrapping results in an "items" key).

    A 403 is retried with a 60s backoff, but only a bounded number of
    times: GitHub also returns 403 for genuinely forbidden resources,
    and the original unconditional retry could loop forever.

    Args:
        url: Base API URL without pagination parameters.
        headers: Request headers (may include an Authorization token).

    Returns:
        list of repository dicts; empty on error.
    """
    repos = []
    page = 1
    per_page = 30  # smaller page size to reduce rate limiting
    retries_403 = 0
    max_retries_403 = 3  # bound retries: 403 may mean "forbidden", not "rate limited"
    while True:
        params = {"page": page, "per_page": per_page, "sort": "updated"}
        try:
            # Explicit timeout so a stalled connection cannot hang the script.
            response = requests.get(url, headers=headers, params=params, timeout=30)
            # Handle rate limiting (with a retry cap — see docstring).
            if response.status_code == 403:
                if retries_403 >= max_retries_403:
                    print("Repeated 403 responses; giving up on this endpoint.")
                    break
                retries_403 += 1
                print(f"Rate limit hit, waiting 60 seconds...")
                time.sleep(60)
                continue
            response.raise_for_status()
            data = response.json()
            # Search endpoints wrap results in {"items": [...]}; list
            # endpoints return the list directly.
            page_repos = data["items"] if isinstance(data, dict) and "items" in data else data
            if not page_repos:
                break
            repos.extend(page_repos)
            page += 1
            time.sleep(0.5)  # polite delay between pages
        except requests.exceptions.RequestException as e:
            print(f"Error fetching repositories: {e}")
            break
    return repos
def is_user_involved_in_repo(repo, username, headers):
    """Check if user is involved in a repository by commits/issues/PRs.

    Probes three endpoints in order (commits first, as the most common
    form of involvement) and returns True on the first non-empty hit.
    Any request failure is logged and treated as "not involved".
    """
    base = f"https://api.github.com/repos/{repo['full_name']}"
    probe_urls = (
        f"{base}/commits?author={username}&per_page=1",
        f"{base}/issues?creator={username}&per_page=1",
        f"{base}/pulls?author={username}&per_page=1",
    )
    try:
        for probe_url in probe_urls:
            resp = requests.get(probe_url, headers=headers, timeout=5)
            if resp.status_code == 200 and resp.json():
                return True
    except Exception as e:
        print(f" Error checking involvement in {repo['full_name']}: {e}")
    return False
def get_user_repos(username, token=None):
    """Collect all public repositories a user owns or has contributed to.

    Aggregates results from several GitHub endpoints (owned repos, search
    queries, organization repos), tags each repo dict with a
    "contribution_type" of "owner" or "contributor", and deduplicates by
    full_name before returning.

    Args:
        username: GitHub login to look up.
        token: optional personal access token (raises the API rate limit).

    Returns:
        list of unique repository dicts, each with "contribution_type" set.
    """
    headers = {}
    if token:
        headers["Authorization"] = f"token {token}"
    all_repos = []
    # 1. Repositories the user owns directly.
    print("Fetching owned repositories...")
    owned_repos = fetch_repos(f"https://api.github.com/users/{username}/repos", headers)
    for repo in owned_repos:
        repo["contribution_type"] = "owner"
    all_repos.extend(owned_repos)
    print(f"Found {len(owned_repos)} owned repositories")
    # 2. Repos with commits
    # NOTE(review): "committer:"/"author:" are commit-search qualifiers; the
    # repository-search endpoint may ignore them or return nothing — confirm
    # against the GitHub search docs.
    print("Fetching repositories with commits...")
    commit_repos = fetch_repos(f"https://api.github.com/search/repositories?q=committer:{username}+is:public", headers)
    for repo in commit_repos:
        repo["contribution_type"] = "owner" if repo.get("owner", {}).get("login", "").lower() == username.lower() else "contributor"
    all_repos.extend(commit_repos)
    print(f"Found {len(commit_repos)} repositories with commits")
    # 3. Repos with authored activity
    print("Fetching repositories with authored activity...")
    activity_repos = fetch_repos(f"https://api.github.com/search/repositories?q=author:{username}+is:public", headers)
    for repo in activity_repos:
        repo["contribution_type"] = "owner" if repo.get("owner", {}).get("login", "").lower() == username.lower() else "contributor"
    all_repos.extend(activity_repos)
    print(f"Found {len(activity_repos)} repositories with activity")
    # 4. Broad search queries (involves/user/mentions/assignee) to catch
    # anything the targeted queries above missed. Duplicates are fine —
    # they are removed at the end.
    print("Fetching ALL repositories user is involved in...")
    search_queries = [
        f"https://api.github.com/search/repositories?q=involves:{username}+is:public",
        f"https://api.github.com/search/repositories?q=user:{username}+is:public",
        f"https://api.github.com/search/repositories?q=mentions:{username}+is:public",
        f"https://api.github.com/search/repositories?q=assignee:{username}+is:public"
    ]
    for i, query_url in enumerate(search_queries, 1):
        print(f" Search query {i}/{len(search_queries)}")
        search_repos = fetch_repos(query_url, headers)
        for repo in search_repos:
            repo["contribution_type"] = "owner" if repo.get("owner", {}).get("login", "").lower() == username.lower() else "contributor"
        all_repos.extend(search_repos)
        print(f" Found {len(search_repos)} repositories")
    # 5. Every organization the user is a (public) member of: scan each
    # org's repos and keep only those the user actually touched.
    print("Fetching ALL organizations...")
    try:
        orgs_response = requests.get(f"https://api.github.com/users/{username}/orgs", headers=headers)
        if orgs_response.status_code == 200:
            orgs = orgs_response.json()
            print(f"Found {len(orgs)} organizations you're a member of")
            for org in orgs:
                org_name = org["login"]
                print(f" Processing organization: {org_name}")
                try:
                    org_repos = fetch_repos(f"https://api.github.com/orgs/{org_name}/repos", headers)
                    for repo in org_repos:
                        # Check if user is involved in this repo
                        if is_user_involved_in_repo(repo, username, headers):
                            repo["contribution_type"] = "contributor"
                            all_repos.append(repo)
                            print(f" Found contribution: {repo['full_name']}")
                    print(f" Total repos in {org_name}: {len(org_repos)}")
                except Exception as e:
                    print(f" Error processing {org_name}: {e}")
        else:
            print(f"Could not fetch organizations (status: {orgs_response.status_code})")
    except Exception as e:
        print(f"Error fetching organizations: {e}")
    # 6. Hard-coded fallback organizations, presumably ones where the
    # user's membership is not public — TODO confirm the list is current.
    known_orgs = ["CyberScienceLab", "Guelph-Cyber-Security-Society", "Guelph-Artificial-Intelligence-Club", "ugrt"]
    print("Checking additional known organizations...")
    for org_name in known_orgs:
        try:
            print(f" Checking: {org_name}")
            org_repos = fetch_repos(f"https://api.github.com/orgs/{org_name}/repos", headers)
            for repo in org_repos:
                if is_user_involved_in_repo(repo, username, headers):
                    # Check if already added
                    if not any(r["full_name"] == repo["full_name"] for r in all_repos):
                        repo["contribution_type"] = "contributor"
                        all_repos.append(repo)
                        print(f" Found additional contribution: {repo['full_name']}")
        except Exception as e:
            print(f" Error checking {org_name}: {e}")
    # Deduplicate by full_name, keeping the first occurrence — owned repos
    # were added first, so an "owner" tag wins over a later "contributor".
    seen = set()
    unique_repos = []
    for repo in all_repos:
        if repo["full_name"] not in seen:
            seen.add(repo["full_name"])
            unique_repos.append(repo)
    print(f"Total unique repositories: {len(unique_repos)}")
    return unique_repos
def download_readme(repo, token=None):
    """Fetch a repo's README text via the GitHub API (branch-agnostic).

    The /readme endpoint resolves the default branch server-side, so no
    branch name is needed. Returns (content, "README") on success and
    (None, None) on any HTTP error or request failure.
    """
    request_headers = {"Accept": "application/vnd.github.v3.raw"}
    if token:
        request_headers["Authorization"] = f"token {token}"
    api_url = f"https://api.github.com/repos/{repo['full_name']}/readme"
    try:
        resp = requests.get(api_url, headers=request_headers, timeout=10)
    except requests.exceptions.RequestException:
        return None, None
    if resp.status_code != 200:
        return None, None
    return resp.text, "README"
def main():
    """Parse CLI arguments, discover repositories, and download their READMEs.

    Each README is saved to --output-dir as <name>-README.md with a
    metadata header prepended for the RAG index.
    """
    parser = argparse.ArgumentParser(description="Download README files from GitHub repositories")
    parser.add_argument("--username", required=True, help="GitHub username")
    parser.add_argument("--token", help="GitHub personal access token (optional, increases rate limit)")
    parser.add_argument("--output-dir", default="docs/readmes", help="Output directory for README files")
    parser.add_argument("--exclude-forks", action="store_true", help="Exclude forked repositories")
    parser.add_argument("--min-stars", type=int, default=0, help="Minimum number of stars required")
    parser.add_argument("--exclude-tpoze", action="store_true", help="Exclude repositories containing 'tpoze' in name")
    parser.add_argument("--contribution-type", choices=["owner", "contributor", "committer", "all"], default="all", help="Filter by contribution type")
    parser.add_argument("--show-available", action="store_true", help="Show available repositories without downloading")
    args = parser.parse_args()

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"=== Downloading README files for user: {args.username} ===")
    repos = get_user_repos(args.username, args.token)
    if not repos:
        print("No repositories found.")
        return

    # Apply the CLI filters.
    filtered_repos = []
    for repo in repos:
        if args.exclude_forks and repo.get("fork", False):
            continue
        if args.exclude_tpoze and "tpoze" in repo["name"].lower():
            continue
        if repo.get("stargazers_count", 0) < args.min_stars:
            continue
        if args.contribution_type != "all" and repo.get("contribution_type") != args.contribution_type:
            continue
        filtered_repos.append(repo)
    print(f"After filtering: {len(filtered_repos)} repositories")

    if args.show_available:
        print("\n=== Available Repositories ===")
        for i, repo in enumerate(filtered_repos, 1):
            print(f"{i:2d}. {repo['full_name']} ({repo.get('contribution_type','?')}) ⭐{repo.get('stargazers_count',0)}")
        return

    # Download each README, prefixing a metadata header for the index.
    downloaded_count = 0
    skipped_count = 0
    for i, repo in enumerate(filtered_repos, 1):
        print(f"\n[{i}/{len(filtered_repos)}] {repo['full_name']} (⭐{repo.get('stargazers_count',0)})")
        readme_content, _ = download_readme(repo, args.token)
        if readme_content:
            filename = clean_filename(repo["name"])
            filepath = output_dir / filename
            contribution_type = repo.get("contribution_type", "unknown")
            contribution_emoji = {"owner": "πŸ‘‘", "contributor": "🀝", "committer": "πŸ’»"}.get(contribution_type, "❓")
            # FIX: use `or` fallbacks for description/language — the API
            # returns an explicit null for these, which a .get() default
            # would not catch (the header would read "None").
            metadata_header = f"""# {repo['name']}
**Repository:** {repo['full_name']}
**Contribution:** {contribution_emoji} {contribution_type.title()}
**Stars:** {repo.get('stargazers_count', 0)}
**Description:** {repo.get('description') or 'No description'}
**Language:** {repo.get('language') or 'Unknown'}
**Last Updated:** {repo.get('updated_at', 'Unknown')}
---
"""
            with open(filepath, "w", encoding="utf-8") as f:
                f.write(metadata_header)
                f.write(readme_content)
            # FIX: the success message printed a literal "(unknown)"
            # instead of the saved filename.
            print(f" βœ… Downloaded: {filename}")
            downloaded_count += 1
        else:
            print(" ❌ No README found")
            skipped_count += 1
        time.sleep(0.2)  # polite delay between downloads

    print("\n=== Download Complete ===")
    print(f"Downloaded: {downloaded_count}, Skipped: {skipped_count}")
    print(f"Files saved to: {output_dir}")
if __name__ == "__main__":
main()