Spaces:

davidmezzetti
/

analyzestars

Sleeping

App Files Files Community

analyzestars / app.py

davidmezzetti

Update app.py

9798049 verified 7 months ago

raw

history blame contribute delete

5.75 kB

	"""
	This application enables exploration with data from the paper:

	4.5 Million (Suspected) Fake Stars in GitHub: A Growing Spiral of Popularity Contests, Scams, and Malware
	https://arxiv.org/abs/2412.13459

	Requires the following packages:
	pip install pandas streamlit
	"""

	import os

	import pandas as pd
	import streamlit as st


	class Application:
	    """
	    Main application.

	    Loads the suspected fake star data once at construction and renders a Streamlit
	    page that looks up user supplied repositories in it.
	    """

	    def __init__(self):
	        """
	        Creates a new application.
	        """

	        # Load data from GitHub project
	        self.data = self.load()

	    def load(self, version="241001"):
	        """
	        Loads data from the source GitHub project.

	        Args:
	            version: dataset directory name under data/ in the StarScout repository

	        Returns:
	            dataframe with the columns repo, month, clustered, low activity, flagged stars,
	            total stars and flagged %
	        """

	        # Read data
	        base = f"https://github.com/hehao98/StarScout/raw/refs/heads/main/data/{version}"
	        clustered = pd.read_csv(f"{base}/fake_stars_clustered_stars_by_month.csv")
	        activity = pd.read_csv(f"{base}/fake_stars_low_activity_stars_by_month.csv")
	        data = pd.merge(clustered, activity, how="outer", on=["repo", "month"])

	        # Remove duplicate stars column. Both inputs carry n_stars, which the merge suffixes with
	        # _x and _y. The row-wise max keeps the larger value and skips the NaN side for rows that
	        # only exist in one input.
	        data["n_stars"] = pd.to_numeric(data[["n_stars_x", "n_stars_y"]].max(axis=1), downcast="integer")
	        data = data.drop(["n_stars_x", "n_stars_y"], axis=1)

	        # Aggregate fake star counts. The outer merge leaves NaN where a row has no match in the
	        # other input, those count as 0. The combined count is capped at the total star count.
	        data["n_stars_clustered"] = pd.to_numeric(data["n_stars_clustered"].fillna(0), downcast="integer")
	        data["n_stars_low_activity"] = pd.to_numeric(data["n_stars_low_activity"].fillna(0), downcast="integer")
	        data["n_stars_flagged"] = data["n_stars_clustered"] + data["n_stars_low_activity"]
	        data["n_stars_flagged"] = pd.to_numeric(data[["n_stars", "n_stars_flagged"]].min(axis=1), downcast="integer")

	        # Calculate stat columns
	        data["n_flagged_percent"] = 100 * (data["n_stars_flagged"] / data["n_stars"])

	        # Rename and organize columns. Mapping by name (in display order) instead of assigning
	        # data.columns positionally keeps the labels correct regardless of column order.
	        columns = {
	            "repo": "repo",
	            "month": "month",
	            "n_stars_clustered": "clustered",
	            "n_stars_low_activity": "low activity",
	            "n_stars_flagged": "flagged stars",
	            "n_stars": "total stars",
	            "n_flagged_percent": "flagged %",
	        }
	        return data[list(columns)].rename(columns=columns)

	    def run(self):
	        """
	        Main rendering logic.

	        Reads the repo list from a text area, shows the month with the most flagged stars
	        for each repo and then draws a chart of total vs flagged stars by month per repo.
	        """

	        # List of GitHub repos
	        repos = st.text_area("GitHub Repos, one per line")

	        # Format input
	        repos = self.parse(repos)
	        if not repos:
	            return

	        # Repos are matched case-insensitively. Lowercase the column once rather than
	        # once per requested repo.
	        names = self.data["repo"].str.lower()

	        # Get top result per project
	        frames = []
	        for repo in repos:
	            matches = self.data[names == repo.lower()]
	            frames.append(matches.sort_values("flagged stars", ascending=False)[:1])

	        # Aggregate into single data frame and display
	        aggregate = pd.concat(frames, axis=0)
	        aggregate = aggregate.sort_values("flagged %", ascending=False).reset_index(drop=True)

	        st.markdown("Top month flagged by project")
	        st.dataframe(
	            data=aggregate,
	            column_config={
	                "flagged %": st.column_config.NumberColumn(
	                    format="%.2f %%"
	                )
	            },
	            use_container_width=True
	        )

	        # One chart per repo found in the data, in the same order as the table
	        for repo in aggregate["repo"]:
	            st.markdown(f"{repo}")
	            st.line_chart(
	                data=self.data[names == repo.lower()].sort_values("month"),
	                x="month",
	                y=["total stars", "flagged stars"],
	                color=["#F44336", "#2196F3"],
	            )

	    def parse(self, repos):
	        """
	        Parses and cleans the input repos string.

	        Each line is trimmed of surrounding whitespace, the https://github.com/ prefix is removed
	        and leading/trailing slashes are dropped. Lines that are empty after cleaning are skipped.

	        Args:
	            repos: text with one repo name or GitHub url per line

	        Returns:
	            list of repos
	        """

	        outputs = []

	        # splitlines also handles \r\n line endings from pasted text
	        for line in repos.splitlines():
	            repo = line.strip().replace("https://github.com/", "").strip("/")
	            if repo:
	                outputs.append(repo)

	        return outputs


	@st.cache_resource(show_spinner="Initializing application...")
	def create():
	    """
	    Builds the Application instance, cached through st.cache_resource.

	    Returns:
	        Application
	    """

	    application = Application()
	    return application


	def main():
	    """
	    Configures the Streamlit page, renders the introduction text and runs the application.
	    """

	    os.environ["TOKENIZERS_PARALLELISM"] = "false"

	    st.set_page_config(
	        page_title="4.5 Million (Suspected) Fake Stars in GitHub",
	        page_icon="⭐",
	        layout="centered",
	        initial_sidebar_state="auto",
	        menu_items=None,
	    )
	    st.markdown("## 4.5 Million (Suspected) Fake ⭐'s in GitHub")

	    # The markdown-escaped pipe is spelled with a doubled backslash: a single backslash before |
	    # is an invalid escape sequence in a regular string literal. The resulting text is the same.
	    st.markdown(
	        """
	This application explores the data provided by the paper titled:

	_4.5 Million (Suspected) Fake Stars in GitHub: A Growing Spiral of Popularity Contests, Scams, and Malware_

	_[Paper](https://arxiv.org/abs/2412.13459) \\| [GitHub Project](https://github.com/hehao98/StarScout)_

	Note the disclaimer from the paper's authors.

	Disclaimer. _As we discussed in Section 3.4 and 3.5 in our paper, the resulting dataset are only repositories and users with suspected
	fake stars. The individual repositories and users in our dataset may be false positives. The main purpose of our dataset is for statistical
	analyses (which tolerates noises reasonably well), not for publicly shaming individual repositories. If you intend to publish subsequent work
	based on our dataset, please be aware of this limitation and its ethical implications._

	To add to the authors disclaimer.

	_It's also worth noting that projects that trend on popular sites such as the GitHub Trending Page can attract a lot of automated behavior outside
	of a project's control. This dataset is just a data point that shouldn't be used in a vacuum._
	"""
	    )

	    # Create and run application
	    app = create()
	    app.run()


	if __name__ == "__main__":
	    main()