# BridgeMentor / populate_user.py
# SushantGautam's picture
# Add social authentication and enhance database configuration
# bd98c1d
from django.utils.timezone import make_aware, is_naive
from django.utils.dateparse import parse_datetime
from tqdm import tqdm
import glob
import json
from core.models import (
Author, Institution, Affiliation, Domain, Field, Subfield, Topic, AuthorTopic, AuthorYearlyStats, Concept, AuthorConcept
)
from urllib.parse import urlparse
def parse_id_from_url(url):
    """Return the last path segment of a URL, or ``"N/A"`` when unusable.

    Args:
        url: A string expected to start with ``"http"``; may be ``None``
            or empty.

    Returns:
        The text after the final ``/`` of the URL path (leading and
        trailing slashes are ignored), or the literal ``"N/A"`` when
        ``url`` is falsy or does not start with ``"http"``.
    """
    if url and url.startswith("http"):
        # Trim surrounding slashes first so a trailing "/" does not yield
        # an empty final segment.
        trimmed_path = urlparse(url).path.strip('/')
        return trimmed_path.rsplit('/', 1)[-1]
    return "N/A"
def add_author(user_info, updated_date):
    """Create or update the ``Author`` row for one author record.

    Args:
        user_info: Mapping describing the author. Reads ``id``,
            ``display_name``, ``display_name_alternatives``, ``orcid``,
            ``summary_stats``, ``cited_by_count`` and ``works_count``.
        updated_date: Value stored in ``Author.updated_at`` after the upsert.

    Returns:
        The saved ``Author`` instance.
    """
    author_id = parse_id_from_url(user_info["id"])

    # The stored name is the longest of all known spellings; on a tie the
    # earliest entry (alternatives first, display name last) wins.
    candidate_names = user_info['display_name_alternatives'] + [user_info['display_name']]
    longest_name = max(candidate_names, key=len)

    orcid_url = user_info.get("orcid")
    orcid = parse_id_from_url(orcid_url) if orcid_url else None

    summary = user_info["summary_stats"]
    field_values = {
        "name": longest_name,
        "orcid": orcid,
        "h_index": summary["h_index"],
        "i10_index": summary["i10_index"],
        "cited_by_count": user_info["cited_by_count"],
        "works_count": user_info["works_count"],
        "mean_2yr_citedness": summary["2yr_mean_citedness"],
    }
    author, _created = Author.objects.update_or_create(
        id=author_id,
        defaults=field_values,
    )

    # The timestamp is written in a separate, single-column save.
    author.updated_at = updated_date
    author.save(update_fields=["updated_at"])
    return author
def add_institution(inst_data):
    """Create or update the ``Institution`` row for one institution record.

    Args:
        inst_data: Mapping describing the institution. Reads ``id``,
            ``display_name`` and ``ror`` (required keys) plus the optional
            ``country_code`` and ``institution_type`` / ``type``.

    Returns:
        The saved ``Institution`` instance.
    """
    inst_id = parse_id_from_url(inst_data["id"])

    # Real value first, placeholder last: the previous `'N/A' or value`
    # always short-circuited to 'N/A' and discarded the actual data.
    country_code = inst_data.get("country_code") or "N/A"
    # NOTE(review): OpenAlex institution objects name this key `type`; the
    # original key is still checked first. Confirm against the input data.
    institution_type = (
        inst_data.get("institution_type") or inst_data.get("type") or "N/A"
    )

    inst, _ = Institution.objects.update_or_create(
        id=inst_id,
        defaults={
            "name": inst_data["display_name"],
            # parse_id_from_url yields "N/A" for a missing/None ROR URL.
            "ror_id": parse_id_from_url(inst_data["ror"]),
            "country_code": country_code,
            "institution_type": institution_type,
        },
    )
    return inst
def add_affiliations(author, affiliations, last_known_insts):
    """Upsert one ``Affiliation`` row per (author, institution, year).

    Args:
        author: The ``Author`` instance the affiliations belong to.
        affiliations: Iterable of mappings, each with an ``institution``
            mapping and a ``years`` iterable.
        last_known_insts: Iterable of mappings with an ``id`` URL; rows for
            these institutions are flagged ``is_last_known``.
    """
    last_known_ids = set()
    for entry in last_known_insts:
        last_known_ids.add(parse_id_from_url(entry["id"]))

    for affiliation in affiliations:
        institution = add_institution(affiliation["institution"])
        # Same answer for every year of this institution, so compute once.
        flag_values = {"is_last_known": institution.id in last_known_ids}
        for year in affiliation["years"]:
            Affiliation.objects.update_or_create(
                author=author,
                institution=institution,
                year=year,
                defaults=flag_values,
            )
def add_hierarchy(domain_data):
    """Create or update a ``Domain`` row and return it.

    Despite the name, only the domain level is written here; fields and
    subfields are handled by ``add_field`` and ``add_subfield``.

    Args:
        domain_data: Mapping with an ``id`` URL and a ``display_name``.

    Returns:
        The saved ``Domain`` instance.
    """
    domain_id = parse_id_from_url(domain_data["id"])
    domain_values = {"name": domain_data["display_name"]}
    domain_row, _created = Domain.objects.update_or_create(
        id=domain_id,
        defaults=domain_values,
    )
    return domain_row
def add_field(field_data, domain_data):
    """Create or update a ``Field`` row, upserting its parent domain first.

    Args:
        field_data: Mapping with an ``id`` URL and a ``display_name``.
        domain_data: Mapping for the parent domain, passed to
            ``add_hierarchy``.

    Returns:
        The saved ``Field`` instance.
    """
    parent_domain = add_hierarchy(domain_data)
    field_id = parse_id_from_url(field_data["id"])
    field_values = {
        "name": field_data["display_name"],
        "domain": parent_domain,
    }
    field_row, _created = Field.objects.update_or_create(
        id=field_id,
        defaults=field_values,
    )
    return field_row
def add_subfield(subfield_data, field_data, domain_data):
    """Create or update a ``Subfield`` row, upserting its parents first.

    Args:
        subfield_data: Mapping with an ``id`` URL and a ``display_name``.
        field_data: Mapping for the parent field.
        domain_data: Mapping for the field's parent domain.

    Returns:
        The saved ``Subfield`` instance.
    """
    parent_field = add_field(field_data, domain_data)
    subfield_id = parse_id_from_url(subfield_data["id"])
    subfield_values = {
        "name": subfield_data["display_name"],
        "field": parent_field,
    }
    subfield_row, _created = Subfield.objects.update_or_create(
        id=subfield_id,
        defaults=subfield_values,
    )
    return subfield_row
def add_topic(author, topic_data, topic_share_map):
    """Upsert a ``Topic`` and the ``AuthorTopic`` link for one author.

    Args:
        author: The ``Author`` instance to link the topic to.
        topic_data: Mapping with ``id``, ``display_name``, ``count`` and the
            nested ``subfield``, ``field`` and ``domain`` mappings.
        topic_share_map: Mapping of topic id to share value; topics that are
            absent from it get a share of ``0``.
    """
    topic_id = parse_id_from_url(topic_data["id"])

    # Parents go in first so the topic can reference its subfield.
    parent_subfield = add_subfield(
        topic_data["subfield"],
        topic_data["field"],
        topic_data["domain"],
    )
    topic_values = {
        "name": topic_data["display_name"],
        "subfield": parent_subfield,
    }
    topic_row, _created = Topic.objects.update_or_create(
        id=topic_id,
        defaults=topic_values,
    )

    link_values = {
        "count": topic_data["count"],
        "share_value": topic_share_map.get(topic_id, 0),
    }
    AuthorTopic.objects.update_or_create(
        author=author,
        topic=topic_row,
        defaults=link_values,
    )
def add_topic_shares(topic_share_list):
    """Build a topic-id -> share-value lookup from a list of share entries.

    Args:
        topic_share_list: Iterable of mappings, each with an ``id`` URL and
            a ``value``.

    Returns:
        A dict keyed by the id parsed from each entry's URL. If two entries
        parse to the same id, the later one wins.
    """
    share_by_topic_id = {}
    for entry in topic_share_list:
        topic_id = parse_id_from_url(entry["id"])
        share = entry["value"]
        share_by_topic_id[topic_id] = share
    return share_by_topic_id
def add_yearly_stats(author, stats):
    """Upsert one ``AuthorYearlyStats`` row per yearly entry.

    Args:
        author: The ``Author`` instance the statistics belong to.
        stats: Iterable of mappings with ``year``, ``works_count`` and
            ``cited_by_count``.
    """
    count_keys = ("works_count", "cited_by_count")
    for yearly in stats:
        year = yearly["year"]
        counts = {key: yearly[key] for key in count_keys}
        AuthorYearlyStats.objects.update_or_create(
            author=author,
            year=year,
            defaults=counts,
        )
def add_concepts(author, concepts):
    """Upsert each ``Concept`` and its ``AuthorConcept`` link for an author.

    Args:
        author: The ``Author`` instance to link concepts to.
        concepts: Iterable of mappings with ``id``, ``display_name``,
            ``level``, ``score`` and an optional ``wikidata`` URL.
    """
    for entry in concepts:
        concept_id = parse_id_from_url(entry["id"])
        concept_values = {
            "name": entry["display_name"],
            "wikidata_url": entry.get("wikidata"),
            "level": entry["level"],
            "score": entry["score"],
        }
        concept_row, _created = Concept.objects.update_or_create(
            id=concept_id,
            defaults=concept_values,
        )

        # The same level/score pair is written on both the concept row and
        # the per-author link row.
        link_values = {
            "level": concept_values["level"],
            "score": concept_values["score"],
        }
        AuthorConcept.objects.update_or_create(
            author=author,
            concept=concept_row,
            defaults=link_values,
        )
def populate_user(user_info):
    """Load one author record and all of its related rows into the database.

    The record is skipped when an ``Author`` with the same id already exists
    and its stored ``updated_at`` is at least as recent as the record's
    ``updated_date``.

    Args:
        user_info: Mapping describing the author. Reads ``id``,
            ``updated_date``, ``affiliations``, ``last_known_institutions``,
            ``topic_share``, ``topics``, ``counts_by_year`` and
            ``x_concepts``, plus the keys consumed by ``add_author``.

    Raises:
        ValueError: If ``updated_date`` cannot be parsed as a datetime.
    """
    # Imported here so this function carries its own dependency.
    from django.db import transaction

    author_id = parse_id_from_url(user_info["id"])

    updated_date = parse_datetime(user_info["updated_date"])
    if updated_date is None:
        # parse_datetime returns None for badly formatted input; fail with a
        # clear message instead of an AttributeError inside is_naive().
        raise ValueError(
            f"Unparsable updated_date for author {author_id}: "
            f"{user_info['updated_date']!r}"
        )
    if is_naive(updated_date):
        updated_date = make_aware(updated_date)

    author = Author.objects.filter(id=author_id).first()
    # A stored row without a timestamp is treated as stale rather than
    # compared against (None >= datetime would raise TypeError).
    if (
        author
        and author.updated_at is not None
        and author.updated_at >= updated_date
    ):
        return

    # add_author() stamps updated_at before the related rows are written.
    # Without a transaction, a failure part-way through would leave an
    # author that looks current and is skipped on every later run.
    with transaction.atomic():
        author = add_author(user_info, updated_date)
        add_affiliations(
            author,
            user_info["affiliations"],
            user_info["last_known_institutions"],
        )
        topic_share_map = add_topic_shares(user_info["topic_share"])
        for topic in user_info["topics"]:
            add_topic(author, topic, topic_share_map)
        add_yearly_stats(author, user_info["counts_by_year"])
        add_concepts(author, user_info["x_concepts"])
# Call this function to load data
# populate_user(user_info)
# Directory containing the exported author JSON pages; each file is expected
# to hold a top-level "results" list of author records.
jsons = "/Users/sgautam/Documents/BridgeMentor/C41008148_authors"

# Sort the paths so page numbers are reproducible between runs, and wrap the
# list itself (not the enumerate iterator) in tqdm so the outer bar knows its
# total.
for page, json_file in enumerate(tqdm(sorted(glob.glob(f"{jsons}/*.json")))):
    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(json_file, "r", encoding="utf-8") as file:
        user_infos = json.load(file)['results']
    for user_info in tqdm(user_infos, leave=False):
        populate_user(user_info)
        # tqdm.write prints without corrupting the active progress bars.
        tqdm.write(f"{page}-{user_info['display_name']}")
# python manage.py shell
# from populate_user import populate_user