# Populate the Django database with OpenAlex author records.
from django.utils.timezone import make_aware, is_naive | |
from django.utils.dateparse import parse_datetime | |
from tqdm import tqdm | |
import glob | |
import json | |
from core.models import ( | |
Author, Institution, Affiliation, Domain, Field, Subfield, Topic, AuthorTopic, AuthorYearlyStats, Concept, AuthorConcept | |
) | |
from urllib.parse import urlparse | |
def parse_id_from_url(url):
    """Return the trailing path segment of *url* (e.g. an OpenAlex ID).

    Falsy values and strings that do not start with "http" yield "N/A".
    """
    if url and url.startswith("http"):
        return urlparse(url).path.strip('/').split('/')[-1]
    return "N/A"
def add_author(user_info, updated_date):
    """Create or refresh an Author row from an OpenAlex author record.

    The longest display-name variant is kept as the canonical name.
    ``updated_at`` is written in a second, targeted save so it holds the
    source record's timestamp; the Author instance is returned.
    """
    stats = user_info["summary_stats"]
    name_candidates = user_info['display_name_alternatives'] + [user_info['display_name']]
    orcid_url = user_info.get("orcid")
    author, _ = Author.objects.update_or_create(
        id=parse_id_from_url(user_info["id"]),
        defaults={
            "name": max(name_candidates, key=len),
            "orcid": parse_id_from_url(orcid_url) if orcid_url else None,
            "h_index": stats["h_index"],
            "i10_index": stats["i10_index"],
            "cited_by_count": user_info["cited_by_count"],
            "works_count": user_info["works_count"],
            "mean_2yr_citedness": stats["2yr_mean_citedness"],
        },
    )
    author.updated_at = updated_date
    author.save(update_fields=["updated_at"])
    return author
def add_institution(inst_data):
    """Create or update an Institution from an OpenAlex institution record.

    Returns the Institution instance.
    """
    inst_id = parse_id_from_url(inst_data["id"])
    inst, _ = Institution.objects.update_or_create(
        id=inst_id,
        defaults={
            "name": inst_data["display_name"],
            "ror_id": parse_id_from_url(inst_data["ror"]),
            # BUG FIX: the original wrote `'N/A' or inst_data.get(...)`,
            # which always evaluates to 'N/A' (a non-empty string is truthy).
            # Fall back to 'N/A' only when the field is missing or empty.
            "country_code": inst_data.get("country_code") or 'N/A',
            "institution_type": inst_data.get("institution_type") or 'N/A',
        }
    )
    return inst
def add_affiliations(author, affiliations, last_known_insts):
    """Record an author's per-year institutional affiliations.

    An Affiliation row is marked ``is_last_known`` when its institution
    appears in ``last_known_insts``.
    """
    last_known_ids = set()
    for inst in last_known_insts:
        last_known_ids.add(parse_id_from_url(inst["id"]))
    for entry in affiliations:
        institution = add_institution(entry["institution"])
        # The flag depends only on the institution, not the year.
        is_last_known = institution.id in last_known_ids
        for year in entry["years"]:
            Affiliation.objects.update_or_create(
                author=author,
                institution=institution,
                year=year,
                defaults={"is_last_known": is_last_known},
            )
def add_hierarchy(domain_data):
    """Create or update the top-level Domain record and return it."""
    domain, _ = Domain.objects.update_or_create(
        id=parse_id_from_url(domain_data["id"]),
        defaults={"name": domain_data["display_name"]},
    )
    return domain
def add_field(field_data, domain_data):
    """Create or update a Field linked to its parent Domain; return it."""
    parent_domain = add_hierarchy(domain_data)
    field, _ = Field.objects.update_or_create(
        id=parse_id_from_url(field_data["id"]),
        defaults={
            "name": field_data["display_name"],
            "domain": parent_domain,
        },
    )
    return field
def add_subfield(subfield_data, field_data, domain_data):
    """Create or update a Subfield linked to its parent Field; return it."""
    parent_field = add_field(field_data, domain_data)
    subfield, _ = Subfield.objects.update_or_create(
        id=parse_id_from_url(subfield_data["id"]),
        defaults={
            "name": subfield_data["display_name"],
            "field": parent_field,
        },
    )
    return subfield
def add_topic(author, topic_data, topic_share_map):
    """Upsert a Topic (and its subfield/field/domain hierarchy) and link it
    to *author* with its work count and share value.

    A topic missing from ``topic_share_map`` gets a share value of 0.
    """
    topic_id = parse_id_from_url(topic_data["id"])
    parent_subfield = add_subfield(
        topic_data["subfield"], topic_data["field"], topic_data["domain"]
    )
    topic, _ = Topic.objects.update_or_create(
        id=topic_id,
        defaults={
            "name": topic_data["display_name"],
            "subfield": parent_subfield,
        },
    )
    AuthorTopic.objects.update_or_create(
        author=author,
        topic=topic,
        defaults={
            "count": topic_data["count"],
            "share_value": topic_share_map.get(topic_id, 0),
        },
    )
def add_topic_shares(topic_share_list):
    """Build a mapping of topic ID -> share value from a ``topic_share`` list."""
    shares = {}
    for entry in topic_share_list:
        shares[parse_id_from_url(entry["id"])] = entry["value"]
    return shares
def add_yearly_stats(author, stats):
    """Upsert per-year works and citation counts for *author*."""
    for yearly in stats:
        AuthorYearlyStats.objects.update_or_create(
            author=author,
            year=yearly["year"],
            defaults={
                "works_count": yearly["works_count"],
                "cited_by_count": yearly["cited_by_count"],
            },
        )
def add_concepts(author, concepts):
    """Upsert Concept rows and their per-author links.

    NOTE(review): level/score are stored on both Concept and AuthorConcept,
    mirroring the original behavior — the Concept copy is overwritten by
    whichever author was processed last.
    """
    for concept_data in concepts:
        concept, _ = Concept.objects.update_or_create(
            id=parse_id_from_url(concept_data["id"]),
            defaults={
                "name": concept_data["display_name"],
                "wikidata_url": concept_data.get("wikidata"),
                "level": concept_data["level"],
                "score": concept_data["score"],
            },
        )
        AuthorConcept.objects.update_or_create(
            author=author,
            concept=concept,
            defaults={
                "level": concept_data["level"],
                "score": concept_data["score"],
            },
        )
def populate_user(user_info):
    """Load a single OpenAlex author record into the database.

    The record is skipped when the stored Author is already at least as
    fresh as the record's ``updated_date``; otherwise the author and all
    related rows (affiliations, topics, yearly stats, concepts) are upserted.

    Raises:
        ValueError: if ``updated_date`` is missing or unparseable.
    """
    author_id = parse_id_from_url(user_info["id"])
    updated_date = parse_datetime(user_info["updated_date"])
    # BUG FIX: parse_datetime() returns None for malformed input, which the
    # original then fed to is_naive(), crashing with an opaque
    # AttributeError. Fail loudly with a clear message instead.
    if updated_date is None:
        raise ValueError(
            f"Unparseable updated_date for author {author_id}: "
            f"{user_info['updated_date']!r}"
        )
    if is_naive(updated_date):
        updated_date = make_aware(updated_date)
    author = Author.objects.filter(id=author_id).first()
    if author and (author.updated_at >= updated_date):
        return  # already up to date; skip the expensive upserts
    author = add_author(user_info, updated_date)
    add_affiliations(
        author, user_info["affiliations"], user_info["last_known_institutions"])
    topic_share_map = add_topic_shares(user_info["topic_share"])
    for topic in user_info["topics"]:
        add_topic(author, topic, topic_share_map)
    add_yearly_stats(author, user_info["counts_by_year"])
    add_concepts(author, user_info["x_concepts"])
# Call this function to load data | |
# populate_user(user_info) | |
# Bulk-load every downloaded author page (one JSON file per API page).
jsons = "/Users/sgautam/Documents/BridgeMentor/C41008148_authors"
for page, json_file in tqdm(enumerate(glob.glob(f"{jsons}/*.json"))):
    with open(json_file, "r") as fh:
        page_results = json.load(fh)['results']
    for record in tqdm(page_results, leave=False):
        populate_user(record)
        print(f"{page}-{record['display_name']}")
# python manage.py shell | |
# from populate_user import populate_user | |