from django.utils.timezone import make_aware, is_naive
from django.utils.dateparse import parse_datetime
from tqdm import tqdm
import glob
import json
from urllib.parse import urlparse

from core.models import (
    Author, Institution, Affiliation, Domain, Field, Subfield, Topic,
    AuthorTopic, AuthorYearlyStats, Concept, AuthorConcept
)


def parse_id_from_url(url):
    # Return the trailing path segment of an OpenAlex-style URL,
    # e.g. "A123" from "https://openalex.org/A123".
    if (not url) or (not url.startswith("http")):
        return "N/A"
    return urlparse(url).path.strip('/').split('/')[-1]


def add_author(user_info, updated_date):
    author_id = parse_id_from_url(user_info["id"])
    author, _ = Author.objects.update_or_create(
        id=author_id,
        defaults={
            # Use the longest of the display name and its alternatives.
            "name": max(
                user_info["display_name_alternatives"] + [user_info["display_name"]],
                key=len
            ),
            "orcid": parse_id_from_url(user_info["orcid"]) if user_info.get("orcid") else None,
            "h_index": user_info["summary_stats"]["h_index"],
            "i10_index": user_info["summary_stats"]["i10_index"],
            "cited_by_count": user_info["cited_by_count"],
            "works_count": user_info["works_count"],
            "mean_2yr_citedness": user_info["summary_stats"]["2yr_mean_citedness"],
        }
    )
    author.updated_at = updated_date
    author.save(update_fields=["updated_at"])
    return author


def add_institution(inst_data):
    inst_id = parse_id_from_url(inst_data["id"])
    inst, _ = Institution.objects.update_or_create(
        id=inst_id,
        defaults={
            "name": inst_data["display_name"],
            "ror_id": parse_id_from_url(inst_data["ror"]),
            # Fall back to "N/A" when the value is missing or empty.
            "country_code": inst_data.get("country_code") or "N/A",
            "institution_type": inst_data.get("institution_type") or "N/A",
        }
    )
    return inst


def add_affiliations(author, affiliations, last_known_insts):
    last_known_ids = {parse_id_from_url(inst["id"]) for inst in last_known_insts}
    for aff in affiliations:
        institution = add_institution(aff["institution"])
        for year in aff["years"]:
            Affiliation.objects.update_or_create(
                author=author,
                institution=institution,
                year=year,
                defaults={"is_last_known": institution.id in last_known_ids}
            )


def add_hierarchy(domain_data):
    domain_id = parse_id_from_url(domain_data["id"])
    domain, _ = Domain.objects.update_or_create(
        id=domain_id, defaults={"name": domain_data["display_name"]})
    return domain


def add_field(field_data, domain_data):
    domain = add_hierarchy(domain_data)
    field_id = parse_id_from_url(field_data["id"])
    field, _ = Field.objects.update_or_create(
        id=field_id,
        defaults={"name": field_data["display_name"], "domain": domain}
    )
    return field


def add_subfield(subfield_data, field_data, domain_data):
    field = add_field(field_data, domain_data)
    subfield_id = parse_id_from_url(subfield_data["id"])
    subfield, _ = Subfield.objects.update_or_create(
        id=subfield_id,
        defaults={"name": subfield_data["display_name"], "field": field}
    )
    return subfield


def add_topic(author, topic_data, topic_share_map):
    topic_id = parse_id_from_url(topic_data["id"])
    subfield = add_subfield(
        topic_data["subfield"], topic_data["field"], topic_data["domain"])
    topic, _ = Topic.objects.update_or_create(
        id=topic_id,
        defaults={"name": topic_data["display_name"], "subfield": subfield}
    )
    share_value = topic_share_map.get(topic_id, 0)
    AuthorTopic.objects.update_or_create(
        author=author,
        topic=topic,
        defaults={"count": topic_data["count"], "share_value": share_value}
    )


def add_topic_shares(topic_share_list):
    # Map topic id -> share value for lookup when creating AuthorTopic rows.
    return {parse_id_from_url(topic["id"]): topic["value"] for topic in topic_share_list}


def add_yearly_stats(author, stats):
    for stat in stats:
        AuthorYearlyStats.objects.update_or_create(
            author=author,
            year=stat["year"],
            defaults={
                "works_count": stat["works_count"],
                "cited_by_count": stat["cited_by_count"],
            }
        )


def add_concepts(author, concepts):
    for concept in concepts:
        concept_id = parse_id_from_url(concept["id"])
        obj, _ = Concept.objects.update_or_create(
            id=concept_id,
            defaults={
                "name": concept["display_name"],
                "wikidata_url": concept.get("wikidata"),
                "level": concept["level"],
                "score": concept["score"],
            }
        )
        AuthorConcept.objects.update_or_create(
            author=author,
            concept=obj,
            defaults={"level": concept["level"], "score": concept["score"]}
        )


def populate_user(user_info):
    author_id = parse_id_from_url(user_info["id"])
    updated_date = parse_datetime(user_info["updated_date"])
    if is_naive(updated_date):
        updated_date = make_aware(updated_date)

    # Skip authors whose stored record is already up to date.
    author = Author.objects.filter(id=author_id).first()
    if author and author.updated_at >= updated_date:
        return

    author = add_author(user_info, updated_date)
    add_affiliations(
        author, user_info["affiliations"], user_info["last_known_institutions"])
    topic_share_map = add_topic_shares(user_info["topic_share"])
    for topic in user_info["topics"]:
        add_topic(author, topic, topic_share_map)
    add_yearly_stats(author, user_info["counts_by_year"])
    add_concepts(author, user_info["x_concepts"])


# Call populate_user(user_info) to load a single author record; the loop below
# does this for every page of downloaded results.
jsons = "/Users/sgautam/Documents/BridgeMentor/C41008148_authors"
for page, json_file in enumerate(tqdm(glob.glob(f"{jsons}/*.json"))):
    with open(json_file, "r") as file:
        user_infos = json.load(file)["results"]
    for user_info in tqdm(user_infos, leave=False):
        populate_user(user_info)
        print(f"{page}-{user_info['display_name']}")

# Usage:
#   python manage.py shell
#   from populate_user import populate_user