Spaces:
Sleeping
Sleeping
Commit
·
03fbd26
1
Parent(s):
b75dbd7
Initial project setup with Django, including core models, admin configuration, GraphQL schema, and data population script.
Browse files- .gitignore +4 -0
- BridgeMentor/__init__.py +0 -0
- BridgeMentor/asgi.py +16 -0
- BridgeMentor/settings.py +130 -0
- BridgeMentor/urls.py +25 -0
- BridgeMentor/wsgi.py +16 -0
- core/__init__.py +0 -0
- core/admin.py +92 -0
- core/apps.py +6 -0
- core/models.py +140 -0
- core/schema.py +194 -0
- core/tests.py +3 -0
- core/views.py +3 -0
- manage.py +22 -0
- populate_user.py +177 -0
- requirements.txt +3 -0
- scrap_openalex.py +38 -0
.gitignore
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
**/*.json
|
2 |
+
venv
|
3 |
+
migrations
|
4 |
+
db.sqlite3*
|
BridgeMentor/__init__.py
ADDED
File without changes
|
BridgeMentor/asgi.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
ASGI config for BridgeMentor project.
|
3 |
+
|
4 |
+
It exposes the ASGI callable as a module-level variable named ``application``.
|
5 |
+
|
6 |
+
For more information on this file, see
|
7 |
+
https://docs.djangoproject.com/en/4.2/howto/deployment/asgi/
|
8 |
+
"""
|
9 |
+
|
10 |
+
import os
|
11 |
+
|
12 |
+
from django.core.asgi import get_asgi_application
|
13 |
+
|
14 |
+
# Select the project's settings module unless the environment already names one.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'BridgeMentor.settings')

# Module-level ASGI callable (see the module docstring).
application = get_asgi_application()
|
BridgeMentor/settings.py
ADDED
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Django settings for BridgeMentor project.
|
3 |
+
|
4 |
+
Generated by 'django-admin startproject' using Django 4.2.20.
|
5 |
+
|
6 |
+
For more information on this file, see
|
7 |
+
https://docs.djangoproject.com/en/4.2/topics/settings/
|
8 |
+
|
9 |
+
For the full list of settings and their values, see
|
10 |
+
https://docs.djangoproject.com/en/4.2/ref/settings/
|
11 |
+
"""
|
12 |
+
|
13 |
+
from pathlib import Path
|
14 |
+
|
15 |
+
# Build paths inside the project like this: BASE_DIR / 'subdir'.
|
16 |
+
BASE_DIR = Path(__file__).resolve().parent.parent
|
17 |
+
|
18 |
+
|
19 |
+
# Quick-start development settings - unsuitable for production
|
20 |
+
# See https://docs.djangoproject.com/en/4.2/howto/deployment/checklist/
|
21 |
+
|
22 |
+
import os

# SECURITY WARNING: keep the secret key used in production secret!
# The key is read from DJANGO_SECRET_KEY; the literal is only a development
# fallback so existing local setups keep working unchanged.
SECRET_KEY = os.environ.get(
    'DJANGO_SECRET_KEY',
    'django-insecure-*bimkkt===!#m(@d!x*he9z4gj#+ewbt9(y0o=8si2ny88w+yd',
)

# SECURITY WARNING: don't run with debug turned on in production!
# Defaults to True (development); set DJANGO_DEBUG=0/false/no/off to disable.
DEBUG = os.environ.get('DJANGO_DEBUG', 'true').strip().lower() in (
    '1', 'true', 'yes', 'on')

# Comma-separated host names, e.g. DJANGO_ALLOWED_HOSTS="example.com,api.example.com".
# Empty (the default) keeps the previous value of [].
ALLOWED_HOSTS = [
    host.strip()
    for host in os.environ.get('DJANGO_ALLOWED_HOSTS', '').split(',')
    if host.strip()
]
|
29 |
+
|
30 |
+
|
31 |
+
# Application definition
|
32 |
+
|
33 |
+
INSTALLED_APPS = [
|
34 |
+
'django.contrib.admin',
|
35 |
+
'django.contrib.auth',
|
36 |
+
'django.contrib.contenttypes',
|
37 |
+
'django.contrib.sessions',
|
38 |
+
'django.contrib.messages',
|
39 |
+
'django.contrib.staticfiles',
|
40 |
+
'django_filters',
|
41 |
+
'graphene_django',
|
42 |
+
'core',
|
43 |
+
]
|
44 |
+
|
45 |
+
MIDDLEWARE = [
|
46 |
+
'django.middleware.security.SecurityMiddleware',
|
47 |
+
'django.contrib.sessions.middleware.SessionMiddleware',
|
48 |
+
'django.middleware.common.CommonMiddleware',
|
49 |
+
'django.middleware.csrf.CsrfViewMiddleware',
|
50 |
+
'django.contrib.auth.middleware.AuthenticationMiddleware',
|
51 |
+
'django.contrib.messages.middleware.MessageMiddleware',
|
52 |
+
'django.middleware.clickjacking.XFrameOptionsMiddleware',
|
53 |
+
]
|
54 |
+
|
55 |
+
ROOT_URLCONF = 'BridgeMentor.urls'
|
56 |
+
|
57 |
+
TEMPLATES = [
|
58 |
+
{
|
59 |
+
'BACKEND': 'django.template.backends.django.DjangoTemplates',
|
60 |
+
'DIRS': [],
|
61 |
+
'APP_DIRS': True,
|
62 |
+
'OPTIONS': {
|
63 |
+
'context_processors': [
|
64 |
+
'django.template.context_processors.debug',
|
65 |
+
'django.template.context_processors.request',
|
66 |
+
'django.contrib.auth.context_processors.auth',
|
67 |
+
'django.contrib.messages.context_processors.messages',
|
68 |
+
],
|
69 |
+
},
|
70 |
+
},
|
71 |
+
]
|
72 |
+
|
73 |
+
WSGI_APPLICATION = 'BridgeMentor.wsgi.application'
|
74 |
+
|
75 |
+
|
76 |
+
# Database
|
77 |
+
# https://docs.djangoproject.com/en/4.2/ref/settings/#databases
|
78 |
+
|
79 |
+
DATABASES = {
|
80 |
+
'default': {
|
81 |
+
'ENGINE': 'django.db.backends.sqlite3',
|
82 |
+
'NAME': BASE_DIR / 'db.sqlite3',
|
83 |
+
}
|
84 |
+
}
|
85 |
+
|
86 |
+
|
87 |
+
# Password validation
|
88 |
+
# https://docs.djangoproject.com/en/4.2/ref/settings/#auth-password-validators
|
89 |
+
|
90 |
+
AUTH_PASSWORD_VALIDATORS = [
|
91 |
+
{
|
92 |
+
'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
|
93 |
+
},
|
94 |
+
{
|
95 |
+
'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
|
96 |
+
},
|
97 |
+
{
|
98 |
+
'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
|
99 |
+
},
|
100 |
+
{
|
101 |
+
'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
|
102 |
+
},
|
103 |
+
]
|
104 |
+
|
105 |
+
|
106 |
+
# Internationalization
|
107 |
+
# https://docs.djangoproject.com/en/4.2/topics/i18n/
|
108 |
+
|
109 |
+
LANGUAGE_CODE = 'en-us'
|
110 |
+
|
111 |
+
TIME_ZONE = 'UTC'
|
112 |
+
|
113 |
+
USE_I18N = True
|
114 |
+
|
115 |
+
USE_TZ = True
|
116 |
+
|
117 |
+
|
118 |
+
# Static files (CSS, JavaScript, Images)
|
119 |
+
# https://docs.djangoproject.com/en/4.2/howto/static-files/
|
120 |
+
|
121 |
+
STATIC_URL = 'static/'
|
122 |
+
|
123 |
+
# Default primary key field type
|
124 |
+
# https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field
|
125 |
+
|
126 |
+
DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
|
127 |
+
|
128 |
+
GRAPHENE = {
|
129 |
+
"SCHEMA": "core.schema.schema" # Adjust to match your project
|
130 |
+
}
|
BridgeMentor/urls.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
URL configuration for BridgeMentor project.
|
3 |
+
|
4 |
+
The `urlpatterns` list routes URLs to views. For more information please see:
|
5 |
+
https://docs.djangoproject.com/en/4.2/topics/http/urls/
|
6 |
+
Examples:
|
7 |
+
Function views
|
8 |
+
1. Add an import: from my_app import views
|
9 |
+
2. Add a URL to urlpatterns: path('', views.home, name='home')
|
10 |
+
Class-based views
|
11 |
+
1. Add an import: from other_app.views import Home
|
12 |
+
2. Add a URL to urlpatterns: path('', Home.as_view(), name='home')
|
13 |
+
Including another URLconf
|
14 |
+
1. Import the include() function: from django.urls import include, path
|
15 |
+
2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
|
16 |
+
"""
|
17 |
+
from graphene_django.views import GraphQLView
|
18 |
+
from django.contrib import admin
|
19 |
+
from django.urls import path
|
20 |
+
|
21 |
+
# Routes: the Django admin site and the GraphQL endpoint.
urlpatterns = [
    path('admin/', admin.site.urls),
    # graphiql=True is passed to the view, enabling its GraphiQL option.
    path("graphql/", GraphQLView.as_view(graphiql=True)),

]
|
BridgeMentor/wsgi.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
WSGI config for BridgeMentor project.
|
3 |
+
|
4 |
+
It exposes the WSGI callable as a module-level variable named ``application``.
|
5 |
+
|
6 |
+
For more information on this file, see
|
7 |
+
https://docs.djangoproject.com/en/4.2/howto/deployment/wsgi/
|
8 |
+
"""
|
9 |
+
|
10 |
+
import os
|
11 |
+
|
12 |
+
from django.core.wsgi import get_wsgi_application
|
13 |
+
|
14 |
+
# Select the project's settings module unless the environment already names one.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'BridgeMentor.settings')

# Module-level WSGI callable (see the module docstring).
application = get_wsgi_application()
|
core/__init__.py
ADDED
File without changes
|
core/admin.py
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from django.contrib import admin
|
2 |
+
from .models import (
|
3 |
+
Institution, Author, Affiliation, Domain, Field, Subfield, Topic,
|
4 |
+
AuthorTopic, Work, AuthorYearlyStats, Concept, AuthorConcept
|
5 |
+
)
|
6 |
+
|
7 |
+
|
8 |
+
admin.site.site_header = "BridgeMentor Admin"
|
9 |
+
admin.site.site_title = "BridgeMentor Data Management"
|
10 |
+
admin.site.index_title = "BridgeMentor Dashboard"
|
11 |
+
|
12 |
+
|
13 |
+
@admin.register(Institution)
class InstitutionAdmin(admin.ModelAdmin):
    """Admin configuration for Institution rows."""

    # Columns shown in the changelist.
    list_display = ('id', 'name', 'ror_id', 'country_code', 'institution_type')
    # Fields matched by the admin search box.
    search_fields = ('name', 'ror_id')
    # Sidebar filters.
    list_filter = ('country_code', 'institution_type')
|
18 |
+
|
19 |
+
|
20 |
+
@admin.register(Author)
class AuthorAdmin(admin.ModelAdmin):
    """Admin configuration for Author rows, listing their citation metrics."""

    list_display = ('id', 'name', 'orcid', 'h_index',
                    'i10_index', 'cited_by_count', 'works_count')
    search_fields = ('name', 'orcid')
    # Filters on the exact h_index value.
    list_filter = ('h_index',)
|
26 |
+
|
27 |
+
|
28 |
+
@admin.register(Affiliation)
class AffiliationAdmin(admin.ModelAdmin):
    """Admin configuration for author/institution/year affiliation rows."""

    list_display = ('author', 'institution', 'year', 'is_last_known')
    list_filter = ('year', 'is_last_known', 'institution')
    # Search follows the foreign keys to the related names.
    search_fields = ('author__name', 'institution__name')
|
33 |
+
|
34 |
+
|
35 |
+
@admin.register(Domain)
class DomainAdmin(admin.ModelAdmin):
    """Admin configuration for Domain rows."""

    list_display = ('id', 'name')
    search_fields = ('name',)
|
39 |
+
|
40 |
+
|
41 |
+
@admin.register(Field)
class FieldAdmin(admin.ModelAdmin):
    """Admin configuration for Field rows, filterable by parent domain."""

    list_display = ('id', 'name', 'domain')
    list_filter = ('domain',)
    search_fields = ('name',)
|
46 |
+
|
47 |
+
|
48 |
+
@admin.register(Subfield)
class SubfieldAdmin(admin.ModelAdmin):
    """Admin configuration for Subfield rows, filterable by parent field."""

    list_display = ('id', 'name', 'field')
    list_filter = ('field',)
    search_fields = ('name',)
|
53 |
+
|
54 |
+
|
55 |
+
@admin.register(Topic)
class TopicAdmin(admin.ModelAdmin):
    """Admin configuration for Topic rows, filterable by parent subfield."""

    list_display = ('id', 'name', 'subfield')
    list_filter = ('subfield',)
    search_fields = ('name',)
|
60 |
+
|
61 |
+
|
62 |
+
@admin.register(AuthorTopic)
class AuthorTopicAdmin(admin.ModelAdmin):
    """Admin configuration for author-to-topic link rows."""

    list_display = ('author', 'topic', 'count', 'share_value')
    list_filter = ('topic',)
    # Search follows the foreign keys to the related names.
    search_fields = ('author__name', 'topic__name')
|
67 |
+
|
68 |
+
|
69 |
+
@admin.register(Work)
class WorkAdmin(admin.ModelAdmin):
    """Admin configuration for Work rows."""

    list_display = ('id', 'title', 'author', 'year', 'cited_by_count')
    list_filter = ('year',)
    search_fields = ('title', 'author__name')
|
74 |
+
|
75 |
+
|
76 |
+
@admin.register(AuthorYearlyStats)
class AuthorYearlyStatsAdmin(admin.ModelAdmin):
    """Admin configuration for per-author, per-year count rows."""

    list_display = ('author', 'year', 'works_count', 'cited_by_count')
    list_filter = ('year',)
    search_fields = ('author__name',)
|
81 |
+
|
82 |
+
|
83 |
+
@admin.register(Concept)
class ConceptAdmin(admin.ModelAdmin):
    """Admin configuration for Concept rows."""

    list_display = ('id', 'name', 'level', 'score')
    search_fields = ('name',)
|
87 |
+
|
88 |
+
|
89 |
+
@admin.register(AuthorConcept)
class AuthorConceptAdmin(admin.ModelAdmin):
    """Admin configuration for author-to-concept link rows."""

    list_display = ('author', 'concept', 'level', 'score')
    # Search follows the foreign keys to the related names.
    search_fields = ('author__name', 'concept__name')
|
core/apps.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from django.apps import AppConfig
|
2 |
+
|
3 |
+
|
4 |
+
class CoreConfig(AppConfig):
    """Application configuration for the ``core`` app."""

    # Primary-key field type for models that do not declare their own.
    default_auto_field = 'django.db.models.BigAutoField'
    name = 'core'
|
core/models.py
ADDED
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from django.db import models
|
2 |
+
|
3 |
+
|
4 |
+
class TimestampedModel(models.Model):
    """Abstract base model that adds creation and modification timestamps."""

    # Set once, when the row is first created.
    created_at = models.DateTimeField(auto_now_add=True)
    # auto_now: refreshed automatically whenever the instance is saved.
    updated_at = models.DateTimeField(auto_now=True)

    class Meta:
        abstract = True
|
10 |
+
|
11 |
+
|
12 |
+
class Institution(TimestampedModel):
    """An institution, keyed by a short string id."""

    id = models.CharField(max_length=20, primary_key=True)  # e.g., I204778367
    name = models.CharField(max_length=255)
    # Required and unique: every row must carry a distinct ROR id.
    ror_id = models.CharField(max_length=100, unique=True)
    # Limited to two characters.
    country_code = models.CharField(max_length=2)
    institution_type = models.CharField(max_length=50)  # funder, company, etc.

    def __str__(self):
        return self.name
|
21 |
+
|
22 |
+
|
23 |
+
class Author(TimestampedModel):
    """An author together with summary citation metrics."""

    id = models.CharField(max_length=20, primary_key=True)  # e.g., A5006834808
    name = models.CharField(max_length=255)
    # Optional, but unique when present.
    orcid = models.CharField(
        max_length=100, unique=True, null=True, blank=True)
    h_index = models.IntegerField()
    i10_index = models.IntegerField()
    cited_by_count = models.IntegerField()
    works_count = models.IntegerField()
    mean_2yr_citedness = models.FloatField()

    def __str__(self):
        return self.name
|
36 |
+
|
37 |
+
|
38 |
+
class Affiliation(TimestampedModel):
    """Links an author to an institution for a single year.

    Each (author, institution, year) combination may be stored only once.
    """

    author = models.ForeignKey(
        Author, on_delete=models.CASCADE, related_name='affiliations')
    institution = models.ForeignKey(Institution, on_delete=models.CASCADE)
    year = models.IntegerField()
    is_last_known = models.BooleanField(default=False)

    class Meta:
        unique_together = ('author', 'institution', 'year')

    def __str__(self):
        # Built from the raw foreign-key ids so rendering a row never has to
        # load the related Author/Institution objects.
        return f"{self.author_id} @ {self.institution_id} ({self.year})"
|
47 |
+
|
48 |
+
|
49 |
+
class Domain(TimestampedModel):
    """Top level of the Domain > Field > Subfield > Topic hierarchy."""

    id = models.CharField(max_length=10, primary_key=True)  # e.g., 3
    name = models.CharField(max_length=255)

    def __str__(self):
        return self.name
|
55 |
+
|
56 |
+
|
57 |
+
class Field(TimestampedModel):
    """A field belonging to one Domain (reverse accessor: ``domain.fields``)."""

    id = models.CharField(max_length=10, primary_key=True)  # e.g., 23
    name = models.CharField(max_length=255)
    domain = models.ForeignKey(
        Domain, on_delete=models.CASCADE, related_name='fields')

    def __str__(self):
        return self.name
|
65 |
+
|
66 |
+
|
67 |
+
class Subfield(TimestampedModel):
    """A subfield belonging to one Field (reverse accessor: ``field.subfields``)."""

    id = models.CharField(max_length=10, primary_key=True)  # e.g., 2304
    name = models.CharField(max_length=255)
    field = models.ForeignKey(
        Field, on_delete=models.CASCADE, related_name='subfields')

    def __str__(self):
        return self.name
|
75 |
+
|
76 |
+
|
77 |
+
class Topic(TimestampedModel):
    """A topic belonging to one Subfield (reverse accessor: ``subfield.topics``)."""

    id = models.CharField(max_length=20, primary_key=True)  # e.g., T13180
    name = models.CharField(max_length=255)
    subfield = models.ForeignKey(
        Subfield, on_delete=models.CASCADE, related_name='topics')

    def __str__(self):
        return self.name
|
85 |
+
|
86 |
+
|
87 |
+
class AuthorTopic(TimestampedModel):
    """Links an author to a topic, with a count and a share value.

    Each (author, topic) pair may be stored only once.
    """

    author = models.ForeignKey(
        Author, on_delete=models.CASCADE, related_name='topics')
    topic = models.ForeignKey(Topic, on_delete=models.CASCADE)
    count = models.IntegerField()
    share_value = models.FloatField()

    class Meta:
        unique_together = ('author', 'topic')

    def __str__(self):
        # Built from the raw foreign-key ids so rendering a row never has to
        # load the related Author/Topic objects.
        return f"{self.author_id} / {self.topic_id}"
|
96 |
+
|
97 |
+
|
98 |
+
class Work(TimestampedModel):
    """A work attributed to a single author (reverse accessor: ``author.works``)."""

    id = models.CharField(max_length=20, primary_key=True)  # e.g., W123456789
    author = models.ForeignKey(
        Author, on_delete=models.CASCADE, related_name='works')
    title = models.CharField(max_length=512)
    year = models.IntegerField()
    cited_by_count = models.IntegerField()

    def __str__(self):
        return self.title
|
108 |
+
|
109 |
+
|
110 |
+
class AuthorYearlyStats(TimestampedModel):
    """Per-year work and citation counts for one author.

    Each (author, year) pair may be stored only once.
    """

    author = models.ForeignKey(
        Author, on_delete=models.CASCADE, related_name='yearly_stats')
    year = models.IntegerField()
    works_count = models.IntegerField()
    cited_by_count = models.IntegerField()

    class Meta:
        unique_together = ('author', 'year')

    def __str__(self):
        # Built from the raw foreign-key id so rendering a row never has to
        # load the related Author object.
        return f"{self.author_id} ({self.year})"
|
119 |
+
|
120 |
+
|
121 |
+
class Concept(TimestampedModel):
    """A concept with an optional Wikidata URL, a level and a score."""

    id = models.CharField(max_length=20, primary_key=True)  # e.g., C41008148
    name = models.CharField(max_length=255)
    wikidata_url = models.URLField(null=True, blank=True)
    level = models.IntegerField()
    score = models.FloatField()

    def __str__(self):
        # The id is included alongside the name in the label.
        return f"{self.name} ({self.id})"
|
130 |
+
|
131 |
+
|
132 |
+
# NOTE(review): unlike the other models in this module this one extends
# models.Model directly, so it has no created_at/updated_at columns. Confirm
# whether that is intentional before changing it (it would need a migration).
class AuthorConcept(models.Model):
    """Links an author to a concept, with an optional level and score.

    Each (author, concept) pair may be stored only once.
    """

    author = models.ForeignKey(
        Author, on_delete=models.CASCADE, related_name='concepts')
    concept = models.ForeignKey(Concept, on_delete=models.CASCADE)
    level = models.IntegerField(null=True, blank=True)
    score = models.FloatField(null=True, blank=True)

    class Meta:
        unique_together = ('author', 'concept')

    def __str__(self):
        # Built from the raw foreign-key ids so rendering a row never has to
        # load the related Author/Concept objects.
        return f"{self.author_id} / {self.concept_id}"
|
core/schema.py
ADDED
@@ -0,0 +1,194 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import graphene
|
2 |
+
from graphene_django import DjangoObjectType
|
3 |
+
from graphene_django.filter import DjangoFilterConnectionField
|
4 |
+
from .models import (
|
5 |
+
Institution, Author, Affiliation, Domain, Field,
|
6 |
+
Subfield, Topic, AuthorTopic, Work, AuthorYearlyStats,
|
7 |
+
Concept, AuthorConcept
|
8 |
+
)
|
9 |
+
|
10 |
+
|
11 |
+
# === GraphQL Types ===
|
12 |
+
# Relay node type for Institution: exposes every model field and the lookups
# listed in filter_fields.
class InstitutionType(DjangoObjectType):
    class Meta:
        model = Institution
        fields = "__all__"
        # Maps each filterable field to its permitted lookup expressions.
        filter_fields = {
            'id': ['exact', 'icontains'],
            'name': ['icontains', 'iexact'],
            'ror_id': ['exact'],
            'country_code': ['exact'],
            'institution_type': ['exact'],
        }
        interfaces = (graphene.relay.Node,)
|
24 |
+
|
25 |
+
|
26 |
+
# Relay node type for Author: exposes every model field; the numeric metrics
# accept the range lookups (gte/lte) listed below.
class AuthorType(DjangoObjectType):
    class Meta:
        model = Author
        fields = "__all__"
        filter_fields = {
            'id': ['exact', 'icontains'],
            'name': ['icontains'],
            'orcid': ['exact'],
            'h_index': ['exact', 'gte', 'lte'],
            'cited_by_count': ['gte', 'lte'],
            'works_count': ['gte', 'lte'],
        }
        interfaces = (graphene.relay.Node,)
|
39 |
+
|
40 |
+
|
41 |
+
# Relay node type for Affiliation: filterable by the related author and
# institution ids, by year, and by the is_last_known flag.
class AffiliationType(DjangoObjectType):
    class Meta:
        model = Affiliation
        fields = "__all__"
        filter_fields = {
            'author__id': ['exact'],
            'institution__id': ['exact'],
            'year': ['exact', 'gte', 'lte'],
            'is_last_known': ['exact'],
        }
        interfaces = (graphene.relay.Node,)
|
52 |
+
|
53 |
+
|
54 |
+
# Relay node type for Domain: exposes every model field; filterable on id and name.
class DomainType(DjangoObjectType):
    class Meta:
        model = Domain
        fields = "__all__"
        filter_fields = ['id', 'name']
        interfaces = (graphene.relay.Node,)
|
60 |
+
|
61 |
+
|
62 |
+
# Relay node type for Field: filterable by id, name substring and parent domain id.
class FieldType(DjangoObjectType):
    class Meta:
        model = Field
        fields = "__all__"
        filter_fields = {
            'id': ['exact'],
            'name': ['icontains'],
            'domain__id': ['exact'],
        }
        interfaces = (graphene.relay.Node,)
|
72 |
+
|
73 |
+
|
74 |
+
# Relay node type for Subfield: filterable by id, name substring and parent field id.
class SubfieldType(DjangoObjectType):
    class Meta:
        model = Subfield
        fields = "__all__"
        filter_fields = {
            'id': ['exact'],
            'name': ['icontains'],
            'field__id': ['exact'],
        }
        interfaces = (graphene.relay.Node,)
|
84 |
+
|
85 |
+
|
86 |
+
# Relay node type for Topic: filterable by id, name substring and parent subfield id.
class TopicType(DjangoObjectType):
    class Meta:
        model = Topic
        fields = "__all__"
        filter_fields = {
            'id': ['exact'],
            'name': ['icontains'],
            'subfield__id': ['exact'],
        }
        interfaces = (graphene.relay.Node,)
|
96 |
+
|
97 |
+
|
98 |
+
# Relay node type for AuthorTopic: filterable by the related author and topic ids.
class AuthorTopicType(DjangoObjectType):
    class Meta:
        model = AuthorTopic
        fields = "__all__"
        filter_fields = {
            'author__id': ['exact'],
            'topic__id': ['exact'],
        }
        interfaces = (graphene.relay.Node,)
|
107 |
+
|
108 |
+
|
109 |
+
# Relay node type for Work: filterable by id, title substring, author id,
# year (exact or range) and citation-count range.
class WorkType(DjangoObjectType):
    class Meta:
        model = Work
        fields = "__all__"
        filter_fields = {
            'id': ['exact'],
            'title': ['icontains'],
            'author__id': ['exact'],
            'year': ['exact', 'gte', 'lte'],
            'cited_by_count': ['gte', 'lte'],
        }
        interfaces = (graphene.relay.Node,)
|
121 |
+
|
122 |
+
|
123 |
+
# Relay node type for AuthorYearlyStats: filterable by author id, exact year
# and works_count range.
class AuthorYearlyStatsType(DjangoObjectType):
    class Meta:
        model = AuthorYearlyStats
        fields = "__all__"
        filter_fields = {
            'author__id': ['exact'],
            'year': ['exact'],
            'works_count': ['gte', 'lte'],
        }
        interfaces = (graphene.relay.Node,)
|
133 |
+
|
134 |
+
|
135 |
+
# Relay node type for Concept: exposes every model field; filterable on
# id, name, level and score.
class ConceptType(DjangoObjectType):
    class Meta:
        model = Concept
        fields = "__all__"
        filter_fields = ['id', 'name', 'level', 'score']
        interfaces = (graphene.relay.Node,)
|
141 |
+
|
142 |
+
|
143 |
+
# Relay node type for AuthorConcept: filterable by the related author and concept ids.
class AuthorConceptType(DjangoObjectType):
    class Meta:
        model = AuthorConcept
        fields = "__all__"
        filter_fields = {
            'author__id': ['exact'],
            'concept__id': ['exact'],
        }
        interfaces = (graphene.relay.Node,)
|
152 |
+
|
153 |
+
|
154 |
+
# === Query with Filtered Connections ===
|
155 |
+
# Root query. Every model gets two entries: a single-node lookup
# (relay.Node.Field) and a filterable connection (DjangoFilterConnectionField)
# driven by the filter_fields declared on the corresponding type.
class Query(graphene.ObjectType):
    institution = graphene.relay.Node.Field(InstitutionType)
    all_institutions = DjangoFilterConnectionField(InstitutionType)

    author = graphene.relay.Node.Field(AuthorType)
    all_authors = DjangoFilterConnectionField(AuthorType)

    affiliation = graphene.relay.Node.Field(AffiliationType)
    all_affiliations = DjangoFilterConnectionField(AffiliationType)

    domain = graphene.relay.Node.Field(DomainType)
    all_domains = DjangoFilterConnectionField(DomainType)

    field = graphene.relay.Node.Field(FieldType)
    all_fields = DjangoFilterConnectionField(FieldType)

    subfield = graphene.relay.Node.Field(SubfieldType)
    all_subfields = DjangoFilterConnectionField(SubfieldType)

    topic = graphene.relay.Node.Field(TopicType)
    all_topics = DjangoFilterConnectionField(TopicType)

    author_topic = graphene.relay.Node.Field(AuthorTopicType)
    all_author_topics = DjangoFilterConnectionField(AuthorTopicType)

    work = graphene.relay.Node.Field(WorkType)
    all_works = DjangoFilterConnectionField(WorkType)

    author_yearly_stats = graphene.relay.Node.Field(AuthorYearlyStatsType)
    all_author_yearly_stats = DjangoFilterConnectionField(
        AuthorYearlyStatsType)

    concept = graphene.relay.Node.Field(ConceptType)
    all_concepts = DjangoFilterConnectionField(ConceptType)

    author_concept = graphene.relay.Node.Field(AuthorConceptType)
    all_author_concepts = DjangoFilterConnectionField(AuthorConceptType)
|
192 |
+
|
193 |
+
|
194 |
+
schema = graphene.Schema(query=Query)
|
core/tests.py
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
from django.test import TestCase
|
2 |
+
|
3 |
+
# Create your tests here.
|
core/views.py
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
from django.shortcuts import render
|
2 |
+
|
3 |
+
# Create your views here.
|
manage.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
"""Django's command-line utility for administrative tasks."""
|
3 |
+
import os
|
4 |
+
import sys
|
5 |
+
|
6 |
+
|
7 |
+
def main():
    """Configure the settings module and hand ``sys.argv`` to Django's CLI.

    Raises:
        ImportError: if Django cannot be imported; the original error is
            chained as the cause.
    """
    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'BridgeMentor.settings')
    try:
        from django.core.management import execute_from_command_line
    except ImportError as exc:
        # Replace the bare import failure with an actionable hint.
        hint = (
            "Couldn't import Django. Are you sure it's installed and "
            "available on your PYTHONPATH environment variable? Did you "
            "forget to activate a virtual environment?"
        )
        raise ImportError(hint) from exc
    else:
        execute_from_command_line(sys.argv)
|
19 |
+
|
20 |
+
|
21 |
+
if __name__ == '__main__':
|
22 |
+
main()
|
populate_user.py
ADDED
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from django.utils.timezone import make_aware, is_naive
|
2 |
+
from django.utils.dateparse import parse_datetime
|
3 |
+
from tqdm import tqdm
|
4 |
+
import glob
|
5 |
+
import json
|
6 |
+
from core.models import (
|
7 |
+
Author, Institution, Affiliation, Domain, Field, Subfield, Topic, AuthorTopic, AuthorYearlyStats, Concept, AuthorConcept
|
8 |
+
)
|
9 |
+
from urllib.parse import urlparse
|
10 |
+
|
11 |
+
|
12 |
+
def parse_id_from_url(url):
    """Return the last path segment of ``url``.

    For example ``"https://openalex.org/A5006834808"`` yields
    ``"A5006834808"``. Leading and trailing slashes of the path are ignored.

    Args:
        url: URL string whose final path segment is the identifier.

    Returns:
        The final path segment as a string, or ``None`` when ``url`` is not a
        string or cannot be parsed. A message is printed in the failure case.
    """
    # A non-string (typically None for a missing JSON field) cannot be parsed.
    # Report it and return None rather than stopping in an interactive
    # debugger, which would hang an unattended import.
    if not isinstance(url, str):
        print(f"Error parsing URL {url}: expected a string")
        return None
    try:
        return urlparse(url).path.strip('/').split('/')[-1]
    except ValueError as e:
        # urlparse raises ValueError for malformed URLs (e.g. a bad IPv6 host).
        print(f"Error parsing URL {url}: {e}")
        return None
|
18 |
+
|
19 |
+
|
20 |
+
def add_author(user_info, updated_date):
    """Create or update the ``Author`` row for one author record.

    Args:
        user_info: Author dict. Reads ``id``, ``display_name_alternatives``,
            ``display_name``, ``orcid``, ``summary_stats``, ``cited_by_count``
            and ``works_count``.
        updated_date: Datetime to store as the row's ``updated_at``.

    Returns:
        The ``Author`` instance, with ``updated_at`` set to ``updated_date``.
    """
    author_id = parse_id_from_url(user_info["id"])
    # Prefer the longest alternative spelling; fall back to the display name
    # when the list is empty or absent (max() on an empty list would raise).
    name = max(
        user_info.get("display_name_alternatives") or [],
        key=len,
        default=user_info.get("display_name") or "",
    )
    author, _ = Author.objects.update_or_create(
        id=author_id,
        defaults={
            "name": name,
            "orcid": parse_id_from_url(user_info["orcid"]) if user_info.get("orcid") else None,
            "h_index": user_info["summary_stats"]["h_index"],
            "i10_index": user_info["summary_stats"]["i10_index"],
            "cited_by_count": user_info["cited_by_count"],
            "works_count": user_info["works_count"],
            "mean_2yr_citedness": user_info["summary_stats"]["2yr_mean_citedness"]
        }
    )
    # ``updated_at`` is declared with auto_now=True, so Model.save() replaces
    # any assigned value with the current time. QuerySet.update() does not
    # apply auto_now, so the source timestamp is stored as intended.
    Author.objects.filter(pk=author.pk).update(updated_at=updated_date)
    author.updated_at = updated_date
    return author
|
37 |
+
|
38 |
+
|
39 |
+
def add_institution(inst_data):
    """Create or update the ``Institution`` row for one institution record.

    Args:
        inst_data: Institution dict. Reads ``id``, ``display_name``, ``ror``,
            ``type`` and, optionally, ``country_code``.

    Returns:
        The ``Institution`` instance.
    """
    inst_id = parse_id_from_url(inst_data["id"])
    inst, _ = Institution.objects.update_or_create(
        id=inst_id,
        defaults={
            "name": inst_data["display_name"],
            "ror_id": parse_id_from_url(inst_data["ror"]),
            # Use the source value when present; the placeholder applies only
            # when the code is missing or empty. (With the operands the other
            # way round, ``or`` would always yield the placeholder.)
            # NOTE(review): 'N/A' is three characters while
            # Institution.country_code declares max_length=2 - confirm the
            # intended placeholder.
            "country_code": inst_data.get("country_code") or 'N/A',
            "institution_type": inst_data["type"]
        }
    )
    return inst
|
51 |
+
|
52 |
+
|
53 |
+
def add_affiliations(author, affiliations, last_known_insts):
    """Upsert one ``Affiliation`` row per (institution, year) of an author.

    Args:
        author: ``Author`` instance the affiliations belong to.
        affiliations: Iterable of dicts with ``institution`` and ``years``.
        last_known_insts: Iterable of institution dicts; an affiliation is
            flagged ``is_last_known`` when its institution id is among them.
    """
    current_ids = set()
    for entry in last_known_insts:
        current_ids.add(parse_id_from_url(entry["id"]))

    for record in affiliations:
        institution = add_institution(record["institution"])
        # The flag depends only on the institution, not on the year.
        is_current = institution.id in current_ids
        for year in record["years"]:
            Affiliation.objects.update_or_create(
                author=author,
                institution=institution,
                year=year,
                defaults={"is_last_known": is_current},
            )
|
65 |
+
|
66 |
+
|
67 |
+
def add_hierarchy(domain_data):
    """Upsert the ``Domain`` described by ``domain_data`` and return it.

    Reads ``id`` (a URL whose last path segment is the key) and
    ``display_name``.
    """
    pk = parse_id_from_url(domain_data["id"])
    values = {"name": domain_data["display_name"]}
    record, _created = Domain.objects.update_or_create(id=pk, defaults=values)
    return record
|
72 |
+
|
73 |
+
|
74 |
+
def add_field(field_data, domain_data):
    """Upsert a ``Field`` (and its parent ``Domain``) and return the field.

    The parent domain is upserted first so the foreign key can reference it.
    """
    parent = add_hierarchy(domain_data)
    pk = parse_id_from_url(field_data["id"])
    values = {"name": field_data["display_name"], "domain": parent}
    record, _created = Field.objects.update_or_create(id=pk, defaults=values)
    return record
|
82 |
+
|
83 |
+
|
84 |
+
def add_subfield(subfield_data, field_data, domain_data):
    """Upsert a ``Subfield`` (and its parent chain) and return the subfield.

    The parent field, and through it the domain, is upserted first.
    """
    parent = add_field(field_data, domain_data)
    pk = parse_id_from_url(subfield_data["id"])
    values = {"name": subfield_data["display_name"], "field": parent}
    record, _created = Subfield.objects.update_or_create(id=pk, defaults=values)
    return record
|
92 |
+
|
93 |
+
|
94 |
+
def add_topic(author, topic_data, topic_share_map):
    """Upsert a ``Topic`` with its hierarchy and link it to ``author``.

    Args:
        author: ``Author`` instance to link.
        topic_data: Topic dict; reads ``id``, ``display_name``, ``count``,
            ``subfield``, ``field`` and ``domain``.
        topic_share_map: Mapping of topic id to share value; topics missing
            from it get a share of 0.
    """
    pk = parse_id_from_url(topic_data["id"])
    parent = add_subfield(
        topic_data["subfield"], topic_data["field"], topic_data["domain"])
    topic_values = {"name": topic_data["display_name"], "subfield": parent}
    topic_row, _created = Topic.objects.update_or_create(
        id=pk, defaults=topic_values)

    share = topic_share_map.get(pk, 0)
    link_values = {"count": topic_data["count"], "share_value": share}
    AuthorTopic.objects.update_or_create(
        author=author, topic=topic_row, defaults=link_values)
|
107 |
+
|
108 |
+
|
109 |
+
def add_topic_shares(topic_share_list):
    """Build a ``{topic id: share value}`` mapping from a list of share entries.

    Each entry supplies ``id`` (a URL whose last path segment is the topic id)
    and ``value``. A later entry with the same id overrides an earlier one.
    """
    shares = {}
    for entry in topic_share_list:
        key = parse_id_from_url(entry["id"])
        shares[key] = entry["value"]
    return shares
|
111 |
+
|
112 |
+
|
113 |
+
def add_yearly_stats(author, stats):
    """Upsert one ``AuthorYearlyStats`` row per entry of ``stats``.

    Each entry supplies ``year``, ``works_count`` and ``cited_by_count``.
    """
    for row in stats:
        year = row["year"]
        counts = {
            "works_count": row["works_count"],
            "cited_by_count": row["cited_by_count"],
        }
        AuthorYearlyStats.objects.update_or_create(
            author=author, year=year, defaults=counts)
|
123 |
+
|
124 |
+
|
125 |
+
def add_concepts(author, concepts):
    """Upsert each concept and its link to ``author``.

    Each entry supplies ``id``, ``display_name``, ``level``, ``score`` and,
    optionally, ``wikidata``. The level and score are written both to the
    ``Concept`` row and to the ``AuthorConcept`` link row.
    """
    for item in concepts:
        pk = parse_id_from_url(item["id"])
        concept_values = {
            "name": item["display_name"],
            "wikidata_url": item.get("wikidata"),
            "level": item["level"],
            "score": item["score"],
        }
        concept_row, _created = Concept.objects.update_or_create(
            id=pk, defaults=concept_values)

        link_values = {"level": item["level"], "score": item["score"]}
        AuthorConcept.objects.update_or_create(
            author=author, concept=concept_row, defaults=link_values)
|
145 |
+
|
146 |
+
|
147 |
+
def populate_user(user_info):
    """Import one author record together with everything attached to it.

    The record is skipped when an author with the same id is already stored
    and its ``updated_at`` is at least as recent as the record's
    ``updated_date``.

    Args:
        user_info: Author dict. Reads ``id``, ``updated_date``,
            ``affiliations``, ``last_known_institutions``, ``topic_share``,
            ``topics``, ``counts_by_year`` and ``x_concepts`` (plus the keys
            read by ``add_author``).
    """
    from django.db import transaction

    author_id = parse_id_from_url(user_info["id"])
    updated_date = parse_datetime(user_info["updated_date"])
    author = Author.objects.filter(id=author_id).first()
    if is_naive(updated_date):
        updated_date = make_aware(updated_date)
    if author and (author.updated_at >= updated_date):
        return
    # All-or-nothing: if any step fails, the author row is rolled back too, so
    # a later run does not see a half-imported author as already up to date.
    with transaction.atomic():
        author = add_author(user_info, updated_date)
        add_affiliations(
            author, user_info["affiliations"], user_info["last_known_institutions"])
        topic_share_map = add_topic_shares(user_info["topic_share"])
        for topic in user_info["topics"]:
            add_topic(author, topic, topic_share_map)
        add_yearly_stats(author, user_info["counts_by_year"])
        add_concepts(author, user_info["x_concepts"])
|
163 |
+
|
164 |
+
|
165 |
+
# Call this function to load data
|
166 |
+
# populate_user(user_info)
|
167 |
+
|
168 |
+
import os

# Directory holding the downloaded author pages (one JSON file per page).
# Set BRIDGEMENTOR_AUTHORS_DIR to point elsewhere; the default is unchanged.
jsons = os.environ.get(
    "BRIDGEMENTOR_AUTHORS_DIR",
    "/Users/sgautam/Documents/BridgeMentor/C41008148_authors",
)
# Sorted so pages are processed in a stable, repeatable order.
json_files = sorted(glob.glob(f"{jsons}/*.json"))
# tqdm wraps the list (which has a length) rather than enumerate(), so the
# outer progress bar can show a total.
for page, json_file in enumerate(tqdm(json_files)):
    # Read as UTF-8 explicitly instead of relying on the platform default.
    # NOTE(review): assumes the files are UTF-8 encoded - confirm.
    with open(json_file, "r", encoding="utf-8") as file:
        user_infos = json.load(file)['results']
    for user_info in tqdm(user_infos, leave=False):
        populate_user(user_info)
        # NOTE(review): assumed to report progress once per author - confirm.
        print(f"{page}-{user_info['display_name']}")
|
175 |
+
|
176 |
+
# python manage.py shell
|
177 |
+
# from populate_user import populate_user
|
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
django==4.2.20
|
2 |
+
graphene-django
|
3 |
+
django-filter
|
scrap_openalex.py
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import requests
|
3 |
+
import json
|
4 |
+
from time import sleep
|
5 |
+
|
6 |
+
# Base API URL
BASE_URL = "https://api.openalex.org/authors"
FILTER = "last_known_institutions.country_code:NO,x_concepts.id:C41008148"
PER_PAGE = 200
TOTAL_RESULTS = 86500
TOTAL_PAGES = (TOTAL_RESULTS + PER_PAGE - 1) // PER_PAGE  # Ceiling division
# First page to request. NOTE(review): presumably earlier pages were fetched
# by a previous run - confirm before lowering this.
START_PAGE = 50
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 30

# Output directory
OUTPUT_DIR = "C41008148_authors"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# NOTE(review): OpenAlex documents that page-number paging only reaches the
# first 10,000 results of a list; with PER_PAGE = 200 that is page 50. Pages
# beyond it likely need cursor paging - confirm against the API documentation.

# Loop through pages
for page in range(START_PAGE, TOTAL_PAGES + 1):
    filename = os.path.join(OUTPUT_DIR, f"{page:010}.json")
    # Check before requesting, so a resumed run does not download pages that
    # are already on disk.
    if os.path.exists(filename):
        print(f"File {filename} already exists, skipping...")
        continue

    url = f"{BASE_URL}?filter={FILTER}&per-page={PER_PAGE}&page={page}"
    try:
        print(f"Fetching page {page}...")
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        data = response.json()

        # Write to a temporary name and rename afterwards, so an interrupted
        # write never leaves a truncated file that later runs would skip.
        partial = filename + ".part"
        with open(partial, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        os.replace(partial, filename)
    except (requests.RequestException, ValueError, OSError) as e:
        print(f"Error on page {page}: {e}")
        break
    sleep(1)  # Rate-limiting to avoid hitting the server too hard
else:
    # Reached only when the loop was not aborted by an error.
    print("Download complete.")
|