import io import pandas as pd import geopandas as gpd from geopy.geocoders import Nominatim geolocator = Nominatim(user_agent='Cities OECD') import requests import requests_cache from retry_requests import retry from fake_useragent import UserAgent ua = UserAgent(browsers=['Chrome']) cache_session = requests_cache.CachedSession('.cache', expire_after = 3600) retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2) headers = { 'Sec-Fetch-Dest': 'empty', 'Sec-Fetch-Mode': 'cors', 'Sec-Fetch-Site': 'same-origin', 'Upgrade-Insecure-Requests': '1', 'Connection': 'keep-alive', 'Accept-Encoding': 'gzip, deflate, br', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', 'Accept-Language': 'en-US,en;q=0.9', 'Cache-Control': 'max-age=0', 'User-Agent': ua.random, } url_data = 'https://sdmx.oecd.org/archive/rest/data/OECD,DF_CITIES,/all?startPeriod=2018&dimensionAtObservation=AllDimensions&format=csvfilewithlabels' def geocode_nan(row): if pd.isnull(row['coordinates']): geolocate = row['Metropolitan areas'] location = geolocator.geocode(geolocate, timeout=10) row['coordinates'] = (location.longitude, location.latitude) else: row['coordinates'] = row['coordinates'] return row def oecd_data(): csv = retry_session.get(url_data, headers=headers) df = pd.read_csv(io.StringIO(csv.text), usecols=['METRO_ID', 'Metropolitan areas', 'Variables', 'TIME_PERIOD', 'OBS_VALUE'], low_memory=False) variables = ['Income', 'GDP'] df = df[df['Variables'].str.contains('|'.join(variables))].copy() df = df.sort_values(['METRO_ID', 'Variables', 'TIME_PERIOD'], ascending=[True, True, True]) df = df.drop_duplicates(subset=['METRO_ID', 'Variables'], keep='last') df_concat = df[['METRO_ID', 'Metropolitan areas']].sort_values('METRO_ID').drop_duplicates(subset='METRO_ID', keep='first').reset_index(drop=True) df_concat = df_concat[df_concat['METRO_ID'].str.len() > 3].reset_index(drop=True).copy() df_groups = df.groupby('Variables') for group in df_groups: group_name = group[0] group = group[1][['METRO_ID', 'OBS_VALUE']] group = group.rename(columns={'OBS_VALUE': group_name}) df_concat = df_concat.merge(group, on='METRO_ID', how='left') df_concat.to_csv('OECD_DF_CITIES_all.csv', encoding='utf-8', index=False) url_shap = 'https://www.oecd.org/content/dam/oecd/en/data/datasets/oecd-definition-of-cities-and-functional-urban-areas/fuas%20(1).zip' shape = retry_session.get(url_shap, headers=headers) with open('OECD_FUAS_shape.zip', 'wb') as z: z.write(shape.content) df_fua = gpd.read_file('OECD_FUAS_shape.zip') df_fua['centroid'] = df_fua.representative_point() df_fua['coordinates'] = df_fua['centroid'].apply(lambda p: (p.x, p.y)) df_fua['fuacode'] = df_fua['fuacode'].str.replace(r'F$', '', regex=True) df_fua = df_fua[['fuacode', 'coordinates']] df_fua = df_fua.rename(columns={'fuacode': 'METRO_ID'}) df_merged = df_concat.merge(df_fua, on='METRO_ID', how='left') #df_merged[['Longitude', 'Latitude']] = df_merged.apply(lambda x: geocode_nan(x) if pd.isnull(x['Longitude']) else [x['Longitude'], x['Latitude']], axis=1, result_type='expand') df_merged = df_merged.apply(geocode_nan, axis=1) df_merged.to_csv('OECD_DF_CITIES_coord.csv', encoding='utf-8', index=False) return df_merged