Yuchen committed on
Commit
618792d
·
1 Parent(s): 29cb463
Files changed (2) hide show
  1. app.py +94 -47
  2. data/meta.json +1 -1
app.py CHANGED
@@ -1,13 +1,17 @@
1
- import streamlit as st
2
- import pandas as pd
3
- import numpy as np
4
  import json
 
5
  import altair as alt
 
 
 
 
6
  from utils import get_variable_filter, is_numerical
7
 
 
8
  def check_codes(var, data_meta):
9
  return 'codes' in data_meta[var] and data_meta[var]['codes'] is not None and len(data_meta[var]['codes']) > 0
10
 
 
11
  @st.cache(show_spinner=False)
12
  def load_data():
13
  with st.spinner('loading data...'):
@@ -21,20 +25,24 @@ def load_data():
21
  name2id[k] = {vv: int(kk) for kk, vv in v['codes'].items()}
22
  return data_main, data_meta, data_variable, data_filter, name2id
23
 
 
24
  DATA_MAIN, DATA_META, DATA_VARIABLE, DATA_FILTER, NAME2ID = load_data()
25
 
 
26
  def id2name(var, id_):
27
  id_ = str(id_)
28
  if var not in DATA_META or 'codes' not in DATA_META[var]:
29
  return id_
30
  return DATA_META[var]['codes'].get(id_, id_)
31
 
 
32
  @st.cache
33
  def name2id(var, name):
34
  if var in NAME2ID:
35
  return NAME2ID[var][name]
36
  return name
37
 
 
38
  def main():
39
  # st.write(df.head())
40
  load_data()
@@ -47,19 +55,22 @@ def main():
47
  'Heatmap': heat,
48
  'Histogram': hist,
49
  }
50
- st.title('HRL Portal')
51
  with st.sidebar:
52
- chart = st.selectbox('Select a chart type:', list(charts.keys()))
 
53
  # st.write(DATA_VARIABLE)
54
  # st.write(DATA_FILTER)
55
  charts[chart]()
56
 
 
57
  @st.cache
58
  def get_unique(var):
59
  if check_codes(var, DATA_META):
60
  return list(DATA_META[var]['codes'].values())
61
  return DATA_MAIN[var].unique().tolist()
62
 
 
63
  def get_var_name(var):
64
  if var in DATA_META and 'name' in DATA_META[var]:
65
  name = DATA_META[var]['name']
@@ -68,75 +79,105 @@ def get_var_name(var):
68
  return f"{var} ({name})"
69
  return var
70
 
 
71
  def name2var(name):
72
  return name.split()[0]
73
 
 
74
  def meta():
75
  st.header('Variables')
76
  for k, v in DATA_META.items():
77
  st.subheader(k)
78
  st.write(v['description'])
79
 
 
80
  def area():
81
  with st.sidebar:
82
- var_name = st.selectbox('Select a variable:', [get_var_name(var) for var in DATA_VARIABLE['area']])
 
83
  var = name2var(var_name)
84
- st.write('Filters:')
 
85
  filters = {}
86
  for fvar in DATA_FILTER[var]:
87
  if fvar == 'YEAR':
88
  continue
89
  if is_numerical(fvar):
90
- names = st.slider(fvar, min(get_unique(fvar)), max(get_unique(fvar)), value=(min(get_unique(fvar)), max(get_unique(fvar))))
 
91
  else:
92
  names = st.multiselect(fvar, get_unique(fvar), default=get_unique(fvar))
93
  filters[fvar] = set([name2id(fvar, n) for n in names])
94
- st.header(f'Area Chart: {var_name}')
95
- with st.expander(f'{var}'):
96
- st.write(DATA_META.get(var, {}).get('description', ''))
97
-
98
- names = st.multiselect('Select values to display:', get_unique(var),
99
- default=['Authors',
100
- 'Musicians and music teachers',
101
- 'Telephone operators',
102
- 'Bus drivers','Cashiers'] if var == 'OCC1950' else ['Drugs and medicines',
103
- 'Fisheries',
104
- 'Glass and glass products',
105
- ])
106
- vals = set([name2id(var, n) for n in names])
107
- df = get_area_data(var, filters, vals)
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  # st.write(df)
109
- st.write(f'found {len(df)} records')
110
- if len(df) > 0:
111
- selection = alt.selection_multi(fields=[var], bind='legend')
112
- plot = alt.Chart(df, title=f'Count of Different {var} Values').mark_area().encode(alt.X('YEAR'),
113
- alt.Y('count', title='count', stack='zero'),
114
- alt.Color(var,
115
- scale=alt.Scale(scheme='category20'),
116
- legend=alt.Legend(orient='bottom')),
117
- opacity=alt.condition(selection,
118
- alt.value(
119
- 1),
120
- alt.value(
121
- 0.3)),
122
- tooltip='count'
123
- )\
124
- .properties(width=650).add_selection(selection)
125
- st.write(plot)
 
 
 
 
 
 
 
 
126
 
127
  @st.cache(show_spinner=False)
128
- def get_area_data(var, filters, vals):
129
  # for k, v in filters.items():
130
  # st.write(k)
131
  # st.write(v)
132
  # st.write(DATA_MAIN[k].unique())
133
  # st.write(len(DATA_MAIN))
134
- df = DATA_MAIN[['YEAR', var, *filters.keys()]]
135
- df = df[df[var].isin(vals)]
136
- with st.spinner('filtering...'):
137
- for fvar, fvals in filters.items():
138
- df = df[df[fvar].isin(fvals)]
139
- df = df[['YEAR', var]]
 
 
 
140
  # st.write(len(df))
141
  # st.write(df.head())
142
  with st.spinner('counting...'):
@@ -144,24 +185,30 @@ def get_area_data(var, filters, vals):
144
  groups[var] = groups[var].apply(lambda x: id2name(var, x))
145
  return groups
146
 
 
147
  def line():
148
  st.header('Line Graph')
149
 
 
150
  def scatter():
151
  st.header('Scatter Plot')
152
 
 
153
  def bar():
154
  st.header('Bar Chart')
155
 
 
156
  def box():
157
  st.header('Box Plot')
158
 
 
159
  def heat():
160
  st.header('Heatmap')
161
 
 
162
  def hist():
163
  st.header('Histogram')
164
 
 
165
  if __name__ == '__main__':
166
  main()
167
-
 
 
 
 
1
  import json
2
+ import math
3
  import altair as alt
4
+ import pandas as pd
5
+ import streamlit as st
6
+ st.set_page_config(layout="wide")
7
+
8
  from utils import get_variable_filter, is_numerical
9
 
10
+
11
  def check_codes(var, data_meta):
12
  return 'codes' in data_meta[var] and data_meta[var]['codes'] is not None and len(data_meta[var]['codes']) > 0
13
 
14
+
15
  @st.cache(show_spinner=False)
16
  def load_data():
17
  with st.spinner('loading data...'):
 
25
  name2id[k] = {vv: int(kk) for kk, vv in v['codes'].items()}
26
  return data_main, data_meta, data_variable, data_filter, name2id
27
 
28
+
29
  DATA_MAIN, DATA_META, DATA_VARIABLE, DATA_FILTER, NAME2ID = load_data()
30
 
31
+
32
  def id2name(var, id_):
33
  id_ = str(id_)
34
  if var not in DATA_META or 'codes' not in DATA_META[var]:
35
  return id_
36
  return DATA_META[var]['codes'].get(id_, id_)
37
 
38
+
39
  @st.cache
40
  def name2id(var, name):
41
  if var in NAME2ID:
42
  return NAME2ID[var][name]
43
  return name
44
 
45
+
46
  def main():
47
  # st.write(df.head())
48
  load_data()
 
55
  'Heatmap': heat,
56
  'Histogram': hist,
57
  }
58
+ # st.title('HRL Portal')
59
  with st.sidebar:
60
+ st.subheader('Chart Type')
61
+ chart = st.selectbox('Select a chart type:', list(charts.keys()), label_visibility='collapsed')
62
  # st.write(DATA_VARIABLE)
63
  # st.write(DATA_FILTER)
64
  charts[chart]()
65
 
66
+
67
  @st.cache
68
  def get_unique(var):
69
  if check_codes(var, DATA_META):
70
  return list(DATA_META[var]['codes'].values())
71
  return DATA_MAIN[var].unique().tolist()
72
 
73
+
74
  def get_var_name(var):
75
  if var in DATA_META and 'name' in DATA_META[var]:
76
  name = DATA_META[var]['name']
 
79
  return f"{var} ({name})"
80
  return var
81
 
82
+
83
  def name2var(name):
84
  return name.split()[0]
85
 
86
+
87
  def meta():
88
  st.header('Variables')
89
  for k, v in DATA_META.items():
90
  st.subheader(k)
91
  st.write(v['description'])
92
 
93
+
94
  def area():
95
  with st.sidebar:
96
+ st.subheader('Variable')
97
+ var_name = st.selectbox('Select a variable:', [get_var_name(var) for var in DATA_VARIABLE['area']], label_visibility='collapsed')
98
  var = name2var(var_name)
99
+ st.subheader('Filters')
100
+ # apply_fil = st.button('Apply Filters')
101
  filters = {}
102
  for fvar in DATA_FILTER[var]:
103
  if fvar == 'YEAR':
104
  continue
105
  if is_numerical(fvar):
106
+ names = st.slider(fvar, min(get_unique(fvar)), max(get_unique(fvar)),
107
+ value=(min(get_unique(fvar)), max(get_unique(fvar))))
108
  else:
109
  names = st.multiselect(fvar, get_unique(fvar), default=get_unique(fvar))
110
  filters[fvar] = set([name2id(fvar, n) for n in names])
111
+ st.header(f'Area Chart - {var_name}')
112
+ # with st.expander(f'{var}'):
113
+ # st.write(DATA_META.get(var, {}).get('description', ''))
114
+
115
+ # st.write(len([v for k, v in DATA_META[var]['codes'].items() if k != -1]))
116
+ # vals = set([name2id(var, n) for n in names])
117
+
118
+ container = st.container()
119
+ all = st.button("Select all")
120
+ clear = st.button("Clear all")
121
+
122
+ if all:
123
+ names = container.multiselect(f"Select one or more {var} values to display:",
124
+ get_unique(var), [v for k, v in DATA_META[var]['codes'].items() if k != -1])
125
+ elif clear:
126
+ names = container.multiselect(f"Select one or more {var} values to display:",
127
+ get_unique(var), [])
128
+ else:
129
+ names = container.multiselect(f"Select one or more {var} values to display:",
130
+ get_unique(var), default=['Authors', 'Musicians and music teachers', 'Telephone operators', 'Bus drivers', 'Cashiers'] if var == 'OCC1950' else ['Drugs and medicines','Fisheries','Glass and glass products',])
131
+
132
+ # if apply_fil:
133
+ # df = get_area_data(var, filters)
134
+ # else:
135
+ # df = get_area_data(var)
136
+ df = get_area_data(var, filters)
137
+ df = df[df[var].isin(names)]
138
  # st.write(df)
139
+
140
+ if len(df) == 0:
141
+ st.warning('Select some data to display')
142
+ return
143
+ selection = alt.selection_multi(fields=[var], bind='legend')
144
+ plot = alt.Chart(df, title=f'Count of Different {var} Values').mark_area().encode(alt.X('YEAR'),
145
+ alt.Y('count', title='count',
146
+ stack='zero'),
147
+ alt.Color(var,
148
+ scale=alt.Scale(
149
+ scheme='category20'),
150
+ legend=alt.Legend(
151
+ # orient='bottom',
152
+ # columns=3
153
+ )),
154
+ opacity=alt.condition(selection,
155
+ alt.value(
156
+ 1),
157
+ alt.value(
158
+ 0.3)),
159
+ tooltip=['YEAR', var, 'count']
160
+ ) \
161
+ .properties(height=400 + len(names)).add_selection(selection)
162
+ st.altair_chart(plot, use_container_width=True)
163
+
164
 
165
  @st.cache(show_spinner=False)
166
+ def get_area_data(var, filters=None):
167
  # for k, v in filters.items():
168
  # st.write(k)
169
  # st.write(v)
170
  # st.write(DATA_MAIN[k].unique())
171
  # st.write(len(DATA_MAIN))
172
+ if filters is not None:
173
+ df = DATA_MAIN[['YEAR', var, *filters.keys()]]
174
+ # df = df[df[var].isin(vals)]
175
+ with st.spinner('filtering...'):
176
+ for fvar, fvals in filters.items():
177
+ df = df[df[fvar].isin(fvals)]
178
+ df = df[['YEAR', var]]
179
+ else:
180
+ df = DATA_MAIN[['YEAR', var]]
181
  # st.write(len(df))
182
  # st.write(df.head())
183
  with st.spinner('counting...'):
 
185
  groups[var] = groups[var].apply(lambda x: id2name(var, x))
186
  return groups
187
 
188
+
189
  def line():
190
  st.header('Line Graph')
191
 
192
+
193
  def scatter():
194
  st.header('Scatter Plot')
195
 
196
+
197
  def bar():
198
  st.header('Bar Chart')
199
 
200
+
201
  def box():
202
  st.header('Box Plot')
203
 
204
+
205
  def heat():
206
  st.header('Heatmap')
207
 
208
+
209
  def hist():
210
  st.header('Histogram')
211
 
212
+
213
  if __name__ == '__main__':
214
  main()
 
data/meta.json CHANGED
@@ -647,7 +647,7 @@
647
  },
648
  "OCC1950": {
649
  "name": "Occupation",
650
- "description": "OCC1950 applies the 1950 Census Bureau occupational classification system to occupational data, to enhance comparability across years. For pre-1940 samples created at Minnesota, the alphabetic responses supplied by enumerators were directly coded into the 1950 classification. For other samples, the information in the variable OCC was recoded into the 1950 classification. Codes above 970 are non-occupational responses retained in the historical census samples or blank/unknown. The design of OCC1950 is described at length in \" Integrated Occupation and Industry Codes and Occupational Standing Variables in the IPUMS.\". The composition of the 1950 occupation categories is described in detail in U.S. Bureau of the Census, Alphabetic Index of Occupations and Industries: 1950 (Washington D.C., 1950).\n\nIn 1850-1880, any laborer with no specified industry in a household with a farmer is recoded into farm labor. In 1860-1900, any woman with an occupational response of \"housekeeper\" enters the non-occupational category \"keeping house\" if she is related to the head of household. Cases affected by these imputation procedures are identified by an appropriate data quality flag.\n\nA parallel variable called OCC1990, available for the samples from 1950 onward, codes occupations into a simplified version of the 1990 occupational coding scheme.",
651
  "codes": {
652
  "0": "Accountants and auditors",
653
  "1": "Actors and actresses",
 
647
  },
648
  "OCC1950": {
649
  "name": "Occupation",
650
+ "description": "OCC1950 applies the 1950 Census Bureau occupational classification system to occupational data, to enhance comparability across years. For pre-1940 samples created at Minnesota, the alphabetic responses supplied by enumerators were directly coded into the 1950 classification. For other samples, the information in the variable OCC was recoded into the 1950 classification.",
651
  "codes": {
652
  "0": "Accountants and auditors",
653
  "1": "Actors and actresses",