Spaces:
Running
Running
victormiller
committed on
Update curated.py
Browse files- curated.py +49 -0
curated.py
CHANGED
@@ -808,6 +808,54 @@ fig.update_layout(
|
|
808 |
# Show the plot
|
809 |
stacked_bar = fig
|
810 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
811 |
|
812 |
def curated(request):
|
813 |
|
@@ -943,6 +991,7 @@ def curated(request):
|
|
943 |
plotly2fasthtml(get_chart_28168342()),
|
944 |
plotly2fasthtml(get_chart_new()),
|
945 |
plotly2fasthtml(stacked_bar),
|
|
|
946 |
H2("Curated Sources Processing"),
|
947 |
filtering_process,
|
948 |
data_preparation_div,
|
|
|
808 |
# Show the plot
|
809 |
stacked_bar = fig
|
810 |
|
811 |
+
# Aggregating the data for filters and datasets
|
812 |
+
filter_data = {
|
813 |
+
'Filter': ['Language Filter', 'Min Word Count', 'Unigram log probability', 'Total Lines Remaining'],
|
814 |
+
'Wikipedia': [0, 1146416, 60468491, 60468491],
|
815 |
+
'Freelaw': [2280522, 5518932, 68171834, 68123174],
|
816 |
+
'DM Maths': [0, 0, 112559888, 112559888],
|
817 |
+
'USPTO': [1312, 129042, 6749922, 6749389],
|
818 |
+
'PG19': [69, 1, 28682, 28632],
|
819 |
+
'Hackernews': [54129, 314, 2010488, 2003636],
|
820 |
+
'Ubuntu IRC': [14465, 33, 23468, 23205],
|
821 |
+
'Europarl': [0, 0, 69814, 69814],
|
822 |
+
'StackExchange': [0, 196, 23246352, 23246352],
|
823 |
+
'Arxiv': [42426, 105601, 1763840, 1762661],
|
824 |
+
'S2ORC': [0, 0, 12963563, 12963563],
|
825 |
+
'S2ORC Abstract': [18456575, 978308, 82889293, 82777912],
|
826 |
+
'Pubmed Central': [400446, 62176, 4768310, 4767474],
|
827 |
+
'Pubmed Abstract': [3100, 36419, 25747955, 25746724],
|
828 |
+
'Phil Papers': [10214, 0, 39175, 39128]
|
829 |
+
}
|
830 |
+
|
831 |
+
# Creating a new dataframe for the filter data
|
832 |
+
filter_df = pd.DataFrame(filter_data)
|
833 |
+
|
834 |
+
# Creating the stacked bar chart
|
835 |
+
fig = go.Figure()
|
836 |
+
|
837 |
+
# Add trace for each dataset
|
838 |
+
for dataset in filter_df.columns[1:]:
|
839 |
+
fig.add_trace(go.Bar(
|
840 |
+
name=dataset,
|
841 |
+
x=filter_df['Filter'],
|
842 |
+
y=filter_df[dataset]
|
843 |
+
))
|
844 |
+
|
845 |
+
# Update the layout
|
846 |
+
fig.update_layout(
|
847 |
+
barmode='stack',
|
848 |
+
title='Stacked Bar Chart of Filters for Each Dataset',
|
849 |
+
xaxis_title='Filter',
|
850 |
+
yaxis_title='Number of Lines',
|
851 |
+
legend_title='Dataset',
|
852 |
+
height=600,
|
853 |
+
width=1000
|
854 |
+
)
|
855 |
+
|
856 |
+
# Keep the finished figure under its own name; it is rendered later via plotly2fasthtml(diff_stacked_bar)
|
857 |
+
diff_stacked_bar = fig
|
858 |
+
|
859 |
|
860 |
def curated(request):
|
861 |
|
|
|
991 |
plotly2fasthtml(get_chart_28168342()),
|
992 |
plotly2fasthtml(get_chart_new()),
|
993 |
plotly2fasthtml(stacked_bar),
|
994 |
+
plotly2fasthtml(diff_stacked_bar),
|
995 |
H2("Curated Sources Processing"),
|
996 |
filtering_process,
|
997 |
data_preparation_div,
|