Spaces:
Running
Running
victormiller
commited on
Update curated.py
Browse files- curated.py +4 -206
curated.py
CHANGED
@@ -645,101 +645,6 @@ def get_data(data_source: str = "Freelaw", doc_id: int = 3, target: str = "foo")
|
|
645 |
)
|
646 |
|
647 |
|
648 |
-
def get_chart_28168342():
|
649 |
-
fig = go.Figure()
|
650 |
-
filter_names = [
|
651 |
-
"Download",
|
652 |
-
"Language",
|
653 |
-
"Min word count",
|
654 |
-
"Title Abstract",
|
655 |
-
"Majority language",
|
656 |
-
"Paragraph count",
|
657 |
-
"Frequency",
|
658 |
-
"Unigram log probability",
|
659 |
-
"Local dedup",
|
660 |
-
]
|
661 |
-
|
662 |
-
data_sources = [
|
663 |
-
("Wikipedia", [61614907, 61614907, 60468491, 60468491, 60468491, 60468491, 60468491, 60468491, 20]),
|
664 |
-
("Freelaw", [75971288, 73690766, 68171834, 68171834, 68171834, 68171834, 68171834, 68123174, 20]),
|
665 |
-
("DM Maths", [112559888, 112559888, 112559888, 112559888, 112559888, 112559888, 112559888, 112559888, 20]),
|
666 |
-
("USPTO", [6880276, 6878964, 6749922, 6749922, 6749922, 6749922, 6749922, 6749389, 20]),
|
667 |
-
("PG19", [28752, 28683, 28682, 28682, 28682, 28682, 28682, 28632, 20]),
|
668 |
-
("Hackernews", [2064931, 2010802, 2010488, 2010488, 2010488, 2010488, 2010488, 2003636, 20]),
|
669 |
-
("Ubuntu IRC", [37966, 23501, 23468, 23468, 23468, 23468, 23468, 23205, 20]),
|
670 |
-
("Europarl", [69814, 69814,69814,69814,69814,69814,69814,69814, 20]),
|
671 |
-
("StackExchange", [23246548, 23246548, 23246352, 23246352, 23246352, 23246352, 23246352, 23246352, 20]),
|
672 |
-
("Arxiv", [1911867, 1869441, 1763840, 1763840, 1763840, 1763840, 1763840, 1762661, 20]),
|
673 |
-
("S2ORC", [12963563, 12963563, 12963563, 10731113, 9455620, 9306816, 8055147, 8055147, 20]),
|
674 |
-
("S2ORC Abstract", [102324176, 83867601, 82889293, 82889293, 82889293, 82889293, 82889293, 82777912, 20]),
|
675 |
-
("PubMed Central", [5230932, 4830486, 4768310, 4768310, 4768310, 4768310, 4768310, 4767474, 20]),
|
676 |
-
("PubMed Central Abstract", [25787474, 25784374, 25747955, 25747955, 25747955, 25747955, 25747955, 25746724, 20]),
|
677 |
-
("PhilPapers", [49389, 39175, 39175, 39175, 39175, 39175, 39175, 39128, 20]),
|
678 |
-
]
|
679 |
-
|
680 |
-
for name, x_values in data_sources:
|
681 |
-
fig.add_trace(
|
682 |
-
go.Funnel(
|
683 |
-
name=name,
|
684 |
-
orientation="h",
|
685 |
-
y=filter_names,
|
686 |
-
x=x_values,
|
687 |
-
textinfo="value+percent total",
|
688 |
-
textposition="inside",
|
689 |
-
)
|
690 |
-
)
|
691 |
-
|
692 |
-
fig.update_layout(height=500, plot_bgcolor="rgba(0,0,0,0)")
|
693 |
-
return fig
|
694 |
-
|
695 |
-
|
696 |
-
def get_chart_new():
|
697 |
-
fig = go.Figure()
|
698 |
-
filter_names = [
|
699 |
-
"Download",
|
700 |
-
"Language",
|
701 |
-
"Min word count",
|
702 |
-
"Title Abstract",
|
703 |
-
"Majority language",
|
704 |
-
"Paragraph count",
|
705 |
-
"Frequency",
|
706 |
-
"Unigram log probability",
|
707 |
-
"Local dedup",
|
708 |
-
]
|
709 |
-
|
710 |
-
data_sources = [
|
711 |
-
("Wikipedia", [61614907, 0, 1146416, 0, 0, 0, 0, 0, 20]),
|
712 |
-
("Freelaw", [75971288, 2280522, 5518932, 0, 0, 0, 0, 48660, 20]),
|
713 |
-
("DM Maths", [112559888, 0, 0, 0, 0, 0, 0, 0, 20]),
|
714 |
-
("USPTO", [6880276, 1312, 129042, 0, 0, 0, 0, 533, 20]),
|
715 |
-
("PG19", [28752, 69, 1, 0, 0, 0, 0, 50, 20]),
|
716 |
-
("Hackernews", [2064931, 54129, 314, 0, 0, 0, 0, 6852, 20]),
|
717 |
-
("Ubuntu IRC", [37966, 14465, 33, 0, 0, 0, 0, 263, 20]),
|
718 |
-
("Europarl", [69814, 0, 0, 0, 0, 0, 0, 0, 20]),
|
719 |
-
("StackExchange", [23246548, 0, 196, 0, 0, 0, 0, 0, 20]),
|
720 |
-
("Arxiv", [1911867, 42426, 105601, 0, 0, 0, 0, 1179, 20]),
|
721 |
-
("S2ORC", [12963563, 0, 0, 2232450, 1275493, 148804, 1251669, 0, 20]),
|
722 |
-
("S2ORC Abstract", [102324176, 18456575, 978308, 0, 0, 0, 0, 111381, 20]),
|
723 |
-
("PubMed Central", [5230932, 400446, 62176, 0, 0, 0, 0, 836, 20]),
|
724 |
-
("PubMed Central Abstract", [25787474, 3100, 36419, 0, 0, 0, 0, 1231, 20]),
|
725 |
-
("PhilPapers", [49389, 10214, 0, 0, 0, 0, 0, 47, 20]),
|
726 |
-
]
|
727 |
-
|
728 |
-
for name, x_values in data_sources:
|
729 |
-
fig.add_trace(
|
730 |
-
go.Funnel(
|
731 |
-
name=name,
|
732 |
-
orientation="h",
|
733 |
-
y=filter_names,
|
734 |
-
x=x_values,
|
735 |
-
textinfo="value+percent total",
|
736 |
-
textposition="inside",
|
737 |
-
)
|
738 |
-
)
|
739 |
-
|
740 |
-
fig.update_layout(height=500, plot_bgcolor="rgba(0,0,0,0)")
|
741 |
-
return fig
|
742 |
-
|
743 |
def update(target: str, request):
|
744 |
params = request.query_params
|
745 |
if data_source := params.get(f"data_source_{target}"):
|
@@ -749,113 +654,6 @@ def update(target: str, request):
|
|
749 |
return get_data(
|
750 |
params.get(f"data_source_{target}"), doc_id, target)
|
751 |
|
752 |
-
|
753 |
-
# Creating the dataframe from the provided table data
|
754 |
-
data = {
|
755 |
-
'Dataset': ['Wikipedia', 'Freelaw', 'DM Maths', 'USPTO', 'PG19', 'Hackernews', 'Ubuntu IRC', 'Europarl',
|
756 |
-
'StackExchange', 'Arxiv', 'S2ORC', 'S2ORC Abstract', 'Pubmed Central', 'Pubmed Abstract', 'Phil Papers'],
|
757 |
-
'Downloaded Lines': [61614907, 75971288, 112559888, 6880276, 28752, 2064931, 37966, 69814, 23246548, 1911867,
|
758 |
-
12963563, 102324176, 5230932, 25787474, 49389],
|
759 |
-
'Language Filter': [0, 2280522, 0, 1312, 69, 54129, 14465, 0, 0, 42426, 0, 18456575, 400446, 3100, 10214],
|
760 |
-
'Min Word Count': [1146416, 5518932, 0, 129042, 1, 314, 33, 0, 196, 105601, 0, 978308, 62176, 36419, 0],
|
761 |
-
'Unigram log probability': [60468491, 68171834, 112559888, 6749922, 28682, 2010488, 23468, 69814, 23246352,
|
762 |
-
1763840, 12963563, 82889293, 4768310, 25747955, 39175],
|
763 |
-
'Total Lines Remaining': [60468491, 68123174, 112559888, 6749389, 28632, 2003636, 23205, 69814, 23246352,
|
764 |
-
1762661, 12963563, 82777912, 4767474, 25746724, 39128]
|
765 |
-
}
|
766 |
-
|
767 |
-
df = pd.DataFrame(data)
|
768 |
-
|
769 |
-
# Create the stacked bar chart
|
770 |
-
fig = go.Figure()
|
771 |
-
|
772 |
-
# Adding traces for each filter stage
|
773 |
-
fig.add_trace(go.Bar(
|
774 |
-
name='Language Filter',
|
775 |
-
x=df['Dataset'],
|
776 |
-
y=df['Language Filter']
|
777 |
-
))
|
778 |
-
|
779 |
-
fig.add_trace(go.Bar(
|
780 |
-
name='Min Word Count Filter',
|
781 |
-
x=df['Dataset'],
|
782 |
-
y=df['Min Word Count']
|
783 |
-
))
|
784 |
-
|
785 |
-
fig.add_trace(go.Bar(
|
786 |
-
name='Unigram log probability Filter',
|
787 |
-
x=df['Dataset'],
|
788 |
-
y=df['Unigram log probability']
|
789 |
-
))
|
790 |
-
|
791 |
-
fig.add_trace(go.Bar(
|
792 |
-
name='Total Lines Remaining',
|
793 |
-
x=df['Dataset'],
|
794 |
-
y=df['Total Lines Remaining']
|
795 |
-
))
|
796 |
-
|
797 |
-
# Update the layout
|
798 |
-
fig.update_layout(
|
799 |
-
barmode='stack',
|
800 |
-
title='Stacked Bar Chart of Line Reductions by Dataset',
|
801 |
-
xaxis_title='Dataset',
|
802 |
-
yaxis_title='Number of Lines',
|
803 |
-
legend_title='Filters',
|
804 |
-
height=600,
|
805 |
-
width=1000
|
806 |
-
)
|
807 |
-
|
808 |
-
# Show the plot
|
809 |
-
stacked_bar = fig
|
810 |
-
|
811 |
-
# Aggregating the data for filters and datasets
|
812 |
-
filter_data = {
|
813 |
-
'Filter': ['Language Filter', 'Min Word Count', 'Unigram log probability', 'Total Lines Remaining'],
|
814 |
-
'Wikipedia': [0, 1146416, 60468491, 60468491],
|
815 |
-
'Freelaw': [2280522, 5518932, 68171834, 68123174],
|
816 |
-
'DM Maths': [0, 0, 112559888, 112559888],
|
817 |
-
'USPTO': [1312, 129042, 6749922, 6749389],
|
818 |
-
'PG19': [69, 1, 28682, 28632],
|
819 |
-
'Hackernews': [54129, 314, 2010488, 2003636],
|
820 |
-
'Ubuntu IRC': [14465, 33, 23468, 23205],
|
821 |
-
'Europarl': [0, 0, 69814, 69814],
|
822 |
-
'StackExchange': [0, 196, 23246352, 23246352],
|
823 |
-
'Arxiv': [42426, 105601, 1763840, 1762661],
|
824 |
-
'S2ORC': [0, 0, 12963563, 12963563],
|
825 |
-
'S2ORC Abstract': [18456575, 978308, 82889293, 82777912],
|
826 |
-
'Pubmed Central': [400446, 62176, 4768310, 4767474],
|
827 |
-
'Pubmed Abstract': [3100, 36419, 25747955, 25746724],
|
828 |
-
'Phil Papers': [10214, 0, 39175, 39128]
|
829 |
-
}
|
830 |
-
|
831 |
-
# Creating a new dataframe for the filter data
|
832 |
-
filter_df = pd.DataFrame(filter_data)
|
833 |
-
|
834 |
-
# Creating the stacked bar chart
|
835 |
-
fig = go.Figure()
|
836 |
-
|
837 |
-
# Add trace for each dataset
|
838 |
-
for dataset in filter_df.columns[1:]:
|
839 |
-
fig.add_trace(go.Bar(
|
840 |
-
name=dataset,
|
841 |
-
x=filter_df['Filter'],
|
842 |
-
y=filter_df[dataset]
|
843 |
-
))
|
844 |
-
|
845 |
-
# Update the layout
|
846 |
-
fig.update_layout(
|
847 |
-
barmode='stack',
|
848 |
-
title='Stacked Bar Chart of Filters for Each Dataset',
|
849 |
-
xaxis_title='Filter',
|
850 |
-
yaxis_title='Number of Lines',
|
851 |
-
legend_title='Dataset',
|
852 |
-
height=600,
|
853 |
-
width=1000
|
854 |
-
)
|
855 |
-
|
856 |
-
# Show the plot
|
857 |
-
diff_stacked_bar = fig
|
858 |
-
|
859 |
# Data for the stacked bar chart
|
860 |
data = {
|
861 |
'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
|
@@ -1037,10 +835,10 @@ def curated(request):
|
|
1037 |
H2("Curated Sources Defined"),
|
1038 |
table_desc,
|
1039 |
data_preprocessing_div,
|
1040 |
-
|
1041 |
-
|
1042 |
-
|
1043 |
-
|
1044 |
plotly2fasthtml(diff2_stacked_bar),
|
1045 |
H2("Curated Sources Processing"),
|
1046 |
filtering_process,
|
|
|
645 |
)
|
646 |
|
647 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
648 |
def update(target: str, request):
|
649 |
params = request.query_params
|
650 |
if data_source := params.get(f"data_source_{target}"):
|
|
|
654 |
return get_data(
|
655 |
params.get(f"data_source_{target}"), doc_id, target)
|
656 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
657 |
# Data for the stacked bar chart
|
658 |
data = {
|
659 |
'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
|
|
|
835 |
H2("Curated Sources Defined"),
|
836 |
table_desc,
|
837 |
data_preprocessing_div,
|
838 |
+
# plotly2fasthtml(get_chart_28168342()),
|
839 |
+
# plotly2fasthtml(get_chart_new()),
|
840 |
+
# plotly2fasthtml(stacked_bar),
|
841 |
+
# plotly2fasthtml(diff_stacked_bar),
|
842 |
plotly2fasthtml(diff2_stacked_bar),
|
843 |
H2("Curated Sources Processing"),
|
844 |
filtering_process,
|