omkarenator committed on
Commit
34ecf31
·
1 Parent(s): 85da60b

add generic data viewer. separate routes

Browse files
Files changed (6) hide show
  1. common.py +7 -0
  2. curated.py +186 -46
  3. data_viewer.py +83 -0
  4. main.py +100 -280
  5. results.py +7 -0
  6. web.py +7 -0
common.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ from fasthtml.common import *
2
+ from fasthtml.components import *
3
+
4
+
5
def common_steps():
    """Build the placeholder "Common Steps" page fragment.

    Returns a Div wrapping a single Section with id "inner-text" whose only
    content is the page heading.
    """
    heading = H2(P("Common Steps"))
    page_section = Section(heading, id="inner-text")
    return Div(page_section)
7
+
curated.py CHANGED
@@ -1,5 +1,12 @@
1
  from fasthtml.common import *
 
 
 
 
2
  import json
 
 
 
3
 
4
 
5
  data_sources = [
@@ -20,7 +27,7 @@ data_sources = [
20
  ]
21
 
22
 
23
- def get_data(data_source: str = "Freelaw", doc_id: int = 3):
24
  doc_id = max(0, min(int(doc_id), 9))
25
 
26
  if data_source == "Freelaw":
@@ -77,60 +84,193 @@ def get_data(data_source: str = "Freelaw", doc_id: int = 3):
77
 
78
  raw_json = raw_sample_doc[doc_id]
79
  extracted_json = extracted_sample_doc[doc_id]
80
-
81
- drop_down = Select(
82
- *[Option(ds, value=ds, selected=(ds == data_source)) for ds in data_sources],
83
- name="data_source",
84
- hx_get="/curated",
85
- hx_target="#colcontent",
86
- hx_trigger="change",
87
- hx_swap="innerHTML",
88
  )
89
 
90
- slider = Input(
91
- type="range",
92
- name="doc_id",
93
- min="0",
94
- max="9",
95
- value=str(doc_id),
96
- hx_get="/curated",
97
- hx_target="#colcontent",
98
- hx_trigger="change",
99
- hx_swap="innerHTML",
100
- hx_include="[name='data_source']",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  )
102
 
103
- form = Form(
104
- Div(
105
- Label("Data source: ", drop_down),
106
- ),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  Div(
108
- Label("Data sample: ", slider, f"{doc_id}", cls="plotly_slider"),
 
109
  ),
110
- cls="plotly_input_container",
111
  )
112
 
113
- col1 = Div(
114
- H3("Raw format"),
115
- Pre(
116
- json.dumps(raw_json, indent=4),
117
- style="white-space: pre-wrap; word-break: break-all;",
118
- ),
119
- style="width: 48%; float: left; overflow-x: auto;",
120
- )
121
 
122
- col2 = Div(
123
- H3("Extracted format"),
124
- Pre(
125
- json.dumps(extracted_json, indent=4),
126
- style="white-space: pre-wrap; word-break: break-all;",
127
- ),
128
- style="width: 48%; float: right; overflow-x: auto;",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
  )
130
 
131
- data_display = Div(
132
- col1,
133
- col2,
134
- style="overflow: auto; clear: both; height: 600px; border: 1px solid #ccc; padding: 20px;",
 
 
 
 
 
 
 
 
135
  )
136
- return Div(form, data_display, style="margin-top: 10px;", id="colcontent")
 
1
  from fasthtml.common import *
2
+ from fasthtml.components import *
3
+ from plotly import graph_objects as go
4
+ from fh_plotly import plotly2fasthtml
5
+ import pandas as pd
6
  import json
7
+ from data_viewer import view_data, gen_random_id
8
+ from rich import print
9
+ import uuid
10
 
11
 
12
  data_sources = [
 
27
  ]
28
 
29
 
30
+ def get_data(data_source: str = "Freelaw", doc_id: int = 3, target: str = "foo"):
31
  doc_id = max(0, min(int(doc_id), 9))
32
 
33
  if data_source == "Freelaw":
 
84
 
85
  raw_json = raw_sample_doc[doc_id]
86
  extracted_json = extracted_sample_doc[doc_id]
87
+ return view_data(
88
+ raw_json,
89
+ extracted_json,
90
+ doc_id=doc_id,
91
+ data_source=data_source,
92
+ data_sources=data_sources,
93
+ target=target,
 
94
  )
95
 
96
+
97
def get_chart_28168342():
    """Build the funnel figure comparing curated sources across filter stages.

    One horizontal ``go.Funnel`` trace is added per data source, in a fixed
    order, each plotted against the same list of filter-stage names. All
    values are hard-coded in this function.

    Returns:
        The configured ``go.Figure`` (height 500, transparent plot background).
    """
    stage_labels = [
        "Download",
        "Language",
        "Min word count",
        "Title Abstract",
        "Majority language",
        "Paragraph count",
        "Frequency",
        "Unigram log probability",
        "Local dedup",
    ]

    # Series shared by every source that has no specific entry below.
    typical_series = [100, 90, 80, 70, 60, 40, 40, 30, 20]
    specific_series = {
        "Wikipedia": [100, 90, 80, 70, 60, 50, 40, 30, 20],
        "Freelaw": [100, 90, 80, 70, 60, 50, 40, 20, 20],
    }

    # Trace order determines legend order, so keep the sources in sequence.
    source_order = (
        "Wikipedia",
        "Freelaw",
        "DM Maths",
        "USPTO",
        "PG19",
        "Hackernews",
        "Ubuntu IRC",
        "Europarl",
        "StackExchange",
        "Arxiv",
        "S2ORC",
        "S2ORC Abstract",
        "PubMed Central",
        "PubMed Central Abstract",
        "PhilPapers",
    )

    chart = go.Figure()
    for source_name in source_order:
        series = specific_series.get(source_name, typical_series)
        trace = go.Funnel(
            name=source_name,
            orientation="h",
            y=stage_labels,
            x=list(series),  # fresh list per trace, as in the literal table
            textinfo="value+percent total",
            textposition="inside",
        )
        chart.add_trace(trace)

    chart.update_layout(height=500, plot_bgcolor="rgba(0,0,0,0)")
    return chart
143
+
144
+
145
def curated(request):
    """Serve the "Curated Sources" page, or one refreshed data viewer.

    The handler has two modes, selected by the query string:

    * Partial update: when ``target`` is present together with
      ``data_source_<target>`` and/or ``doc_id_<target>``, only the data
      viewer for that target is rebuilt via ``get_data`` and returned.
    * Full page: otherwise the whole section is rendered (funnel chart,
      data-preparation text/table/viewer, and preprocessing text/table).

    Args:
        request: Incoming request; only ``request.query_params`` is read.

    Returns:
        The component tree for either the single viewer or the full page.
    """
    # Partial Updates
    params = dict(request.query_params)
    if target := params.get("target"):
        data_source = params.get(f"data_source_{target}")
        doc_id = params.get(f"doc_id_{target}")
        if data_source or doc_id:
            # Forward only the values that were actually submitted. The
            # previous doc_id-only branch passed the (necessarily missing)
            # data source through as an explicit None, overriding get_data's
            # own default; omitting it lets that default apply instead.
            viewer_args = {"target": target}
            if data_source:
                viewer_args["data_source"] = data_source
            if doc_id:
                viewer_args["doc_id"] = doc_id
            return get_data(**viewer_args)

    data_preparation_steps = pd.DataFrame(
        {
            "Method": [
                "HTTP/FTP dumps",
                "Web crawling",
                "Archive snapshot",
                "Generated",
                "Curated",
            ],
            "Description": [
                "Acquiring data from HTTP/FTP dumps",
                "Crawling websites to extract data",
                "Working with archive dumps",
                "Generating synthetic data",
                "High quality curated data",
            ],
            "Source": [
                "Freelaw | Wikipedia | PhilPapers | Arxiv | S2ORC | Pubmeds",
                "USPTO | Hackernews | Ubuntu IRC",
                "StackExchange",
                "DM Maths",
                "PG19 | Europarl",
            ],
        }
    )

    # The DataFrame is rendered to raw HTML, so wrap it in NotStr to keep it
    # from being escaped as text.
    table_html = data_preparation_steps.to_html(index=False, border=0)
    table_div = Div(NotStr(table_html), style="margin: 40px;")

    text = P("""This initial stage serves as the foundation for the entire
    process. Here, we focus on acquiring and extracting the raw data, which can
    come from various sources such as crawling websites, using HTTP/FTP dumps,
    or working with archive dumps. For instance, to download and prepare a
    dataset, we can specific downloaders based on the data source. Each dataset
    might have its own downloader script which can be updated in real time to
    handle changes in the data source. Here is a general outline of the data
    preparation process: It's worth noting that some pipelines might require
    invoking additional functions or scripts to handle specific data sources or
    formats. These helper scripts can be located within specific directories
    or modules dedicated to the dataset.""")

    data_preparation_div = Div(
        H3("Data Preparation"),
        text,
        table_div,
        Div(
            # A fresh random target id is generated for the embedded viewer
            # on every full-page render.
            get_data(target=gen_random_id()),
            style="border: 1px solid #ccc; padding: 20px;",
        ),
    )

    text = P("""Data preprocessing is a crucial step in the data science
    pipeline. It involves cleaning and transforming raw data into a format that
    is suitable for analysis. This process includes handling missing values,
    normalizing data, encoding categorical variables, and more.""")

    preprocessing_steps = pd.DataFrame(
        {
            "Step": [
                "Language Filter",
                "Min Word Count",
                "Title Abstract",
                "Majority Language",
                "Paragraph Count",
                "Frequency",
                "Unigram Log Probability",
            ],
            "Description": [
                "Filtering data based on language",
                "Setting a minimum word count threshold",
                "Extracting information from the title and abstract",
                "Identifying the majority language in the dataset",
                "Counting the number of paragraphs in each document",
                "Calculating the frequency of each word in the dataset",
                "Calculating the log probability of each unigram",
            ],
            "Need": [
                "To remove documents in unwanted languages",
                "To filter out documents with very few words",
                "To extract relevant information for analysis",
                "To understand the distribution of languages in the dataset",
                "To analyze the structure and length of documents",
                "To identify important words in the dataset",
                "To measure the significance of individual words",
            ],
            "Pros": [
                "Improves data quality by removing irrelevant documents",
                "Filters out low-quality or incomplete documents",
                "Provides additional information for analysis",
                "Enables language-specific analysis and insights",
                "Helps understand the complexity and content of documents",
                "Identifies important terms and topics in the dataset",
                "Quantifies the importance of individual words",
            ],
            "Cons": [
                "May exclude documents in less common languages",
                "May remove documents with valuable information",
                "May introduce bias in the analysis",
                "May not accurately represent the language distribution",
                "May not capture the complexity of document structure",
                "May be sensitive to noise and outliers",
                "May not capture the semantic meaning of words",
            ],
        }
    )

    table_html = preprocessing_steps.to_html(index=False, border=0)
    table_div = Div(NotStr(table_html), style="margin: 40px;")
    data_preprocessing_div = Div(H3("Data Preprocessing"), text, table_div)

    return Div(
        Section(
            H2("Curated Sources"),
            plotly2fasthtml(get_chart_28168342()),
            data_preparation_div,
            data_preprocessing_div,
            id="inner-text",
        )
    )
 
data_viewer.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fasthtml.common import *
2
+ from fasthtml.components import *
3
+ import json
4
+ import string
5
+ import random
6
+
7
+
8
def gen_random_id() -> str:
    """Return a random identifier of eight lowercase ASCII letters.

    Drawn with the ``random`` module, so it is not cryptographically secure
    and uniqueness is not guaranteed.
    """
    alphabet = string.ascii_lowercase
    picked = random.choices(alphabet, k=8)
    return "".join(picked)
10
+
11
+
12
def view_data(
    before,
    after,
    doc_id,
    data_source: str,
    data_sources=None,
    target: str = "colcontent",
):
    """Render a side-by-side JSON viewer with HTMX-driven controls.

    Args:
        before: JSON-serialisable object shown under "Raw format".
        after: JSON-serialisable object shown under "Extracted format".
        doc_id: Currently selected sample index; used as the slider value and
            echoed next to the slider.
        data_source: Name of the currently selected source (pre-selected in
            the dropdown).
        data_sources: Iterable of selectable source names. When ``None`` the
            dropdown is omitted and only the slider is rendered.
        target: DOM id given to the returned container. It also namespaces
            the form field names (``data_source_<target>``,
            ``doc_id_<target>``) and is sent back as the ``target`` value.

    Returns:
        A ``Div`` with id ``target`` containing the control form and the two
        JSON columns.
    """
    # HTMX wiring shared by both controls. The response produced for these
    # requests is itself a Div carrying id=target, so the target element must
    # be replaced as a whole ("outerHTML"); swapping only its innerHTML nested
    # a second element with the same id inside the first on every refresh.
    htmx_attrs = dict(
        hx_get="/curated",
        hx_target=f"#{target}",
        hx_trigger="change",
        hx_swap="outerHTML",
        hx_vals=json.dumps({"target": f"{target}"}),
    )

    source_row = None
    if data_sources is not None:
        drop_down = Select(
            *[
                Option(ds, value=ds, selected=(ds == data_source))
                for ds in data_sources
            ],
            name=f"data_source_{target}",
            **htmx_attrs,
        )
        source_row = Div(
            Label("Data source: ", drop_down),
        )

    slider = Input(
        type="range",
        name=f"doc_id_{target}",
        min="0",
        max="9",
        value=str(doc_id),
        # Send the dropdown's current value along with the slider value.
        hx_include=f'[name="data_source_{target}"]',
        **htmx_attrs,
    )

    form = Form(
        source_row,
        Div(
            Label("Data sample: ", slider, f"{doc_id}", cls="plotly_slider"),
        ),
        cls="plotly_input_container",
    )

    def _json_column(title, payload, side):
        # One floated column holding a pretty-printed JSON dump.
        return Div(
            H3(title),
            Pre(
                json.dumps(payload, indent=4),
                style="white-space: pre-wrap; word-break: break-all;",
            ),
            style=f"width: 48%; float: {side}; overflow-x: auto;",
        )

    data_display = Div(
        _json_column("Raw format", before, "left"),
        _json_column("Extracted format", after, "right"),
        style="overflow: auto; clear: both; height: 600px; border: 1px solid #ccc; padding: 20px;",
    )
    return Div(form, data_display, style="margin-top: 10px;", id=target)
main.py CHANGED
@@ -1,115 +1,120 @@
1
  from fasthtml.common import *
2
  from fasthtml.components import *
3
  from fasthtml.components import D_title, D_article, D_front_matter, D_contents, D_byline
4
- from fasthtml.components import HR
5
  from plotly import graph_objects as go
6
  from fh_plotly import plotly2fasthtml
7
  import pandas as pd
8
  import json
9
  from rich import print
 
 
 
 
10
 
11
 
12
- app, rt = fast_app(debug=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
 
15
  @app.get("/")
16
  def main():
17
- return Html(
18
- Head(
19
- Meta(charset="UTF-8"),
20
- Meta(name="viewport", content="width=device-width, initial-scale=1.0"),
21
- Script(src="https://distill.pub/template.v2.js"),
22
- Script(src="https://unpkg.com/htmx.org@next/dist/htmx.min.js"),
23
- Script(src="https://cdn.plot.ly/plotly-latest.min.js"),
24
- Link(rel="stylesheet", href="style.css"),
25
- ),
26
- Body(
27
- D_title(
28
- H1(
29
- "TxT360: fully open and transparent fusion of web and curated corpora for pre-training large language models",
30
- cls="l-body",
31
- style="text-align: center;",
32
- ),
33
- Div(
34
- Img(src="images/llm360_logo.png"),
35
- id="title-plot",
36
- cls="main-plot-container l-page",
37
- ),
38
  ),
39
- D_article(
40
- D_contents(
41
- Nav(
42
- H3("Table of Contents"),
43
- Div(
44
- A("TxT360", href="#_self"),
45
- hx_get="/intro",
46
- hx_target="#inner-text",
47
- ),
48
- Div(
49
- Ul(
50
- Li(
51
- A(
52
- "Introduction",
53
- href="/intro#section1",
54
- hx_get="/intro#section1",
55
- hx_target="#inner-text",
56
- )
57
- ),
58
- Li(
59
- A(
60
- "Background",
61
- href="/intro#section2",
62
- hx_get="/intro#section2",
63
- hx_target="#inner-text",
64
- )
65
- ),
66
- Li(
67
- A(
68
- "Main Content",
69
- href="/intro#section3",
70
- hx_get="/intro#section3",
71
- hx_target="#inner-text",
72
- )
73
- ),
74
- Li(
75
- A(
76
- "Conclusion",
77
- href="/intro#section4",
78
- hx_get="/intro#section4",
79
- hx_target="#inner-text",
80
- )
81
- ),
 
 
 
 
 
82
  ),
83
  ),
84
- Div(
85
- A("Web Data", href="#inner-text"),
86
- hx_get="/webdata",
87
- hx_target="#inner-text",
88
- ),
89
- Div(
90
- A("Curated Sources", href="#inner-text"),
91
- hx_get="/curated",
92
- hx_target="#inner-text",
93
- ),
94
- Div(
95
- A("Common Steps", href="#inner-text"),
96
- hx_get="/common",
97
- hx_target="#inner-text",
98
- ),
99
- Div(
100
- A("TxT360 Results", href="#inner-text"),
101
- hx_get="/results",
102
- hx_target="#inner-text",
103
- ),
104
- role="navigation",
105
- cls="l-text figcaption",
106
  ),
107
- prerendered="true",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  ),
109
- intro(),
110
  ),
 
111
  ),
112
- lang="en",
113
  )
114
 
115
 
@@ -254,197 +259,12 @@ def intro():
254
  )
255
 
256
 
257
- @app.get("/webdata")
258
- def web_data():
259
- return Div(Section(H2(P("Web Data")), id="inner-text"))
260
-
261
-
262
- def get_chart_28168342():
263
- fig = go.Figure()
264
- filter_names = [
265
- "Download",
266
- "Language",
267
- "Min word count",
268
- "Title Abstract",
269
- "Majority language",
270
- "Paragraph count",
271
- "Frequency",
272
- "Unigram log probability",
273
- "Local dedup",
274
- ]
275
-
276
- data_sources = [
277
- ("Wikipedia", [100, 90, 80, 70, 60, 50, 40, 30, 20]),
278
- ("Freelaw", [100, 90, 80, 70, 60, 50, 40, 20, 20]),
279
- ("DM Maths", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
280
- ("USPTO", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
281
- ("PG19", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
282
- ("Hackernews", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
283
- ("Ubuntu IRC", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
284
- ("Europarl", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
285
- ("StackExchange", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
286
- ("Arxiv", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
287
- ("S2ORC", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
288
- ("S2ORC Abstract", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
289
- ("PubMed Central", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
290
- ("PubMed Central Abstract", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
291
- ("PhilPapers", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
292
- ]
293
-
294
- for name, x_values in data_sources:
295
- fig.add_trace(
296
- go.Funnel(
297
- name=name,
298
- orientation="h",
299
- y=filter_names,
300
- x=x_values,
301
- textinfo="value+percent total",
302
- textposition="inside",
303
- )
304
- )
305
-
306
- fig.update_layout(height=500, plot_bgcolor="rgba(0,0,0,0)")
307
- return fig
308
-
309
-
310
- @app.get("/curated")
311
- def curated(request):
312
- from curated import get_data
313
-
314
- # Partial Updates
315
- params = request.query_params
316
- if data_source := params.get("data_source"):
317
- return get_data(data_source, params.get("doc_id", 3))
318
- if doc_id := params.get("doc_id"):
319
- return get_data(params.get("data_source"), doc_id)
320
-
321
- hr = HR()
322
- data_preparation_steps = pd.DataFrame(
323
- {
324
- "Method": [
325
- "HTTP/FTP dumps",
326
- "Web crawling",
327
- "Archive snapshot",
328
- "Generated",
329
- "Curated",
330
- ],
331
- "Description": [
332
- "Acquiring data from HTTP/FTP dumps",
333
- "Crawling websites to extract data",
334
- "Working with archive dumps",
335
- "Generating synthetic data",
336
- "High quality curated data",
337
- ],
338
- "Source": [
339
- "Freelaw | Wikipedia | PhilPapers | Arxiv | S2ORC | Pubmeds",
340
- "USPTO | Hackernews | Ubuntu IRC",
341
- "StackExchange",
342
- "DM Maths",
343
- "PG19 | Europarl",
344
- ],
345
- }
346
- )
347
-
348
- table_html = data_preparation_steps.to_html(index=False, border=0)
349
- table_div = Div(NotStr(table_html), style="margin: 40px;")
350
-
351
- text = P("""This initial stage serves as the foundation for the entire
352
- process. Here, we focus on acquiring and extracting the raw data, which can
353
- come from various sources such as crawling websites, using HTTP/FTP dumps,
354
- or working with archive dumps. For instance, to download and prepare a
355
- dataset, we can specific downloaders based on the data source. Each dataset
356
- might have its own downloader script which can be updated in real time to
357
- handle changes in the data source. Here is a general outline of the data
358
- preparation process: It's worth noting that some pipelines might require
359
- invoking additional functions or scripts to handle specific data sources or
360
- formats. These helper scripts can be located within specific directories
361
- or modules dedicated to the dataset.""")
362
-
363
- data_preparation_div = Div(
364
- H3("Data Preparation"),
365
- text,
366
- table_div,
367
- Div(get_data(), style="border: 1px solid #ccc; padding: 20px;"),
368
- )
369
-
370
- text = P("""Data preprocessing is a crucial step in the data science
371
- pipeline. It involves cleaning and transforming raw data into a format that
372
- is suitable for analysis. This process includes handling missing values,
373
- normalizing data, encoding categorical variables, and more.""")
374
-
375
- preprocessing_steps = pd.DataFrame(
376
- {
377
- "Step": [
378
- "Language Filter",
379
- "Min Word Count",
380
- "Title Abstract",
381
- "Majority Language",
382
- "Paragraph Count",
383
- "Frequency",
384
- "Unigram Log Probability",
385
- ],
386
- "Description": [
387
- "Filtering data based on language",
388
- "Setting a minimum word count threshold",
389
- "Extracting information from the title and abstract",
390
- "Identifying the majority language in the dataset",
391
- "Counting the number of paragraphs in each document",
392
- "Calculating the frequency of each word in the dataset",
393
- "Calculating the log probability of each unigram",
394
- ],
395
- "Need": [
396
- "To remove documents in unwanted languages",
397
- "To filter out documents with very few words",
398
- "To extract relevant information for analysis",
399
- "To understand the distribution of languages in the dataset",
400
- "To analyze the structure and length of documents",
401
- "To identify important words in the dataset",
402
- "To measure the significance of individual words",
403
- ],
404
- "Pros": [
405
- "Improves data quality by removing irrelevant documents",
406
- "Filters out low-quality or incomplete documents",
407
- "Provides additional information for analysis",
408
- "Enables language-specific analysis and insights",
409
- "Helps understand the complexity and content of documents",
410
- "Identifies important terms and topics in the dataset",
411
- "Quantifies the importance of individual words",
412
- ],
413
- "Cons": [
414
- "May exclude documents in less common languages",
415
- "May remove documents with valuable information",
416
- "May introduce bias in the analysis",
417
- "May not accurately represent the language distribution",
418
- "May not capture the complexity of document structure",
419
- "May be sensitive to noise and outliers",
420
- "May not capture the semantic meaning of words",
421
- ],
422
- }
423
- )
424
-
425
- table_html = preprocessing_steps.to_html(index=False, border=0)
426
- table_div = Div(NotStr(table_html), style="margin: 40px;")
427
- data_preprocessing_div = Div(H3("Data Preprocessing"), text, table_div)
428
-
429
- return Div(
430
- Section(
431
- H2("Curated Sources"),
432
- plotly2fasthtml(get_chart_28168342()),
433
- data_preparation_div,
434
- data_preprocessing_div,
435
- id="inner-text",
436
- )
437
- )
438
-
439
-
440
- @app.get("/common")
441
- def common_steps():
442
- return Div(Section(H2(P("Common Steps")), id="inner-text"))
443
 
 
444
 
445
- @app.get("/results")
446
- def results():
447
- return Div(Section(H2(P("Results")), id="inner-text"))
448
 
 
449
 
450
  serve()
 
1
  from fasthtml.common import *
2
  from fasthtml.components import *
3
  from fasthtml.components import D_title, D_article, D_front_matter, D_contents, D_byline
 
4
  from plotly import graph_objects as go
5
  from fh_plotly import plotly2fasthtml
6
  import pandas as pd
7
  import json
8
  from rich import print
9
+ import curated
10
+ import web
11
+ import common
12
+ import results
13
 
14
 
15
# Create the application and its route decorator. The head elements that the
# index handler previously emitted itself are now supplied once through
# `hdrs`.
app, rt = fast_app(
    debug=True,
    # NOTE(review): presumably turns off FastHTML's bundled Pico CSS so that
    # style.css / the distill template control styling — confirm.
    pico=False,
    hdrs=(
        Meta(charset="UTF-8"),
        Meta(name="viewport", content="width=device-width, initial-scale=1.0"),
        Script(src="https://distill.pub/template.v2.js"),
        # NOTE(review): "@next" and "plotly-latest" are unpinned CDN
        # references; consider pinning explicit versions.
        Script(src="https://unpkg.com/htmx.org@next/dist/htmx.min.js"),
        Script(src="https://cdn.plot.ly/plotly-latest.min.js"),
        Link(rel="stylesheet", href="style.css"),
        MarkdownJS(),
        HighlightJS(langs=["python", "javascript", "html", "css"]),
    ),
)
29
 
30
 
31
@app.get("/")
def main():
    """Render the landing page: front matter, title banner, and article.

    The article holds the table-of-contents navigation followed by the intro
    content. Every navigation entry loads its page into the "#inner-text"
    element via HTMX.
    """
    content_target = "#inner-text"

    title_banner = D_title(
        H1(
            "TxT360: fully open and transparent fusion of web and curated corpora for pre-training large language models",
            cls="l-body",
            style="text-align: center;",
        ),
        Div(
            Img(src="images/llm360_logo.png"),
            id="title-plot",
            cls="main-plot-container l-page",
        ),
    )

    # (label, anchor) pairs for the sub-sections of the intro page.
    intro_sections = (
        ("Introduction", "section1"),
        ("Background", "section2"),
        ("Main Content", "section3"),
        ("Conclusion", "section4"),
    )
    intro_items = [
        Li(
            A(
                label,
                href=f"/intro#{anchor}",
                hx_get=f"/intro#{anchor}",
                hx_target=content_target,
            )
        )
        for label, anchor in intro_sections
    ]

    # (label, route) pairs for the top-level pages.
    pages = (
        ("Web Data", "/webdata"),
        ("Curated Sources", "/curated"),
        ("Common Steps", "/common"),
        ("TxT360 Results", "/results"),
    )
    page_entries = [
        Div(
            A(label, href=content_target),
            hx_get=route,
            hx_target=content_target,
        )
        for label, route in pages
    ]

    table_of_contents = Nav(
        H3("Table of Contents"),
        Div(
            A("TxT360", href="#_self"),
            hx_get="/intro",
            hx_target=content_target,
        ),
        Div(
            Ul(*intro_items),
        ),
        *page_entries,
        role="navigation",
        cls="l-text figcaption",
    )

    article = D_article(
        D_contents(table_of_contents),
        intro(),
    )
    return Div(D_front_matter(), title_banner, article)
119
 
120
 
 
259
  )
260
 
261
 
262
# Register the page handlers that now live in their own modules. `rt(path)`
# is applied to each imported function directly instead of decorating a
# definition in this file.
rt("/curated")(curated.curated)

rt("/webdata")(web.web_data)

rt("/common")(common.common_steps)

rt("/results")(results.results)

# Start the app via FastHTML's serve().
serve()
results.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ from fasthtml.common import *
2
+ from fasthtml.components import *
3
+
4
+
5
def results():
    """Build the placeholder "Results" page fragment.

    Returns a Div wrapping a single Section with id "inner-text" whose only
    content is the page heading.
    """
    heading = H2(P("Results"))
    page_section = Section(heading, id="inner-text")
    return Div(page_section)
7
+
web.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ from fasthtml.common import *
2
+ from fasthtml.components import *
3
+
4
+
5
def web_data():
    """Build the placeholder "Web Data" page fragment.

    Returns a Div wrapping a single Section with id "inner-text" whose only
    content is the page heading.
    """
    heading = H2(P("Web Data"))
    page_section = Section(heading, id="inner-text")
    return Div(page_section)
7
+