Spaces:
Running
Running
use a *much faster* SQL query
Browse files- docs/gazette.md +42 -27
docs/gazette.md
CHANGED
@@ -7,7 +7,7 @@ const db = DuckDBClient.of({ presse: FileAttachment("data/presse.parquet") });
|
|
7 |
|
8 |
This page allows you to explore the 3 million newspapers by title. I called it “Gazette” because I was surprised that most of the corpus in the earlier years had a title containing this word.
|
9 |
|
10 |
-
Type in words such as “jeune”, “révolution”, “république”, “soir”, “
|
11 |
|
12 |
```js
|
13 |
const search = view(
|
@@ -15,41 +15,56 @@ const search = view(
|
|
15 |
);
|
16 |
```
|
17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
```js echo
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
marks: [
|
27 |
-
Plot.areaY(gazette, {
|
28 |
-
x: "year",
|
29 |
-
y: (d) => d.matches / d.total,
|
30 |
-
fillOpacity: 0.2,
|
31 |
-
}),
|
32 |
-
Plot.lineY(gazette, {
|
33 |
-
x: "year",
|
34 |
-
y: (d) => d.matches / d.total,
|
35 |
-
}),
|
36 |
-
],
|
37 |
-
})
|
38 |
);
|
39 |
```
|
40 |
|
41 |
-
|
|
|
|
|
|
|
42 |
|
43 |
-
```js
|
44 |
-
|
|
|
45 |
`SELECT year
|
46 |
-
,
|
47 |
-
, COUNT(*) total
|
48 |
FROM presse
|
49 |
WHERE year > '1000'
|
50 |
GROUP BY year
|
51 |
ORDER BY year
|
52 |
-
|
53 |
-
[search]
|
54 |
);
|
55 |
```
|
|
|
7 |
|
8 |
This page allows you to explore the 3 million newspapers by title. I called it “Gazette” because I was surprised that most of the corpus in the earlier years had a title containing this word.
|
9 |
|
10 |
+
Type in words such as “jeune”, “révolution”, “république”, “matin”, “soir”, “humanité”, “nouvelle”, “moderne”, “femme”, “paysan”, “ouvrier”, “social”, or “résistance” to see different historical trends.
|
11 |
|
12 |
```js
|
13 |
const search = view(
|
|
|
15 |
);
|
16 |
```
|
17 |
|
18 |
+
${
|
19 |
+
Plot.plot({
|
20 |
+
x: { nice: true },
|
21 |
+
y: {
|
22 |
+
label: `Share of titles matching ${search}`,
|
23 |
+
tickFormat: "%",
|
24 |
+
},
|
25 |
+
marks: [
|
26 |
+
Plot.areaY(base, {
|
27 |
+
x: "year",
|
28 |
+
y: ({year, total}) => gazette.get(year) / total,
|
29 |
+
fillOpacity: 0.2,
|
30 |
+
curve: "step"
|
31 |
+
}),
|
32 |
+
Plot.lineY(base, {
|
33 |
+
x: "year",
|
34 |
+
y: ({year, total}) => gazette.get(year) / total,
|
35 |
+
curve: "step"
|
36 |
+
}),
|
37 |
+
],
|
38 |
+
})
|
39 |
+
}
|
40 |
+
|
41 |
+
The query uses the [REGEXP_MATCHES](https://duckdb.org/docs/archive/0.9.2/sql/functions/patternmatching) function with the case-insensitive option to count the matching titles for each year; for example, you can search for “socialis[tm]e” to match both “socialiste” and “socialisme”.
|
42 |
+
|
43 |
```js echo
|
44 |
+
const results = db.query(
|
45 |
+
`SELECT year, COUNT() c
|
46 |
+
FROM presse
|
47 |
+
WHERE REGEXP_MATCHES(title, ?, 'i')
|
48 |
+
GROUP BY year
|
49 |
+
`,
|
50 |
+
[search]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
);
|
52 |
```
|
53 |
|
54 |
+
```js
|
55 |
+
// A Map for fast lookup of the match count by year; more precisely a d3.InternMap, which compares keys by value so that it can be indexed by Date
|
56 |
+
const gazette = new d3.InternMap(results.map(({ year, c }) => [year, c]));
|
57 |
+
```
|
58 |
|
59 |
+
```js
|
60 |
+
// The base denominator (count by year)
|
61 |
+
const base = db.query(
|
62 |
`SELECT year
|
63 |
+
, COUNT(*)::int total
|
|
|
64 |
FROM presse
|
65 |
WHERE year > '1000'
|
66 |
GROUP BY year
|
67 |
ORDER BY year
|
68 |
+
`
|
|
|
69 |
);
|
70 |
```
|