fil committed on
Commit
40ea232
·
1 Parent(s): f87af5d

use a *much faster* SQL query

Browse files
Files changed (1) hide show
  1. docs/gazette.md +42 -27
docs/gazette.md CHANGED
@@ -7,7 +7,7 @@ const db = DuckDBClient.of({ presse: FileAttachment("data/presse.parquet") });
7
 
8
  This page allows you to explore the 3 million newspapers by title. I called it “Gazette” because I was surprised that most of the corpus in the earlier years had a title containing this word.
9
 
10
- Type in words such as “jeune”, “révolution”, “république”, “soir”, “fille”, “femme”, “paysan”, “ouvrier”, “social”, etc., to see different historical trends.
11
 
12
  ```js
13
  const search = view(
@@ -15,41 +15,56 @@ const search = view(
15
  );
16
  ```
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  ```js echo
19
- display(
20
- Plot.plot({
21
- x: { nice: true },
22
- y: {
23
- label: `Share of titles matching ${search}`,
24
- tickFormat: "%",
25
- },
26
- marks: [
27
- Plot.areaY(gazette, {
28
- x: "year",
29
- y: (d) => d.matches / d.total,
30
- fillOpacity: 0.2,
31
- }),
32
- Plot.lineY(gazette, {
33
- x: "year",
34
- y: (d) => d.matches / d.total,
35
- }),
36
- ],
37
- })
38
  );
39
  ```
40
 
41
- The query uses the [REGEXP_MATCHES](https://duckdb.org/docs/archive/0.9.2/sql/functions/patternmatching) operator to count occurrences; you can query for example “socialis[tm]e” to match both “socialiste” and “socialisme”. The 'i' flag makes it ignore case.
 
 
 
42
 
43
- ```js echo
44
- const gazette = db.query(
 
45
  `SELECT year
46
- , SUM(CASE WHEN REGEXP_MATCHES(title, ?, 'i') THEN 1 ELSE 0 END)::int matches
47
- , COUNT(*) total
48
  FROM presse
49
  WHERE year > '1000'
50
  GROUP BY year
51
  ORDER BY year
52
- `,
53
- [search]
54
  );
55
  ```
 
7
 
8
  This page allows you to explore the 3 million newspapers by title. I called it “Gazette” because I was surprised that most of the corpus in the earlier years had a title containing this word.
9
 
10
+ Type in words such as “jeune”, “révolution”, “république”, “matin”, “soir”, “humanité”, “nouvelle”, “moderne”, “femme”, “paysan”, “ouvrier”, “social”, “résistance”, etc., to see different historical trends.
11
 
12
  ```js
13
  const search = view(
 
15
  );
16
  ```
17
 
18
+ ${
19
+ Plot.plot({
20
+ x: { nice: true },
21
+ y: {
22
+ label: `Share of titles matching ${search}`,
23
+ tickFormat: "%",
24
+ },
25
+ marks: [
26
+ Plot.areaY(base, {
27
+ x: "year",
28
+ y: ({year, total}) => gazette.get(year) / total,
29
+ fillOpacity: 0.2,
30
+ curve: "step"
31
+ }),
32
+ Plot.lineY(base, {
33
+ x: "year",
34
+ y: ({year, total}) => gazette.get(year) / total,
35
+ curve: "step"
36
+ }),
37
+ ],
38
+ })
39
+ }
40
+
41
+ The query uses the [REGEXP_MATCHES](https://duckdb.org/docs/archive/0.9.2/sql/functions/patternmatching) function with the 'i' (case-insensitive) option to count occurrences; for example, you can query “socialis[tm]e” to match both “socialiste” and “socialisme”.
42
+
43
  ```js echo
44
+ const results = db.query(
45
+ `SELECT year, COUNT() c
46
+ FROM presse
47
+ WHERE REGEXP_MATCHES(title, ?, 'i')
48
+ GROUP BY year
49
+ `,
50
+ [search]
 
 
 
 
 
 
 
 
 
 
 
 
51
  );
52
  ```
53
 
54
+ ```js
55
+ // A Map for fast lookup (more precisely, an InternMap keyed by Date)
56
+ const gazette = new d3.InternMap(results.map(({ year, c }) => [year, c]));
57
+ ```
58
 
59
+ ```js
60
+ // The base denominator (count by year)
61
+ const base = db.query(
62
  `SELECT year
63
+ , COUNT(*)::int total
 
64
  FROM presse
65
  WHERE year > '1000'
66
  GROUP BY year
67
  ORDER BY year
68
+ `
 
69
  );
70
  ```