victormiller committed on
Commit
dc5ac06
·
verified ·
1 Parent(s): d95e4d8

Update common.py

Browse files
Files changed (1) hide show
  1. common.py +48 -1
common.py CHANGED
@@ -29,6 +29,53 @@ fig = px.bar(
29
 
30
  dup_cluster_graph = fig.update_layout(showlegend=False)
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  dup_docs_count = {
33
  "80": 382164413,
34
  "90": 660766607,
@@ -224,7 +271,7 @@ global_div = Div(
224
  H3("Finding Duplicate Pairs"),
225
  P("Multiple bands can create the same document pairs, leading to duplicates. The simplest way to eliminate these duplicate pairs is to call distinct() before the compute(). However, we found that Dask is not very efficient when it comes to distributed distinct execution. Additionally, since we process each band separately, this approach wouldn’t remove duplicates across different bands."),
226
  P("To address this, we use a Bloom filter with a capacity of 64 billion and a false positive rate of 0.001 to remove duplicates. One way we parallelize the Bloom filter execution is by partitioning pairs horizontally and running one filter per partition, as shown in the table below. There is a high chance that duplicates from different bands will have the same pairs in the same horizontal partition. This step reduces the number of pairs by nearly ninefold."),
227
- P("THIS IS A PLACEHOLDER FOR A GRAPH"),
228
  P("The resulting unique pairs are then used to identify clusters of near-duplicates by finding connected components in a graph, where the vertices represent documents and the edges represent matches. This step produced 1.9 TB of unique pairs."),
229
  ),
230
  Section(
 
29
 
30
  dup_cluster_graph = fig.update_layout(showlegend=False)
31
 
32
+
33
+ bloom_filter_table_info = pd.DataFrame(
34
+ {
35
+ "Bloom Filter": [
36
+ "BF 0",
37
+ "BF 8 ",
38
+ ],
39
+ "Band 0": [
40
+ """
41
+ (A,B)
42
+ (C,D)
43
+ (E,K)
44
+ """,
45
+ "(B,K)",
46
+ ],
47
+ "Band 1": [
48
+ """
49
+ (A,B)
50
+ (C,D)
51
+ (F,K)
52
+ """,
53
+ "(B,K)",
54
+ ],
55
+ "....": [
56
+ "...",
57
+ "...",
58
+ ],
59
+ "Band 8": [
60
+ """
61
+ (A,B)
62
+ (C,D)
63
+ (D,E)
64
+ """,
65
+ """
66
+ (E,K)
67
+ (B,K)
68
+ """,
69
+ ],
70
+
71
+ }
72
+ )
73
+
74
+ table_html_bloom_filter = bloom_filter_table_info.to_html(index=False, border=0)
75
+ table_div_bloom_examples = Div(NotStr(table_html_bloom_filter), style="margin: 40px;")
76
+
77
+
78
+
79
  dup_docs_count = {
80
  "80": 382164413,
81
  "90": 660766607,
 
271
  H3("Finding Duplicate Pairs"),
272
  P("Multiple bands can create the same document pairs, leading to duplicates. The simplest way to eliminate these duplicate pairs is to call distinct() before the compute(). However, we found that Dask is not very efficient when it comes to distributed distinct execution. Additionally, since we process each band separately, this approach wouldn’t remove duplicates across different bands."),
273
  P("To address this, we use a Bloom filter with a capacity of 64 billion and a false positive rate of 0.001 to remove duplicates. One way we parallelize the Bloom filter execution is by partitioning pairs horizontally and running one filter per partition, as shown in the table below. There is a high chance that duplicates from different bands will have the same pairs in the same horizontal partition. This step reduces the number of pairs by nearly ninefold."),
274
+ table_div_bloom_examples,
275
  P("The resulting unique pairs are then used to identify clusters of near-duplicates by finding connected components in a graph, where the vertices represent documents and the edges represent matches. This step produced 1.9 TB of unique pairs."),
276
  ),
277
  Section(