ToluClassics committed on
Commit
4350355
•
1 Parent(s): 161e138

Upload with huggingface_hub

Browse files
.gitattributes CHANGED
@@ -1,34 +1 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tflite filter=lfs diff=lfs merge=lfs -text
29
- *.tgz filter=lfs diff=lfs merge=lfs -text
30
- *.wasm filter=lfs diff=lfs merge=lfs -text
31
- *.xz filter=lfs diff=lfs merge=lfs -text
32
- *.zip filter=lfs diff=lfs merge=lfs -text
33
- *.zst filter=lfs diff=lfs merge=lfs -text
34
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ index/**/* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -1,12 +1,13 @@
1
  ---
2
- title: Xsum Test
3
- emoji: 🚀
4
- colorFrom: yellow
5
- colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 3.18.0
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: XSum Train Dataset Search
3
+ emoji: 🐠
4
+ colorFrom: blue
5
+ colorTo: blue
6
  sdk: gradio
7
+ sdk_version: 3.12.0
8
  app_file: app.py
9
  pinned: false
10
+ license: apache-2.0
11
  ---
12
 
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import http.client as http_client
2
+ import json
3
+ import logging
4
+ import os
5
+ import re
6
+ import time
7
+ import string
8
+ import traceback
9
+
10
+ import gradio as gr
11
+ from typing import Callable, Optional, Tuple, Union, Dict, Any
12
+ from pyserini import util
13
+ from pyserini.search import LuceneSearcher, FaissSearcher, AutoQueryEncoder
14
+ from pyserini.index.lucene import IndexReader
15
+
16
+
17
+ Searcher = Union[FaissSearcher, LuceneSearcher]
18
+
19
def _load_sparse_searcher(
    language: str,
    k1: Optional[float] = None,
    b: Optional[float] = None,
    index_path: str = 'index/',
) -> Searcher:
    """Open the Lucene index and configure it for ``language``.

    Parameters
    ----------
    language : str
        Value forwarded to ``LuceneSearcher.set_language``.
    k1, b : float, optional
        BM25 parameters. They are applied through ``set_bm25`` only when
        *both* are given; if either is ``None`` the searcher is returned
        with whatever ranking configuration ``LuceneSearcher`` starts with.
    index_path : str, optional
        Directory of the Lucene index. Defaults to ``'index/'``, the path
        that was previously hard-coded here.

    Returns
    -------
    Searcher
        The configured ``LuceneSearcher`` instance.
    """
    searcher = LuceneSearcher(index_path)
    searcher.set_language(language)
    if k1 is not None and b is not None:
        searcher.set_bm25(k1, b)
    return searcher
29
+
30
+
31
def get_docid_html(docid):
    """Return the HTML snippet that labels a search hit with its document id.

    The snippet is a link to the XSum dataset page on the Hugging Face Hub,
    followed by ``/<docid>`` in a coloured ``<span>``.

    The previous implementation selected between two renderings with
    ``if "False":``. A non-empty string is always truthy, so only the first
    rendering could ever be produced; the unreachable alternative (which
    also pointed at a broken dataset URL) has been removed.

    Parameters
    ----------
    docid
        Identifier of the document; interpolated with ``str.format``
        semantics, so any object with a string representation works.

    Returns
    -------
    str
        HTML fragment for the document label.
    """
    # "\U0001F512" is the padlock emoji. It is written as an escape so the
    # label does not depend on how this file is encoded or transcoded.
    return (
        "<a "
        'class="underline-on-hover"'
        'style="color:#AA4A44;"'
        'href="https://huggingface.co/datasets/xsum"'
        'target="_blank"><b>\U0001F512xsum</b></a><span style="color: #7978FF;">/'
        f"{docid}</span>"
    )
50
+
51
def fetch_index_stats(index_path: str) -> Dict[str, Any]:
    """Read the statistics of the index stored at ``index_path``.

    Parameters
    ----------
    index_path : str
        Path to the index directory. It must exist on disk.

    Returns
    -------
    Dict[str, Any]
        Whatever ``IndexReader.stats()`` reports for the index. The keys
        are expected to be ``total_terms``, ``documents`` and
        ``unique_terms``.

    Raises
    ------
    AssertionError
        If ``index_path`` does not exist.
    """
    path_found = os.path.exists(index_path)
    assert path_found, f"Index path {index_path} does not exist"
    return IndexReader(index_path).stats()
64
+
65
def process_results(results, highlight_terms=None):
    """Render search hits as an HTML fragment.

    Parameters
    ----------
    results : dict
        Mapping with three parallel lists, ``"text"``, ``"docid"`` and
        ``"score"`` (one entry per hit), plus a single ``"lang"`` value
        that is printed for every hit.
    highlight_terms : iterable of str, optional
        Whitespace-separated tokens of the hit text that exactly match one
        of these terms are wrapped in ``<b>``. Defaults to no highlighting.

    Returns
    -------
    str
        One HTML section per hit followed by ``<hr>``, or a
        "No results retrieved." notice when there are no hits.

    Notes
    -----
    The number of hits is taken from ``results["text"]``. The earlier code
    used ``len(results)``, which is the number of *keys* in the mapping
    (always 4), so the empty notice was never shown, fewer than four hits
    raised ``IndexError`` and any hit beyond the fourth was dropped.
    """
    texts = results["text"] if results else []
    if not texts:
        return (
            "<br><p style='font-family: Arial; color:Silver; text-align: center;'>\n"
            "No results retrieved.</p><br><hr>"
        )

    # Built once: set membership is O(1) and None/[] both mean "nothing".
    highlighted = frozenset(highlight_terms or ())
    language = results["lang"]

    # Loop-invariant fragment emitted ahead of every hit.
    meta_html = (
        "\n"
        "<p class='underline-on-hover' style='font-size:12px; font-family: Arial; color:#585858; text-align: left;'>\n"
    )
    hit_template = (
        "{}\n"
        "<p style='font-size:20px; font-family: Arial; color:#7978FF; text-align: left;'>Document ID: {}</p>\n"
        "<p style='font-size:14px; font-family: Arial; color:#7978FF; text-align: left;'>Score: {}</p>\n"
        "<p style='font-size:12px; font-family: Arial; color:MediumAquaMarine'>Language: {}</p>\n"
        "<p style='font-family: Arial;font-size:15px;'>{}</p>\n"
        "<br>\n"
    )

    sections = []
    for text, docid, score in zip(texts, results["docid"], results["score"]):
        tokens_html = " ".join(
            "<b>{}</b>".format(token) if token in highlighted else token
            for token in text.split()
        )
        sections.append(
            hit_template.format(
                meta_html, get_docid_html(docid), score, language, tokens_html
            )
        )
    return "".join(sections) + "<hr>"
96
+
97
def search(query, language, num_results=10):
    """Run ``query`` against the index and return the hits rendered as HTML.

    Parameters
    ----------
    query : str
        Query string passed to the searcher.
    language : str
        Language used to configure the searcher; also shown next to each
        hit in the rendered output.
    num_results : int, optional
        Maximum number of hits to request (``k``). Defaults to 10.

    Returns
    -------
    str
        HTML produced by ``process_results``.
    """
    searcher = _load_sparse_searcher(language=language)

    # NOTE(review): the value comes from a UI slider and may arrive as a
    # float; coerce it so the searcher always receives an integer k.
    hits = searcher.search(query, k=int(num_results))

    results_dict = {"text": [], "docid": [], "score": [], "lang": language}
    for hit in hits:
        # Each hit's raw payload is a JSON document with "contents" and "id".
        document = json.loads(hit.raw)
        results_dict["text"].append(document["contents"])
        results_dict["docid"].append(document["id"])
        results_dict["score"].append(hit.score)

    return process_results(results_dict)
112
+
113
# Index statistics are read once at start-up and baked into the page header.
stats = fetch_index_stats('index/')

# Page header shown above the search box. The emoji are written as escapes
# (rocket, right-pointing magnifier, left-pointing magnifier) so the header
# renders correctly regardless of how this file is encoded or transcoded.
description = f"""# <h2 style="text-align: center;"> \U0001F680 \U0001F50E XSum Train Dataset Search \U0001F50D \U0001F680 </h2>
<p style="text-align: center;font-size:15px;">This is a demo of Spacerini using the XSum dataset.</p>
<p style="text-align: center;font-size:20px;">Dataset Statistics: Total Number of Documents = <b>{stats["documents"]}</b>, Number of Terms = <b>{stats["total_terms"]}</b> </p>"""

# Root layout; the CSS backs the "underline-on-hover" class used by the
# result markup and defines a small silver "flagging" style.
demo = gr.Blocks(
    css=".underline-on-hover:hover { text-decoration: underline; } .flagging { font-size:12px; color:Silver; }"
)
122
+
123
with demo:
    # Layout: header, query box, language selector, result-count slider,
    # submit button and the HTML pane that receives the rendered hits.
    with gr.Row():
        gr.Markdown(value=description)
    with gr.Row():
        query = gr.Textbox(lines=1, max_lines=1, placeholder="Type your query here...", label="Query")
    with gr.Row():
        # NOTE(review): the selected value is forwarded unchanged to the
        # searcher's set_language(); confirm that "detect_language" and
        # "all" are values it actually accepts.
        lang = gr.Dropdown(
            choices=[
                "en",
                "detect_language",
                "all",
            ],
            value="en",
            label="Language",
        )
    with gr.Row():
        k = gr.Slider(1, 100, value=10, step=1, label="Max Results")
    with gr.Row():
        submit_btn = gr.Button("Submit")
    with gr.Row():
        results = gr.HTML(label="Results")

    def submit(query, lang, k):
        """Handle a search request coming from the UI.

        Parameters shadow the components of the same name and carry their
        current values: the query text, the selected language and the
        maximum number of results.

        Returns a mapping from the ``results`` component to the HTML it
        should display. A blank query clears the pane. The handler has a
        single output, so exactly one value is returned on every path
        (the earlier ``return "", ""`` supplied two).
        """
        # Guard against None before stripping; the old None check ran only
        # after .strip() had already been called.
        query = (query or "").strip()
        if not query:
            return {results: ""}
        return {results: search(query, lang, k)}

    # Pressing Enter in the textbox and clicking the button do the same thing.
    query.submit(fn=submit, inputs=[query, lang, k], outputs=[results])
    submit_btn.click(submit, inputs=[query, lang, k], outputs=[results])

demo.launch(enable_queue=True, debug=True)
index/.gitkeep ADDED
File without changes
index/_0.fdm ADDED
Binary file (158 Bytes). View file
 
index/_0.fdt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dee88d3d753c930bd7a7084748ce0245ca3d92e8ca636b283773f3668fa8f71f
3
+ size 25713651
index/_0.fdx ADDED
Binary file (2.17 kB). View file
 
index/_0.fnm ADDED
Binary file (343 Bytes). View file
 
index/_0.nvd ADDED
Binary file (11.4 kB). View file
 
index/_0.nvm ADDED
Binary file (103 Bytes). View file
 
index/_0.si ADDED
Binary file (483 Bytes). View file
 
index/_0_Lucene90_0.doc ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c09409a7db9c80d45eb89a1ded576941bb31f5ca468153f02f6de4f8efaf33d
3
+ size 2539833
index/_0_Lucene90_0.dvd ADDED
Binary file (90.7 kB). View file
 
index/_0_Lucene90_0.dvm ADDED
Binary file (133 Bytes). View file
 
index/_0_Lucene90_0.pos ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9452c4e43b83ad05e3cc89a15db1c294675ff0c37879928b4244afcc5f248933
3
+ size 3890273
index/_0_Lucene90_0.tim ADDED
Binary file (660 kB). View file
 
index/_0_Lucene90_0.tip ADDED
Binary file (16.9 kB). View file
 
index/_0_Lucene90_0.tmd ADDED
Binary file (271 Bytes). View file
 
index/segments_1 ADDED
Binary file (154 Bytes). View file
 
index/write.lock ADDED
File without changes
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ openjdk-11-jdk
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ pyserini
2
+ datasets
3
+ faiss-cpu
4
+ torch