Spaces:

HuggingFaceGECLM
/

random_dataset_exploration

Sleeping

App Files Files Community

ola13 committed on Mar 24, 2023

Commit

0f43f50

1 Parent(s): 0112a25

flagging to datasets

Browse files

Files changed (4) hide show

app.py +27 -58
bad_examples/reddit_threaded_bad_examples.jsonl +2 -2
report.jsonl +3 -0
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -1,61 +1,39 @@
 import json
 import math
 from functools import partial
 import streamlit as st
 import streamlit.components.v1 as components
-from gforms import Form
 BAD_EXAMPLES_PATH = "bad_examples"
 DATA_PATH = "data"
-MAX_DOC_LENGTH = 30000
-def form_callback(
-    element,
-    page_index,
-    element_index,
-    dataset,
-    docid,
-    text,
-    metadata,
-    reason,
-    person,
-    part,
-):
-    if element.name == "Dataset":
-        return dataset
-    if element.name == "Datapoint ID":
-        return docid
-    if element.name == "Text":
-        return text
-    if element.name == "Metadata":
-        return metadata
-    if element.name == "Flagging Reason":
-        return reason
-    if element.name == "Flagging Person":
-        return person
-    if element.name == "Part":
-        return part
-def report_result(dataset, docid, text, metadata, reason, person, part):
-    form = Form()
-    FORM_URL = "https://docs.google.com/forms/d/e/1FAIpQLSedYTj1pBD5L6xo6qPUKY5vleNW183FXCgc3LSSgg3AUwZWKA/viewform"
-    form.load(FORM_URL)
-    form.fill(
-        partial(
-            form_callback,
-            dataset=dataset,
-            docid=docid,
-            text=text,
-            metadata=metadata,
-            reason=reason,
-            person=person,
-            part=part,
-        ),
     )
-    form.submit()
 def load_jsonl(file_path):
@@ -84,7 +62,6 @@ def save_flag_and_get_next_item(sample, issue):
         f.write(json.dumps(sample) + "\n")
     text = sample["text"]
     sample.pop("text")
     sample.pop("issue")
     sample_id = ""
@@ -94,15 +71,7 @@ def save_flag_and_get_next_item(sample, issue):
     else:
         sample_id = sample["id"]
-    if len(text) > MAX_DOC_LENGTH:
-        num_parts = math.ceil(len(text) / MAX_DOC_LENGTH)
-        for i in range(num_parts):
-            text_portion = text[i * MAX_DOC_LENGTH : (i + 1) * MAX_DOC_LENGTH]
-            report_result(
-                dataset, sample_id, text_portion, str(sample), issue, "", str(i)
-            )
-    else:
-        report_result(dataset, sample_id, text, str(sample), issue, "", str(0))
     get_next_item()

 import json
 import math
+import os
+import uuid
 from functools import partial
+import jsonlines
 import streamlit as st
 import streamlit.components.v1 as components
+from huggingface_hub import HfApi
 BAD_EXAMPLES_PATH = "bad_examples"
 DATA_PATH = "data"
+def report_result_dataset(dataset, docid, text, metadata, reason, annotator):
+    with jsonlines.open("report.jsonl", "w") as f:
+        f.write(
+            {
+                "dataset": dataset,
+                "docid": docid,
+                "text": text,
+                "metadata": metadata,
+                "reason": reason,
+                "annotator": annotator,
+            }
+        )
+    api = HfApi()
+    api.upload_file(
+        path_or_fileobj="report.jsonl",
+        path_in_repo="report-{}.jsonl".format(uuid.uuid4()),
+        repo_id="HuggingFaceGECLM/data_feedback",
+        repo_type="dataset",
+        token=os.environ.get("geclm_token"),
     )
 def load_jsonl(file_path):
         f.write(json.dumps(sample) + "\n")
     text = sample["text"]
     sample.pop("text")
     sample.pop("issue")
     sample_id = ""
     else:
         sample_id = sample["id"]
+    report_result_dataset(dataset, sample_id, text, str(sample), issue, "")
     get_next_item()

bad_examples/reddit_threaded_bad_examples.jsonl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:90cccebb836615224b151fe1576ad3667933d425bc16e0e8f231671e151b0dbb
-size 2971

 version https://git-lfs.github.com/spec/v1
+oid sha256:562ed8ca881c564329b0cb138863cdedfd8339913635ddd1f4c733fc723b3230
+size 13634

report.jsonl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c93030e14be30172f1313437e54c47c138d99b91cedefc965f88ba1e5c6025c6
+size 2250

requirements.txt CHANGED Viewed

@@ -1,2 +1,3 @@
-gforms
 streamlit==1.20.0

+huggingface_hub
+jsonlines
 streamlit==1.20.0