Spaces:

lhoestq
/

duckdb-spreadsheets

Running

App Files Files Community

lhoestq HF Staff committed on Nov 20, 2024

Commit

9a96811

1 Parent(s): 8a9db0e

v0

Browse files

Files changed (3) hide show

app.py +92 -20
requirements.txt +1 -0
text_functions.tsv +82 -0

app.py CHANGED Viewed

@@ -1,9 +1,14 @@
 import gradio as gr
 import requests
 from huggingface_hub import HfApi
-session = requests.Session()
 css = """
 @media (prefers-color-scheme: dark) {
     .transparent-dropdown, .transparent-dropdown .container .wrap  {
@@ -15,23 +20,63 @@ css = """
         background: var(--bg);
     }
 }
 """
-with gr.Blocks(css=css) as demo:
-    with gr.Row():
-        with gr.Column(scale=4):
-            with gr.Group():
-                dataset_dropdown = gr.Dropdown(label="Dataset", allow_custom_value=True, interactive=True)
-                with gr.Row():
-                    subset_dropdown = gr.Dropdown(info="Subset", show_label=False, visible=False, elem_classes="transparent-dropdown")
-                    split_dropdown = gr.Dropdown(info="Split", show_label=False, visible=False, elem_classes="transparent-dropdown")
-        gr.LoginButton()
     loading_codes_json = gr.JSON(visible=False)
     dataset_subset_split_textbox = gr.Textbox(visible=False)
-    dataframe = gr.DataFrame()
     @demo.load(outputs=dataset_dropdown)
-    def fetch_datasets(request: gr.Request, oauth_token: gr.OAuthToken | None):
         api = HfApi(token=oauth_token.token if oauth_token else None)
         datasets = list(api.list_datasets(limit=3, sort="trendingScore", direction=-1, filter=["format:parquet"]))
         if oauth_token and (user := api.whoami().get("user")):
@@ -40,14 +85,14 @@ with gr.Blocks(css=css) as demo:
         return {dataset_dropdown: gr.Dropdown(choices=[dataset.id for dataset in datasets], value=dataset)}
     @dataset_dropdown.change(inputs=dataset_dropdown, outputs=loading_codes_json)
-    def fetch_read_parquet_loading(dataset: str):
-        if "/" not in dataset.strip().strip("/"):
             return []
-        resp = session.get(f"https://datasets-server.huggingface.co/compatible-libraries?dataset={dataset}", timeout=3).json()
-        return ([lib["loading_codes"] for lib in resp.get("libraries", []) if lib["function"] == "dd.read_parquet"] or [[]])[0] or []
     @loading_codes_json.change(inputs=loading_codes_json, outputs=[subset_dropdown, split_dropdown])
-    def show_subset_dropdown(loading_codes: list[dict]):
         subsets = [loading_code["config_name"] for loading_code in loading_codes]
         subset = (subsets or [""])[0]
         splits = ([list(loading_code["arguments"]["splits"]) for loading_code in loading_codes if loading_code["config_name"] == subset] or [[]])[0]
@@ -55,11 +100,38 @@ with gr.Blocks(css=css) as demo:
         return gr.Dropdown(subsets, value=subset, visible=len(subsets) > 1), gr.Dropdown(splits, value=split, visible=len(splits) > 1)
     @subset_dropdown.change(inputs=[loading_codes_json, subset_dropdown], outputs=split_dropdown)
-    def show_split_dropdown(loading_codes: list[dict], subset: str):
         splits = ([list(loading_code["arguments"]["splits"]) for loading_code in loading_codes if loading_code["config_name"] == subset] or [[]])[0]
         split = (splits or [""])[0]
         return gr.Dropdown(splits, value=split, visible=len(splits) > 1)
 if __name__ == "__main__":

+from functools import partial, lru_cache
+import duckdb
 import gradio as gr
+import pandas as pd
 import requests
 from huggingface_hub import HfApi
+READ_PARQUET_FUNCTIONS = ("dd.read_parquet", "pd.read_parquet")
+EMPTY_DF = pd.DataFrame([{str(i): "" for i in range(4)}] * 10)
+MAX_NUM_COLUMNS = 20
 css = """
 @media (prefers-color-scheme: dark) {
     .transparent-dropdown, .transparent-dropdown .container .wrap  {
         background: var(--bg);
     }
 }
+input {
+  -webkit-user-select: none;
+  -moz-user-select: none;
+  -ms-user-select: none;
+  user-select: none;
+}
+.cell-menu-button {
+    z-index: -1;
+}
+thead {
+    display: none;
+}
 """
+js = """
+function setDataFrameReadonly() {
+    MutationObserver = window.MutationObserver || window.WebKitMutationObserver;
+    var observer = new MutationObserver(function(mutations, observer) {
+        // fired when a mutation occurs
+        document.querySelectorAll('.readonly-dataframe div .table-wrap button svelte-virtual-table-viewport table tbody tr td .cell-wrap input').forEach(i => i.setAttribute("readonly", "true"));
+    });
+    // define what element should be observed by the observer
+    // and what types of mutations trigger the callback
+    observer.observe(document, {
+        subtree: true,
+        childList: true
+    });
+}
+"""
+text_functions_df = pd.read_csv("text_functions.tsv", delimiter="\t")
+def prepare_function(func: str, placeholder: str, column_name: str) -> str:
+    """Substitute a column name for the placeholder argument of a function template.
+
+    Only the first whole-word occurrence of ``placeholder`` is replaced. When the
+    template contains a ``(``, the search starts after it so that a function name
+    such as ``string_split`` is never touched.
+
+    Args:
+        func: Function template, e.g. ``"upper(string)"`` or ``"string || string"``.
+        placeholder: Argument name to replace, e.g. ``"string"``.
+        column_name: Text inserted in place of the placeholder (inserted verbatim).
+
+    Returns:
+        The template with the placeholder replaced, or the template unchanged
+        when the placeholder does not occur as a whole word.
+    """
+    import re  # local import: the module's import block does not provide `re`
+    # Whole-word match: a plain substring replace would turn
+    # "position(search_string IN string)" into "position(search_<col> IN string)".
+    whole_word = re.compile(rf"(?<!\w){re.escape(placeholder)}(?!\w)")
+    head, paren, arguments = func.partition("(")
+    # A callable replacement keeps backslashes in column_name literal.
+    if not paren:
+        return whole_word.sub(lambda _match: column_name, func, count=1)
+    return head + paren + whole_word.sub(lambda _match: column_name, arguments, count=1)
+with gr.Blocks(css=css, js=js) as demo:
     loading_codes_json = gr.JSON(visible=False)
     dataset_subset_split_textbox = gr.Textbox(visible=False)
+    input_dataframe = gr.DataFrame(visible=False)
+    with gr.Group():
+        with gr.Row():
+            dataset_dropdown = gr.Dropdown(label="Open Dataset", allow_custom_value=True, scale=10)
+            subset_dropdown = gr.Dropdown(info="Subset", allow_custom_value=True, show_label=False, visible=False, elem_classes="transparent-dropdown")
+            split_dropdown = gr.Dropdown(info="Split", allow_custom_value=True, show_label=False, visible=False, elem_classes="transparent-dropdown")
+            gr.LoginButton()
+        with gr.Row():
+            transform_dropdowns = [gr.Dropdown(choices=[column_name] + [prepare_function(text_func, "string", column_name) for text_func in text_functions_df.Name if "string" in text_func], value=column_name, container=False, interactive=True, allow_custom_value=True, visible=True) for column_name in EMPTY_DF.columns]
+            transform_dropdowns += [gr.Dropdown(choices=[None], value=None, container=False, interactive=True, allow_custom_value=True, visible=False) for _ in range(MAX_NUM_COLUMNS - len(transform_dropdowns))]
+        dataframe = gr.DataFrame(EMPTY_DF, column_widths=[f"{1/len(EMPTY_DF.columns):.0%}"] * len(EMPTY_DF.columns), interactive=True, elem_classes="readonly-dataframe")
     @demo.load(outputs=dataset_dropdown)
+    def _fetch_datasets(request: gr.Request, oauth_token: gr.OAuthToken | None):
         api = HfApi(token=oauth_token.token if oauth_token else None)
         datasets = list(api.list_datasets(limit=3, sort="trendingScore", direction=-1, filter=["format:parquet"]))
         if oauth_token and (user := api.whoami().get("user")):
         return {dataset_dropdown: gr.Dropdown(choices=[dataset.id for dataset in datasets], value=dataset)}
     @dataset_dropdown.change(inputs=dataset_dropdown, outputs=loading_codes_json)
+    def _fetch_read_parquet_loading(dataset: str):
+        """Return the read_parquet loading codes of ``dataset``, or ``[]`` if there are none.
+
+        Queries the datasets-server ``compatible-libraries`` endpoint and keeps the
+        ``loading_codes`` of the first library whose ``function`` is listed in
+        READ_PARQUET_FUNCTIONS.
+        """
+        # Skip the request for an empty value or one that is not "namespace/name";
+        # `dataset and ...` used to let an empty/None value through to the HTTP call.
+        if not dataset or "/" not in dataset.strip().strip("/"):
             return []
+        # `params=` URL-encodes the free-text dropdown value.
+        resp = requests.get(
+            "https://datasets-server.huggingface.co/compatible-libraries",
+            params={"dataset": dataset},
+            timeout=3,
+        ).json()
+        return ([lib["loading_codes"] for lib in resp.get("libraries", []) if lib["function"] in READ_PARQUET_FUNCTIONS] or [[]])[0] or []
     @loading_codes_json.change(inputs=loading_codes_json, outputs=[subset_dropdown, split_dropdown])
+    def _show_subset_dropdown(loading_codes: list[dict]):
         subsets = [loading_code["config_name"] for loading_code in loading_codes]
         subset = (subsets or [""])[0]
         splits = ([list(loading_code["arguments"]["splits"]) for loading_code in loading_codes if loading_code["config_name"] == subset] or [[]])[0]
         return gr.Dropdown(subsets, value=subset, visible=len(subsets) > 1), gr.Dropdown(splits, value=split, visible=len(splits) > 1)
     @subset_dropdown.change(inputs=[loading_codes_json, subset_dropdown], outputs=split_dropdown)
+    def _show_split_dropdown(loading_codes: list[dict], subset: str):
         splits = ([list(loading_code["arguments"]["splits"]) for loading_code in loading_codes if loading_code["config_name"] == subset] or [[]])[0]
         split = (splits or [""])[0]
         return gr.Dropdown(splits, value=split, visible=len(splits) > 1)
+    @lru_cache(maxsize=3)
+    def _read_first_rows(dataset: str, pattern: str) -> pd.DataFrame:
+        """Read the first 10 rows of the Parquet files matching ``pattern`` in ``dataset`` (cached)."""
+        # Double single quotes so a free-text dataset name cannot close the SQL string literal.
+        path = f"hf://datasets/{dataset}/{pattern}".replace("'", "''")
+        return duckdb.sql(f"SELECT * FROM '{path}' LIMIT 10").df()
+    @split_dropdown.change(inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json], outputs=input_dataframe)
+    def _set_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]) -> pd.DataFrame:
+        """Load a 10-row preview of the selected dataset/subset/split into the hidden input table.
+
+        Returns a ``gr.DataFrame`` update holding the preview, or EMPTY_DF when the
+        selection is incomplete or no file pattern is known for the split.
+        """
+        # lru_cache cannot wrap this handler directly: `loading_codes` is a list of
+        # dicts, which is unhashable, so caching lives in _read_first_rows instead.
+        # `.get(split)` tolerates a split value that is absent from the chosen subset.
+        pattern = next((loading_code["arguments"]["splits"].get(split) for loading_code in (loading_codes or []) if loading_code["config_name"] == subset), None)
+        if dataset and subset and split and pattern:
+            df = _read_first_rows(dataset, pattern)
+            return gr.DataFrame(df, column_widths=[f"{1/len(df.columns):.0%}"] * len(df.columns))
+        else:
+            return gr.DataFrame(EMPTY_DF, column_widths=[f"{1/len(EMPTY_DF.columns):.0%}"] * len(EMPTY_DF.columns))
+    @input_dataframe.change(inputs=input_dataframe, outputs=transform_dropdowns)
+    def _set_transforms(input_df: pd.DataFrame):
+        """Rebuild the transform dropdowns to match the columns of ``input_df``.
+
+        Returns exactly MAX_NUM_COLUMNS dropdown updates: one visible dropdown per
+        column (the column itself plus every text function applied to it), followed
+        by hidden placeholders.
+        """
+        # Only MAX_NUM_COLUMNS dropdowns exist as outputs, so wider tables are capped;
+        # returning more updates than outputs would make the handler fail.
+        column_names = list(input_df.columns)[:MAX_NUM_COLUMNS]
+        new_transform_dropdowns = [gr.Dropdown(choices=[column_name] + [prepare_function(text_func, "string", column_name) for text_func in text_functions_df.Name if "string" in text_func], value=column_name, container=False, interactive=True, allow_custom_value=True, visible=True) for column_name in column_names]
+        new_transform_dropdowns += [gr.Dropdown(choices=[None], value=None, container=False, interactive=True, allow_custom_value=True, visible=False) for _ in range(MAX_NUM_COLUMNS - len(new_transform_dropdowns))]
+        return new_transform_dropdowns
+    def _set_dataframe(input_df: pd.DataFrame, *transforms: tuple[str], column_index: int):
+        """Apply the selected SQL expressions to ``input_df`` and return the resulting table.
+
+        Args:
+            input_df: Source table; referenced by name inside the SQL query.
+            *transforms: One SQL expression per dropdown; empty/None entries are skipped.
+            column_index: Index of the dropdown that triggered the event (currently unused).
+
+        Raises:
+            gr.Error: If DuckDB fails to parse or execute the query.
+        """
+        query = f"SELECT {', '.join(transform for transform in transforms if transform)} FROM input_df;"
+        print(query)
+        try:
+            # .df() runs the query here, so execution errors are caught below and
+            # the output component receives a pandas DataFrame, not a lazy relation.
+            return duckdb.sql(query).df()
+        except Exception as e:
+            raise gr.Error(f"{type(e).__name__}: {e}") from e
+    for column_index, transform_dropdown in enumerate(transform_dropdowns):
+        transform_dropdown.change(partial(_set_dataframe, column_index=column_index), inputs=[input_dataframe] + transform_dropdowns, outputs=dataframe)
 if __name__ == "__main__":

requirements.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ duckdb

text_functions.tsv ADDED Viewed

	@@ -0,0 +1,82 @@

+Name	Description
+string ^@ search_string	Return true if string begins with search_string.
+string || string	Concatenate two strings. Any NULL input results in NULL. See also concat(string, ...).
+string[index]	Extract a single character using a (1-based) index.
+string[begin:end]	Extract a string using slice conventions, see slicing.
+string LIKE target	Returns true if the string matches the like specifier (see Pattern Matching).
+string SIMILAR TO regex	Returns true if the string matches the regex; identical to regexp_full_match (see Pattern Matching).
+array_extract(list, index)	Extract a single character using a (1-based) index.
+array_slice(list, begin, end)	Extract a string using slice conventions. Negative values are accepted.
+ascii(string)	Returns an integer that represents the Unicode code point of the first character of the string.
+bar(x, min, max[, width])	Draw a band whose width is proportional to (x - min) and equal to width characters when x = max. width defaults to 80.
+bit_length(string)	Number of bits in a string.
+chr(x)	Returns the character corresponding to the given ASCII code value or Unicode code point.
+concat_ws(separator, string, ...)	Concatenate many strings, separated by separator. NULL inputs are skipped.
+concat(string, ...)	Concatenate many strings. NULL inputs are skipped. See also string || string.
+contains(string, search_string)	Return true if search_string is found within string.
+ends_with(string, search_string)	Return true if string ends with search_string.
+format_bytes(bytes)	Converts bytes to a human-readable representation using units based on powers of 2 (KiB, MiB, GiB, etc.).
+format(format, parameters, ...)	Formats a string using the fmt syntax.
+from_base64(string)	Convert a base64 encoded string to a character string.
+greatest(x1, x2, ...)	Selects the largest value using lexicographical ordering. Note that lowercase characters are considered “larger” than uppercase characters and collations are not supported.
+hash(value)	Returns a UBIGINT with the hash of the value.
+ilike_escape(string, like_specifier, escape_character)	Returns true if the string matches the like_specifier (see Pattern Matching) using case-insensitive matching. escape_character is used to search for wildcard characters in the string.
+instr(string, search_string)	Return location of first occurrence of search_string in string, counting from 1. Returns 0 if no match found.
+least(x1, x2, ...)	Selects the smallest value using lexicographical ordering. Note that uppercase characters are considered “smaller” than lowercase characters, and collations are not supported.
+left_grapheme(string, count)	Extract the left-most grapheme clusters.
+left(string, count)	Extract the left-most count characters.
+length_grapheme(string)	Number of grapheme clusters in string.
+length(string)	Number of characters in string.
+like_escape(string, like_specifier, escape_character)	Returns true if the string matches the like_specifier (see Pattern Matching) using case-sensitive matching. escape_character is used to search for wildcard characters in the string.
+lower(string)	Convert string to lower case.
+lpad(string, count, character)	Pads the string with the character from the left until it has count characters.
+ltrim(string, characters)	Removes any occurrences of any of the characters from the left side of the string.
+ltrim(string)	Removes any spaces from the left side of the string.
+md5(string)	Returns the MD5 hash of the string as a VARCHAR.
+md5_number(string)	Returns the MD5 hash of the string as a HUGEINT.
+md5_number_lower(string)	Returns the lower 64-bit segment of the MD5 hash of the string as a BIGINT.
+md5_number_higher(string)	Returns the higher 64-bit segment of the MD5 hash of the string as a BIGINT.
+nfc_normalize(string)	Convert string to Unicode NFC normalized string. Useful for comparisons and ordering if text data is mixed between NFC normalized and not.
+not_ilike_escape(string, like_specifier, escape_character)	Returns false if the string matches the like_specifier (see Pattern Matching) using case-insensitive matching. escape_character is used to search for wildcard characters in the string.
+not_like_escape(string, like_specifier, escape_character)	Returns false if the string matches the like_specifier (see Pattern Matching) using case-sensitive matching. escape_character is used to search for wildcard characters in the string.
+ord(string)	Return ASCII character code of the leftmost character in a string.
+parse_dirname(path, separator)	Returns the top-level directory name from the given path. separator options: system, both_slash (default), forward_slash, backslash.
+parse_dirpath(path, separator)	Returns the head of the path (the pathname until the last slash) similarly to Python's os.path.dirname function. separator options: system, both_slash (default), forward_slash, backslash.
+parse_filename(path, trim_extension, separator)	Returns the last component of the path similarly to Python's os.path.basename function. If trim_extension is true, the file extension will be removed (defaults to false). separator options: system, both_slash (default), forward_slash, backslash.
+parse_path(path, separator)	Returns a list of the components (directories and filename) in the path similarly to Python's pathlib.parts function. separator options: system, both_slash (default), forward_slash, backslash.
+position(search_string IN string)	Return location of first occurrence of search_string in string, counting from 1. Returns 0 if no match found.
+printf(format, parameters...)	Formats a string using printf syntax.
+read_text(source)	Returns the content from source (a filename, a list of filenames, or a glob pattern) as a VARCHAR. The file content is first validated to be valid UTF-8. If read_text attempts to read a file with invalid UTF-8 an error is thrown suggesting to use read_blob instead. See the read_text guide for more details.
+regexp_escape(string)	Escapes special patterns to turn string into a regular expression similarly to Python's re.escape function.
+regexp_extract(string, pattern[, group = 0])	If string contains the regexp pattern, returns the capturing group specified by optional parameter group (see Pattern Matching).
+regexp_extract(string, pattern, name_list)	If string contains the regexp pattern, returns the capturing groups as a struct with corresponding names from name_list (see Pattern Matching).
+regexp_extract_all(string, regex[, group = 0])	Split the string along the regex and extract all occurrences of group.
+regexp_full_match(string, regex)	Returns true if the entire string matches the regex (see Pattern Matching).
+regexp_matches(string, pattern)	Returns true if string contains the regexp pattern, false otherwise (see Pattern Matching).
+regexp_replace(string, pattern, replacement)	If string contains the regexp pattern, replaces the matching part with replacement (see Pattern Matching).
+regexp_split_to_array(string, regex)	Splits the string along the regex.
+regexp_split_to_table(string, regex)	Splits the string along the regex and returns a row for each part.
+repeat(string, count)	Repeats the string count number of times.
+replace(string, source, target)	Replaces any occurrences of the source with target in string.
+reverse(string)	Reverses the string.
+right_grapheme(string, count)	Extract the right-most count grapheme clusters.
+right(string, count)	Extract the right-most count characters.
+rpad(string, count, character)	Pads the string with the character from the right until it has count characters.
+rtrim(string, characters)	Removes any occurrences of any of the characters from the right side of the string.
+rtrim(string)	Removes any spaces from the right side of the string.
+sha256(value)	Returns a VARCHAR with the SHA-256 hash of the value.
+split_part(string, separator, index)	Split the string along the separator and return the data at the (1-based) index of the list. If the index is outside the bounds of the list, return an empty string (to match PostgreSQL's behavior).
+starts_with(string, search_string)	Return true if string begins with search_string.
+str_split_regex(string, regex)	Splits the string along the regex.
+string_split_regex(string, regex)	Splits the string along the regex.
+string_split(string, separator)	Splits the string along the separator.
+strip_accents(string)	Strips accents from string.
+strlen(string)	Number of bytes in string.
+strpos(string, search_string)	Return location of first occurrence of search_string in string, counting from 1. Returns 0 if no match found.
+substring(string, start, length)	Extract substring of length characters starting from character start. Note that a start value of 1 refers to the first character of the string.
+substring_grapheme(string, start, length)	Extract substring of length grapheme clusters starting from character start. Note that a start value of 1 refers to the first character of the string.
+to_base64(blob)	Convert a blob to a base64 encoded string.
+trim(string, characters)	Removes any occurrences of any of the characters from either side of the string.
+trim(string)	Removes any spaces from either side of the string.
+unicode(string)	Returns the Unicode code of the first character of the string.
+upper(string)	Convert string to upper case.