Datasets-Convertor

Running

App Files Files Community

openfree committed 1 day ago

Commit

5322786

verified ·

1 Parent(s): 4a831bc

Create app-backup.py

Browse files

Files changed (1) hide show

app-backup.py +179 -0

app-backup.py ADDED Viewed

	@@ -0,0 +1,179 @@

+import gradio as gr
+import pandas as pd
+import json
+from io import BytesIO
+import requests
# Seconds to wait for the remote server when downloading a Parquet file by URL.
# Without a timeout, requests.get() can block forever on a stalled server.
_URL_FETCH_TIMEOUT = 30


def _to_jsonable(val):
    """Recursively convert *val* into a value that ``json.dumps`` accepts.

    Handles the types that show up in DataFrame records but that the
    standard ``json`` encoder rejects:

    * ``pd.NA`` / ``pd.NaT``            -> ``None``
    * ``bytes``                         -> ``str`` (UTF-8, undecodable bytes replaced)
    * ``dict`` / ``list`` / ``tuple``   -> same container shape, sanitized recursively
    * date/time-like objects            -> ISO 8601 string via ``isoformat()``
    * NumPy arrays and NumPy scalars    -> native Python values via ``tolist()``

    Anything else is returned unchanged.
    """
    if val is pd.NA or val is pd.NaT:
        return None
    if isinstance(val, bytes):
        return val.decode("utf-8", errors="replace")
    if isinstance(val, dict):
        return {k: _to_jsonable(v) for k, v in val.items()}
    if isinstance(val, (list, tuple)):
        return [_to_jsonable(item) for item in val]
    # Checked before ``tolist`` so timestamps become readable strings.
    if hasattr(val, "isoformat"):
        return val.isoformat()
    # Duck-typed NumPy support (the file does not import numpy directly):
    # ndarray.tolist() yields nested lists, numpy scalars yield Python scalars.
    if hasattr(val, "tolist"):
        return _to_jsonable(val.tolist())
    return val


def dataset_converter(input_file, conversion_type, parquet_url):
    """Convert an uploaded dataset between CSV, Parquet and JSONL.

    Args:
        input_file: The upload. Either a path-like value or a file-like
            object exposing ``read()`` (and normally ``name``); ``None`` when
            nothing was uploaded.
        conversion_type: One of ``"CSV to Parquet"``, ``"Parquet to CSV"``,
            ``"CSV to JSONL"`` or ``"Parquet to JSONL"``.
        parquet_url: Optional URL of a Parquet file. Only consulted for
            ``"Parquet to JSONL"`` when no file was uploaded.

    Returns:
        A ``(output_file, info_message)`` tuple: the path of the file written
        to the current working directory, and a human-readable summary that
        includes a preview of the first 10 rows.

    Raises:
        ValueError: If the conversion type is unknown, the upload is missing
            or has the wrong extension, or (for Parquet to JSONL) neither a
            file nor a URL was supplied.
    """
    file_bytes = None
    file_name = None
    file_extension = None

    # Read the upload fully into memory so every branch can parse it from bytes.
    if input_file is not None:
        reader = getattr(input_file, "read", None)
        if callable(reader):
            # File-like upload.
            file_bytes = reader()
            file_name = str(getattr(input_file, "name", ""))
        else:
            # Path-like upload.
            file_name = str(input_file)
            with open(file_name, "rb") as f:
                file_bytes = f.read()
        # Only text after a real dot counts as an extension, so a file that is
        # literally named "csv" is not mistaken for a CSV file.
        _, dot, suffix = file_name.lower().rpartition(".")
        file_extension = suffix if dot else ""

    # Conversion: CSV to Parquet
    if conversion_type == "CSV to Parquet":
        if input_file is None or file_extension != "csv":
            raise ValueError("For CSV to Parquet conversion, please upload a CSV file. 📄")
        df = pd.read_csv(BytesIO(file_bytes))
        output_file = "output.parquet"
        df.to_parquet(output_file, index=False)
        converted_format = "Parquet"

    # Conversion: Parquet to CSV
    elif conversion_type == "Parquet to CSV":
        if input_file is None or file_extension != "parquet":
            raise ValueError("For Parquet to CSV conversion, please upload a Parquet file. 📄")
        df = pd.read_parquet(BytesIO(file_bytes))
        output_file = "output.csv"
        df.to_csv(output_file, index=False)
        converted_format = "CSV"

    # Conversion: CSV to JSONL
    elif conversion_type == "CSV to JSONL":
        if input_file is None or file_extension != "csv":
            raise ValueError("For CSV to JSONL conversion, please upload a CSV file. 📄")
        # Read CSV with latin1 encoding
        df = pd.read_csv(BytesIO(file_bytes), encoding="latin1")
        output_file = "metadata.jsonl"
        # Build every line before touching the output file, so a serialization
        # error does not leave a truncated file behind.
        # to_dict(orient="records") keeps each column's own dtype, whereas
        # iterrows() upcasts a row to one common dtype (ints become floats, and
        # all-integer rows yield numpy.int64 values that json.dumps rejects).
        lines = []
        for record in df.to_dict(orient="records"):
            data = _to_jsonable(record)
            row_data = {
                "file_name": data.get("file_name"),  # None when the column is absent
                "ground_truth": json.dumps(data),
            }
            lines.append(json.dumps(row_data) + "\n")
        # Write JSONL output (using write mode so previous data is overwritten)
        with open(output_file, "w", encoding="utf-8") as f:
            f.writelines(lines)
        converted_format = "JSONL"

    # Conversion: Parquet to JSONL
    elif conversion_type == "Parquet to JSONL":
        url = (parquet_url or "").strip()
        # Use uploaded file if available; otherwise try the provided URL
        if input_file is not None:
            df = pd.read_parquet(BytesIO(file_bytes))
        elif url:
            # NOTE(review): the URL is user-supplied and fetched server-side;
            # consider restricting allowed hosts if this runs on a private network.
            response = requests.get(url, timeout=_URL_FETCH_TIMEOUT)
            response.raise_for_status()  # Ensure the request was successful
            df = pd.read_parquet(BytesIO(response.content))
            file_name = "from_url.parquet"
        else:
            raise ValueError("For Parquet to JSONL conversion, please upload a file or provide a URL. 🌐")
        output_file = "output.jsonl"
        with open(output_file, "w", encoding="utf-8") as f:
            for record in df.to_dict(orient="records"):
                f.write(json.dumps(_to_jsonable(record), ensure_ascii=False) + "\n")
        converted_format = "JSONL"

    else:
        raise ValueError("Invalid conversion type selected. ⚠️")

    # Every successful branch previews the same way: first 10 rows, no index.
    preview_str = df.head(10).to_string(index=False)
    info_message = (
        f"Input file: {file_name if file_name is not None else 'N/A'}\n"
        f"Converted file format: {converted_format}\n\n"
        f"Preview (Top 10 Rows):\n{preview_str}\n\n"
        "Community: https://discord.gg/openfreeai 🚀"
    )
    return output_file, info_message
# Custom CSS for a modern and sleek look.
# The `.gradio-button` rules target the class attached to the Convert button
# through `elem_classes` below.
custom_css = """
body {
    background-color: #f4f4f4;
    font-family: 'Helvetica Neue', Arial, sans-serif;
}
.gradio-container {
    max-width: 900px;
    margin: 40px auto;
    padding: 20px;
    background-color: #ffffff;
    border-radius: 12px;
    box-shadow: 0 8px 16px rgba(0,0,0,0.1);
}
h1, h2 {
    color: #333333;
}
.gradio-input, .gradio-output {
    margin-bottom: 20px;
}
.gradio-button {
    background-color: #4CAF50 !important;
    color: white !important;
    border: none !important;
    padding: 10px 20px !important;
    font-size: 16px !important;
    border-radius: 6px !important;
    cursor: pointer;
}
.gradio-button:hover {
    background-color: #45a049 !important;
}
"""

# Build the UI. Components are laid out in the order they are created inside
# the Blocks context, so the statement order below is the page layout.
with gr.Blocks(css=custom_css, title="Datasets Convertor") as demo:
    gr.Markdown("# Datasets Convertor 🚀")
    gr.Markdown(
        "Upload a CSV or Parquet file (or provide a Parquet file URL for Parquet to JSONL conversion) "
        "and select the conversion type. The app converts the file to the desired format and displays a preview of the top 10 rows. ✨"
    )
    # Inputs: the upload on the left, the conversion selector on the right.
    with gr.Row():
        with gr.Column(scale=1):
            input_file = gr.File(label="Upload CSV or Parquet File 📄")
        with gr.Column(scale=1):
            conversion_type = gr.Radio(
                choices=["CSV to Parquet", "Parquet to CSV", "CSV to JSONL", "Parquet to JSONL"],
                label="Conversion Type 🔄"
            )
    # Optional URL input for Parquet to JSONL conversion
    parquet_url = gr.Textbox(label="Parquet File URL (Optional) 🌐", placeholder="Enter URL if not uploading a file")
    convert_button = gr.Button("Convert ⚡", elem_classes=["gradio-button"])
    # Outputs: the downloadable converted file and the textual preview.
    with gr.Row():
        output_file = gr.File(label="Converted File 💾")
        preview = gr.Textbox(label="Preview (Top 10 Rows) 🔍", lines=15)
    # The inputs/outputs order must match dataset_converter's parameters and
    # its (output_file, info_message) return tuple.
    convert_button.click(
        fn=dataset_converter,
        inputs=[input_file, conversion_type, parquet_url],
        outputs=[output_file, preview]
    )
    gr.Markdown("**Join our Community:** [https://discord.gg/openfreeai](https://discord.gg/openfreeai) 🤝")

# Start the web server only when the file is executed as a script, so that
# importing the module (e.g. to reuse `demo` or `dataset_converter`) has no
# side effects.
if __name__ == "__main__":
    demo.launch()