Dark_humor_generator / data_playground.py
Beav3r's picture
Upload folder using huggingface_hub
de37907 verified
import os
import pandas as pd
import json
from typing import Callable
# Define the base path
base_path = "/mnt/d/Semester7/NLP/RAG/Data"
# Construct the full paths
reddit_jokes_1_path = os.path.join(base_path, "reddit_jokes1.csv")
reddit_jokes_1_path_processed = os.path.join(base_path, "reddit_jokes1_processed.json")
hate_speech_path = os.path.join(base_path, "hate_speech.csv")
hate_speech_path_processed = os.path.join(base_path, "hate_speech_processed.json")
reddit_jokes_2_path = os.path.join(base_path, "reddit_jokes2.json")
reddit_jokes_2_processed_path = os.path.join(base_path, "reddit_jokes2_processed.json")
stupidstuff_path = os.path.join(base_path, "stupidstuff.json")
stupidstuff_path_processed = os.path.join(base_path, "stupidstuff_processed.json")
wocka_path = os.path.join(base_path, "wocka.json")
wocka_path_processed = os.path.join(base_path, "wocka_processed.json")
def csv_to_json(in_path: str, out_path: str, preprocess_function: Callable[[list], None] = None) -> None:
# Read the CSV file
df = pd.read_csv(in_path)
# Convert the DataFrame to a list of dictionaries
data = df.to_dict(orient='records') # orient='records' means that each row is converted to a dictionary
# Preprocess the data
if preprocess_function is not None:
preprocess_function(data)
# Save the list to a JSON file
with open(out_path, 'w') as f:
json.dump(data, f, indent=4)
def preprocess_json(in_path: str, out_path: str, preprocess_function: Callable[[list], None]) -> None:
# Read json file
with open(in_path, 'r') as f:
data = json.load(f)
# Preprocess the data
preprocess_function(data)
# Save the modified list to a new JSON file
with open(out_path, 'w') as f:
json.dump(data, f, indent=4)
def delete_id(data: list) -> None:
# Remove "id" from each dictionary
for joke in data:
if 'id' in joke:
del joke['id']
def delete_Content_int(data: list) -> None:
# Remove "Content_int" from each dictionary
for joke in data:
if 'Content_int' in joke:
del joke['Content_int']
if __name__ == "__main__":
# preprocess_json(reddit_jokes_2_path, reddit_jokes_2_processed_path, delete_id)
# preprocess_json(stupidstuff_path, stupidstuff_path_processed, delete_id)
# preprocess_json(wocka_path, wocka_path_processed, delete_id)
# csv_to_json(reddit_jokes_1_path, reddit_jokes_1_path_processed)
# csv_to_json(hate_speech_path, hate_speech_path_processed, delete_Content_int)
pass