search_demo / src /utils.py
bibliotecadebabel
first commit
37c2a8d
raw
history blame
3.1 kB
import os
import pandas as pd
import numpy as np
import json
class Utils:
    """Stateless helpers for moving tabular data between disk and pandas."""

    @staticmethod
    def read_dataframe_from_csv(file_path: str) -> "pd.DataFrame | None":
        """
        Read a CSV file into a DataFrame, returning None instead of raising.

        This is a best-effort loader: a missing file or any read/parse error
        is reported on stdout and signalled to the caller by a None return.

        :param file_path: The full path to the CSV file.
        :return: The loaded DataFrame, or None if the file does not exist or
            could not be read.
        """
        if not os.path.isfile(file_path):
            print(f"File does not exist: {file_path}")
            return None

        try:
            df = pd.read_csv(file_path)
        except Exception as e:
            # Deliberately broad: callers rely on None (not an exception) for
            # every kind of read failure (parse error, bad encoding, ...).
            print(f"An error occurred while reading the file: {e}")
            return None
        return df

    @staticmethod
    def read_json_files_to_dataframe(folder_path: str) -> pd.DataFrame:
        """
        Load every ``*.json`` file directly inside a folder into one DataFrame.

        Each file contributes one row; columns are inferred by pandas from the
        decoded JSON objects. A file whose top-level JSON value is itself a
        string is assumed to be double-encoded and is decoded a second time.

        :param folder_path: Path to the folder containing JSON files.
        :return: A DataFrame with one row per JSON file, ordered by filename.
            Empty if the folder contains no ``.json`` files.
        :raises OSError: If the folder or one of its files cannot be read.
        :raises json.JSONDecodeError: If a file does not contain valid JSON.
        """
        records = []
        # os.listdir returns entries in arbitrary order; sort so that the row
        # order of the result is reproducible across runs and platforms.
        for filename in sorted(os.listdir(folder_path)):
            if not filename.endswith('.json'):
                continue
            file_path = os.path.join(folder_path, filename)
            # JSON text is UTF-8; do not depend on the platform's locale
            # default, which breaks non-ASCII content on some systems.
            with open(file_path, 'r', encoding='utf-8') as file:
                json_data = json.load(file)
            # Double-encoded payload: the first decode yielded a JSON string.
            if isinstance(json_data, str):
                json_data = json.loads(json_data)
            records.append(json_data)
        return pd.DataFrame(records)

    @staticmethod
    def write_pandas_to_local(df: pd.DataFrame, output_path: str) -> None:
        """
        Write a DataFrame to a CSV file, creating parent directories as needed.

        The index is not written.

        :param df: The pandas DataFrame to be saved.
        :param output_path: The file path where the DataFrame should be saved
            as a CSV. May be a bare filename, in which case the file is
            written to the current working directory.
        """
        parent_dir = os.path.dirname(output_path)
        # dirname() is "" for a bare filename, and os.makedirs("") raises
        # FileNotFoundError, so only create a directory when there is one.
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        df.to_csv(output_path, index=False)

    @staticmethod
    def convert_iterables_to_strings(df: pd.DataFrame) -> pd.DataFrame:
        """
        Replace list and numpy-array cells with their ``str()`` representation.

        Only ``list`` and ``np.ndarray`` values are converted; every other
        value (including tuples, sets, dicts and strings) is left untouched.

        The DataFrame is modified in place and also returned for convenience.

        :param df: The DataFrame whose cells should be converted.
        :return: The same DataFrame object, after conversion.
        """
        def _stringify(value):
            return str(value) if isinstance(value, (np.ndarray, list)) else value

        for col in df.columns:
            df[col] = df[col].apply(_stringify)
        return df