# taesiri's picture
# backup
# 6c33596
import io
import json
import os
import time
from datetime import datetime, timedelta, timezone

import gradio as gr
from datasets import load_dataset
from PIL import Image
# Hugging Face access token taken from the environment; None when the
# variable is not set.
access_token = os.getenv("HUGGINGFACE_TOKEN")

# Module-level state shared by the loader and the UI callbacks.
dataset = None
dataset_size = "Unknown"
last_refresh_time = None

# The dataset is reloaded once the previous load is at least this old.
REFRESH_INTERVAL = timedelta(days=1)
def load_and_prepare_dataset():
    """Load the streaming dataset and refresh the module-level state.

    Rebinds three globals:

    * ``dataset`` -- the ``train`` split opened in streaming mode.
    * ``dataset_size`` -- the example count reported by the dataset
      metadata, or the string ``"Unknown"`` when no such metadata exists.
    * ``last_refresh_time`` -- ``datetime.now()`` taken after the load.
    """
    global dataset, dataset_size, last_refresh_time

    dataset = load_dataset(
        "taesiri/PhotoshopRequest-DailyDump",
        split="train",
        streaming=True,
        token=access_token,
    )

    # The split metadata is optional and may be None; fall back to an empty
    # mapping so a missing entry yields "Unknown" instead of raising
    # AttributeError on `None.get`.
    splits = dataset.info.splits or {}
    train_split = splits.get("train")
    dataset_size = train_split.num_examples if train_split else "Unknown"

    last_refresh_time = datetime.now()
def check_and_refresh_dataset():
    """Reload the dataset if it was never loaded or has gone stale.

    A load is considered stale once at least ``REFRESH_INTERVAL`` has
    elapsed since ``last_refresh_time``.
    """
    global last_refresh_time

    if last_refresh_time is not None:
        age = datetime.now() - last_refresh_time
        if age < REFRESH_INTERVAL:
            # Still fresh: nothing to do.
            return

    load_and_prepare_dataset()
# Initial dataset load. The call populates the module-level `dataset`,
# `dataset_size` and `last_refresh_time`, so the dataset does not need to be
# opened a second time here.
load_and_prepare_dataset()

# Used both as the shuffle buffer size and as the number of samples drawn from
# one shuffled iterator before a reshuffle is triggered.
BUFFER_SIZE = 1
# Iterator over the currently shuffled dataset; set by reshuffle_dataset().
sample_iterator = None
# Samples drawn from `sample_iterator` since the last reshuffle.
sample_count = 0
def reshuffle_dataset():
    """Start a fresh shuffled pass over the dataset.

    Rebinds the global ``sample_iterator`` to an iterator over a newly
    shuffled view of ``dataset`` and resets ``sample_count`` to zero.
    """
    global sample_iterator, sample_count

    # Seed from the nanosecond clock rather than whole seconds: a reshuffle
    # can happen on every request, and a one-second seed would hand identical
    # seeds (and therefore identical shuffles) to requests arriving within the
    # same second. The value is reduced to 32 bits to stay a small
    # non-negative integer.
    seed = time.time_ns() % (2**32)
    shuffled_dataset = dataset.shuffle(seed=seed, buffer_size=BUFFER_SIZE)
    sample_iterator = iter(shuffled_dataset)
    sample_count = 0
# Build the first shuffled iterator at import time so that `sample_iterator`
# is no longer None when the first sample is requested.
reshuffle_dataset() # Initial shuffle
def get_next_sample():
    """Draw the next sample and format it for display.

    Refreshes the dataset when it is stale, reshuffles once ``BUFFER_SIZE``
    samples have been taken from the current iterator, then pulls one sample
    from ``sample_iterator``.

    Returns:
        A ``(markdown_text, source_image, edited_image)`` tuple, where
        ``markdown_text`` holds the post title, its body text (empty when
        unavailable) and a link to the post, and the two images are the
        sample's ``source_image`` and ``edited_image`` fields.
    """
    global sample_count

    check_and_refresh_dataset()
    if sample_count >= BUFFER_SIZE:
        reshuffle_dataset()

    sample = next(sample_iterator)
    sample_count += 1
    print(sample)

    post_id = sample["post_id"]
    title = sample["title"]
    reddit_url = f"https://www.reddit.com/r/PhotoshopRequest/comments/{post_id}"

    # The post body is best-effort: a missing key, a non-JSON / non-string
    # payload, or an unexpected structure all fall back to an empty body.
    # Only those errors are caught so that interrupts and genuine bugs are
    # not silently swallowed. A null body is normalised to "" so the text
    # "None" is never rendered.
    try:
        selftext = json.loads(sample["json_data"])["post"]["selftext"] or ""
    except (KeyError, TypeError, ValueError):
        selftext = ""
        print("No selftext found")

    markdown_text = f"# {title}\n\n{selftext}\n\n[View post on r/PhotoshopRequest]({reddit_url})"
    return (
        markdown_text,
        sample["source_image"],
        sample["edited_image"],
    )
# Gradio UI: a header, the post text, the two images side by side, a button
# that draws a new sample, and a footer with dataset statistics.
with gr.Blocks() as demo:
    gr.Markdown("# PhotoshopRequest Dataset Sampler")
    gr.Markdown(
        """
This is a preview of the PhotoshopRequest dataset. Each sample represents a Photoshop editing request post.
Click the 'Sample New Item' button to retrieve a random sample from the dataset.
"""
    )
    post_info = gr.Markdown()

    with gr.Row():
        source_image = gr.Image(label="Source Image")
        edited_image = gr.Image(label="Edited Image")

    sample_button = gr.Button("Sample New Item")
    info_md = gr.Markdown()

    def update_info():
        """Return the HTML footer showing dataset size and last refresh time."""
        if last_refresh_time:
            # `last_refresh_time` is recorded with datetime.now(), i.e. in
            # local time. Convert it to UTC before formatting so the "UTC"
            # label in the output is actually true on any host time zone.
            refreshed = last_refresh_time.astimezone(timezone.utc).strftime(
                "%Y-%m-%d %H:%M:%S UTC"
            )
        else:
            refreshed = "Unknown"
        return f"""
<div style="text-align: center;">
<hr>
Dataset Size: {dataset_size} items<br>
Last Refreshed: {refreshed}
</div>
"""

    # Draw a sample first, then refresh the footer (a refresh may have
    # happened as part of drawing the sample).
    sample_button.click(
        get_next_sample, outputs=[post_info, source_image, edited_image]
    ).then(update_info, outputs=[info_md])
# Launch the Blocks app only when this file is run as a script, not when it
# is imported.
if __name__ == "__main__":
    demo.launch()