# import os | |
# import json | |
# import gradio as gr | |
# import spaces | |
# import torch | |
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification | |
# from sentence_splitter import SentenceSplitter | |
# from itertools import product | |
# # Get the Hugging Face token from environment variable | |
# hf_token = os.getenv('HF_TOKEN') | |
# cuda_available = torch.cuda.is_available() | |
# device = torch.device("cuda" if cuda_available else "cpu") | |
# print(f"Using device: {device}") | |
# # Initialize paraphraser model and tokenizer | |
# paraphraser_model_name = "NoaiGPT/777" | |
# paraphraser_tokenizer = AutoTokenizer.from_pretrained(paraphraser_model_name, use_auth_token=hf_token) | |
# paraphraser_model = AutoModelForSeq2SeqLM.from_pretrained(paraphraser_model_name, use_auth_token=hf_token).to(device) | |
# # Initialize classifier model and tokenizer | |
# classifier_model_name = "andreas122001/roberta-mixed-detector" | |
# classifier_tokenizer = AutoTokenizer.from_pretrained(classifier_model_name) | |
# classifier_model = AutoModelForSequenceClassification.from_pretrained(classifier_model_name).to(device) | |
# # Initialize sentence splitter | |
# splitter = SentenceSplitter(language='en') | |
# def classify_text(text): | |
# inputs = classifier_tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device) | |
# with torch.no_grad(): | |
# outputs = classifier_model(**inputs) | |
# probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1) | |
# predicted_class = torch.argmax(probabilities, dim=-1).item() | |
# main_label = classifier_model.config.id2label[predicted_class] | |
# main_score = probabilities[0][predicted_class].item() | |
# return main_label, main_score | |
# @spaces.GPU | |
# def generate_paraphrases(text, setting, output_format): | |
# sentences = splitter.split(text) | |
# all_sentence_paraphrases = [] | |
# if setting == 1: | |
# num_return_sequences = 5 | |
# repetition_penalty = 1.1 | |
# no_repeat_ngram_size = 2 | |
# temperature = 1.0 | |
# max_length = 128 | |
# elif setting == 2: | |
# num_return_sequences = 10 | |
# repetition_penalty = 1.2 | |
# no_repeat_ngram_size = 3 | |
# temperature = 1.2 | |
# max_length = 192 | |
# elif setting == 3: | |
# num_return_sequences = 15 | |
# repetition_penalty = 1.3 | |
# no_repeat_ngram_size = 4 | |
# temperature = 1.4 | |
# max_length = 256 | |
# elif setting == 4: | |
# num_return_sequences = 20 | |
# repetition_penalty = 1.4 | |
# no_repeat_ngram_size = 5 | |
# temperature = 1.6 | |
# max_length = 320 | |
# else: | |
# num_return_sequences = 25 | |
# repetition_penalty = 1.5 | |
# no_repeat_ngram_size = 6 | |
# temperature = 1.8 | |
# max_length = 384 | |
# top_k = 50 | |
# top_p = 0.95 | |
# length_penalty = 1.0 | |
# formatted_output = "Original text:\n" + text + "\n\n" | |
# formatted_output += "Paraphrased versions:\n" | |
# json_output = { | |
# "original_text": text, | |
# "paraphrased_versions": [], | |
# "combined_versions": [], | |
# "human_like_versions": [] | |
# } | |
# for i, sentence in enumerate(sentences): | |
# inputs = paraphraser_tokenizer(f'{sentence}', return_tensors="pt", padding="longest", truncation=True, max_length=max_length).to(device) | |
# # Generate paraphrases using the specified parameters | |
# outputs = paraphraser_model.generate( | |
# inputs.input_ids, | |
# attention_mask=inputs.attention_mask, | |
# num_return_sequences=num_return_sequences, | |
# repetition_penalty=repetition_penalty, | |
# no_repeat_ngram_size=no_repeat_ngram_size, | |
# temperature=temperature, | |
# max_length=max_length, | |
# top_k=top_k, | |
# top_p=top_p, | |
# do_sample=True, | |
# early_stopping=False, | |
# length_penalty=length_penalty | |
# ) | |
# paraphrases = paraphraser_tokenizer.batch_decode(outputs, skip_special_tokens=True) | |
# formatted_output += f"Original sentence {i+1}: {sentence}\n" | |
# for j, paraphrase in enumerate(paraphrases, 1): | |
# formatted_output += f" Paraphrase {j}: {paraphrase}\n" | |
# json_output["paraphrased_versions"].append({ | |
# f"original_sentence_{i+1}": sentence, | |
# "paraphrases": paraphrases | |
# }) | |
# all_sentence_paraphrases.append(paraphrases) | |
# formatted_output += "\n" | |
# all_combinations = list(product(*all_sentence_paraphrases)) | |
# formatted_output += "\nCombined paraphrased versions:\n" | |
# combined_versions = [] | |
# for i, combination in enumerate(all_combinations[:50], 1): # Limit to 50 combinations | |
# combined_paraphrase = " ".join(combination) | |
# combined_versions.append(combined_paraphrase) | |
# json_output["combined_versions"] = combined_versions | |
# # Classify combined versions | |
# human_versions = [] | |
# for i, version in enumerate(combined_versions, 1): | |
# label, score = classify_text(version) | |
# formatted_output += f"Version {i}:\n{version}\n" | |
# formatted_output += f"Classification: {label} (confidence: {score:.2%})\n\n" | |
# if label == "human-produced" or (label == "machine-generated" and score < 0.98): | |
# human_versions.append((version, label, score)) | |
# formatted_output += "\nHuman-like or Less Confident Machine-generated versions:\n" | |
# for i, (version, label, score) in enumerate(human_versions, 1): | |
# formatted_output += f"Version {i}:\n{version}\n" | |
# formatted_output += f"Classification: {label} (confidence: {score:.2%})\n\n" | |
# json_output["human_like_versions"] = [ | |
# {"version": version, "label": label, "confidence_score": score} | |
# for version, label, score in human_versions | |
# ] | |
# # If no human-like versions, include the top 5 least confident machine-generated versions | |
# if not human_versions: | |
# human_versions = sorted([(v, l, s) for v, l, s in zip(combined_versions, [classify_text(v)[0] for v in combined_versions], [classify_text(v)[1] for v in combined_versions])], key=lambda x: x[2])[:5] | |
# formatted_output += "\nNo human-like versions found. Showing top 5 least confident machine-generated versions:\n" | |
# for i, (version, label, score) in enumerate(human_versions, 1): | |
# formatted_output += f"Version {i}:\n{version}\n" | |
# formatted_output += f"Classification: {label} (confidence: {score:.2%})\n\n" | |
# if output_format == "text": | |
# return formatted_output, "\n\n".join([v[0] for v in human_versions]) | |
# else: | |
# return json.dumps(json_output, indent=2), "\n\n".join([v[0] for v in human_versions]) | |
# # Define the Gradio interface | |
# iface = gr.Interface( | |
# fn=generate_paraphrases, | |
# inputs=[ | |
# gr.Textbox(lines=5, label="Input Text"), | |
# gr.Slider(minimum=1, maximum=5, step=1, label="Readability to Human-like Setting"), | |
# gr.Radio(["text", "json"], label="Output Format") | |
# ], | |
# outputs=[ | |
# gr.Textbox(lines=20, label="Detailed Paraphrases and Classifications"), | |
# gr.Textbox(lines=10, label="Human-like or Less Confident Machine-generated Paraphrases") | |
# ], | |
# title="Advanced Diverse Paraphraser with Human-like Filter", | |
# description="Enter a text, select a setting from readable to human-like, and choose the output format to generate diverse paraphrased versions. Combined versions are classified, and those detected as human-produced or less confidently machine-generated are presented in the final output." | |
# ) | |
# # Launch the interface | |
# iface.launch() | |
import os | |
import json | |
import gradio as gr | |
import spaces | |
import torch | |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, T5ForConditionalGeneration | |
from sentence_splitter import SentenceSplitter | |
from itertools import product | |
# Hugging Face access token, read from the HF_TOKEN environment variable.
# os.getenv returns None when the variable is not set.
hf_token = os.getenv('HF_TOKEN')

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")
print(f"Using device: {device}")

# Paraphraser: seq2seq model and tokenizer, loaded with the access token.
# `token=` is the replacement for the deprecated `use_auth_token=` keyword.
paraphraser_model_name = "NoaiGPT/777"
paraphraser_tokenizer = AutoTokenizer.from_pretrained(paraphraser_model_name, token=hf_token)
paraphraser_model = AutoModelForSeq2SeqLM.from_pretrained(paraphraser_model_name, token=hf_token).to(device)

# Classifier: sequence-classification model whose labels are used by
# classify_text() to tell human-produced from machine-generated text.
classifier_model_name = "andreas122001/roberta-mixed-detector"
classifier_tokenizer = AutoTokenizer.from_pretrained(classifier_model_name)
classifier_model = AutoModelForSequenceClassification.from_pretrained(classifier_model_name).to(device)

# Grammar correction: T5 model used by correct_grammar() to clean up each
# generated paraphrase.
grammar_model_name = "grammarly/coedit-large"
grammar_tokenizer = AutoTokenizer.from_pretrained(grammar_model_name)
grammar_model = T5ForConditionalGeneration.from_pretrained(grammar_model_name).to(device)

# Sentence splitter used to break the input text into English sentences.
splitter = SentenceSplitter(language='en')
def classify_text(text):
    """Classify *text* with the detector model and return its top label and score.

    The text is tokenized (truncated to at most 512 tokens), moved to
    ``device`` and passed through ``classifier_model`` without gradient
    tracking. The logits are converted to probabilities with a softmax over
    the last dimension.

    Returns:
        A ``(label, score)`` tuple: the ``id2label`` name of the class with
        the highest probability, and that probability as a Python float.
    """
    encoded = classifier_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    ).to(device)

    with torch.no_grad():
        logits = classifier_model(**encoded).logits

    probs = torch.nn.functional.softmax(logits, dim=-1)
    # .item() requires a single prediction, i.e. one input text per call.
    top_index = torch.argmax(probs, dim=-1).item()
    return classifier_model.config.id2label[top_index], probs[0][top_index].item()
def correct_grammar(text):
    """Return *text* after one pass through the grammar-correction model.

    The text is embedded in the instruction prompt
    ``Fix grammatical errors in this sentence: ...``, tokenized and sent to
    ``grammar_model.generate`` with ``max_length=256``. The first generated
    sequence is decoded with special tokens stripped.

    The corrected text is also printed to stdout before being returned.
    """
    prompt = f'Fix grammatical errors in this sentence: {text}'
    input_ids = grammar_tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    generated = grammar_model.generate(input_ids, max_length=256)
    corrected_text = grammar_tokenizer.decode(generated[0], skip_special_tokens=True)
    print(corrected_text)
    return corrected_text
# Sampling presets keyed by the slider setting. Each entry is
# (num_return_sequences, repetition_penalty, no_repeat_ngram_size,
#  temperature, max_length).
_GENERATION_PRESETS = {
    1: (5, 1.1, 2, 1.0, 128),
    2: (10, 1.2, 3, 1.2, 192),
    # NOTE(review): a value of 15 was commented out here in favour of 2, which
    # breaks the otherwise increasing progression -- confirm this is
    # intentional and not a debugging leftover.
    3: (2, 1.3, 4, 1.4, 256),
    4: (20, 1.4, 5, 1.6, 320),
}
# Used for setting 5 and for any value not listed above.
_DEFAULT_GENERATION_PRESET = (25, 1.5, 6, 1.8, 384)

# At most this many sentence-level combinations are assembled and classified.
_MAX_COMBINED_VERSIONS = 50
# "machine-generated" versions scoring below this are still kept.
_MACHINE_CONFIDENCE_CUTOFF = 0.98
# Number of lowest-scoring versions returned when nothing passes the filter.
_FALLBACK_VERSION_COUNT = 5


# NOTE(review): the commented-out earlier revision at the top of this file
# decorated this function with @spaces.GPU -- confirm whether the deployment
# still needs that decorator.
def generate_paraphrases(text, setting, output_format):
    """Paraphrase *text* sentence by sentence and filter the results.

    Each sentence is paraphrased with the paraphraser model using the sampling
    preset selected by *setting*, and every paraphrase is passed through
    ``correct_grammar``. Per-sentence paraphrases are then combined (first
    sentence's choice varying slowest), capped at 50 combinations, and each
    combination is scored with ``classify_text``.

    Args:
        text: The input text to paraphrase.
        setting: Preset selector; 1-4 pick the matching preset, anything else
            uses the strongest preset.
        output_format: ``"text"`` for the human-readable report; any other
            value yields the JSON document.

    Returns:
        A 2-tuple ``(details, selected)``. ``details`` is either the text
        report or ``json.dumps`` of the collected results. ``selected`` is the
        chosen versions joined by blank lines: those labelled
        ``human-produced`` or ``machine-generated`` with a score below 0.98,
        or, when none qualify, the five lowest-scoring versions.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list being changed.
    from itertools import islice

    (
        num_return_sequences,
        repetition_penalty,
        no_repeat_ngram_size,
        temperature,
        max_length,
    ) = _GENERATION_PRESETS.get(setting, _DEFAULT_GENERATION_PRESET)
    top_k = 50
    top_p = 0.95
    length_penalty = 1.0

    sentences = splitter.split(text)
    all_sentence_paraphrases = []

    # Report fragments are collected and joined once at the end instead of
    # growing a string with repeated +=.
    report = ["Original text:\n" + text + "\n\n", "Paraphrased versions:\n"]
    json_output = {
        "original_text": text,
        "paraphrased_versions": [],
        "combined_versions": [],
        "human_like_versions": [],
    }

    for index, sentence in enumerate(sentences, 1):
        inputs = paraphraser_tokenizer(
            f'paraphraser: {sentence}',
            return_tensors="pt",
            padding="longest",
            truncation=True,
            max_length=max_length,
        ).to(device)

        # Sample several paraphrases of this sentence with the chosen preset.
        outputs = paraphraser_model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            num_return_sequences=num_return_sequences,
            repetition_penalty=repetition_penalty,
            no_repeat_ngram_size=no_repeat_ngram_size,
            temperature=temperature,
            max_length=max_length,
            top_k=top_k,
            top_p=top_p,
            do_sample=True,
            early_stopping=False,
            length_penalty=length_penalty,
        )
        paraphrases = paraphraser_tokenizer.batch_decode(outputs, skip_special_tokens=True)
        corrected_paraphrases = [correct_grammar(paraphrase) for paraphrase in paraphrases]

        report.append(f"Original sentence {index}: {sentence}\n")
        report.extend(
            f" Paraphrase {number}: {paraphrase}\n"
            for number, paraphrase in enumerate(corrected_paraphrases, 1)
        )
        report.append("\n")

        json_output["paraphrased_versions"].append({
            f"original_sentence_{index}": sentence,
            "paraphrases": corrected_paraphrases,
        })
        all_sentence_paraphrases.append(corrected_paraphrases)

    # Take only the first combinations lazily. Materialising the full
    # Cartesian product grows exponentially with the number of sentences.
    combined_versions = [
        " ".join(combination)
        for combination in islice(
            product(*all_sentence_paraphrases), _MAX_COMBINED_VERSIONS
        )
    ]
    json_output["combined_versions"] = combined_versions

    # Classify every combined version exactly once; the results are reused by
    # both the filter and the fallback below.
    classified = [
        (version, *classify_text(version)) for version in combined_versions
    ]

    report.append("\nCombined paraphrased versions:\n")
    for number, (version, label, score) in enumerate(classified, 1):
        report.append(f"Version {number}:\n{version}\n")
        report.append(f"Classification: {label} (confidence: {score:.2%})\n\n")

    human_versions = [
        (version, label, score)
        for version, label, score in classified
        if label == "human-produced"
        or (label == "machine-generated" and score < _MACHINE_CONFIDENCE_CUTOFF)
    ]

    report.append("\nHuman-like or Less Confident Machine-generated versions:\n")
    for number, (version, label, score) in enumerate(human_versions, 1):
        report.append(f"Version {number}:\n{version}\n")
        report.append(f"Classification: {label} (confidence: {score:.2%})\n\n")

    # The JSON lists only versions that passed the filter; fallback picks
    # below are not added to it.
    json_output["human_like_versions"] = [
        {"version": version, "label": label, "confidence_score": score}
        for version, label, score in human_versions
    ]

    # Nothing passed the filter: fall back to the lowest-scoring versions.
    if not human_versions:
        human_versions = sorted(classified, key=lambda entry: entry[2])[
            :_FALLBACK_VERSION_COUNT
        ]
        report.append(
            "\nNo human-like versions found. Showing top 5 least confident machine-generated versions:\n"
        )
        for number, (version, label, score) in enumerate(human_versions, 1):
            report.append(f"Version {number}:\n{version}\n")
            report.append(f"Classification: {label} (confidence: {score:.2%})\n\n")

    selected_text = "\n\n".join(version for version, _, _ in human_versions)
    if output_format == "text":
        return "".join(report), selected_text
    return json.dumps(json_output, indent=2), selected_text
# Gradio UI: three inputs (text, preset slider, output format) feeding
# generate_paraphrases, and two text outputs for its returned pair.
_input_components = [
    gr.Textbox(lines=5, label="Input Text"),
    gr.Slider(minimum=1, maximum=5, step=1, label="Readability to Human-like Setting"),
    gr.Radio(["text", "json"], label="Output Format"),
]
_output_components = [
    gr.Textbox(lines=20, label="Detailed Paraphrases and Classifications"),
    gr.Textbox(lines=10, label="Human-like or Less Confident Machine-generated Paraphrases"),
]
_description = (
    "Enter a text, select a setting from readable to human-like, and choose "
    "the output format to generate diverse paraphrased versions. Combined "
    "versions are classified, and those detected as human-produced or less "
    "confidently machine-generated are presented in the final output."
)

iface = gr.Interface(
    fn=generate_paraphrases,
    inputs=_input_components,
    outputs=_output_components,
    title="Advanced Diverse Paraphraser with Human-like Filter",
    description=_description,
)

# Start serving the interface.
iface.launch()