Spaces:

tbitai
/

bayes-or-spam

Running

App Files Files Community

bayes-or-spam / app.py

tbitai

Fix unbiasing

e33f7e6 verified about 1 month ago

raw

history blame

2.72 kB

	import gradio as gr
	from huggingface_hub import hf_hub_download
	import json
	import tensorflow as tf
	import numpy as np

	# Fetch the model's probability table from the Hugging Face Hub; the call
	# yields a local file path that is opened below.
	model_probs_path = hf_hub_download(repo_id="tbitai/bayes-enron1-spam", filename="probs.json")
	# JSON is UTF-8 by specification, so the encoding is given explicitly instead
	# of relying on the platform's locale default.
	with open(model_probs_path, encoding="utf-8") as f:
	    # Used further down as a mapping from word to probability.
	    model_probs = json.load(f)

	# Key in model_probs whose value serves as the fallback for words missing from the table.
	UNK = '[UNK]'

	def tokenize(text):
	    """Split ``text`` into a sequence of words.

	    Delegates to Keras' ``text_to_word_sequence`` with its default settings.
	    """
	    word_sequence = tf.keras.preprocessing.text.text_to_word_sequence(text)
	    return word_sequence

	def combine(probs):
	    """Merge individual probabilities into a single one.

	    Computes ``prod(p) / (prod(p) + prod(1 - p))`` over ``probs``.

	    Returns ``0`` as soon as any probability is exactly zero, and ``0.5``
	    when both products are zero.
	    """
	    for p in probs:
	        if p == 0:
	            return 0
	    product = np.prod(probs)
	    complement_product = np.prod([1 - p for p in probs])
	    denominator = product + complement_product
	    # Even without an exact zero among the inputs, floating point arithmetic
	    # can drive both products to zero.
	    if denominator == 0:
	        # Treat the two products as equally small.
	        return 0.5
	    return product / denominator

	def get_interesting_probs(probs, intr_threshold):
	    """Return the first ``intr_threshold`` entries of ``probs`` ranked by distance from 0.5.

	    The ranking goes from the largest distance to the smallest; entries at
	    the same distance keep their input order. ``probs`` itself is not modified.
	    """
	    def distance_from_neutral(p):
	        return abs(p - 0.5)

	    ranked = list(probs)
	    ranked.sort(key=distance_from_neutral, reverse=True)
	    return ranked[:intr_threshold]

	def unbias(p):
	    """Map the probability ``p`` to ``2p / (p + 1)``.

	    NOTE(review): presumably this undoes the bias of Graham's formula that
	    the UI's "Unbias" checkbox refers to -- confirm against the project docs.
	    """
	    doubled = 2 * p
	    return doubled / (p + 1)

	def unbias_or_noop(p, noop=False):
	    """Return ``unbias(p)``, or ``p`` untouched when ``noop`` is truthy."""
	    if noop:
	        return p
	    return unbias(p)

	def predict_bayes(text, intr_threshold, unbiased=False):
	    """Estimate the spam probability of ``text`` from the Bayes word-probability table.

	    Args:
	        text: Raw email text; it is split into words with ``tokenize``.
	        intr_threshold: How many of the most interesting word probabilities
	            (those farthest from 0.5) take part in the combination.
	            Integral floats such as ``15.0`` are accepted.
	        unbiased: When true, every word probability is passed through
	            ``unbias`` before the selection.

	    Returns:
	        The combined probability computed by ``combine``.
	    """
	    # Hoisted out of the loop: the fallback for words missing from the table
	    # is the same for every word.
	    unknown_prob = model_probs[UNK]
	    probs = [
	        unbias_or_noop(model_probs.get(word, unknown_prob), noop=not unbiased)
	        for word in tokenize(text)
	    ]
	    # A UI slider may deliver the threshold as a float, which is not a valid
	    # slice bound; normalise it to an int first.
	    interesting_probs = get_interesting_probs(probs, int(intr_threshold))
	    return combine(interesting_probs)

	# Default number of "most interesting" word probabilities used in the calculation.
	DEFAULT_INTR_THRESHOLD = 15

	# Display name of each selectable model, and the list offered in the UI.
	BAYES = "Bayes Enron1 spam"
	MODELS = [BAYES]

	def predict(model, unbiased, intr_threshold, input_txt):
	    """Run the selected model on an email and return its spam probability.

	    Args:
	        model: Name of the model to use; must be one of ``MODELS``.
	        unbiased: Forwarded to the model as its ``unbiased`` option.
	        intr_threshold: Forwarded to the model as its interestingness threshold.
	        input_txt: The email text to classify.

	    Returns:
	        The probability produced by the selected model.

	    Raises:
	        ValueError: If ``model`` is not a known model name. Previously such a
	            call fell through and silently returned ``None``.
	    """
	    if model == BAYES:
	        return predict_bayes(input_txt, unbiased=unbiased, intr_threshold=intr_threshold)
	    raise ValueError(f"Unknown model: {model!r}")

	# Example email shared by the biased and the unbiased example rows below.
	nerissa_email = "stop the aging clock nerissa"

	# Web UI definition. The inputs are listed in the order of predict's
	# parameters: model, unbiased, intr_threshold, input_txt.
	demo = gr.Interface(
	    fn=predict,
	    inputs=[
	        gr.Dropdown(choices=MODELS, value=BAYES, label="Model"),
	        gr.Checkbox(label="Unbias", info="Correct Graham's bias?"),
	        gr.Slider(
	            minimum=1,
	            maximum=DEFAULT_INTR_THRESHOLD + 5,
	            step=1,
	            value=DEFAULT_INTR_THRESHOLD,
	            label="Interestingness threshold",
	            info=f"How many of the most interesting words to select in the probability calculation? ({DEFAULT_INTR_THRESHOLD} for Graham)",
	        ),
	        gr.TextArea(label="Email"),
	    ],
	    outputs=[gr.Number(label="Spam probability")],
	    title="Bayes or Spam?",
	    description="Choose and configure your model, and predict if your email is a spam! 📨<br>COMING SOON: NN and LLM models.",
	    # Each row supplies one value per input, in the same order as `inputs`.
	    examples=[
	        [BAYES, False, DEFAULT_INTR_THRESHOLD, "enron actuals for june 26, 2000"],
	        [BAYES, False, DEFAULT_INTR_THRESHOLD, nerissa_email],
	        [BAYES, True, DEFAULT_INTR_THRESHOLD, nerissa_email],
	    ],
	    article="This is a demo of the models in the [Bayes or Spam?](https://github.com/tbitai/bayes-or-spam) project.",
	)

	# Start the app only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()