Spaces:
Runtime error
Runtime error
File size: 5,902 Bytes
4fdeb0d 0f1eb8b 4fdeb0d 0f1eb8b fb9c839 0f1eb8b 2584a60 0f1eb8b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 |
import pandas as pd
import numpy as np
from zipfile import ZipFile
import tensorflow as tf
from tensorflow import keras
from pathlib import Path
import matplotlib.pyplot as plt
import gradio as gr
from huggingface_hub import from_pretrained_keras
from collections import defaultdict
import math
import networkx as nx
# Pretrained node2vec model; its "item_embeddings" layer is read further down.
model = from_pretrained_keras("bpHigh/Node2Vec_MovieLens")

# Download the MovieLens "ml-latest-small" archive. HTTPS is used so the
# archive cannot be altered in transit before it is extracted locally.
movielens_data_file_url = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
movielens_zipped_file = keras.utils.get_file(
    "ml-latest-small.zip", movielens_data_file_url, extract=False
)
keras_datasets_path = Path(movielens_zipped_file).parents[0]
movielens_dir = keras_datasets_path / "ml-latest-small"

# Only extract the data the first time the script is run.
if not movielens_dir.exists():
    # Bound to `archive` (not `zip`) so the builtin zip() is not shadowed.
    with ZipFile(movielens_zipped_file, "r") as archive:
        print("Extracting all the files now...")
        archive.extractall(path=keras_datasets_path)
        print("Done!")

# Read the movies CSV and turn each numeric id into a "movie_<id>" string.
movies = pd.read_csv(movielens_dir / "movies.csv")
movies["movieId"] = movies["movieId"].map("movie_{}".format)

# Load the ratings, make sure `rating` is floating point, and apply the same
# "movie_<id>" naming so both frames share one id space.
ratings = pd.read_csv(movielens_dir / "ratings.csv")
ratings["rating"] = ratings["rating"].astype(float)
ratings["movieId"] = ratings["movieId"].map("movie_{}".format)
# Implement two utility functions for the movies DataFrame.
def get_movie_title_by_id(movieId):
    """Return the title of the first row of `movies` whose movieId matches.

    Args:
        movieId: Identifier compared against the `movieId` column of the
            module-level `movies` DataFrame.

    Returns:
        The `title` value of the first matching row.

    Raises:
        IndexError: If no row has the given movieId.
    """
    matching_titles = movies.loc[movies["movieId"] == movieId, "title"]
    return matching_titles.tolist()[0]
def get_movie_id_by_title(title):
    """Return the movieId of the first row of `movies` with the given title.

    Args:
        title: Title compared against the `title` column of the module-level
            `movies` DataFrame.

    Returns:
        The `movieId` value of the first matching row.

    Raises:
        IndexError: If no row has the given title.
    """
    matching_ids = movies.loc[movies["title"] == title, "movieId"]
    return matching_ids.tolist()[0]
# Create weighted edges between movies.
min_rating = 5
pair_frequency = defaultdict(int)
item_frequency = defaultdict(int)

# Keep only the ratings that are greater than or equal to min_rating.
rated_movies = ratings[ratings.rating >= min_rating]

# Group instances by user.
movies_grouped_by_users = list(rated_movies.groupby("userId"))

# Count every movie and every unordered movie pair per user.
# NOTE: the insertion order of `pair_frequency` decides the node order of the
# graph and therefore the vocabulary indices below, which are later used to
# index `movie_embeddings` -- keep the iteration order of this loop unchanged.
for _, user_ratings in movies_grouped_by_users:
    current_movies = list(user_ratings["movieId"])
    for position, movie in enumerate(current_movies):
        item_frequency[movie] += 1
        for other_movie in current_movies[position + 1:]:
            # Store each pair under a canonical (smaller, larger) key.
            x = min(movie, other_movie)
            y = max(movie, other_movie)
            pair_frequency[(x, y)] += 1

# Create the graph with the nodes and the edges.
min_weight = 10
D = math.log(sum(item_frequency.values()))

# Create the movies undirected graph.
movies_graph = nx.Graph()

# Add weighted edges between movies.
# This automatically adds the movie nodes to the graph.
for (x, y), xy_frequency in pair_frequency.items():
    x_frequency = item_frequency[x]
    y_frequency = item_frequency[y]
    # Pointwise mutual information of the pair, scaled by its co-occurrence count.
    pmi = math.log(xy_frequency) - math.log(x_frequency) - math.log(y_frequency) + D
    weight = pmi * xy_frequency
    # Only include edges with weight >= min_weight.
    if weight >= min_weight:
        movies_graph.add_edge(x, y, weight=weight)

# Create vocabulary and a mapping from tokens to integer indices.
# Index 0 is the "NA" placeholder; real movies start at index 1.
vocabulary = ["NA"] + list(movies_graph.nodes)
vocabulary_lookup = {token: idx for idx, token in enumerate(vocabulary)}

# Analyze the learnt embeddings.
movie_embeddings = model.get_layer("item_embeddings").get_weights()[0]

# Titles offered for selection. Only movies that made it into the graph have a
# vocabulary entry, so listing every title from `movies` would let a user pick
# a movie whose lookup in `vocabulary_lookup` raises KeyError. Titles are
# de-duplicated keeping the first row, because get_movie_id_by_title resolves
# a title to the first matching row.
first_movie_per_title = movies.drop_duplicates(subset="title", keep="first")
movie_titles = list(
    first_movie_per_title.loc[
        first_movie_per_title["movieId"].isin(vocabulary), "title"
    ]
)
def find_related_movies(movie_title, k):
    """Return the k movies whose embeddings are most similar to a given movie.

    Args:
        movie_title: Title to look up via get_movie_id_by_title. May be None
            or empty when nothing was selected.
        k: Number of related movies to return; converted with int(). May be
            None when the field was left empty.

    Returns:
        A pandas DataFrame with a single 'Related Movies' column holding up to
        k titles, most similar first. The queried movie itself is not
        excluded. The frame is empty when the title is missing or unknown,
        when the movie has no vocabulary entry, or when k is None or <= 0.
    """
    no_results = pd.DataFrame({'Related Movies': []})

    # Guard against an untouched dropdown / number field.
    if not movie_title or k is None:
        return no_results
    k = int(k)
    if k <= 0:
        return no_results

    # An unknown title raises IndexError in the helper; a movie that never
    # received a graph edge has no vocabulary entry and raises KeyError.
    try:
        movie_id = get_movie_id_by_title(movie_title)
        token_id = vocabulary_lookup[movie_id]
    except (IndexError, KeyError):
        return no_results

    query_embeddings = np.array([movie_embeddings[token_id]])

    # Normalize each row separately (axis=1) so the product is the cosine
    # similarity; without `axis`, l2_normalize divides by the norm of the
    # whole matrix and the scores are merely rescaled dot products.
    similarities = tf.linalg.matmul(
        tf.math.l2_normalize(query_embeddings, axis=1),
        tf.math.l2_normalize(movie_embeddings, axis=1),
        transpose_b=True,
    )

    # Ask for one spare candidate so that dropping the "NA" placeholder still
    # leaves k results, and never request more rows than the table holds
    # (top_k rejects a k larger than the last dimension).
    num_candidates = min(k + 1, int(movie_embeddings.shape[0]))
    _, indices = tf.math.top_k(similarities, num_candidates)
    similar_tokens = indices.numpy().tolist()[0]

    related_movies = []
    for token in similar_tokens:
        if token == 0:
            # Index 0 is the "NA" placeholder token, which has no title.
            continue
        related_movies.append(get_movie_title_by_id(vocabulary[token]))
        if len(related_movies) == k:
            break

    return pd.DataFrame({'Related Movies': related_movies})
# Gradio UI: a movie dropdown and a count feed find_related_movies, whose
# result is shown in a read-only table.

# NOTE(review): gr.Box is believed to be absent from Gradio 4.x -- confirm
# against the pinned version. Use it when present, otherwise gr.Group.
_container = getattr(gr, "Box", None) or gr.Group

demo = gr.Blocks()
with demo:
    gr.Markdown("""
<div>
<h1 style='text-align: center'>Find Related Movies</h1>
<h2>Choose the specific movie from the dropdown and see the top k related Movies</h2>
Note: The dropdown menu provides movie options from the Movielens dataset.
</div>
""")
    with _container():
        gr.Markdown(
"""
### Input
#### Select a movie to find other related movies.
""")
        inp1 = gr.Dropdown(movie_titles)
        gr.Markdown(
"""
<br>
""")
        gr.Markdown(
"""
#### Number of related movies you wanna find?
""")
        # Start with a usable count so an untouched field does not send None
        # to the callback (the initial value of 5 is an arbitrary choice).
        inp2 = gr.Number(value=5)
        btn = gr.Button("Run")
    with _container():
        gr.Markdown(
"""
### Output
#### Top K related movies.
""")
        # Header matches the column name of the DataFrame returned by
        # find_related_movies.
        df1 = gr.DataFrame(headers=["Related Movies"], datatype=["str"], interactive=False)
    with gr.Row():
        gr.Markdown(
"""
<h4>Credits</h4>
Author: <a href="https://www.linkedin.com/in/khalid-salama-24403144/"> Khalid Salama</a>.<br>
Based on the following Keras example <a href="https://keras.io/examples/graph/node2vec_movielens/"> Graph representation learning with node2vec</a> by Khalid Salama<br>
Check out the model <a href="https://huggingface.co/keras-io/Node2Vec_MovieLens">here</a>
"""
        )
    # Event wiring has to happen inside the Blocks context.
    btn.click(fn=find_related_movies, inputs=[inp1, inp2], outputs=df1)

demo.launch(debug=True)