Spaces:
Sleeping
Sleeping
File size: 3,338 Bytes
9b744c5 ac33554 9b744c5 ac33554 9b744c5 b42fea9 9b744c5 b42fea9 9b744c5 c1fc690 9b744c5 c1fc690 9b744c5 b42fea9 9b744c5 b42fea9 9b744c5 b42fea9 9b744c5 c1fc690 b42fea9 c1fc690 9b744c5 c1fc690 9b744c5 b42fea9 9b744c5 6b0b6fd c1fc690 9b744c5 c1fc690 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
import json
import argparse
import requests
import numpy as np
from sentence_transformers import SentenceTransformer
from .defaults import OWNER, REPO, TOKEN
# Name of the sentence-transformers checkpoint used to embed queries.
model_id = "all-mpnet-base-v2"
# Instantiated once at import time; get_similar_issues() calls model.encode() on it.
model = SentenceTransformer(model_id)
def load_embeddings(path="issue_embeddings.npy"):
    """
    Function to load embeddings from file

    Args:
        path: Location of the ``.npy`` file to read. Defaults to
            ``"issue_embeddings.npy"``, resolved against the current
            working directory.

    Returns:
        The array stored in the file, exactly as returned by ``np.load``.
    """
    return np.load(path)
def load_issue_information(issue_type="issue"):
    """
    Function to load issue information from file

    Reads two JSON files from the current working directory:
    ``embedding_index_to_{issue_type}.json`` and ``issues_dict.json``.

    Args:
        issue_type: Suffix used to pick the index-mapping file.

    Returns:
        A tuple ``(embedding_index_to_issue, issues)`` holding the parsed
        contents of the two files, in that order.
    """
    # JSON is UTF-8 by specification; be explicit so decoding does not depend
    # on the platform's locale encoding.
    with open(f"embedding_index_to_{issue_type}.json", "r", encoding="utf-8") as f:
        embedding_index_to_issue = json.load(f)

    with open("issues_dict.json", "r", encoding="utf-8") as f:
        issues = json.load(f)

    return embedding_index_to_issue, issues
def cosine_similarity(a, b):
    """
    Compute the pairwise cosine similarity between the rows of ``a`` and ``b``.

    Args:
        a: NumPy array, either 1-D (a single vector) or 2-D (one vector per row).
        b: NumPy array, either 1-D (a single vector) or 2-D (one vector per row).

    Returns:
        A 2-D array of shape ``(rows of a, rows of b)`` where entry ``(i, j)``
        is the cosine similarity between row ``i`` of ``a`` and row ``j`` of
        ``b``. 1-D inputs are treated as a single row.
    """
    if a.ndim == 1:
        a = a.reshape(1, -1)
    if b.ndim == 1:
        b = b.reshape(1, -1)

    a_norms = np.linalg.norm(a, axis=1)
    b_norms = np.linalg.norm(b, axis=1)
    # Broadcast to an outer product so entry (i, j) is divided by
    # |a_i| * |b_j|; an element-wise product of the two norm vectors is only
    # correct when one of the inputs has a single row.
    denominator = a_norms[:, np.newaxis] * b_norms[np.newaxis, :]
    return np.dot(a, b.T) / denominator
def get_issue(issue_no, token=TOKEN, owner=OWNER, repo=REPO, timeout=30):
    """
    Function to get issue from GitHub

    Args:
        issue_no: Number of the issue to fetch.
        token: Value sent verbatim as the ``Authorization`` header.
            NOTE(review): no scheme prefix is added here, so the token
            presumably already includes one - confirm against ``defaults``.
        owner: Owner of the repository.
        repo: Name of the repository.
        timeout: Seconds to wait for GitHub before giving up, passed to
            ``requests.get``. Without it a stalled connection would block
            forever.

    Returns:
        The decoded JSON body of the response.

    Raises:
        ValueError: If the response status code is not 200.
        requests.exceptions.Timeout: If GitHub does not answer within
            ``timeout`` seconds.
    """
    url = f"https://api.github.com/repos/{owner}/{repo}/issues/{issue_no}"
    headers = {
        "Accept": "application/vnd.github+json",
        "Authorization": f"{token}",
        "X-GitHub-Api-Version": "2022-11-28",
        "User-Agent": "amyeroberts",
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        raise ValueError(f"Request failed with status code {response.status_code}")
    return response.json()
def get_similar_issues(issue_no, query, top_k=5, token=TOKEN, owner=OWNER, repo=REPO, issue_type="issue"):
    """
    Function to find similar issues

    Exactly one of ``issue_no`` and ``query`` must be given. When ``issue_no``
    is given, the issue is fetched from GitHub and its title and body are used
    as the query text.

    Args:
        issue_no: Number of a GitHub issue to use as the query, or ``None``.
        query: Free-text query, or ``None``.
        top_k: Number of most similar entries to return.
        token: Forwarded to ``get_issue``.
        owner: Forwarded to ``get_issue``.
        repo: Forwarded to ``get_issue``.
        issue_type: Forwarded to ``load_issue_information``.

    Returns:
        A list of at most ``top_k`` entries from the loaded issues mapping,
        ordered from most to least similar.

    Raises:
        ValueError: If both or neither of ``issue_no`` and ``query`` are given.
    """
    if issue_no is not None and query is not None:
        raise ValueError("Only one of issue_no or query can be provided")
    if issue_no is None and query is None:
        raise ValueError("One of issue_no or query must be provided")

    if issue_no is not None:
        issue = get_issue(issue_no, token=token, owner=owner, repo=repo)
        # GitHub returns a null body for issues without a description; treat
        # it as empty text instead of failing on str + None.
        query = issue["title"] + "\n" + (issue["body"] or "")

    query_embedding = model.encode(query).reshape(1, -1)
    embeddings = load_embeddings()

    # Calculate the cosine similarity between the query and all the issues
    cosine_similarities = cosine_similarity(query_embedding, embeddings)

    # argsort is ascending, so reverse the single result row to get the
    # most similar indices first.
    most_similar_indices = np.argsort(cosine_similarities)[0][::-1]

    embedding_index_to_issue, issues = load_issue_information(issue_type=issue_type)

    # The index mapping is loaded from JSON, so its keys are strings.
    return [
        issues[embedding_index_to_issue[str(index)]]
        for index in most_similar_indices[:top_k]
    ]
if __name__ == "__main__":
    # Command-line entry point: every option maps onto a keyword argument of
    # get_similar_issues(), so the parsed namespace can be splatted directly.
    # No empty-named positional is registered: argparse rejects it, and its
    # key would not match any parameter of get_similar_issues().
    parser = argparse.ArgumentParser()
    parser.add_argument("--issue_no", type=int, default=None)
    parser.add_argument("--query", type=str, default=None)
    parser.add_argument("--top_k", type=int, default=5)
    parser.add_argument("--token", type=str, default=TOKEN)
    parser.add_argument("--owner", type=str, default=OWNER)
    parser.add_argument("--repo", type=str, default=REPO)
    args = parser.parse_args()
    similar_issues = get_similar_issues(**vars(args))
    # Emit the result so that running the script actually shows the matches.
    print(json.dumps(similar_issues, indent=2))
|