"""Scrape tagged papers from Zotero/arXiv, embed them with SciNCL, and
recommend new arXiv submissions via a Pinecone similarity index."""

import logging
import os

import arxiv
import pandas as pd
import requests
from pinecone import Pinecone, ServerlessSpec

# Run relative to this script's directory so the CSV cache resolves consistently.
script_dir = os.path.dirname(os.path.abspath(__file__))
os.chdir(script_dir)


def get_zotero_ids(api_key, library_id, tag):
    """Fetch the arXiv IDs of all Zotero items carrying the given tag."""
    base_url = 'https://api.zotero.org'
    suffix = '/users/' + library_id + '/items?tag=' + tag
    header = {'Authorization': 'Bearer ' + api_key}
    request = requests.get(base_url + suffix, headers=header)
    return [data['data']['archiveID'].replace('arXiv:', '') for data in request.json()]


def get_arxiv_papers(ids=None, category=None, comment=None):
    """Fetch papers either by explicit ID list or by category (optionally filtered by comment)."""
    logging.getLogger('arxiv').setLevel(logging.WARNING)
    # Validate before touching `ids`; otherwise len(None) raises first.
    if ids is None and category is None:
        raise ValueError('not a valid query')
    client = arxiv.Client()
    if category is None:
        search = arxiv.Search(
            id_list=ids,
            max_results=len(ids),
        )
    else:
        if comment is None:
            custom_query = f'cat:{category}'
        else:
            custom_query = f'cat:{category} AND co:{comment}'
        search = arxiv.Search(
            query=custom_query,
            max_results=15,
            sort_by=arxiv.SortCriterion.SubmittedDate,
        )
    # Materialize the results once; calling client.results(search) per column
    # would re-run the query four times.
    results = list(client.results(search))
    df = pd.DataFrame({
        'Title': [result.title for result in results],
        'Abstract': [result.summary.replace('\n', ' ') for result in results],
        'Date': [result.published.date().strftime('%Y-%m-%d') for result in results],
        'id': [result.entry_id for result in results],
    })
    if ids:
        df.to_csv('arxiv-scrape.csv', index=False)
    return df


def get_hf_embeddings(api_key, df):
    """Embed title+abstract pairs with SciNCL via the HF Inference API."""
    # SciNCL expects title and abstract joined by the [SEP] token.
    title_abs = [title + '[SEP]' + abstract for title, abstract in zip(df['Title'], df['Abstract'])]
    API_URL = "https://api-inference.huggingface.co/models/malteos/scincl"
    headers = {"Authorization": f"Bearer {api_key}"}
    response = requests.post(API_URL, headers=headers, json={"inputs": title_abs, "options": {"wait_for_model": False}})
    if response.status_code == 503:
        # Model is cold; retry and block until it has loaded.
        response = requests.post(API_URL, headers=headers, json={"inputs": title_abs, "options": {"wait_for_model": True}})
    embeddings = response.json()
    return embeddings, len(embeddings[0])


def upload_to_pinecone(api_key, index, namespace, embeddings, dim, df):
    """Create the Pinecone index if needed and upsert the embeddings."""
    vectors = [{'id': df['id'][i], 'values': embeddings[i]} for i in range(len(embeddings))]
    pc = Pinecone(api_key=api_key)
    if index in pc.list_indexes().names():
        logging.warning(f'Index name : {index} already exists.')
        return f'Index name : {index} already exists'
    pc.create_index(
        name=index,
        dimension=dim,
        metric="cosine",
        deletion_protection="disabled",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1',
        ),
    )
    index = pc.Index(index)
    return index.upsert(vectors=vectors, namespace=namespace)


def get_new_papers(df):
    """Return only the rows of df not already present in the cached scrape."""
    df_main = pd.read_csv('arxiv-scrape.csv')
    df.reset_index(drop=True, inplace=True)
    union_df = df.merge(df_main, how='left', indicator=True)
    # Reset the index so downstream positional lookups (df['id'][i]) stay valid
    # after filtering.
    df = union_df[union_df['_merge'] == 'left_only'].drop(columns=['_merge']).reset_index(drop=True)
    if df.empty:
        return 'No New Papers Found'
    else:
        # df_main = pd.concat([df_main, df], ignore_index=True)  # persistence of recommended papers removed for demo
        # df_main.drop_duplicates(inplace=True)
        # df_main.to_csv('arxiv-scrape.csv', index=False)
        return df


def recommend_papers(api_key, index, namespace, embeddings, df, threshold):
    """Score each candidate paper against the index and return those above threshold."""
    pc = Pinecone(api_key=api_key)
    if index in pc.list_indexes().names():
        index = pc.Index(index)
    else:
        raise ValueError(f"{index} doesn't exist.\nProject isn't initialized properly")
    results = []
    score_threshold = threshold
    for i, embedding in enumerate(embeddings):
        query = embedding
        result = index.query(namespace=namespace, vector=query, top_k=3, include_values=False)
        # Aggregate similarity over the top-3 matches; recommend if above threshold.
        sum_score = sum(match['score'] for match in result['matches'])
        if sum_score > score_threshold:
            results.append(f"Paper-URL : [{df['id'][i]}]({df['id'][i]}) with score: {sum_score / 3}\n")
") if results: return '\n'.join(results) else: return 'No Interesting Paper'