# Spaces: Runtime error
# (Hosting-page status text captured along with the source; not part of the program.)
import os | |
from dotenv import load_dotenv | |
from langchain.document_loaders import GithubFileLoader | |
# from langchain.embeddings import HuggingFaceEmbeddings | |
from langchain_huggingface import HuggingFaceEmbeddings | |
from langchain_community.vectorstores import FAISS | |
from langchain_text_splitters import CharacterTextSplitter | |
# Pull variables from a local .env file into the process environment.
load_dotenv()

# GitHub token used by the loader; None when GITHUB_ACCESS_TOKEN is not set.
GITHUB_ACCESS_TOKEN = os.environ.get("GITHUB_ACCESS_TOKEN")

# Repository to index, addressed as USER/REPO.
USER = "heaversm"
REPO = "gdrive-docker"

# Prefix used when building browser links to files in the repository.
GITHUB_BASE_URL = "https://github.com/"
def get_similar_files(query, db, embeddings, k=4):
    """Return the stored chunks most similar to ``query`` with their scores.

    Args:
        query: Text (here, a code snippet) to search for.
        db: Vector store exposing ``similarity_search_with_score``.
        embeddings: Unused; kept so existing callers keep working. The store
            is expected to embed the query with the model it was built with.
        k: Maximum number of results to request. The default of 4 mirrors
            the LangChain vector-store default, so omitting it behaves as
            before.

    Returns:
        Whatever ``db.similarity_search_with_score`` returns; the caller in
        this file iterates it as ``(document, score)`` pairs.
    """
    return db.similarity_search_with_score(query, k=k)
def get_hugging_face_model(model_name="mchochlov/codebert-base-cd-ft"):
    """Build the Hugging Face embedding model used to index and query code.

    Args:
        model_name: Hugging Face model identifier to load. Defaults to the
            model this script has always used, so calling with no arguments
            behaves as before.

    Returns:
        A ``HuggingFaceEmbeddings`` instance for ``model_name``.
    """
    return HuggingFaceEmbeddings(model_name=model_name)
# Branch to index. The same value is used for the file link printed at the
# end, so the link always points at the content that was actually loaded.
BRANCH = "main"

loader = GithubFileLoader(
    # repo is USER/REPO
    repo=f"{USER}/{REPO}",
    branch=BRANCH,
    access_token=GITHUB_ACCESS_TOKEN,
    github_api_url="https://api.github.com",
    # load all python and typescript files
    file_filter=lambda file_path: file_path.endswith((".py", ".ts")),
)
documents = loader.load()

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)
if not docs:
    # Building a FAISS index from zero documents fails with an opaque
    # IndexError; stop here with a message that names the real problem.
    raise SystemExit(
        f"No .py or .ts files were loaded from {USER}/{REPO} "
        f"(branch {BRANCH}); nothing to index."
    )

# NOTE: despite its name this is the embeddings *model*, not a vector.
embedding_vector = get_hugging_face_model()
db = FAISS.from_documents(docs, embedding_vector)
model_name = "mchochlov/codebert-base-cd-ft"

# Code snippet to look for in the indexed repository.
query = """
def create_app():
    app = connexion.FlaskApp(__name__, specification_dir="../.openapi")
    app.add_api(
        API_VERSION, resolver=connexion.resolver.RelativeResolver("provider.app")
    )
"""
results_with_scores = get_similar_files(query, db, embedding_vector)
print("retrieved!!!")
print(f"Number of results: {len(results_with_scores)}")
# score is a distance score, the lower the better
for doc, score in results_with_scores:
    print(f"Metadata: {doc.metadata}, Score: {score}")

if not results_with_scores:
    # Without this guard the [0] lookups below raise IndexError.
    raise SystemExit("The similarity search returned no results.")

# Results are assumed to be ordered best match first, so index 0 is the top hit.
top_doc, top_file_score = results_with_scores[0]
top_file_path = top_doc.metadata['path']
top_file_content = top_doc.page_content
top_file_link = f"{GITHUB_BASE_URL}{USER}/{REPO}/blob/{BRANCH}/{top_file_path}"
print(f"Top file link: {top_file_link}")