```python
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Qdrant

from gh_issue_loader import GHLoader
from config import DB_CONFIG

CHUNK_SIZE = 500


def get_text_chunk(docs):
    # Split the loaded documents into fixed-size chunks with no overlap.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    texts = text_splitter.split_documents(docs)
    return texts


def store(texts):
    # Embed each chunk with multilingual-e5-large (on GPU) and write the
    # vectors to the configured Qdrant collection.
    model_name = "intfloat/multilingual-e5-large"
    model_kwargs = {"device": "cuda"}
    encode_kwargs = {"normalize_embeddings": False}
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs,
    )
    db_url, db_api_key, db_collection_name = DB_CONFIG
    _ = Qdrant.from_documents(
        texts,
        embeddings,
        url=db_url,
        api_key=db_api_key,
        collection_name=db_collection_name,
    )


def main(repo_name: str, path: str) -> None:
    loader = GHLoader(repo_name, path)
    docs = loader.load()
    texts = get_text_chunk(docs)
    store(texts)


if __name__ == "__main__":
    """
    $ python store.py "REPO_NAME" "FILE_PATH"
    $ python store.py cocoa data/cocoa-issues.json
    """
    import sys

    args = sys.argv
    if len(args) != 3:
        print("Usage: python store.py <repo_name> <json_file_path>")
    else:
        repo_name = args[1]
        path = args[2]
        main(repo_name, path)
```
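The script imports two local modules that are not shown in this snippet: `config` (which provides `DB_CONFIG`) and `gh_issue_loader` (which provides `GHLoader`). For orientation, here is a minimal sketch of what they could look like; everything in it is an assumption inferred from how they are used above. `DB_CONFIG` must unpack into a URL, an API key, and a collection name, but the environment-variable names are hypothetical:

```python
# config.py -- hypothetical sketch; only the 3-tuple shape of DB_CONFIG is
# implied by store.py, the environment-variable names are assumptions.
import os

DB_CONFIG = (
    os.environ.get("QDRANT_URL"),              # db_url
    os.environ.get("QDRANT_API_KEY"),          # db_api_key
    os.environ.get("QDRANT_COLLECTION_NAME"),  # db_collection_name
)
```

Likewise, `GHLoader.load()` only needs to return LangChain `Document` objects that `split_documents` can consume. The sketch below assumes the issues were exported to a JSON array; the field names (`title`, `body`, `number`) are hypothetical and would have to match the actual export format:

```python
# gh_issue_loader.py -- hypothetical sketch; the real loader and the issue
# JSON schema are not shown, so the field names below are assumptions.
import json
from typing import List

from langchain.docstore.document import Document


class GHLoader:
    """Load GitHub issues exported to a JSON file as LangChain Documents."""

    def __init__(self, repo_name: str, path: str):
        self.repo_name = repo_name
        self.path = path

    def load(self) -> List[Document]:
        with open(self.path, encoding="utf-8") as f:
            issues = json.load(f)
        return [
            Document(
                page_content=f"{issue.get('title', '')}\n{issue.get('body') or ''}",
                metadata={"repo": self.repo_name, "number": issue.get("number")},
            )
            for issue in issues
        ]
```

With both files in place, `python store.py cocoa data/cocoa-issues.json` embeds every issue in the JSON export and pushes the vectors into the configured Qdrant collection.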