Spaces:

KTH
/

kth-qa

Runtime error

App Files Files Community

kth-qa / ingest_pinecone.py

erseux

broken imports

a106116 about 2 years ago

raw

history blame contribute delete

2.01 kB

	import logging
	logger = logging.getLogger()

	import os
	from langchain.docstore.document import Document
	from langchain.text_splitter import NLTKTextSplitter
	from langchain.callbacks import get_openai_callback

	from config import State

	# Presumably the root directory of the input files; ingest reads from "files/<language>".
	FILE_DIR = 'files'
	# URL template with {course_code} and {language} placeholders.
	# Not referenced anywhere else in the visible code.
	KURS_URL = "https://www.kth.se/student/kurser/kurs/{course_code}?l={language}"
	# Language subfolder whose files ingest loads.
	DEFAULT_LANGUAGE = "en"
	# Passed as chunk_size to NLTKTextSplitter.from_tiktoken_encoder in ingest.
	CHUNK_SIZE = 1000

	def ingest(state: State):
	    """Chunk every course file for ``DEFAULT_LANGUAGE`` and add the chunks to ``state.store``.

	    The function must be run from a working directory named ``kth_qa``;
	    otherwise an error is logged and nothing is ingested. Files are read from
	    ``FILE_DIR/DEFAULT_LANGUAGE`` and are expected to be named
	    ``<course_code>?l=<language>.<ext>``. Directory entries that are not
	    regular files, or whose name lacks the ``?l=`` marker, are skipped with a
	    warning instead of aborting the whole run.

	    Args:
	        state: Application state; only ``state.store.add_documents`` is used.

	    Returns:
	        None.
	    """
	    with get_openai_callback() as cb:
	        # The relative paths below only resolve when launched from kth_qa.
	        pwd = os.getcwd()
	        if os.path.basename(pwd) != 'kth_qa':
	            logger.error(f"pwd is not kth_qa, but {pwd}. Please run from kth_qa directory.")
	            return

	        text_splitter = NLTKTextSplitter.from_tiktoken_encoder(
	            chunk_size=CHUNK_SIZE,
	            chunk_overlap=100,
	        )

	        file_folder_name = os.path.join(FILE_DIR, DEFAULT_LANGUAGE)
	        all_langdocs = []
	        # Sorted so the ingestion order does not depend on the filesystem.
	        for file in sorted(os.listdir(file_folder_name)):
	            path = os.path.join(file_folder_name, file)
	            if not os.path.isfile(path):
	                logger.warning(f"skipping {path}: not a regular file")
	                continue

	            # Expected name: "<course_code>?l=<language>.<ext>".
	            filename = file.split('.')[0]
	            course_code, marker, _language = filename.partition('?l=')
	            if not marker or not course_code:
	                logger.warning(f"skipping {path}: name does not match '<course_code>?l=<language>'")
	                continue

	            # NOTE(review): assumes the input files were written as UTF-8 —
	            # confirm against whatever produced them.
	            with open(path, 'r', encoding='utf-8') as f:
	                text = f.read()
	            logger.debug(f"loaded file {file}")

	            doc = Document(page_content=text, metadata={"course": course_code})
	            langdocs = text_splitter.split_documents([doc])
	            logger.debug(f"split documents into {len(langdocs)} chunks")
	            all_langdocs.extend(langdocs)

	        logger.info(f"split all documents into {len(all_langdocs)} chunks")

	        if all_langdocs:
	            logger.info("Adding documents to pinecone...")
	            state.store.add_documents(all_langdocs)
	            logger.info("...done!")
	        else:
	            # Nothing to upload; avoid calling the store with an empty batch.
	            logger.warning(f"no documents found in {file_folder_name}; nothing added")

	        logger.info(f"Total cost of openai api calls: {cb.total_cost}")

	if __name__ == "__main__":
	    # Script entry point: configure logging at INFO, then run one ingestion.
	    logging.basicConfig(level=logging.INFO)
	    # `logger` is the root logger, so this pins INFO even if basicConfig
	    # was a no-op because handlers were already installed.
	    logger.setLevel(logging.INFO)

	    ingest(State())