Spaces:

SorbonneUniversity
/

SorboBot

Sleeping

App Files Files Community

SorboBot / sorbobotapp /keyword_extraction.py

leo-bourrel

!feat: Import new sorbobot version

68a9b68 about 1 year ago

raw

history blame

2.22 kB

	from typing import Any, Optional

	from langchain.chat_models import ChatOpenAI
	from langchain.output_parsers import NumberedListOutputParser
	from langchain.prompts import ChatPromptTemplate
	from utils import str_to_list

	# Prompt sent to the chat model for each query. It asks for at most three
	# ranked keywords/phrases, each on a numbered line written as
	# "Keyword/Phrase - Category". KeywordExtractor.__call__ splits each parsed
	# entry on " - ", so that layout is what the downstream code relies on.
	# `{query}` is the only template variable and is filled in at invocation time.
	query_template = """
	You are a bi-lingual (french and english) linguistic teacher working at a top-tier university.
	We are conducting a research project that requires the extraction of keywords from chatbot queries.
	Below, you will find a query. Please identify and rank the three most important keywords or phrases (n-grams) based on their relevance to the main topic of the query.
	For each keyword or phrase, assign it to one of the following categories: ["University / Company", "Research domain", "Country", "Name", "Other"].
	An 'n-gram' refers to a contiguous sequence of words, where 'n' can be 1 for a single word, 2 for a pair of words, and so on, up to two words in length.
	Please ensure not to list more than three n-grams in total.
	Your expertise in linguistic analysis is crucial for the success of this project. Thank you for your contribution.

	Please attach your ranked list in the following format:
	1. Keyword/Phrase - Category
	2. Keyword/Phrase - Category
	3. Keyword/Phrase - Category

	You must be concise and don't need to justify your choices.
	```
	{query}
	```
	"""

	# Shared parser used by KeywordExtractor.__call__ to turn the model's
	# numbered-list reply into a sequence of entry strings.
	output_parser = NumberedListOutputParser()
	# Format instructions produced by the parser. NOTE(review): not referenced
	# anywhere in the visible code and not injected into `query_template` --
	# confirm whether another module imports it.
	format_instructions = output_parser.get_format_instructions()


	class KeywordExtractor:
	    """Extract category-filtered keywords from a chatbot query with a chat model.

	    The prompt asks the model for a numbered list of
	    ``Keyword/Phrase - Category`` entries. Calling an instance sends the
	    query through the prompt/model chain, parses the numbered list and
	    returns only the keywords whose category was requested.
	    """

	    # Categories kept when the caller does not pass ``filter_categories``.
	    # A tuple, so the default can never be mutated between calls.
	    DEFAULT_FILTER_CATEGORIES = ("Research domain",)

	    # Separator between the keyword and its category in each list entry.
	    _SEPARATOR = " - "

	    def __init__(self):
	        """Build the chat model, the prompt template and the chain joining them."""
	        super().__init__()
	        self.model = ChatOpenAI()
	        self.prompt = ChatPromptTemplate.from_template(
	            template=query_template,
	        )

	        # The numbered-list parser is deliberately not piped into the chain:
	        # __call__ applies it to the content of the returned message.
	        self.chain = self.prompt | self.model

	    def __call__(
	        self, inputs: str, filter_categories: Optional[list[str]] = None
	    ) -> list[str]:
	        """Return the keywords of ``inputs`` that belong to the given categories.

	        Args:
	            inputs: The user query to extract keywords from.
	            filter_categories: Categories to keep. ``None`` (the default)
	                keeps only ``"Research domain"``.

	        Returns:
	            The matching keywords, in the order the model listed them.
	        """
	        if filter_categories is None:
	            filter_categories = list(self.DEFAULT_FILTER_CATEGORIES)

	        output = self.chain.invoke({"query": inputs})
	        keywords = output_parser.parse(output.content)

	        return self._filter_keywords(keywords, filter_categories)

	    @staticmethod
	    def _filter_keywords(
	        keywords: list[str], filter_categories: list[str]
	    ) -> list[str]:
	        """Keep the keyword part of each ``keyword - category`` entry whose category matches.

	        Entries without the separator are skipped. The category is taken from
	        the text after the LAST separator, so a keyword that itself contains
	        " - " no longer breaks the unpacking. Both parts are stripped of
	        surrounding whitespace, and entries with an empty keyword are dropped.
	        """
	        selected = []
	        for entry in keywords:
	            keyword, separator, category = entry.rpartition(
	                KeywordExtractor._SEPARATOR
	            )
	            if not separator:
	                # No "keyword - category" structure: nothing to classify.
	                continue

	            keyword = keyword.strip()
	            if keyword and category.strip() in filter_categories:
	                selected.append(keyword)

	        return selected