# translatemyimage-beta / inference / preprocess_image.py
# Source: AmithAdiraju1694 — feat_preo_cmod (#3), commit 2a12b77 (verified)
import numpy as np
from typing import List, Tuple, Optional, AnyStr, Dict
import nltk
nltk.download("stopwords")
nltk.download('punkt')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import re
def preprocess_text(sentence: AnyStr) -> AnyStr:
    """
    Pre-process raw text: lower-case it, strip HTML tags, URLs, digits and
    English stop words, and drop tokens shorter than 3 characters.

    Parameters:
        sentence: str, required -> A raw string which may contain stop words,
            special characters, HTML fragments, URLs and digits.

    Returns:
        return_txt: str -> Space-joined surviving tokens (may be empty).
    """
    sentence = sentence.lower().replace('{html}', "")
    cleantext = re.sub(r'<.*?>', '', sentence)          # drop HTML tags
    rem_url = re.sub(r'http\S+', '', cleantext)         # drop hyperlinks
    rem_num = re.sub(r'[0-9]+', '', rem_url)            # drop digit runs
    tokens = RegexpTokenizer(r'\w+').tokenize(rem_num)  # word tokens only
    # Hoist the stop-word list into a set ONCE: the original re-fetched and
    # linearly scanned stopwords.words('english') for every single token.
    stop_words = set(stopwords.words('english'))
    filtered_words = [w for w in tokens if len(w) > 2 and w not in stop_words]
    return " ".join(filtered_words)
def image_to_np_arr(image) -> np.ndarray:
    """
    Convert an image-like object (e.g. a PIL image or nested sequence of
    pixel values) into a numpy array via np.array.

    Parameters:
        image: required -> Any object np.array accepts (PIL.Image,
            list/tuple of pixel rows, existing ndarray, ...).

    Returns:
        np.ndarray -> Array view/copy of the input; dtype is inferred
        by numpy from the input's element types.
    """
    # NOTE: the original docstring described a list-of-strings parameter —
    # that was copy-pasted from a text helper; this function only wraps
    # np.array. The return annotation is np.ndarray (np.array is a function,
    # not a type).
    return np.array(image)
async def process_extracted_text(raw_extrc_text: List[Tuple]) -> List[AnyStr]:
    """
    Clean each OCR-extracted text span and keep only multi-word results.

    Parameters:
        raw_extrc_text: List[Tuple], required -> Triples whose middle
            element is the extracted text (first/last elements ignored).

    Returns:
        List[AnyStr] -> Cleaned strings that still contain at least
        two space-separated words after preprocessing.
    """
    cleaned = (preprocess_text(text) for _, text, _ in raw_extrc_text)
    # Single-word (or empty) results carry too little signal — drop them.
    return [text for text in cleaned if len(text.split(" ")) >= 2]
def post_process_gen_outputs(gen_output: List[str], header_pattern: str, dots_pattern: re.Pattern) -> List[Dict]:
    """
    Parse raw model generations into per-item explanation dictionaries.

    Parameters:
        gen_output: List[str], required -> Raw generated strings. The list is
            MUTATED in place (each element is replaced by its parsed dict)
            and also returned.
        header_pattern: str, required -> Regex whose single full match yields
            one capture group per entry in `headers`, in order.
        dots_pattern: re.Pattern, required -> COMPILED regex (the original
            annotation said `str`, but `.sub` is called on it) used to strip
            filler dot runs from fallback explanations.

    Returns:
        List[Dict] -> The same list object, each element now a dict keyed by
        section headers; 'Item Name' is removed from every result.
    """
    headers = ["Item Name", "Major Ingredients", "Making Process", "Portion and Spice Level", "Pairs With", "Allergens", "Food Type"]

    def clean_string(input_string):
        # Drop empty/whitespace-only comma segments, e.g. "a, , b" -> "a, b".
        parts = input_string.split(',')
        cleaned_parts = [part.strip() for part in parts if part.strip()]
        return ', '.join(cleaned_parts)

    for i in range(len(gen_output)):
        matches = re.findall(header_pattern, gen_output[i])
        if matches:
            # re.findall returns a list of group-tuples; the first match
            # maps 1:1 onto the section headers.
            result = dict(zip(headers, matches[0]))
            result['Major Ingredients'] = clean_string(result['Major Ingredients'])
            # If a section is empty/too short, or merely echoes a header
            # name, replace it with an apology string.
            for k, v in result.items():
                if len(v) < 3 or any(header in v for header in headers):
                    result[k] = "Sorry, can't explain this."
            gen_output[i] = result
        elif headers[1] in gen_output[i]:
            # Fallback: no fully structured match, but an ingredients section
            # exists — surface everything after it, minus the EOS token and
            # filler dot runs.
            gen_output[i] = {"May contain misleading explanation":
                                 dots_pattern.sub('',
                                                  gen_output[i].split(headers[1])[1].strip().replace('</s>', '')
                                                  )
                             }
        else:
            gen_output[i] = {"Sorry, can't explain this item": "NA"}
        # 'Item Name' is never shown to the user; pop defaults to None so
        # the fallback dicts (which lack the key) are unaffected.
        gen_output[i].pop('Item Name', None)
    return gen_output