# translatemyimage-beta / inference / preprocess_image.py
# Source: AmithAdiraju1694 — feat_preo_cmod (#3), commit 2a12b77 (verified)
import numpy as np
from typing import List, Tuple, Optional, AnyStr, Dict
import nltk
nltk.download("stopwords")
nltk.download('punkt')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import re
def preprocess_text(sentence: AnyStr) -> AnyStr:
    """
    Pre-process raw text: lower-case it, strip HTML tags, URLs, digits and
    English stop words, and drop tokens shorter than 3 characters.

    Parameters:
        sentence: str, required -> A raw string which may contain stop words,
            special characters, HTML fragments, URLs and digits.

    Returns:
        return_txt: str -> Space-joined surviving tokens (may be empty).
    """
    sentence = sentence.lower().replace('{html}', "")
    cleantext = re.sub(r'<.*?>', '', sentence)          # drop HTML tags
    rem_url = re.sub(r'http\S+', '', cleantext)         # drop hyperlinks
    rem_num = re.sub(r'[0-9]+', '', rem_url)            # drop digit runs
    tokens = RegexpTokenizer(r'\w+').tokenize(rem_num)  # word tokens only
    # Hoist the stop-word list into a set ONCE: the original re-fetched and
    # linearly scanned stopwords.words('english') for every single token.
    stop_words = set(stopwords.words('english'))
    filtered_words = [w for w in tokens if len(w) > 2 and w not in stop_words]
    return " ".join(filtered_words)
def image_to_np_arr(image) -> np.ndarray:
    """
    Convert an image-like object (e.g. a PIL image or nested sequence of
    pixel values) into a numpy array via np.array.

    Parameters:
        image: required -> Any object np.array accepts (PIL.Image,
            list/tuple of pixel rows, existing ndarray, ...).

    Returns:
        np.ndarray -> Array view/copy of the input; dtype is inferred
        by numpy from the input's element types.
    """
    # NOTE: the original docstring described a list-of-strings parameter —
    # that was copy-pasted from a text helper; this function only wraps
    # np.array. The return annotation is np.ndarray (np.array is a function,
    # not a type).
    return np.array(image)
async def process_extracted_text(raw_extrc_text: List[Tuple]) -> List[AnyStr]:
    """
    Clean each OCR-extracted text span and keep only multi-word results.

    Parameters:
        raw_extrc_text: List[Tuple], required -> Triples whose middle
            element is the extracted text (first/last elements ignored).

    Returns:
        List[AnyStr] -> Cleaned strings that still contain at least
        two space-separated words after preprocessing.
    """
    cleaned = (preprocess_text(text) for _, text, _ in raw_extrc_text)
    # Single-word (or empty) results carry too little signal — drop them.
    return [text for text in cleaned if len(text.split(" ")) >= 2]
def post_process_gen_outputs(gen_output: List[str], header_pattern: str, dots_pattern: re.Pattern) -> List[Dict]:
    """
    Parse raw model generations into per-item explanation dictionaries.

    Parameters:
        gen_output: List[str], required -> Raw generated strings. The list is
            MUTATED in place (each element is replaced by its parsed dict)
            and also returned.
        header_pattern: str, required -> Regex whose single full match yields
            one capture group per entry in `headers`, in order.
        dots_pattern: re.Pattern, required -> COMPILED regex (the original
            annotation said `str`, but `.sub` is called on it) used to strip
            filler dot runs from fallback explanations.

    Returns:
        List[Dict] -> The same list object, each element now a dict keyed by
        section headers; 'Item Name' is removed from every result.
    """
    headers = ["Item Name", "Major Ingredients", "Making Process", "Portion and Spice Level", "Pairs With", "Allergens", "Food Type"]

    def clean_string(input_string):
        # Drop empty/whitespace-only comma segments, e.g. "a, , b" -> "a, b".
        parts = input_string.split(',')
        cleaned_parts = [part.strip() for part in parts if part.strip()]
        return ', '.join(cleaned_parts)

    for i in range(len(gen_output)):
        matches = re.findall(header_pattern, gen_output[i])
        if matches:
            # re.findall returns a list of group-tuples; the first match
            # maps 1:1 onto the section headers.
            result = dict(zip(headers, matches[0]))
            result['Major Ingredients'] = clean_string(result['Major Ingredients'])
            # If a section is empty/too short, or merely echoes a header
            # name, replace it with an apology string.
            for k, v in result.items():
                if len(v) < 3 or any(header in v for header in headers):
                    result[k] = "Sorry, can't explain this."
            gen_output[i] = result
        elif headers[1] in gen_output[i]:
            # Fallback: no fully structured match, but an ingredients section
            # exists — surface everything after it, minus the EOS token and
            # filler dot runs.
            gen_output[i] = {"May contain misleading explanation":
                                 dots_pattern.sub('',
                                                  gen_output[i].split(headers[1])[1].strip().replace('</s>', '')
                                                  )
                             }
        else:
            gen_output[i] = {"Sorry, can't explain this item": "NA"}
        # 'Item Name' is never shown to the user; pop defaults to None so
        # the fallback dicts (which lack the key) are unaffected.
        gen_output[i].pop('Item Name', None)
    return gen_output