import re
from typing import Dict, List, Tuple

import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

nltk.download("stopwords")
nltk.download("punkt")

def preprocess_text(sentence: str) -> str:
    """
    Pre-process input text by removing special characters, hyperlinks,
    numbers, and stop words.

    Parameters:
        sentence: str, required -> A raw string which may contain stop words,
            special characters, HTML markup, etc.

    Returns:
        str -> A clean string with all of the aforementioned removed.
    """
    sentence = sentence.lower().replace("{html}", "")

    # Strip HTML tags, hyperlinks, and digits.
    cleantext = re.sub(r"<.*?>", "", sentence)
    rem_url = re.sub(r"http\S+", "", cleantext)
    rem_num = re.sub(r"[0-9]+", "", rem_url)

    # Tokenize on word characters; drop short tokens and English stop words.
    # The stop-word list is built once per call instead of once per token.
    tokenizer = RegexpTokenizer(r"\w+")
    tokens = tokenizer.tokenize(rem_num)
    stop_words = set(stopwords.words("english"))
    filtered_words = [w for w in tokens if len(w) > 2 and w not in stop_words]

    return " ".join(filtered_words)
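
# Example (illustrative):
#   preprocess_text("Visit http://example.com for 2 GREAT <b>deals</b>!")
#   -> "visit great deals"
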
def image_to_np_arr(image) -> np.ndarray:
    """
    Convert an image (e.g. a byte array or PIL image) into a numpy array.

    Parameters:
        image: required -> The image object to convert.

    Returns:
        np.ndarray
    """
    return np.array(image)
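
# Example (illustrative; assumes Pillow is installed and a "menu.jpg" file
# exists, both assumptions for the sake of the sketch):
#   from PIL import Image
#   arr = image_to_np_arr(Image.open("menu.jpg"))
#   arr.shape  # e.g. (height, width, 3)
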
async def process_extracted_text(raw_extrc_text: List[Tuple]) -> List[str]:
    """
    Process extracted text by removing numbers and special characters,
    and filter out results with fewer than two words.

    Parameters:
        raw_extrc_text: List[Tuple], required -> A list of 3-tuples whose
            second element is the extracted text.

    Returns:
        List[str] -> A list of processed text strings.
    """
    output_texts = []
    for _, extr_text, _ in raw_extrc_text:
        prcsd_txt = preprocess_text(extr_text)
        # Keep only results that still have at least two words after cleaning.
        if len(prcsd_txt.split(" ")) >= 2:
            output_texts.append(prcsd_txt)

    return output_texts
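
# Example (illustrative): raw_extrc_text follows the (bbox, text, confidence)
# 3-tuple shape produced by OCR readers such as easyocr (an assumption here):
#   import asyncio
#   results = [([[0, 0], [1, 0], [1, 1], [0, 1]], "Paneer Tikka 250", 0.98)]
#   asyncio.run(process_extracted_text(results))  # -> ["paneer tikka"]
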
def post_process_gen_outputs(gen_output: List[str],
                             header_pattern: str,
                             dots_pattern: re.Pattern) -> List[Dict]:
    """
    Parse each generated string into a dict keyed by the expected section
    headers, falling back to a short apology when parsing fails.

    Parameters:
        gen_output: List[str], required -> Raw generated item descriptions.
        header_pattern: str, required -> Regex that captures one group per
            header listed below.
        dots_pattern: re.Pattern, required -> Compiled regex matching filler
            dot runs to strip from salvaged text.

    Returns:
        List[Dict] -> One dict of header -> text per input string.
    """
    headers = ["Item Name", "Major Ingredients", "Making Process",
               "Portion and Spice Level", "Pairs With", "Allergens", "Food Type"]

    def clean_string(input_string):
        # Drop empty comma-separated segments and normalize spacing.
        parts = input_string.split(",")
        cleaned_parts = [part.strip() for part in parts if part.strip()]
        return ", ".join(cleaned_parts)

    for i in range(len(gen_output)):
        matches = re.findall(header_pattern, gen_output[i])

        if matches:
            result = dict(zip(headers, matches[0]))
            result["Major Ingredients"] = clean_string(result["Major Ingredients"])

            # Blank out fields that are too short or that leaked another header.
            for k in result.keys():
                if len(result[k]) < 3 or any(header in result[k] for header in headers):
                    result[k] = "Sorry, can't explain this."

            gen_output[i] = result
        elif headers[1] in gen_output[i]:
            # No full match: salvage whatever follows "Major Ingredients".
            salvaged = gen_output[i].split(headers[1])[1].strip().replace("</s>", "")
            gen_output[i] = {"May contain misleading explanation":
                             dots_pattern.sub("", salvaged)}
        else:
            gen_output[i] = {"Sorry, can't explain this item": "NA"}

        # The item name is not part of the explanation payload, so drop it.
        gen_output[i].pop("Item Name", None)

    return gen_output
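
# Minimal usage sketch (illustrative only): the real header_pattern and
# dots_pattern are supplied by the caller; the ones below are assumptions
# chosen to satisfy this function's contract (one capture group per header,
# plus a compiled pattern for filler dots).
if __name__ == "__main__":
    example_header_pattern = (
        r"Item Name:(.*?)Major Ingredients:(.*?)Making Process:(.*?)"
        r"Portion and Spice Level:(.*?)Pairs With:(.*?)Allergens:(.*?)"
        r"Food Type:(.*)"
    )
    example_dots_pattern = re.compile(r"\.{2,}")  # assumed filler-dot pattern
    sample = ["Item Name: Paneer Tikka Major Ingredients: paneer, , spices "
              "Making Process: grilled in a tandoor Portion and Spice Level: "
              "medium, spicy Pairs With: mint chutney Allergens: dairy "
              "Food Type: vegetarian"]
    print(post_process_gen_outputs(sample, example_header_pattern,
                                   example_dots_pattern))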