Spaces:

AmithAdiraju1694
/

translatemyimage-beta

Paused

App Files Files Community

feat_preo_cmod

by AmithAdiraju1694 - opened Dec 18, 2024

base: refs/heads/main

←

from: refs/pr/3

Discussion Files changed

+380

-223

Files changed (6) hide show

app.py +37 -177
inference/config.py +16 -26
inference/preprocess_image.py +57 -4
inference/translate.py +41 -16
pages.py +214 -0
utils.py +15 -0

app.py CHANGED Viewed

@@ -1,204 +1,64 @@
 import streamlit as st
 from streamlit import session_state as sst
-from typing import List, Optional
 import asyncio
-import pandas as pd
-from inference.translate import (
-    extract_filter_img,
-    transcribe_menu_model
-)
-from inference.config import DEBUG_MODE
-from PIL import Image
-import time
-from concurrent.futures import ThreadPoolExecutor, as_completed
-import os
-# Setting workers to be 70% of all available virtual cpus in system
-cpu_count = os.cpu_count()
-pool = ThreadPoolExecutor(max_workers=int(cpu_count*0.7) )
 # Initialize session state variable to start with home page
 if "page" not in sst:
     sst["page"] = "Home"
-def navigate_to(page: str) -> None:
-    """
-    Function to set the current page in the state of streamlit. A helper for
-    simulating navigation in streamlit.
-    Parameters:
-        page: str, required.
-    Returns:
-        None
-    """
-    sst["page"] = page
-async def main_page() -> None:
-    """
-    Function that contains content of main page i.e., image uploader and submit button to navigate to next page.
-    Upon submit , control goes to model inference 'page'.
-    Parameters:
-        None
-    Returns:
-        None
-    """
-    # Streamlit app
-    first_title = st.empty()
-    first_title.title("App that explains your menu items ")
-    # Streamlit function to upload an image from any device
-    uploaded_file = st.file_uploader("Choose an image...",
-                                 type=["jpg", "jpeg", "png"])
-    # Remove preivous states' value of input image if it exists
-    sst.pop('input_image', None)
-    # Submit button
-    if uploaded_file is not None:
-        image = Image.open(uploaded_file)
-        # Only show if user wants to see
-        if st.checkbox('Show Uploaded Image'):
-            st.image(image,
-                    caption='Uploaded Image',
-                    use_column_width=True)
-        sst["input_image"] = image
-        # Submit button
-        st.button("Submit",
-                  on_click = navigate_to,
-                  args = ("Inference",))
-        st.info("""This application is for education purposes only. It uses AI, hence it's dietary
-                    recommendations are not to be taken as medical advice, author doesn't bear responsibility
-                    for incorrect dietary recommendations. Please proceed with caution.
-                    """)
-async def dist_llm_inference(inp_texts: List[str]) -> None:
-    """
-    Function that performs concurrent LLM inference using threadpool. It displays
-    results of those threads that are done with execution, as a dynamic row to streamlit table, rather than
-    waiting for all threads to be done.
-    Parameters:
-        inp_texts: List[str], required -> List of strings, containing item names of a menu in english.
-    Returns:
-        None
-    """
-    df = pd.DataFrame([('ITEM NAME', 'EXPLANATION')]
-                     )
-    sl_table = st.table(df)
-    tp_futures = { pool.submit(transcribe_menu_model, mi): mi for mi in inp_texts }
-    for tpftr in as_completed(tp_futures):
-        item = tp_futures[tpftr]
-        try:
-            exp = tpftr.result()
-            sl_table.add_rows([(item,exp)] )
-        except Exception as e:
-            print("Could not add a new row dynamically, because of this error:", e)
-    return
-async def model_inference():
     """
-    Function that pre-processes input text from state variables, does concurrent inference
-    and toggles state between pages if needed.
-    Parameters:
-        None
     Returns:
         None
     """
-    second_title = st.empty()
-    second_title.title(" Using ML to explain your menu items ... ")
-    if "input_image" in sst:
-        image = sst["input_image"]
-        msg1 = st.empty()
-        msg1.write("Pre-processing and extracting text out of your image ....")
-        st_filter = time.perf_counter()
-        # Call the extract_filter_img function
-        filtered_text = await extract_filter_img(image)
-        en_filter = time.perf_counter()
-        num_items_detected = len(filtered_text)
-        if num_items_detected == 0:
-            st.write("We couldn't detect any menu items ( indian for now ) from your image, please try a different image.")
-        elif num_items_detected > 0:
-            st.write(f"Detected {num_items_detected} menu items from your input image ... ")
-            msg2 = st.empty()
-            msg2.write("All pre-processing done, transcribing your menu items now ....")
-            st_trans_llm = time.perf_counter()
-            await dist_llm_inference(filtered_text)
-            msg3 = st.empty()
-            msg3.write("Done transcribing ... ")
-            en_trans_llm = time.perf_counter()
-            msg1.empty(); msg2.empty(); msg3.empty()
-            st.success("Image processed successfully! " )
-            if DEBUG_MODE:
-                filter_time_sec = en_filter - st_filter
-                llm_time_sec = en_trans_llm - st_trans_llm
-                total_time_sec = filter_time_sec + llm_time_sec
-                st.write("Time took to extract and filter text {}".format(filter_time_sec))
-                st.write("Time took to summarize by LLM {}".format(llm_time_sec))
-                st.write('Overall time taken in seconds: {}'.format(total_time_sec))
-            st.button("translate another",
-                      on_click=navigate_to,
-                      args=("Home",))
-    else:
-        st.write("Looks like image upload failed, please try uploading it again ... ")
-async def main():
-    """
-    Function that toggles between pages based on state variables.
-    Parameters:
-        None
-    Returns:
-        None
-    """
-    if sst["page"] == "Home":
-        await main_page()
     elif sst["page"] == "Inference":
-        await model_inference()
 asyncio.run(main())

+from utils import navigate_to
+from pages import manual_input_page, image_input_page, model_inference_page
 import streamlit as st
 from streamlit import session_state as sst
 import asyncio
+#TODO: Fix model inference and post-processing function before moving to production.
 # Initialize session state variable to start with home page
 if "page" not in sst:
     sst["page"] = "Home"
+# Function to remove all session variables from sst, except "page".
+def reset_sst():
+    """
+    Drop every key from streamlit session state other than "page".
+    Returns:
+        None
+    """
+    # Snapshot the keys first so the mapping is not mutated while it is being iterated
+    stale_keys = [name for name in sst.keys() if name != "page"]
+    for name in stale_keys:
+        sst.pop(name, None)
+# Landing page: title plus one button per input mode
+async def landing_page():
+    """
+    Render the home screen: a title, three newline writes and two side-by-side
+    buttons that route to the "ManualInput" and "ImageInput" pages.
+    Returns:
+        None
+    """
+    st.title("We will explain your menu like never before!")
+    # Three newline writes below the title (presumably vertical spacing)
+    for _ in range(3):
+        st.write("\n")
+    manual_col, image_col = st.columns(2)
+    with manual_col:
+        # Left column: go to the manual input page on click
+        st.button("Enter Items Manually", on_click=navigate_to, args=("ManualInput",))
+    with image_col:
+        # Right column: go to the image input page on click
+        st.button("Upload Items from Image", on_click=navigate_to, args=("ImageInput",))
+# Main function to handle navigation
+async def main():
     """
+    Main function that handles the navigation logic based on the current page.
     Returns:
         None
     """
+    # Navigation logic
+    if sst["page"] == "Home":
+        reset_sst() # clear every session state variable except "page" before rendering the landing page
+        await landing_page()  # Call the landing page function
+    elif sst["page"] == "ManualInput":
+        reset_sst() # clear every session state variable except "page" before rendering the manual input page
+        await manual_input_page()  # Call the manual input page function
+    elif sst["page"] == "ImageInput":
+        reset_sst() # clear every session state variable except "page" before rendering the image input page
+        await image_input_page()  # Call the image input page function
     elif sst["page"] == "Inference":
+        # No reset_sst() here: model_inference_page reads "input_image" / "user_entered_items" from session state
+        await model_inference_page()  # Call the model inference page function
 asyncio.run(main())

inference/config.py CHANGED Viewed

@@ -1,33 +1,23 @@
-INSTRUCTION_PROMPT = """
-The following text contains examples of three items and their corresponding explanations in the required format.\n
-Item -> palak paneer.\n
-Explanation -> Major Ingredients here: paneer ( a.k.a cottage cheese ) , palak ( spinach ).\n
-How it is made: It's a savory item, made like a gravy; usually made by sauteing spices and mixing saute with boiled paneer and palak.\n
-It goes well with: White basmati rice or Indian flat bread.\n
-Allergens: Paneer may cause digestive discomfort and intolerance to some.\n
-Food Category: Vegetarian, Vegans may not like it, as paneer is usually made from cow milk.
-Item -> rumali roti.\n
-Explanation -> Major Ingredients here: roti.\n
-How it is made: A small soft bread, made to size of a napkin ( a.k.a 'rumal' in hindi ); usually made with a combination of whole wheat and all purpose flour.\n
-It goes well with: Most indian gravies such as palak paneer, tomato curry etc.\n
-Allergens: May contain gluten, which is known to cause digestive discomfort and intolerance to some.\n
-Food Category: Vegetarian, Vegan.
-Item -> nizami handi.\n
-Explanation -> Major Ingredients here: Different veggies, makhani sauce (skimmed milk, tomato and cashew paste , indian spices), combination of nuts.\n
-How it is made: Makhani sauce is added to onion-tomato based paste and bought to a boil; a Medley of veggies and gently flavored whole spices are added and boiled for small time.\n
-It goes well with: Different kinds of indian flat breads, white basmati and sonamasoori rice.\n
-Allergens: Presence of nuts, butter cream and makhani sauce are known to cause digestive discomfort and intolerance to some.\n
-Food Category: Usually vegetarian, may include chicken or animal meat sometimes, please check with hotel.
-Based on Item and explanation pairs provided above, provide similar explanation ('Major Ingredients', 'How is it made', 'It goes well with', 'Allergens' and 'Food Category') to the below item.\n
-Item ->
-"""
-DEBUG_MODE = False
-DEVICE = 'cpu'

+import torch
+import re
+# Prompt template with a single '{}' placeholder; it is filled with str.format by the caller.
+model_inf_inp_prompt = "INSTRUCTION: given food item name, explain these things:(major ingredients,making process,portion & spicy/sweet,pairs with,allergens,food type(veg/non-veg/vegan)). ensure to get allergens and food category factually correct.Item Name: {} "
+# Regex with seven non-greedy capture groups, one per section header, in this order:
+# Item Name, Major Ingredients, Making Process, Portion and Spice Level, Pairs With, Allergens, Food Type.
+# It only matches text that ends with the literal '</s>' marker.
+header_pattern = r'Item Name: (.*?)\. Major Ingredients: (.*?)\. Making Process: (.*?)\. Portion and Spice Level: (.*?)\. Pairs With: (.*?)\. Allergens: (.*?)\. Food Type: (.*?)\.\s*</s>'
+# Compiled pattern matching a run of three or more consecutive dots.
+dots_pattern = re.compile(r'\.{3,}')
+# NOTE(review): the config this replaces had DEBUG_MODE = False - confirm True is intended outside development.
+DEBUG_MODE = True
+# Model identifier handed to load_models() in inference/translate.py.
+model_name = "AmithAdiraju1694/gpt-neo-125M_menuitemexp"
+def get_device():
+    """
+    Pick the torch device to use and print which one was chosen.
+    Returns:
+        torch.device -> "cuda" when torch reports CUDA as available, otherwise "cpu".
+    """
+    if not torch.cuda.is_available():
+        print("Using CPU")
+        return torch.device("cpu")
+    # Report the name of GPU 0, the device being used
+    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
+    return torch.device("cuda")
+DEVICE = get_device()

inference/preprocess_image.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import numpy as np
-from typing import List, Tuple, Optional, AnyStr
 import nltk
 nltk.download("stopwords")
 nltk.download('punkt')
@@ -53,11 +53,64 @@ def image_to_np_arr(image) -> np.array:
     return np.array(image)
 async def process_extracted_text(raw_extrc_text: List[Tuple]) -> List[AnyStr]:
     output_texts = []
     for _, extr_text, _ in raw_extrc_text:
         # remove all numbers, special characters from a string
         prcsd_txt = preprocess_text(extr_text)
-        if len(prcsd_txt.split(" ") ) >= 2: output_texts.append(prcsd_txt)
-    return output_texts

 import numpy as np
+from typing import List, Tuple, Optional, AnyStr, Dict
 import nltk
 nltk.download("stopwords")
 nltk.download('punkt')
     return np.array(image)
 async def process_extracted_text(raw_extrc_text: List[Tuple]) -> List[AnyStr]:
+    """
+    Function that cleans each extracted text with preprocess_text and keeps only
+    the results that split into at least 2 space-separated pieces.
+    Parameters:
+        raw_extrc_text: List[Tuple], required -> A list of 3-element tuples; only the
+            middle element (the extracted text) is used.
+    Returns:
+        List[AnyStr] -> A list of processed text strings.
+    """
     output_texts = []
     for _, extr_text, _ in raw_extrc_text:
         # remove all numbers, special characters from a string
         prcsd_txt = preprocess_text(extr_text)
+        # NOTE(review): split(" ") also counts empty pieces, so text with a leading, trailing or
+        # doubled space passes this check with a single real word - confirm preprocess_text
+        # normalises whitespace.
+        if len(prcsd_txt.split(" ")) >= 2:
+            output_texts.append(prcsd_txt)
+    return output_texts
+def post_process_gen_outputs(gen_output: List[str], header_pattern: str, dots_pattern) -> List[Dict]:
+    """
+    Function that turns raw generated strings into per-item explanation dictionaries.
+    Every entry of gen_output is replaced in place (the same list object is returned) by one of:
+        - a dict keyed by section header (without 'Item Name') when header_pattern matches; any
+          section shorter than 3 characters, or containing a header name, becomes
+          "Sorry, can't explain this."
+        - {"May contain misleading explanation": text} when the pattern does not match but
+          'Major Ingredients' occurs; text is what follows that header, stripped, with '</s>'
+          and runs matched by dots_pattern removed
+        - {"Sorry, can't explain this item": "NA"} otherwise
+    Parameters:
+        gen_output: List[str], required -> Decoded model outputs, one string per item.
+        header_pattern: str, required -> Regex with one capture group per header, in header order.
+        dots_pattern: compiled pattern or pattern string, required -> Matches the dot runs to strip.
+    Returns:
+        List[Dict] -> gen_output, with every string replaced by its dictionary.
+    """
+    # Local import: this function needs `re`, and a module-level import is not guaranteed here.
+    import re
+    headers = ["Item Name", "Major Ingredients", "Making Process", "Portion and Spice Level", "Pairs With", "Allergens", "Food Type"]
+    unexplained = "Sorry, can't explain this."
+    # re.compile returns an already-compiled pattern unchanged, so both a plain
+    # pattern string and a compiled pattern are accepted for either argument.
+    header_regex = re.compile(header_pattern)
+    dots_regex = re.compile(dots_pattern)
+    # Drop empty comma-separated pieces and normalise the separator to ", "
+    def clean_string(input_string):
+        parts = input_string.split(',')
+        cleaned_parts = [part.strip() for part in parts if part.strip()]
+        return ', '.join(cleaned_parts)
+    for i, raw_text in enumerate(gen_output):
+        # Only the first match is used
+        match = header_regex.search(raw_text)
+        if match:
+            result = dict(zip(headers, match.groups()))
+            result['Major Ingredients'] = clean_string(result['Major Ingredients'])
+            # A section that is near-empty, or that swallowed another header, is treated as unusable
+            for key, value in result.items():
+                if len(value) < 3 or any(header in value for header in headers):
+                    result[key] = unexplained
+            # The item name is already known to the caller, so it is not returned
+            result.pop('Item Name', None)
+        elif headers[1] in raw_text:
+            tail = raw_text.split(headers[1])[1].strip().replace('</s>', '')
+            result = {"May contain misleading explanation": dots_regex.sub('', tail)}
+        else:
+            result = {"Sorry, can't explain this item": "NA"}
+        gen_output[i] = result
+    return gen_output

inference/translate.py CHANGED Viewed

@@ -2,29 +2,50 @@ import streamlit as st
 from inference.preprocess_image import (
     image_to_np_arr,
-    process_extracted_text
 )
-from inference.config import INSTRUCTION_PROMPT, DEVICE
 from typing import List, Tuple, Optional, AnyStr, Dict
-from transformers import T5Tokenizer, T5ForConditionalGeneration
 import easyocr
 import time
 use_gpu = True
-if DEVICE == 'cpu': use_gpu = False
 @st.cache_resource
 def load_models(item_summarizer: AnyStr) -> Tuple:
     text_extractor = easyocr.Reader(['en'],
                                     gpu = use_gpu
                                     )
-    tokenizer = T5Tokenizer.from_pretrained(item_summarizer)
-    model = T5ForConditionalGeneration.from_pretrained(item_summarizer)
     return (text_extractor, tokenizer, model)
-text_extractor,item_tokenizer,item_summarizer = load_models(item_summarizer = "google/flan-t5-large")
 # Define your extract_filter_img function
@@ -78,20 +99,24 @@ async def extract_filter_img(image) -> Dict:
 def transcribe_menu_model(menu_text: List[AnyStr]) -> Dict:
-    prompt_item = INSTRUCTION_PROMPT + " " + menu_text + """
-"""
     input_ids = item_tokenizer(prompt_item, return_tensors="pt").input_ids
     outputs = item_summarizer.generate(input_ids,
-                                        max_new_tokens = 512
                                         )
-    return item_tokenizer.decode(
-        outputs[0],
-        skip_special_tokens = True
-        )
 def classify_menu_text(extrc_str: List[AnyStr]) -> List[AnyStr]:
     return extrc_str

 from inference.preprocess_image import (
     image_to_np_arr,
+    process_extracted_text,
+    post_process_gen_outputs
 )
+from inference.config import (
+     model_inf_inp_prompt,
+    header_pattern,
+    dots_pattern,
+    DEVICE,
+    model_name
+                             )
 from typing import List, Tuple, Optional, AnyStr, Dict
+from transformers import AutoTokenizer, AutoModelForCausalLM
 import easyocr
 import time
 use_gpu = True
+if DEVICE.type == 'cpu': use_gpu = False
 @st.cache_resource
 def load_models(item_summarizer: AnyStr) -> Tuple:
+    """
+    Function to load the models required for the inference process. Cached with
+    st.cache_resource to avoid loading the models every time the function is called.
+    Parameters:
+        item_summarizer: str, required -> The LLM model name to be used for item summarization.
+    Returns:
+        Tuple -> (text_extractor, tokenizer, model), in that order.
+    """
+    # model to extract text from image
     text_extractor = easyocr.Reader(['en'],
                                     gpu = use_gpu
                                     )
+    # tokenizer and model to generate item summary
+    tokenizer = AutoTokenizer.from_pretrained(item_summarizer)
+    model = AutoModelForCausalLM.from_pretrained(item_summarizer)
     return (text_extractor, tokenizer, model)
+# Loaded at import time; note the name item_summarizer is bound to the loaded model here,
+# while inside load_models it is the model name.
+text_extractor,item_tokenizer,item_summarizer = load_models(item_summarizer = model_name)
 # Define your extract_filter_img function
 def transcribe_menu_model(menu_text: List[AnyStr]) -> Dict:
+    """
+    Function that generates an explanation for a menu item and post-processes it.
+    Parameters:
+        menu_text: required -> Value substituted into the '{}' of model_inf_inp_prompt.
+            NOTE(review): annotated List[AnyStr], but dist_llm_inference in pages.py submits
+            one item string per call - confirm and correct the annotation.
+    Returns:
+        Dict -> First element of the list returned by post_process_gen_outputs.
+    """
+    prompt_item = model_inf_inp_prompt.format(menu_text)
     input_ids = item_tokenizer(prompt_item, return_tensors="pt").input_ids
     outputs = item_summarizer.generate(input_ids,
+                                       max_new_tokens = 512,
+                                       num_beams = 4,
+                                       pad_token_id = item_tokenizer.pad_token_id,
+                                       eos_token_id = item_tokenizer.eos_token_id,
+                                       bos_token_id = item_tokenizer.bos_token_id
+                                       )
+    # Special tokens are kept on purpose: header_pattern only matches text ending in '</s>'
+    prediction = item_tokenizer.batch_decode(outputs,
+                                        skip_special_tokens=False
                                         )
+    postpro_output = post_process_gen_outputs( prediction, header_pattern, dots_pattern )[0]
+    return postpro_output
 def classify_menu_text(extrc_str: List[AnyStr]) -> List[AnyStr]:
     return extrc_str

pages.py ADDED Viewed

	@@ -0,0 +1,214 @@

+import streamlit as st
+from streamlit import session_state as sst
+from utils import navigate_to
+from inference.config import DEBUG_MODE
+from inference.translate import extract_filter_img, transcribe_menu_model,classify_menu_text
+from inference.preprocess_image import preprocess_text
+import os
+import time
+import pandas as pd
+from PIL import Image
+from typing import List
+import json
+from concurrent.futures import ThreadPoolExecutor, as_completed
+# Setting workers to be 70% of all available virtual cpus in system, but never fewer than one:
+# os.cpu_count() may return None, and int(1 * 0.7) == 0 is rejected by ThreadPoolExecutor.
+cpu_count = os.cpu_count() or 1
+pool = ThreadPoolExecutor(max_workers=max(1, int(cpu_count*0.7)) )
+# Function that handles logic of explaining menu items from manual input
+async def manual_input_page():
+    """
+    Function that renders the manual entry page: a number input for how many text boxes to show,
+    one text box per food item, and the navigation buttons.
+    Non-empty entries are stored in session state under "user_entered_items".
+    Parameters:
+        None
+    Returns:
+        None
+    """
+    st.write("This is the Manual Input Page.")
+    st.write("Once done, click on 'Explain My Menu' button to get explanations for each item ... ")
+    box_count = st.number_input("Number of text boxes", min_value=1, step=1)
+    # Render every text box, then keep only the ones the user filled in
+    typed_values = [st.text_input(f"Food item {i+1}") for i in range(box_count)]
+    inp_texts = [value for value in typed_values if value]
+    if not inp_texts:
+        st.write("Please enter some items to proceed ...")
+    else:
+        # Submit button is shown only once some text has been entered
+        sst["user_entered_items"] = inp_texts
+        st.button("Explain My Menu",on_click=navigate_to,args=("Inference",))
+    st.button("Go back Home", on_click=navigate_to, args=("Home",))
+# Function that handles logic of explaining menu items from image uploads
+async def image_input_page():
+    """
+    Function that contains content of the image input page i.e., image uploader and submit button to navigate to next page.
+    The opened image is stored in session state under "input_image"; upon submit, control goes to model inference 'page'.
+    Parameters:
+        None
+    Returns:
+        None
+    """
+    st.write("This is the Image Input Page.")
+    # Streamlit function to upload an image from any device
+    uploaded_file = st.file_uploader("Choose an image...",
+                                 type=["jpg", "jpeg", "png"])
+    # Remove the previous state's value of input image if it exists
+    sst.pop('input_image', None)
+    # Everything in this branch is shown only once a file has been uploaded
+    if uploaded_file is not None:
+        image = Image.open(uploaded_file)
+        # Only show if user wants to see
+        if st.checkbox('Show Uploaded Image'):
+            st.image(image,
+                    caption='Uploaded Image',
+                    use_column_width=True)
+        sst["input_image"] = image
+        # Show user submit button only if they have uploaded an image
+        st.button("Translate My Menu",
+                  on_click = navigate_to,
+                  args = ("Inference",))
+        # Warning message to user
+        st.info("""This application is for education purposes only. It uses AI, hence it's dietary
+                    recommendations are not to be taken as medical advice, author doesn't bear responsibility
+                    for incorrect dietary recommendations. Please proceed with caution.
+                    """)
+    # Back button is always shown, with or without an upload
+    st.button("Go back Home", on_click=navigate_to, args=("Home",))
+# Function that handles model inference
+async def model_inference_page():
+    """
+    Function that pre-processes the input found in session state, runs concurrent inference
+    on it and shows a button to go back to the home page.
+    Input comes from "input_image" (text is extracted from the image) and/or
+    "user_entered_items" (each entry goes through preprocess_text); when both are present the
+    manually entered items are used. When neither is present a message is shown instead of
+    failing on an unbound variable.
+    Parameters:
+        None
+    Returns:
+        None
+    """
+    second_title = st.empty()
+    second_title.title(" Using ML to explain your menu items ... ")
+    # None means "no input was found in session state"
+    filtered_text = None
+    # User can either upload an image or enter text manually, we check for both
+    if "input_image" in sst:
+        image = sst["input_image"]
+        msg1 = st.empty()
+        msg1.write("Pre-processing and extracting text out of your image ....")
+        # Call the extract_filter_img function
+        filtered_text = await extract_filter_img(image)
+    if "user_entered_items" in sst:
+        user_text = sst["user_entered_items"]
+        st.write("Pre-processing and filtering text from user input ....")
+        filtered_text = [preprocess_text(ut) for ut in user_text]
+    if filtered_text is None:
+        # Neither input source is available, e.g. the session state was lost before reaching this page
+        st.write("We couldn't find any input to explain, please go back and try again ... ")
+    else:
+        # irrespective of source of user entry , we check if we have any items to process
+        num_items_detected = len(filtered_text)
+        if num_items_detected == 0:
+            st.write("We couldn't detect any menu items ( indian for now ) from your image, please try a different image by going back.")
+        else:
+            st.write(f"Detected {num_items_detected} menu items from your input image ... ")
+            msg2 = st.empty()
+            msg2.write("All pre-processing done, transcribing your menu items now ....")
+            st_trans_llm = time.perf_counter()
+            await dist_llm_inference(filtered_text)
+            msg3 = st.empty()
+            msg3.write("Done transcribing ... ")
+            en_trans_llm = time.perf_counter()
+            # Progress messages are removed once inference has finished
+            msg2.empty(); msg3.empty()
+            st.success("Image processed successfully! " )
+            # Some basic stats for debug mode
+            if DEBUG_MODE:
+                llm_time_sec = en_trans_llm - st_trans_llm
+                st.write("Time took to summarize by LLM {}".format(llm_time_sec))
+    # Always offer a way back to the home page
+    st.button("Go back Home", on_click=navigate_to, args=("Home",))
+# Function that fans a list of menu items out to the thread pool and streams results into a table
+async def dist_llm_inference(inp_texts: List[str]) -> None:
+    """
+    Function that submits one transcribe_menu_model call per item to the thread pool and
+    appends a row to a streamlit table as each call completes, instead of
+    waiting for all of them to finish.
+    Parameters:
+        inp_texts: List[str], required -> List of strings, containing item names of a menu in english.
+    Returns:
+        None
+    """
+    header_df = pd.DataFrame([('ITEM NAME', 'EXPLANATION')])
+    sl_table = st.table(header_df)
+    # Map every submitted future back to the item it is explaining
+    pending = {}
+    for menu_item in inp_texts:
+        pending[pool.submit(transcribe_menu_model, menu_item)] = menu_item
+    for finished in as_completed(pending):
+        menu_item = pending[finished]
+        try:
+            explanation = finished.result()
+            # The explanation is rendered through str() before being added as a row
+            sl_table.add_rows([(menu_item, str(explanation))])
+        except Exception as e:
+            print("Could not add a new row dynamically, because of this error:", e)
+    return

utils.py ADDED Viewed

	@@ -0,0 +1,15 @@

+from streamlit import session_state as sst
+def navigate_to(page: str) -> None:
+    """
+    Function to set the current page in the state of streamlit. A helper for
+    simulating navigation in streamlit; it is passed as the on_click callback of buttons.
+    Parameters:
+        page: str, required -> Name stored under the "page" key of session state.
+    Returns:
+        None
+    """
+    sst["page"] = page