# Streamlit app: free-text product description -> candidate HS codes.
#
# Flow, as implemented below:
#   1. Build a BM25 index over the rows of "cleaned_list.csv".
#   2. Load "hscode_main.csv" as the code -> full description lookup table.
#   3. For the text the user types, retrieve the top matching documents,
#      pull the "hs_code: <digits>" value out of each one, and display the
#      matching full description.
import os
import re
from operator import itemgetter

import bm25s
import pandas as pd
import streamlit as st

# Compiled once; applied to every retrieved document.
_HS_CODE_PATTERN = re.compile(r'hs_code:\s*(\d+)')


@st.cache_resource
def load_data():
    """Build the BM25 retriever over the documents in ``cleaned_list.csv``.

    The CSV is read without a header row and must contain a single column,
    which is treated as the document text.

    Cached with ``st.cache_resource`` because the return value is an index
    object meant to be built once and shared, not a data value that should
    be serialized and copied on every call.

    Returns:
        A ``bm25s.BM25`` instance indexed over the corpus; because the corpus
        is passed to the constructor, ``retrieve`` yields the documents.
    """
    df = pd.read_csv("cleaned_list.csv", header=None)
    df.columns = ['document']
    corpus = df['document'].to_list()
    retriever = bm25s.BM25(corpus=corpus)
    retriever.index(bm25s.tokenize(corpus))
    return retriever


def extract_hscode(text):
    """Return the digits following ``hs_code:`` in *text*, or ``None``.

    Args:
        text: A retrieved document. Non-string values (for example a NaN
            produced by an empty CSV cell) are treated as "no code".

    Returns:
        The first run of digits after ``hs_code:`` as a string, or ``None``
        when *text* is not a string or contains no such marker.
    """
    if not isinstance(text, str):
        return None
    match = _HS_CODE_PATTERN.search(text)
    if match:
        return match.group(1)
    return None


@st.cache_data
def load_hscode_table():
    """Load ``hscode_main.csv`` with ``hs_code`` normalised to strings.

    Codes whose string form is exactly 5 characters long get a single
    leading ``'0'`` prepended; every other code is only converted to ``str``.
    NOTE(review): presumably this restores a leading zero lost when the
    column was parsed as a number - confirm against the data source.

    Returns:
        The lookup ``DataFrame`` with a string ``hs_code`` column.
    """
    table = pd.read_csv("hscode_main.csv")
    table['hs_code'] = [
        '0' + code if len(code) == 5 else code
        for code in (str(raw) for raw in table['hs_code'])
    ]
    return table


df2 = load_hscode_table()

# Keep one retriever per browser session; load_data() itself is cached, so
# this is cheap after the first run.
if 'retriever' not in st.session_state:
    st.session_state.retriever = None
if st.session_state.retriever is None:
    st.session_state.retriever = load_data()

sentence = st.text_input("please enter description:")
# Skip blank / whitespace-only input instead of querying with no tokens.
if sentence.strip():
    results, _ = st.session_state.retriever.retrieve(
        bm25s.tokenize(sentence), k=5
    )
    # `results` has one row per query; a single query was submitted, so
    # row 0 holds its top-k documents.
    for document in results[0]:
        code = extract_hscode(document)
        if code is None:
            st.warning("No HS code found in a retrieved document.")
            continue
        filter_df = df2[df2['hs_code'] == code]
        if filter_df.empty:
            # Previously this case crashed with IndexError on .iloc[0].
            st.warning(f"No description found for HS code {code}.")
            continue
        answer = filter_df['full_description'].iloc[0]
        st.write("Hscode:", code)
        st.write("answer:", answer)