import streamlit as st
from easynmt import EasyNMT
from nltk import word_tokenize
from simalign import SentenceAligner
import re
import penman
import amrlib
from amrlib.graph_processing.annotator import add_lemmas
from amrlib.alignments.rbw_aligner import RBWAligner


@st.cache_resource
def load_easynmt():
    return EasyNMT('opus-mt')


@st.cache_resource
def load_stog_model():
    return amrlib.load_stog_model(model_dir='model_stog')


@st.cache_resource
def load_gtos_model():
    return amrlib.load_gtos_model(model_dir='model_gtos')


# Find the graph node corresponding to targetWord:
def getTargetWordNode(segmentTokens, aligner, alignments, target):
    # Get the target word's English counterpart via the word alignments:
    if target in segmentTokens:
        targetIndexFr = segmentTokens.index(target)
        targetIndexesEn = [i for i in alignments['mwmf'] if i[0] == targetIndexFr]
        if len(targetIndexesEn) > 0:
            targetIndexEn = targetIndexesEn[0][1]
            # Build the full node label ("variable / concept") from the graph alignment:
            if aligner.alignments[targetIndexEn] is not None:
                nodeConcepts = [i for i in re.split(r",|\(|\"|'", str(aligner.alignments[targetIndexEn])) if i.strip() != '']
                return nodeConcepts[0] + ' / ' + nodeConcepts[2]
            else:
                return 'Error!'  # Alignment between target word in French and its English instance not found
        else:
            return 'Error!'  # Alignment between target word in French and its English instance not found
    else:
        return 'Error!'  # Alignment between target word in French and its English instance not found


# Extract the subgraph containing the target word, with the full path (all parent nodes) to it:
def getTargetWordSubGraphFullPath(amrGraph, target):
    # Drop metadata lines (starting with '#') and split the graph into tokens
    # on role labels and parentheses:
    stringTmp = [i + ' ' for i in re.split('\n', amrGraph) if i and i[0] != '#']
    stringTmp2 = []
    for s in stringTmp:
        stringTmp2 += [i for i in re.split(r'(:\w+\s|:\w+-\w+\s)', s) if i.strip() != '']
    string = []
    for s in stringTmp2:
        string += [i for i in re.split(r'(\(|\))', s) if i.strip() != '']

    openListGlobal = []   # Parentheses still open on the path above the target node
    openList = []         # Parentheses open inside the target subgraph
    subGraph = ""
    subGraphGlobal = []
    flag = False          # True while collecting the target subgraph
    stop = False          # True once the target subgraph has been closed
    for i in range(len(string)):
        if flag:
            if string[i] == '(':
                openList.append('(')
                subGraph += string[i]
            elif string[i] == ')':
                openList.pop()
                if openList == []:
                    flag = False
                    stop = True
                    subGraph += ')'
                    subGraphGlobal.append(subGraph)
                else:
                    subGraph += string[i]
            else:
                subGraph += string[i]
        else:
            if target in string[i].strip():
                flag = True
                subGraph += string[i]
                openList.append('(')
            else:
                if not stop and string[i] == '(':
                    openListGlobal.append('(')
                    subGraphGlobal.append(string[i])
                elif not stop and string[i] == ')':
                    # A sibling branch closed before the target was found; discard it:
                    openListGlobal.pop()
                    while subGraphGlobal[-1] != '(':
                        subGraphGlobal.pop()
                    subGraphGlobal.pop()
                    subGraphGlobal.pop()
                elif not stop:
                    subGraphGlobal.append(string[i])

    # Close any parentheses left open on the path to the target:
    for i in openListGlobal:
        if i == '(':
            subGraphGlobal.append(')')

    resultGraph = ""
    for i in subGraphGlobal:
        resultGraph += i

    # Fix the formatting:
    g = penman.decode(resultGraph)
    return penman.encode(g)


def main():
    st.header('Abstract Meaning Representation based summary of French text', divider='blue')

    segmentFr = st.text_area(
        "Text to summarize:",
        "Article 2 : Occupations ou utilisations du sol soumises à des conditions particulières\n\n"
        "2) Dans les périmètres en bordure des cours d’eau définis dans les annexes sanitaires du PLU :\n\n"
        "− Seules les clôtures en grillage pourront être autorisées à condition qu'elles soient conçues de\n"
        "manière à ne pas faire obstacle au libre écoulement des eaux.",
        height=170,
    )
    ## Alternative example:
    #segmentFr = st.text_area(
    #    "Text to summarize:",
    #    "Article 1: Le classement interdit tout changement d'affectation ou tout mode d'occupation du sol de nature à compromettre la conservation, la protection ou la création des boisements. Dans les bois, forêts ou parcs situés sur le territoire de communes où l'établissement d'un plan d'occupation des sols a été prescrit mais où ce plan n'a pas encore été rendu public, ainsi que dans tout espace boisé classé, les coupes et abattages d'arbres sont soumis à autorisation préalable.",
    #    height=170,
    #)

    targetWord = st.text_input('Keyword:', 'clôtures')
    ##targetWord = st.text_input('Keyword:', 'compromettre')

    if st.button('Summarize'):
        # Fix input formatting:
        segmentFr = segmentFr.replace('\n', ' ')

        # Translate the segment into English:
        model = load_easynmt()
        segmentEn = model.translate(segmentFr, source_lang='fr', target_lang='en')

        # Get an AMR graph:
        stog = load_stog_model()
        inputGraph = stog.parse_sents([segmentEn])

        # Get the tokenized representation of the segment in French:
        segmentFrTokens = word_tokenize(segmentFr, language='french')

        # Get the tokenized representation of the segment in English (lemmas aligned to the graph):
        penmanGraph = add_lemmas(inputGraph[0], snt_key='snt')
        aligner = RBWAligner.from_penman_w_json(penmanGraph)
        segmentEnTokens = aligner.lemmas

        # Get word alignments between the French original and the English translation:
        myaligner = SentenceAligner(model="bert", token_type="bpe", matching_methods="mai")
        alignments = myaligner.get_word_aligns(segmentFrTokens, segmentEnTokens)

        # Find the graph node corresponding to targetWord:
        targetNode = getTargetWordNode(segmentFrTokens, aligner, alignments, targetWord)

        # Check that targetNode is in the graph; otherwise fall back to the raw keyword
        # if it appears in the graph body (skipping the metadata line):
        errorFlag = False
        if targetNode not in inputGraph[0]:
            if targetWord in ''.join(inputGraph[0].split('\n')[1:]):
                targetNode = targetWord
            else:
                errorFlag = True

        # Extract the subgraph containing the target word, with the full path to it:
        if not errorFlag:
            if targetNode != 'Error!':
                targetSubGraph = getTargetWordSubGraphFullPath(inputGraph[0], targetNode)

                # Generate text from the extracted AMR graph:
                gtos = load_gtos_model()
                rulesEn, _ = gtos.generate([targetSubGraph])

                # Remove the leading "1. " numbering from the generated text:
                rulesEn = [re.sub(r'\d\. ', '', rulesEn[0])]

                # Translate it back to French:
                rulesFr = model.translate(rulesEn[0], source_lang='en', target_lang='fr')
                st.write("Summary: ", rulesFr)
            else:
                st.write('Error! Alignment between target word in French and its English instance not found')
        else:
            st.write('Error! Cannot find keyword in the graph')


if __name__ == "__main__":
    main()