koptelovmax committed on
Commit
b27de9c
1 Parent(s): 13d2655

Add application file

Browse files
README.md CHANGED
@@ -1,12 +1 @@
1
- ---
2
- title: Test2
3
- emoji: 🔥
4
- colorFrom: pink
5
- colorTo: pink
6
- sdk: streamlit
7
- sdk_version: 1.33.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ # amr_demo
 
 
 
 
 
 
 
 
 
 
 
app.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+ from easynmt import EasyNMT
4
+
5
+ from nltk import word_tokenize
6
+ from simalign import SentenceAligner
7
+
8
+ import re
9
+ import penman
10
+
11
+ import amrlib
12
+ from amrlib.graph_processing.annotator import add_lemmas
13
+ from amrlib.alignments.rbw_aligner import RBWAligner
14
+
15
+ # Find the graph node corresponding to targetWord:
16
def getTargetWordNode(segmentTokens, aligner, alignments, target):
    """Map a source-language token to the label of its aligned AMR node.

    Args:
        segmentTokens: Tokens of the original segment; only the first
            occurrence of ``target`` is considered.
        aligner: Object exposing an ``alignments`` sequence indexed by the
            aligned (translated) token position. An entry may be ``None``.
        alignments: Mapping whose ``'mwmf'`` entry holds
            ``(source_index, target_index)`` pairs.
        target: Token to look up in ``segmentTokens``.

    Returns:
        A ``'<first part> / <third part>'`` label built from the string form
        of the aligned graph element, or the sentinel string ``'Error!'``
        when any step of the lookup fails.
    """
    if target not in segmentTokens:
        return 'Error!'  # Target word is not a token of the segment
    targetIndexFr = segmentTokens.index(target)

    targetIndexesEn = [pair for pair in alignments['mwmf'] if pair[0] == targetIndexFr]
    if not targetIndexesEn:
        return 'Error!'  # No translated token is aligned with the target word
    targetIndexEn = targetIndexesEn[0][1]

    aligned = aligner.alignments[targetIndexEn]
    if aligned is None:
        return 'Error!'  # The translated token is not aligned with any graph element

    # Break the string form of the aligned element on commas, opening
    # parentheses and quotes, and drop the blank fragments in between.
    nodeConcepts = [part for part in re.split(r"""[,("']""", str(aligned)) if part.strip() != '']
    if len(nodeConcepts) < 3:
        return 'Error!'  # Unexpected element format: cannot build a node label
    return nodeConcepts[0] + ' / ' + nodeConcepts[2]
35
+
36
+ # Extract the subgraph containing the target word, together with the full path (all ancestor nodes) leading to it:
37
def getTargetWordSubGraphFullPath(amrGraph, target):
    """Extract the subgraph at ``target`` together with the path leading to it.

    The graph text is tokenised into parentheses, role labels and the text in
    between. Tokens are kept while walking towards the first token that
    contains ``target``; every branch that closes before that point is
    discarded. From the matching token on, everything up to the parenthesis
    that closes it is kept verbatim, and the still-open ancestors are closed.

    Args:
        amrGraph: Graph as a multi-line string. Empty lines and lines starting
            with ``'#'`` are ignored.
        target: Text searched for (as a substring) in the graph tokens.

    Returns:
        The reduced graph after a ``penman.decode`` / ``penman.encode``
        round trip, which normalises the formatting.

    Raises:
        ValueError: If no token contains ``target`` or the matching subgraph
            is never closed.
    """
    roleSplitter = re.compile(r'(:\w+\s|:\w+-\w+\s)')
    parenSplitter = re.compile(r'(\(|\))')

    # Tokenise. The space appended to each line keeps the last token of one
    # line apart from the first token of the next once tokens are re-joined.
    tokens = []
    for line in amrGraph.split('\n'):
        if not line or line.startswith('#'):
            continue
        for chunk in roleSplitter.split(line + ' '):
            if chunk.strip():
                tokens.extend(piece for piece in parenSplitter.split(chunk) if piece.strip())

    pathTokens = []    # tokens kept on the way from the root to the target
    openAncestors = 0  # '(' currently open in pathTokens
    captured = []      # tokens of the target subgraph
    depth = 0          # nesting depth inside the target subgraph (0 = not capturing)
    found = False

    for token in tokens:
        if depth:
            captured.append(token)
            if token == '(':
                depth += 1
            elif token == ')':
                depth -= 1
                if depth == 0:
                    pathTokens.append(''.join(captured))
                    found = True
                    # Stop here: a later token containing the target must not
                    # restart the capture and duplicate the collected text.
                    break
        elif target in token.strip():
            captured.append(token)
            depth = 1
        elif token == '(':
            openAncestors += 1
            pathTokens.append(token)
        elif token == ')':
            # A branch closed without containing the target: drop its tokens,
            # its opening parenthesis and the token that introduced it.
            openAncestors -= 1
            while pathTokens and pathTokens[-1] != '(':
                pathTokens.pop()
            if pathTokens:
                pathTokens.pop()
            if pathTokens:
                pathTokens.pop()
        else:
            pathTokens.append(token)

    if not found:
        raise ValueError(
            'target %r does not open a complete subgraph in the AMR graph' % (target,)
        )

    # The captured text already ends with the ')' that closes the innermost
    # open ancestor, so only the remaining ancestors still need closing.
    pathTokens.append(')' * max(openAncestors - 1, 0))
    resultGraph = ''.join(pathTokens)

    # Fix the formatting:
    return penman.encode(penman.decode(resultGraph))
101
+
102
# NOTE(review): this commit adds the model files under model_stog/ and
# model_gtos/, while the loaders below read resources/model_stog and
# resources/model_gtos - confirm which layout is the deployed one.

@st.cache_resource
def _loadTranslator():
    """Create the EasyNMT translation model once and reuse it across reruns."""
    return EasyNMT('opus-mt')


@st.cache_resource
def _loadParser():
    """Load the sentence-to-graph model once and reuse it across reruns."""
    return amrlib.load_stog_model(model_dir='resources/model_stog')


@st.cache_resource
def _loadGenerator():
    """Load the graph-to-sentence model once and reuse it across reruns."""
    return amrlib.load_gtos_model(model_dir='resources/model_gtos')


@st.cache_resource
def _loadWordAligner():
    """Create the word aligner once and reuse it across reruns."""
    return SentenceAligner(model="bert", token_type="bpe", matching_methods="mai")


def main():
    """Render the demo page and, on request, summarise the text around a keyword.

    Pipeline: translate the French text to English, parse it into a graph,
    align the French tokens with the lemmas of the parsed sentence, locate the
    graph node for the keyword, reduce the graph to that node's subgraph plus
    the path leading to it, generate English text from the reduced graph and
    translate that text back to French.
    """
    st.header('Abstract Meaning Representation based summary of French text', divider='blue')

    segmentFr = st.text_area(
        "Text to summarize:",
        "Article 2 : Occupations ou utilisations du sol soumises à des conditions particulières\n\n"
        "2) Dans les périmètres en bordure des cours d’eau définis dans les annexes sanitaires du PLU :\n\n"
        "− Seules les clôtures en grillage pourront être autorisées à condition qu'elles soient conçues de\n"
        "manière à ne pas faire obstacle au libre écoulement des eaux.",
        height=170,
    )
    targetWord = st.text_input('Keyword:', 'clôtures')

    if not st.button('Summarize'):
        return

    # Fix input formatting:
    segmentFr = segmentFr.replace('\n', ' ')

    # Translate segment into English:
    model = _loadTranslator()
    segmentEn = model.translate(segmentFr, source_lang='fr', target_lang='en')

    # Get an AMR graph:
    amrGraph = _loadParser().parse_sents([segmentEn])[0]

    # Get tokenized representation of segment in French:
    segmentFrTokens = word_tokenize(segmentFr, language='french')

    # Get tokenized representation of segment in English:
    penmanGraph = add_lemmas(amrGraph, snt_key='snt')
    aligner = RBWAligner.from_penman_w_json(penmanGraph)
    segmentEnTokens = aligner.lemmas

    # Get alignments between original version and translation:
    alignments = _loadWordAligner().get_word_aligns(segmentFrTokens, segmentEnTokens)

    # Find a node corresponding targetWord in the graph:
    targetNode = getTargetWordNode(segmentFrTokens, aligner, alignments, targetWord)
    alignmentFailed = targetNode == 'Error!'

    if alignmentFailed or targetNode not in amrGraph:
        # Fall back to the raw keyword when it occurs in the graph text. The
        # first line is skipped - presumably the '# ::snt' metadata line,
        # which would otherwise match words of the sentence itself.
        if targetWord in ''.join(amrGraph.split('\n')[1:]):
            targetNode = targetWord
        elif alignmentFailed:
            st.write('Error! Alignment between target word in French and its English instance not found')
            return
        else:
            st.write('Error! Cannot find keyword in the graph')
            return

    # Extract a subgraph containing target word with full path (all the node) to it:
    targetSubGraph = getTargetWordSubGraphFullPath(amrGraph, targetNode)

    # Generate text from given AMR-graph:
    rulesEn, _ = _loadGenerator().generate([targetSubGraph])

    # Remove enumeration markers such as "1. " from the text. The dot is
    # escaped so that only a literal period after the number matches.
    summaryEn = re.sub(r'\d+\. ', '', rulesEn[0])

    # Translate it back to French
    rulesFr = model.translate(summaryEn, source_lang='en', target_lang='fr')

    st.write("Summary: ", rulesFr)


if __name__ == "__main__":
    main()
model_gtos/amrlib_meta.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type":"gtos",
3
+ "version":"0.1.0",
4
+ "date":"2020-12-30",
5
+ "inference_module":".generate_t5wtense.inference",
6
+ "inference_class":"Inference",
7
+ "model_fn":"pytorch_model.bin",
8
+ "kwargs":{}
9
+ }
model_gtos/config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "t5-base",
3
+ "architectures": [
4
+ "T5ForConditionalGeneration"
5
+ ],
6
+ "d_ff": 3072,
7
+ "d_kv": 64,
8
+ "d_model": 768,
9
+ "decoder_start_token_id": 0,
10
+ "dropout_rate": 0.1,
11
+ "eos_token_id": 1,
12
+ "feed_forward_proj": "relu",
13
+ "initializer_factor": 1.0,
14
+ "is_encoder_decoder": true,
15
+ "layer_norm_epsilon": 1e-06,
16
+ "model_type": "t5",
17
+ "n_positions": 512,
18
+ "num_decoder_layers": 12,
19
+ "num_heads": 12,
20
+ "num_layers": 12,
21
+ "output_past": true,
22
+ "pad_token_id": 0,
23
+ "relative_attention_num_buckets": 32,
24
+ "task_specific_params": {
25
+ "translation_amr_to_text": {
26
+ "corpus_dir": "data/tdata_generate_t5wtense/",
27
+ "max_in_len": 512,
28
+ "max_out_len": 90,
29
+ "model_name_or_path": "t5-base",
30
+ "train_fn": "train.txt.features.nowiki.tdata",
31
+ "valid_fn": "dev.txt.features.nowiki.tdata"
32
+ }
33
+ },
34
+ "use_cache": true,
35
+ "vocab_size": 32128
36
+ }
model_gtos/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df94c52acfeb55f655113784b6ccc324615dfb9e68b8e69c9c3a05d9afaa4954
3
+ size 891734433
model_gtos/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c00118711daf521f494ac5b629f8b170e607cc9741fbdcaa05fe4938dbf4432
3
+ size 1775
model_stog/amrlib_meta.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type":"stog",
3
+ "version":"0.1.0",
4
+ "date":"2022-02-16",
5
+ "inference_module":".parse_xfm.inference",
6
+ "inference_class":"Inference",
7
+ "model_fn":"pytorch_model.bin",
8
+ "base_model":"facebook/bart-large",
9
+ "kwargs":{}
10
+ }
model_stog/config.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "facebook/bart-large",
3
+ "activation_dropout": 0.1,
4
+ "activation_function": "gelu",
5
+ "add_bias_logits": false,
6
+ "add_final_layer_norm": false,
7
+ "architectures": [
8
+ "BartForConditionalGeneration"
9
+ ],
10
+ "attention_dropout": 0.1,
11
+ "bos_token_id": 0,
12
+ "classif_dropout": 0.1,
13
+ "classifier_dropout": 0.0,
14
+ "d_model": 1024,
15
+ "decoder_attention_heads": 16,
16
+ "decoder_ffn_dim": 4096,
17
+ "decoder_layerdrop": 0.0,
18
+ "decoder_layers": 12,
19
+ "decoder_start_token_id": 2,
20
+ "dropout": 0.1,
21
+ "early_stopping": true,
22
+ "encoder_attention_heads": 16,
23
+ "encoder_ffn_dim": 4096,
24
+ "encoder_layerdrop": 0.0,
25
+ "encoder_layers": 12,
26
+ "eos_token_id": 2,
27
+ "forced_eos_token_id": 2,
28
+ "gradient_checkpointing": false,
29
+ "id2label": {
30
+ "0": "LABEL_0",
31
+ "1": "LABEL_1",
32
+ "2": "LABEL_2"
33
+ },
34
+ "init_std": 0.02,
35
+ "is_encoder_decoder": true,
36
+ "label2id": {
37
+ "LABEL_0": 0,
38
+ "LABEL_1": 1,
39
+ "LABEL_2": 2
40
+ },
41
+ "max_position_embeddings": 1024,
42
+ "model_type": "bart",
43
+ "normalize_before": false,
44
+ "num_beams": 4,
45
+ "num_hidden_layers": 12,
46
+ "pad_token_id": 1,
47
+ "scale_embedding": false,
48
+ "task_specific_params": {
49
+ "parse_amr": {
50
+ "corpus_dir": "data/tdata_xfm/",
51
+ "custom_save_checkpoint": true,
52
+ "eval_batch_size": 32,
53
+ "eval_fn": "dev.txt.nowiki",
54
+ "eval_num_beams": 1,
55
+ "first_eval_epoch": 1,
56
+ "max_in_len": 1024,
57
+ "max_out_len": 1024,
58
+ "max_train_graph_len": 512,
59
+ "max_train_sent_len": 100,
60
+ "model_name_or_path": "facebook/bart-large",
61
+ "save_at_end": false,
62
+ "save_tokenizer": false,
63
+ "train_fn": "train.txt.nowiki"
64
+ }
65
+ },
66
+ "torch_dtype": "float32",
67
+ "transformers_version": "4.16.2",
68
+ "use_cache": true,
69
+ "vocab_size": 50265
70
+ }
model_stog/model_parse_xfm_bart_large.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ { "gen_args" :
2
+ {
3
+ "model_name_or_path" : "facebook/bart-large",
4
+ "corpus_dir" : "data/tdata_xfm/",
5
+ "train_fn" : "train.txt.nowiki",
6
+ "eval_fn" : "dev.txt.nowiki",
7
+ "custom_save_checkpoint" : true,
8
+ "save_tokenizer" : false,
9
+ "save_at_end" : false,
10
+ "first_eval_epoch" : 1,
11
+ "eval_batch_size" : 32,
12
+ "eval_num_beams" : 1,
13
+ "max_in_len" : 1024,
14
+ "max_out_len" : 1024,
15
+ "max_train_sent_len" : 100,
16
+ "max_train_graph_len" : 512
17
+ },
18
+ "model_args":
19
+ {
20
+ "no_repeat_ngram_size" : 0
21
+ },
22
+ "hf_args" :
23
+ {
24
+ "output_dir" : "data/model_parse_xfm",
25
+ "save_strategy" : "epoch",
26
+ "evaluation_strategy" : "epoch",
27
+ "fp16" : true,
28
+ "group_by_length" : true,
29
+ "do_train" : true,
30
+ "do_eval" : true,
31
+ "save_total_limit" : 1,
32
+ "logging_steps" : 300,
33
+ "num_train_epochs" : 16,
34
+ "per_device_train_batch_size" : 8,
35
+ "gradient_accumulation_steps" : 4,
36
+ "weight_decay" : 0.004,
37
+ "learning_rate" : 5e-5,
38
+ "max_grad_norm" : 1.0,
39
+ "warmup_steps" : 5200,
40
+ "seed" : 42
41
+ }
42
+ }
model_stog/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9bdcde696e63224ba56853469689d2e22e64c324eedee6e48348e806eedd45b
3
+ size 1625557313
requirements.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ penman
3
+ torch
4
+ numpy
5
+ spacy
6
+ tqdm
7
+ transformers
8
+ smatch
9
+ cached_property
10
+ networkx
11
+ nltk
12
+ unidecode
13
+ requests
14
+ word2number
15
+ amrlib
16
+ easynmt
17
+ simalign
18
+ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz#egg=en_core_web_sm