# File size: 2,949 Bytes
# b84549f
from utils.common.data_record import read_json, write_json
import requests
import random
import hashlib
import tqdm
import time
# Module-level HTTP session, created once at import time so that successive
# translate() calls can reuse the same connection pool.
session = requests.Session()


def translate(sentence, timeout=30):
    """Translate one English sentence to Chinese via the Baidu Translate API.

    Args:
        sentence: Text to translate; sent as the ``q`` query parameter.
        timeout: Seconds ``requests`` waits on the HTTP call before raising,
            so a stalled connection cannot block the caller indefinitely.

    Returns:
        A ``(src, dst)`` tuple taken from the first entry of the response's
        ``trans_result`` list.

    Raises:
        RuntimeError: If the response JSON has no ``trans_result`` key. The
            message contains the sentence and the full response payload.
    """
    # NOTE(review): the app id and key are hard-coded in source; consider
    # loading them from configuration instead of committing them.
    app_id = '20221004001369410'
    key = 'XEsBS6babmp9wz5bcoEs'
    salt = str(random.randint(1000000000, 9999999999))
    # Request signature: MD5 hex digest of appid + query + salt + key.
    sign = hashlib.md5(f'{app_id}{sentence}{salt}{key}'.encode('utf8')).hexdigest()

    response = session.get(
        'https://fanyi-api.baidu.com/api/trans/vip/translate',
        params={
            'q': sentence,
            'from': 'en',
            'to': 'zh',
            'appid': app_id,
            'salt': salt,
            'sign': sign,
        },
        timeout=timeout,
    ).json()

    if 'trans_result' not in response:
        raise RuntimeError(f'translation failed for {sentence!r}: {response}')

    # Only the first result entry is returned, even if the list holds more.
    first = response['trans_result'][0]
    return first['src'], first['dst']
def gen_label_from_sen_cls_json(sen_cls_json_path):
    """Translate every distinct sentence of an annotation file and save the pairs.

    Reads the JSON file at ``sen_cls_json_path``, collects the ``'sentence'``
    value of each entry, translates each distinct sentence with
    ``translate()``, and writes a list of ``{'src': ..., 'dst': ...}`` dicts
    to ``sen_cls_json_path + '.translate_data'``.

    Args:
        sen_cls_json_path: Path of the annotation JSON file. Its top level is
            iterated with ``.values()`` and each value is indexed with
            ``'sentence'``.

    Raises:
        ValueError: If any sentence contains a newline character.
    """
    # generate Chinese translation
    anns = read_json(sen_cls_json_path)

    texts = []
    for v in anns.values():
        sentence = v['sentence']
        # translate() returns only the first result entry; presumably the API
        # splits multi-line input into several entries, so reject it up front.
        # An explicit raise (unlike assert) still runs under ``python -O``.
        if '\n' in sentence:
            raise ValueError(f'sentence contains a newline: {sentence!r}')
        texts.append(sentence)

    # De-duplicate while keeping first-seen order. A set would reorder the
    # sentences differently on every run (string hash randomisation), which
    # makes the output file non-reproducible.
    texts = list(dict.fromkeys(texts))

    res_json = []
    for text in tqdm.tqdm(texts):
        # Pause before each request (presumably to respect an API rate limit).
        time.sleep(1.2)
        src_text, dst_text = translate(text)
        res_json.append({
            'src': src_text,
            'dst': dst_text,
        })

    write_json(sen_cls_json_path + '.translate_data', res_json, backup=False)
if __name__ == '__main__':
    # Script entry point: build the list of train/dev/test JSON files for
    # every dataset directory below and generate translation data for them.
    # res = translate('I am a doctor.\nHello world!')
    # print(res)
    import os

    # Dataset key -> directory containing that dataset's split files.
    # Insertion order matters: it fixes the order of json_paths below.
    data_dir_paths = {}
    for name in ['HL5Domains-ApexAD2600Progressive', 'HL5Domains-CanonG3',
                 'HL5Domains-CreativeLabsNomadJukeboxZenXtra40GB',
                 'HL5Domains-NikonCoolpix4300', 'HL5Domains-Nokia6610']:
        data_dir_paths[name] = \
            f'/data/zql/datasets/nlp_asc_19_domains/dat/absa/Bing5Domains/asc/{name.split("-")[1]}'
    for name in ['Liu3Domains-Computer', 'Liu3Domains-Router', 'Liu3Domains-Speaker']:
        data_dir_paths[name] = \
            f'/data/zql/datasets/nlp_asc_19_domains/dat/absa/Bing3Domains/asc/{name.split("-")[1]}'
    # One entry per directory found under the Bing9Domains folder.
    for d in os.listdir('/data/zql/datasets/nlp_asc_19_domains/dat/absa/Bing9Domains/asc'):
        name = f'Ding9Domains-{d}'
        data_dir_paths[name] = \
            f'/data/zql/datasets/nlp_asc_19_domains/dat/absa/Bing9Domains/asc/{name.split("-")[1]}'
    for k in ['laptop', 'rest']:
        # Key is the capitalised domain name, e.g. 'SemEval-Laptop'.
        data_dir_paths[f'SemEval-{k[0].upper()}{k[1:]}'] = \
            f'/data/zql/datasets/nlp_asc_19_domains/dat/absa/XuSemEval/asc/14/{k}'

    # Three split files per dataset directory, in directory order.
    json_paths = [
        os.path.join(dir_path, f'{split}.json')
        for dir_path in data_dir_paths.values()
        for split in ['train', 'dev', 'test']
    ]
    assert all(os.path.exists(path) for path in json_paths)
    # print(len(json_paths))
    # exit()

    # The first 23 paths are skipped (presumably already processed by an
    # earlier run -- confirm before re-running).
    remaining_paths = json_paths[23:]
    for p in tqdm.tqdm(remaining_paths):
        print(p)
        gen_label_from_sen_cls_json(p)