|
from utils.common.data_record import read_json, write_json |
|
import requests |
|
import random |
|
import hashlib |
|
import tqdm |
|
import time |
|
|
|
|
|
# Module-level HTTP session, intended for connection reuse across API calls.
session = requests.Session()
|
|
|
|
|
def translate(sentence):
    """Translate an English sentence to Chinese via the Baidu Translate API.

    Args:
        sentence: English source text (a single sentence).

    Returns:
        A ``(src, dst)`` tuple: the source text as echoed back by the API
        and its Chinese translation.

    Raises:
        RuntimeError: if the response carries no 'trans_result' key
            (e.g. auth failure or rate limiting).
    """
    # NOTE(review): credentials are hard-coded in source; move them to
    # environment variables / config before sharing this file.
    app_id = '20221004001369410'
    salt = str(random.randint(1000000000, 9999999999))
    key = 'XEsBS6babmp9wz5bcoEs'

    # Baidu API signature: MD5 hex digest of appid + query + salt + secret.
    sign = hashlib.md5(f'{app_id}{sentence}{salt}{key}'.encode('utf8')).hexdigest()

    # Use the module-level session (connection pooling) instead of a fresh
    # requests.get per call, and set a timeout so one stalled request
    # cannot hang the whole batch job.
    response = session.get(
        'https://fanyi-api.baidu.com/api/trans/vip/translate',
        params={
            'q': sentence,
            'from': 'en',
            'to': 'zh',
            'appid': app_id,
            'salt': salt,
            'sign': sign
        },
        timeout=30
    ).json()

    if 'trans_result' not in response:
        # Print the raw payload so the API error code is visible in logs.
        print(response)
        raise RuntimeError(f'Baidu translate API error: {response}')

    first = response['trans_result'][0]
    return first['src'], first['dst']
|
|
|
|
|
def gen_label_from_sen_cls_json(sen_cls_json_path):
    """Translate every unique sentence of a sentence-classification JSON file.

    Reads the annotation file, de-duplicates the sentences, translates each
    one via the Baidu API, and writes the (src, dst) pairs next to the input
    file as ``<path>.translate_data``.

    Args:
        sen_cls_json_path: path to an annotation JSON whose values are dicts
            containing at least a 'sentence' key.
    """
    anns = read_json(sen_cls_json_path)
    texts = [v['sentence'] for v in anns.values()]
    # Embedded newlines would corrupt downstream line-oriented handling.
    assert all('\n' not in t for t in texts)

    # sorted() (instead of a bare list(set(...))) makes the output order
    # deterministic across runs — set iteration order varies per process,
    # which breaks resuming an interrupted run by slicing the path list.
    texts = sorted(set(texts))

    res_json = []
    for text in tqdm.tqdm(texts):
        time.sleep(1.2)  # throttle to stay under the API rate limit

        src_text, dst_text = translate(text)
        res_json.append({
            'src': src_text,
            'dst': dst_text
        })

    write_json(sen_cls_json_path + '.translate_data', res_json, backup=False)
|
|
|
|
|
if __name__ == '__main__':

    import os

    # Build dataset-name -> data-directory mapping for all 19 ASC domains.
    hl5_names = [
        'HL5Domains-ApexAD2600Progressive', 'HL5Domains-CanonG3',
        'HL5Domains-CreativeLabsNomadJukeboxZenXtra40GB',
        'HL5Domains-NikonCoolpix4300', 'HL5Domains-Nokia6610',
    ]
    liu3_names = ['Liu3Domains-Computer', 'Liu3Domains-Router', 'Liu3Domains-Speaker']
    ding9_root = '/data/zql/datasets/nlp_asc_19_domains/dat/absa/Bing9Domains/asc'
    ding9_names = [f'Ding9Domains-{d}' for d in os.listdir(ding9_root)]

    data_dir_paths = {}
    for name in hl5_names:
        data_dir_paths[name] = (
            f'/data/zql/datasets/nlp_asc_19_domains/dat/absa/Bing5Domains/asc/{name.split("-")[1]}'
        )
    for name in liu3_names:
        data_dir_paths[name] = (
            f'/data/zql/datasets/nlp_asc_19_domains/dat/absa/Bing3Domains/asc/{name.split("-")[1]}'
        )
    for name in ding9_names:
        data_dir_paths[name] = f'{ding9_root}/{name.split("-")[1]}'
    for domain in ['laptop', 'rest']:
        data_dir_paths[f'SemEval-{domain[0].upper()}{domain[1:]}'] = (
            f'/data/zql/datasets/nlp_asc_19_domains/dat/absa/XuSemEval/asc/14/{domain}'
        )

    # Every dataset directory is expected to contain train/dev/test splits.
    json_paths = []
    for dir_path in data_dir_paths.values():
        for split in ['train', 'dev', 'test']:
            json_paths.append(os.path.join(dir_path, f'{split}.json'))

    assert all([os.path.exists(p) for p in json_paths])

    # NOTE(review): the [23:] slice looks like a manual resume point after a
    # partially completed run — confirm/remove before a fresh full run.
    for p in tqdm.tqdm(json_paths[23:]):
        print(p)
        gen_label_from_sen_cls_json(p)