"""Translate the English sentences of several ABSA sentence-classification
datasets into Chinese via the Baidu Translate API, checkpointing results
alongside each source JSON file."""

import hashlib
import os
import random
import time

import requests
import tqdm

from utils.common.data_record import read_json, write_json


# Reuse one HTTP connection for all API calls.
session = requests.Session()


def translate(sentence):
    """Translate one English sentence to Chinese with the Baidu Translate API.

    Returns (src, dst): the source text echoed by the API and its translation.
    """
    app_id = '20221004001369410'
    salt = str(random.randint(1000000000, 9999999999))
    key = 'XEsBS6babmp9wz5bcoEs'

    # Baidu authenticates each request with sign = md5(appid + query + salt + key).
    sign = hashlib.md5(f'{app_id}{sentence}{salt}{key}'.encode('utf8')).hexdigest()

    response = session.get(
        'https://fanyi-api.baidu.com/api/trans/vip/translate',
        params={
            'q': sentence,
            'from': 'en',
            'to': 'zh',
            'appid': app_id,
            'salt': salt,
            'sign': sign
        }
    ).json()

    if 'trans_result' not in response:
        # Error responses carry 'error_code'/'error_msg' instead of 'trans_result'.
        raise RuntimeError(f'Baidu Translate API error: {response}')

    return response['trans_result'][0]['src'], response['trans_result'][0]['dst']
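

# A minimal retry sketch (an addition, not part of the original script): the
# endpoint can return transient errors such as rate-limit code 54003, so
# retrying a few times with a growing delay is a common pattern. The
# max_attempts/backoff values are illustrative assumptions, not values from
# the Baidu API documentation.
def translate_with_retry(sentence, max_attempts=3, backoff=2.0):
    for attempt in range(max_attempts):
        try:
            return translate(sentence)
        except RuntimeError:
            if attempt == max_attempts - 1:
                raise  # give up after the last attempt
            time.sleep(backoff * (attempt + 1))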


def gen_label_from_sen_cls_json(sen_cls_json_path):
    # Generate Chinese translations for every unique sentence in the file.

    texts = []
    anns = read_json(sen_cls_json_path)
    for v in anns.values():
        texts += [v['sentence']]
        # The API is queried one sentence per request, so an embedded newline
        # would silently split a sentence into multiple translation segments.
        assert '\n' not in texts[-1]

    # Deduplicate so each sentence is translated (and billed) only once.
    texts = list(set(texts))

    res_json = []

    for text in tqdm.tqdm(texts):
        time.sleep(1.2)  # stay under the API's per-second rate limit

        src_text, dst_text = translate(text)
        res_json += [{
            'src': src_text,
            'dst': dst_text
        }]

        # Checkpoint after every sentence so an interrupted run loses at most
        # one translation.
        write_json(sen_cls_json_path + '.translate_data', res_json, backup=False)
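

# Sketch of resuming from the checkpoint written above (an assumption, not in
# the original script, which instead restarts via the json_paths[23:] slice in
# __main__). Assumes read_json can load the list that write_json produced.
def load_translated_sentences(sen_cls_json_path):
    checkpoint_path = sen_cls_json_path + '.translate_data'
    if not os.path.exists(checkpoint_path):
        return set()
    # 'src' is the sentence as echoed back by the API.
    return {item['src'] for item in read_json(checkpoint_path)}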
        
    
if __name__ == '__main__':
    # res = translate('I am a doctor.\nHello world!')
    # print(res)
    
    data_dir_paths = {
        **{k: f'/data/zql/datasets/nlp_asc_19_domains/dat/absa/Bing5Domains/asc/{k.split("-")[1]}'
           for k in ['HL5Domains-ApexAD2600Progressive', 'HL5Domains-CanonG3',
                     'HL5Domains-CreativeLabsNomadJukeboxZenXtra40GB',
                     'HL5Domains-NikonCoolpix4300', 'HL5Domains-Nokia6610']},

        **{k: f'/data/zql/datasets/nlp_asc_19_domains/dat/absa/Bing3Domains/asc/{k.split("-")[1]}'
           for k in ['Liu3Domains-Computer', 'Liu3Domains-Router', 'Liu3Domains-Speaker']},

        **{k: f'/data/zql/datasets/nlp_asc_19_domains/dat/absa/Bing9Domains/asc/{k.split("-")[1]}'
           for k in [f'Ding9Domains-{d}'
                     for d in os.listdir('/data/zql/datasets/nlp_asc_19_domains/dat/absa/Bing9Domains/asc')]},

        **{f'SemEval-{k[0].upper()}{k[1:]}': f'/data/zql/datasets/nlp_asc_19_domains/dat/absa/XuSemEval/asc/14/{k}'
           for k in ['laptop', 'rest']},
    }
    
    json_paths = []
    for p in data_dir_paths.values():
        json_paths += [os.path.join(p, f'{split}.json') for split in ['train', 'dev', 'test']]

    assert all(os.path.exists(p) for p in json_paths)

    # The [23:] slice resumes from a partially completed earlier run.
    for p in tqdm.tqdm(json_paths[23:]):
        print(p)
        gen_label_from_sen_cls_json(p)