File size: 2,330 Bytes
4738a88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
def _first_mismatch(left, right):
    """Return the first index where *left* and *right* differ, or None.

    Only the overlapping part (the length of the shorter list) is compared.
    """
    return next((i for i, (a, b) in enumerate(zip(left, right)) if a != b), None)


def get_span(orig, new, editType):
    """Locate the word spans affected by a single edit between two sentences.

    Both sentences are tokenized with ``str.split(" ")`` and all indices refer
    to positions in those token lists. Spans are ``[start, end]`` with both
    ends inclusive.

    * ``"deletion"``: ``orig_span`` covers the removed words in ``orig``;
      ``new_span`` is ``[start - 1, start]``, the two words in ``new`` that
      surround the removed part.
    * ``"insertion"``: the mirror image of deletion -- ``new_span`` covers the
      added words in ``new`` and ``orig_span`` is the surrounding pair in
      ``orig``.
    * ``"substitution"``: each span runs from the first differing word (scanned
      from the left) to the last differing word (scanned from the right) of
      its own sentence.

    For deletion/insertion the surrounding pair may contain ``-1`` (edit at
    the very beginning) or the length of the shorter sentence (edit at the
    very end); callers must treat those as "no neighbour on that side".
    The edit is assumed to be one contiguous run of words; this is not
    verified.

    Args:
        orig: The original sentence.
        new: The edited sentence.
        editType: One of ``"deletion"``, ``"insertion"``, ``"substitution"``.

    Returns:
        A tuple ``(orig_span, new_span)`` of two-element lists.

    Raises:
        AssertionError: If the sentence lengths contradict ``editType``
            (deletion/insertion), or if no differing word is found in a
            substitution.
        RuntimeError: If ``editType`` is unknown, or if a substitution has no
            differing word when scanned from the right.
    """
    orig_list = orig.split(" ")
    new_list = new.split(" ")

    if editType == "deletion":
        # Raised explicitly (instead of `assert`) so the check survives `python -O`.
        if len(orig_list) <= len(new_list):
            raise AssertionError(
                f"the edit type is deletion, but new is not shorter than original:\n new: {new}\n orig: {orig}"
            )
        diff = len(orig_list) - len(new_list)
        start = _first_mismatch(orig_list, new_list)
        if start is None:
            # Every word of `new` matches a prefix of `orig`, so the removed
            # words are the tail of `orig`.
            start = len(new_list)
        orig_span = [start, start + diff - 1]  # the removed words themselves
        new_span = [start - 1, start]  # the two words surrounding the removal
        return orig_span, new_span

    if editType == "insertion":
        if len(orig_list) >= len(new_list):
            raise AssertionError(
                f"the edit type is insertion, but the new is not longer than the original:\n new: {new}\n orig: {orig}"
            )
        diff = len(new_list) - len(orig_list)
        start = _first_mismatch(orig_list, new_list)
        if start is None:
            # `orig` is a prefix of `new`, so the words were appended at the end.
            start = len(orig_list)
        new_span = [start, start + diff - 1]  # start == end for a one-word insertion
        orig_span = [start - 1, start]
        return orig_span, new_span

    if editType == "substitution":
        start = _first_mismatch(orig_list, new_list)
        if start is None:
            # Same message the original assertion produced when nothing differed.
            raise AssertionError(f"new_span: {[]}, orig_span: {[]}")
        # Scan from the right to find where the two sentences re-align.
        back = _first_mismatch(orig_list[::-1], new_list[::-1])
        if back is None:
            raise RuntimeError(
                f"wrong editing with the specified edit type:\n original: {orig}\n new: {new}\n, editType: {editType}"
            )
        orig_span = [start, len(orig_list) - back - 1]
        new_span = [start, len(new_list) - back - 1]
        return orig_span, new_span

    raise RuntimeError(f"editType unknown: {editType}")