aboutsummaryrefslogtreecommitdiff
path: root/ass1/editdist.py
blob: c00fb69e84d4b07731b8e944faf32e8fb320eba9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import sys

OUTPUT_PATH = "output_editdist.txt"

def compare(string, i, end):
    for j in range(end):
        if i+j == end or string[i+j] != string[j]:
            return j


def gusfield(string):
    z = [0 for _ in string]
    z[0] = len(string)
    r = 0
    l = 0
    for i in range(1, len(string)):
        if i == 1:  # base case
            z[1] = compare(string, i, len(string))
            if z[1] > 0:
                r = z[1] + 1
                l = 1
        elif i > r: # Case 1
            z[i] = compare(string, i, len(string))
            if z[i] > 0:
                q = i + z[i]
                r = q - 1
                l = i
        elif i <= r:  # Case 2
            if z[i-l] < r-i:  # Case 2a
                z[i] = z[i-l]
            else:  # Case 2b
                q = compare(string, i, len(string))
                z[i] = q
                r = q
                l = i
    return z


def reverse(string):
    return string[::-1]
    # new = " " * len(string)
    # for i in range(len(new)):
    #     new[i] = string[-i]

    # return [string[c] for c in range(len(string)-1, -1, -1)]


def comapre_zs(z_prefix, z_suffix, pat_length):
    with open(OUTPUT_PATH, "w") as file:
        for i in range(pat_length+1, len(z_prefix)-pat_length-1):
            distance = None
            if z_prefix[i] == pat_length:
                distance = 0
            elif z_prefix[i] + z_suffix[i] >= pat_length-1:
                distance = 1
            if distance is not None:
                file.write(f"{i-pat_length-1} {distance}\n")
                print(f"===={i-pat_length-1} {distance}")
            print(i-pat_length-1, z_prefix[i], z_suffix[i])

def match(z_prefix, z_suffix, pat_length):
    matches = []
    l = pat_length+1
    r = len(z_prefix)-1
    while l < len(z_prefix): # while l hasn't reached the end
        if z_prefix[l] != 0 and z_suffix[r] != 0:
            distance = None
            if z_prefix[l] == pat_length:
                distance = 0
            elif z_prefix[l] + z_suffix[r] >= pat_length - 1:
                distance = 1
            if distance is not None:
                matches.append((l-pat_length-1, distance))
            l += 1
            r += 1
        if z_prefix[l] == 0:
            l += 1
        if z_suffix[r] == 0:
            r -= 1
    return matches


def edit_distance(txt, pat):
    forwards = pat + "$" + txt
    backwards = reverse(pat) + "$" + reverse(txt)
    print(list(forwards))
    print(list(backwards))
    print(gusfield(forwards))
    print(gusfield(backwards))
    print("="*15)
    # comapre_zs(gusfield(forwards), gusfield(backwards), len(pat))
    print(match(gusfield(forwards), gusfield(backwards), len(pat)))

def read_args():
    with open(sys.argv[1], "r") as txt_file:
        txt = txt_file.read()
    with open(sys.argv[2], "r") as pat_file:
        pat = pat_file.read()
    return txt, pat

def main():
    if len(sys.argv) < 3:
        print("Not enough arguments!")
    else:
        txt, pat = read_args()
        edit_distance(txt, pat)

if __name__ == '__main__':
    main()