1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
|
# Alexander Occhipinti - 29994705 - FIT3155 S1/2021
import sys
def compare(string, i, end):
for j in range(end):
if i+j == end or string[i+j] != string[j]:
return j
def gusfield(string):
z = [0 for _ in string]
z[0] = len(string)
r = 0
l = 0
for i in range(1, len(string)):
if i == 1: # base case
z[1] = compare(string, i, len(string))
if z[1] > 0:
r = z[1] + 1
l = 1
elif i > r: # Case 1
z[i] = compare(string, i, len(string))
if z[i] > 0:
q = i + z[i]
r = q - 1
l = i
elif i <= r: # Case 2
if z[i-l] < r-i: # Case 2a
z[i] = z[i-l]
else: # Case 2b
q = compare(string, i, len(string))
z[i] = q
r = q
l = i
return z
def reverse(string):
return string[::-1]
def match(z_prefix, z_suffix, pat_length):
matches = []
l = pat_length+1
r = len(z_prefix)-1
while l < len(z_prefix): # while l hasn't reached the end
if z_prefix[l] != 0 and z_suffix[r] != 0:
distance = None
if z_prefix[l] == pat_length:
distance = 0 # The left and right pointer align at an exact pattern match
elif z_prefix[l] + z_suffix[r] >= pat_length - 1:
distance = 1 # The left and right pointer align at a match of distance one
if distance is not None:
matches.append((l-pat_length-1, distance))
l += 1
r += 1
# Progress each pointer until matches are found
if z_prefix[l] == 0:
l += 1
if z_suffix[r] == 0:
r -= 1
return matches
def edit_distance(txt, pat):
# Do Z algorithm twice, forwards (prefix) and backwards (suffix)
forwards = pat + "$" + txt
backwards = reverse(pat) + "$" + reverse(txt)
# Match the Z_prefix values with the Z_suffix backwards
tuples = match(gusfield(forwards), gusfield(backwards), len(pat))
print(f"Found {len(tuples)} matches of edit distance <= 1.")
output(tuples)
print("Done.")
def output(tuples):
OUTPUT_PATH = "output_editdist.txt"
with open(OUTPUT_PATH, "w") as file:
for pair in tuples:
file.write(f"{pair[0]} {pair[1]}\n")
def read_args():
with open(sys.argv[1], "r") as txt_file:
txt = txt_file.read()
with open(sys.argv[2], "r") as pat_file:
pat = pat_file.read()
return txt, pat
def main():
if len(sys.argv) < 3:
print("Not enough arguments!")
else:
txt, pat = read_args()
edit_distance(txt, pat)
main()
|