From 556fe538ddd5580f966d31d3167ecc764434fcb9 Mon Sep 17 00:00:00 2001 From: akiyamn Date: Wed, 24 Mar 2021 20:22:45 +1100 Subject: Clean boyer-moore --- ass1/binary_boyermoore.py | 166 ++++++++++++++++++++++++++-------------------- 1 file changed, 93 insertions(+), 73 deletions(-) diff --git a/ass1/binary_boyermoore.py b/ass1/binary_boyermoore.py index 0c48b78..42d7d01 100644 --- a/ass1/binary_boyermoore.py +++ b/ass1/binary_boyermoore.py @@ -1,9 +1,9 @@ -txt = "zzzzzzzzzbaabzzzzzzzzzzzzzzzbaab" -pat = "baab" -m = len(pat) -n = len(txt) -R = [[m for __ in range(m)] for _ in range(0, 26)] -good_suffix = [0 for _ in range(0, m+1)] +# txt = "zzzzzzzzzbaabzzzzzzzzzzzzzzzbaab" +# pat = "baab" +# m = len(pat) +# n = len(txt) +# R = [[m for __ in range(m)] for _ in range(0, 26)] +# good_suffix = [0 for _ in range(0, m+1)] def alpha_number(char): return ord(char) - 97 @@ -46,73 +46,93 @@ def gusfield(string): l = i return z -for j in range(m): - for i in range(j): - print(pat[i], i) - R[alpha_number(pat[i])][j] = i - - -Z = reverse(gusfield(reverse(pat)))+[0] -print(list(pat)) -print(R) -print(Z) - -for i in range(m): - j = m - Z[i] - good_suffix[j] = i+1 - -print("g", good_suffix) - -matched_prefix = gusfield(pat)+[0] -for i in range(m-1, -1, -1): - matched_prefix[i] = max(matched_prefix[i], matched_prefix[i+1]) - -print(matched_prefix) - -print("="*15) - -print(5*" " + txt) - -j = 0 -occurence = 0 -while j <= n-m: - matches = 0 +def gen_jump_table(pat): + m = len(pat) + R = [[m for __ in range(m)] for _ in range(0, 26)] + for j in range(m): + for i in range(j): + R[alpha_number(pat[i])][j] = i + return R + +def gen_z_suffix(pat): + return reverse(gusfield(reverse(pat)))+[0] + +# print(list(pat)) +# print(R) +# print(Z) + +def gen_good_suffix(pat, Z): + m = len(pat) + good_suffix = [0 for _ in range(0, m + 1)] + for i in range(m): + j = m - Z[i] + good_suffix[j] = i+1 + return good_suffix + +# print("g", good_suffix) + +def gen_matched_prefix(pat): + m = len(pat) + matched_prefix = gusfield(pat)+[0] for i in range(m-1, -1, -1): - print(f"{j=} {' ' * j}", end="") - for x in range(len(pat)): - if x == i: - print(pat[x].upper(), end="") + matched_prefix[i] = max(matched_prefix[i], matched_prefix[i+1]) + return matched_prefix + + +def preprocess(pat): + R = gen_jump_table(pat) + Z = gen_z_suffix(pat) + good_suffix = gen_good_suffix(pat, Z) + matched_prefix = gen_matched_prefix(pat) + return R, good_suffix, matched_prefix + +def boyer_moore(pat, txt): + R, good_suffix, matched_prefix = preprocess(pat) + m = len(pat) + n = len(txt) + j = 0 + occurrence = 0 + + print("="*15) + print(5*" " + txt) + + while j <= n-m: + matches = 0 + for i in range(m-1, -1, -1): + print(f"{j=} {' ' * j}", end="") + for x in range(len(pat)): + if x == i: + print(pat[x].upper(), end="") + else: + print(pat[x], end="") + print() + match = pat[i] == txt[j+i] + if match: + matches += 1 + if matches == m: + good_suffix_shift = m - matched_prefix[1] + j += good_suffix_shift + i = m-1 + occurrence += 1 + break else: - print(pat[x], end="") - print() - match = pat[i] == txt[j+i] - if match: - matches += 1 - if matches == m: - # print(f" Match at {j}", end="") - good_suffix_shift = m - matched_prefix[1] - j += good_suffix_shift - i = m-1 - occurence += 1 + mismatched = txt[j+i] + bad_char_shift = R[alpha_number(mismatched)][i]-1 # Used to be i-R + good_suffix_shift = 1 + if good_suffix[i+1] > 0: + good_suffix_shift = m - good_suffix[i+1] + elif good_suffix[i+1] == 0: + good_suffix_shift = m - matched_prefix[i+1] + elif matches == m: + print(f"Match at {j}") + good_suffix_shift = m - matched_prefix[1] + j += max(good_suffix_shift, bad_char_shift) break - # print(i) - else: - mismatched = txt[j+i] - bad_char_shift = R[alpha_number(mismatched)][i]-1 # Used to be i-R - good_suffix_shift = 1 - if good_suffix[i+1] > 0: - good_suffix_shift = m - good_suffix[i+1] - elif good_suffix[i+1] == 0: - good_suffix_shift = m - matched_prefix[i+1] - elif matches == m: - print(f"Match at {j}") - good_suffix_shift = m - matched_prefix[1] - j += max(good_suffix_shift, bad_char_shift) - break - # if j > n-m: - # break -print(f"It found {occurence} occurences.") - -print(f"\n {list(range(m))}") -for i, a in enumerate(R): - print(chr(i+97), a) \ No newline at end of file + + print(f"It found {occurrence} occurences.") + print(f"\n {list(range(m))}") + for i, a in enumerate(R): + print(chr(i+97), a) + +pat = "abbabba" +boyer_moore(pat, "zzzzzzzzzzzzzzzabbazzzzzzzzzzzzzzzzzzzzz") \ No newline at end of file -- cgit v1.2.3