Diffstat (limited to 'ass1/binary_boyermoore.py')
 -rw-r--r--  ass1/binary_boyermoore.py  68
1 file changed, 28 insertions, 40 deletions
diff --git a/ass1/binary_boyermoore.py b/ass1/binary_boyermoore.py
index 4c41c77..df75d62 100644
--- a/ass1/binary_boyermoore.py
+++ b/ass1/binary_boyermoore.py
@@ -1,12 +1,6 @@
-# txt = "zzzzzzzzzabczzzzzzzzzz"
-# pat = "abczzzabc"
-# m = len(pat)
-# n = len(txt)
-# R = [[m for __ in range(m)] for _ in range(0, 26)]
-# good_suffix = [0 for _ in range(0, m+1)]
-
 import sys
 
+
 def alpha_number(char):
     if char == "0" or char == "1":
         return int(char)
@@ -22,6 +16,7 @@ def compare(string, i, end):
         if i+j == end or string[i+j] != string[j]:
             return j
 
+
 def condense(binary, offset=0, size=2):
     out = ""
     for i in range(offset, len(binary)-offset, size):
@@ -31,7 +26,6 @@ def condense(binary, offset=0, size=2):
     return out
 
 
-
 def gusfield(string):
     z = [0 for _ in string]
     z[0] = len(string)
@@ -59,20 +53,19 @@ def gusfield(string):
             l = i
     return z
 
+
 def gen_jump_table(pat):
     m = len(pat)
-    R = [[-1 for __ in range(m)] for _ in range(0, 256)]
+    R = [[-1 for __ in range(m)] for _ in range(0, 4)]  # A space for a, b, c and d used in groupings
     for j in range(m):
         for i in range(j+1):
             R[alpha_number(pat[i])][j] = i
     return R
 
+
 def gen_z_suffix(pat):
     return reverse(gusfield(reverse(pat)))+[0]
 
-# print(list(pat))
-# print(R)
-# print(Z)
 
 def gen_good_suffix(pat, Z):
     m = len(pat)
@@ -82,7 +75,6 @@ def gen_good_suffix(pat, Z):
         good_suffix[j] = i+1
     return good_suffix
 
-# print("g", good_suffix)
 
 def gen_matched_prefix(pat):
     m = len(pat)
@@ -100,7 +92,6 @@ def preprocess(pat):
     return R, good_suffix, matched_prefix
 
 
-
 def boyer_moore(pat, txt):
     R, good_suffix, matched_prefix = preprocess(pat)
     m = len(pat)
@@ -108,70 +99,65 @@ def boyer_moore(pat, txt):
     i = m-1
     j = 0
     occurrences = []
-    galils = 0
     comps = 0
     galil = False
     start = 0
     stop = 0
     while j <= n-m:
-        match = pat[i] == txt[j+i]
+        match = pat[i] == txt[j+i]  # The comparison
         comps += 1
         if match:
-            if galil and stop >= i > start:
-                galils += 1
+            if galil and stop >= i > start:  # Able to perform Galil
                 i = max(start-1, 0)
                 galil = False
-            if i == 0:
+            if i == 0:  # A full match of pat
                 good_suffix_shift = m - matched_prefix[1]
                 j += good_suffix_shift
                 occurrences.append(j)
                 i = m-1
             else:
-                i -= 1
+                i -= 1  # A match, but not a full match yet
         else:
             mismatched = txt[j + i]
-            bad_char_shift = i - R[alpha_number(mismatched)][i]
+            bad_char_shift = i - R[alpha_number(mismatched)][i]  # Bad character shift
             good_suffix_shift = 1
-            if good_suffix[i+1] > 0:
+            if good_suffix[i+1] > 0:  # Good Suffix case 1a
                 good_suffix_shift = m - good_suffix[i+1]
                 start = good_suffix[i+1] - m + i + 1
                 stop = good_suffix[i+1]
-            elif good_suffix[i+1] == 0:
+            elif good_suffix[i+1] == 0:  # Good Suffix case 1b (i.e. matched prefix)
                 good_suffix_shift = m - matched_prefix[i+1]
                 start = 0
                 stop = matched_prefix[i + 1]
-            best_shift = max(good_suffix_shift, bad_char_shift)
+            best_shift = max(good_suffix_shift, bad_char_shift)  # Choose the best shift
             j += best_shift
             galil = best_shift == good_suffix_shift
             i = m-1
-    print(comps)
     return comps, occurrences
 
+
 def two_to_the(n):
     return 1 << n
 
-def chunky_search(pat, txt, factor=2):
+
+def chunky_search(pat, txt):
+    # I settled on grouping in 2s, since higher groupings took more runs of BMs than comparisons it saved
+    # tl;dr: 2 was the fastest for me
+    factor = 2
     occurrences = []
     comps = 0
+    # Do two separate BMs for each binary offset of the grouped bits in txt (i.e. offset by 0 and 1 bits to the right)
     for offset in range(two_to_the(factor-1)):
-        padding = format(offset, f"0{factor-1}b") if len(pat) % factor else ""
+        padding = offset if len(pat) % factor else ""  # Padding to ensure each BM matches the correct groups
        augmented_pat = f"{pat}{padding}"
         c, o = boyer_moore(condense(augmented_pat, 0, factor), condense(txt, offset, factor))
         comps += c
-        print(offset, o)
-        occurrences += o
-    base_comps, base_occur = boyer_moore(pat, txt)
-    print(base_occur[:20])
-    print(occurrences[:10])
-    print("*"*20)
-    print(f"Chunky Optimisation: {len(occurrences)} occurences in {comps} comparisons.")
-    print(f"Normal: {len(base_occur)} occurences in {base_comps} comparisons.")
-    if base_comps > 0:
-        print(f"{comps * 100 / base_comps:.3f}% of normal Boyer-Moore")
-        print(f"{comps * 100 / 642096:.3f}% of their Boyer-Moore")
+        # Convert grouped match indices to normal ones with this O(n) operation
+        occurrences += list(map(lambda x: x*factor+offset, o))
     return comps, occurrences
 
+
 def read_args():
     with open(sys.argv[1], "r") as txt_file:
         txt = txt_file.read()
@@ -179,19 +165,21 @@ def read_args():
         pat = pat_file.read()
     return txt, pat
 
+
 def output_matches(occurrences):
     with open("output_binary_boyermoore.txt", "w") as file:
         for o in occurrences:
             file.write(f"{o}\n")
 
+
 def main():
-    factor = 2
     if len(sys.argv) < 3:
         print("Not enough arguments!")
     else:
         txt, pat = read_args()
-        comps, occurrences = chunky_search(pat, txt, factor)
+        comps, occurrences = chunky_search(pat, txt)
         print(comps)
         output_matches(occurrences)
 
+
 main()
\ No newline at end of file
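
For reference, here is a minimal, self-contained sketch of the grouping trick this commit settles on. It is not the committed code: find_all is a hypothetical brute-force stand-in for boyer_moore(), only even-length patterns are handled (the file above pads odd-length ones with an extra symbol), and condense is simplified. Bits are packed in pairs over the alphabet {a, b, c, d}, the condensed text is searched once per bit offset, and each grouped hit maps back to a bit index as j*factor + offset.

def condense(binary, offset=0, size=2):
    # Group bits starting at `offset` into letters: 00->a, 01->b, 10->c, 11->d.
    return "".join("abcd"[int(binary[i:i + size], 2)]
                   for i in range(offset, len(binary) - size + 1, size))


def find_all(pat, txt):
    # Brute-force stand-in for boyer_moore(): every index where pat occurs in txt.
    return [j for j in range(len(txt) - len(pat) + 1) if txt.startswith(pat, j)]


def chunky_search(pat, txt, factor=2):
    assert len(pat) % factor == 0, "sketch assumes an evenly groupable pattern"
    grouped_pat = condense(pat, 0, factor)
    occurrences = set()
    for offset in range(factor):  # bit offsets 0 and 1 when factor == 2
        for j in find_all(grouped_pat, condense(txt, offset, factor)):
            occurrences.add(j * factor + offset)  # map back to a bit index
    return sorted(occurrences)


print(chunky_search("0110", "001101100110"))  # -> [1, 4, 8]

Running it prints [1, 4, 8], the three places "0110" occurs in the sample text, and shows why both offset runs are needed: the hit at bit 1 is only visible in the offset-1 condensation, while the hits at bits 4 and 8 only surface at offset 0.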
