# Alexander Occhipinti - 29994705 - FIT3155 S1/2021

import sys


def alpha_number(char):
    # Map a condensed character 'a'..'d' to 0..3
    return ord(char) - 97


def reverse(string):
    return string[::-1]


def compare(string, i, end):
    # Explicitly count how many characters of string[i:] match the prefix of string
    for j in range(end):
        if i + j == end or string[i+j] != string[j]:
            return j


def condense(binary, offset=0, size=2):
    # Group the binary string into size-bit chunks (starting at offset) and encode each
    # full chunk as a letter from 'a' (for size 2: "00" -> 'a', "01" -> 'b', "10" -> 'c', "11" -> 'd')
    out = ""
    for i in range(offset, len(binary), size):
        chunk = binary[i:i+size]
        if len(chunk) == size:
            out += chr(97 + int(chunk, 2))
    return out


def gusfield(string):
    # Gusfield's Z-algorithm: z[i] is the length of the longest substring starting at i
    # that matches a prefix of string
    z = [0 for _ in string]
    z[0] = len(string)
    r = 0
    l = 0
    for i in range(1, len(string)):
        if i == 1:  # base case
            z[1] = compare(string, i, len(string))
            if z[1] > 0:
                r = z[1] + 1
                l = 1
        elif i > r:  # Case 1
            z[i] = compare(string, i, len(string))
            if z[i] > 0:
                q = i + z[i]
                r = q - 1
                l = i
        elif i <= r:  # Case 2
            if z[i-l] < r-i:  # Case 2a
                z[i] = z[i-l]
            else:  # Case 2b
                q = compare(string, i, len(string))
                z[i] = q
                r = q
                l = i
    return z


def gen_jump_table(pat):
    m = len(pat)
    R = [[-1 for __ in range(m)] for _ in range(4)]  # A space for a, b, c and d used in groupings
    for j in range(m):
        for i in range(j+1):
            R[alpha_number(pat[i])][j] = i
    return R


def gen_z_suffix(pat):
    # Z-suffix values: length of the longest substring ending at each position that matches a suffix of pat
    return reverse(gusfield(reverse(pat))) + [0]


def gen_good_suffix(pat, Z):
    # good_suffix[j] holds the (1-based) end position of the rightmost copy of the suffix pat[j:]
    m = len(pat)
    good_suffix = [0 for _ in range(0, m + 1)]
    for i in range(m):
        j = m - Z[i]
        good_suffix[j] = i + 1
    return good_suffix


def gen_matched_prefix(pat):
    # matched_prefix[i]: length of the longest suffix of pat[i:] that is also a prefix of pat
    m = len(pat)
    z = gusfield(pat)
    matched_prefix = [0 for _ in range(m + 1)]
    for i in range(m-1, -1, -1):
        # Only a Z-box that reaches the end of pat gives a suffix that is also a prefix
        matched_prefix[i] = z[i] if i + z[i] == m else matched_prefix[i+1]
    return matched_prefix


def preprocess(pat):
    R = gen_jump_table(pat)
    Z = gen_z_suffix(pat)
    good_suffix = gen_good_suffix(pat, Z)
    matched_prefix = gen_matched_prefix(pat)
    return R, good_suffix, matched_prefix


def boyer_moore(pat, txt):
    R, good_suffix, matched_prefix = preprocess(pat)
    m = len(pat)
    n = len(txt)
    i = m-1
    j = 0
    occurrences = []
    comps = 0
    galil = False
    start = 0
    stop = 0
    while j <= n-m:
        match = pat[i] == txt[j+i]  # The comparison
        comps += 1
        if match:
            if galil and stop >= i > start:
                # Able to perform Galil: positions start..stop-1 are already known to
                # match, so resume the scan just to their left
                i = max(start, 1)
                galil = False
            if i == 0:
                # A full match of pat
                occurrences.append(j)
                good_suffix_shift = m - matched_prefix[1]
                j += good_suffix_shift
                i = m-1
            else:
                i -= 1  # A match, but not a full match yet
        else:
            mismatched = txt[j + i]
            bad_char_shift = i - R[alpha_number(mismatched)][i]  # Bad character shift
            good_suffix_shift = 1
            if good_suffix[i+1] > 0:
                # Good Suffix case 1a
                good_suffix_shift = m - good_suffix[i+1]
                start = good_suffix[i+1] - m + i + 1
                stop = good_suffix[i+1]
            elif good_suffix[i+1] == 0:
                # Good Suffix case 1b (i.e. matched prefix)
                good_suffix_shift = m - matched_prefix[i+1]
                start = 0
                stop = matched_prefix[i + 1]
            best_shift = max(good_suffix_shift, bad_char_shift)  # Choose the best shift
            j += best_shift
            galil = best_shift == good_suffix_shift
            i = m-1
    return comps, occurrences


def two_to_the(n):
    return 1 << n


def chunky_search(pat, txt):
    # I settled on grouping in 2s, since higher groupings took more runs of BMs than comparisons it saved
    # tl;dr: 2 was the fastest for me
    factor = 2
    occurrences = set()
    comps = 0
    # Do a separate BM for each binary offset of the grouped bits in txt
    # (i.e. offset by 0 and 1 bits to the right)
    for offset in range(two_to_the(factor-1)):
        # An odd-length pat only half-fills its last group, so pad it with each possible bit
        # (and pad txt the same way, so a match ending on the last bit of txt is not dropped)
        paddings = ["0", "1"] if len(pat) % factor else [""]
        for padding in paddings:
            augmented_pat = f"{pat}{padding}"
            augmented_txt = f"{txt}{padding}"
            c, o = boyer_moore(condense(augmented_pat, 0, factor), condense(augmented_txt, offset, factor))
            comps += c
            # Convert grouped match indices to normal ones with this O(n) operation
            occurrences.update(x * factor + offset for x in o)
    return comps, sorted(occurrences)


def read_args():
    # strip() guards against a trailing newline sneaking into the binary strings
    with open(sys.argv[1], "r") as txt_file:
        txt = txt_file.read().strip()
    with open(sys.argv[2], "r") as pat_file:
        pat = pat_file.read().strip()
    return txt, pat


def output_matches(occurrences):
    with open("output_binary_boyermoore.txt", "w") as file:
        for o in occurrences:
            file.write(f"{o}\n")


def main():
    if len(sys.argv) < 3:
        print("Not enough arguments!")
    else:
        txt, pat = read_args()
        comps, occurrences = chunky_search(pat, txt)
        print(comps)
        output_matches(occurrences)


if __name__ == "__main__":
    main()
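# Illustrative sketch of the 2-bit grouping used above, with hypothetical inputs:
#
#   condense("0110")            -> "bc"   (chunks "01", "10" -> 1, 2 -> 'b', 'c')
#   condense("00110", offset=1) -> "bc"   (the same chunks, read starting one bit in)
#
# A match of pat at an even index of txt lines up with whole chunks in the offset-0
# condensed text, while a match at an odd index lines up in the offset-1 condensed text,
# which is why chunky_search() runs a Boyer-Moore pass per offset and maps a condensed
# match index x back to an index in the original binary txt as x * factor + offset.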