# txt = "zzzzzzzzzabczzzzzzzzzz" # pat = "abczzzabc" # m = len(pat) # n = len(txt) # R = [[m for __ in range(m)] for _ in range(0, 26)] # good_suffix = [0 for _ in range(0, m+1)] import sys def alpha_number(char): if char == "0" or char == "1": return int(char) return ord(char) - 97 # return int(char) def reverse(string): return string[::-1] def compare(string, i, end): for j in range(end): if i+j == end or string[i+j] != string[j]: return j def condense(binary, offset=0, size=2): out = "" for i in range(offset, len(binary)-offset, size): slice = binary[i:i+size] if len(slice) == size: out += chr(97 + int(slice, 2)) return out def gusfield(string): z = [0 for _ in string] z[0] = len(string) r = 0 l = 0 for i in range(1, len(string)): if i == 1: # base case z[1] = compare(string, i, len(string)) if z[1] > 0: r = z[1] + 1 l = 1 elif i > r: # Case 1 z[i] = compare(string, i, len(string)) if z[i] > 0: q = i + z[i] r = q - 1 l = i elif i <= r: # Case 2 if z[i-l] < r-i: # Case 2a z[i] = z[i-l] else: # Case 2b q = compare(string, i, len(string)) z[i] = q r = q l = i return z def gen_jump_table(pat): m = len(pat) R = [[-1 for __ in range(m)] for _ in range(0, 256)] for j in range(m): for i in range(j+1): R[alpha_number(pat[i])][j] = i return R def gen_z_suffix(pat): return reverse(gusfield(reverse(pat)))+[0] # print(list(pat)) # print(R) # print(Z) def gen_good_suffix(pat, Z): m = len(pat) good_suffix = [0 for _ in range(0, m + 1)] for i in range(m): j = m - Z[i] good_suffix[j] = i+1 return good_suffix # print("g", good_suffix) def gen_matched_prefix(pat): m = len(pat) matched_prefix = gusfield(pat)+[0] for i in range(m-1, -1, -1): matched_prefix[i] = max(matched_prefix[i], matched_prefix[i+1]) return matched_prefix def preprocess(pat): R = gen_jump_table(pat) Z = gen_z_suffix(pat) good_suffix = gen_good_suffix(pat, Z) matched_prefix = gen_matched_prefix(pat) return R, good_suffix, matched_prefix def boyer_moore(pat, txt): R, good_suffix, matched_prefix = preprocess(pat) m = len(pat) n = len(txt) j = 0 occurrence = 0 galils = 0 comps = 0 # print("="*15) # print(6*" " + txt) i = m-1 galil = False start = None stop = None while j <= n-m: # print(f"{j=:02} {' ' * j}", end="") # for x in range(len(pat)): # if x == i: # print(pat[x].upper(), end="") # else: # print(pat[x], end="") # print() match = pat[i] == txt[j+i] comps += 1 if match: if galil and stop >= i > start: galils += 1 assert start >= 0 i = max(start-1, 0) galil = False if i == 0: good_suffix_shift = m - matched_prefix[1] j += good_suffix_shift occurrence += 1 i = m-1 else: i -= 1 else: galil = False mp = False gs = False mismatched = txt[j + i] bad_char_shift = i - R[alpha_number(mismatched)][i] good_suffix_shift = 1 if good_suffix[i+1] > 0: good_suffix_shift = m - good_suffix[i+1] gs = True start = good_suffix[i+1] - m + i + 1 stop = good_suffix[i+1] elif good_suffix[i+1] == 0: good_suffix_shift = m - matched_prefix[i+1] mp = True start = 0 stop = matched_prefix[i + 1] best_shift = max(good_suffix_shift, bad_char_shift) j += best_shift galil = (mp or gs) and best_shift == good_suffix_shift i = m-1 print(f"It found {occurrence} occurences.") print(f"Galil'd {galils} times.") # print(f"\n {list(range(m))}") # print("" + str(list(map(int, pat)))) # for i, a in enumerate(R): # print(chr(i+97), a) # print(good_suffix) # print(matched_prefix) print(f"{comps} comparisons.") return comps, occurrence def two_to_the(n): return 1 << n def chunky_search(pat, txt, factor=2): occurrence = 0 comps = 0 for offset in range(two_to_the(factor-1)): padding = format(offset, f"0{factor-1}b") if len(pat) % factor else "" augmented_pat = f"{pat}{padding}" # print(padding) print(condense(format(two_to_the(factor)-1, f"0b"), 0, factor)) c, o = boyer_moore(condense(augmented_pat, 0, factor), condense(txt, offset, factor)) comps += c occurrence += o base_comps, base_occur = boyer_moore(pat, txt) print("*"*20) print(f"Chunky Optimisation: {occurrence} occurences in {comps} comparisons.") print(f"Normal: {base_occur} occurences in {base_comps} comparisons.") # assert base_occur == occurrence if base_comps > 0: print(f"{comps * 100 / base_comps:.3f}% of normal Boyer-Moore") print(f"{comps * 100 / 642096:.3f}% of their Boyer-Moore") def read_args(): with open(sys.argv[1], "r") as txt_file: txt = txt_file.read() with open(sys.argv[2], "r") as pat_file: pat = pat_file.read() return txt, pat # boyer_moore(condense("01100010"), condense(a)) # boyer_moore(condense("01100011"), condense(a, offset=1)) def main(): F = 2 if len(sys.argv) < 3: print("Not enough arguments!") else: txt, pat = read_args() chunky_search(pat, txt, factor=F) main() # boyer_moore("111000110", "01110111010101110100101011101101111011111111111111100011001110111010101110100101011101101101110111010110111010010101110110110111011111011011") # print(condense("1110000110")) # print(condense("1110000110", offset=1)) # boyer_moore("111011011001110", "101010111010010101110111010101110100101011101101111011111111111101110111010101110100101011101101101110111010101110100101011101101101110111110110110011100000110101011101001010111011011011101100001010101110100101011101110101011101001010111011011011101110101011101001010111011011011101110101011101001010111011011011101100001101010111010010101110110110111011000010111011111011101110110101110110101101100000001011001010101010101110111110111011101101011101101011011000000010110010101010101000010111011111011101110110101110110101101100000001011001010101010100001011101111101110111011010111011010110110000000101100101010101010101011101001010111011011011101100001011101111101110111011010101110100101011101101101110110000101110111110101010111010010101110110110111011000010111011101010111010010101110110110111011000010111010101110100101011101101101010101110100101011101101101110110000101110111110111011101101011101101011011000000010110010101010101111011000010111011111011101110110101110110101101100000001011001010101010110111110111011101101011101101011011000000010110010101010101111011101110110101110110101101100000001011001010101010111101110110101110110101101100000001011001010101010110101110110101101100000001011001010101010110110111011000010101011101001010111011101010111010010101110110110111011101010111010010101110110110111011101010111010010101110110110111011000011010101110100101011101101101110110000101110111110111011101101011101101011011000000010110010101010101011101111101110111011010111011010110110000000101100101010101010000101110111110111011101101011101101011011000000010110010101010101000010111011111011101110110101110110101101100000001011001010101010101010111010010101110110110111011000010111011111011101110110101011101001010111011011011101100001011101111101010101110100101011101101101110110000101110111010101110100101011101101101110110000101110101011101001010111011011010101011101001010111011011011101100001011101111101110111011010111011010110110000000101100101010101011110110000101110111110111011101101011101101011011000000010110010101010101101111101110111011010111011010110110000000101100101010101011110111011101101011101101011011000000010110010101010101111011101101011101101011011000000010110010101010101101011101101011011000000010110010101010101101101110110000101110111110111011101101011101101011011000000010110010101010101101110111110111011101101011101101011011000000010110010101010101101110111110111011101101011101101011011000000010110010101010101011101111101110111011010111011010110110000000101100101010101010000101110111110111011101101011101101011011000000010110010101010101000010111011111011101110110101110110101101100000001011001010101010101010111101111111111111010010101110110110111011000010111011111011101110110101011101001010111011011011101100001011101111101010101110100101011101101101110110000101110111010101110100101011101101101110110000101110101011101001010111011011010101011101001010111011011011101100001011101111101110111011010111011010110110000000101100101010101011110110000101110111110111011101101011101101011011000000010110010101010101101111101110111011010111011010110110000000101100101010101011110111011101101011101101011011000000010110010101010101111011101101011101101011011000000010110010101010101101011101101011011000000010110010101010101101101110110000101110111110111011101101011101101011011000000010110010101010101")