diff options
Diffstat (limited to 'ass1/binary_boyermoore.py')
| -rw-r--r-- | ass1/binary_boyermoore.py | 118 |
1 files changed, 104 insertions, 14 deletions
diff --git a/ass1/binary_boyermoore.py b/ass1/binary_boyermoore.py index bbc030f..9e00ed2 100644 --- a/ass1/binary_boyermoore.py +++ b/ass1/binary_boyermoore.py @@ -5,8 +5,13 @@ # R = [[m for __ in range(m)] for _ in range(0, 26)] # good_suffix = [0 for _ in range(0, m+1)] +import sys + def alpha_number(char): - return int(char) + if char == "0" or char == "1": + return int(char) + return ord(char) - 97 + # return int(char) def reverse(string): @@ -18,6 +23,15 @@ def compare(string, i, end): if i+j == end or string[i+j] != string[j]: return j +def condense(binary, offset=0, size=2): + out = "" + for i in range(offset, len(binary)-offset, size): + slice = binary[i:i+size] + if len(slice) == size: + out += chr(97 + int(slice, 2)) + return out + + def gusfield(string): z = [0 for _ in string] @@ -48,7 +62,7 @@ def gusfield(string): def gen_jump_table(pat): m = len(pat) - R = [[-1 for __ in range(m)] for _ in range(0, 2)] + R = [[-1 for __ in range(m)] for _ in range(0, 256)] for j in range(m): for i in range(j+1): R[alpha_number(pat[i])][j] = i @@ -85,25 +99,39 @@ def preprocess(pat): matched_prefix = gen_matched_prefix(pat) return good_suffix, matched_prefix + + def boyer_moore(pat, txt): good_suffix, matched_prefix = preprocess(pat) m = len(pat) n = len(txt) j = 0 occurrence = 0 - start = None - stop = None - galil_index = -1 + galils = 0 comps = 0 - print("="*15) + # print("="*15) # print(6*" " + txt) i = m-1 - + galil = False + start = None + stop = None while j <= n-m: + # print(f"{j=:02} {' ' * j}", end="") + # for x in range(len(pat)): + # if x == i: + # print(pat[x].upper(), end="") + # else: + # print(pat[x], end="") + # print() match = pat[i] == txt[j+i] comps += 1 if match: + if galil and stop >= i > start: + galils += 1 + assert start >= 0 + i = max(start-1, 0) + galil = False if i == 0: good_suffix_shift = m - matched_prefix[1] j += good_suffix_shift @@ -112,24 +140,86 @@ def boyer_moore(pat, txt): else: i -= 1 else: + galil = False + mp = False + gs = False + mismatched = txt[j + i] + bad_char_shift = i - R[alpha_number(mismatched)][i] good_suffix_shift = 1 if good_suffix[i+1] >= 0: good_suffix_shift = m - good_suffix[i+1] - j += good_suffix_shift + gs = True + start = good_suffix[i+1] - m + i + 1 + stop = good_suffix[i+1] + elif good_suffix[i+1] == 0: + good_suffix_shift = m - matched_prefix[i+1] + mp = True + start = 0 + stop = matched_prefix[i + 1] + best_shift = max(good_suffix_shift, bad_char_shift) + j += best_shift + galil = (mp or gs) and best_shift == good_suffix_shift i = m-1 + print(f"It found {occurrence} occurences.") + print(f"Galil'd {galils} times.") # print(f"\n {list(range(m))}") - print("" + str(list(map(int, pat)))) + # print("" + str(list(map(int, pat)))) # for i, a in enumerate(R): # print(chr(i+97), a) - print(list(map(lambda x: m - x, good_suffix))) - print(good_suffix) - print(matched_prefix) + # print(good_suffix) + # print(matched_prefix) print(f"{comps} comparisons.") + return comps, occurrence + +def two_to_the(n): + return 1 << n + +def chunky_search(pat, txt, factor=2): + occurrence = 0 + comps = 0 + for offset in range(two_to_the(factor-1)): + padding = format(offset, f"0{factor-1}b") if len(pat) % factor else "" + augmented_pat = f"{pat}{padding}" + # print(padding) + print(condense(format(two_to_the(factor)-1, f"0b"), 0, factor)) + c, o = boyer_moore(condense(augmented_pat, 0, factor), condense(txt, offset, factor)) + comps += c + occurrence += o + base_comps, base_occur = boyer_moore(pat, txt) + print("*"*20) + print(f"Chunky Optimisation: {occurrence} occurences in {comps} comparisons.") + print(f"Normal: {base_occur} occurences in {base_comps} comparisons.") + # assert base_occur == occurrence + if base_comps > 0: + print(f"{comps * 100 / base_comps:.3f}% of normal Boyer-Moore") + print(f"{comps * 100 / 642096:.3f}% of their Boyer-Moore") + +def read_args(): + with open(sys.argv[1], "r") as txt_file: + txt = txt_file.read() + with open(sys.argv[2], "r") as pat_file: + pat = pat_file.read() + return txt, pat + +# boyer_moore(condense("01100010"), condense(a)) +# boyer_moore(condense("01100011"), condense(a, offset=1)) + +def main(): + F = 2 + if len(sys.argv) < 3: + print("Not enough arguments!") + else: + txt, pat = read_args() + chunky_search(pat, txt, factor=F) + +main() + + -#boyer_moore("2346857", "7263480726134809726305872634582732346857658927635023657293592837562867237968237689035609230896723908029367856087236450823609487623904620934") -boyer_moore("1110001101", "011101110001101010111010010101111100011011111000110111000110101101111011111111111111100011001110111010101110100101011101101101110111010110111010010101110110110111011111011011") # boyer_moore("111000110", "01110111010101110100101011101101111011111111111111100011001110111010101110100101011101101101110111010110111010010101110110110111011111011011") +# print(condense("1110000110")) +# print(condense("1110000110", offset=1)) # boyer_moore("111011011001110", "101010111010010101110111010101110100101011101101111011111111111101110111010101110100101011101101101110111010101110100101011101101101110111110110110011100000110101011101001010111011011011101100001010101110100101011101110101011101001010111011011011101110101011101001010111011011011101110101011101001010111011011011101100001101010111010010101110110110111011000010111011111011101110110101110110101101100000001011001010101010101110111110111011101101011101101011011000000010110010101010101000010111011111011101110110101110110101101100000001011001010101010100001011101111101110111011010111011010110110000000101100101010101010101011101001010111011011011101100001011101111101110111011010101110100101011101101101110110000101110111110101010111010010101110110110111011000010111011101010111010010101110110110111011000010111010101110100101011101101101010101110100101011101101101110110000101110111110111011101101011101101011011000000010110010101010101111011000010111011111011101110110101110110101101100000001011001010101010110111110111011101101011101101011011000000010110010101010101111011101110110101110110101101100000001011001010101010111101110110101110110101101100000001011001010101010110101110110101101100000001011001010101010110110111011000010101011101001010111011101010111010010101110110110111011101010111010010101110110110111011101010111010010101110110110111011000011010101110100101011101101101110110000101110111110111011101101011101101011011000000010110010101010101011101111101110111011010111011010110110000000101100101010101010000101110111110111011101101011101101011011000000010110010101010101000010111011111011101110110101110110101101100000001011001010101010101010111010010101110110110111011000010111011111011101110110101011101001010111011011011101100001011101111101010101110100101011101101101110110000101110111010101110100101011101101101110110000101110101011101001010111011011010101011101001010111011011011101100001011101111101110111011010111011010110110000000101100101010101011110110000101110111110111011101101011101101011011000000010110010101010101101111101110111011010111011010110110000000101100101010101011110111011101101011101101011011000000010110010101010101111011101101011101101011011000000010110010101010101101011101101011011000000010110010101010101101101110110000101110111110111011101101011101101011011000000010110010101010101101110111110111011101101011101101011011000000010110010101010101101110111110111011101101011101101011011000000010110010101010101011101111101110111011010111011010110110000000101100101010101010000101110111110111011101101011101101011011000000010110010101010101000010111011111011101110110101110110101101100000001011001010101010101010111101111111111111010010101110110110111011000010111011111011101110110101011101001010111011011011101100001011101111101010101110100101011101101101110110000101110111010101110100101011101101101110110000101110101011101001010111011011010101011101001010111011011011101100001011101111101110111011010111011010110110000000101100101010101011110110000101110111110111011101101011101101011011000000010110010101010101101111101110111011010111011010110110000000101100101010101011110111011101101011101101011011000000010110010101010101111011101101011101101011011000000010110010101010101101011101101011011000000010110010101010101101101110110000101110111110111011101101011101101011011000000010110010101010101")
\ No newline at end of file |
