path: root/ass1/binary_boyermoore.py
Diffstat (limited to 'ass1/binary_boyermoore.py')
-rw-r--r--  ass1/binary_boyermoore.py  68
1 file changed, 28 insertions, 40 deletions
diff --git a/ass1/binary_boyermoore.py b/ass1/binary_boyermoore.py
index 4c41c77..df75d62 100644
--- a/ass1/binary_boyermoore.py
+++ b/ass1/binary_boyermoore.py
@@ -1,12 +1,6 @@
-# txt = "zzzzzzzzzabczzzzzzzzzz"
-# pat = "abczzzabc"
-# m = len(pat)
-# n = len(txt)
-# R = [[m for __ in range(m)] for _ in range(0, 26)]
-# good_suffix = [0 for _ in range(0, m+1)]
-
import sys
+
def alpha_number(char):
if char == "0" or char == "1":
return int(char)
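Only the binary-digit branch of alpha_number is visible here; given the 4-row jump table introduced below, the elided branch presumably maps the grouping letters a-d to 0-3. A minimal sketch under that assumption (digits and letters never occur in the same string, so the overlap at 0 and 1 is harmless):

    def alpha_number(char):
        if char == "0" or char == "1":
            return int(char)            # raw binary digits
        return ord(char) - ord("a")     # assumed: 'a'..'d' -> 0..3

    assert [alpha_number(c) for c in "01abcd"] == [0, 1, 0, 1, 2, 3]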
@@ -22,6 +16,7 @@ def compare(string, i, end):
if i+j == end or string[i+j] != string[j]:
return j
+
def condense(binary, offset=0, size=2):
out = ""
for i in range(offset, len(binary)-offset, size):
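compare() is the explicit-comparison step of the Z-algorithm below: it returns how many characters of the suffix starting at i match a prefix of the string. Its loop header sits outside this hunk, so the header in this sketch is a reconstruction:

    def compare(string, i, end):
        for j in range(end - i + 1):
            if i + j == end or string[i + j] != string[j]:
                return j

    # The suffix "aab" starting at index 3 matches the prefix "aab" for 3 characters.
    assert compare("aabaab", 3, 6) == 3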
@@ -31,7 +26,6 @@ def condense(binary, offset=0, size=2):
return out
-
def gusfield(string):
z = [0 for _ in string]
z[0] = len(string)
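The part of condense() that turns each bit group into a letter is also outside the hunk; a sketch assuming the natural encoding "00"->a, "01"->b, "10"->c, "11"->d (the chr() line and the handling of a trailing partial group are assumptions):

    def condense(binary, offset=0, size=2):
        out = ""
        for i in range(offset, len(binary) - offset, size):
            group = binary[i:i + size]
            if len(group) == size:      # assumed: drop a trailing partial group
                out += chr(ord("a") + int(group, 2))
        return out

    assert condense("00011011") == "abcd"           # 00|01|10|11
    assert condense("00011011", offset=1) == "adb"  # 0|00|11|01|1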
@@ -59,20 +53,19 @@ def gusfield(string):
l = i
return z
+
def gen_jump_table(pat):
m = len(pat)
- R = [[-1 for __ in range(m)] for _ in range(0, 256)]
+    R = [[-1 for __ in range(m)] for _ in range(0, 4)]  # One row each for a, b, c and d, the letters the bit groupings use
for j in range(m):
for i in range(j+1):
R[alpha_number(pat[i])][j] = i
return R
+
def gen_z_suffix(pat):
return reverse(gusfield(reverse(pat)))+[0]
-# print(list(pat))
-# print(R)
-# print(Z)
def gen_good_suffix(pat, Z):
m = len(pat)
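A worked example of the bad-character table this hunk shrinks to four rows: R[c][j] is the rightmost index i <= j with pat[i] == c, or -1 if that letter does not occur in pat[0..j]. Using a letters-only alpha_number for illustration:

    def alpha_number(char):
        return ord(char) - ord("a")

    def gen_jump_table(pat):
        m = len(pat)
        R = [[-1 for __ in range(m)] for _ in range(0, 4)]
        for j in range(m):
            for i in range(j + 1):
                R[alpha_number(pat[i])][j] = i
        return R

    R = gen_jump_table("abac")
    assert R[0] == [0, 0, 2, 2]      # 'a' occurs at 0 and 2
    assert R[1] == [-1, 1, 1, 1]     # 'b' occurs at 1
    assert R[2] == [-1, -1, -1, 3]   # 'c' occurs at 3
    assert R[3] == [-1, -1, -1, -1]  # 'd' never occurs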
@@ -82,7 +75,6 @@ def gen_good_suffix(pat, Z):
good_suffix[j] = i+1
return good_suffix
-# print("g", good_suffix)
def gen_matched_prefix(pat):
m = len(pat)
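For reference, matched_prefix[i] on the usual Gusfield definition (which the shifts m - matched_prefix[i+1] below are consistent with) is the length of the longest suffix of pat[i:] that is also a prefix of pat. A brute-force illustration of that semantics, independent of the Z-array construction the file uses:

    def matched_prefix_bruteforce(pat):
        m = len(pat)
        mp = [0] * (m + 1)
        for i in range(m + 1):
            for k in range(len(pat) - i, -1, -1):
                if pat[i:].endswith(pat[:k]):
                    mp[i] = k
                    break
        return mp

    # For "abcab", the suffix "ab" of pat[i:] (i = 1..3) is also a prefix of pat.
    assert matched_prefix_bruteforce("abcab") == [5, 2, 2, 2, 0, 0]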
@@ -100,7 +92,6 @@ def preprocess(pat):
return R, good_suffix, matched_prefix
-
def boyer_moore(pat, txt):
R, good_suffix, matched_prefix = preprocess(pat)
m = len(pat)
@@ -108,70 +99,65 @@ def boyer_moore(pat, txt):
i = m-1
j = 0
occurrences = []
- galils = 0
comps = 0
galil = False
start = 0
stop = 0
while j <= n-m:
- match = pat[i] == txt[j+i]
+        match = pat[i] == txt[j+i]  # The character comparison that comps counts
comps += 1
if match:
- if galil and stop >= i > start:
- galils += 1
+        if galil and stop >= i > start:  # Galil's rule: this stretch is already known to match, so jump past it
i = max(start-1, 0)
galil = False
- if i == 0:
+ if i == 0: # A full match of pat
                 good_suffix_shift = m - matched_prefix[1]
-                j += good_suffix_shift
-                occurrences.append(j)
+                occurrences.append(j)  # Record the match before shifting past it
+                j += good_suffix_shift
i = m-1
else:
- i -= 1
+ i -= 1 # A match, but not a full match yet
else:
mismatched = txt[j + i]
- bad_char_shift = i - R[alpha_number(mismatched)][i]
+ bad_char_shift = i - R[alpha_number(mismatched)][i] # Bad character shift
good_suffix_shift = 1
- if good_suffix[i+1] > 0:
+ if good_suffix[i+1] > 0: # Good Suffix case 1a
good_suffix_shift = m - good_suffix[i+1]
start = good_suffix[i+1] - m + i + 1
stop = good_suffix[i+1]
- elif good_suffix[i+1] == 0:
+ elif good_suffix[i+1] == 0: # Good Suffix case 1b (i.e. matched prefix)
good_suffix_shift = m - matched_prefix[i+1]
start = 0
stop = matched_prefix[i + 1]
- best_shift = max(good_suffix_shift, bad_char_shift)
+ best_shift = max(good_suffix_shift, bad_char_shift) # Choose the best shift
j += best_shift
galil = best_shift == good_suffix_shift
i = m-1
- print(comps)
return comps, occurrences
+
def two_to_the(n):
return 1 << n
-def chunky_search(pat, txt, factor=2):
+
+def chunky_search(pat, txt):
+    # I settled on grouping in 2s: larger groupings cost more extra runs of BM than the comparisons they saved
+    # tl;dr: 2 was the fastest for me
+    factor = 2
occurrences = []
comps = 0
+    # Run a separate BM for each bit offset of the groupings in txt (i.e. groups starting 0 and 1 bits to the right)
for offset in range(two_to_the(factor-1)):
- padding = format(offset, f"0{factor-1}b") if len(pat) % factor else ""
+        padding = offset if len(pat) % factor else ""  # Pad an odd-length pat so its last bit still forms a full group
augmented_pat = f"{pat}{padding}"
c, o = boyer_moore(condense(augmented_pat, 0, factor), condense(txt, offset, factor))
comps += c
- print(offset, o)
- occurrences += o
- base_comps, base_occur = boyer_moore(pat, txt)
- print(base_occur[:20])
- print(occurrences[:10])
- print("*"*20)
- print(f"Chunky Optimisation: {len(occurrences)} occurences in {comps} comparisons.")
- print(f"Normal: {len(base_occur)} occurences in {base_comps} comparisons.")
- if base_comps > 0:
- print(f"{comps * 100 / base_comps:.3f}% of normal Boyer-Moore")
- print(f"{comps * 100 / 642096:.3f}% of their Boyer-Moore")
+        # Convert grouped match indices back to bit indices in txt (linear in the number of matches)
+ occurrences += list(map(lambda x: x*factor+offset, o))
return comps, occurrences
+
def read_args():
with open(sys.argv[1], "r") as txt_file:
txt = txt_file.read()
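A small end-to-end check of the x*factor + offset conversion: matches found in the two condensed streams land exactly on the matches in the raw binary text. Brute-force search stands in for boyer_moore to keep the sketch self-contained, and condense() uses the same assumed bit-group encoding as above:

    def condense(binary, offset=0, size=2):
        out = ""
        for i in range(offset, len(binary) - offset, size):
            group = binary[i:i + size]
            if len(group) == size:
                out += chr(ord("a") + int(group, 2))
        return out

    def find_all(pat, txt):  # brute-force stand-in for boyer_moore
        return [j for j in range(len(txt) - len(pat) + 1) if txt[j:j + len(pat)] == pat]

    pat, txt, factor = "0110", "001101100110", 2
    occurrences = []
    for offset in range(2):
        padding = str(offset) if len(pat) % factor else ""
        o = find_all(condense(pat + padding, 0, factor), condense(txt, offset, factor))
        occurrences += [x * factor + offset for x in o]

    assert sorted(occurrences) == find_all(pat, txt) == [1, 4, 8]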
@@ -179,19 +165,21 @@ def read_args():
pat = pat_file.read()
return txt, pat
+
def output_matches(occurrences):
with open("output_binary_boyermoore.txt", "w") as file:
for o in occurrences:
file.write(f"{o}\n")
+
def main():
- factor = 2
if len(sys.argv) < 3:
print("Not enough arguments!")
else:
txt, pat = read_args()
- comps, occurrences = chunky_search(pat, txt, factor)
+ comps, occurrences = chunky_search(pat, txt)
print(comps)
output_matches(occurrences)
+
main()
\ No newline at end of file
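A hypothetical sanity harness mirroring the debug comparison this commit strips out, assuming boyer_moore and chunky_search from the file above are in scope (importing the module as-is would run the trailing main() call, so it would first need an if __name__ == "__main__" guard):

    import random

    random.seed(0)
    txt = "".join(random.choice("01") for _ in range(100_000))
    pat = "".join(random.choice("01") for _ in range(20))

    chunky_comps, chunky_occ = chunky_search(pat, txt)
    base_comps, base_occ = boyer_moore(pat, txt)

    assert sorted(chunky_occ) == base_occ  # both report match start indices
    print(f"Chunky: {len(chunky_occ)} occurrences in {chunky_comps} comparisons")
    print(f"Normal: {len(base_occ)} occurrences in {base_comps} comparisons")
    if base_comps:
        print(f"{chunky_comps * 100 / base_comps:.3f}% of normal Boyer-Moore")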