path: root/ass1/binary_boyermoore.py
Diffstat (limited to 'ass1/binary_boyermoore.py')
-rw-r--r--  ass1/binary_boyermoore.py  68
1 file changed, 28 insertions, 40 deletions
diff --git a/ass1/binary_boyermoore.py b/ass1/binary_boyermoore.py
index 4c41c77..df75d62 100644
--- a/ass1/binary_boyermoore.py
+++ b/ass1/binary_boyermoore.py
@@ -1,12 +1,6 @@
-# txt = "zzzzzzzzzabczzzzzzzzzz"
-# pat = "abczzzabc"
-# m = len(pat)
-# n = len(txt)
-# R = [[m for __ in range(m)] for _ in range(0, 26)]
-# good_suffix = [0 for _ in range(0, m+1)]
-
import sys
+
def alpha_number(char):
if char == "0" or char == "1":
return int(char)
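Only the binary-digit branch of alpha_number is visible here; given the 4-row jump table introduced below, the elided branch presumably maps the grouping letters a-d to 0-3. A minimal sketch under that assumption (digits and letters never occur in the same string, so the overlap at 0 and 1 is harmless):

    def alpha_number(char):
        if char == "0" or char == "1":
            return int(char)            # raw binary digits
        return ord(char) - ord("a")     # assumed: 'a'..'d' -> 0..3

    assert [alpha_number(c) for c in "01abcd"] == [0, 1, 0, 1, 2, 3]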
@@ -22,6 +16,7 @@ def compare(string, i, end):
if i+j == end or string[i+j] != string[j]:
return j
+
def condense(binary, offset=0, size=2):
out = ""
for i in range(offset, len(binary)-offset, size):
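compare() is the explicit-comparison step of the Z-algorithm below: it returns how many characters of the suffix starting at i match a prefix of the string. Its loop header sits outside this hunk, so the header in this sketch is a reconstruction:

    def compare(string, i, end):
        for j in range(end - i + 1):
            if i + j == end or string[i + j] != string[j]:
                return j

    # The suffix "aab" starting at index 3 matches the prefix "aab" for 3 characters.
    assert compare("aabaab", 3, 6) == 3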
@@ -31,7 +26,6 @@ def condense(binary, offset=0, size=2):
return out
-
def gusfield(string):
z = [0 for _ in string]
z[0] = len(string)
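The part of condense() that turns each bit group into a letter is also outside the hunk; a sketch assuming the natural encoding "00"->a, "01"->b, "10"->c, "11"->d (the chr() line and the handling of a trailing partial group are assumptions):

    def condense(binary, offset=0, size=2):
        out = ""
        for i in range(offset, len(binary) - offset, size):
            group = binary[i:i + size]
            if len(group) == size:      # assumed: drop a trailing partial group
                out += chr(ord("a") + int(group, 2))
        return out

    assert condense("00011011") == "abcd"           # 00|01|10|11
    assert condense("00011011", offset=1) == "adb"  # 0|00|11|01|1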
@@ -59,20 +53,19 @@ def gusfield(string):
l = i
return z
+
def gen_jump_table(pat):
m = len(pat)
- R = [[-1 for __ in range(m)] for _ in range(0, 256)]
+    R = [[-1 for __ in range(m)] for _ in range(0, 4)]  # One row each for a, b, c and d, the letters the bit groupings use
for j in range(m):
for i in range(j+1):
R[alpha_number(pat[i])][j] = i
return R
+
def gen_z_suffix(pat):
return reverse(gusfield(reverse(pat)))+[0]
-# print(list(pat))
-# print(R)
-# print(Z)
def gen_good_suffix(pat, Z):
m = len(pat)
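A worked example of the bad-character table this hunk shrinks to four rows: R[c][j] is the rightmost index i <= j with pat[i] == c, or -1 if that letter does not occur in pat[0..j]. Using a letters-only alpha_number for illustration:

    def alpha_number(char):
        return ord(char) - ord("a")

    def gen_jump_table(pat):
        m = len(pat)
        R = [[-1 for __ in range(m)] for _ in range(0, 4)]
        for j in range(m):
            for i in range(j + 1):
                R[alpha_number(pat[i])][j] = i
        return R

    R = gen_jump_table("abac")
    assert R[0] == [0, 0, 2, 2]      # 'a' occurs at 0 and 2
    assert R[1] == [-1, 1, 1, 1]     # 'b' occurs at 1
    assert R[2] == [-1, -1, -1, 3]   # 'c' occurs at 3
    assert R[3] == [-1, -1, -1, -1]  # 'd' never occurs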
@@ -82,7 +75,6 @@ def gen_good_suffix(pat, Z):
good_suffix[j] = i+1
return good_suffix
-# print("g", good_suffix)
def gen_matched_prefix(pat):
m = len(pat)
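For reference, matched_prefix[i] on the usual Gusfield definition (which the shifts m - matched_prefix[i+1] below are consistent with) is the length of the longest suffix of pat[i:] that is also a prefix of pat. A brute-force illustration of that semantics, independent of the Z-array construction the file uses:

    def matched_prefix_bruteforce(pat):
        m = len(pat)
        mp = [0] * (m + 1)
        for i in range(m + 1):
            for k in range(len(pat) - i, -1, -1):
                if pat[i:].endswith(pat[:k]):
                    mp[i] = k
                    break
        return mp

    # For "abcab", the suffix "ab" of pat[i:] (i = 1..3) is also a prefix of pat.
    assert matched_prefix_bruteforce("abcab") == [5, 2, 2, 2, 0, 0]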
@@ -100,7 +92,6 @@ def preprocess(pat):
return R, good_suffix, matched_prefix
-
def boyer_moore(pat, txt):
R, good_suffix, matched_prefix = preprocess(pat)
m = len(pat)
@@ -108,70 +99,65 @@ def boyer_moore(pat, txt):
i = m-1
j = 0
occurrences = []
- galils = 0
comps = 0
galil = False
start = 0
stop = 0
while j <= n-m:
- match = pat[i] == txt[j+i]
+        match = pat[i] == txt[j+i]  # The character comparison that comps counts
comps += 1
if match:
- if galil and stop >= i > start:
- galils += 1
+        if galil and stop >= i > start:  # Galil's rule: this stretch is already known to match, so jump past it
i = max(start-1, 0)
galil = False
- if i == 0:
+ if i == 0: # A full match of pat
                 good_suffix_shift = m - matched_prefix[1]
-                j += good_suffix_shift
-                occurrences.append(j)
+                occurrences.append(j)  # Record the match before shifting past it
+                j += good_suffix_shift
i = m-1
else:
- i -= 1
+ i -= 1 # A match, but not a full match yet
else:
mismatched = txt[j + i]
- bad_char_shift = i - R[alpha_number(mismatched)][i]
+ bad_char_shift = i - R[alpha_number(mismatched)][i] # Bad character shift
good_suffix_shift = 1
- if good_suffix[i+1] > 0:
+ if good_suffix[i+1] > 0: # Good Suffix case 1a
good_suffix_shift = m - good_suffix[i+1]
start = good_suffix[i+1] - m + i + 1
stop = good_suffix[i+1]
- elif good_suffix[i+1] == 0:
+ elif good_suffix[i+1] == 0: # Good Suffix case 1b (i.e. matched prefix)
good_suffix_shift = m - matched_prefix[i+1]
start = 0
stop = matched_prefix[i + 1]
- best_shift = max(good_suffix_shift, bad_char_shift)
+ best_shift = max(good_suffix_shift, bad_char_shift) # Choose the best shift
j += best_shift
galil = best_shift == good_suffix_shift
i = m-1
- print(comps)
return comps, occurrences
+
def two_to_the(n):
return 1 << n
-def chunky_search(pat, txt, factor=2):
+
+def chunky_search(pat, txt):
+    # I settled on grouping in 2s: larger groupings cost more extra runs of BM than the comparisons they saved
+    # tl;dr: 2 was the fastest for me
+    factor = 2
occurrences = []
comps = 0
+    # Run a separate BM for each bit offset of the groupings in txt (i.e. groups starting 0 and 1 bits to the right)
for offset in range(two_to_the(factor-1)):
- padding = format(offset, f"0{factor-1}b") if len(pat) % factor else ""
+        padding = offset if len(pat) % factor else ""  # Pad an odd-length pat so its last bit still forms a full group
augmented_pat = f"{pat}{padding}"
c, o = boyer_moore(condense(augmented_pat, 0, factor), condense(txt, offset, factor))
comps += c
- print(offset, o)
- occurrences += o
- base_comps, base_occur = boyer_moore(pat, txt)
- print(base_occur[:20])
- print(occurrences[:10])
- print("*"*20)
- print(f"Chunky Optimisation: {len(occurrences)} occurences in {comps} comparisons.")
- print(f"Normal: {len(base_occur)} occurences in {base_comps} comparisons.")
- if base_comps > 0:
- print(f"{comps * 100 / base_comps:.3f}% of normal Boyer-Moore")
- print(f"{comps * 100 / 642096:.3f}% of their Boyer-Moore")
+        # Convert grouped match indices back to bit indices in txt (linear in the number of matches)
+ occurrences += list(map(lambda x: x*factor+offset, o))
return comps, occurrences
+
def read_args():
with open(sys.argv[1], "r") as txt_file:
txt = txt_file.read()
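A small end-to-end check of the x*factor + offset conversion: matches found in the two condensed streams land exactly on the matches in the raw binary text. Brute-force search stands in for boyer_moore to keep the sketch self-contained, and condense() uses the same assumed bit-group encoding as above:

    def condense(binary, offset=0, size=2):
        out = ""
        for i in range(offset, len(binary) - offset, size):
            group = binary[i:i + size]
            if len(group) == size:
                out += chr(ord("a") + int(group, 2))
        return out

    def find_all(pat, txt):  # brute-force stand-in for boyer_moore
        return [j for j in range(len(txt) - len(pat) + 1) if txt[j:j + len(pat)] == pat]

    pat, txt, factor = "0110", "001101100110", 2
    occurrences = []
    for offset in range(2):
        padding = str(offset) if len(pat) % factor else ""
        o = find_all(condense(pat + padding, 0, factor), condense(txt, offset, factor))
        occurrences += [x * factor + offset for x in o]

    assert sorted(occurrences) == find_all(pat, txt) == [1, 4, 8]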
@@ -179,19 +165,21 @@ def read_args():
pat = pat_file.read()
return txt, pat
+
def output_matches(occurrences):
with open("output_binary_boyermoore.txt", "w") as file:
for o in occurrences:
file.write(f"{o}\n")
+
def main():
- factor = 2
if len(sys.argv) < 3:
print("Not enough arguments!")
else:
txt, pat = read_args()
- comps, occurrences = chunky_search(pat, txt, factor)
+ comps, occurrences = chunky_search(pat, txt)
print(comps)
output_matches(occurrences)
+
main()
\ No newline at end of file
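A hypothetical sanity harness mirroring the debug comparison this commit strips out, assuming boyer_moore and chunky_search from the file above are in scope (importing the module as-is would run the trailing main() call, so it would first need an if __name__ == "__main__" guard):

    import random

    random.seed(0)
    txt = "".join(random.choice("01") for _ in range(100_000))
    pat = "".join(random.choice("01") for _ in range(20))

    chunky_comps, chunky_occ = chunky_search(pat, txt)
    base_comps, base_occ = boyer_moore(pat, txt)

    assert sorted(chunky_occ) == base_occ  # both report match start indices
    print(f"Chunky: {len(chunky_occ)} occurrences in {chunky_comps} comparisons")
    print(f"Normal: {len(base_occ)} occurrences in {base_comps} comparisons")
    if base_comps:
        print(f"{chunky_comps * 100 / base_comps:.3f}% of normal Boyer-Moore")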