diff options
| author | akiyamn | 2021-03-26 21:22:12 +1100 | 
|---|---|---|
| committer | akiyamn | 2021-03-26 21:22:12 +1100 | 
| commit | 39d3e7ef0bd79e8213c1cce58dd9e2f14485b3da (patch) | |
| tree | 498db0de1fe04a90d9699c7d988a4542c6253e39 /ass1/binary_boyermoore.py | |
| parent | 94b72fd6b72cf52cdbabdb2a512363f4207e23d5 (diff) | |
| download | fit3155-39d3e7ef0bd79e8213c1cce58dd9e2f14485b3da.tar.gz fit3155-39d3e7ef0bd79e8213c1cce58dd9e2f14485b3da.zip | |
BM: It's doneと思う
Diffstat (limited to 'ass1/binary_boyermoore.py')
| -rw-r--r-- | ass1/binary_boyermoore.py | 68 | 
1 files changed, 28 insertions, 40 deletions
| diff --git a/ass1/binary_boyermoore.py b/ass1/binary_boyermoore.py index 4c41c77..df75d62 100644 --- a/ass1/binary_boyermoore.py +++ b/ass1/binary_boyermoore.py @@ -1,12 +1,6 @@ -# txt = "zzzzzzzzzabczzzzzzzzzz" -# pat = "abczzzabc" -# m = len(pat) -# n = len(txt) -# R = [[m for __ in range(m)] for _ in range(0, 26)] -# good_suffix = [0 for _ in range(0, m+1)] -  import sys +  def alpha_number(char):      if char == "0" or char == "1":          return int(char) @@ -22,6 +16,7 @@ def compare(string, i, end):          if i+j == end or string[i+j] != string[j]:              return j +  def condense(binary, offset=0, size=2):      out = ""      for i in range(offset, len(binary)-offset, size): @@ -31,7 +26,6 @@ def condense(binary, offset=0, size=2):      return out -  def gusfield(string):      z = [0 for _ in string]      z[0] = len(string) @@ -59,20 +53,19 @@ def gusfield(string):                  l = i      return z +  def gen_jump_table(pat):      m = len(pat) -    R = [[-1 for __ in range(m)] for _ in range(0, 256)] +    R = [[-1 for __ in range(m)] for _ in range(0, 4)]  # A space for a, b, c and d used in groupings      for j in range(m):          for i in range(j+1):              R[alpha_number(pat[i])][j] = i      return R +  def gen_z_suffix(pat):      return reverse(gusfield(reverse(pat)))+[0] -# print(list(pat)) -# print(R) -# print(Z)  def gen_good_suffix(pat, Z):      m = len(pat) @@ -82,7 +75,6 @@ def gen_good_suffix(pat, Z):          good_suffix[j] = i+1      return good_suffix -# print("g", good_suffix)  def gen_matched_prefix(pat):      m = len(pat) @@ -100,7 +92,6 @@ def preprocess(pat):      return R, good_suffix, matched_prefix -  def boyer_moore(pat, txt):      R, good_suffix, matched_prefix = preprocess(pat)      m = len(pat) @@ -108,70 +99,65 @@ def boyer_moore(pat, txt):      i = m-1      j = 0      occurrences = [] -    galils = 0      comps = 0      galil = False      start = 0      stop = 0      while j <= n-m: -        match = pat[i] == txt[j+i] +        match = pat[i] == txt[j+i]  # The comparison          comps += 1          if match: -            if galil and stop >= i > start: -                galils += 1 +            if galil and stop >= i > start:  # Able to perform Galil                  i = max(start-1, 0)                  galil = False -            if i == 0: +            if i == 0: # A full match of pat                  good_suffix_shift = m - matched_prefix[1]                  j += good_suffix_shift                  occurrences.append(j)                  i = m-1              else: -                i -= 1 +                i -= 1  # A match, but not a full match yet          else:              mismatched = txt[j + i] -            bad_char_shift = i - R[alpha_number(mismatched)][i] +            bad_char_shift = i - R[alpha_number(mismatched)][i]  # Bad character shift              good_suffix_shift = 1 -            if good_suffix[i+1] > 0: +            if good_suffix[i+1] > 0:  # Good Suffix case 1a                  good_suffix_shift = m - good_suffix[i+1]                  start = good_suffix[i+1] - m + i + 1                  stop = good_suffix[i+1] -            elif good_suffix[i+1] == 0: +            elif good_suffix[i+1] == 0: # Good Suffix case 1b (i.e. matched prefix)                  good_suffix_shift = m - matched_prefix[i+1]                  start = 0                  stop = matched_prefix[i + 1] -            best_shift = max(good_suffix_shift, bad_char_shift) +            best_shift = max(good_suffix_shift, bad_char_shift)  # Choose the best shift              j += best_shift              galil = best_shift == good_suffix_shift              i = m-1 -    print(comps)      return comps, occurrences +  def two_to_the(n):      return 1 << n -def chunky_search(pat, txt, factor=2): + +def chunky_search(pat, txt): +    # I settled on grouping in 2s, since higher groupings took more runs of BMs than comparisons it saved +    # tl;dr: 2 was the fastest for me +    factor = 2      occurrences = []      comps = 0 +    # Do two separate BMs for each binary offset of the grouped bits in txt (i.e. offset by 0 and 1 bits to the right)      for offset in range(two_to_the(factor-1)): -        padding = format(offset, f"0{factor-1}b") if len(pat) % factor else "" +        padding = offset if len(pat) % factor else ""  # Padding to ensure each BM matches the correct groups          augmented_pat = f"{pat}{padding}"          c, o = boyer_moore(condense(augmented_pat, 0, factor), condense(txt, offset, factor))          comps += c -        print(offset, o) -        occurrences += o -    base_comps, base_occur = boyer_moore(pat, txt) -    print(base_occur[:20]) -    print(occurrences[:10]) -    print("*"*20) -    print(f"Chunky Optimisation: {len(occurrences)} occurences in {comps} comparisons.") -    print(f"Normal: {len(base_occur)} occurences in {base_comps} comparisons.") -    if base_comps > 0: -        print(f"{comps * 100 / base_comps:.3f}% of normal Boyer-Moore") -        print(f"{comps * 100 / 642096:.3f}% of their Boyer-Moore") +        # Convert grouped match indices to normal ones with this O(n) operation +        occurrences += list(map(lambda x: x*factor+offset, o))      return comps, occurrences +  def read_args():      with open(sys.argv[1], "r") as txt_file:          txt = txt_file.read() @@ -179,19 +165,21 @@ def read_args():          pat = pat_file.read()      return txt, pat +  def output_matches(occurrences):      with open("output_binary_boyermoore.txt", "w") as file:          for o in occurrences:              file.write(f"{o}\n") +  def main(): -    factor = 2      if len(sys.argv) < 3:          print("Not enough arguments!")      else:          txt, pat = read_args() -        comps, occurrences = chunky_search(pat, txt, factor) +        comps, occurrences = chunky_search(pat, txt)          print(comps)          output_matches(occurrences) +  main()
\ No newline at end of file | 
