aboutsummaryrefslogtreecommitdiff
path: root/ass1/binary_boyermoore.py
blob: aa5d0c5bb107ce370196f2d50f24ae9a39570133 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
# txt = "zzzzzzzzzabczzzzzzzzzz"
# pat = "abczzzabc"
# m = len(pat)
# n = len(txt)
# R = [[m for __ in range(m)] for _ in range(0, 26)]
# good_suffix = [0 for _ in range(0, m+1)]

def alpha_number(char):
    return int(char)


def reverse(string):
    return string[::-1]


def compare(string, i, end):
    for j in range(end):
        if i+j == end or string[i+j] != string[j]:
            return j


def gusfield(string):
    z = [0 for _ in string]
    z[0] = len(string)
    r = 0
    l = 0
    for i in range(1, len(string)):
        if i == 1:  # base case
            z[1] = compare(string, i, len(string))
            if z[1] > 0:
                r = z[1] + 1
                l = 1
        elif i > r: # Case 1
            z[i] = compare(string, i, len(string))
            if z[i] > 0:
                q = i + z[i]
                r = q - 1
                l = i
        elif i <= r:  # Case 2
            if z[i-l] < r-i:  # Case 2a
                z[i] = z[i-l]
            else:  # Case 2b
                q = compare(string, i, len(string))
                z[i] = q
                r = q
                l = i
    return z

def gen_jump_table(pat):
    m = len(pat)
    R = [[-1 for __ in range(m)] for _ in range(0, 2)]
    for j in range(m):
        for i in range(j+1):
            R[alpha_number(pat[i])][j] = i
    return R

def gen_z_suffix(pat):
    return reverse(gusfield(reverse(pat)))+[0]

# print(list(pat))
# print(R)
# print(Z)

def gen_good_suffix(pat, Z):
    m = len(pat)
    good_suffix = [0 for _ in range(0, m + 1)]
    for i in range(m):
        j = m - Z[i]
        good_suffix[j] = i+1
    return good_suffix

# print("g", good_suffix)

def gen_matched_prefix(pat):
    m = len(pat)
    matched_prefix = gusfield(pat)+[0]
    for i in range(m-1, -1, -1):
        matched_prefix[i] = max(matched_prefix[i], matched_prefix[i+1])
    return matched_prefix


def preprocess(pat):
    R = gen_jump_table(pat)
    Z = gen_z_suffix(pat)
    good_suffix = gen_good_suffix(pat, Z)
    matched_prefix = gen_matched_prefix(pat)
    return R, good_suffix, matched_prefix

def boyer_moore(pat, txt):
    R, good_suffix, matched_prefix = preprocess(pat)
    m = len(pat)
    n = len(txt)
    j = 0
    occurrence = 0
    start = None
    stop = None
    galil_index = -1
    comps = 0

    print("="*15)
    print(6*" " + txt)

    while j <= n-m:
        matches = 0
        for i in range(m-1, -1, -1):
            # if galil_index == j and start <= i < stop:
            #     print("pee")
            #     i = max(start + 1, 1)
            #     continue

            print(f"{j=:02}  {' ' * j}", end="")
            for x in range(len(pat)):
                if x == i:
                    print(pat[x].upper(), end="")
                else:
                    print(pat[x], end="")
            print()
            match = pat[i] == txt[j+i]
            comps += 1
            if match:
                matches += 1
                if matches == m:
                    good_suffix_shift = m - matched_prefix[1]
                    j += good_suffix_shift
                    i = m-1
                    occurrence += 1
                    break
            else:
                mismatched = txt[j+i]
                bad_char_shift = i-R[alpha_number(mismatched)][i]

                good_suffix_shift = 1
                if good_suffix[i+1] > 0:
                    good_suffix_shift = m - good_suffix[i+1]
                    start = good_suffix[i+1] - m + i
                    stop = good_suffix[i+1]
                    next_galil = j+1
                elif good_suffix[i+1] == 0:
                    good_suffix_shift = m - matched_prefix[i+1]
                    start = 1
                    stop = matched_prefix[i+1]
                    next_galil = j+1
                elif matches == m:
                    print(f"Match at {j}")
                    good_suffix_shift = m - matched_prefix[1]
                # print(start, stop)
                j += max(good_suffix_shift, bad_char_shift)
                # print(good_suffix_shift, bad_char_shift)
                if good_suffix_shift >= bad_char_shift:
                    print("good suff", good_suffix_shift)
                else:
                    print("bad char", bad_char_shift)
                    print(i)
                if good_suffix[i+1] >= 0 and good_suffix_shift >= bad_char_shift:
                    galil_index = j
                break

    print(f"It found {occurrence} occurences.")
    print(f"\n  {list(range(m))}")
    for i, a in enumerate(R):
        print(chr(i+97), a)
    print(f"{comps} comparisons.")

boyer_moore("111101111111111", "101010111010010101110111010101110100101011101101111011111111111101110111010101110100101011101101101110111010101110100101011101101101110110000110101011101001010111011011011101100001010101110100101011101110101011101001010111011011011101110101011101001010111011011011101110101011101001010111011011011101100001101010111010010101110110110111011000010111011111011101110110101110110101101100000001011001010101010101110111110111011101101011101101011011000000010110010101010101000010111011111011101110110101110110101101100000001011001010101010100001011101111101110111011010111011010110110000000101100101010101010101011101001010111011011011101100001011101111101110111011010101110100101011101101101110110000101110111110101010111010010101110110110111011000010111011101010111010010101110110110111011000010111010101110100101011101101101010101110100101011101101101110110000101110111110111011101101011101101011011000000010110010101010101111011000010111011111011101110110101110110101101100000001011001010101010110111110111011101101011101101011011000000010110010101010101111011101110110101110110101101100000001011001010101010111101110110101110110101101100000001011001010101010110101110110101101100000001011001010101010110110111011000010101011101001010111011101010111010010101110110110111011101010111010010101110110110111011101010111010010101110110110111011000011010101110100101011101101101110110000101110111110111011101101011101101011011000000010110010101010101011101111101110111011010111011010110110000000101100101010101010000101110111110111011101101011101101011011000000010110010101010101000010111011111011101110110101110110101101100000001011001010101010101010111010010101110110110111011000010111011111011101110110101011101001010111011011011101100001011101111101010101110100101011101101101110110000101110111010101110100101011101101101110110000101110101011101001010111011011010101011101001010111011011011101100001011101111101110111011010111011010110110000000101100101010101011110110000101110111110111011101101011101101011011000000010110010101010101101111101110111011010111011010110110000000101100101010101011110111011101101011101101011011000000010110010101010101111011101101011101101011011000000010110010101010101101011101101011011000000010110010101010101101101110110000101110111110111011101101011101101011011000000010110010101010101101110111110111011101101011101101011011000000010110010101010101101110111110111011101101011101101011011000000010110010101010101011101111101110111011010111011010110110000000101100101010101010000101110111110111011101101011101101011011000000010110010101010101000010111011111011101110110101110110101101100000001011001010101010101010111101111111111111010010101110110110111011000010111011111011101110110101011101001010111011011011101100001011101111101010101110100101011101101101110110000101110111010101110100101011101101101110110000101110101011101001010111011011010101011101001010111011011011101100001011101111101110111011010111011010110110000000101100101010101011110110000101110111110111011101101011101101011011000000010110010101010101101111101110111011010111011010110110000000101100101010101011110111011101101011101101011011000000010110010101010101111011101101011101101011011000000010110010101010101101011101101011011000000010110010101010101101101110110000101110111110111011101101011101101011011000000010110010101010101")