aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--ass1/binary_boyermoore.py68
-rw-r--r--ass1/output_binary_boyermoore.txt1882
2 files changed, 969 insertions, 981 deletions
diff --git a/ass1/binary_boyermoore.py b/ass1/binary_boyermoore.py
index 4c41c77..df75d62 100644
--- a/ass1/binary_boyermoore.py
+++ b/ass1/binary_boyermoore.py
@@ -1,12 +1,6 @@
-# txt = "zzzzzzzzzabczzzzzzzzzz"
-# pat = "abczzzabc"
-# m = len(pat)
-# n = len(txt)
-# R = [[m for __ in range(m)] for _ in range(0, 26)]
-# good_suffix = [0 for _ in range(0, m+1)]
-
import sys
+
def alpha_number(char):
if char == "0" or char == "1":
return int(char)
@@ -22,6 +16,7 @@ def compare(string, i, end):
if i+j == end or string[i+j] != string[j]:
return j
+
def condense(binary, offset=0, size=2):
out = ""
for i in range(offset, len(binary)-offset, size):
@@ -31,7 +26,6 @@ def condense(binary, offset=0, size=2):
return out
-
def gusfield(string):
z = [0 for _ in string]
z[0] = len(string)
@@ -59,20 +53,19 @@ def gusfield(string):
l = i
return z
+
def gen_jump_table(pat):
m = len(pat)
- R = [[-1 for __ in range(m)] for _ in range(0, 256)]
+ R = [[-1 for __ in range(m)] for _ in range(0, 4)] # A space for a, b, c and d used in groupings
for j in range(m):
for i in range(j+1):
R[alpha_number(pat[i])][j] = i
return R
+
def gen_z_suffix(pat):
return reverse(gusfield(reverse(pat)))+[0]
-# print(list(pat))
-# print(R)
-# print(Z)
def gen_good_suffix(pat, Z):
m = len(pat)
@@ -82,7 +75,6 @@ def gen_good_suffix(pat, Z):
good_suffix[j] = i+1
return good_suffix
-# print("g", good_suffix)
def gen_matched_prefix(pat):
m = len(pat)
@@ -100,7 +92,6 @@ def preprocess(pat):
return R, good_suffix, matched_prefix
-
def boyer_moore(pat, txt):
R, good_suffix, matched_prefix = preprocess(pat)
m = len(pat)
@@ -108,70 +99,65 @@ def boyer_moore(pat, txt):
i = m-1
j = 0
occurrences = []
- galils = 0
comps = 0
galil = False
start = 0
stop = 0
while j <= n-m:
- match = pat[i] == txt[j+i]
+ match = pat[i] == txt[j+i] # The comparison
comps += 1
if match:
- if galil and stop >= i > start:
- galils += 1
+ if galil and stop >= i > start: # Able to perform Galil
i = max(start-1, 0)
galil = False
- if i == 0:
+ if i == 0: # A full match of pat
good_suffix_shift = m - matched_prefix[1]
j += good_suffix_shift
occurrences.append(j)
i = m-1
else:
- i -= 1
+ i -= 1 # A match, but not a full match yet
else:
mismatched = txt[j + i]
- bad_char_shift = i - R[alpha_number(mismatched)][i]
+ bad_char_shift = i - R[alpha_number(mismatched)][i] # Bad character shift
good_suffix_shift = 1
- if good_suffix[i+1] > 0:
+ if good_suffix[i+1] > 0: # Good Suffix case 1a
good_suffix_shift = m - good_suffix[i+1]
start = good_suffix[i+1] - m + i + 1
stop = good_suffix[i+1]
- elif good_suffix[i+1] == 0:
+ elif good_suffix[i+1] == 0: # Good Suffix case 1b (i.e. matched prefix)
good_suffix_shift = m - matched_prefix[i+1]
start = 0
stop = matched_prefix[i + 1]
- best_shift = max(good_suffix_shift, bad_char_shift)
+ best_shift = max(good_suffix_shift, bad_char_shift) # Choose the best shift
j += best_shift
galil = best_shift == good_suffix_shift
i = m-1
- print(comps)
return comps, occurrences
+
def two_to_the(n):
return 1 << n
-def chunky_search(pat, txt, factor=2):
+
+def chunky_search(pat, txt):
+ # I settled on grouping in 2s, since higher groupings took more runs of BMs than comparisons it saved
+ # tl;dr: 2 was the fastest for me
+ factor = 2
occurrences = []
comps = 0
+ # Do two separate BMs for each binary offset of the grouped bits in txt (i.e. offset by 0 and 1 bits to the right)
for offset in range(two_to_the(factor-1)):
- padding = format(offset, f"0{factor-1}b") if len(pat) % factor else ""
+ padding = offset if len(pat) % factor else "" # Padding to ensure each BM matches the correct groups
augmented_pat = f"{pat}{padding}"
c, o = boyer_moore(condense(augmented_pat, 0, factor), condense(txt, offset, factor))
comps += c
- print(offset, o)
- occurrences += o
- base_comps, base_occur = boyer_moore(pat, txt)
- print(base_occur[:20])
- print(occurrences[:10])
- print("*"*20)
- print(f"Chunky Optimisation: {len(occurrences)} occurences in {comps} comparisons.")
- print(f"Normal: {len(base_occur)} occurences in {base_comps} comparisons.")
- if base_comps > 0:
- print(f"{comps * 100 / base_comps:.3f}% of normal Boyer-Moore")
- print(f"{comps * 100 / 642096:.3f}% of their Boyer-Moore")
+ # Convert grouped match indices to normal ones with this O(n) operation
+ occurrences += list(map(lambda x: x*factor+offset, o))
return comps, occurrences
+
def read_args():
with open(sys.argv[1], "r") as txt_file:
txt = txt_file.read()
@@ -179,19 +165,21 @@ def read_args():
pat = pat_file.read()
return txt, pat
+
def output_matches(occurrences):
with open("output_binary_boyermoore.txt", "w") as file:
for o in occurrences:
file.write(f"{o}\n")
+
def main():
- factor = 2
if len(sys.argv) < 3:
print("Not enough arguments!")
else:
txt, pat = read_args()
- comps, occurrences = chunky_search(pat, txt, factor)
+ comps, occurrences = chunky_search(pat, txt)
print(comps)
output_matches(occurrences)
+
main() \ No newline at end of file
diff --git a/ass1/output_binary_boyermoore.txt b/ass1/output_binary_boyermoore.txt
index 92be1cb..193dbaa 100644
--- a/ass1/output_binary_boyermoore.txt
+++ b/ass1/output_binary_boyermoore.txt
@@ -1,942 +1,942 @@
-464
-1461
-4048
-5775
-6661
-6819
-8353
-8391
-8800
-10744
-12431
-14295
-14313
-15495
-16320
-18062
-18837
-19870
-21321
-21545
-22689
-23072
-24058
-24511
-26796
-27628
-29179
-30672
-31637
-32922
-37188
-37553
-38562
-40261
-44478
-45943
-46537
-46599
-46732
-47037
-48324
-50004
-50737
-51431
-51541
-51619
-51816
-53408
-53745
-54342
-55701
-57662
-59420
-60430
-61220
-62869
-63052
-63440
-63680
-64410
-64584
-64895
-66451
-67633
-68932
-69006
-69430
-70768
-71354
-71668
-73099
-73141
-73445
-73532
-73717
-74341
-74484
-74697
-74944
-75882
-76226
-76229
-76644
-76967
-77383
-77958
-79124
-79592
-79601
-81792
-83645
-83985
-84428
-84891
-85579
-87019
-87855
-88385
-89718
-89817
-90909
-92056
-94324
-97761
-99091
-99274
-99507
-102387
-102920
-103093
-103220
-103433
-103607
-103942
-104451
-104586
-105690
-108171
-109210
-109308
-109573
-110114
-111484
-112487
-113353
-113705
-116426
-117642
-118401
-118607
-118613
-118709
-121242
-127074
-128971
-129068
-130717
-130832
-131375
-132275
-134020
-134185
-136858
-138541
-138580
-144970
-146265
-147890
-148974
-149106
-149612
-150069
-154000
-155528
-155843
-156723
-159185
-160711
-161060
-162522
-162775
-162944
-163711
-163812
-164775
-165854
-165867
-165991
-166150
-170165
-170651
-170963
-171778
-172703
-173047
-179304
-180494
-180961
-181396
-182310
-182313
-182650
-183955
-184019
-185576
-185851
-185909
-188230
-188342
-188510
-188972
-190307
-191123
-191802
-193285
-193341
-194573
-196399
-196523
-196555
-197858
-200973
-201042
-201528
-204030
-204582
-204786
-205381
-206801
-208914
-209829
-210013
-210132
-212030
-212290
-212756
-213964
-214285
-215178
-215549
-218844
-220018
-220497
-220728
-220819
-223943
-224918
-226885
-227893
-227896
-228421
-228733
-230760
-232688
-232746
-234461
-234757
-234963
-236477
-239505
-239806
-240103
-240891
-242853
-244107
-245007
-245202
-245757
-247858
-250154
-250633
-250825
-251687
-251923
-253977
-257404
-257643
-257865
-260857
-262853
-263053
-263992
-264312
-264829
-266105
-267867
-268124
-268127
-268460
-270263
-271536
-271629
-272157
-272548
-273051
-273187
-273522
-274815
-275320
-277573
-277845
-278554
-279853
-281974
-282164
-282804
-285832
-287094
-287226
-288105
-289820
-290381
-291828
-292572
-293132
-294405
-298406
-298409
-300370
-301186
-302277
-302615
-302688
-303139
-303458
-303476
-303653
-303670
-303983
-304212
-305278
-306305
-307044
-307061
-308654
-309314
-310783
-311126
-311562
-313104
-313935
-314008
-314932
-318305
-318488
-318806
-319488
-319601
-319890
-321425
-322729
-323700
-324226
-325292
-325502
-326081
-328472
-328543
-328650
-328874
-328981
-330329
-331105
-338768
-340487
-340660
-341268
-342153
-344842
-348092
-348691
-350424
-350608
-350860
-351729
-352937
-353153
-353169
-353588
-354125
-355247
-355843
-356889
-357821
-358065
-358622
-359508
-359523
-359869
-360710
-362821
-364372
-367952
-371670
-371839
-372276
-373592
-374138
-374445
-375843
-377427
-378257
-380666
-381365
-383748
-385566
-387357
-387484
-387487
-388776
-390280
-392833
-392861
-393182
-393253
-393415
-393908
-394255
-395919
-396419
-396556
-396643
-396915
-397201
-398935
-399082
-400031
-400423
-401283
-401975
-403063
-403348
-406828
-409492
-410896
-411139
-413040
-413045
-414448
-414743
-415439
-416292
-418648
-419237
-421069
-421227
-423158
-424213
-425534
-428816
-429366
-430474
-430745
-431317
-431615
-432919
-433297
-433960
-434390
-434490
-435357
-436736
-436771
-437284
-440334
-440739
-443027
-443642
-443942
-444847
-446069
-447584
-448983
-449892
-451653
-451857
-452515
-453090
-453729
-454721
-455522
-455701
-456498
-458215
-459335
-461790
-466643
-470243
-472263
-475232
-479635
-480151
-480303
-482847
-483094
-483400
-485401
-485667
-487886
-488163
-488215
-491796
-495958
-496184
-497624
-2359
-3217
-4843
-7294
-8518
-8670
-9478
-9765
-9875
-10841
-13537
-14748
-15620
-15623
-16324
-22451
-23706
-24400
-24444
-25093
-27404
-28370
-32603
-32641
-32822
-35400
-37634
-38255
-38968
-39179
-41318
-41855
-42134
-42151
-42293
-42353
-42360
-43499
-44099
-44747
-45238
-46385
-46830
-47613
-48008
-48451
-48707
-50870
-54510
-54654
-55800
-57679
-57961
-58508
-59523
-60454
-60947
-62475
-62527
-63176
-64433
-66467
-67914
-68081
-70000
-71066
-72626
-72897
-73862
-76385
-77200
-77709
-77722
-77807
-78452
-83238
-83811
-84207
-84621
-84899
-85616
-85621
-86170
-88051
-88108
-88634
-88776
-88922
-88978
-89524
-89786
-90633
-91238
-92661
-93093
-95566
-102928
-103019
-107432
-108357
-109322
-110032
-110626
-113623
-113929
-114184
-114279
-114434
-115012
-115771
-119641
-120246
-121618
-125572
-127566
-128979
-129141
-134179
-137231
-137360
-137802
-137868
-137871
-139351
-140226
-140603
-140987
-141461
-142365
-142418
-143772
-144712
-145824
-150373
-152408
-153657
-156690
-157001
-158111
-158193
-158267
-158296
-159569
-164517
-164520
-165099
-166258
-169130
-171036
-176307
-176854
-177148
-178581
-178726
-178965
-179825
-179872
-181261
-182691
-183226
-183473
-187045
-187361
-187973
-188721
-188841
-190498
-191376
-194731
-197627
-198698
-201550
-203585
-203668
-203926
-204649
-206898
-209400
-210065
-210232
-210604
-211409
-212942
-213244
-213750
-215576
-216909
-217269
-217375
-220707
-224454
-225877
-225902
-226167
-226791
-227635
-227937
-230783
-231996
-233517
-237629
-237979
-239031
-239049
-239219
-240896
-242162
-243149
-245113
-245254
-245528
-246545
-247169
-247553
-248932
-249203
-249788
-251404
-251541
-252266
-253723
-254446
-254991
-255376
-255609
-256091
-259463
-261288
-262235
-262864
-270983
-271414
-272403
-272509
-272828
-273489
-273976
-274032
+928
+2922
+8096
+11550
+13322
+13638
+16706
+16782
+17600
+21488
+24862
+28590
+28626
+30990
+32640
+36124
+37674
+39740
+42642
+43090
+45378
+46144
+48116
+49022
+53592
+55256
+58358
+61344
+63274
+65844
+74376
+75106
+77124
+80522
+88956
+91886
+93074
+93198
+93464
+94074
+96648
+100008
+101474
+102862
+103082
+103238
+103632
+106816
+107490
+108684
+111402
+115324
+118840
+120860
+122440
+125738
+126104
+126880
+127360
+128820
+129168
+129790
+132902
+135266
+137864
+138012
+138860
+141536
+142708
+143336
+146198
+146282
+146890
+147064
+147434
+148682
+148968
+149394
+149888
+151764
+152452
+152458
+153288
+153934
+154766
+155916
+158248
+159184
+159202
+163584
+167290
+167970
+168856
+169782
+171158
+174038
+175710
+176770
+179436
+179634
+181818
+184112
+188648
+195522
+198182
+198548
+199014
+204774
+205840
+206186
+206440
+206866
+207214
+207884
+208902
+209172
+211380
+216342
+218420
+218616
+219146
+220228
+222968
+224974
+226706
+227410
+232852
+235284
+236802
+237214
+237226
+237418
+242484
+254148
+257942
+258136
+261434
+261664
+262750
+264550
+268040
+268370
+273716
+277082
+277160
+289940
+292530
+295780
+297948
+298212
+299224
+300138
+308000
+311056
+311686
+313446
+318370
+321422
+322120
+325044
+325550
+325888
+327422
+327624
+329550
+331708
+331734
+331982
+332300
+340330
+341302
+341926
+343556
+345406
+346094
+358608
+360988
+361922
+362792
+364620
+364626
+365300
+367910
+368038
+371152
+371702
+371818
+376460
+376684
+377020
+377944
+380614
+382246
+383604
+386570
+386682
+389146
+392798
+393046
+393110
+395716
+401946
+402084
+403056
+408060
+409164
+409572
+410762
+413602
+417828
+419658
+420026
+420264
+424060
+424580
+425512
+427928
+428570
+430356
+431098
+437688
+440036
+440994
+441456
+441638
+447886
+449836
+453770
+455786
+455792
+456842
+457466
+461520
+465376
+465492
+468922
+469514
+469926
+472954
+479010
+479612
+480206
+481782
+485706
+488214
+490014
+490404
+491514
+495716
+500308
+501266
+501650
+503374
+503846
+507954
+514808
+515286
+515730
+521714
+525706
+526106
+527984
+528624
+529658
+532210
+535734
+536248
+536254
+536920
+540526
+543072
+543258
+544314
+545096
+546102
+546374
+547044
+549630
+550640
+555146
+555690
+557108
+559706
+563948
+564328
+565608
+571664
+574188
+574452
+576210
+579640
+580762
+583656
+585144
+586264
+588810
+596812
+596818
+600740
+602372
+604554
+605230
+605376
+606278
+606916
+606952
+607306
+607340
+607966
+608424
+610556
+612610
+614088
+614122
+617308
+618628
+621566
+622252
+623124
+626208
+627870
+628016
+629864
+636610
+636976
+637612
+638976
+639202
+639780
+642850
+645458
+647400
+648452
+650584
+651004
+652162
+656944
+657086
+657300
+657748
+657962
+660658
+662210
+677536
+680974
+681320
+682536
+684306
+689684
+696184
+697382
+700848
+701216
+701720
+703458
+705874
+706306
+706338
+707176
+708250
+710494
+711686
+713778
+715642
+716130
+717244
+719016
+719046
+719738
+721420
+725642
+728744
+735904
+743340
+743678
+744552
+747184
+748276
+748890
+751686
+754854
+756514
+761332
+762730
+767496
+771132
+774714
+774968
+774974
+777552
+780560
+785666
+785722
+786364
+786506
+786830
+787816
+788510
+791838
+792838
+793112
+793286
+793830
+794402
+797870
+798164
+800062
+800846
+802566
+803950
+806126
+806696
+813656
+818984
+821792
+822278
+826080
+826090
+828896
+829486
+830878
+832584
+837296
+838474
+842138
+842454
+846316
+848426
+851068
+857632
+858732
+860948
+861490
+862634
+863230
+865838
+866594
+867920
+868780
+868980
+870714
+873472
+873542
+874568
+880668
+881478
+886054
+887284
+887884
+889694
+892138
+895168
+897966
+899784
+903306
+903714
+905030
+906180
+907458
+909442
+911044
+911402
+912996
+916430
+918670
+923580
+933286
+940486
+944526
+950464
+959270
+960302
+960606
+965694
+966188
+966800
+970802
+971334
+975772
+976326
+976430
+983592
+991916
+992368
+995248
+4719
+6435
+9687
+14589
+17037
+17341
+18957
+19531
+19751
+21683
+27075
+29497
+31241
+31247
+32649
+44903
+47413
+48801
+48889
+50187
+54809
+56741
+65207
+65283
+65645
+70801
+75269
+76511
+77937
+78359
+82637
+83711
+84269
+84303
+84587
+84707
+84721
+86999
+88199
+89495
+90477
+92771
+93661
+95227
+96017
+96903
+97415
+101741
+109021
+109309
+111601
+115359
+115923
+117017
+119047
+120909
+121895
+124951
+125055
+126353
+128867
+132935
+135829
+136163
+140001
+142133
+145253
+145795
+147725
+152771
+154401
+155419
+155445
+155615
+156905
+166477
+167623
+168415
+169243
+169799
+171233
+171243
+172341
+176103
+176217
+177269
+177553
+177845
+177957
+179049
+179573
+181267
+182477
+185323
+186187
+191133
+205857
+206039
+214865
+216715
+218645
+220065
+221253
+227247
+227859
+228369
+228559
+228869
+230025
+231543
+239283
+240493
+243237
+251145
+255133
+257959
+258283
+268359
+274463
274721
-275058
-275367
-276219
-276992
-278448
-279219
-279929
-280113
-280490
-283328
-285863
-286528
-286534
-286947
-287759
-288806
-288843
-288911
-289479
-291432
-292360
-292648
-292707
-296090
-298451
-298630
-299816
-301218
-301484
-302457
-303241
-303648
-305813
-307809
-308262
-309118
-309420
-311143
-312921
-314304
-315955
-316844
-316847
-317509
-318434
-319178
-319255
-320141
-320633
-320656
-320779
-321005
-323265
-324316
-325662
-328316
-328960
-329440
-331389
-334246
-334975
-335031
-335672
-337369
-337397
-337478
-338142
-339312
-340106
-343232
-343235
-343299
-343910
-344967
-345175
-345329
-345790
-347511
-347994
-348415
-348880
-349163
-349185
-351119
-352260
-353413
-355098
-356599
-358233
-360033
-360036
-362141
-367612
-367862
-368768
-371462
-372167
-372497
-372664
-374788
-376922
-377086
-377145
-377749
-378128
-378913
-382473
-385185
-386361
-387981
-389800
-390639
-391624
-391879
-393878
-396167
-396872
-397244
-397801
-398552
-400255
-401703
-402890
-404461
-406297
-406583
-408130
-408246
-408278
-409641
-410605
-410804
-411225
-411262
-411521
-413329
-413754
-415508
-418754
-421379
-422308
-422775
-422880
-423210
-423442
-423789
-423792
-424072
-424134
-425285
-425288
-425755
-427064
-428490
-429542
-429545
-430139
-430537
-430680
-433466
-434862
-435787
-435790
-437340
-437467
-437951
-438061
-439292
-441342
-443782
-445097
-448606
-449774
-454558
-455967
-457524
-457595
-459511
-459565
-460425
-460826
-461089
-461837
-462185
-462868
-463736
-464870
-467091
-467164
-467766
-468613
-468992
-469306
-469856
-472282
-476579
-477121
-477547
-478201
-478887
-480888
-484035
-484204
-485190
-486400
-487853
-488007
-488253
-488546
-488549
-488552
-490084
-491317
-494446
-494985
-495868
-498466
-498520
+275605
+275737
+275743
+278703
+280453
+281207
+281975
+282923
+284731
+284837
+287545
+289425
+291649
+300747
+304817
+307315
+313381
+314003
+316223
+316387
+316535
+316593
+319139
+329035
+329041
+330199
+332517
+338261
+342073
+352615
+353709
+354297
+357163
+357453
+357931
+359651
+359745
+362523
+365383
+366453
+366947
+374091
+374723
+375947
+377443
+377683
+380997
+382753
+389463
+395255
+397397
+403101
+407171
+407337
+407853
+409299
+413797
+418801
+420131
+420465
+421209
+422819
+425885
+426489
+427501
+431153
+433819
+434539
+434751
+441415
+448909
+451755
+451805
+452335
+453583
+455271
+455875
+461567
+463993
+467035
+475259
+475959
+478063
+478099
+478439
+481793
+484325
+486299
+490227
+490509
+491057
+493091
+494339
+495107
+497865
+498407
+499577
+502809
+503083
+504533
+507447
+508893
+509983
+510753
+511219
+512183
+518927
+522577
+524471
+525729
+541967
+542829
+544807
+545019
+545657
+546979
+547953
+548065
+549443
+550117
+550735
+552439
+553985
+556897
+558439
+559859
+560227
+560981
+566657
+571727
+573057
+573069
+573895
+575519
+577613
+577687
+577823
+578959
+582865
+584721
+585297
+585415
+592181
+596903
+597261
+599633
+602437
+602969
+604915
+606483
+607297
+611627
+615619
+616525
+618237
+618841
+622287
+625843
+628609
+631911
+633689
+633695
+635019
+636869
+638357
+638511
+640283
+641267
+641313
+641559
+642011
+646531
+648633
+651325
+656633
+657921
+658881
+662779
+668493
+669951
+670063
+671345
+674739
+674795
+674957
+676285
+678625
+680213
+686465
+686471
+686599
+687821
+689935
+690351
+690659
+691581
+695023
+695989
+696831
+697761
+698327
+698371
+702239
+704521
+706827
+710197
+713199
+716467
+720067
+720073
+724283
+735225
+735725
+737537
+742925
+744335
+744995
+745329
+749577
+753845
+754173
+754291
+755499
+756257
+757827
+764947
+770371
+772723
+775963
+779601
+781279
+783249
+783759
+787757
+792335
+793745
+794489
+795603
+797105
+800511
+803407
+805781
+808923
+812595
+813167
+816261
+816493
+816557
+819283
+821211
+821609
+822451
+822525
+823043
+826659
+827509
+831017
+837509
+842759
+844617
+845551
+845761
+846421
+846885
+847579
+847585
+848145
+848269
+850571
+850577
+851511
+854129
+856981
+859085
+859091
+860279
+861075
+861361
+866933
+869725
+871575
+871581
+874681
+874935
+875903
+876123
+878585
+882685
+887565
+890195
+897213
+899549
+909117
+911935
+915049
+915191
+919023
+919131
+920851
+921653
+922179
+923675
+924371
+925737
+927473
+929741
+934183
+934329
+935533
+937227
+937985
+938613
+939713
+944565
+953159
+954243
+955095
+956403
+957775
+961777
+968071
+968409
+970381
+972801
+975707
+976015
+976507
+977093
+977099
+977105
+980169
+982635
+988893
+989971
+991737
+996933
+997041