diff options
| -rw-r--r-- | ass1/editdist.py | 89 | ||||
| -rw-r--r-- | ass1/output_editdist.txt | 1 | ||||
| -rw-r--r-- | ass1/patfile.txt | 1 | ||||
| -rw-r--r-- | ass1/txtfile.txt | 1 | ||||
| -rw-r--r-- | w2/w2lab/main.py (renamed from w2lab/main.py) | 0 | ||||
| -rw-r--r-- | w2/w2lect.md | 30 |
6 files changed, 122 insertions, 0 deletions
diff --git a/ass1/editdist.py b/ass1/editdist.py new file mode 100644 index 0000000..ae5b0d1 --- /dev/null +++ b/ass1/editdist.py @@ -0,0 +1,89 @@ +import sys + +OUTPUT_PATH = "output_editdist.txt" + +def compare(string, i, end): + for j in range(end): + if i+j == end or string[i+j] != string[j]: + return j + + +def gusfield(string): + z = [0 for _ in string] + z[0] = len(string) + r = 0 + l = 0 + for i in range(1, len(string)): + if i == 1: # base case + z[1] = compare(string, i, len(string)) + if z[1] > 0: + r = z[1] + 1 + l = 1 + elif i > r: # Case 1 + z[i] = compare(string, i, len(string)) + if z[i] > 0: + q = i + z[i] + r = q - 1 + l = i + elif i <= r: # Case 2 + if z[i-l] < r-i: # Case 2a + z[i] = z[i-l] + else: # Case 2b + q = compare(string, i, len(string)) + z[i] = q + r = q + l = i + return z + + +def reverse(string): + return string[::-1] + # new = " " * len(string) + # for i in range(len(new)): + # new[i] = string[-i] + + # return [string[c] for c in range(len(string)-1, -1, -1)] + + +def comapre_zs(z_prefix, z_suffix, pat_length): + with open(OUTPUT_PATH, "w") as file: + for i in range(pat_length+1, len(z_prefix)-pat_length-1): + distance = None + if z_prefix[i] == pat_length: + distance = 0 + elif z_prefix[i] + z_suffix[i+pat_length-1] >= pat_length-1: + distance = 1 + if distance is not None: + file.write(f"{i-pat_length-1} {distance}\n") + print(f"===={i-pat_length-1} {distance}") + print(i-pat_length-1, z_prefix[i], z_suffix[i+pat_length-1]) + + + +def edit_distance(txt, pat): + forwards = pat + "$" + txt + backwards = reverse(pat) + "$" + txt + print(list(forwards)) + print(list(backwards)) + print(gusfield(forwards)) + print(gusfield(backwards)) + print("="*15) + comapre_zs(gusfield(forwards), gusfield(backwards), len(pat)) + +def read_args(): + with open(sys.argv[1], "r") as txt_file: + txt = txt_file.read() + with open(sys.argv[2], "r") as pat_file: + pat = pat_file.read() + return txt, pat + +def main(): + if len(sys.argv) < 3: + print("Not enough arguments!") + else: + txt, pat = read_args() + edit_distance(txt, pat) + +if __name__ == '__main__': + main() + diff --git a/ass1/output_editdist.txt b/ass1/output_editdist.txt new file mode 100644 index 0000000..6e8183b --- /dev/null +++ b/ass1/output_editdist.txt @@ -0,0 +1 @@ +0 1 diff --git a/ass1/patfile.txt b/ass1/patfile.txt new file mode 100644 index 0000000..85df507 --- /dev/null +++ b/ass1/patfile.txt @@ -0,0 +1 @@ +abcd
\ No newline at end of file diff --git a/ass1/txtfile.txt b/ass1/txtfile.txt new file mode 100644 index 0000000..9414491 --- /dev/null +++ b/ass1/txtfile.txt @@ -0,0 +1 @@ +yyybcdyyy
\ No newline at end of file diff --git a/w2lab/main.py b/w2/w2lab/main.py index 2914da3..2914da3 100644 --- a/w2lab/main.py +++ b/w2/w2lab/main.py diff --git a/w2/w2lect.md b/w2/w2lect.md new file mode 100644 index 0000000..3fee16a --- /dev/null +++ b/w2/w2lect.md @@ -0,0 +1,30 @@ +# Boyer-Moore + +## Bad character rule: + +Shift pattern along to the left to the right-most version of the bad character +$O(m+n)$ *mostly* + +## Extended bad character rule +- 2D array for each char and each position in the pattern +- **Reduces naive shifts (i.e. by 1 to the right) but takes more space** + - Could use linked lists or something but that takes more time + +## Good suffix rule +**Makes Boyer-Moore worst case *almost* linear time rather than squared time** +- A suffix before the bad character that you know matches the text +- character to the left of the next instance of the good suffix must be different to the one to the left of the original suffix +- Move pattern along to the right to the point where the next suffix in the pattern matches + +## Galil's optimization +Improves on good suffix rules +**Actually linear time** +extended just makes it slightly faster and doesn't change the complexity + +# KNP +- $O(m+n)$ +- Easier to write, simpler +- Slower in practise than Boyer-Moore +- If mismatched first charatcer, move left by 1 (not covered by slides) +- Use Galil's on this too +- Proof is examinable (BM isn't) |
