aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--ass1/editdist.py89
-rw-r--r--ass1/output_editdist.txt1
-rw-r--r--ass1/patfile.txt1
-rw-r--r--ass1/txtfile.txt1
-rw-r--r--w2/w2lab/main.py (renamed from w2lab/main.py)0
-rw-r--r--w2/w2lect.md30
6 files changed, 122 insertions, 0 deletions
diff --git a/ass1/editdist.py b/ass1/editdist.py
new file mode 100644
index 0000000..ae5b0d1
--- /dev/null
+++ b/ass1/editdist.py
@@ -0,0 +1,89 @@
+import sys
+
+OUTPUT_PATH = "output_editdist.txt"
+
+def compare(string, i, end):
+ for j in range(end):
+ if i+j == end or string[i+j] != string[j]:
+ return j
+
+
+def gusfield(string):
+ z = [0 for _ in string]
+ z[0] = len(string)
+ r = 0
+ l = 0
+ for i in range(1, len(string)):
+ if i == 1: # base case
+ z[1] = compare(string, i, len(string))
+ if z[1] > 0:
+ r = z[1] + 1
+ l = 1
+ elif i > r: # Case 1
+ z[i] = compare(string, i, len(string))
+ if z[i] > 0:
+ q = i + z[i]
+ r = q - 1
+ l = i
+ elif i <= r: # Case 2
+ if z[i-l] < r-i: # Case 2a
+ z[i] = z[i-l]
+ else: # Case 2b
+ q = compare(string, i, len(string))
+ z[i] = q
+ r = q
+ l = i
+ return z
+
+
+def reverse(string):
+ return string[::-1]
+ # new = " " * len(string)
+ # for i in range(len(new)):
+ # new[i] = string[-i]
+
+ # return [string[c] for c in range(len(string)-1, -1, -1)]
+
+
+def comapre_zs(z_prefix, z_suffix, pat_length):
+ with open(OUTPUT_PATH, "w") as file:
+ for i in range(pat_length+1, len(z_prefix)-pat_length-1):
+ distance = None
+ if z_prefix[i] == pat_length:
+ distance = 0
+ elif z_prefix[i] + z_suffix[i+pat_length-1] >= pat_length-1:
+ distance = 1
+ if distance is not None:
+ file.write(f"{i-pat_length-1} {distance}\n")
+ print(f"===={i-pat_length-1} {distance}")
+ print(i-pat_length-1, z_prefix[i], z_suffix[i+pat_length-1])
+
+
+
+def edit_distance(txt, pat):
+ forwards = pat + "$" + txt
+ backwards = reverse(pat) + "$" + txt
+ print(list(forwards))
+ print(list(backwards))
+ print(gusfield(forwards))
+ print(gusfield(backwards))
+ print("="*15)
+ comapre_zs(gusfield(forwards), gusfield(backwards), len(pat))
+
+def read_args():
+ with open(sys.argv[1], "r") as txt_file:
+ txt = txt_file.read()
+ with open(sys.argv[2], "r") as pat_file:
+ pat = pat_file.read()
+ return txt, pat
+
+def main():
+ if len(sys.argv) < 3:
+ print("Not enough arguments!")
+ else:
+ txt, pat = read_args()
+ edit_distance(txt, pat)
+
+if __name__ == '__main__':
+ main()
+
diff --git a/ass1/output_editdist.txt b/ass1/output_editdist.txt
new file mode 100644
index 0000000..6e8183b
--- /dev/null
+++ b/ass1/output_editdist.txt
@@ -0,0 +1 @@
+0 1
diff --git a/ass1/patfile.txt b/ass1/patfile.txt
new file mode 100644
index 0000000..85df507
--- /dev/null
+++ b/ass1/patfile.txt
@@ -0,0 +1 @@
+abcd \ No newline at end of file
diff --git a/ass1/txtfile.txt b/ass1/txtfile.txt
new file mode 100644
index 0000000..9414491
--- /dev/null
+++ b/ass1/txtfile.txt
@@ -0,0 +1 @@
+yyybcdyyy \ No newline at end of file
diff --git a/w2lab/main.py b/w2/w2lab/main.py
index 2914da3..2914da3 100644
--- a/w2lab/main.py
+++ b/w2/w2lab/main.py
diff --git a/w2/w2lect.md b/w2/w2lect.md
new file mode 100644
index 0000000..3fee16a
--- /dev/null
+++ b/w2/w2lect.md
@@ -0,0 +1,30 @@
+# Boyer-Moore
+
+## Bad character rule:
+
+Shift pattern along to the left to the right-most version of the bad character
+$O(m+n)$ *mostly*
+
+## Extended bad character rule
+- 2D array for each char and each position in the pattern
+- **Reduces naive shifts (i.e. by 1 to the right) but takes more space**
+ - Could use linked lists or something but that takes more time
+
+## Good suffix rule
+**Makes Boyer-Moore worst case *almost* linear time rather than squared time**
+- A suffix before the bad character that you know matches the text
+- character to the left of the next instance of the good suffix must be different to the one to the left of the original suffix
+- Move pattern along to the right to the point where the next suffix in the pattern matches
+
+## Galil's optimization
+Improves on good suffix rules
+**Actually linear time**
+extended just makes it slightly faster and doesn't change the complexity
+
+# KNP
+- $O(m+n)$
+- Easier to write, simpler
+- Slower in practise than Boyer-Moore
+- If mismatched first charatcer, move left by 1 (not covered by slides)
+- Use Galil's on this too
+- Proof is examinable (BM isn't)