Ass 2: Comments

author: akiyamn 2021-05-02 17:40:33 +1000
committer: akiyamn 2021-05-02 17:40:33 +1000
commit: 1b2233099db80b8c3a513d137045aae7de7a252f (patch)
tree: ef9d8fbc796ea2bd387c70fabed09f3569cd43d8 /ass2/q3/lcp.py
parent: 3c60256c2bb81613e7ebc6919fd06d37f3bf318c (diff)
download: fit3155-1b2233099db80b8c3a513d137045aae7de7a252f.tar.gz
fit3155-1b2233099db80b8c3a513d137045aae7de7a252f.zip
1 files changed, 43 insertions, 75 deletions
diff --git a/ass2/q3/lcp.py b/ass2/q3/lcp.py
index 7e90ae0..6c72771 100644
--- a/ass2/q3/lcp.py
+++ b/ass2/q3/lcp.py
@@ -1,99 +1,67 @@
-from random import randrange
+import sys
+from ass2.ukkonen import ukkonen, Point, Node
 
-from ass2.ukkonen import ukkonen, Point, char_is_after, Node, skip_count
-from itertools import product
 
+def longest_prefix(root: Node, s1: str, s2: str, offset: tuple):
+    """
+    Finds the longest prefix of a suffix of two different strings, provided a suffix tree of "s1&s2"
+    The offset tuple represents the start index of each suffix to compare to.
 
-# s1 = "abdabc"
-# s2 = "daba"
-# s1 = "abaabaaba"
-# s2 = "baabaaabaa"
-
-# def longest_prefix(root: Node, s1: str, s2: str, suffixes: tuple):
-#     print(s1[suffixes[0]:])
-#     print(s2[suffixes[1]:])
-#     matches = 0
-#     if suffixes[0] == 0:
-#         point = Point(root, s1[0], 1)
-#     else:
-#         point = skip_count(suffixes[0], Point(root), 0)
-#     for i in range(suffixes[1], len(s2) + 1):
-#         print(s2[i], end=" ")
-#         if char_is_after(point, s2[i]):
-#             if point.is_explicit():
-#                 point = Point(point.node, s2[i], 1)
-#             else:
-#                 point = skip_count(1, point, i)
-#             matches += 1
-#             print("found")
-#         else:
-#             print("not found")
-#             break
-#     return matches
-
-def naive(s1, s2, offset):
-    count = 0
-    for t in list(map(lambda x: x[0] == x[1], zip(s1[offset[0]:], s2[offset[1]:]))):
-        if t:
-            count += 1
-        else:
-            return count
-    return count
-
-
-def lcp(root: Node, s1: str, s2: str, offset: tuple):
+    Starting from the root, if the two strings have a matching character, move in that direction of the suffix tree
+    Increment the index to compare the next characters by, by the length of the edge just traveled
+    Stop when both strings 'disagree' on which path to take.
+    This effectively finds the deepest common ancestor of two suffixes on the same tree.
+    """
     index = 0
     head = Point(root)
     shortest = min(len(s1) - offset[0], len(s2) - offset[1])
-    while index < shortest:
+    while index < shortest:  # Stop searching when you hit the end of the shortest string
         if s1[index + offset[0]] == s2[index + offset[1]]:
             going_to = head.node.get_child(s1[index + offset[0]])
-            head.set_node(going_to)
+            head.set_node(going_to)  # Move to new chosen node
             index += going_to.edge_length
         else:
             return index
     return index
 
 
-def test_everything_for(tree, s1, s2):
-    # print(s1, s2)
-    for i in range(len(s1)):
-        for j in range(len(s2)):
-            result = lcp(tree, s1, s2, (i, j))
-            a = naive(s1, s2, (i, j))
-            assert result == a
+def read_in_string(filename):
+    """
+    Reads in a string from a given file name
+    """
+    with open(filename, "r") as file:
+        return file.read()
 
 
-def new_string(max_letters, max_len):
-    return "".join([chr(randrange(ord("a"), ord("a") + max_letters)) for _ in range(0, max_len)])
+def create_suffix_list(filename):
+    """
+    From the suffix index file, generate a list of tuples representing each suffix pair to search
+    """
+    tuples = []
+    for line in read_in_string(filename).split("\n"):
+        parts = line.split(" ")
+        tuples.append((int(parts[0]), int(parts[1])))
+    return tuples
 
 
-def test(max_letters, max_len):
-    alphabet = "".join(map(chr, range(ord("a"), ord("a") + max_letters)))
-    counter = 0
-    while True:
-        s1, s2 = new_string(max_letters, max_len), new_string(max_letters, max_len)
-        tree = ukkonen(s1 + "z" + s2)
-        test_everything_for(tree, s1, s2)
-        counter += 1
-        if counter % 1000 == 0:
-            print(counter)
+def write_longest_prefixes(root: Node, s1: str, s2: str, suffixes):
+    """
+    Write the longest common suffixes of two strings provided a list of suffix index pairs to file
+    """
+    results = map(lambda t: f"{t[0]} {t[1]} {longest_prefix(root, s1, s2, t)}", suffixes)
+    with open("output_lcp.txt", "w") as file:
+        file.write("\n".join(results))
 
 
 def main():
-    s1 = "abcdacbdab"
-    s2 = "dacbdabc"
-    print(s1 + "&" + s2)
-    tree = ukkonen(s1 + "&" + s2)
-    tree.print_tree()
-
-    # tup = (4, 5)
-    # result = longest_prefix(tree, s1, s2, (0, 5))
-    # result = lcp(tree, s1, s2, tup)
-    # print(naive(s1, s2, tup))
-    # print(result)
+    assert len(sys.argv) == 4  # Need correct number of arguments
+    s1 = read_in_string(sys.argv[1])
+    s2 = read_in_string(sys.argv[2])
+    suffixes = create_suffix_list(sys.argv[3])
+    combined = s1 + "&" + s2
+    tree = ukkonen(combined)  # Generate the suffix tree for both strings once
+    write_longest_prefixes(tree, s1, s2, suffixes) # Write answers to file
 
 
 if __name__ == "__main__":
-    test(4, 50)
-    # main()
+    main()
author	akiyamn	2021-05-02 17:40:33 +1000
committer	akiyamn	2021-05-02 17:40:33 +1000
commit	1b2233099db80b8c3a513d137045aae7de7a252f (patch)
tree	ef9d8fbc796ea2bd387c70fabed09f3569cd43d8 /ass2/q3/lcp.py
parent	3c60256c2bb81613e7ebc6919fd06d37f3bf318c (diff)
download	fit3155-1b2233099db80b8c3a513d137045aae7de7a252f.tar.gz fit3155-1b2233099db80b8c3a513d137045aae7de7a252f.zip