Ass 2: Comments

author: akiyamn 2021-05-02 17:40:33 +1000
committer: akiyamn 2021-05-02 17:40:33 +1000
commit: 1b2233099db80b8c3a513d137045aae7de7a252f (patch)
tree: ef9d8fbc796ea2bd387c70fabed09f3569cd43d8 /ass2
parent: 3c60256c2bb81613e7ebc6919fd06d37f3bf318c (diff)
download: fit3155-1b2233099db80b8c3a513d137045aae7de7a252f.tar.gz
fit3155-1b2233099db80b8c3a513d137045aae7de7a252f.zip
3 files changed, 159 insertions, 111 deletions
diff --git a/ass2/q2/suffix_array.py b/ass2/q2/suffix_array.py
index edfaa80..78cf6c4 100644
--- a/ass2/q2/suffix_array.py
+++ b/ass2/q2/suffix_array.py
@@ -3,6 +3,9 @@ import sys
 
 
 def depth_first_apply(node: Node, func):
+    """
+    Apply a given function to every leaf node in the suffix tree via DFS
+    """
     if not node.children:
         func(node.suffix_index)
     else:
@@ -11,11 +14,17 @@ def depth_first_apply(node: Node, func):
 
 
 def read_in_string(filename):
+    """
+    Reads in a string from a given file name
+    """
     with open(filename, "r") as file:
         return file.read()
 
 
 def suffix_array(string):
+    """
+    Generates a suffix array from a given string using Ukkonen's and a depth first search
+    """
     tree = ukkonen(string)
     buffer = []
     depth_first_apply(tree, buffer.append)
@@ -23,6 +32,9 @@ def suffix_array(string):
 
 
 def write_suffix_array(string, filename):
+    """
+    Writes a suffix array for a given string to a given file name.
+    """
     with open(filename, "w") as file:
         buffer = suffix_array(string)
         file.write("\n".join(map(str, buffer)))
diff --git a/ass2/q3/lcp.py b/ass2/q3/lcp.py
index 7e90ae0..6c72771 100644
--- a/ass2/q3/lcp.py
+++ b/ass2/q3/lcp.py
@@ -1,99 +1,67 @@
-from random import randrange
+import sys
+from ass2.ukkonen import ukkonen, Point, Node
 
-from ass2.ukkonen import ukkonen, Point, char_is_after, Node, skip_count
-from itertools import product
 
+def longest_prefix(root: Node, s1: str, s2: str, offset: tuple):
+    """
+    Finds the longest prefix of a suffix of two different strings, provided a suffix tree of "s1&s2"
+    The offset tuple represents the start index of each suffix to compare to.
 
-# s1 = "abdabc"
-# s2 = "daba"
-# s1 = "abaabaaba"
-# s2 = "baabaaabaa"
-
-# def longest_prefix(root: Node, s1: str, s2: str, suffixes: tuple):
-#     print(s1[suffixes[0]:])
-#     print(s2[suffixes[1]:])
-#     matches = 0
-#     if suffixes[0] == 0:
-#         point = Point(root, s1[0], 1)
-#     else:
-#         point = skip_count(suffixes[0], Point(root), 0)
-#     for i in range(suffixes[1], len(s2) + 1):
-#         print(s2[i], end=" ")
-#         if char_is_after(point, s2[i]):
-#             if point.is_explicit():
-#                 point = Point(point.node, s2[i], 1)
-#             else:
-#                 point = skip_count(1, point, i)
-#             matches += 1
-#             print("found")
-#         else:
-#             print("not found")
-#             break
-#     return matches
-
-def naive(s1, s2, offset):
-    count = 0
-    for t in list(map(lambda x: x[0] == x[1], zip(s1[offset[0]:], s2[offset[1]:]))):
-        if t:
-            count += 1
-        else:
-            return count
-    return count
-
-
-def lcp(root: Node, s1: str, s2: str, offset: tuple):
+    Starting from the root, if the two strings have a matching character, move in that direction of the suffix tree
+    Increment the index to compare the next characters by, by the length of the edge just traveled
+    Stop when both strings 'disagree' on which path to take.
+    This effectively finds the deepest common ancestor of two suffixes on the same tree.
+    """
     index = 0
     head = Point(root)
     shortest = min(len(s1) - offset[0], len(s2) - offset[1])
-    while index < shortest:
+    while index < shortest:  # Stop searching when you hit the end of the shortest string
         if s1[index + offset[0]] == s2[index + offset[1]]:
             going_to = head.node.get_child(s1[index + offset[0]])
-            head.set_node(going_to)
+            head.set_node(going_to)  # Move to new chosen node
             index += going_to.edge_length
         else:
             return index
     return index
 
 
-def test_everything_for(tree, s1, s2):
-    # print(s1, s2)
-    for i in range(len(s1)):
-        for j in range(len(s2)):
-            result = lcp(tree, s1, s2, (i, j))
-            a = naive(s1, s2, (i, j))
-            assert result == a
+def read_in_string(filename):
+    """
+    Reads in a string from a given file name
+    """
+    with open(filename, "r") as file:
+        return file.read()
 
 
-def new_string(max_letters, max_len):
-    return "".join([chr(randrange(ord("a"), ord("a") + max_letters)) for _ in range(0, max_len)])
+def create_suffix_list(filename):
+    """
+    From the suffix index file, generate a list of tuples representing each suffix pair to search
+    """
+    tuples = []
+    for line in read_in_string(filename).split("\n"):
+        parts = line.split(" ")
+        tuples.append((int(parts[0]), int(parts[1])))
+    return tuples
 
 
-def test(max_letters, max_len):
-    alphabet = "".join(map(chr, range(ord("a"), ord("a") + max_letters)))
-    counter = 0
-    while True:
-        s1, s2 = new_string(max_letters, max_len), new_string(max_letters, max_len)
-        tree = ukkonen(s1 + "z" + s2)
-        test_everything_for(tree, s1, s2)
-        counter += 1
-        if counter % 1000 == 0:
-            print(counter)
+def write_longest_prefixes(root: Node, s1: str, s2: str, suffixes):
+    """
+    Write the longest common suffixes of two strings provided a list of suffix index pairs to file
+    """
+    results = map(lambda t: f"{t[0]} {t[1]} {longest_prefix(root, s1, s2, t)}", suffixes)
+    with open("output_lcp.txt", "w") as file:
+        file.write("\n".join(results))
 
 
 def main():
-    s1 = "abcdacbdab"
-    s2 = "dacbdabc"
-    print(s1 + "&" + s2)
-    tree = ukkonen(s1 + "&" + s2)
-    tree.print_tree()
-
-    # tup = (4, 5)
-    # result = longest_prefix(tree, s1, s2, (0, 5))
-    # result = lcp(tree, s1, s2, tup)
-    # print(naive(s1, s2, tup))
-    # print(result)
+    assert len(sys.argv) == 4  # Need correct number of arguments
+    s1 = read_in_string(sys.argv[1])
+    s2 = read_in_string(sys.argv[2])
+    suffixes = create_suffix_list(sys.argv[3])
+    combined = s1 + "&" + s2
+    tree = ukkonen(combined)  # Generate the suffix tree for both strings once
+    write_longest_prefixes(tree, s1, s2, suffixes) # Write answers to file
 
 
 if __name__ == "__main__":
-    test(4, 50)
-    # main()
+    main()
diff --git a/ass2/ukkonen.py b/ass2/ukkonen.py
index bc8b563..5aa94e7 100644
--- a/ass2/ukkonen.py
+++ b/ass2/ukkonen.py
@@ -1,8 +1,21 @@
+"""
+This file is imported into questions 2 and 3.
+"""
+
 import sys
 
 ALPHABET_SIZE = 28
 
+
 class OrderedDict(dict):
+    """
+    A hybrid Python dictionary/list
+        All set/get item operations on this data structure are the same complexity of a normal dictionary O(1)-ish
+        For Ukkonen's operation, only the normal dictionary features are used.
+    As values are stored in the dictionary, the are also referenced in a list of size O(alphabet).
+        This acts as a kind of 'counting sort' when accessed is O(n), but provides a pre-sorted list of all children nodes
+        This is used for generating the suffix array.
+    """
     def __init__(self):
         super().__init__()
         self.first_letters = [None for _ in range(ALPHABET_SIZE)]
@@ -18,11 +31,14 @@ class OrderedDict(dict):
         super().__delitem__(key)
         self.first_letters[self.rank(key)] = None
 
-    def ordered_items(self):
+    def ordered_items(self):  # Return iterable of pre-sorted items (for suffix array)
         return filter(lambda x: x is not None, self.first_letters)
 
     @staticmethod
     def rank(char):
+        """
+        Define a number value to an alphabet letter, including special characters so they can fit in a list
+        """
         if char == "$":
             return 26
         elif char == "&":
@@ -31,8 +47,11 @@ class OrderedDict(dict):
             return ord(char) - 96
 
 
-
 class Node:
+    """
+    Represents an arbitrary node in a suffix tree
+    Also statically sotres some state information about the algorithm (not pretty, I know)
+    """
     global_end = 0
     num_splits = 0
     all_nodes = []
@@ -50,16 +69,22 @@ class Node:
         self.link = None
 
     def __str__(self):
+        """
+        String representation of node, shows important internal values of a node (for debug)
+        """
         link_str = "" if self.link is None else f" -> {self.link.id}"
         if not self.root:
             j, i = self.tuple()
             return f"[{self.id}, {self.tuple()}, {self.string[j:i + 1]}{link_str}]"
         return f"[{self.id} root{link_str}]"
 
-    def __repr__(self):
+    def __repr__(self):  # Shorter representation of node
         return f"[{self.id}]"
 
     def print_tree(self, spaces=1):
+        """
+        Recursively prints tree of nodes (for debug)
+        """
         print(f"{self}")
         for edge in self.children:
             print(f"   " * spaces, end="")
@@ -82,13 +107,16 @@ class Node:
         self.children.pop(child.first_char())
 
     @property
-    def end_index(self):
+    def end_index(self):  # Translates end index into a number (could be '#' pointer)
         return self.tuple()[1]
 
     def tuple(self):
+        """
+        Returns the resolved start and end coordinates of the substring this node represents
+        """
         if self.root:
             raise Exception("Can't get substring of root.")
-        if self.end == "#":
+        if self.end == "#":  # Translate '#' into global_end
             return self.start, self.global_end
         return self.start, self.end
 
@@ -106,6 +134,12 @@ class Node:
 
 
 class Point:
+    """
+    A representation of a single point on the tree. Used to store active node, edge and length data in one place
+    Could represent a place in the middle of an edge (implicit) or a place on a node (explicit).
+    Abstracts away a lot of tedium regarding working with these closely connected values.
+    Can be used to create 'pure' functions which return a transformation on a given point
+    """
     def __init__(self, node, edge="", length=0):
         assert isinstance(node, Node)
         self.node = node
@@ -118,25 +152,34 @@ class Point:
     def is_explicit(self):  # a.k.a. is not on an edge
         return self.edge == ""
 
-    def set_node(self, node):
+    def set_node(self, node):  # Set point to a specific node, reset other values
         self.node = node
         self.edge = ""
         self.length = 0
 
     @property
-    def edge_node(self) -> Node:
+    def edge_node(self) -> Node:  # Return the Node object of the edge this object points to
         return self.node.get_child(self.edge)
 
     def index_here(self):
+        """
+        Return the index in the original string that this point refers to
+        """
         if self.is_explicit():
             return 0 if self.node.root else self.node.start
         return self.edge_node.start + self.length - 1
 
     def char_here(self):
+        """
+        Return the char in the original string that this point refers to
+        """
         return Node.string[self.index_here()]
 
 
 def create_root():
+    """
+    Create a root node with special root properties. Used to initalise the algorithm
+    """
     assert len(Node.all_nodes) == 0
     root = Node(None, None)
     root.root = True
@@ -145,6 +188,11 @@ def create_root():
 
 
 def split_edge(split_point: Point):
+    """
+    Split a given edge into two separate edges, creating a new node in the middle (called a mediator in my code)
+    Used for Rule 2s on implicit suffixes.
+    Returns the newly created mediator node
+    """
     assert not split_point.is_explicit()
     edge = split_point.edge_node
     original = edge.tuple()
@@ -164,41 +212,47 @@ def pos(n: int):
 
 
 def do_phase(root: Node, active: Point, i, last_j, remainder):
+    """
+    Performs a single phase of Ukkonen's algorithm, returning values used for the next phase.
+    """
+
+    # Initialisation
     root_point = Point(root)
-    Node.global_end += 1
+    Node.global_end += 1  # Perform rapid leaf extension trick (Rule 1)
     did_rule_three = False
     j = last_j + 1
     node_just_created = None
-    while not did_rule_three and j <= i + 1:
+
+    while not did_rule_three and j <= i + 1:  # Run only the required extensions for this phase
 
         curr_char = Node.string[i]
         match = char_is_after(active, curr_char)
-        if match:
-            # print(3)
+        if match:  # Decide if Rule 2 or 3.
+            # RULE 3 LOGIC
             remainder += 1
             if node_just_created is not None:
-                node_just_created.link = active.node
-            active = skip_count(1, active, i)
-            did_rule_three = True
+                node_just_created.link = active.node  # Create suffix link (Rule 3)
+            active = skip_count(1, active, i)  # Move active node
+            did_rule_three = True  # Break loop
         else:
-            # print(2)
-            if not active.is_explicit():
+            # RULE 2 LOGIC
+            if not active.is_explicit():  # Active point on an edge, need to split
                 mediator = split_edge(active)
-                mediator.add_child(Node(i, "#"))
+                mediator.add_child(Node(i, "#"))  # Dangle new character off of mediator node from split
                 if node_just_created is not None:
-                    node_just_created.link = mediator
+                    node_just_created.link = mediator  # Create suffix link (First sub-case)
                 node_just_created = mediator
                 active.length -= 1
                 if active.length == 0:
                     active.set_node(active.node)
-            else:
+            else:  # Active point on node, just dangle off a new node
                 active.node.add_child(Node(i, "#"))
                 if node_just_created is not None and node_just_created.link is None:
-                    node_just_created.link = active.node
+                    node_just_created.link = active.node  # Create suffix link (Second sub-case)
             remainder = pos(remainder - 1)
-            active.set_node(active.node.link)
+            active.set_node(active.node.link)  # Go to suffix link
             if remainder > 0:
-                active = skip_count(remainder, Point(root), i - remainder)
+                active = skip_count(remainder, Point(root), i - remainder)  # Traverse from root
             last_j = j
             j += 1
         # print(active)
@@ -207,23 +261,34 @@ def do_phase(root: Node, active: Point, i, last_j, remainder):
 
 
 def char_is_after(point: Point, char):
-    if point.is_explicit():
+    """
+    Return if a given character is traversable directly after a given point
+    Used for Rule 2/3 selection
+    """
+    if point.is_explicit():  # If point on a node
         return char in point.node.children
-    else:
+    else:  # If point on an edge
         if point.length == point.edge_node.edge_length:
             return Node.string[point.edge_node.start] == char
         else:  # If not at the end of an edge
-            # return Node.string[point.index_here() + point.length] == char
             return Node.string[point.index_here() + 1] == char
 
 
 def skip_count(num_chars, start_point: Point, index):
+    """
+    Use the skip-counting trick to traverse num_chars down from point start_point.
+    Use index value as where to start looking in the string for char comparison
+    Returns the point that the traversal lands on
+    """
+
+    # Initialise
     incoming_length = -1
     existing_length = 0
     head = start_point
     chars_left = num_chars
     char = ""
 
+    # Move point to nearest node if it is on an edge
     if not head.is_explicit():
         incoming_length = head.edge_node.edge_length - head.length
         if num_chars < incoming_length:
@@ -233,24 +298,22 @@ def skip_count(num_chars, start_point: Point, index):
         chars_left -= incoming_length
         index += incoming_length
 
-    # Node.string[i] if head.node.root else Node.string[head.node.end_index + 1]
-    # assert head.node.end_index + 1 + chars_left < len(Node.string)
+    # Main traversal loop
     while chars_left > 0:
-        # assert head.node.end_index + 1 + chars_left < len(Node.string)
-        direction = Node.string[index]
+        direction = Node.string[index]  # Choose a direction to go from this point
         next_node = head.node.get_child(direction)
-        if next_node is None:
+        if next_node is None:  # Went off the tree -> error
             raise IndexError(f"Attempted to traverse char\n '{direction}' at point {head}. ({index=})")
         incoming_length = next_node.edge_length
-        if chars_left < incoming_length:
+        if chars_left < incoming_length:  # Break if we able can't go down that edge
             break
+        # Move down edge to next node
         chars_left -= incoming_length
         index += incoming_length
         head.set_node(next_node)
 
-    # direction = Node.string[index]
-
-    if chars_left > 0:  # Landed on an edge
+    # Return position on edge if couldn't traverse a final edge (search landed on edge)
+    if chars_left > 0:
         head.edge = Node.string[index]
         head.length = chars_left
 
@@ -258,6 +321,11 @@ def skip_count(num_chars, start_point: Point, index):
 
 
 def ukkonen(string):
+    """
+    Reset the algorithm values and create return the root of a suffix tree for a given string
+    using everyone's favourite algorithm: Ukkonen's algorithm. O(n) time.
+    """
+    # Initialise values
     string += "$"
     Node.string = string
     Node.global_end = 0
@@ -266,16 +334,16 @@ def ukkonen(string):
     n = len(string)
     remainder = 0
     last_j = 1
+    # Perform base case i = 0 phase
     root = create_root()
     root.add_child(Node(0, "#"))
     active = Point(root)
+    # Perform rest of phases
     for i in range(1, n):
         active, remainder, last_j = do_phase(root, active, i, last_j, remainder)
     return root
 
 
 if __name__ == "__main__":
-    # ukkonen("DEFDBEFFDDEFFFADEFFB")
     ukkonen("abacabad")
     print("done")
-# ukkonen("abcbcbc$")
author	akiyamn	2021-05-02 17:40:33 +1000
committer	akiyamn	2021-05-02 17:40:33 +1000
commit	1b2233099db80b8c3a513d137045aae7de7a252f (patch)
tree	ef9d8fbc796ea2bd387c70fabed09f3569cd43d8 /ass2
parent	3c60256c2bb81613e7ebc6919fd06d37f3bf318c (diff)
download	fit3155-1b2233099db80b8c3a513d137045aae7de7a252f.tar.gz fit3155-1b2233099db80b8c3a513d137045aae7de7a252f.zip