diff options
| author | akiyamn | 2021-05-02 17:40:33 +1000 |
|---|---|---|
| committer | akiyamn | 2021-05-02 17:40:33 +1000 |
| commit | 1b2233099db80b8c3a513d137045aae7de7a252f (patch) | |
| tree | ef9d8fbc796ea2bd387c70fabed09f3569cd43d8 /ass2 | |
| parent | 3c60256c2bb81613e7ebc6919fd06d37f3bf318c (diff) | |
| download | fit3155-1b2233099db80b8c3a513d137045aae7de7a252f.tar.gz fit3155-1b2233099db80b8c3a513d137045aae7de7a252f.zip | |
Ass 2: Comments
Diffstat (limited to 'ass2')
| -rw-r--r-- | ass2/q2/suffix_array.py | 12 | ||||
| -rw-r--r-- | ass2/q3/lcp.py | 118 | ||||
| -rw-r--r-- | ass2/ukkonen.py | 140 |
3 files changed, 159 insertions, 111 deletions
diff --git a/ass2/q2/suffix_array.py b/ass2/q2/suffix_array.py index edfaa80..78cf6c4 100644 --- a/ass2/q2/suffix_array.py +++ b/ass2/q2/suffix_array.py @@ -3,6 +3,9 @@ import sys def depth_first_apply(node: Node, func): + """ + Apply a given function to every leaf node in the suffix tree via DFS + """ if not node.children: func(node.suffix_index) else: @@ -11,11 +14,17 @@ def depth_first_apply(node: Node, func): def read_in_string(filename): + """ + Reads in a string from a given file name + """ with open(filename, "r") as file: return file.read() def suffix_array(string): + """ + Generates a suffix array from a given string using Ukkonen's and a depth first search + """ tree = ukkonen(string) buffer = [] depth_first_apply(tree, buffer.append) @@ -23,6 +32,9 @@ def suffix_array(string): def write_suffix_array(string, filename): + """ + Writes a suffix array for a given string to a given file name. + """ with open(filename, "w") as file: buffer = suffix_array(string) file.write("\n".join(map(str, buffer))) diff --git a/ass2/q3/lcp.py b/ass2/q3/lcp.py index 7e90ae0..6c72771 100644 --- a/ass2/q3/lcp.py +++ b/ass2/q3/lcp.py @@ -1,99 +1,67 @@ -from random import randrange +import sys +from ass2.ukkonen import ukkonen, Point, Node -from ass2.ukkonen import ukkonen, Point, char_is_after, Node, skip_count -from itertools import product +def longest_prefix(root: Node, s1: str, s2: str, offset: tuple): + """ + Finds the longest prefix of a suffix of two different strings, provided a suffix tree of "s1&s2" + The offset tuple represents the start index of each suffix to compare to. -# s1 = "abdabc" -# s2 = "daba" -# s1 = "abaabaaba" -# s2 = "baabaaabaa" - -# def longest_prefix(root: Node, s1: str, s2: str, suffixes: tuple): -# print(s1[suffixes[0]:]) -# print(s2[suffixes[1]:]) -# matches = 0 -# if suffixes[0] == 0: -# point = Point(root, s1[0], 1) -# else: -# point = skip_count(suffixes[0], Point(root), 0) -# for i in range(suffixes[1], len(s2) + 1): -# print(s2[i], end=" ") -# if char_is_after(point, s2[i]): -# if point.is_explicit(): -# point = Point(point.node, s2[i], 1) -# else: -# point = skip_count(1, point, i) -# matches += 1 -# print("found") -# else: -# print("not found") -# break -# return matches - -def naive(s1, s2, offset): - count = 0 - for t in list(map(lambda x: x[0] == x[1], zip(s1[offset[0]:], s2[offset[1]:]))): - if t: - count += 1 - else: - return count - return count - - -def lcp(root: Node, s1: str, s2: str, offset: tuple): + Starting from the root, if the two strings have a matching character, move in that direction of the suffix tree + Increment the index to compare the next characters by, by the length of the edge just traveled + Stop when both strings 'disagree' on which path to take. + This effectively finds the deepest common ancestor of two suffixes on the same tree. + """ index = 0 head = Point(root) shortest = min(len(s1) - offset[0], len(s2) - offset[1]) - while index < shortest: + while index < shortest: # Stop searching when you hit the end of the shortest string if s1[index + offset[0]] == s2[index + offset[1]]: going_to = head.node.get_child(s1[index + offset[0]]) - head.set_node(going_to) + head.set_node(going_to) # Move to new chosen node index += going_to.edge_length else: return index return index -def test_everything_for(tree, s1, s2): - # print(s1, s2) - for i in range(len(s1)): - for j in range(len(s2)): - result = lcp(tree, s1, s2, (i, j)) - a = naive(s1, s2, (i, j)) - assert result == a +def read_in_string(filename): + """ + Reads in a string from a given file name + """ + with open(filename, "r") as file: + return file.read() -def new_string(max_letters, max_len): - return "".join([chr(randrange(ord("a"), ord("a") + max_letters)) for _ in range(0, max_len)]) +def create_suffix_list(filename): + """ + From the suffix index file, generate a list of tuples representing each suffix pair to search + """ + tuples = [] + for line in read_in_string(filename).split("\n"): + parts = line.split(" ") + tuples.append((int(parts[0]), int(parts[1]))) + return tuples -def test(max_letters, max_len): - alphabet = "".join(map(chr, range(ord("a"), ord("a") + max_letters))) - counter = 0 - while True: - s1, s2 = new_string(max_letters, max_len), new_string(max_letters, max_len) - tree = ukkonen(s1 + "z" + s2) - test_everything_for(tree, s1, s2) - counter += 1 - if counter % 1000 == 0: - print(counter) +def write_longest_prefixes(root: Node, s1: str, s2: str, suffixes): + """ + Write the longest common suffixes of two strings provided a list of suffix index pairs to file + """ + results = map(lambda t: f"{t[0]} {t[1]} {longest_prefix(root, s1, s2, t)}", suffixes) + with open("output_lcp.txt", "w") as file: + file.write("\n".join(results)) def main(): - s1 = "abcdacbdab" - s2 = "dacbdabc" - print(s1 + "&" + s2) - tree = ukkonen(s1 + "&" + s2) - tree.print_tree() - - # tup = (4, 5) - # result = longest_prefix(tree, s1, s2, (0, 5)) - # result = lcp(tree, s1, s2, tup) - # print(naive(s1, s2, tup)) - # print(result) + assert len(sys.argv) == 4 # Need correct number of arguments + s1 = read_in_string(sys.argv[1]) + s2 = read_in_string(sys.argv[2]) + suffixes = create_suffix_list(sys.argv[3]) + combined = s1 + "&" + s2 + tree = ukkonen(combined) # Generate the suffix tree for both strings once + write_longest_prefixes(tree, s1, s2, suffixes) # Write answers to file if __name__ == "__main__": - test(4, 50) - # main() + main() diff --git a/ass2/ukkonen.py b/ass2/ukkonen.py index bc8b563..5aa94e7 100644 --- a/ass2/ukkonen.py +++ b/ass2/ukkonen.py @@ -1,8 +1,21 @@ +""" +This file is imported into questions 2 and 3. +""" + import sys ALPHABET_SIZE = 28 + class OrderedDict(dict): + """ + A hybrid Python dictionary/list + All set/get item operations on this data structure are the same complexity of a normal dictionary O(1)-ish + For Ukkonen's operation, only the normal dictionary features are used. + As values are stored in the dictionary, the are also referenced in a list of size O(alphabet). + This acts as a kind of 'counting sort' when accessed is O(n), but provides a pre-sorted list of all children nodes + This is used for generating the suffix array. + """ def __init__(self): super().__init__() self.first_letters = [None for _ in range(ALPHABET_SIZE)] @@ -18,11 +31,14 @@ class OrderedDict(dict): super().__delitem__(key) self.first_letters[self.rank(key)] = None - def ordered_items(self): + def ordered_items(self): # Return iterable of pre-sorted items (for suffix array) return filter(lambda x: x is not None, self.first_letters) @staticmethod def rank(char): + """ + Define a number value to an alphabet letter, including special characters so they can fit in a list + """ if char == "$": return 26 elif char == "&": @@ -31,8 +47,11 @@ class OrderedDict(dict): return ord(char) - 96 - class Node: + """ + Represents an arbitrary node in a suffix tree + Also statically sotres some state information about the algorithm (not pretty, I know) + """ global_end = 0 num_splits = 0 all_nodes = [] @@ -50,16 +69,22 @@ class Node: self.link = None def __str__(self): + """ + String representation of node, shows important internal values of a node (for debug) + """ link_str = "" if self.link is None else f" -> {self.link.id}" if not self.root: j, i = self.tuple() return f"[{self.id}, {self.tuple()}, {self.string[j:i + 1]}{link_str}]" return f"[{self.id} root{link_str}]" - def __repr__(self): + def __repr__(self): # Shorter representation of node return f"[{self.id}]" def print_tree(self, spaces=1): + """ + Recursively prints tree of nodes (for debug) + """ print(f"{self}") for edge in self.children: print(f" " * spaces, end="") @@ -82,13 +107,16 @@ class Node: self.children.pop(child.first_char()) @property - def end_index(self): + def end_index(self): # Translates end index into a number (could be '#' pointer) return self.tuple()[1] def tuple(self): + """ + Returns the resolved start and end coordinates of the substring this node represents + """ if self.root: raise Exception("Can't get substring of root.") - if self.end == "#": + if self.end == "#": # Translate '#' into global_end return self.start, self.global_end return self.start, self.end @@ -106,6 +134,12 @@ class Node: class Point: + """ + A representation of a single point on the tree. Used to store active node, edge and length data in one place + Could represent a place in the middle of an edge (implicit) or a place on a node (explicit). + Abstracts away a lot of tedium regarding working with these closely connected values. + Can be used to create 'pure' functions which return a transformation on a given point + """ def __init__(self, node, edge="", length=0): assert isinstance(node, Node) self.node = node @@ -118,25 +152,34 @@ class Point: def is_explicit(self): # a.k.a. is not on an edge return self.edge == "" - def set_node(self, node): + def set_node(self, node): # Set point to a specific node, reset other values self.node = node self.edge = "" self.length = 0 @property - def edge_node(self) -> Node: + def edge_node(self) -> Node: # Return the Node object of the edge this object points to return self.node.get_child(self.edge) def index_here(self): + """ + Return the index in the original string that this point refers to + """ if self.is_explicit(): return 0 if self.node.root else self.node.start return self.edge_node.start + self.length - 1 def char_here(self): + """ + Return the char in the original string that this point refers to + """ return Node.string[self.index_here()] def create_root(): + """ + Create a root node with special root properties. Used to initalise the algorithm + """ assert len(Node.all_nodes) == 0 root = Node(None, None) root.root = True @@ -145,6 +188,11 @@ def create_root(): def split_edge(split_point: Point): + """ + Split a given edge into two separate edges, creating a new node in the middle (called a mediator in my code) + Used for Rule 2s on implicit suffixes. + Returns the newly created mediator node + """ assert not split_point.is_explicit() edge = split_point.edge_node original = edge.tuple() @@ -164,41 +212,47 @@ def pos(n: int): def do_phase(root: Node, active: Point, i, last_j, remainder): + """ + Performs a single phase of Ukkonen's algorithm, returning values used for the next phase. + """ + + # Initialisation root_point = Point(root) - Node.global_end += 1 + Node.global_end += 1 # Perform rapid leaf extension trick (Rule 1) did_rule_three = False j = last_j + 1 node_just_created = None - while not did_rule_three and j <= i + 1: + + while not did_rule_three and j <= i + 1: # Run only the required extensions for this phase curr_char = Node.string[i] match = char_is_after(active, curr_char) - if match: - # print(3) + if match: # Decide if Rule 2 or 3. + # RULE 3 LOGIC remainder += 1 if node_just_created is not None: - node_just_created.link = active.node - active = skip_count(1, active, i) - did_rule_three = True + node_just_created.link = active.node # Create suffix link (Rule 3) + active = skip_count(1, active, i) # Move active node + did_rule_three = True # Break loop else: - # print(2) - if not active.is_explicit(): + # RULE 2 LOGIC + if not active.is_explicit(): # Active point on an edge, need to split mediator = split_edge(active) - mediator.add_child(Node(i, "#")) + mediator.add_child(Node(i, "#")) # Dangle new character off of mediator node from split if node_just_created is not None: - node_just_created.link = mediator + node_just_created.link = mediator # Create suffix link (First sub-case) node_just_created = mediator active.length -= 1 if active.length == 0: active.set_node(active.node) - else: + else: # Active point on node, just dangle off a new node active.node.add_child(Node(i, "#")) if node_just_created is not None and node_just_created.link is None: - node_just_created.link = active.node + node_just_created.link = active.node # Create suffix link (Second sub-case) remainder = pos(remainder - 1) - active.set_node(active.node.link) + active.set_node(active.node.link) # Go to suffix link if remainder > 0: - active = skip_count(remainder, Point(root), i - remainder) + active = skip_count(remainder, Point(root), i - remainder) # Traverse from root last_j = j j += 1 # print(active) @@ -207,23 +261,34 @@ def do_phase(root: Node, active: Point, i, last_j, remainder): def char_is_after(point: Point, char): - if point.is_explicit(): + """ + Return if a given character is traversable directly after a given point + Used for Rule 2/3 selection + """ + if point.is_explicit(): # If point on a node return char in point.node.children - else: + else: # If point on an edge if point.length == point.edge_node.edge_length: return Node.string[point.edge_node.start] == char else: # If not at the end of an edge - # return Node.string[point.index_here() + point.length] == char return Node.string[point.index_here() + 1] == char def skip_count(num_chars, start_point: Point, index): + """ + Use the skip-counting trick to traverse num_chars down from point start_point. + Use index value as where to start looking in the string for char comparison + Returns the point that the traversal lands on + """ + + # Initialise incoming_length = -1 existing_length = 0 head = start_point chars_left = num_chars char = "" + # Move point to nearest node if it is on an edge if not head.is_explicit(): incoming_length = head.edge_node.edge_length - head.length if num_chars < incoming_length: @@ -233,24 +298,22 @@ def skip_count(num_chars, start_point: Point, index): chars_left -= incoming_length index += incoming_length - # Node.string[i] if head.node.root else Node.string[head.node.end_index + 1] - # assert head.node.end_index + 1 + chars_left < len(Node.string) + # Main traversal loop while chars_left > 0: - # assert head.node.end_index + 1 + chars_left < len(Node.string) - direction = Node.string[index] + direction = Node.string[index] # Choose a direction to go from this point next_node = head.node.get_child(direction) - if next_node is None: + if next_node is None: # Went off the tree -> error raise IndexError(f"Attempted to traverse char\n '{direction}' at point {head}. ({index=})") incoming_length = next_node.edge_length - if chars_left < incoming_length: + if chars_left < incoming_length: # Break if we able can't go down that edge break + # Move down edge to next node chars_left -= incoming_length index += incoming_length head.set_node(next_node) - # direction = Node.string[index] - - if chars_left > 0: # Landed on an edge + # Return position on edge if couldn't traverse a final edge (search landed on edge) + if chars_left > 0: head.edge = Node.string[index] head.length = chars_left @@ -258,6 +321,11 @@ def skip_count(num_chars, start_point: Point, index): def ukkonen(string): + """ + Reset the algorithm values and create return the root of a suffix tree for a given string + using everyone's favourite algorithm: Ukkonen's algorithm. O(n) time. + """ + # Initialise values string += "$" Node.string = string Node.global_end = 0 @@ -266,16 +334,16 @@ def ukkonen(string): n = len(string) remainder = 0 last_j = 1 + # Perform base case i = 0 phase root = create_root() root.add_child(Node(0, "#")) active = Point(root) + # Perform rest of phases for i in range(1, n): active, remainder, last_j = do_phase(root, active, i, last_j, remainder) return root if __name__ == "__main__": - # ukkonen("DEFDBEFFDDEFFFADEFFB") ukkonen("abacabad") print("done") -# ukkonen("abcbcbc$") |
