diff options
| author | akiyamn | 2021-04-27 14:12:47 +1000 |
|---|---|---|
| committer | akiyamn | 2021-04-27 14:12:47 +1000 |
| commit | 1695a8e7c9e0d345918946ef6fbc8f56c7751e32 (patch) | |
| tree | 41829151bb336da54aab6b13ff4c47b3a6454698 /ass2 | |
| parent | e347fd3246a4282d7fea85c1ae727e48c810480b (diff) | |
| download | fit3155-1695a8e7c9e0d345918946ef6fbc8f56c7751e32.tar.gz fit3155-1695a8e7c9e0d345918946ef6fbc8f56c7751e32.zip | |
Ass 2: Guts working for suffix array
Diffstat (limited to 'ass2')
| -rw-r--r-- | ass2/q2/gen_cases.py | 5 | ||||
| -rw-r--r-- | ass2/q2/suffix_array.py | 219 | ||||
| -rw-r--r-- | ass2/ukkonen.py (renamed from ass2/q2/new_suffix_array.py) | 39 |
3 files changed, 67 insertions, 196 deletions
diff --git a/ass2/q2/gen_cases.py b/ass2/q2/gen_cases.py index 82d165e..f32a5a4 100644 --- a/ass2/q2/gen_cases.py +++ b/ass2/q2/gen_cases.py @@ -1,7 +1,6 @@ import math -from itertools import product, accumulate, count, repeat -from functools import reduce -from new_suffix_array import ukkonen +from itertools import product +from ass2.ukkonen import ukkonen import timeit import random diff --git a/ass2/q2/suffix_array.py b/ass2/q2/suffix_array.py index 61ac445..1c67433 100644 --- a/ass2/q2/suffix_array.py +++ b/ass2/q2/suffix_array.py @@ -1,201 +1,40 @@ -class Node: - global_end = 0 - string = "" - all = [] +from ass2.ukkonen import Node, ukkonen - def __init__(self, data=None): - self.data = data - self.children = [] - self.id = len(self.all) - self.all.append(self) - self.parent = None - self.link = None - def __str__(self): - tup = self.get_tuple() - link_str = "" if self.link is None else f" -> {self.link.id}" - if tup is not None: - j, i = tup - return f"[{self.id}, {self.data}, {self.string[j:i + 1]}{link_str}]" - return f"[{self.id} root{link_str}]" +def rank(char): + return ord("a") - ord(char) if char != "$" else 26 - def __repr__(self): - return f"[{self.id}]" - def print_children(self): - print(self, end=" contains: ") - for child in self.children: - print(child, end=", ") +# tree = ukkonen("abacabad") +# # tree.print_tree() +# for node in Node.all_nodes: +# if not node.children: +# print(node) - def add_child(self, child): - if child: - child.parent = self - self.children.append(child) - def num_chars_deep(self): - if self.data is None: - return 0 - else: - return self.parent.num_chars_deep() + self.edge_length() +def depth_first_apply(node: Node, func): + if not node.children: + func(node.suffix_index) + else: + for child in node.children.ordered_items(): + depth_first_apply(child, func) - def remove_child(self, child): - self.children.remove(child) - def bastardise(self): - self.parent.remove_child(self) - self.parent = None +# def calc_suffix_index(): +# lookup = {} +# counter = 0 +# # leaves = filter(lambda n: not n.children, Node.all_nodes) +# for node in Node.all_nodes: +# if not node.children: +# lookup +# +# print(list(leaves)) - def add_suffix_link(self, destination): - assert self.link is None - assert len(self.children) > 0 - assert isinstance(destination, Node) - self.link = destination - def follow_link(self): - assert self.link is not None - return self.link +def accum(): + array = [] + depth_first_apply(tree, lambda x: array.append(x)) + return array - def get_tuple(self): - if self.data is not None and self.data[1] == "#": - return self.data[0], self.global_end - return self.data - - def first_letter(self): - return self.string[self.data[0]] - - def get_child(self, char): - for child in self.children: - if child.first_letter() == char: - return child - return None - - def edge_length(self): - if self.data is None: - return 0 - else: - substring = self.get_tuple() - return substring[1] - substring[0] + 1 - - def print_tree(self, spaces=0): - print(f"{self}") - for child in self.children: - print(f" " * spaces, end="") - child.print_tree(spaces=spaces + 1) - - -def branch_from_point(active_node, active_edge, active_length, j, need_link_to): - edge = active_node.get_child(active_edge) - edge.bastardise() - original_substr = edge.get_tuple() - edge.data = (edge.data[0] + active_length, edge.data[1]) - mediator = Node((original_substr[0], original_substr[0] + active_length - 1)) - active_node.add_child(mediator) - mediator.add_child(edge) - mediator.add_child(Node((j+1, '#'))) - # assert j+1 != 12 - return mediator - - -def traverse_down(string, substring, active_node, active_edge, active_length, remainder): - j, i = substring - traversed = 0 - g = 0 - char = "" - remainder -= 0 if active_node.data is None else active_node.data[0] + 1 - while remainder > 0: - - char = string[i + traversed] - path = None - if active_length > 0: - path = active_node.get_child(active_edge) - else: - path = active_node.get_child(char) - if path is None: - raise Exception(f"Tried to go down '{char}' from point: {active_node, active_edge, active_length}") - traversed += path.edge_length() - g = min(remainder, path.edge_length()) - # if path is None: - # return active_node, active_edge, active_length - if remainder >= path.edge_length() - active_length: - active_node = path - print(active_node.link is None) - active_length = 0 - active_edge = '' - remainder -= path.edge_length() - active_length - else: - active_length = remainder - active_edge = char if active_length != 0 else None - break - active_length = remainder - return active_node, active_edge, active_length - - -def char_match_after(char, active_node, active_edge, active_length): - if active_length < 1: - return active_node.get_child(char) is not None - edge = active_node.get_child(active_edge) - position = edge.data[0] + active_length - return Node.string[position] == char - -def suffix_tree(string): - # string += "$" - Node.string = string - n = len(string) - last_j = 0 - root = Node() - root.add_child(Node((0, '#'))) - root.add_suffix_link(root) - need_link_to = root - active_node = root - active_edge = "" - active_length = 0 - remainder = 0 - for i in range(1, n): - - # if active_node.link is not None: - # active_node = active_node.link - # if active_node == active_node.link: - # remainder -= 1 - Node.global_end += 1 # Rule 1s implicitly - for j in range(last_j + 1, i + 1): - print(string[0:i + 1], end=": ") - print(active_node, active_edge, active_length) - # if 2 or more letter string, just match the last character at the current position (we already went - # there) i.e. i i think - char = string[i] if i - j <= 1 else string[i] - match = char_match_after(char, active_node, active_edge, active_length) - if match: # Rule 3 - print(3, -j) - remainder += 1 - active_node, active_edge, active_length = traverse_down(string, (j, i), active_node, active_edge, - active_length, remainder) - break - else: # Rule 2 - print(2, -j) - # new_node = None - if active_length > 0: # If the pointer is on an edge - # if active_node.link is not None: - # active_node = active_node.link - new_node = branch_from_point(active_node, active_edge, active_length, j, need_link_to) - new_node.add_suffix_link(need_link_to) - need_link_to = new_node - print(f"*** I want to go from {repr(active_node)} -> {repr(active_node.link)} ***") - # if active_node.link is not None: - # active_node = active_node.link - else: # If the pointer is on a node - new_node = Node((j, '#')) - active_node.add_child(new_node) - - active_edge = "" - remainder = max(remainder - 1, 0) - active_length = 0 - last_j = j - - print(f"{i=} '{string[i]}' => {(active_node, active_edge, active_length)}, {remainder}") - # print(f"{i=} '{string[i]}' => {(active_node, active_edge, active_length)}, {remainder}") - root.print_tree() - print("\n" * 10) - - -if __name__ == "__main__": - suffix_tree("abcabxabcd") +tree = ukkonen("mississippi") +print(accum()) diff --git a/ass2/q2/new_suffix_array.py b/ass2/ukkonen.py index 841ff67..8d536c5 100644 --- a/ass2/q2/new_suffix_array.py +++ b/ass2/ukkonen.py @@ -1,8 +1,36 @@ import sys +class OrderedDict(dict): + def __init__(self): + super().__init__() + self.first_letters = [None for _ in range(27)] + + def __setitem__(self, key, value): + super().__setitem__(key, value) + try: + self.first_letters[self.rank(key)] = value + except IndexError: + raise IndexError(f"Could not add item of key '{key}' since it is out of range of the rank function.") + + def __delitem__(self, key): + super().__delitem__(key) + self.first_letters[self.rank(key)] = None + + def ordered_items(self): + return filter(lambda x: x is not None, self.first_letters) + + @staticmethod + def rank(char): + # result = 26 if char == "$" else ord(char) - 97 + result = 0 if char == "$" else ord(char) - 96 + assert result in range(0, 27) + return result + + class Node: global_end = 0 + num_splits = 0 all_nodes = [] string = "" @@ -10,8 +38,9 @@ class Node: self.root = False self.start = start self.end = end - self.children = {} + self.children = OrderedDict() self.id = len(self.all_nodes) + self.suffix_index = self.id - self.num_splits - 1 self.all_nodes.append(self) self.parent = None self.link = None @@ -118,7 +147,9 @@ def split_edge(split_point: Point): edge = split_point.edge_node original = edge.tuple() edge.detach() + Node.num_splits += 1 mediator = Node(original[0], original[0] + split_point.length - 1) + mediator.suffix_index = None edge.start = original[0] + split_point.length assert edge.start <= edge.end_index mediator.add_child(edge) @@ -168,8 +199,8 @@ def do_phase(root: Node, active: Point, i, last_j, remainder): active = skip_count(remainder, Point(root), i - remainder) last_j = j j += 1 - # print(active) - # root.print_tree() + print(active) + root.print_tree() return active, remainder, last_j @@ -228,6 +259,7 @@ def ukkonen(string): string += "$" Node.string = string Node.global_end = 0 + Node.num_splits = 0 Node.all_nodes.clear() n = len(string) remainder = 0 @@ -237,6 +269,7 @@ def ukkonen(string): active = Point(root) for i in range(1, n): active, remainder, last_j = do_phase(root, active, i, last_j, remainder) + return root if __name__ == "__main__": |
