diff options
Diffstat (limited to 'ass2/q2')
| -rw-r--r-- | ass2/q2/gen_cases.py | 5 | ||||
| -rw-r--r-- | ass2/q2/new_suffix_array.py | 246 | ||||
| -rw-r--r-- | ass2/q2/suffix_array.py | 219 |
3 files changed, 31 insertions, 439 deletions
diff --git a/ass2/q2/gen_cases.py b/ass2/q2/gen_cases.py index 82d165e..f32a5a4 100644 --- a/ass2/q2/gen_cases.py +++ b/ass2/q2/gen_cases.py @@ -1,7 +1,6 @@ import math -from itertools import product, accumulate, count, repeat -from functools import reduce -from new_suffix_array import ukkonen +from itertools import product +from ass2.ukkonen import ukkonen import timeit import random diff --git a/ass2/q2/new_suffix_array.py b/ass2/q2/new_suffix_array.py deleted file mode 100644 index 841ff67..0000000 --- a/ass2/q2/new_suffix_array.py +++ /dev/null @@ -1,246 +0,0 @@ -import sys - - -class Node: - global_end = 0 - all_nodes = [] - string = "" - - def __init__(self, start, end): - self.root = False - self.start = start - self.end = end - self.children = {} - self.id = len(self.all_nodes) - self.all_nodes.append(self) - self.parent = None - self.link = None - - def __str__(self): - link_str = "" if self.link is None else f" -> {self.link.id}" - if not self.root: - j, i = self.tuple() - return f"[{self.id}, {self.tuple()}, {self.string[j:i + 1]}{link_str}]" - return f"[{self.id} root{link_str}]" - - def __repr__(self): - return f"[{self.id}]" - - def print_tree(self, spaces=1): - print(f"{self}") - for edge in self.children: - print(f" " * spaces, end="") - self.get_child(edge).print_tree(spaces=spaces + 1) - - def first_char(self): - return self.string[self.start] - - def get_child(self, char): - if char in self.children: - return self.children[char] - return None - - def add_child(self, child): - child.parent = self - self.children[child.first_char()] = child - return child - - def remove_child(self, child): - self.children.pop(child.first_char()) - - @property - def end_index(self): - return self.tuple()[1] - - def tuple(self): - if self.root: - raise Exception("Can't get substring of root.") - if self.end == "#": - return self.start, self.global_end - return self.start, self.end - - @property - def edge_length(self): - if self.root: - return 0 - else: - start, end = self.tuple() - return end - start + 1 - - def detach(self): - self.parent.remove_child(self) - self.parent = None - - -class Point: - def __init__(self, node, edge="", length=0): - assert isinstance(node, Node) - self.node = node - self.edge = edge - self.length = length - - def __repr__(self): - return f"(Node {self.node.id}'s edge:'{self.edge}', {self.length} along.)" - - def is_explicit(self): # a.k.a. is not on an edge - return self.edge == "" - - def set_node(self, node): - self.node = node - self.edge = "" - self.length = 0 - if not self.is_explicit(): - print("WARNING: Node.set_node", file=sys.stderr) - - @property - def edge_node(self) -> Node: - return self.node.get_child(self.edge) - - def index_here(self): - if self.is_explicit(): - return 0 if self.node.root else self.node.start - return self.edge_node.start + self.length - 1 - - def char_here(self): - return Node.string[self.index_here()] - - -def create_root(): - assert len(Node.all_nodes) == 0 - root = Node(None, None) - root.root = True - root.link = root - return root - - -def split_edge(split_point: Point): - assert not split_point.is_explicit() - edge = split_point.edge_node - original = edge.tuple() - edge.detach() - mediator = Node(original[0], original[0] + split_point.length - 1) - edge.start = original[0] + split_point.length - assert edge.start <= edge.end_index - mediator.add_child(edge) - split_point.node.add_child(mediator) - return mediator - - -def pos(n: int): - return max(n, 0) - - -def do_phase(root: Node, active: Point, i, last_j, remainder): - root_point = Point(root) - Node.global_end += 1 - did_rule_three = False - j = last_j + 1 - node_just_created = None - while not did_rule_three and j <= i + 1: - - curr_char = Node.string[i] - match = char_is_after(active, curr_char) - if match: - # print(3) - remainder += 1 - if node_just_created is not None: - node_just_created.link = active.node - active = skip_count(1, active, i) - did_rule_three = True - else: - # print(2) - if not active.is_explicit(): - mediator = split_edge(active) - mediator.add_child(Node(i, "#")) - if node_just_created is not None: - node_just_created.link = mediator - node_just_created = mediator - active.length -= 1 - if active.length == 0: - active.set_node(active.node) - else: - active.node.add_child(Node(i, "#")) - if node_just_created is not None and node_just_created.link is None: - node_just_created.link = active.node - remainder = pos(remainder - 1) - active.set_node(active.node.link) - if remainder > 0: - active = skip_count(remainder, Point(root), i - remainder) - last_j = j - j += 1 - # print(active) - # root.print_tree() - return active, remainder, last_j - - -def char_is_after(point: Point, char): - if point.is_explicit(): - return char in point.node.children - else: - if point.length == point.edge_node.edge_length: - return Node.string[point.edge_node.start] == char - else: # If not at the end of an edge - # return Node.string[point.index_here() + point.length] == char - return Node.string[point.index_here() + 1] == char - - -def skip_count(num_chars, start_point: Point, index): - incoming_length = -1 - existing_length = 0 - head = start_point - chars_left = num_chars - char = "" - - if not head.is_explicit(): - incoming_length = head.edge_node.edge_length - head.length - if num_chars < incoming_length: - head.length += num_chars - return head - head.set_node(head.edge_node) - chars_left -= incoming_length - index += incoming_length - - # Node.string[i] if head.node.root else Node.string[head.node.end_index + 1] - # assert head.node.end_index + 1 + chars_left < len(Node.string) - while chars_left > 0: - # assert head.node.end_index + 1 + chars_left < len(Node.string) - direction = Node.string[index] - next_node = head.node.get_child(direction) - if next_node is None: - raise Exception(f"Attempted to traverse char\n '{direction}' at point {head}. ({index=})") - incoming_length = next_node.edge_length - if chars_left < incoming_length: - break - chars_left -= incoming_length - index += incoming_length - head.set_node(next_node) - - # direction = Node.string[index] - - if chars_left > 0: # Landed on an edge - head.edge = Node.string[index] - head.length = chars_left - - return head - - -def ukkonen(string): - string += "$" - Node.string = string - Node.global_end = 0 - Node.all_nodes.clear() - n = len(string) - remainder = 0 - last_j = 1 - root = create_root() - root.add_child(Node(0, "#")) - active = Point(root) - for i in range(1, n): - active, remainder, last_j = do_phase(root, active, i, last_j, remainder) - - -if __name__ == "__main__": - # ukkonen("DEFDBEFFDDEFFFADEFFB") - ukkonen("abacabad") - print("done") -# ukkonen("abcbcbc$") diff --git a/ass2/q2/suffix_array.py b/ass2/q2/suffix_array.py index 61ac445..1c67433 100644 --- a/ass2/q2/suffix_array.py +++ b/ass2/q2/suffix_array.py @@ -1,201 +1,40 @@ -class Node: - global_end = 0 - string = "" - all = [] +from ass2.ukkonen import Node, ukkonen - def __init__(self, data=None): - self.data = data - self.children = [] - self.id = len(self.all) - self.all.append(self) - self.parent = None - self.link = None - def __str__(self): - tup = self.get_tuple() - link_str = "" if self.link is None else f" -> {self.link.id}" - if tup is not None: - j, i = tup - return f"[{self.id}, {self.data}, {self.string[j:i + 1]}{link_str}]" - return f"[{self.id} root{link_str}]" +def rank(char): + return ord("a") - ord(char) if char != "$" else 26 - def __repr__(self): - return f"[{self.id}]" - def print_children(self): - print(self, end=" contains: ") - for child in self.children: - print(child, end=", ") +# tree = ukkonen("abacabad") +# # tree.print_tree() +# for node in Node.all_nodes: +# if not node.children: +# print(node) - def add_child(self, child): - if child: - child.parent = self - self.children.append(child) - def num_chars_deep(self): - if self.data is None: - return 0 - else: - return self.parent.num_chars_deep() + self.edge_length() +def depth_first_apply(node: Node, func): + if not node.children: + func(node.suffix_index) + else: + for child in node.children.ordered_items(): + depth_first_apply(child, func) - def remove_child(self, child): - self.children.remove(child) - def bastardise(self): - self.parent.remove_child(self) - self.parent = None +# def calc_suffix_index(): +# lookup = {} +# counter = 0 +# # leaves = filter(lambda n: not n.children, Node.all_nodes) +# for node in Node.all_nodes: +# if not node.children: +# lookup +# +# print(list(leaves)) - def add_suffix_link(self, destination): - assert self.link is None - assert len(self.children) > 0 - assert isinstance(destination, Node) - self.link = destination - def follow_link(self): - assert self.link is not None - return self.link +def accum(): + array = [] + depth_first_apply(tree, lambda x: array.append(x)) + return array - def get_tuple(self): - if self.data is not None and self.data[1] == "#": - return self.data[0], self.global_end - return self.data - - def first_letter(self): - return self.string[self.data[0]] - - def get_child(self, char): - for child in self.children: - if child.first_letter() == char: - return child - return None - - def edge_length(self): - if self.data is None: - return 0 - else: - substring = self.get_tuple() - return substring[1] - substring[0] + 1 - - def print_tree(self, spaces=0): - print(f"{self}") - for child in self.children: - print(f" " * spaces, end="") - child.print_tree(spaces=spaces + 1) - - -def branch_from_point(active_node, active_edge, active_length, j, need_link_to): - edge = active_node.get_child(active_edge) - edge.bastardise() - original_substr = edge.get_tuple() - edge.data = (edge.data[0] + active_length, edge.data[1]) - mediator = Node((original_substr[0], original_substr[0] + active_length - 1)) - active_node.add_child(mediator) - mediator.add_child(edge) - mediator.add_child(Node((j+1, '#'))) - # assert j+1 != 12 - return mediator - - -def traverse_down(string, substring, active_node, active_edge, active_length, remainder): - j, i = substring - traversed = 0 - g = 0 - char = "" - remainder -= 0 if active_node.data is None else active_node.data[0] + 1 - while remainder > 0: - - char = string[i + traversed] - path = None - if active_length > 0: - path = active_node.get_child(active_edge) - else: - path = active_node.get_child(char) - if path is None: - raise Exception(f"Tried to go down '{char}' from point: {active_node, active_edge, active_length}") - traversed += path.edge_length() - g = min(remainder, path.edge_length()) - # if path is None: - # return active_node, active_edge, active_length - if remainder >= path.edge_length() - active_length: - active_node = path - print(active_node.link is None) - active_length = 0 - active_edge = '' - remainder -= path.edge_length() - active_length - else: - active_length = remainder - active_edge = char if active_length != 0 else None - break - active_length = remainder - return active_node, active_edge, active_length - - -def char_match_after(char, active_node, active_edge, active_length): - if active_length < 1: - return active_node.get_child(char) is not None - edge = active_node.get_child(active_edge) - position = edge.data[0] + active_length - return Node.string[position] == char - -def suffix_tree(string): - # string += "$" - Node.string = string - n = len(string) - last_j = 0 - root = Node() - root.add_child(Node((0, '#'))) - root.add_suffix_link(root) - need_link_to = root - active_node = root - active_edge = "" - active_length = 0 - remainder = 0 - for i in range(1, n): - - # if active_node.link is not None: - # active_node = active_node.link - # if active_node == active_node.link: - # remainder -= 1 - Node.global_end += 1 # Rule 1s implicitly - for j in range(last_j + 1, i + 1): - print(string[0:i + 1], end=": ") - print(active_node, active_edge, active_length) - # if 2 or more letter string, just match the last character at the current position (we already went - # there) i.e. i i think - char = string[i] if i - j <= 1 else string[i] - match = char_match_after(char, active_node, active_edge, active_length) - if match: # Rule 3 - print(3, -j) - remainder += 1 - active_node, active_edge, active_length = traverse_down(string, (j, i), active_node, active_edge, - active_length, remainder) - break - else: # Rule 2 - print(2, -j) - # new_node = None - if active_length > 0: # If the pointer is on an edge - # if active_node.link is not None: - # active_node = active_node.link - new_node = branch_from_point(active_node, active_edge, active_length, j, need_link_to) - new_node.add_suffix_link(need_link_to) - need_link_to = new_node - print(f"*** I want to go from {repr(active_node)} -> {repr(active_node.link)} ***") - # if active_node.link is not None: - # active_node = active_node.link - else: # If the pointer is on a node - new_node = Node((j, '#')) - active_node.add_child(new_node) - - active_edge = "" - remainder = max(remainder - 1, 0) - active_length = 0 - last_j = j - - print(f"{i=} '{string[i]}' => {(active_node, active_edge, active_length)}, {remainder}") - # print(f"{i=} '{string[i]}' => {(active_node, active_edge, active_length)}, {remainder}") - root.print_tree() - print("\n" * 10) - - -if __name__ == "__main__": - suffix_tree("abcabxabcd") +tree = ukkonen("mississippi") +print(accum()) |
