diff options
Diffstat (limited to 'ass2/q2/new_suffix_array.py')
| -rw-r--r-- | ass2/q2/new_suffix_array.py | 188 |
1 files changed, 188 insertions, 0 deletions
diff --git a/ass2/q2/new_suffix_array.py b/ass2/q2/new_suffix_array.py new file mode 100644 index 0000000..0c97137 --- /dev/null +++ b/ass2/q2/new_suffix_array.py @@ -0,0 +1,188 @@ +import sys + + +class Node: + global_end = 0 + all_nodes = [] + string = "" + + def __init__(self, start, end, root=False): + self.root = root + self.start = start + self.end = end + self.children = {} + self.id = len(self.all_nodes) + self.all_nodes.append(self) + self.parent = None + self.link = None + + def __str__(self): + link_str = "" if self.link is None else f" -> {self.link.id}" + if not self.root: + j, i = self.tuple() + return f"[{self.id}, {self.tuple()}, {self.string[j:i + 1]}{link_str}]" + return f"[{self.id} root{link_str}]" + + def __repr__(self): + return f"[{self.id}]" + + def print_tree(self, spaces=1): + print(f"{self}") + for edge in self.children: + print(f" " * spaces, end="") + self.get_child(edge).print_tree(spaces=spaces + 1) + + def first_char(self): + return self.string[self.start] + + def get_child(self, char): + if char in self.children: + return self.children[char] + return None + + def add_child(self, child): + child.parent = self + self.children[child.first_char()] = child + return child + + def remove_child(self, child): + self.children.pop(child) + + @property + def end_index(self): + return self.tuple()[1] + + def tuple(self): + if self.root: + raise Exception("Can't get substring of root.") + if self.end == "#": + return self.start, self.global_end + return self.start, self.end + + @property + def edge_length(self): + if self.root: + return 0 + else: + start, end = self.tuple() + return end - start + 1 + + def detach(self): + self.parent.remove_child(self) + self.parent = None + + +class Point: + def __init__(self, node, edge="", length=0): + assert isinstance(node, Node) + self.node = node + self.edge = edge + self.length = length + + def __repr__(self): + return f"(Node {self.node.id}'s edge:'{self.edge}', {self.length} along.)" + + def is_explicit(self): # a.k.a. is not on an edge + return self.edge == "" + + def set_node(self, node): + self.node = node + self.edge = "" + self.length = 0 + if not self.is_explicit(): + print("WARNING: Node.set_node", file=sys.stderr) + + @property + def edge_node(self) -> Node: + return self.node.get_child(self.edge) + + def index_here(self): + if self.node.root: + return None + if self.is_explicit(): + return self.node.start + return self.edge_node.start + self.length - 1 + + def char_here(self): + return Node.string[self.index_here()] + + +def create_root(): + assert len(Node.all_nodes) == 0 + return Node(None, None, root=True) + + +def do_phase(root: Node, active: Point, i, last_j): + root_point = Point(root) + Node.global_end += 1 + did_rule_three = False + j = last_j + 1 + while not did_rule_three: + curr_char = Node.string[i] + new_point = skip_count(i + 1, root_point) + + +def char_is_after(point: Point, char): + if point.is_explicit(): + return char in point.node.children + else: + if point.length < point.node.edge_length: # If not at the end of an edge + return Node.string[point.index_here() + point.length] + return None + + +def skip_count(num_chars, start_point: Point): + incoming_length = -1 + existing_length = 0 + head = start_point + chars_left = num_chars + char = "" + + if not head.is_explicit(): + incoming_length = head.edge_node.edge_length - head.length + if num_chars < incoming_length: + head.length += num_chars + return head + head.set_node(head.edge_node) + chars_left -= incoming_length + + direction = Node.string[0] if head.node.root else Node.string[head.node.end_index + 1] + # next_node = head.node.get_child(direction) + # incoming_length = next_node.edge_length + # + # chars_left -= incoming_length + # head.set_node(next_node) + + while chars_left > incoming_length: + direction = Node.string[0] if head.node.root else Node.string[head.node.end_index + 1] + next_node = head.node.get_child(direction) + incoming_length = next_node.edge_length + if chars_left <= incoming_length: + break + chars_left -= incoming_length + head.set_node(next_node) + + + if chars_left > 0: # Landed on an edge + head.edge = direction + head.length = chars_left + + return head + + +def ukkonen(string): + Node.string = string + # string += "$" + n = len(string) + remainder = 0 + last_j = 1 + root = create_root() + root.add_child(Node(0, 3)).add_child(Node(4, 6)).add_child(Node(7, 7)).add_child(Node(8, "#")) + root.add_child(Node(1, 6)) + active = Point(root) + Node.global_end = len(string) + root.print_tree() + print(skip_count(151, Point(root, "a", 2))) + + +ukkonen("abcdefghijklmnop$") |
