class Node: global_end = 0 string = "" all = [] def __init__(self, data=None): self.data = data self.children = [] self.id = len(self.all) self.all.append(self) self.parent = None self.link = None def __str__(self): tup = self.get_tuple() link_str = "" if self.link is None else f" -> {self.link.id}" if tup is not None: j, i = tup return f"[{self.id}, {self.data}, {self.string[j:i + 1]}{link_str}]" return f"[{self.id} root{link_str}]" def __repr__(self): return f"[{self.id}]" def print_children(self): print(self, end=" contains: ") for child in self.children: print(child, end=", ") def add_child(self, child): if child: child.parent = self self.children.append(child) def num_chars_deep(self): if self.data is None: return 0 else: return self.parent.num_chars_deep() + self.edge_length() def remove_child(self, child): self.children.remove(child) def bastardise(self): self.parent.remove_child(self) self.parent = None def add_suffix_link(self, destination): assert self.link is None assert len(self.children) > 0 assert isinstance(destination, Node) self.link = destination def follow_link(self): assert self.link is not None return self.link def get_tuple(self): if self.data is not None and self.data[1] == "#": return self.data[0], self.global_end return self.data def first_letter(self): return self.string[self.data[0]] def get_child(self, char): for child in self.children: if child.first_letter() == char: return child return None def edge_length(self): if self.data is None: return 0 else: substring = self.get_tuple() return substring[1] - substring[0] + 1 def print_tree(self, spaces=0): print(f"{self}") for child in self.children: print(f" " * spaces, end="") child.print_tree(spaces=spaces + 1) def branch_from_point(active_node, active_edge, active_length, j, need_link_to): edge = active_node.get_child(active_edge) edge.bastardise() original_substr = edge.get_tuple() edge.data = (edge.data[0] + active_length, edge.data[1]) mediator = Node((original_substr[0], original_substr[0] + active_length - 1)) active_node.add_child(mediator) mediator.add_child(edge) mediator.add_child(Node((j+1, '#'))) # assert j+1 != 12 return mediator def traverse_down(string, substring, active_node, active_edge, active_length, remainder): j, i = substring traversed = 0 g = 0 char = "" remainder -= 0 if active_node.data is None else active_node.data[0] + 1 while remainder > 0: char = string[i + traversed] path = None if active_length > 0: path = active_node.get_child(active_edge) else: path = active_node.get_child(char) if path is None: raise Exception(f"Tried to go down '{char}' from point: {active_node, active_edge, active_length}") traversed += path.edge_length() g = min(remainder, path.edge_length()) # if path is None: # return active_node, active_edge, active_length if remainder >= path.edge_length() - active_length: active_node = path print(active_node.link is None) active_length = 0 active_edge = '' remainder -= path.edge_length() - active_length else: active_length = remainder active_edge = char if active_length != 0 else None break active_length = remainder return active_node, active_edge, active_length def char_match_after(char, active_node, active_edge, active_length): if active_length < 1: return active_node.get_child(char) is not None edge = active_node.get_child(active_edge) position = edge.data[0] + active_length return Node.string[position] == char def suffix_tree(string): # string += "$" Node.string = string n = len(string) last_j = 0 root = Node() root.add_child(Node((0, '#'))) root.add_suffix_link(root) need_link_to = root active_node = root active_edge = "" active_length = 0 remainder = 0 for i in range(1, n): # if active_node.link is not None: # active_node = active_node.link # if active_node == active_node.link: # remainder -= 1 Node.global_end += 1 # Rule 1s implicitly for j in range(last_j + 1, i + 1): print(string[0:i + 1], end=": ") print(active_node, active_edge, active_length) # if 2 or more letter string, just match the last character at the current position (we already went # there) i.e. i i think char = string[i] if i - j <= 1 else string[i] match = char_match_after(char, active_node, active_edge, active_length) if match: # Rule 3 print(3, -j) remainder += 1 active_node, active_edge, active_length = traverse_down(string, (j, i), active_node, active_edge, active_length, remainder) break else: # Rule 2 print(2, -j) # new_node = None if active_length > 0: # If the pointer is on an edge # if active_node.link is not None: # active_node = active_node.link new_node = branch_from_point(active_node, active_edge, active_length, j, need_link_to) new_node.add_suffix_link(need_link_to) need_link_to = new_node print(f"*** I want to go from {repr(active_node)} -> {repr(active_node.link)} ***") # if active_node.link is not None: # active_node = active_node.link else: # If the pointer is on a node new_node = Node((j, '#')) active_node.add_child(new_node) active_edge = "" remainder = max(remainder - 1, 0) active_length = 0 last_j = j print(f"{i=} '{string[i]}' => {(active_node, active_edge, active_length)}, {remainder}") # print(f"{i=} '{string[i]}' => {(active_node, active_edge, active_length)}, {remainder}") root.print_tree() print("\n" * 10) if __name__ == "__main__": suffix_tree("abcabxabcd")