diff options
| author | akiyamn | 2021-04-19 17:24:21 +1000 |
|---|---|---|
| committer | akiyamn | 2021-04-19 17:24:21 +1000 |
| commit | f5c908a51d2b906aa216aece440e74376037b4d0 (patch) | |
| tree | e3d0c20fb90c6b8b43c9a8ac6abede1293a63ae6 /ass2/q2/suffix_array.py | |
| parent | 648f59dbcc764b1074eaf46219045e535844cf4d (diff) | |
| download | fit3155-f5c908a51d2b906aa216aece440e74376037b4d0.tar.gz fit3155-f5c908a51d2b906aa216aece440e74376037b4d0.zip | |
Ass2 Q1 is basically done
Diffstat (limited to 'ass2/q2/suffix_array.py')
| -rw-r--r-- | ass2/q2/suffix_array.py | 83 |
1 files changed, 60 insertions, 23 deletions
diff --git a/ass2/q2/suffix_array.py b/ass2/q2/suffix_array.py index 1bb88b9..db22c3e 100644 --- a/ass2/q2/suffix_array.py +++ b/ass2/q2/suffix_array.py @@ -9,13 +9,15 @@ class Node: self.id = len(self.all) self.all.append(self) self.parent = None + self.link = None def __str__(self): tup = self.get_tuple() + link_str = "" if self.link is None else f" -> {self.link.id}" if tup is not None: j, i = tup - return f"[{self.id}, {self.data}, {self.string[j:i + 1]}]" - return f"[{self.id} root]" + return f"[{self.id}, {self.data}, {self.string[j:i + 1]}{link_str}]" + return f"[{self.id} root{link_str}]" def __repr__(self): return f"[{self.id}]" @@ -41,7 +43,17 @@ class Node: def bastardise(self): self.parent.remove_child(self) - parent = None + self.parent = None + + def add_suffix_link(self, destination): + assert self.link is None + assert len(self.children) > 0 + assert isinstance(destination, Node) + self.link = destination + + def follow_link(self): + assert self.link is not None + return self.link def get_tuple(self): if self.data is not None and self.data[1] == "#": @@ -71,7 +83,7 @@ class Node: child.print_tree(spaces=spaces + 1) -def branch_from_point(active_node, active_edge, active_length, j): +def branch_from_point(active_node, active_edge, active_length, j, need_link_to): edge = active_node.get_child(active_edge) edge.bastardise() original_substr = edge.get_tuple() @@ -79,7 +91,9 @@ def branch_from_point(active_node, active_edge, active_length, j): mediator = Node((original_substr[0], original_substr[0] + active_length - 1)) active_node.add_child(mediator) mediator.add_child(edge) - mediator.add_child(Node((j + 1, '#'))) + mediator.add_child(Node((j+1, '#'))) + # assert j+1 != 12 + return mediator def traverse_down(string, substring, active_node, active_edge, active_length, remainder): @@ -89,18 +103,25 @@ def traverse_down(string, substring, active_node, active_edge, active_length, re char = "" remainder -= 0 if active_node.data is None else active_node.data[0] + 1 while remainder > 0: + char = string[i + traversed] - path = active_node.get_child(char) - assert path is not None + path = None + if active_length > 0: + path = active_node.get_child(active_edge) + else: + path = active_node.get_child(char) + if path is None: + raise Exception(f"Tried to go down '{char}' from point: {active_node, active_edge, active_length}") traversed += path.edge_length() g = min(remainder, path.edge_length()) # if path is None: # return active_node, active_edge, active_length - if remainder >= path.edge_length(): + if remainder >= path.edge_length() - active_length: active_node = path + print(active_node.link is None) active_length = 0 active_edge = '' - remainder -= path.edge_length() + remainder -= path.edge_length() - active_length else: active_length = remainder active_edge = char if active_length != 0 else None @@ -113,10 +134,9 @@ def char_match_after(char, active_node, active_edge, active_length): if active_length < 1: return active_node.get_child(char) is not None edge = active_node.get_child(active_edge) - position = edge.data[0] + active_length + 1 + position = edge.data[0] + active_length return Node.string[position] == char - def suffix_tree(string): # string += "$" Node.string = string @@ -124,39 +144,56 @@ def suffix_tree(string): last_j = 0 root = Node() root.add_child(Node((0, '#'))) + root.add_suffix_link(root) + need_link_to = root active_node = root active_edge = "" active_length = 0 remainder = 0 - last_rule = 0 for i in range(1, n): + + # if active_node.link is not None: + # active_node = active_node.link + # if active_node == active_node.link: + # remainder -= 1 Node.global_end += 1 # Rule 1s implicitly for j in range(last_j + 1, i + 1): - print(string[j:i + 1], end=": ") - # if 2 or more letter string, just match the last character at the current position (we already went there) i.e. i i think - char = string[i] if i-j <= 1 else string[j] + print(string[0:i + 1], end=": ") + # if 2 or more letter string, just match the last character at the current position (we already went + # there) i.e. i i think + char = string[i] if i - j <= 1 else string[i] match = char_match_after(char, active_node, active_edge, active_length) if match: # Rule 3 print(3, -j) remainder += 1 active_node, active_edge, active_length = traverse_down(string, (j, i), active_node, active_edge, active_length, remainder) - break else: # Rule 2 print(2, -j) - if active_length > 0: - branch_from_point(active_node, active_edge, active_length, j) - else: - active_node.add_child(Node((j, '#'))) + # new_node = None + if active_length > 0: # If the pointer is on an edge + # if active_node.link is not None: + # active_node = active_node.link + new_node = branch_from_point(active_node, active_edge, active_length, j, need_link_to) + new_node.add_suffix_link(need_link_to) + need_link_to = new_node + print(f"*** I want to go from {repr(active_node)} -> {repr(active_node.link)} ***") + # if active_node.link is not None: + # active_node = active_node.link + else: # If the pointer is on a node + new_node = Node((j, '#')) + active_node.add_child(new_node) + active_edge = "" remainder = max(remainder - 1, 0) active_length = 0 last_j = j - print(f"{i=} '{string[i]}' => {(active_node, active_edge, active_length)}, {remainder}") - root.print_tree() - print("\n" * 10) + print(f"{i=} '{string[i]}' => {(active_node, active_edge, active_length)}, {remainder}") + # print(f"{i=} '{string[i]}' => {(active_node, active_edge, active_length)}, {remainder}") + root.print_tree() + print("\n" * 10) suffix_tree("abacabad$") |
