aboutsummaryrefslogtreecommitdiff
path: root/ass2/q2
diff options
context:
space:
mode:
authorakiyamn2021-04-18 17:38:31 +1000
committerakiyamn2021-04-18 17:38:31 +1000
commit648f59dbcc764b1074eaf46219045e535844cf4d (patch)
treec0891370b92a0824bed2a90f2fb776a05f7c9ffe /ass2/q2
parent96932e20763fceccc90989f02f37b24f114fdfcf (diff)
downloadfit3155-648f59dbcc764b1074eaf46219045e535844cf4d.tar.gz
fit3155-648f59dbcc764b1074eaf46219045e535844cf4d.zip
Starting Ukkonen's, it kind of works but doesn't
Diffstat (limited to 'ass2/q2')
-rw-r--r--ass2/q2/suffix_array.py162
1 files changed, 162 insertions, 0 deletions
diff --git a/ass2/q2/suffix_array.py b/ass2/q2/suffix_array.py
new file mode 100644
index 0000000..1bb88b9
--- /dev/null
+++ b/ass2/q2/suffix_array.py
@@ -0,0 +1,162 @@
+class Node:
+ global_end = 0
+ string = ""
+ all = []
+
+ def __init__(self, data=None):
+ self.data = data
+ self.children = []
+ self.id = len(self.all)
+ self.all.append(self)
+ self.parent = None
+
+ def __str__(self):
+ tup = self.get_tuple()
+ if tup is not None:
+ j, i = tup
+ return f"[{self.id}, {self.data}, {self.string[j:i + 1]}]"
+ return f"[{self.id} root]"
+
+ def __repr__(self):
+ return f"[{self.id}]"
+
+ def print_children(self):
+ print(self, end=" contains: ")
+ for child in self.children:
+ print(child, end=", ")
+
+ def add_child(self, child):
+ if child:
+ child.parent = self
+ self.children.append(child)
+
+ def num_chars_deep(self):
+ if self.data is None:
+ return 0
+ else:
+ return self.parent.num_chars_deep() + self.edge_length()
+
+ def remove_child(self, child):
+ self.children.remove(child)
+
+ def bastardise(self):
+ self.parent.remove_child(self)
+ parent = None
+
+ def get_tuple(self):
+ if self.data is not None and self.data[1] == "#":
+ return self.data[0], self.global_end
+ return self.data
+
+ def first_letter(self):
+ return self.string[self.data[0]]
+
+ def get_child(self, char):
+ for child in self.children:
+ if child.first_letter() == char:
+ return child
+ return None
+
+ def edge_length(self):
+ if self.data is None:
+ return 0
+ else:
+ substring = self.get_tuple()
+ return substring[1] - substring[0] + 1
+
+ def print_tree(self, spaces=0):
+ print(f"{self}")
+ for child in self.children:
+ print(f" " * spaces, end="")
+ child.print_tree(spaces=spaces + 1)
+
+
+def branch_from_point(active_node, active_edge, active_length, j):
+ edge = active_node.get_child(active_edge)
+ edge.bastardise()
+ original_substr = edge.get_tuple()
+ edge.data = (edge.data[0] + active_length, edge.data[1])
+ mediator = Node((original_substr[0], original_substr[0] + active_length - 1))
+ active_node.add_child(mediator)
+ mediator.add_child(edge)
+ mediator.add_child(Node((j + 1, '#')))
+
+
+def traverse_down(string, substring, active_node, active_edge, active_length, remainder):
+ j, i = substring
+ traversed = 0
+ g = 0
+ char = ""
+ remainder -= 0 if active_node.data is None else active_node.data[0] + 1
+ while remainder > 0:
+ char = string[i + traversed]
+ path = active_node.get_child(char)
+ assert path is not None
+ traversed += path.edge_length()
+ g = min(remainder, path.edge_length())
+ # if path is None:
+ # return active_node, active_edge, active_length
+ if remainder >= path.edge_length():
+ active_node = path
+ active_length = 0
+ active_edge = ''
+ remainder -= path.edge_length()
+ else:
+ active_length = remainder
+ active_edge = char if active_length != 0 else None
+ break
+ active_length = remainder
+ return active_node, active_edge, active_length
+
+
+def char_match_after(char, active_node, active_edge, active_length):
+ if active_length < 1:
+ return active_node.get_child(char) is not None
+ edge = active_node.get_child(active_edge)
+ position = edge.data[0] + active_length + 1
+ return Node.string[position] == char
+
+
+def suffix_tree(string):
+ # string += "$"
+ Node.string = string
+ n = len(string)
+ last_j = 0
+ root = Node()
+ root.add_child(Node((0, '#')))
+ active_node = root
+ active_edge = ""
+ active_length = 0
+ remainder = 0
+ last_rule = 0
+ for i in range(1, n):
+ Node.global_end += 1 # Rule 1s implicitly
+ for j in range(last_j + 1, i + 1):
+ print(string[j:i + 1], end=": ")
+ # if 2 or more letter string, just match the last character at the current position (we already went there) i.e. i i think
+ char = string[i] if i-j <= 1 else string[j]
+ match = char_match_after(char, active_node, active_edge, active_length)
+ if match: # Rule 3
+ print(3, -j)
+ remainder += 1
+ active_node, active_edge, active_length = traverse_down(string, (j, i), active_node, active_edge,
+ active_length, remainder)
+
+ break
+ else: # Rule 2
+ print(2, -j)
+ if active_length > 0:
+ branch_from_point(active_node, active_edge, active_length, j)
+ else:
+ active_node.add_child(Node((j, '#')))
+ active_edge = ""
+ remainder = max(remainder - 1, 0)
+ active_length = 0
+ last_j = j
+
+ print(f"{i=} '{string[i]}' => {(active_node, active_edge, active_length)}, {remainder}")
+ root.print_tree()
+ print("\n" * 10)
+
+
+suffix_tree("abacabad$")