aboutsummaryrefslogtreecommitdiff
path: root/ass2
diff options
context:
space:
mode:
authorakiyamn2021-04-22 11:23:47 +1000
committerakiyamn2021-04-22 11:23:47 +1000
commiteff620948c6140fea0d15f7e63055550a278a9bf (patch)
tree9f7d6665136d6a998fbd2b5f55866ba6be590277 /ass2
parent6a7732a192f9f6c3ebc16f86875e9fb627ce7af4 (diff)
downloadfit3155-eff620948c6140fea0d15f7e63055550a278a9bf.tar.gz
fit3155-eff620948c6140fea0d15f7e63055550a278a9bf.zip
Ass 2 - Ukkonen 2: Electric Boogaloo
Diffstat (limited to 'ass2')
-rw-r--r--ass2/q2/new_suffix_array.py188
-rw-r--r--ass2/q2/suffix_array.py3
2 files changed, 190 insertions, 1 deletions
diff --git a/ass2/q2/new_suffix_array.py b/ass2/q2/new_suffix_array.py
new file mode 100644
index 0000000..0c97137
--- /dev/null
+++ b/ass2/q2/new_suffix_array.py
@@ -0,0 +1,188 @@
+import sys
+
+
+class Node:
+ global_end = 0
+ all_nodes = []
+ string = ""
+
+ def __init__(self, start, end, root=False):
+ self.root = root
+ self.start = start
+ self.end = end
+ self.children = {}
+ self.id = len(self.all_nodes)
+ self.all_nodes.append(self)
+ self.parent = None
+ self.link = None
+
+ def __str__(self):
+ link_str = "" if self.link is None else f" -> {self.link.id}"
+ if not self.root:
+ j, i = self.tuple()
+ return f"[{self.id}, {self.tuple()}, {self.string[j:i + 1]}{link_str}]"
+ return f"[{self.id} root{link_str}]"
+
+ def __repr__(self):
+ return f"[{self.id}]"
+
+ def print_tree(self, spaces=1):
+ print(f"{self}")
+ for edge in self.children:
+ print(f" " * spaces, end="")
+ self.get_child(edge).print_tree(spaces=spaces + 1)
+
+ def first_char(self):
+ return self.string[self.start]
+
+ def get_child(self, char):
+ if char in self.children:
+ return self.children[char]
+ return None
+
+ def add_child(self, child):
+ child.parent = self
+ self.children[child.first_char()] = child
+ return child
+
+ def remove_child(self, child):
+ self.children.pop(child)
+
+ @property
+ def end_index(self):
+ return self.tuple()[1]
+
+ def tuple(self):
+ if self.root:
+ raise Exception("Can't get substring of root.")
+ if self.end == "#":
+ return self.start, self.global_end
+ return self.start, self.end
+
+ @property
+ def edge_length(self):
+ if self.root:
+ return 0
+ else:
+ start, end = self.tuple()
+ return end - start + 1
+
+ def detach(self):
+ self.parent.remove_child(self)
+ self.parent = None
+
+
+class Point:
+ def __init__(self, node, edge="", length=0):
+ assert isinstance(node, Node)
+ self.node = node
+ self.edge = edge
+ self.length = length
+
+ def __repr__(self):
+ return f"(Node {self.node.id}'s edge:'{self.edge}', {self.length} along.)"
+
+ def is_explicit(self): # a.k.a. is not on an edge
+ return self.edge == ""
+
+ def set_node(self, node):
+ self.node = node
+ self.edge = ""
+ self.length = 0
+ if not self.is_explicit():
+ print("WARNING: Node.set_node", file=sys.stderr)
+
+ @property
+ def edge_node(self) -> Node:
+ return self.node.get_child(self.edge)
+
+ def index_here(self):
+ if self.node.root:
+ return None
+ if self.is_explicit():
+ return self.node.start
+ return self.edge_node.start + self.length - 1
+
+ def char_here(self):
+ return Node.string[self.index_here()]
+
+
+def create_root():
+ assert len(Node.all_nodes) == 0
+ return Node(None, None, root=True)
+
+
+def do_phase(root: Node, active: Point, i, last_j):
+ root_point = Point(root)
+ Node.global_end += 1
+ did_rule_three = False
+ j = last_j + 1
+ while not did_rule_three:
+ curr_char = Node.string[i]
+ new_point = skip_count(i + 1, root_point)
+
+
+def char_is_after(point: Point, char):
+ if point.is_explicit():
+ return char in point.node.children
+ else:
+ if point.length < point.node.edge_length: # If not at the end of an edge
+ return Node.string[point.index_here() + point.length]
+ return None
+
+
+def skip_count(num_chars, start_point: Point):
+ incoming_length = -1
+ existing_length = 0
+ head = start_point
+ chars_left = num_chars
+ char = ""
+
+ if not head.is_explicit():
+ incoming_length = head.edge_node.edge_length - head.length
+ if num_chars < incoming_length:
+ head.length += num_chars
+ return head
+ head.set_node(head.edge_node)
+ chars_left -= incoming_length
+
+ direction = Node.string[0] if head.node.root else Node.string[head.node.end_index + 1]
+ # next_node = head.node.get_child(direction)
+ # incoming_length = next_node.edge_length
+ #
+ # chars_left -= incoming_length
+ # head.set_node(next_node)
+
+ while chars_left > incoming_length:
+ direction = Node.string[0] if head.node.root else Node.string[head.node.end_index + 1]
+ next_node = head.node.get_child(direction)
+ incoming_length = next_node.edge_length
+ if chars_left <= incoming_length:
+ break
+ chars_left -= incoming_length
+ head.set_node(next_node)
+
+
+ if chars_left > 0: # Landed on an edge
+ head.edge = direction
+ head.length = chars_left
+
+ return head
+
+
+def ukkonen(string):
+ Node.string = string
+ # string += "$"
+ n = len(string)
+ remainder = 0
+ last_j = 1
+ root = create_root()
+ root.add_child(Node(0, 3)).add_child(Node(4, 6)).add_child(Node(7, 7)).add_child(Node(8, "#"))
+ root.add_child(Node(1, 6))
+ active = Point(root)
+ Node.global_end = len(string)
+ root.print_tree()
+ print(skip_count(151, Point(root, "a", 2)))
+
+
+ukkonen("abcdefghijklmnop$")
diff --git a/ass2/q2/suffix_array.py b/ass2/q2/suffix_array.py
index db22c3e..eccc99e 100644
--- a/ass2/q2/suffix_array.py
+++ b/ass2/q2/suffix_array.py
@@ -159,6 +159,7 @@ def suffix_tree(string):
Node.global_end += 1 # Rule 1s implicitly
for j in range(last_j + 1, i + 1):
print(string[0:i + 1], end=": ")
+ print(active_node, active_edge, active_length)
# if 2 or more letter string, just match the last character at the current position (we already went
# there) i.e. i i think
char = string[i] if i - j <= 1 else string[i]
@@ -196,4 +197,4 @@ def suffix_tree(string):
print("\n" * 10)
-suffix_tree("abacabad$")
+suffix_tree("abcabxabcd")