aboutsummaryrefslogtreecommitdiff
path: root/ass2/q2
diff options
context:
space:
mode:
Diffstat (limited to 'ass2/q2')
-rw-r--r--ass2/q2/gen_cases.py5
-rw-r--r--ass2/q2/new_suffix_array.py246
-rw-r--r--ass2/q2/suffix_array.py219
3 files changed, 31 insertions, 439 deletions
diff --git a/ass2/q2/gen_cases.py b/ass2/q2/gen_cases.py
index 82d165e..f32a5a4 100644
--- a/ass2/q2/gen_cases.py
+++ b/ass2/q2/gen_cases.py
@@ -1,7 +1,6 @@
import math
-from itertools import product, accumulate, count, repeat
-from functools import reduce
-from new_suffix_array import ukkonen
+from itertools import product
+from ass2.ukkonen import ukkonen
import timeit
import random
diff --git a/ass2/q2/new_suffix_array.py b/ass2/q2/new_suffix_array.py
deleted file mode 100644
index 841ff67..0000000
--- a/ass2/q2/new_suffix_array.py
+++ /dev/null
@@ -1,246 +0,0 @@
-import sys
-
-
-class Node:
- global_end = 0
- all_nodes = []
- string = ""
-
- def __init__(self, start, end):
- self.root = False
- self.start = start
- self.end = end
- self.children = {}
- self.id = len(self.all_nodes)
- self.all_nodes.append(self)
- self.parent = None
- self.link = None
-
- def __str__(self):
- link_str = "" if self.link is None else f" -> {self.link.id}"
- if not self.root:
- j, i = self.tuple()
- return f"[{self.id}, {self.tuple()}, {self.string[j:i + 1]}{link_str}]"
- return f"[{self.id} root{link_str}]"
-
- def __repr__(self):
- return f"[{self.id}]"
-
- def print_tree(self, spaces=1):
- print(f"{self}")
- for edge in self.children:
- print(f" " * spaces, end="")
- self.get_child(edge).print_tree(spaces=spaces + 1)
-
- def first_char(self):
- return self.string[self.start]
-
- def get_child(self, char):
- if char in self.children:
- return self.children[char]
- return None
-
- def add_child(self, child):
- child.parent = self
- self.children[child.first_char()] = child
- return child
-
- def remove_child(self, child):
- self.children.pop(child.first_char())
-
- @property
- def end_index(self):
- return self.tuple()[1]
-
- def tuple(self):
- if self.root:
- raise Exception("Can't get substring of root.")
- if self.end == "#":
- return self.start, self.global_end
- return self.start, self.end
-
- @property
- def edge_length(self):
- if self.root:
- return 0
- else:
- start, end = self.tuple()
- return end - start + 1
-
- def detach(self):
- self.parent.remove_child(self)
- self.parent = None
-
-
-class Point:
- def __init__(self, node, edge="", length=0):
- assert isinstance(node, Node)
- self.node = node
- self.edge = edge
- self.length = length
-
- def __repr__(self):
- return f"(Node {self.node.id}'s edge:'{self.edge}', {self.length} along.)"
-
- def is_explicit(self): # a.k.a. is not on an edge
- return self.edge == ""
-
- def set_node(self, node):
- self.node = node
- self.edge = ""
- self.length = 0
- if not self.is_explicit():
- print("WARNING: Node.set_node", file=sys.stderr)
-
- @property
- def edge_node(self) -> Node:
- return self.node.get_child(self.edge)
-
- def index_here(self):
- if self.is_explicit():
- return 0 if self.node.root else self.node.start
- return self.edge_node.start + self.length - 1
-
- def char_here(self):
- return Node.string[self.index_here()]
-
-
-def create_root():
- assert len(Node.all_nodes) == 0
- root = Node(None, None)
- root.root = True
- root.link = root
- return root
-
-
-def split_edge(split_point: Point):
- assert not split_point.is_explicit()
- edge = split_point.edge_node
- original = edge.tuple()
- edge.detach()
- mediator = Node(original[0], original[0] + split_point.length - 1)
- edge.start = original[0] + split_point.length
- assert edge.start <= edge.end_index
- mediator.add_child(edge)
- split_point.node.add_child(mediator)
- return mediator
-
-
-def pos(n: int):
- return max(n, 0)
-
-
-def do_phase(root: Node, active: Point, i, last_j, remainder):
- root_point = Point(root)
- Node.global_end += 1
- did_rule_three = False
- j = last_j + 1
- node_just_created = None
- while not did_rule_three and j <= i + 1:
-
- curr_char = Node.string[i]
- match = char_is_after(active, curr_char)
- if match:
- # print(3)
- remainder += 1
- if node_just_created is not None:
- node_just_created.link = active.node
- active = skip_count(1, active, i)
- did_rule_three = True
- else:
- # print(2)
- if not active.is_explicit():
- mediator = split_edge(active)
- mediator.add_child(Node(i, "#"))
- if node_just_created is not None:
- node_just_created.link = mediator
- node_just_created = mediator
- active.length -= 1
- if active.length == 0:
- active.set_node(active.node)
- else:
- active.node.add_child(Node(i, "#"))
- if node_just_created is not None and node_just_created.link is None:
- node_just_created.link = active.node
- remainder = pos(remainder - 1)
- active.set_node(active.node.link)
- if remainder > 0:
- active = skip_count(remainder, Point(root), i - remainder)
- last_j = j
- j += 1
- # print(active)
- # root.print_tree()
- return active, remainder, last_j
-
-
-def char_is_after(point: Point, char):
- if point.is_explicit():
- return char in point.node.children
- else:
- if point.length == point.edge_node.edge_length:
- return Node.string[point.edge_node.start] == char
- else: # If not at the end of an edge
- # return Node.string[point.index_here() + point.length] == char
- return Node.string[point.index_here() + 1] == char
-
-
-def skip_count(num_chars, start_point: Point, index):
- incoming_length = -1
- existing_length = 0
- head = start_point
- chars_left = num_chars
- char = ""
-
- if not head.is_explicit():
- incoming_length = head.edge_node.edge_length - head.length
- if num_chars < incoming_length:
- head.length += num_chars
- return head
- head.set_node(head.edge_node)
- chars_left -= incoming_length
- index += incoming_length
-
- # Node.string[i] if head.node.root else Node.string[head.node.end_index + 1]
- # assert head.node.end_index + 1 + chars_left < len(Node.string)
- while chars_left > 0:
- # assert head.node.end_index + 1 + chars_left < len(Node.string)
- direction = Node.string[index]
- next_node = head.node.get_child(direction)
- if next_node is None:
- raise Exception(f"Attempted to traverse char\n '{direction}' at point {head}. ({index=})")
- incoming_length = next_node.edge_length
- if chars_left < incoming_length:
- break
- chars_left -= incoming_length
- index += incoming_length
- head.set_node(next_node)
-
- # direction = Node.string[index]
-
- if chars_left > 0: # Landed on an edge
- head.edge = Node.string[index]
- head.length = chars_left
-
- return head
-
-
-def ukkonen(string):
- string += "$"
- Node.string = string
- Node.global_end = 0
- Node.all_nodes.clear()
- n = len(string)
- remainder = 0
- last_j = 1
- root = create_root()
- root.add_child(Node(0, "#"))
- active = Point(root)
- for i in range(1, n):
- active, remainder, last_j = do_phase(root, active, i, last_j, remainder)
-
-
-if __name__ == "__main__":
- # ukkonen("DEFDBEFFDDEFFFADEFFB")
- ukkonen("abacabad")
- print("done")
-# ukkonen("abcbcbc$")
diff --git a/ass2/q2/suffix_array.py b/ass2/q2/suffix_array.py
index 61ac445..1c67433 100644
--- a/ass2/q2/suffix_array.py
+++ b/ass2/q2/suffix_array.py
@@ -1,201 +1,40 @@
-class Node:
- global_end = 0
- string = ""
- all = []
+from ass2.ukkonen import Node, ukkonen
- def __init__(self, data=None):
- self.data = data
- self.children = []
- self.id = len(self.all)
- self.all.append(self)
- self.parent = None
- self.link = None
- def __str__(self):
- tup = self.get_tuple()
- link_str = "" if self.link is None else f" -> {self.link.id}"
- if tup is not None:
- j, i = tup
- return f"[{self.id}, {self.data}, {self.string[j:i + 1]}{link_str}]"
- return f"[{self.id} root{link_str}]"
+def rank(char):
+ return ord("a") - ord(char) if char != "$" else 26
- def __repr__(self):
- return f"[{self.id}]"
- def print_children(self):
- print(self, end=" contains: ")
- for child in self.children:
- print(child, end=", ")
+# tree = ukkonen("abacabad")
+# # tree.print_tree()
+# for node in Node.all_nodes:
+# if not node.children:
+# print(node)
- def add_child(self, child):
- if child:
- child.parent = self
- self.children.append(child)
- def num_chars_deep(self):
- if self.data is None:
- return 0
- else:
- return self.parent.num_chars_deep() + self.edge_length()
+def depth_first_apply(node: Node, func):
+ if not node.children:
+ func(node.suffix_index)
+ else:
+ for child in node.children.ordered_items():
+ depth_first_apply(child, func)
- def remove_child(self, child):
- self.children.remove(child)
- def bastardise(self):
- self.parent.remove_child(self)
- self.parent = None
+# def calc_suffix_index():
+# lookup = {}
+# counter = 0
+# # leaves = filter(lambda n: not n.children, Node.all_nodes)
+# for node in Node.all_nodes:
+# if not node.children:
+# lookup
+#
+# print(list(leaves))
- def add_suffix_link(self, destination):
- assert self.link is None
- assert len(self.children) > 0
- assert isinstance(destination, Node)
- self.link = destination
- def follow_link(self):
- assert self.link is not None
- return self.link
+def accum():
+ array = []
+ depth_first_apply(tree, lambda x: array.append(x))
+ return array
- def get_tuple(self):
- if self.data is not None and self.data[1] == "#":
- return self.data[0], self.global_end
- return self.data
-
- def first_letter(self):
- return self.string[self.data[0]]
-
- def get_child(self, char):
- for child in self.children:
- if child.first_letter() == char:
- return child
- return None
-
- def edge_length(self):
- if self.data is None:
- return 0
- else:
- substring = self.get_tuple()
- return substring[1] - substring[0] + 1
-
- def print_tree(self, spaces=0):
- print(f"{self}")
- for child in self.children:
- print(f" " * spaces, end="")
- child.print_tree(spaces=spaces + 1)
-
-
-def branch_from_point(active_node, active_edge, active_length, j, need_link_to):
- edge = active_node.get_child(active_edge)
- edge.bastardise()
- original_substr = edge.get_tuple()
- edge.data = (edge.data[0] + active_length, edge.data[1])
- mediator = Node((original_substr[0], original_substr[0] + active_length - 1))
- active_node.add_child(mediator)
- mediator.add_child(edge)
- mediator.add_child(Node((j+1, '#')))
- # assert j+1 != 12
- return mediator
-
-
-def traverse_down(string, substring, active_node, active_edge, active_length, remainder):
- j, i = substring
- traversed = 0
- g = 0
- char = ""
- remainder -= 0 if active_node.data is None else active_node.data[0] + 1
- while remainder > 0:
-
- char = string[i + traversed]
- path = None
- if active_length > 0:
- path = active_node.get_child(active_edge)
- else:
- path = active_node.get_child(char)
- if path is None:
- raise Exception(f"Tried to go down '{char}' from point: {active_node, active_edge, active_length}")
- traversed += path.edge_length()
- g = min(remainder, path.edge_length())
- # if path is None:
- # return active_node, active_edge, active_length
- if remainder >= path.edge_length() - active_length:
- active_node = path
- print(active_node.link is None)
- active_length = 0
- active_edge = ''
- remainder -= path.edge_length() - active_length
- else:
- active_length = remainder
- active_edge = char if active_length != 0 else None
- break
- active_length = remainder
- return active_node, active_edge, active_length
-
-
-def char_match_after(char, active_node, active_edge, active_length):
- if active_length < 1:
- return active_node.get_child(char) is not None
- edge = active_node.get_child(active_edge)
- position = edge.data[0] + active_length
- return Node.string[position] == char
-
-def suffix_tree(string):
- # string += "$"
- Node.string = string
- n = len(string)
- last_j = 0
- root = Node()
- root.add_child(Node((0, '#')))
- root.add_suffix_link(root)
- need_link_to = root
- active_node = root
- active_edge = ""
- active_length = 0
- remainder = 0
- for i in range(1, n):
-
- # if active_node.link is not None:
- # active_node = active_node.link
- # if active_node == active_node.link:
- # remainder -= 1
- Node.global_end += 1 # Rule 1s implicitly
- for j in range(last_j + 1, i + 1):
- print(string[0:i + 1], end=": ")
- print(active_node, active_edge, active_length)
- # if 2 or more letter string, just match the last character at the current position (we already went
- # there) i.e. i i think
- char = string[i] if i - j <= 1 else string[i]
- match = char_match_after(char, active_node, active_edge, active_length)
- if match: # Rule 3
- print(3, -j)
- remainder += 1
- active_node, active_edge, active_length = traverse_down(string, (j, i), active_node, active_edge,
- active_length, remainder)
- break
- else: # Rule 2
- print(2, -j)
- # new_node = None
- if active_length > 0: # If the pointer is on an edge
- # if active_node.link is not None:
- # active_node = active_node.link
- new_node = branch_from_point(active_node, active_edge, active_length, j, need_link_to)
- new_node.add_suffix_link(need_link_to)
- need_link_to = new_node
- print(f"*** I want to go from {repr(active_node)} -> {repr(active_node.link)} ***")
- # if active_node.link is not None:
- # active_node = active_node.link
- else: # If the pointer is on a node
- new_node = Node((j, '#'))
- active_node.add_child(new_node)
-
- active_edge = ""
- remainder = max(remainder - 1, 0)
- active_length = 0
- last_j = j
-
- print(f"{i=} '{string[i]}' => {(active_node, active_edge, active_length)}, {remainder}")
- # print(f"{i=} '{string[i]}' => {(active_node, active_edge, active_length)}, {remainder}")
- root.print_tree()
- print("\n" * 10)
-
-
-if __name__ == "__main__":
- suffix_tree("abcabxabcd")
+tree = ukkonen("mississippi")
+print(accum())