aboutsummaryrefslogtreecommitdiff
path: root/ass2/ukkonen.py
diff options
context:
space:
mode:
Diffstat (limited to 'ass2/ukkonen.py')
-rw-r--r--ass2/ukkonen.py140
1 files changed, 104 insertions, 36 deletions
diff --git a/ass2/ukkonen.py b/ass2/ukkonen.py
index bc8b563..5aa94e7 100644
--- a/ass2/ukkonen.py
+++ b/ass2/ukkonen.py
@@ -1,8 +1,21 @@
+"""
+This file is imported into questions 2 and 3.
+"""
+
import sys
ALPHABET_SIZE = 28
+
class OrderedDict(dict):
+ """
+ A hybrid Python dictionary/list
+ All set/get item operations on this data structure have the same complexity as a normal dictionary, O(1)-ish
+ For Ukkonen's operation, only the normal dictionary features are used.
+ As values are stored in the dictionary, they are also referenced in a list of size O(alphabet).
+ This acts as a kind of 'counting sort' which is O(n) when accessed, but provides a pre-sorted list of all child nodes
+ This is used for generating the suffix array.
+ """
def __init__(self):
super().__init__()
self.first_letters = [None for _ in range(ALPHABET_SIZE)]
@@ -18,11 +31,14 @@ class OrderedDict(dict):
super().__delitem__(key)
self.first_letters[self.rank(key)] = None
- def ordered_items(self):
+ def ordered_items(self): # Return iterable of pre-sorted items (for suffix array)
return filter(lambda x: x is not None, self.first_letters)
@staticmethod
def rank(char):
+ """
+ Assign a number value to an alphabet letter, including special characters, so they can fit in a list
+ """
if char == "$":
return 26
elif char == "&":
@@ -31,8 +47,11 @@ class OrderedDict(dict):
return ord(char) - 96
-
class Node:
+ """
+ Represents an arbitrary node in a suffix tree
+ Also statically stores some state information about the algorithm (not pretty, I know)
+ """
global_end = 0
num_splits = 0
all_nodes = []
@@ -50,16 +69,22 @@ class Node:
self.link = None
def __str__(self):
+ """
+ String representation of node, shows important internal values of a node (for debug)
+ """
link_str = "" if self.link is None else f" -> {self.link.id}"
if not self.root:
j, i = self.tuple()
return f"[{self.id}, {self.tuple()}, {self.string[j:i + 1]}{link_str}]"
return f"[{self.id} root{link_str}]"
- def __repr__(self):
+ def __repr__(self): # Shorter representation of node
return f"[{self.id}]"
def print_tree(self, spaces=1):
+ """
+ Recursively prints tree of nodes (for debug)
+ """
print(f"{self}")
for edge in self.children:
print(f" " * spaces, end="")
@@ -82,13 +107,16 @@ class Node:
self.children.pop(child.first_char())
@property
- def end_index(self):
+ def end_index(self): # Translates end index into a number (could be '#' pointer)
return self.tuple()[1]
def tuple(self):
+ """
+ Returns the resolved start and end coordinates of the substring this node represents
+ """
if self.root:
raise Exception("Can't get substring of root.")
- if self.end == "#":
+ if self.end == "#": # Translate '#' into global_end
return self.start, self.global_end
return self.start, self.end
@@ -106,6 +134,12 @@ class Node:
class Point:
+ """
+ A representation of a single point on the tree. Used to store active node, edge and length data in one place
+ Could represent a place in the middle of an edge (implicit) or a place on a node (explicit).
+ Abstracts away a lot of tedium regarding working with these closely connected values.
+ Can be used to create 'pure' functions which return a transformation on a given point
+ """
def __init__(self, node, edge="", length=0):
assert isinstance(node, Node)
self.node = node
@@ -118,25 +152,34 @@ class Point:
def is_explicit(self): # a.k.a. is not on an edge
return self.edge == ""
- def set_node(self, node):
+ def set_node(self, node): # Set point to a specific node, reset other values
self.node = node
self.edge = ""
self.length = 0
@property
- def edge_node(self) -> Node:
+ def edge_node(self) -> Node: # Return the Node object of the edge this object points to
return self.node.get_child(self.edge)
def index_here(self):
+ """
+ Return the index in the original string that this point refers to
+ """
if self.is_explicit():
return 0 if self.node.root else self.node.start
return self.edge_node.start + self.length - 1
def char_here(self):
+ """
+ Return the char in the original string that this point refers to
+ """
return Node.string[self.index_here()]
def create_root():
+ """
+ Create a root node with special root properties. Used to initialise the algorithm
+ """
assert len(Node.all_nodes) == 0
root = Node(None, None)
root.root = True
@@ -145,6 +188,11 @@ def create_root():
def split_edge(split_point: Point):
+ """
+ Split a given edge into two separate edges, creating a new node in the middle (called a mediator in my code)
+ Used for Rule 2s on implicit suffixes.
+ Returns the newly created mediator node
+ """
assert not split_point.is_explicit()
edge = split_point.edge_node
original = edge.tuple()
@@ -164,41 +212,47 @@ def pos(n: int):
def do_phase(root: Node, active: Point, i, last_j, remainder):
+ """
+ Performs a single phase of Ukkonen's algorithm, returning values used for the next phase.
+ """
+
+ # Initialisation
root_point = Point(root)
- Node.global_end += 1
+ Node.global_end += 1 # Perform rapid leaf extension trick (Rule 1)
did_rule_three = False
j = last_j + 1
node_just_created = None
- while not did_rule_three and j <= i + 1:
+
+ while not did_rule_three and j <= i + 1: # Run only the required extensions for this phase
curr_char = Node.string[i]
match = char_is_after(active, curr_char)
- if match:
- # print(3)
+ if match: # Decide if Rule 2 or 3.
+ # RULE 3 LOGIC
remainder += 1
if node_just_created is not None:
- node_just_created.link = active.node
- active = skip_count(1, active, i)
- did_rule_three = True
+ node_just_created.link = active.node # Create suffix link (Rule 3)
+ active = skip_count(1, active, i) # Move active node
+ did_rule_three = True # Break loop
else:
- # print(2)
- if not active.is_explicit():
+ # RULE 2 LOGIC
+ if not active.is_explicit(): # Active point on an edge, need to split
mediator = split_edge(active)
- mediator.add_child(Node(i, "#"))
+ mediator.add_child(Node(i, "#")) # Dangle new character off of mediator node from split
if node_just_created is not None:
- node_just_created.link = mediator
+ node_just_created.link = mediator # Create suffix link (First sub-case)
node_just_created = mediator
active.length -= 1
if active.length == 0:
active.set_node(active.node)
- else:
+ else: # Active point on node, just dangle off a new node
active.node.add_child(Node(i, "#"))
if node_just_created is not None and node_just_created.link is None:
- node_just_created.link = active.node
+ node_just_created.link = active.node # Create suffix link (Second sub-case)
remainder = pos(remainder - 1)
- active.set_node(active.node.link)
+ active.set_node(active.node.link) # Go to suffix link
if remainder > 0:
- active = skip_count(remainder, Point(root), i - remainder)
+ active = skip_count(remainder, Point(root), i - remainder) # Traverse from root
last_j = j
j += 1
# print(active)
@@ -207,23 +261,34 @@ def do_phase(root: Node, active: Point, i, last_j, remainder):
def char_is_after(point: Point, char):
- if point.is_explicit():
+ """
+ Return if a given character is traversable directly after a given point
+ Used for Rule 2/3 selection
+ """
+ if point.is_explicit(): # If point on a node
return char in point.node.children
- else:
+ else: # If point on an edge
if point.length == point.edge_node.edge_length:
return Node.string[point.edge_node.start] == char
else: # If not at the end of an edge
- # return Node.string[point.index_here() + point.length] == char
return Node.string[point.index_here() + 1] == char
def skip_count(num_chars, start_point: Point, index):
+ """
+ Use the skip-counting trick to traverse num_chars down from point start_point.
+ Use index value as where to start looking in the string for char comparison
+ Returns the point that the traversal lands on
+ """
+
+ # Initialise
incoming_length = -1
existing_length = 0
head = start_point
chars_left = num_chars
char = ""
+ # Move point to nearest node if it is on an edge
if not head.is_explicit():
incoming_length = head.edge_node.edge_length - head.length
if num_chars < incoming_length:
@@ -233,24 +298,22 @@ def skip_count(num_chars, start_point: Point, index):
chars_left -= incoming_length
index += incoming_length
- # Node.string[i] if head.node.root else Node.string[head.node.end_index + 1]
- # assert head.node.end_index + 1 + chars_left < len(Node.string)
+ # Main traversal loop
while chars_left > 0:
- # assert head.node.end_index + 1 + chars_left < len(Node.string)
- direction = Node.string[index]
+ direction = Node.string[index] # Choose a direction to go from this point
next_node = head.node.get_child(direction)
- if next_node is None:
+ if next_node is None: # Went off the tree -> error
raise IndexError(f"Attempted to traverse char\n '{direction}' at point {head}. ({index=})")
incoming_length = next_node.edge_length
- if chars_left < incoming_length:
+ if chars_left < incoming_length: # Break if we can't go down that edge
break
+ # Move down edge to next node
chars_left -= incoming_length
index += incoming_length
head.set_node(next_node)
- # direction = Node.string[index]
-
- if chars_left > 0: # Landed on an edge
+ # Return position on edge if couldn't traverse a final edge (search landed on edge)
+ if chars_left > 0:
head.edge = Node.string[index]
head.length = chars_left
@@ -258,6 +321,11 @@ def skip_count(num_chars, start_point: Point, index):
def ukkonen(string):
+ """
+ Reset the algorithm values and create and return the root of a suffix tree for a given string
+ using everyone's favourite algorithm: Ukkonen's algorithm. O(n) time.
+ """
+ # Initialise values
string += "$"
Node.string = string
Node.global_end = 0
@@ -266,16 +334,16 @@ def ukkonen(string):
n = len(string)
remainder = 0
last_j = 1
+ # Perform base case i = 0 phase
root = create_root()
root.add_child(Node(0, "#"))
active = Point(root)
+ # Perform rest of phases
for i in range(1, n):
active, remainder, last_j = do_phase(root, active, i, last_j, remainder)
return root
if __name__ == "__main__":
- # ukkonen("DEFDBEFFDDEFFFADEFFB")
ukkonen("abacabad")
print("done")
-# ukkonen("abcbcbc$")