diff options
| -rw-r--r-- | ass2/q2/gen_cases.py | 5 | ||||
| -rw-r--r-- | ass2/q2/suffix_array.py | 219 | ||||
| -rw-r--r-- | ass2/ukkonen.py (renamed from ass2/q2/new_suffix_array.py) | 39 | 
3 files changed, 67 insertions, 196 deletions
| diff --git a/ass2/q2/gen_cases.py b/ass2/q2/gen_cases.py index 82d165e..f32a5a4 100644 --- a/ass2/q2/gen_cases.py +++ b/ass2/q2/gen_cases.py @@ -1,7 +1,6 @@  import math -from itertools import product, accumulate, count, repeat -from functools import reduce -from new_suffix_array import ukkonen +from itertools import product +from ass2.ukkonen import ukkonen  import timeit  import random diff --git a/ass2/q2/suffix_array.py b/ass2/q2/suffix_array.py index 61ac445..1c67433 100644 --- a/ass2/q2/suffix_array.py +++ b/ass2/q2/suffix_array.py @@ -1,201 +1,40 @@ -class Node: -    global_end = 0 -    string = "" -    all = [] +from ass2.ukkonen import Node, ukkonen -    def __init__(self, data=None): -        self.data = data -        self.children = [] -        self.id = len(self.all) -        self.all.append(self) -        self.parent = None -        self.link = None -    def __str__(self): -        tup = self.get_tuple() -        link_str = "" if self.link is None else f" -> {self.link.id}" -        if tup is not None: -            j, i = tup -            return f"[{self.id}, {self.data}, {self.string[j:i + 1]}{link_str}]" -        return f"[{self.id} root{link_str}]" +def rank(char): +    return ord("a") - ord(char) if char != "$" else 26 -    def __repr__(self): -        return f"[{self.id}]" -    def print_children(self): -        print(self, end=" contains: ") -        for child in self.children: -            print(child, end=", ") +# tree = ukkonen("abacabad") +# # tree.print_tree() +# for node in Node.all_nodes: +#     if not node.children: +#         print(node) -    def add_child(self, child): -        if child: -            child.parent = self -            self.children.append(child) -    def num_chars_deep(self): -        if self.data is None: -            return 0 -        else: -            return self.parent.num_chars_deep() + self.edge_length() +def depth_first_apply(node: Node, func): +    if not node.children: +        func(node.suffix_index) +    else: +        for child in node.children.ordered_items(): +            depth_first_apply(child, func) -    def remove_child(self, child): -        self.children.remove(child) -    def bastardise(self): -        self.parent.remove_child(self) -        self.parent = None +# def calc_suffix_index(): +#     lookup = {} +#     counter = 0 +#     # leaves = filter(lambda n: not n.children, Node.all_nodes) +#     for node in Node.all_nodes: +#         if not node.children: +#             lookup +# +#     print(list(leaves)) -    def add_suffix_link(self, destination): -        assert self.link is None -        assert len(self.children) > 0 -        assert isinstance(destination, Node) -        self.link = destination -    def follow_link(self): -        assert self.link is not None -        return self.link +def accum(): +    array = [] +    depth_first_apply(tree, lambda x: array.append(x)) +    return array -    def get_tuple(self): -        if self.data is not None and self.data[1] == "#": -            return self.data[0], self.global_end -        return self.data - -    def first_letter(self): -        return self.string[self.data[0]] - -    def get_child(self, char): -        for child in self.children: -            if child.first_letter() == char: -                return child -        return None - -    def edge_length(self): -        if self.data is None: -            return 0 -        else: -            substring = self.get_tuple() -            return substring[1] - substring[0] + 1 - -    def print_tree(self, spaces=0): -        print(f"{self}") -        for child in self.children: -            print(f"   " * spaces, end="") -            child.print_tree(spaces=spaces + 1) - - -def branch_from_point(active_node, active_edge, active_length, j, need_link_to): -    edge = active_node.get_child(active_edge) -    edge.bastardise() -    original_substr = edge.get_tuple() -    edge.data = (edge.data[0] + active_length, edge.data[1]) -    mediator = Node((original_substr[0], original_substr[0] + active_length - 1)) -    active_node.add_child(mediator) -    mediator.add_child(edge) -    mediator.add_child(Node((j+1, '#'))) -    # assert j+1 != 12 -    return mediator - - -def traverse_down(string, substring, active_node, active_edge, active_length, remainder): -    j, i = substring -    traversed = 0 -    g = 0 -    char = "" -    remainder -= 0 if active_node.data is None else active_node.data[0] + 1 -    while remainder > 0: - -        char = string[i + traversed] -        path = None -        if active_length > 0: -            path = active_node.get_child(active_edge) -        else: -            path = active_node.get_child(char) -        if path is None: -            raise Exception(f"Tried to go down '{char}' from point: {active_node, active_edge, active_length}") -        traversed += path.edge_length() -        g = min(remainder, path.edge_length()) -        # if path is None: -        #     return active_node, active_edge, active_length -        if remainder >= path.edge_length() - active_length: -            active_node = path -            print(active_node.link is None) -            active_length = 0 -            active_edge = '' -            remainder -= path.edge_length() - active_length -        else: -            active_length = remainder -            active_edge = char if active_length != 0 else None -            break -    active_length = remainder -    return active_node, active_edge, active_length - - -def char_match_after(char, active_node, active_edge, active_length): -    if active_length < 1: -        return active_node.get_child(char) is not None -    edge = active_node.get_child(active_edge) -    position = edge.data[0] + active_length -    return Node.string[position] == char - -def suffix_tree(string): -    # string += "$" -    Node.string = string -    n = len(string) -    last_j = 0 -    root = Node() -    root.add_child(Node((0, '#'))) -    root.add_suffix_link(root) -    need_link_to = root -    active_node = root -    active_edge = "" -    active_length = 0 -    remainder = 0 -    for i in range(1, n): - -        # if active_node.link is not None: -        #     active_node = active_node.link -        # if active_node == active_node.link: -            # remainder -= 1 -        Node.global_end += 1  # Rule 1s implicitly -        for j in range(last_j + 1, i + 1): -            print(string[0:i + 1], end=": ") -            print(active_node, active_edge, active_length) -            # if 2 or more letter string, just match the last character at the current position (we already went -            # there) i.e. i i think -            char = string[i] if i - j <= 1 else string[i] -            match = char_match_after(char, active_node, active_edge, active_length) -            if match:  # Rule 3 -                print(3, -j) -                remainder += 1 -                active_node, active_edge, active_length = traverse_down(string, (j, i), active_node, active_edge, -                                                                        active_length, remainder) -                break -            else:  # Rule 2 -                print(2, -j) -                # new_node = None -                if active_length > 0:  # If the pointer is on an edge -                    # if active_node.link is not None: -                    #     active_node = active_node.link -                    new_node = branch_from_point(active_node, active_edge, active_length, j, need_link_to) -                    new_node.add_suffix_link(need_link_to) -                    need_link_to = new_node -                    print(f"*** I want to go from {repr(active_node)} -> {repr(active_node.link)} ***") -                    # if active_node.link is not None: -                    #     active_node = active_node.link -                else:  # If the pointer is on a node -                    new_node = Node((j, '#')) -                    active_node.add_child(new_node) - -                active_edge = "" -                remainder = max(remainder - 1, 0) -                active_length = 0 -                last_j = j - -            print(f"{i=} '{string[i]}' => {(active_node, active_edge, active_length)}, {remainder}") -        # print(f"{i=} '{string[i]}' => {(active_node, active_edge, active_length)}, {remainder}") -            root.print_tree() -            print("\n" * 10) - - -if __name__ == "__main__": -    suffix_tree("abcabxabcd") +tree = ukkonen("mississippi") +print(accum()) diff --git a/ass2/q2/new_suffix_array.py b/ass2/ukkonen.py index 841ff67..8d536c5 100644 --- a/ass2/q2/new_suffix_array.py +++ b/ass2/ukkonen.py @@ -1,8 +1,36 @@  import sys +class OrderedDict(dict): +    def __init__(self): +        super().__init__() +        self.first_letters = [None for _ in range(27)] + +    def __setitem__(self, key, value): +        super().__setitem__(key, value) +        try: +            self.first_letters[self.rank(key)] = value +        except IndexError: +            raise IndexError(f"Could not add item of key '{key}' since it is out of range of the rank function.") + +    def __delitem__(self, key): +        super().__delitem__(key) +        self.first_letters[self.rank(key)] = None + +    def ordered_items(self): +        return filter(lambda x: x is not None, self.first_letters) + +    @staticmethod +    def rank(char): +        # result = 26 if char == "$" else ord(char) - 97 +        result = 0 if char == "$" else ord(char) - 96 +        assert result in range(0, 27) +        return result + +  class Node:      global_end = 0 +    num_splits = 0      all_nodes = []      string = "" @@ -10,8 +38,9 @@ class Node:          self.root = False          self.start = start          self.end = end -        self.children = {} +        self.children = OrderedDict()          self.id = len(self.all_nodes) +        self.suffix_index = self.id - self.num_splits - 1          self.all_nodes.append(self)          self.parent = None          self.link = None @@ -118,7 +147,9 @@ def split_edge(split_point: Point):      edge = split_point.edge_node      original = edge.tuple()      edge.detach() +    Node.num_splits += 1      mediator = Node(original[0], original[0] + split_point.length - 1) +    mediator.suffix_index = None      edge.start = original[0] + split_point.length      assert edge.start <= edge.end_index      mediator.add_child(edge) @@ -168,8 +199,8 @@ def do_phase(root: Node, active: Point, i, last_j, remainder):                  active = skip_count(remainder, Point(root), i - remainder)              last_j = j              j += 1 -        # print(active) -        # root.print_tree() +        print(active) +        root.print_tree()      return active, remainder, last_j @@ -228,6 +259,7 @@ def ukkonen(string):      string += "$"      Node.string = string      Node.global_end = 0 +    Node.num_splits = 0      Node.all_nodes.clear()      n = len(string)      remainder = 0 @@ -237,6 +269,7 @@ def ukkonen(string):      active = Point(root)      for i in range(1, n):          active, remainder, last_j = do_phase(root, active, i, last_j, remainder) +    return root  if __name__ == "__main__": | 
