diff options
Diffstat (limited to 'ass2/q2')
| -rw-r--r-- | ass2/q2/gen_cases.py | 5 | ||||
| -rw-r--r-- | ass2/q2/new_suffix_array.py | 246 | ||||
| -rw-r--r-- | ass2/q2/suffix_array.py | 219 | 
3 files changed, 31 insertions, 439 deletions
| diff --git a/ass2/q2/gen_cases.py b/ass2/q2/gen_cases.py index 82d165e..f32a5a4 100644 --- a/ass2/q2/gen_cases.py +++ b/ass2/q2/gen_cases.py @@ -1,7 +1,6 @@  import math -from itertools import product, accumulate, count, repeat -from functools import reduce -from new_suffix_array import ukkonen +from itertools import product +from ass2.ukkonen import ukkonen  import timeit  import random diff --git a/ass2/q2/new_suffix_array.py b/ass2/q2/new_suffix_array.py deleted file mode 100644 index 841ff67..0000000 --- a/ass2/q2/new_suffix_array.py +++ /dev/null @@ -1,246 +0,0 @@ -import sys - - -class Node: -    global_end = 0 -    all_nodes = [] -    string = "" - -    def __init__(self, start, end): -        self.root = False -        self.start = start -        self.end = end -        self.children = {} -        self.id = len(self.all_nodes) -        self.all_nodes.append(self) -        self.parent = None -        self.link = None - -    def __str__(self): -        link_str = "" if self.link is None else f" -> {self.link.id}" -        if not self.root: -            j, i = self.tuple() -            return f"[{self.id}, {self.tuple()}, {self.string[j:i + 1]}{link_str}]" -        return f"[{self.id} root{link_str}]" - -    def __repr__(self): -        return f"[{self.id}]" - -    def print_tree(self, spaces=1): -        print(f"{self}") -        for edge in self.children: -            print(f"   " * spaces, end="") -            self.get_child(edge).print_tree(spaces=spaces + 1) - -    def first_char(self): -        return self.string[self.start] - -    def get_child(self, char): -        if char in self.children: -            return self.children[char] -        return None - -    def add_child(self, child): -        child.parent = self -        self.children[child.first_char()] = child -        return child - -    def remove_child(self, child): -        self.children.pop(child.first_char()) - -    @property -    def end_index(self): -        return self.tuple()[1] - -    def tuple(self): -        if self.root: -            raise Exception("Can't get substring of root.") -        if self.end == "#": -            return self.start, self.global_end -        return self.start, self.end - -    @property -    def edge_length(self): -        if self.root: -            return 0 -        else: -            start, end = self.tuple() -            return end - start + 1 - -    def detach(self): -        self.parent.remove_child(self) -        self.parent = None - - -class Point: -    def __init__(self, node, edge="", length=0): -        assert isinstance(node, Node) -        self.node = node -        self.edge = edge -        self.length = length - -    def __repr__(self): -        return f"(Node {self.node.id}'s edge:'{self.edge}', {self.length} along.)" - -    def is_explicit(self):  # a.k.a. is not on an edge -        return self.edge == "" - -    def set_node(self, node): -        self.node = node -        self.edge = "" -        self.length = 0 -        if not self.is_explicit(): -            print("WARNING: Node.set_node", file=sys.stderr) - -    @property -    def edge_node(self) -> Node: -        return self.node.get_child(self.edge) - -    def index_here(self): -        if self.is_explicit(): -            return 0 if self.node.root else self.node.start -        return self.edge_node.start + self.length - 1 - -    def char_here(self): -        return Node.string[self.index_here()] - - -def create_root(): -    assert len(Node.all_nodes) == 0 -    root = Node(None, None) -    root.root = True -    root.link = root -    return root - - -def split_edge(split_point: Point): -    assert not split_point.is_explicit() -    edge = split_point.edge_node -    original = edge.tuple() -    edge.detach() -    mediator = Node(original[0], original[0] + split_point.length - 1) -    edge.start = original[0] + split_point.length -    assert edge.start <= edge.end_index -    mediator.add_child(edge) -    split_point.node.add_child(mediator) -    return mediator - - -def pos(n: int): -    return max(n, 0) - - -def do_phase(root: Node, active: Point, i, last_j, remainder): -    root_point = Point(root) -    Node.global_end += 1 -    did_rule_three = False -    j = last_j + 1 -    node_just_created = None -    while not did_rule_three and j <= i + 1: - -        curr_char = Node.string[i] -        match = char_is_after(active, curr_char) -        if match: -            # print(3) -            remainder += 1 -            if node_just_created is not None: -                node_just_created.link = active.node -            active = skip_count(1, active, i) -            did_rule_three = True -        else: -            # print(2) -            if not active.is_explicit(): -                mediator = split_edge(active) -                mediator.add_child(Node(i, "#")) -                if node_just_created is not None: -                    node_just_created.link = mediator -                node_just_created = mediator -                active.length -= 1 -                if active.length == 0: -                    active.set_node(active.node) -            else: -                active.node.add_child(Node(i, "#")) -                if node_just_created is not None and node_just_created.link is None: -                    node_just_created.link = active.node -            remainder = pos(remainder - 1) -            active.set_node(active.node.link) -            if remainder > 0: -                active = skip_count(remainder, Point(root), i - remainder) -            last_j = j -            j += 1 -        # print(active) -        # root.print_tree() -    return active, remainder, last_j - - -def char_is_after(point: Point, char): -    if point.is_explicit(): -        return char in point.node.children -    else: -        if point.length == point.edge_node.edge_length: -            return Node.string[point.edge_node.start] == char -        else:  # If not at the end of an edge -            # return Node.string[point.index_here() + point.length] == char -            return Node.string[point.index_here() + 1] == char - - -def skip_count(num_chars, start_point: Point, index): -    incoming_length = -1 -    existing_length = 0 -    head = start_point -    chars_left = num_chars -    char = "" - -    if not head.is_explicit(): -        incoming_length = head.edge_node.edge_length - head.length -        if num_chars < incoming_length: -            head.length += num_chars -            return head -        head.set_node(head.edge_node) -        chars_left -= incoming_length -        index += incoming_length - -    # Node.string[i] if head.node.root else Node.string[head.node.end_index + 1] -    # assert head.node.end_index + 1 + chars_left < len(Node.string) -    while chars_left > 0: -        # assert head.node.end_index + 1 + chars_left < len(Node.string) -        direction = Node.string[index] -        next_node = head.node.get_child(direction) -        if next_node is None: -            raise Exception(f"Attempted to traverse char\n '{direction}' at point {head}. ({index=})") -        incoming_length = next_node.edge_length -        if chars_left < incoming_length: -            break -        chars_left -= incoming_length -        index += incoming_length -        head.set_node(next_node) - -    # direction = Node.string[index] - -    if chars_left > 0:  # Landed on an edge -        head.edge = Node.string[index] -        head.length = chars_left - -    return head - - -def ukkonen(string): -    string += "$" -    Node.string = string -    Node.global_end = 0 -    Node.all_nodes.clear() -    n = len(string) -    remainder = 0 -    last_j = 1 -    root = create_root() -    root.add_child(Node(0, "#")) -    active = Point(root) -    for i in range(1, n): -        active, remainder, last_j = do_phase(root, active, i, last_j, remainder) - - -if __name__ == "__main__": -    # ukkonen("DEFDBEFFDDEFFFADEFFB") -    ukkonen("abacabad") -    print("done") -# ukkonen("abcbcbc$") diff --git a/ass2/q2/suffix_array.py b/ass2/q2/suffix_array.py index 61ac445..1c67433 100644 --- a/ass2/q2/suffix_array.py +++ b/ass2/q2/suffix_array.py @@ -1,201 +1,40 @@ -class Node: -    global_end = 0 -    string = "" -    all = [] +from ass2.ukkonen import Node, ukkonen -    def __init__(self, data=None): -        self.data = data -        self.children = [] -        self.id = len(self.all) -        self.all.append(self) -        self.parent = None -        self.link = None -    def __str__(self): -        tup = self.get_tuple() -        link_str = "" if self.link is None else f" -> {self.link.id}" -        if tup is not None: -            j, i = tup -            return f"[{self.id}, {self.data}, {self.string[j:i + 1]}{link_str}]" -        return f"[{self.id} root{link_str}]" +def rank(char): +    return ord("a") - ord(char) if char != "$" else 26 -    def __repr__(self): -        return f"[{self.id}]" -    def print_children(self): -        print(self, end=" contains: ") -        for child in self.children: -            print(child, end=", ") +# tree = ukkonen("abacabad") +# # tree.print_tree() +# for node in Node.all_nodes: +#     if not node.children: +#         print(node) -    def add_child(self, child): -        if child: -            child.parent = self -            self.children.append(child) -    def num_chars_deep(self): -        if self.data is None: -            return 0 -        else: -            return self.parent.num_chars_deep() + self.edge_length() +def depth_first_apply(node: Node, func): +    if not node.children: +        func(node.suffix_index) +    else: +        for child in node.children.ordered_items(): +            depth_first_apply(child, func) -    def remove_child(self, child): -        self.children.remove(child) -    def bastardise(self): -        self.parent.remove_child(self) -        self.parent = None +# def calc_suffix_index(): +#     lookup = {} +#     counter = 0 +#     # leaves = filter(lambda n: not n.children, Node.all_nodes) +#     for node in Node.all_nodes: +#         if not node.children: +#             lookup +# +#     print(list(leaves)) -    def add_suffix_link(self, destination): -        assert self.link is None -        assert len(self.children) > 0 -        assert isinstance(destination, Node) -        self.link = destination -    def follow_link(self): -        assert self.link is not None -        return self.link +def accum(): +    array = [] +    depth_first_apply(tree, lambda x: array.append(x)) +    return array -    def get_tuple(self): -        if self.data is not None and self.data[1] == "#": -            return self.data[0], self.global_end -        return self.data - -    def first_letter(self): -        return self.string[self.data[0]] - -    def get_child(self, char): -        for child in self.children: -            if child.first_letter() == char: -                return child -        return None - -    def edge_length(self): -        if self.data is None: -            return 0 -        else: -            substring = self.get_tuple() -            return substring[1] - substring[0] + 1 - -    def print_tree(self, spaces=0): -        print(f"{self}") -        for child in self.children: -            print(f"   " * spaces, end="") -            child.print_tree(spaces=spaces + 1) - - -def branch_from_point(active_node, active_edge, active_length, j, need_link_to): -    edge = active_node.get_child(active_edge) -    edge.bastardise() -    original_substr = edge.get_tuple() -    edge.data = (edge.data[0] + active_length, edge.data[1]) -    mediator = Node((original_substr[0], original_substr[0] + active_length - 1)) -    active_node.add_child(mediator) -    mediator.add_child(edge) -    mediator.add_child(Node((j+1, '#'))) -    # assert j+1 != 12 -    return mediator - - -def traverse_down(string, substring, active_node, active_edge, active_length, remainder): -    j, i = substring -    traversed = 0 -    g = 0 -    char = "" -    remainder -= 0 if active_node.data is None else active_node.data[0] + 1 -    while remainder > 0: - -        char = string[i + traversed] -        path = None -        if active_length > 0: -            path = active_node.get_child(active_edge) -        else: -            path = active_node.get_child(char) -        if path is None: -            raise Exception(f"Tried to go down '{char}' from point: {active_node, active_edge, active_length}") -        traversed += path.edge_length() -        g = min(remainder, path.edge_length()) -        # if path is None: -        #     return active_node, active_edge, active_length -        if remainder >= path.edge_length() - active_length: -            active_node = path -            print(active_node.link is None) -            active_length = 0 -            active_edge = '' -            remainder -= path.edge_length() - active_length -        else: -            active_length = remainder -            active_edge = char if active_length != 0 else None -            break -    active_length = remainder -    return active_node, active_edge, active_length - - -def char_match_after(char, active_node, active_edge, active_length): -    if active_length < 1: -        return active_node.get_child(char) is not None -    edge = active_node.get_child(active_edge) -    position = edge.data[0] + active_length -    return Node.string[position] == char - -def suffix_tree(string): -    # string += "$" -    Node.string = string -    n = len(string) -    last_j = 0 -    root = Node() -    root.add_child(Node((0, '#'))) -    root.add_suffix_link(root) -    need_link_to = root -    active_node = root -    active_edge = "" -    active_length = 0 -    remainder = 0 -    for i in range(1, n): - -        # if active_node.link is not None: -        #     active_node = active_node.link -        # if active_node == active_node.link: -            # remainder -= 1 -        Node.global_end += 1  # Rule 1s implicitly -        for j in range(last_j + 1, i + 1): -            print(string[0:i + 1], end=": ") -            print(active_node, active_edge, active_length) -            # if 2 or more letter string, just match the last character at the current position (we already went -            # there) i.e. i i think -            char = string[i] if i - j <= 1 else string[i] -            match = char_match_after(char, active_node, active_edge, active_length) -            if match:  # Rule 3 -                print(3, -j) -                remainder += 1 -                active_node, active_edge, active_length = traverse_down(string, (j, i), active_node, active_edge, -                                                                        active_length, remainder) -                break -            else:  # Rule 2 -                print(2, -j) -                # new_node = None -                if active_length > 0:  # If the pointer is on an edge -                    # if active_node.link is not None: -                    #     active_node = active_node.link -                    new_node = branch_from_point(active_node, active_edge, active_length, j, need_link_to) -                    new_node.add_suffix_link(need_link_to) -                    need_link_to = new_node -                    print(f"*** I want to go from {repr(active_node)} -> {repr(active_node.link)} ***") -                    # if active_node.link is not None: -                    #     active_node = active_node.link -                else:  # If the pointer is on a node -                    new_node = Node((j, '#')) -                    active_node.add_child(new_node) - -                active_edge = "" -                remainder = max(remainder - 1, 0) -                active_length = 0 -                last_j = j - -            print(f"{i=} '{string[i]}' => {(active_node, active_edge, active_length)}, {remainder}") -        # print(f"{i=} '{string[i]}' => {(active_node, active_edge, active_length)}, {remainder}") -            root.print_tree() -            print("\n" * 10) - - -if __name__ == "__main__": -    suffix_tree("abcabxabcd") +tree = ukkonen("mississippi") +print(accum()) | 
