diff options
| author | akiyamn | 2021-04-27 14:12:47 +1000 | 
|---|---|---|
| committer | akiyamn | 2021-04-27 14:12:47 +1000 | 
| commit | 1695a8e7c9e0d345918946ef6fbc8f56c7751e32 (patch) | |
| tree | 41829151bb336da54aab6b13ff4c47b3a6454698 /ass2/q2/suffix_array.py | |
| parent | e347fd3246a4282d7fea85c1ae727e48c810480b (diff) | |
| download | fit3155-1695a8e7c9e0d345918946ef6fbc8f56c7751e32.tar.gz fit3155-1695a8e7c9e0d345918946ef6fbc8f56c7751e32.zip | |
Ass 2: Guts working for suffix array
Diffstat (limited to 'ass2/q2/suffix_array.py')
| -rw-r--r-- | ass2/q2/suffix_array.py | 219 | 
1 files changed, 29 insertions, 190 deletions
| diff --git a/ass2/q2/suffix_array.py b/ass2/q2/suffix_array.py index 61ac445..1c67433 100644 --- a/ass2/q2/suffix_array.py +++ b/ass2/q2/suffix_array.py @@ -1,201 +1,40 @@ -class Node: -    global_end = 0 -    string = "" -    all = [] +from ass2.ukkonen import Node, ukkonen -    def __init__(self, data=None): -        self.data = data -        self.children = [] -        self.id = len(self.all) -        self.all.append(self) -        self.parent = None -        self.link = None -    def __str__(self): -        tup = self.get_tuple() -        link_str = "" if self.link is None else f" -> {self.link.id}" -        if tup is not None: -            j, i = tup -            return f"[{self.id}, {self.data}, {self.string[j:i + 1]}{link_str}]" -        return f"[{self.id} root{link_str}]" +def rank(char): +    return ord("a") - ord(char) if char != "$" else 26 -    def __repr__(self): -        return f"[{self.id}]" -    def print_children(self): -        print(self, end=" contains: ") -        for child in self.children: -            print(child, end=", ") +# tree = ukkonen("abacabad") +# # tree.print_tree() +# for node in Node.all_nodes: +#     if not node.children: +#         print(node) -    def add_child(self, child): -        if child: -            child.parent = self -            self.children.append(child) -    def num_chars_deep(self): -        if self.data is None: -            return 0 -        else: -            return self.parent.num_chars_deep() + self.edge_length() +def depth_first_apply(node: Node, func): +    if not node.children: +        func(node.suffix_index) +    else: +        for child in node.children.ordered_items(): +            depth_first_apply(child, func) -    def remove_child(self, child): -        self.children.remove(child) -    def bastardise(self): -        self.parent.remove_child(self) -        self.parent = None +# def calc_suffix_index(): +#     lookup = {} +#     counter = 0 +#     # leaves = filter(lambda n: not n.children, Node.all_nodes) +#     for node in Node.all_nodes: +#         if not node.children: +#             lookup +# +#     print(list(leaves)) -    def add_suffix_link(self, destination): -        assert self.link is None -        assert len(self.children) > 0 -        assert isinstance(destination, Node) -        self.link = destination -    def follow_link(self): -        assert self.link is not None -        return self.link +def accum(): +    array = [] +    depth_first_apply(tree, lambda x: array.append(x)) +    return array -    def get_tuple(self): -        if self.data is not None and self.data[1] == "#": -            return self.data[0], self.global_end -        return self.data - -    def first_letter(self): -        return self.string[self.data[0]] - -    def get_child(self, char): -        for child in self.children: -            if child.first_letter() == char: -                return child -        return None - -    def edge_length(self): -        if self.data is None: -            return 0 -        else: -            substring = self.get_tuple() -            return substring[1] - substring[0] + 1 - -    def print_tree(self, spaces=0): -        print(f"{self}") -        for child in self.children: -            print(f"   " * spaces, end="") -            child.print_tree(spaces=spaces + 1) - - -def branch_from_point(active_node, active_edge, active_length, j, need_link_to): -    edge = active_node.get_child(active_edge) -    edge.bastardise() -    original_substr = edge.get_tuple() -    edge.data = (edge.data[0] + active_length, edge.data[1]) -    mediator = Node((original_substr[0], original_substr[0] + active_length - 1)) -    active_node.add_child(mediator) -    mediator.add_child(edge) -    mediator.add_child(Node((j+1, '#'))) -    # assert j+1 != 12 -    return mediator - - -def traverse_down(string, substring, active_node, active_edge, active_length, remainder): -    j, i = substring -    traversed = 0 -    g = 0 -    char = "" -    remainder -= 0 if active_node.data is None else active_node.data[0] + 1 -    while remainder > 0: - -        char = string[i + traversed] -        path = None -        if active_length > 0: -            path = active_node.get_child(active_edge) -        else: -            path = active_node.get_child(char) -        if path is None: -            raise Exception(f"Tried to go down '{char}' from point: {active_node, active_edge, active_length}") -        traversed += path.edge_length() -        g = min(remainder, path.edge_length()) -        # if path is None: -        #     return active_node, active_edge, active_length -        if remainder >= path.edge_length() - active_length: -            active_node = path -            print(active_node.link is None) -            active_length = 0 -            active_edge = '' -            remainder -= path.edge_length() - active_length -        else: -            active_length = remainder -            active_edge = char if active_length != 0 else None -            break -    active_length = remainder -    return active_node, active_edge, active_length - - -def char_match_after(char, active_node, active_edge, active_length): -    if active_length < 1: -        return active_node.get_child(char) is not None -    edge = active_node.get_child(active_edge) -    position = edge.data[0] + active_length -    return Node.string[position] == char - -def suffix_tree(string): -    # string += "$" -    Node.string = string -    n = len(string) -    last_j = 0 -    root = Node() -    root.add_child(Node((0, '#'))) -    root.add_suffix_link(root) -    need_link_to = root -    active_node = root -    active_edge = "" -    active_length = 0 -    remainder = 0 -    for i in range(1, n): - -        # if active_node.link is not None: -        #     active_node = active_node.link -        # if active_node == active_node.link: -            # remainder -= 1 -        Node.global_end += 1  # Rule 1s implicitly -        for j in range(last_j + 1, i + 1): -            print(string[0:i + 1], end=": ") -            print(active_node, active_edge, active_length) -            # if 2 or more letter string, just match the last character at the current position (we already went -            # there) i.e. i i think -            char = string[i] if i - j <= 1 else string[i] -            match = char_match_after(char, active_node, active_edge, active_length) -            if match:  # Rule 3 -                print(3, -j) -                remainder += 1 -                active_node, active_edge, active_length = traverse_down(string, (j, i), active_node, active_edge, -                                                                        active_length, remainder) -                break -            else:  # Rule 2 -                print(2, -j) -                # new_node = None -                if active_length > 0:  # If the pointer is on an edge -                    # if active_node.link is not None: -                    #     active_node = active_node.link -                    new_node = branch_from_point(active_node, active_edge, active_length, j, need_link_to) -                    new_node.add_suffix_link(need_link_to) -                    need_link_to = new_node -                    print(f"*** I want to go from {repr(active_node)} -> {repr(active_node.link)} ***") -                    # if active_node.link is not None: -                    #     active_node = active_node.link -                else:  # If the pointer is on a node -                    new_node = Node((j, '#')) -                    active_node.add_child(new_node) - -                active_edge = "" -                remainder = max(remainder - 1, 0) -                active_length = 0 -                last_j = j - -            print(f"{i=} '{string[i]}' => {(active_node, active_edge, active_length)}, {remainder}") -        # print(f"{i=} '{string[i]}' => {(active_node, active_edge, active_length)}, {remainder}") -            root.print_tree() -            print("\n" * 10) - - -if __name__ == "__main__": -    suffix_tree("abcabxabcd") +tree = ukkonen("mississippi") +print(accum()) | 
