diff options
| -rw-r--r-- | ass2/q3/lcp.py | 108 | ||||
| -rw-r--r-- | ass2/q4/Makefile | 3 | ||||
| -rw-r--r-- | ass2/q4/fibonacci.md | 82 | ||||
| -rw-r--r-- | ass2/ukkonen.py | 16 | 
4 files changed, 174 insertions, 35 deletions
| diff --git a/ass2/q3/lcp.py b/ass2/q3/lcp.py index 96bd3ff..7e90ae0 100644 --- a/ass2/q3/lcp.py +++ b/ass2/q3/lcp.py @@ -1,4 +1,7 @@ -from ass2.ukkonen import ukkonen, skip_count, Point, char_is_after, Node +from random import randrange + +from ass2.ukkonen import ukkonen, Point, char_is_after, Node, skip_count +from itertools import product  # s1 = "abdabc" @@ -6,38 +9,91 @@ from ass2.ukkonen import ukkonen, skip_count, Point, char_is_after, Node  # s1 = "abaabaaba"  # s2 = "baabaaabaa" -def longest_prefix(root: Node, s1: str, s2: str, suffixes: tuple): -    print(s1[suffixes[0]:]) -    print(s2[suffixes[1]:]) -    matches = 0 -    if suffixes[0] == 0: -        point = Point(root, s1[0], 1) -    else: -        point = skip_count(suffixes[0], Point(root), 0) -    for i in range(suffixes[1], len(s2) + 1): -        print(s2[i], end=" ") -        if char_is_after(point, s2[i]): -            if point.is_explicit(): -                point = Point(point.node, s2[i], 1) -            else: -                point = skip_count(1, point, i) -            matches += 1 -            print("found") +# def longest_prefix(root: Node, s1: str, s2: str, suffixes: tuple): +#     print(s1[suffixes[0]:]) +#     print(s2[suffixes[1]:]) +#     matches = 0 +#     if suffixes[0] == 0: +#         point = Point(root, s1[0], 1) +#     else: +#         point = skip_count(suffixes[0], Point(root), 0) +#     for i in range(suffixes[1], len(s2) + 1): +#         print(s2[i], end=" ") +#         if char_is_after(point, s2[i]): +#             if point.is_explicit(): +#                 point = Point(point.node, s2[i], 1) +#             else: +#                 point = skip_count(1, point, i) +#             matches += 1 +#             print("found") +#         else: +#             print("not found") +#             break +#     return matches + +def naive(s1, s2, offset): +    count = 0 +    for t in list(map(lambda x: x[0] == x[1], zip(s1[offset[0]:], s2[offset[1]:]))): +        if t: +            count += 1 +        else: +            return count +    return count + + +def lcp(root: Node, s1: str, s2: str, offset: tuple): +    index = 0 +    head = Point(root) +    shortest = min(len(s1) - offset[0], len(s2) - offset[1]) +    while index < shortest: +        if s1[index + offset[0]] == s2[index + offset[1]]: +            going_to = head.node.get_child(s1[index + offset[0]]) +            head.set_node(going_to) +            index += going_to.edge_length          else: -            print("not found") -            break -    return matches +            return index +    return index + + +def test_everything_for(tree, s1, s2): +    # print(s1, s2) +    for i in range(len(s1)): +        for j in range(len(s2)): +            result = lcp(tree, s1, s2, (i, j)) +            a = naive(s1, s2, (i, j)) +            assert result == a + + +def new_string(max_letters, max_len): +    return "".join([chr(randrange(ord("a"), ord("a") + max_letters)) for _ in range(0, max_len)]) + + +def test(max_letters, max_len): +    alphabet = "".join(map(chr, range(ord("a"), ord("a") + max_letters))) +    counter = 0 +    while True: +        s1, s2 = new_string(max_letters, max_len), new_string(max_letters, max_len) +        tree = ukkonen(s1 + "z" + s2) +        test_everything_for(tree, s1, s2) +        counter += 1 +        if counter % 1000 == 0: +            print(counter)  def main():      s1 = "abcdacbdab"      s2 = "dacbdabc" -    print(s1) -    tree = ukkonen(s1) +    print(s1 + "&" + s2) +    tree = ukkonen(s1 + "&" + s2)      tree.print_tree() -    result = longest_prefix(tree, s1, s2, (0, 5)) -    print(result) + +    # tup = (4, 5) +    # result = longest_prefix(tree, s1, s2, (0, 5)) +    # result = lcp(tree, s1, s2, tup) +    # print(naive(s1, s2, tup)) +    # print(result)  if __name__ == "__main__": -    main() +    test(4, 50) +    # main() diff --git a/ass2/q4/Makefile b/ass2/q4/Makefile index 8116920..6962fd9 100644 --- a/ass2/q4/Makefile +++ b/ass2/q4/Makefile @@ -1,4 +1,3 @@  build: fibonacci.md  -	pandoc -V papersize:a4paper --pdf-engine=xelatex --filter pandoc-eqnos  fibonacci.md -o fibonacci.pdf - +	pandoc -V papersize:a4paper --pdf-engine=xelatex --filter pandoc-eqnos fibonacci.md -o fibonacci.pdf  # -f markdown+hard_line_breaks
\ No newline at end of file diff --git a/ass2/q4/fibonacci.md b/ass2/q4/fibonacci.md index 7cda442..f550e1b 100644 --- a/ass2/q4/fibonacci.md +++ b/ass2/q4/fibonacci.md @@ -234,3 +234,85 @@ This is not what I expected, since the amortised cost should take into account t  In a large amount of cases, a single operation does not need to go through every level of depth in the tree of the highest degree. It doesn't seem to take into account that a some operations consist of just decreasing the key and not deleting anything at all. I would expect something closer to constant time.  # Part i + +There are 3 cases to consider when performing `decrease_key`. + +## Case 1 + +In the case where a node's key is decreased without violating the properties of the heap, the number of marked nodes and number of nodes in the root list doesn't change. This means $\Phi$ is unaffected in these situations, +meaning $\alpha$ does not matter. This means the amortised cost for this trivial case is $ac_i = O(1)$. So we must only consider situations where the heap property *is* violated.  + +## Case 2a + +When a node with an **unmarked** parent is deleted, the original node is moved to the root list. One node (the parent) is also marked (leading the next case): $\Delta T(H) = 1$ and $\Delta M(H) = 1$. + +## Case 2b + +When a node with a **marked** parent is deleted, two nodes (the original node and the parent) are moved to the root list. That parent loses its mark, decreasing the overall marked nodes by one.  +One node (the grandparent) is also marked, but that credit is to be used up when one of the grandparent's other children are deleted later, so it is not relevant for this calculation. +$\Delta T(H) = 2$ and $\Delta M(H) = -1$. + +The difference in each parameter is summarised in this table: + +|$\Delta$|1|2a|2b| +|-|-|-|-| +|T(H)|0|$+1$|$+2$| +|M(H)|0|$+1$|$-1$| + +## Deriving $\alpha$ + +Consulting the above table, $1 + 2 = 3$ nodes are added to the root list when a mark is added then removed, since a Case 2a creates a case 2b. + +After either of these two cases are complete, we want the total change in cost of the heap to be *just* the number of trees added to the root list (*i.e.* $T(H)$) cancelling out $M(H)$. +This is because an `extract_min` operation will consolidate the stack, using exactly $T(H)$ potential. This leaves us with 0 net potential after a consolidation is done. + +We can form the following equation: +\begin{align*} +    \Phi(D_i) - \Phi(D_{i-1}) &= 3 \\ +    \Big[\Delta T(H) + \alpha \Delta M(H)\Big]_{i} - \Big[]\Delta T(H) + \alpha \Delta M(H)\Big]_{i-1} &= 3 \\ +    (1+ 1\alpha) - (2 + -\alpha) &= 3 \\ +    (1+ \alpha) - (2 -\alpha) &= 3 +\end{align*} + +Then solve for $\alpha$: + +\begin{align*} +    (1 + \alpha) - (2 -\alpha) &= 3 \\ +    2\alpha - 1 &= 3 \\ +    \alpha &= \frac{3 + 1}{2} \\ +    \alpha &= 2 +\end{align*} + +Therefore, the value for $\alpha$ is 2. + +## Amortised cost + +Plugging this information in gives us: $\Phi = T(H) + 2M(H)$ + +Since we know that the true cost $tc_i$ for `decrease_key` is $O(1)$ for reasons shown in Part g we can say that: $$ac_i = O(1) + (\Phi(D_{i}) - \Phi(D_{i-1}))$$ + +In a Case 1 situation, $\Phi(D_{i}) - \Phi(D_{i-1}) = 0$ since simply updating the key without violating the heap property does not affect the potential $\Phi$. This makes Case 1: $ac_i = O(1) + O(1) = O(1)$. + +In Case 2a and 2b, we know that once a case 2a is done, it marks a node, creating a case 2b (as explained earlier). +This means that: +\begin{align*} +    ac_i &= O(1) + \Phi(D_i) - \Phi(D_{i-1}) \\ +    &= O(1) + \Big[\Delta T(H) + 2\Delta M(H)\Big]_{i} - \Big[\Delta T(H) + 2\Delta M(H)\Big]_{i-1} \\ +    &= O(1) + (1 + 2 \times 1) - (2 + 2 \times -1) \\ +    &= O(1) + (1 + 2) - (2 - 2) \\ +    &= O(1) + 3\\ +    &= O(1)\\ +\end{align*} + +Therefore, using $\alpha = 2$ the amortised cost of `extract_min` is $O(1)$. + +# Part j + +Intuitively, $\alpha = 2$ makes sense because the marking of a node is exactly two delete operations (and therefore two new root nodes) ``waiting to happen''. + +A node can only be marked once before it itself is deleted. In order to ``get rid of'' a mark on a node, two nodes (a child and the node itself) need to be deleted. +This is why it makes sense for each marking operation to be worth 2 cost. + +Another thing to note is that 2 is the only positive integer constant that is related marking nodes, regardless of all parameters for the reasons stated above.  +The number 2 in this case is kind of baked into the idea of marking nodes in a Fibonacci heap. + diff --git a/ass2/ukkonen.py b/ass2/ukkonen.py index b4b0b98..bc8b563 100644 --- a/ass2/ukkonen.py +++ b/ass2/ukkonen.py @@ -1,10 +1,11 @@  import sys +ALPHABET_SIZE = 28  class OrderedDict(dict):      def __init__(self):          super().__init__() -        self.first_letters = [None for _ in range(27)] +        self.first_letters = [None for _ in range(ALPHABET_SIZE)]      def __setitem__(self, key, value):          super().__setitem__(key, value) @@ -22,10 +23,13 @@ class OrderedDict(dict):      @staticmethod      def rank(char): -        # result = 26 if char == "$" else ord(char) - 97 -        result = 0 if char == "$" else ord(char) - 96 -        assert result in range(0, 27) -        return result +        if char == "$": +            return 26 +        elif char == "&": +            return 27 +        else: +            return ord(char) - 96 +  class Node: @@ -118,8 +122,6 @@ class Point:          self.node = node          self.edge = ""          self.length = 0 -        if not self.is_explicit(): -            print("WARNING: Node.set_node", file=sys.stderr)      @property      def edge_node(self) -> Node: | 
