diff options
| author | akiyamn | 2021-04-30 11:49:04 +1000 | 
|---|---|---|
| committer | akiyamn | 2021-04-30 11:49:04 +1000 | 
| commit | 8159731e3a51027c44486caaa4458d9214753d43 (patch) | |
| tree | ece3f2119b0691ab4a6824e6de961547de8bf661 | |
| parent | 3ec37656910e3c71569408bd9be2dfa8db06efc2 (diff) | |
| download | fit3155-8159731e3a51027c44486caaa4458d9214753d43.tar.gz fit3155-8159731e3a51027c44486caaa4458d9214753d43.zip | |
Ass 2: Almost done
| -rw-r--r-- | ass2/q2/suffix_array.py | 18 | ||||
| -rw-r--r-- | ass2/q3/lcp.py | 43 | ||||
| -rw-r--r-- | ass2/q4/Makefile | 4 | ||||
| -rw-r--r-- | ass2/q4/fibonacci.md | 236 | ||||
| -rw-r--r-- | ass2/ukkonen.py | 6 | 
5 files changed, 298 insertions, 9 deletions
| diff --git a/ass2/q2/suffix_array.py b/ass2/q2/suffix_array.py index e743180..edfaa80 100644 --- a/ass2/q2/suffix_array.py +++ b/ass2/q2/suffix_array.py @@ -15,18 +15,24 @@ def read_in_string(filename):          return file.read() -def write_suffix_array(tree, filename): +def suffix_array(string): +    tree = ukkonen(string) +    buffer = [] +    depth_first_apply(tree, buffer.append) +    return buffer + + +def write_suffix_array(string, filename):      with open(filename, "w") as file: -        buffer = [] -        depth_first_apply(tree, buffer.append) +        buffer = suffix_array(string)          file.write("\n".join(map(str, buffer)))  def main():      assert len(sys.argv) == 2      string = read_in_string(sys.argv[1]) -    tree = ukkonen(string) -    write_suffix_array(tree, "output_suffix_array.txt") +    write_suffix_array(string, "output_suffix_array.txt") -main() +if __name__ == "__main__": +    main() diff --git a/ass2/q3/lcp.py b/ass2/q3/lcp.py new file mode 100644 index 0000000..96bd3ff --- /dev/null +++ b/ass2/q3/lcp.py @@ -0,0 +1,43 @@ +from ass2.ukkonen import ukkonen, skip_count, Point, char_is_after, Node + + +# s1 = "abdabc" +# s2 = "daba" +# s1 = "abaabaaba" +# s2 = "baabaaabaa" + +def longest_prefix(root: Node, s1: str, s2: str, suffixes: tuple): +    print(s1[suffixes[0]:]) +    print(s2[suffixes[1]:]) +    matches = 0 +    if suffixes[0] == 0: +        point = Point(root, s1[0], 1) +    else: +        point = skip_count(suffixes[0], Point(root), 0) +    for i in range(suffixes[1], len(s2) + 1): +        print(s2[i], end=" ") +        if char_is_after(point, s2[i]): +            if point.is_explicit(): +                point = Point(point.node, s2[i], 1) +            else: +                point = skip_count(1, point, i) +            matches += 1 +            print("found") +        else: +            print("not found") +            break +    return matches + + +def main(): +    s1 = "abcdacbdab" +    s2 = "dacbdabc" +    print(s1) +    tree = ukkonen(s1) +    tree.print_tree() +    result = longest_prefix(tree, s1, s2, (0, 5)) +    print(result) + + +if __name__ == "__main__": +    main() diff --git a/ass2/q4/Makefile b/ass2/q4/Makefile new file mode 100644 index 0000000..8116920 --- /dev/null +++ b/ass2/q4/Makefile @@ -0,0 +1,4 @@ +build: fibonacci.md  +	pandoc -V papersize:a4paper --pdf-engine=xelatex --filter pandoc-eqnos  fibonacci.md -o fibonacci.pdf + +# -f markdown+hard_line_breaks
\ No newline at end of file diff --git a/ass2/q4/fibonacci.md b/ass2/q4/fibonacci.md new file mode 100644 index 0000000..7cda442 --- /dev/null +++ b/ass2/q4/fibonacci.md @@ -0,0 +1,236 @@ +--- +title: Assignment 2, **Question 4** - FIT3155 +author: Alexander Occhipinti - 29994705 +date: Week 8 of Semester 2, 2021 +geometry: margin=2cm +header-includes: +- \usepackage{cancel} +- \usepackage{qtree} +--- + +#  Part a +Firstly, the sum of the amortised cost for every state $i$ of $D$ can be expressed as: + +$$ +\sum_{i=1}^n ac_{i} = \sum_{i=1}^n tc_{i} + \sum_{i=1}^n \Phi(D_{i}) - \Phi(D_{i-1})   +$$ {#eq:1} + +This is just taking equation 3 from the assignment paper, summing it then breaking each part into separate sums. + +Since we know that $0 < i \leq n$, we can expand out $\sum_{i=1}^n \Phi(D_{i})$ into: + +$$ +\sum_{i=1}^n \Phi(D_{i}) = \Phi(D_{n}) + \Phi(D_{n-1}) + \Phi(D_{n-2}) + \ldots + \Phi(D_{i}) + \Phi(D_{i-1}) + \ldots + \Phi(D_{1})  +$${#eq:2} + +$D_{i-1}$ is defined as the stage of the data structure before $D_{i}$. This means we can expand the right-hand side of (@eq:1) as: +\begin{align*} +\sum_{i=1}^n \Phi(D_{i}) - \Phi(D_{i-1}) = \Phi(D_{n})\;+\;&\cancel{\Phi(D_{n-1})} + \cancel{\Phi(D_{n-2})} + \ldots + \cancel{\Phi(D_{i})} + \cancel{\Phi(D_{i-1})} + \ldots + \cancel{\Phi(D_{1})} \\ +-\;&\cancel{\Phi(D_{n-1})} + \cancel{\Phi(D_{n-2})} + \ldots + \cancel{\Phi(D_{i})} + \cancel{\Phi(D_{i-1})} + \ldots + \cancel{\Phi(D_{1})} + \Phi(D_{0}) +\end{align*} +These expanded terms of $\Phi(D_{i})$ and $\Phi(D_{i-1})$ then cancel out to become $\Phi(D_{n}) - \Phi(D_{0})$. Therefore, it can be concluded that: +$$\sum_{i=1}^n ac_{i} = \sum_{i=1}^n tc_{i} + \sum_{i=1}^n \Phi(D_{n}) - \Phi(D_{0}) = \sum_{i=1}^n tc_{i} + \Phi(D_{n}) - \Phi(D_{0})$$ +This is the derived right-hand side. + +# Part b + +The true cost of inserting an element into a Fibonacci heap is $O(1)$.  + +This is because adding a new element to the heap is as simple as adding it directly to the end of the root list, and checking if it is the minimum element. Both of these operations are $O(1)$ each, combining to make $O(1)$. + +For a heap in state $H_{i-1}$, let $T(H)$ be the number of nodes in the root list of $H$. +When we insert a node into $H$, it is put directly into the root list. This means that the state after the insert has 1 more node than it did before the insert. +This means: +$$\Phi(D_{i}) - \Phi(D_{i-1})  = T(H) - T(H) + 1 = 1 $$ +So the change in the potential function increases by only 1 unit per insert. +This then shows that the amortised cost is: +\begin{align*} +    ac_i &= O(1) + 1 \\ +    &= O(1). +\end{align*} + +# Part c + +## True cost + +When `extract_min` is performed, the following three operations are performed: + +1. `find_min` (shown to be $O(1)$ in true and amortised cost in the question.) +2. Delete the min and promote its children +3. Consolidate. + +Deleting the minimum node (which is in the root list) will take in the worst possible case $O(D_{max}(N))$ operations.  +This is because $O(1)$ work is being done for each of the min's children, which could be at worst $D_{max}(N)$ children. This is true because a root of degree $k$ has $k$ children. + +Consolidating requires going over every root in the root list and checking whether it needs to be merged with another or not. This requires $O(1)$ work for $T(H)$ nodes, resulting in $O(T(H))$ complexity. + +Adding the time taken by these three operations results in: + +\begin{align*} +    tc_i &= O(1) + O(D_{max}(N)) + O(T(H)) \\ +    &= O\big(D_{max}(N) + T(H)\big) \text{ time.} +\end{align*} + +## Potential cost + +The potential function $\Phi$ we are using is the number of nodes in the root list. +When we delete the minimum element from the heap, its children become root nodes themselves. This means that the potential function increases by  +$D_{max}(N)$ (minus 1, but this isn't important for the sake of the cost function) as it is the maximum number of children that a node could have. +Therefore, the change in potential is: +$$ +\Phi(D_{i}) - \Phi(D_{i-1}) = D_{max}(N) +$$ + +### Scaling units of potential + +Plugging this into the original amortised cost equation: +\begin{align*} +    ac_i &= tc_i + m[\Phi(D_{i}) - \Phi(D_{i-1})] \\ +    &= O\big(D_{max}(N) + T(H)\big) + m[\Phi(D_{i}) - \Phi(D_{i-1})] \\ +    &= O\big(D_{max}(N) + T(H)\big) + m[D_{max}(N)] +\end{align*} + +# Part d + +Consider a Fibonacci heap where $D_{max}(N) = 3$ named $H_3$. + +In order to fill $H_3$ with the most amount of nodes possible without changing $D_{max}(N)$ to $4$,  +we need to add exactly one root node of each previous degree to the root list. This looks like so: + +$$\Tree [.H_3 0 [.1 0 ] [.2 [.1 0 ] 0 ] [.3 [.2 [.1 0 ] 0 ] [.1 0 ] 0 ]     ]$$ + +If we were to consolidate a tree filled like this, it would remain the same. If we add just one more node and consolidate, the whole heap will collapse in on itself and create a heap with a single root node of degree 4. +For a node of degree $k$, if nothing has been deleted below it, it will have $2^k$ nodes. +In $H_3$'s case, it has 15 nodes: +$$N = 2^0 + 2^1 + 2^2 + 2^3 = 1 + 2 + 4 + 8 = 15 $$  +Like mentioned earlier, adding one more node (16 nodes total) turns the heap into a single degree 4 root in the root list. This makes sense since $2^4 = 16$. +For $H_3$ the assertion that $D_{max}(N) \leq \lfloor\log_2(N)\rfloor$ holds true since:  +\begin{align*} +    3 &\leq \lfloor\log_2(15)\rfloor \\ +    &\leq \lfloor 3.907\ldots \rfloor \\ +    &\leq 3 +\end{align*} + +More generally, consider a Fibonacci heap where $D_{max}(N) = k$.  +This heap is maximally filled in the same way as the above example. It consists of exactly one root node of degree $k$ or smaller. $k$, $k-1$, $k-2$, \ldots $\;$ $0$. +$$\Tree [.H 0 [.1 0 ] {...} \qroof{\; ... \;}.k-1 \qroof{\; ... \;}.k ]$$ +If we were to add one more node and consolidate, the heap will consist of a root list containing just a degree $k+1$ node. This would no longer be a $D_{max}(N) = k$ heap. +Given that for a node of degree $k$, if nothing has been deleted below it, it will have $2^k$ nodes, we can calculate the maximum number of nodes in a heap of max degree $k$. +\begin{align*} +    &N = \sum_{d=0}^{k} \; d^2 \\ +    &\therefore \; k \leq \lfloor\log_2(N)\rfloor +\end{align*} +Substituting  $D_{max}(N)$ back in for $k$: +$$ +D_{max}(N) \leq \lfloor\log_2(N)\rfloor. +$$ + +# Part e + +Let $N_k$ denote the number of nodes in a *maximally decreased* tree of degree $k$. +By definition, a node of degree $k$ must have exactly $k$ children. By the rules of Fibonacci heaps and binomial trees, each of these children are subtrees with their own degrees. + +By looking at the maximally decreased trees shown in Figure 1 on the assignment sheet, it can be seen that the number of nodes in a tree follows the Fibonacci sequence. +This is evident from the fact that $N_0$, $N_1$, $N_2$ and $N_3$ are 1, 2, 3 and 5 nodes respectively. This also holds true for $N_4$ which has 8 nodes. + +Let $F_n$ represent the $n^{\text{th}}$ number in the Fibonacci sequence, defined in the same way as Figure 1 in the assignment sheet. (*i.e.* $F_0 = 0$, $F_1 = 1$, $F_2 = 1$ etc\ldots) + +The number of nodes in a maximally decreased tree of degree $k$ is: $N_k = F_{k+2}$. + +# Part f + +Using the definitions of $N_k$ and $F_n$ defined in Part e, we want to show that $N_k = F_{k+2}$. + +## Base cases +For $N_0$ this holds true since there is 1 node in a maximally decreased tree of degree 0: the root. If you were to delete this node, the tree would not exist. 1 is the $0+2 = 2^{\text{nd}}$ number in the Fibonacci sequence. +Therefore: $N_0 = F_{2} = 1$. + +This also holds true for $N_1$. Since deleting a tree of degree 1's only child will decrement the degree, a node of degree 1 is already maximally decreased. The number of nodes in this tree is $2$ and $N_1 = F_{2+1} = F_{3} = 2$. + +## Induction + +To define a maximally decreased tree of degree $k$, we must delete as many nodes as we can from that tree while preserving the degree of the tree's root.  +The root must have $k$ direct children at all times in order to remain a tree of degree $k$. + +Assume that this property holds for a maximally decreased tree $T$ of degree $k$. (*i.e.* $N_k = F_{k+2}$) *and* for a maximally decreased tree  of degree $k-1$. (*i.e.* $N_{k-1} = F_{k+1}$) + +This means that $T$ must have $k$ children, and that each of those subtrees must also be maximally decreased. This extends down to every node in $T$. + +To obtain a maximally decreased tree of degree $k$, we must ask each of the root's children to delete the node with the highest degree out of its children, then tell each of its children to do the same. +For nodes of rank $0$, they have no children so they remain unaffected. +The recursively applies the operation to all children in the tree.  +The reason this forms a maximally decreased tree is because: each node is deleting the most nodes it possibly can with a single delete action, marking that node. + +We know that a tree of degree $k+1$ can be split into two trees of degree $k$, joined by the link between the root and the node of degree $k$. Let us name the side without the root as $\alpha$ and the one with the root as $\beta$. +When $\alpha$ is maximally decreased, we delete the node of degree $k-1$, marking the node of degree $k$ of $\alpha$. The same is done to all of its children. This results in a maximally decreased tree of degree $k-1$, since the “root” of $\alpha$ was allowed to be marked. +Due to our assumption of $N_{k-1} = F_{k+1}$, we assume $\alpha$ has $F_{k+1}$ nodes. + +In $\beta$'s case: we do **not** want to mark the root node, since $\beta$'s root is the root of the whole tree of degree $k+1$. If you were to attempt to mark the root, it would delete one of the direct children of the $k+1$ degree tree, decrementing its degree (violating the rule of maximally decreasing).  +Instead, we can treat $\beta$ as a normal tree of degree $k$ and maximally decrease it as so. Due to our assumption of $N_k = F_{k}$, we assume $\beta$ has $F_{k+2}$ nodes. + +## Conclusion + +Given what's described above, the size of $T$ is the sum of both its halves: $\alpha$ and $\beta$. + +Since the definition of the Fibonacci sequence is: $F_n = F{n-1} + F{n-2}$, it can be said that: +\begin{align*} +    N_{k+1} &= \alpha + \beta \\ +    &= F_{k+1} + F_{k+2} \\ +    &= F_{k+3} +\end{align*} +Therefore by the Principle of Mathematical Induction: $N_{k} = F_{k+2}$. $\blacksquare$ + +# Part g + +Since we know that the number of nodes in a maximally decreased heap is $F_{k+2}$ nodes, we can define $g(k) = F_{k+2}$, substituting it into the assignment sheet's equation 7. + +Since we also know that $\mathbf{size}(r) \geq F_{k+2}$ and that $F_{k+2} \geq \phi^k$ (from the sheet's equation 8), we know that $\mathbf{size}(r) \geq \phi^k$. +This shows that $\mathbf{size}(r)$ is an upper bound on $F_k+2$. +Taking the $\log$ base $\phi$ of both sides we get: +$$ +    \log_\phi(\mathbf{size}(r)) \geq k +$$ +This means that $\log_\phi(\mathbf{size}(r))$ is an upper bound of the degree of that tree $k$, meaning that $k$ is bounded by $O(\log({\mathbf{size}}(r)))$. + +$D_{max}(N)$ represents the biggest degree $k$ in the heap and $N \geq \mathbf{size}(r)$ because the number of nodes in the whole heap will be at least the number of nodes in the biggest tree. Plugging in these values into the above equation yields: +\begin{align*} +    \log_\phi(\mathbf{size}(r)) &\geq D_{max}(N) \\ +    \log_\phi(N) &\geq D_{max}(N) \\ +\end{align*} + +From this, it can be concluded that $D_{max}(N)$ for `extract_min` grows at $O(\log(N))$. + +# Part h + +We can calculate the amortised cost of `decrease_key` using the equation $ac_i = tc_i + \Phi(D_{i}) - \Phi(D_{i-1})$. + +## True cost + +The true cost of a single `decrease_key` operation is $O(1)$.  This includes: changing the key value, checking its parent and marking/unmarking nodes. +This is because these operations do not depend on the state of the data structure for complexity. + +## Potential cost + +The cost function for this question is defined as $\Phi = T(H)$, where $T(H)$ is the number of nodes in the root list. +As described before, a maximum of $D_{max}(N)$ cuts is performed on a heap for a single `decrease_key` operation. +For each cascading cut, we need to add the newly orphaned node to the root list. This increases the amount of roots in the root list by 1 for every cut done, which could be at maximum $D_{max}(N)$ new roots. + +In other words: +\begin{align*} +\Phi(D_{i}) - \Phi(D_{i-1}) &= \Phi(D_{i}) - \Phi(D_{i-1}) + D_{max}(N) \\ +&= D_{max}(N) +\end{align*} + +## Amortised cost + +Plugging the two findings into the amortised cost function yields: +\begin{align*} +ac_i &= O(1) + D_{max}(N) \\ +&= O(D_{max}(N)) \\ +&= O(\log(N)) \;\; \textit{from part g} +\end{align*} + +This is not what I expected, since the amortised cost should take into account the fact that not every `decrease_key` operation takes $O(D_{max}(N))$ work.  +In a large amount of cases, a single operation does not need to go through every level of depth in the tree of the highest degree. It doesn't seem to take into account that a some operations consist of just decreasing the key and not deleting anything at all. I would expect something closer to constant time. + +# Part i diff --git a/ass2/ukkonen.py b/ass2/ukkonen.py index 8d536c5..b4b0b98 100644 --- a/ass2/ukkonen.py +++ b/ass2/ukkonen.py @@ -199,8 +199,8 @@ def do_phase(root: Node, active: Point, i, last_j, remainder):                  active = skip_count(remainder, Point(root), i - remainder)              last_j = j              j += 1 -        print(active) -        root.print_tree() +        # print(active) +        # root.print_tree()      return active, remainder, last_j @@ -238,7 +238,7 @@ def skip_count(num_chars, start_point: Point, index):          direction = Node.string[index]          next_node = head.node.get_child(direction)          if next_node is None: -            raise Exception(f"Attempted to traverse char\n '{direction}' at point {head}. ({index=})") +            raise IndexError(f"Attempted to traverse char\n '{direction}' at point {head}. ({index=})")          incoming_length = next_node.edge_length          if chars_left < incoming_length:              break | 
