aboutsummaryrefslogtreecommitdiff
path: root/ass3/q2/header.py
blob: 24964afa3d275462bffc84f536ca161ac44c4d7a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import heapq
import sys


# Represents a node in the tree used to construct a Huffman code
class HuffNode:
    def __init__(self, string, count):
        self.string = string
        self.count = count
        self.left = None
        self.right = None
        self.binary = None
        self.height = 0

    def __add__(self, other):
        assert isinstance(other, HuffNode)
        new_node = HuffNode(self.string + other.string, self.count + other.count)
        new_node.left = self
        new_node.right = other
        new_node.height += 1 + max(self.height, other.height)
        return new_node

    # Used to determine a node's place in the heap
    # Sorts based on number of occurrences first, height second.
    # This allows for a shorter code word.
    def __lt__(self, other):
        if self.count == other.count:
            return self.height < other.height
        return self.count < other.count

    def __repr__(self):
        return f"|'{self.string}', {self.count}|"

    def num_children(self):
        return (self.left is not None) + (self.right is not None)

    # Apply a function to all child nodes via DFS search
    def dfs_apply(self, func):
        func(self)
        if self.left is not None:
            self.left.dfs_apply(func)
        if self.right is not None:
            self.right.dfs_apply(func)


# Counts the number of unique characters in a string and places them in a dictionary
def count_unique(string):
    count = {}
    for char in string:
        if char in count:
            count[char] += 1
        else:
            count[char] = 1
    return count


# Given a string, returns a dictionary mapping from a string to a binary code
def huffman(string):
    count = count_unique(string)
    alphabet = "".join(count.keys())
    nodes = [HuffNode(letter, count[letter]) for letter in alphabet] # Node for each letter
    leaves = nodes[:]  # Just the leaf nodes used later for extracting leaf Huffman codes
    heapq.heapify(nodes)
    # Keep linking nodes together until there is only one left
    while len(nodes) > 1:
        first, second = heapq.heappop(nodes), heapq.heappop(nodes)  # Pop two off the heap
        new_node = first + second  # Combine the nodes into a new, combined node. (Non-destructive)
        heapq.heappush(nodes, new_node)  # Add this new node to the heap
    root = nodes[0]
    huffman_calculate(root)  # Calculate Huffman codes for the entire tree
    result = {}  # Combine results into a dictionary
    for leaf in leaves:
        result[leaf.string] = leaf.binary
    return result


# Calculates the Huffman codes of this node and all of its children recursively
# Non-leaf nodes are calculated as intermediate values. Only the leaves are considered important
def huffman_calculate(node: HuffNode, bin_prefix=""):
    if node.num_children() == 0:
        node.binary = bin_prefix
        return
    huffman_calculate(node.left, bin_prefix + "0")
    huffman_calculate(node.right, bin_prefix + "1")


# Calculate the Elias omega code for a given positive integer
def elias(n):
    assert n > 0
    binary = bin(n)[2:]
    result = binary
    while len(binary) != 1:
        binary = "0" + bin(len(binary) - 1)[3:]
        result = binary + result
    return result


# Generate the binary header for a given string
def header(string):
    huff_code = huffman(string)
    alphabet_size = len(huff_code.keys())
    output = f"{elias(alphabet_size)}"
    for char in huff_code:
        output += bin(ord(char))[2:]
        output += elias(len(huff_code[char]))
        output += huff_code[char]
    return output


# Read a file and return the contents
def read_file(filename):
    with open(filename, "r") as file:
        contents = file.read()
    return contents


# Write a given string to file
def write_file(filename, contents):
    with open(filename, "w") as file:
        file.write(contents)


def main():
    assert len(sys.argv) >= 2
    string = read_file(sys.argv[1])
    write_file("output_header.txt", header(string))


if __name__ == "__main__":
    main()