1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
|
import heapq
import sys
# Represents a node in the tree used to construct a Huffman code
class HuffNode:
def __init__(self, string, count):
self.string = string
self.count = count
self.left = None
self.right = None
self.binary = None
self.height = 0
def __add__(self, other):
assert isinstance(other, HuffNode)
new_node = HuffNode(self.string + other.string, self.count + other.count)
new_node.left = self
new_node.right = other
new_node.height += 1 + max(self.height, other.height)
return new_node
# Used to determine a node's place in the heap
# Sorts based on number of occurrences first, height second.
# This allows for a shorter code word.
def __lt__(self, other):
if self.count == other.count:
return self.height < other.height
return self.count < other.count
def __repr__(self):
return f"|'{self.string}', {self.count}|"
def num_children(self):
return (self.left is not None) + (self.right is not None)
# Apply a function to all child nodes via DFS search
def dfs_apply(self, func):
func(self)
if self.left is not None:
self.left.dfs_apply(func)
if self.right is not None:
self.right.dfs_apply(func)
# Counts the number of unique characters in a string and places them in a dictionary
def count_unique(string):
count = {}
for char in string:
if char in count:
count[char] += 1
else:
count[char] = 1
return count
# Given a string, returns a dictionary mapping from a string to a binary code
def huffman(string):
count = count_unique(string)
alphabet = "".join(count.keys())
nodes = [HuffNode(letter, count[letter]) for letter in alphabet] # Node for each letter
leaves = nodes[:] # Just the leaf nodes used later for extracting leaf Huffman codes
heapq.heapify(nodes)
# Keep linking nodes together until there is only one left
while len(nodes) > 1:
first, second = heapq.heappop(nodes), heapq.heappop(nodes) # Pop two off the heap
new_node = first + second # Combine the nodes into a new, combined node. (Non-destructive)
heapq.heappush(nodes, new_node) # Add this new node to the heap
root = nodes[0]
huffman_calculate(root) # Calculate Huffman codes for the entire tree
result = {} # Combine results into a dictionary
for leaf in leaves:
result[leaf.string] = leaf.binary
return result
# Calculates the Huffman codes of this node and all of its children recursively
# Non-leaf nodes are calculated as intermediate values. Only the leaves are considered important
def huffman_calculate(node: HuffNode, bin_prefix=""):
if node.num_children() == 0:
node.binary = bin_prefix
return
huffman_calculate(node.left, bin_prefix + "0")
huffman_calculate(node.right, bin_prefix + "1")
# Calculate the Elias omega code for a given positive integer
def elias(n):
assert n > 0
binary = bin(n)[2:]
result = binary
while len(binary) != 1:
binary = "0" + bin(len(binary) - 1)[3:]
result = binary + result
return result
# Generate the binary header for a given string
def header(string):
huff_code = huffman(string)
alphabet_size = len(huff_code.keys())
output = f"{elias(alphabet_size)}"
for char in huff_code:
output += bin(ord(char))[2:]
output += elias(len(huff_code[char]))
output += huff_code[char]
return output
# Read a file and return the contents
def read_file(filename):
with open(filename, "r") as file:
contents = file.read()
return contents
# Write a given string to file
def write_file(filename, contents):
with open(filename, "w") as file:
file.write(contents)
def main():
assert len(sys.argv) >= 2
string = read_file(sys.argv[1])
write_file("output_header.txt", header(string))
if __name__ == "__main__":
main()
|