import heapq import sys # Represents a node in the tree used to construct a Huffman code class HuffNode: def __init__(self, string, count): self.string = string self.count = count self.left = None self.right = None self.binary = None self.height = 0 def __add__(self, other): assert isinstance(other, HuffNode) new_node = HuffNode(self.string + other.string, self.count + other.count) new_node.left = self new_node.right = other new_node.height += 1 + max(self.height, other.height) return new_node # Used to determine a node's place in the heap # Sorts based on number of occurrences first, height second. # This allows for a shorter code word. def __lt__(self, other): if self.count == other.count: return self.height < other.height return self.count < other.count def __repr__(self): return f"|'{self.string}', {self.count}|" def num_children(self): return (self.left is not None) + (self.right is not None) # Apply a function to all child nodes via DFS search def dfs_apply(self, func): func(self) if self.left is not None: self.left.dfs_apply(func) if self.right is not None: self.right.dfs_apply(func) # Counts the number of unique characters in a string and places them in a dictionary def count_unique(string): count = {} for char in string: if char in count: count[char] += 1 else: count[char] = 1 return count # Given a string, returns a dictionary mapping from a string to a binary code def huffman(string): count = count_unique(string) alphabet = "".join(count.keys()) nodes = [HuffNode(letter, count[letter]) for letter in alphabet] # Node for each letter leaves = nodes[:] # Just the leaf nodes used later for extracting leaf Huffman codes heapq.heapify(nodes) # Keep linking nodes together until there is only one left while len(nodes) > 1: first, second = heapq.heappop(nodes), heapq.heappop(nodes) # Pop two off the heap new_node = first + second # Combine the nodes into a new, combined node. (Non-destructive) heapq.heappush(nodes, new_node) # Add this new node to the heap root = nodes[0] huffman_calculate(root) # Calculate Huffman codes for the entire tree result = {} # Combine results into a dictionary for leaf in leaves: result[leaf.string] = leaf.binary return result # Calculates the Huffman codes of this node and all of its children recursively # Non-leaf nodes are calculated as intermediate values. Only the leaves are considered important def huffman_calculate(node: HuffNode, bin_prefix=""): if node.num_children() == 0: node.binary = bin_prefix return huffman_calculate(node.left, bin_prefix + "0") huffman_calculate(node.right, bin_prefix + "1") # Calculate the Elias omega code for a given positive integer def elias(n): assert n > 0 binary = bin(n)[2:] result = binary while len(binary) != 1: binary = "0" + bin(len(binary) - 1)[3:] result = binary + result return result # Generate the binary header for a given string def header(string): huff_code = huffman(string) alphabet_size = len(huff_code.keys()) output = f"{elias(alphabet_size)}" for char in huff_code: output += bin(ord(char))[2:] output += elias(len(huff_code[char])) output += huff_code[char] return output # Read a file and return the contents def read_file(filename): with open(filename, "r") as file: contents = file.read() return contents # Write a given string to file def write_file(filename, contents): with open(filename, "w") as file: file.write(contents) def main(): assert len(sys.argv) >= 2 string = read_file(sys.argv[1]) write_file("output_header.txt", header(string)) if __name__ == "__main__": main()