Orig: Music to hear, why hear’st thou music sadly?Comp: 00100010111011111100101101110010100010100101110000111000000101101001001111111100111000101110000111000000100100011111110000101000111010101101101000100111011111100101101110010111111001011010010011100011011101Deco: Music to hear, why hear’st thou music sadly?Size uncompressed 44Size compressed 26
# This is part of my anti-atrophy tasks. As most of my professional software development# tasks are now is done through agentic development. This is fitness for the brain!## Solely based on what's here: https://en.wikipedia.org/wiki/Huffman_codingimport requestsimport pprintimport heapqimport math# Fetch the file, this is our corpusresponse = requests.get("https://www.gutenberg.org/cache/epub/100/pg100.txt")text = response.text# Calculate frequenciesfreqs = {}count = 0for c in text: if not c in freqs: freqs[c] = 0 freqs[c] += 1 count += 1# Normalizefor k in freqs: freqs[k] = freqs[k] / count# How priority queues aka head queues workp1 = []# heapq.heapify(p1) - No need to run for empty listsheapq.heappush(p1, (5, "not so important"))heapq.heappush(p1, (1, "Really important"))heapq.heappush(p1, (20, "Forget it"))heapq.heappush(p1, (24, ("t1", "t2", "t3")))important_task = heapq.heappop(p1)assert important_task[0] == 1# For heapq when using the tuples, it will go onto next element if the first element is identical.# The docs are framing the type construction as a payload to attach a payload.# This is over loaded semantics (shane on you, Python)tie_breaker = 0def next_tie_breaker(): global tie_breaker tie_breaker += 1 return tie_breakerleafs = []for k in freqs: heapq.heappush(leafs, (freqs[k], next_tie_breaker(), k))# Alternatively# leafs1 = [(v, next_tie_breaker(), k) for k, v in freqs.items()]# heapq.heapify(leafs1)# Representing the tree:# Leaf (frq, _, symbol)# Node: (frq, _, (left, right))while True: if len(leafs) == 1: break a = heapq.heappop(leafs) b = heapq.heappop(leafs) sumProp = a[0] + b[0] node = (sumProp, next_tie_breaker(), (a, b)) heapq.heappush(leafs, node)huffman_tree = heapq.heappop(leafs)huffman_dict = {}rev_huffman_dict = {}stack = [("", huffman_tree)]while len(stack) > 0: next = stack.pop() match(next): case (prefix, (frq, x, (left, right))): stack.append((prefix + "1", left)) stack.append((prefix + "0", right)) case (prefix, (frq, x, symbol)): huffman_dict[prefix] = symbol rev_huffman_dict[symbol] = prefix case _: pprint.pprint(next)# Let's compress something!string = "Music to hear, why hear’st thou music sadly?"# compresscompressed = ""for c in string: compressed += rev_huffman_dict[c]decompressed = ""tree_traverser = huffman_treefor c in compressed: # Traverse the tree if c == "1": tree_traverser = tree_traverser[2][0] if c == "0": tree_traverser = tree_traverser[2][1] # Are we at a leaf? if isinstance(tree_traverser[2], str): decompressed += tree_traverser[2] tree_traverser = huffman_treeprint(f"Orig: {string}")print(f"Comp: {compressed}")print(f"Deco: {decompressed}")print(f"Size uncompressed {len(string)}")print(f"Size compressed {math.ceil(len(compressed) / 8)}") # We approximate here