Orig: Music to hear, why hear’st thou music sadly?Comp: 00100010111011111100101101110010100010100101110000111000000101101001001111111100111000101110000111000000100100011111110000101000111010101101101000100111011111100101101110010111111001011010010011100011011101Deco: Music to hear, why hear’st thou music sadly?Size uncompressed 44Size compressed 26
# This is part of my anti-atrophy tasks. As most of my professional software development# tasks are now is done through agentic development. This is fitness for the brain!## Solely based on what's here: https://en.wikipedia.org/wiki/Huffman_codingimport requestsimport pprintimport heapqimport math# Fetch the file, this is our corpusresponse = requests.get("https://www.gutenberg.org/cache/epub/100/pg100.txt")text = response.text# Calculate frequenciesfreqs = {}count = 0for c in text:    if not c in freqs:        freqs[c] = 0    freqs[c] += 1    count += 1# Normalizefor k in freqs:    freqs[k] = freqs[k] / count# How priority queues aka head queues workp1 = []# heapq.heapify(p1) - No need to run for empty listsheapq.heappush(p1, (5, "not so important"))heapq.heappush(p1, (1, "Really important"))heapq.heappush(p1, (20, "Forget it"))heapq.heappush(p1, (24, ("t1", "t2", "t3")))important_task = heapq.heappop(p1)assert important_task[0] == 1# For heapq when using the tuples, it will go onto next element if the first element is identical.# The docs are framing the type construction as a payload to attach a payload.# This is over loaded semantics (shane on you, Python)tie_breaker = 0def next_tie_breaker():    global tie_breaker    tie_breaker += 1    return tie_breakerleafs = []for k in freqs:    heapq.heappush(leafs, (freqs[k], next_tie_breaker(), k))# Alternatively# leafs1 = [(v, next_tie_breaker(), k) for k, v in freqs.items()]# heapq.heapify(leafs1)# Representing the tree:# Leaf (frq, _, symbol)# Node: (frq, _, (left, right))while True:    if len(leafs) == 1:        break    a = heapq.heappop(leafs)    b = heapq.heappop(leafs)    sumProp = a[0] + b[0]    node = (sumProp, next_tie_breaker(), (a, b))    heapq.heappush(leafs, node)huffman_tree = heapq.heappop(leafs)huffman_dict = {}rev_huffman_dict = {}stack = [("", huffman_tree)]while len(stack) > 0:    next = stack.pop()    match(next):        case (prefix, (frq, x, (left, right))):            stack.append((prefix + "1", left))            stack.append((prefix + "0", right))        case (prefix, (frq, x, symbol)):            huffman_dict[prefix] = symbol            rev_huffman_dict[symbol] = prefix        case _:            pprint.pprint(next)# Let's compress something!string = "Music to hear, why hear’st thou music sadly?"# compresscompressed = ""for c in string:    compressed += rev_huffman_dict[c]decompressed = ""tree_traverser = huffman_treefor c in compressed:    # Traverse the tree    if c == "1":        tree_traverser = tree_traverser[2][0]    if c == "0":        tree_traverser = tree_traverser[2][1]    # Are we at a leaf?    if isinstance(tree_traverser[2], str):        decompressed += tree_traverser[2]        tree_traverser = huffman_treeprint(f"Orig: {string}")print(f"Comp: {compressed}")print(f"Deco: {decompressed}")print(f"Size uncompressed {len(string)}")print(f"Size compressed {math.ceil(len(compressed) / 8)}") # We approximate here