huffman encoding

← 🏠

For the impatient, the full implementation is at the bottom of this page.

Huffman Encoding is an easy to understand, and efficient data compression algorithm (in this example, ~75% compression ratio is attained). On this page we go through a working implementation. I wrote it in a couple hours whilst watching this great lecture on Huffman Encoding; needless to say this is for educational purposes only.

Please note, it uses the third-party bitarray python module.

Generate The Huffman Tree

graph TD 5 --> E 5 --> A 9 --> D 9 --> B 11 --> 5 11 --> C 20 --> 9 20 --> 11

The first step with Huffman encoding is to generate the encoding tree. This is done by taking the counts of each letter, and creating a hierarchy of counts.

 1from collections import Counter
 2from collections import namedtuple
 3from bitarray import bitarray
 4
 5node = namedtuple('node', 'char count left right')
 6hcode = namedtuple('hcode', 'char count code')
 7
 8def gen_huffman_tree(message):
 9    counts = Counter(message)
10    htree = {node(x[0], x[1], 0, 0) for x in sorted(counts.items(), key=lambda x: x[1])}
11    huff_min = lambda: min(htree, key=lambda x: x.count)
12    while len(htree) > 1:
13        a = huff_min()
14        htree.remove(a)
15        b = huff_min()
16        htree.remove(b)
17        htree.add(node('', a.count + b.count, a, b))
18    return next(iter(htree)), len(counts)
19
20def main():
21    # this is the message we want to encode
22    message  = 'BCCABBDDAECCBBAEDDCC'
23    
24    # compute the huffman tree
25    htree, count = gen_huffman_tree(message)
26
27    print(htree)
28
29if __name__ == '__main__':
30    main()

Generating the tree is quite straightforward, you can find out more on generating callgraphs in python with mermaid.

Generate the Huffman Codes

Next you'll want to generate the table of codes. i.e the letter frequencies, and the encoded path each letter takes down the Huffman tree.

 1def gen_huffman_codes(htree, count):
 2    code = [0]*count
 3    hcodes = {}
 4    def gen_codes(htree, depth=0):
 5        if htree.left:
 6            code[depth] = 0
 7            gen_codes(htree.left, depth+1)
 8        if htree.right:
 9            code[depth] = 1
10            gen_codes(htree.right, depth+1)
11        is_leaf = htree.left == htree.right == 0
12        if is_leaf:
13            hcodes[htree.char] = hcode(htree.char, htree.count, bitarray(code[:depth]))
14    gen_codes(htree)
15    return hcodes
16
17def print_huffman_codec(hcodes):
18    for c, v in hcodes.items():
19        print(v)
20
21def main():
22    # this is the message we want to encode
23    message  = 'BCCABBDDAECCBBAEDDCC'
24    
25    # compute the huffman tree
26    htree, count = gen_huffman_tree(message)
27
28    # compute our huffman encoder
29    hcodes = gen_huffman_codes(htree, count)
30 
31    print_huffman_codec(hcodes)

Output:

1hcode(char='D', count=4, code=bitarray('00'))
2hcode(char='E', count=2, code=bitarray('010'))
3hcode(char='A', count=3, code=bitarray('011'))
4hcode(char='B', count=5, code=bitarray('10'))
5hcode(char='C', count=6, code=bitarray('11'))

Compress the Message

Next we'll want to encode the actual message.

 1def encode_message(message, hcodes):
 2    huff_encoding = bitarray()
 3    for c in message:
 4        huff_encoding.extend(hcodes[c].code)
 5    return huff_encoding
 6
 7
 8def main():
 9    # this is the message we want to encode
10    message  = 'BCCABBDDAECCBBAEDDCC'
11    
12    # compute the huffman tree
13    htree, count = gen_huffman_tree(message)
14
15    # compute our huffman encoder
16    hcodes = gen_huffman_codes(htree, count)
17
18    # print our huffman codec
19    print_huffman_codec(hcodes)
20
21    # compress the message into our huffman encoding
22    huff_encoding = encode_message(message, hcodes)

Output:

1bitarray('101111011101000000110101111101001101000001111')

Decompress the Message

Next we'll want to decode our Huffman encoded message. For that we simply perform a traveral, while following the directional rules of our encoded message. When we hit a leaf, that's the character we are after.

 1def decode_huffman_encoding(htree, huff_encoding):
 2    def decode(depth, htree):
 3        is_leaf = htree.left == htree.right == 0
 4        if is_leaf:
 5            offset[0] += depth
 6            decompressed_message.append(htree.char)
 7        else:
 8            bit = huff_encoding[depth+offset[0]]
 9            if htree.left and not bit:
10                decode(depth+1, htree.left)
11            elif htree.right and bit:
12                decode(depth+1, htree.right)
13
14    decompressed_message = []
15    offset = [0]
16
17    while offset[0] < len(huff_encoding):
18        decode(0, htree)
19
20    decompressed_message = ''.join(decompressed_message)
21    return decompressed_message
22
23def main():
24    # this is the message we want to encode
25    message  = 'BCCABBDDAECCBBAEDDCC'
26    
27    # compute the huffman tree
28    htree, count = gen_huffman_tree(message)
29
30    # compute our huffman encoder
31    hcodes = gen_huffman_codes(htree, count)
32
33    # print our huffman codec
34    print_huffman_codec(hcodes)
35
36    # compress the message into our huffman encoding
37    huff_encoding = encode_message(message, hcodes)
38
39    # decompress our huffman encoding back into a string
40    decompressed_message = decode_huffman_encoding(htree, huff_encoding)
41 
42    print(decompressed_message)

Output:

1BCCABBDDAECCBBAEDDCC

Printing some stats

The final step is to print some stats. Please note, i did not include the step of including the hcodes and htree into a final output, which would have indeed lead to a slight drop in the compression ratio.

 1def main():
 2    # this is the message we want to encode
 3    message  = 'BCCABBDDAECCBBAEDDCC'
 4    
 5    # compute the huffman tree
 6    htree, count = gen_huffman_tree(message)
 7
 8    # compute our huffman encoder
 9    hcodes = gen_huffman_codes(htree, count)
10
11    # print our huffman codec
12    print_huffman_codec(hcodes)
13
14    # compress the message into our huffman encoding
15    huff_encoding = encode_message(message, hcodes)
16
17    # decompress our huffman encoding back into a string
18    decompressed_message = decode_huffman_encoding(htree, huff_encoding)
19
20    # print stats and sanity check
21    print(f'Original message: {message}')
22    print(f'Huffman Encoded Message: {huff_encoding}')
23    print(f'Original Size: {len(message)*8} bits.')
24    print(f'New Size: {len(huff_encoding)} bits.')
25    print(f'Compression: {(1-len(huff_encoding)/(len(message)*8))*100}%.')
26    print(f'decompressed message: {decompressed_message}')
27    print(f'decompression success: {message == decompressed_message}')

Output:

1Original message: BCCABBDDAECCBBAEDDCC
2Huffman Encoded Message: bitarray('101111011101000000110101111101001101000001111')
3Original Size: 160 bits.
4New Size: 45 bits.
5Compression: 71.875%.
6decompressed message: BCCABBDDAECCBBAEDDCC
7decompression success: True

Calculating Encoding Size

Using the codes

The size of the final encoding can be calculated from the hcode var:

$\sum_{i=0}^c count_i \times len(code_i)$

It can also be calculated with with htree var:

$\sum_{i=0}^c d_i \times f_i$

Where di is the depth of the leaf, and fi is the count for the leaf.

Putting it all together

Here's the full implementation for your fun and enjoyment.

 1from collections import Counter
 2from collections import namedtuple
 3from bitarray import bitarray
 4
 5node = namedtuple('node', 'char count left right')
 6hcode = namedtuple('hcode', 'char count code')
 7
 8def gen_huffman_tree(message):
 9    counts = Counter(message)
10    htree = {node(x[0], x[1], 0, 0) for x in sorted([*counts.items()], key=lambda x: x[1])}
11    huff_min = lambda: min(htree, key=lambda x: x.count)
12    while len(htree) > 1:
13        a = huff_min()
14        htree.remove(a)
15        b = huff_min()
16        htree.remove(b)
17        htree.add(node('', a.count + b.count, a, b))
18    return next(iter(htree)), len(counts)
19
20def gen_huffman_codes(htree, count):
21    code = [0]*count
22    hcodes = {}
23    def gen_codes(htree, depth=0):
24        if htree.left:
25            code[depth] = 0
26            gen_codes(htree.left, depth+1)
27        if htree.right:
28            code[depth] = 1
29            gen_codes(htree.right, depth+1)
30        is_leaf = htree.left == htree.right == 0
31        if is_leaf:
32            hcodes[htree.char] = hcode(htree.char, htree.count, bitarray(code[:depth]))
33    gen_codes(htree)
34    return hcodes
35
36def decode_huffman_encoding(htree, huff_encoding):
37    def decode(depth, htree):
38        is_leaf = htree.left == htree.right == 0
39        if is_leaf:
40            offset[0] += depth
41            decompressed_message.append(htree.char)
42        else:
43            bit = huff_encoding[depth+offset[0]]
44            if htree.left and not bit:
45                decode(depth+1, htree.left)
46            elif htree.right and bit:
47                decode(depth+1, htree.right)
48
49    decompressed_message = []
50    offset = [0]
51
52    while offset[0] < len(huff_encoding):
53        decode(0, htree)
54
55    decompressed_message = ''.join(decompressed_message)
56    return decompressed_message
57
58def print_huffman_codec(hcodes):
59    for c, v in hcodes.items():
60        print(v)
61
62def encode_message(message, hcodes):
63    huff_encoding = bitarray()
64    for c in message:
65        huff_encoding.extend(hcodes[c].code)
66    return huff_encoding
67
68def main():
69    # this is the message we want to encode
70    message  = 'BCCABBDDAECCBBAEDDCC'
71    
72    # compute the huffman tree
73    htree, count = gen_huffman_tree(message)
74
75    # compute our huffman encoder
76    hcodes = gen_huffman_codes(htree, count)
77
78    # print our huffman codec
79    print_huffman_codec(hcodes)
80
81    # compress the message into our huffman encoding
82    huff_encoding = encode_message(message, hcodes)
83
84    # decompress our huffman encoding back into a string
85    decompressed_message = decode_huffman_encoding(htree, huff_encoding)
86
87    # print stats and sanity check
88    print(f'Original message: {message}')
89    print(f'Huffman Encoded Message: {huff_encoding}')
90    print(f'Original Size: {len(message)*8} bits.')
91    print(f'New Size: {len(huff_encoding)} bits.')
92    print(f'Compression: {(1-len(huff_encoding)/(len(message)*8))*100}%.')
93    print(f'decompressed message: {decompressed_message}')
94    print(f'decompression success: {message == decompressed_message}')
95
96if __name__ == '__main__':
97    main()