huffman encoding
For the impatient, the full implementation is at the bottom of this page.
Huffman Encoding is an easy to understand, and efficient data compression algorithm (in this example, ~75% compression ratio is attained). On this page we go through a working implementation. I wrote it in a couple hours whilst watching this great lecture on Huffman Encoding; needless to say this is for educational purposes only.
Please note, it uses the third-party bitarray python module.
Generate The Huffman Tree
The first step with Huffman encoding is to generate the encoding tree. This is done by taking the counts of each letter, and creating a hierarchy of counts.
1from collections import Counter
2from collections import namedtuple
3from bitarray import bitarray
4
5node = namedtuple('node', 'char count left right')
6hcode = namedtuple('hcode', 'char count code')
7
8def gen_huffman_tree(message):
9 counts = Counter(message)
10 htree = {node(x[0], x[1], 0, 0) for x in sorted(counts.items(), key=lambda x: x[1])}
11 huff_min = lambda: min(htree, key=lambda x: x.count)
12 while len(htree) > 1:
13 a = huff_min()
14 htree.remove(a)
15 b = huff_min()
16 htree.remove(b)
17 htree.add(node('', a.count + b.count, a, b))
18 return next(iter(htree)), len(counts)
19
20def main():
21 # this is the message we want to encode
22 message = 'BCCABBDDAECCBBAEDDCC'
23
24 # compute the huffman tree
25 htree, count = gen_huffman_tree(message)
26
27 print(htree)
28
29if __name__ == '__main__':
30 main()
Generating the tree is quite straightforward, you can find out more on generating callgraphs in python with mermaid.
Generate the Huffman Codes
Next you'll want to generate the table of codes. i.e the letter frequencies, and the encoded path each letter takes down the Huffman tree.
1def gen_huffman_codes(htree, count):
2 code = [0]*count
3 hcodes = {}
4 def gen_codes(htree, depth=0):
5 if htree.left:
6 code[depth] = 0
7 gen_codes(htree.left, depth+1)
8 if htree.right:
9 code[depth] = 1
10 gen_codes(htree.right, depth+1)
11 is_leaf = htree.left == htree.right == 0
12 if is_leaf:
13 hcodes[htree.char] = hcode(htree.char, htree.count, bitarray(code[:depth]))
14 gen_codes(htree)
15 return hcodes
16
17def print_huffman_codec(hcodes):
18 for c, v in hcodes.items():
19 print(v)
20
21def main():
22 # this is the message we want to encode
23 message = 'BCCABBDDAECCBBAEDDCC'
24
25 # compute the huffman tree
26 htree, count = gen_huffman_tree(message)
27
28 # compute our huffman encoder
29 hcodes = gen_huffman_codes(htree, count)
30
31 print_huffman_codec(hcodes)
Output:
1hcode(char='D', count=4, code=bitarray('00'))
2hcode(char='E', count=2, code=bitarray('010'))
3hcode(char='A', count=3, code=bitarray('011'))
4hcode(char='B', count=5, code=bitarray('10'))
5hcode(char='C', count=6, code=bitarray('11'))
Compress the Message
Next we'll want to encode the actual message.
1def encode_message(message, hcodes):
2 huff_encoding = bitarray()
3 for c in message:
4 huff_encoding.extend(hcodes[c].code)
5 return huff_encoding
6
7
8def main():
9 # this is the message we want to encode
10 message = 'BCCABBDDAECCBBAEDDCC'
11
12 # compute the huffman tree
13 htree, count = gen_huffman_tree(message)
14
15 # compute our huffman encoder
16 hcodes = gen_huffman_codes(htree, count)
17
18 # print our huffman codec
19 print_huffman_codec(hcodes)
20
21 # compress the message into our huffman encoding
22 huff_encoding = encode_message(message, hcodes)
Output:
1bitarray('101111011101000000110101111101001101000001111')
Decompress the Message
Next we'll want to decode our Huffman encoded message. For that we simply perform a traveral, while following the directional rules of our encoded message. When we hit a leaf, that's the character we are after.
1def decode_huffman_encoding(htree, huff_encoding):
2 def decode(depth, htree):
3 is_leaf = htree.left == htree.right == 0
4 if is_leaf:
5 offset[0] += depth
6 decompressed_message.append(htree.char)
7 else:
8 bit = huff_encoding[depth+offset[0]]
9 if htree.left and not bit:
10 decode(depth+1, htree.left)
11 elif htree.right and bit:
12 decode(depth+1, htree.right)
13
14 decompressed_message = []
15 offset = [0]
16
17 while offset[0] < len(huff_encoding):
18 decode(0, htree)
19
20 decompressed_message = ''.join(decompressed_message)
21 return decompressed_message
22
23def main():
24 # this is the message we want to encode
25 message = 'BCCABBDDAECCBBAEDDCC'
26
27 # compute the huffman tree
28 htree, count = gen_huffman_tree(message)
29
30 # compute our huffman encoder
31 hcodes = gen_huffman_codes(htree, count)
32
33 # print our huffman codec
34 print_huffman_codec(hcodes)
35
36 # compress the message into our huffman encoding
37 huff_encoding = encode_message(message, hcodes)
38
39 # decompress our huffman encoding back into a string
40 decompressed_message = decode_huffman_encoding(htree, huff_encoding)
41
42 print(decompressed_message)
Output:
1BCCABBDDAECCBBAEDDCC
Printing some stats
The final step is to print some stats. Please note, i did not include the step of including the hcodes and htree into a final output, which would have indeed lead to a slight drop in the compression ratio.
1def main():
2 # this is the message we want to encode
3 message = 'BCCABBDDAECCBBAEDDCC'
4
5 # compute the huffman tree
6 htree, count = gen_huffman_tree(message)
7
8 # compute our huffman encoder
9 hcodes = gen_huffman_codes(htree, count)
10
11 # print our huffman codec
12 print_huffman_codec(hcodes)
13
14 # compress the message into our huffman encoding
15 huff_encoding = encode_message(message, hcodes)
16
17 # decompress our huffman encoding back into a string
18 decompressed_message = decode_huffman_encoding(htree, huff_encoding)
19
20 # print stats and sanity check
21 print(f'Original message: {message}')
22 print(f'Huffman Encoded Message: {huff_encoding}')
23 print(f'Original Size: {len(message)*8} bits.')
24 print(f'New Size: {len(huff_encoding)} bits.')
25 print(f'Compression: {(1-len(huff_encoding)/(len(message)*8))*100}%.')
26 print(f'decompressed message: {decompressed_message}')
27 print(f'decompression success: {message == decompressed_message}')
Output:
1Original message: BCCABBDDAECCBBAEDDCC
2Huffman Encoded Message: bitarray('101111011101000000110101111101001101000001111')
3Original Size: 160 bits.
4New Size: 45 bits.
5Compression: 71.875%.
6decompressed message: BCCABBDDAECCBBAEDDCC
7decompression success: True
Calculating Encoding Size
Using the codes
The size of the final encoding can be calculated from the hcode var:
It can also be calculated with with htree var:
Where di
is the depth of the leaf, and fi
is the count for the leaf.
Putting it all together
Here's the full implementation for your fun and enjoyment.
1from collections import Counter
2from collections import namedtuple
3from bitarray import bitarray
4
5node = namedtuple('node', 'char count left right')
6hcode = namedtuple('hcode', 'char count code')
7
8def gen_huffman_tree(message):
9 counts = Counter(message)
10 htree = {node(x[0], x[1], 0, 0) for x in sorted([*counts.items()], key=lambda x: x[1])}
11 huff_min = lambda: min(htree, key=lambda x: x.count)
12 while len(htree) > 1:
13 a = huff_min()
14 htree.remove(a)
15 b = huff_min()
16 htree.remove(b)
17 htree.add(node('', a.count + b.count, a, b))
18 return next(iter(htree)), len(counts)
19
20def gen_huffman_codes(htree, count):
21 code = [0]*count
22 hcodes = {}
23 def gen_codes(htree, depth=0):
24 if htree.left:
25 code[depth] = 0
26 gen_codes(htree.left, depth+1)
27 if htree.right:
28 code[depth] = 1
29 gen_codes(htree.right, depth+1)
30 is_leaf = htree.left == htree.right == 0
31 if is_leaf:
32 hcodes[htree.char] = hcode(htree.char, htree.count, bitarray(code[:depth]))
33 gen_codes(htree)
34 return hcodes
35
36def decode_huffman_encoding(htree, huff_encoding):
37 def decode(depth, htree):
38 is_leaf = htree.left == htree.right == 0
39 if is_leaf:
40 offset[0] += depth
41 decompressed_message.append(htree.char)
42 else:
43 bit = huff_encoding[depth+offset[0]]
44 if htree.left and not bit:
45 decode(depth+1, htree.left)
46 elif htree.right and bit:
47 decode(depth+1, htree.right)
48
49 decompressed_message = []
50 offset = [0]
51
52 while offset[0] < len(huff_encoding):
53 decode(0, htree)
54
55 decompressed_message = ''.join(decompressed_message)
56 return decompressed_message
57
58def print_huffman_codec(hcodes):
59 for c, v in hcodes.items():
60 print(v)
61
62def encode_message(message, hcodes):
63 huff_encoding = bitarray()
64 for c in message:
65 huff_encoding.extend(hcodes[c].code)
66 return huff_encoding
67
68def main():
69 # this is the message we want to encode
70 message = 'BCCABBDDAECCBBAEDDCC'
71
72 # compute the huffman tree
73 htree, count = gen_huffman_tree(message)
74
75 # compute our huffman encoder
76 hcodes = gen_huffman_codes(htree, count)
77
78 # print our huffman codec
79 print_huffman_codec(hcodes)
80
81 # compress the message into our huffman encoding
82 huff_encoding = encode_message(message, hcodes)
83
84 # decompress our huffman encoding back into a string
85 decompressed_message = decode_huffman_encoding(htree, huff_encoding)
86
87 # print stats and sanity check
88 print(f'Original message: {message}')
89 print(f'Huffman Encoded Message: {huff_encoding}')
90 print(f'Original Size: {len(message)*8} bits.')
91 print(f'New Size: {len(huff_encoding)} bits.')
92 print(f'Compression: {(1-len(huff_encoding)/(len(message)*8))*100}%.')
93 print(f'decompressed message: {decompressed_message}')
94 print(f'decompression success: {message == decompressed_message}')
95
96if __name__ == '__main__':
97 main()