optimal merge pattern
On this page we'll take a look at the optimal merge pattern discussed in this great lecture. Please note we use the optimal merge pattern to build our huffman encoding tree, although in this article we use a more efficient method (at the expense of being less concise).
This time, rather than explicitely making a call to min
as we did in our huffman encoding example, we use a heapq
.
using a heapq for efficiency
The heapq
, a binary tree, remains sorted, and has O(logn)
push and pop. Using a heapq
to generate our optimal merge pattern is therefore done in O(nlogn)
, while using min
, which takes O(n)
would have been in the order of O(n^2)
time.
1import heapq
2
3class MergeNode:
4
5 def __init__(self, name=0, size=0, l=0, r=0):
6 self.name = name
7 self.size = size if size else l.size + r.size
8 self.l = l
9 self.r = r
10
11 def __lt__(self, other):
12 return self.size < other.size
13
14 def __repr__(self):
15 return f"[{self.name}:{self.size}{self.l}:{self.r}]"
16
17 def count(self):
18 if self.l or self.r:
19 lc = self.l.count() if self.l else 0
20 rc = self.r.count() if self.r else 0
21 return lc + rc + self.size
22 return 0
23
24def gen_optimal_merge(items, print_tree=False):
25 heapq.heapify(items)
26
27 while len(items) > 1:
28 l = heapq.heappop(items)
29 r = heapq.heappop(items)
30 if print_tree:
31 if l.name:
32 print(f"{l.size+r.size} --> {l.name}[{l.name}:{l.size}]")
33 else:
34 print(f"{l.size+r.size} --> {l.size}")
35 if r.name:
36 print(f"{l.size+r.size} --> {r.name}[{r.name}:{r.size}]")
37 else:
38 print(f"{l.size+r.size} --> {r.size}")
39 heapq.heappush(items, MergeNode( name='',
40 size=0,
41 l=l,
42 r=r))
43 return next(iter(items))
44
45def main():
46 from itertools import combinations as comb
47 from string import ascii_lowercase as asc
48 from random import randint
49 r = iter(range(1, 1000000))
50 items = [MergeNode(''.join(x), next(r)) for x in comb(asc, 6)]
51 print(f'lists to merge: {len(items)}')
52 print_tree = False
53 if print_tree:
54 print('graph TD')
55
56 optimal_merge = gen_optimal_merge(items, print_tree)
57
58 if print_tree:
59 print(optimal_merge)
60 print(optimal_merge.count())
61
62if __name__ == '__main__':
63 main()
larger merge patterns
Let's try with a slightly larger tree. Please note this is graphed using the same technique as generating callgraphs in python with mermaid.
merging huge sequences
Here we build a optimal merge tree for a quarter of a million merge sequences, on one thread, in roughly 4 seconds.
1shyal:[~/dev/monolith] time python3 tmp.py
2lists to merge: 230230
3465300318788
4
5real 0m4.144s
6user 0m4.004s
7sys 0m0.119s