Wednesday, January 11, 2012

Merging content of several sorted files


#!/bin/env python
# This script merges metrics output from different runs
# Does a merge sort of the files and groups together equal keys

import sys, os, time
from heapq import heappush, heappop

# Separator written between the columns of the generated output.
DELIM = ", "
# Terminator written after every output row.
END_LINE = "\n"
# Placeholder written in a file's column when that file supplies no value
# for the row being emitted.
BLANK = "-"

# Override this function to change how an input line is split into key and value
def parse_input(line):
    """Split one tab-separated input line into a ``(key, value)`` pair.

    Every field is stripped of surrounding whitespace.  The first eight
    fields, joined by single spaces, form the key; the ninth field is the
    value.  Any further fields are ignored.

    Args:
        line: one raw line of input, with or without its trailing newline.

    Returns:
        A ``(key, value)`` tuple of strings.

    Raises:
        IndexError: if the line has fewer than nine tab-separated fields.
    """
    # Build a real list rather than a lazy ``map`` object so that the slicing
    # and indexing below work on Python 3 as well as Python 2.
    parts = [field.strip() for field in line.split("\t")]
    key = " ".join(parts[0:8])
    value = parts[8]
    return (key, value)

def pop_record(f, split_func):
    """Read the next parseable record from an open file.

    Args:
        f: an open text file (any object with ``readline``).  A falsy value
            means "no file" and yields ``None``.
        split_func: callable that turns one line of text into a record,
            for example ``parse_input``.

    Returns:
        Whatever ``split_func`` returns for the next line it can parse, or
        ``None`` when ``f`` is falsy or the end of the file has been reached.

    Lines for which ``split_func`` raises ``IndexError`` or ``ValueError``
    are reported and skipped, so a single malformed line does not hide the
    rest of the file.
    """
    if not f:
        return None
    while True:
        line = f.readline()
        if not line:
            # readline() returns an empty string only at end of file; even a
            # blank line still carries its newline character.
            return None
        try:
            return split_func(line)
        except (IndexError, ValueError):
            # Report on stderr: stdout is where the merged table is written,
            # so a diagnostic there would corrupt the output.
            sys.stderr.write("Error: cannot parse line %r from %r\n" % (line, f))

def generate_header(files):
    """Write the header row of the merged table to standard output.

    The row consists of the literal column name ``KEY`` followed by the base
    name (the path with its directories removed) of every input file, in the
    order given.  Columns are separated by ``DELIM`` and the row ends with
    ``END_LINE``, i.e. the same separators the data rows use, so the header
    and the rows split into the same number of columns.

    Args:
        files: sequence of input file paths, one output column per path.
    """
    columns = ["KEY"] + [os.path.basename(fname) for fname in files]
    sys.stdout.write(DELIM.join(columns) + END_LINE)

def _push_next_record(heap, index, f):
    """Push the next record of file ``f`` (column ``index``) onto ``heap``.

    Does nothing when ``pop_record`` returns ``None``, which is how a file
    with no further records drops out of the merge.
    """
    record = pop_record(f, parse_input)
    if record is not None:
        key, value = record
        heappush(heap, (key, index, value))


def merge_datasets(files):
    """Merge several key-sorted files into one table on standard output.

    Records are obtained with ``pop_record(f, parse_input)`` as
    ``(key, value)`` pairs.  One output row is written per distinct key, in
    ascending key order::

        key DELIM value_from_file_0 DELIM value_from_file_1 ... END_LINE

    A file that has no record for the key gets ``BLANK`` in its column.  If
    a file repeats a key, each repetition produces a further row, so no
    value is silently overwritten.

    The result is grouped correctly only if every input file is already
    sorted by key in the order Python compares strings.

    Args:
        files: sequence of paths of the files to merge; the position of a
            path determines the column its values are written to.
    """
    fout = sys.stdout
    flist = []
    try:
        for fname in files:
            flist.append(open(fname))
        nfiles = len(flist)

        # Heap entries are (key, file index, value).  Ordering by key keeps
        # the smallest pending key on top; the file index is unique among the
        # entries in the heap, so ties never fall through to comparing values.
        heap = []
        for index, f in enumerate(flist):
            _push_next_record(heap, index, f)

        while heap:
            out_key = heap[0][0]
            row = [BLANK] * nfiles
            consumed = []
            # The heap holds at most one pending record per file, so draining
            # every entry equal to the smallest key fills each column at most
            # once.
            while heap and heap[0][0] == out_key:
                _, index, value = heappop(heap)
                row[index] = value
                consumed.append(index)
            # Refill only after the row is complete, so that a repeated key
            # in one file starts a new row instead of replacing the value.
            for index in consumed:
                _push_next_record(heap, index, flist[index])
            fout.write(DELIM.join([out_key] + row) + END_LINE)
    finally:
        # Close whatever was opened, even if opening or parsing failed midway.
        for f in flist:
            f.close()

# Script entry point: every command-line argument is the path of one input file.
if __name__ == "__main__":
    files = sys.argv[1:]
    if not files:
        # Nothing to merge: show how the script is meant to be invoked.
        sys.stdout.write("Usage: " + sys.argv[0] + " {list of file names to compare}" + "\n")
    else:
        # Header row first, then the merged data rows.
        generate_header(files)
        merge_datasets(files)