Published on

How to merge HDF5 files

The first option is to use external links, which is the fastest. Unfortunately, VisIt doesn't support external links in its current version; for details, you can check this thread on the mailing list. Until that is supported, you can build VisIt from the source code in my branch.

import h5py
from tqdm import tqdm
 
# Inputs are scanned in list order; the first file that provides a key wins.
input_file_names = ["phi1.xz.h5", "phi2.xz.h5"]
output_file_name = "phi.external.xz.h5"

# Entry that is physically copied instead of linked, and whose repeats in
# later inputs are skipped without printing a message.
ATTRS_KEY = "Parameters and Global Attributes"

# Top-level keys already present in the output file.
exist_keys = set()

with h5py.File(output_file_name, "w") as merged:
    for input_file_name in input_file_names:
        with h5py.File(input_file_name, "r") as source:
            for key in tqdm(source.keys(), desc=input_file_name):
                # Duplicate key: keep the first occurrence, report the rest.
                if key in exist_keys:
                    if key != ATTRS_KEY:
                        print("dataset '{}' already exist, skipping".format(key))
                    continue

                exist_keys.add(key)

                # Ordinary entries are not copied: the output only stores an
                # external link pointing back into the input file.
                if key != ATTRS_KEY:
                    merged[key] = h5py.ExternalLink(input_file_name, key)
                    continue

                # The attributes entry is copied for real so that "nioprocs"
                # can be overwritten in the output without touching the input.
                # NOTE(review): presumably 1 marks the output as a single,
                # unchunked file for the reader; confirm.
                source.copy(key, merged)
                merged[key].attrs["nioprocs"] = 1

Use the "modern" HDF5 file format

The second option is to use a modern HDF5 file format. Although not as fast as external links, it is still fast enough. Sadly, the precompiled version of VisIt ships with HDF5 1.8, so we can't use the latest format version, which is faster, by default. Despite this, the speed is significantly improved, and I'm satisfied with file format v108. With this option, you can also build VisIt from source with the latest HDF5 library in order to use the latest format.

import h5py
from tqdm import tqdm
 
# Inputs are scanned in list order; the first file that provides a key wins.
input_file_names = ["phi1.3b.xz.h5", "phi2.3b.xz.h5", "alp.3b.xz.h5"]
output_file_name = "data.3b.xz.h5"

# Entry whose "nioprocs" attribute is overwritten after copying, and whose
# repeats in later inputs are skipped without printing a message.
ATTRS_KEY = "Parameters and Global Attributes"

# Top-level keys already present in the output file.
exist_keys = set()

# libver="v108" matches what the stock VisIt build uses. "latest" is faster,
# but then VisIt has to be compiled from source.
with h5py.File(output_file_name, "w", libver="v108") as merged:
    for input_file_name in input_file_names:
        with h5py.File(input_file_name, "r") as source:
            for key in tqdm(source.keys(), desc=input_file_name):
                # Duplicate key: keep the first occurrence, report the rest.
                if key in exist_keys:
                    if key != ATTRS_KEY:
                        print("dataset '{}' already exist, skipping".format(key))
                    continue

                exist_keys.add(key)

                # Every new entry is physically copied into the output file.
                source.copy(key, merged)

                # Only the attributes entry gets its "nioprocs" overwritten.
                # NOTE(review): presumably 1 marks the output as a single,
                # unchunked file for the reader; confirm.
                if key == ATTRS_KEY:
                    merged[key].attrs["nioprocs"] = 1
import h5py
from tqdm import tqdm
 
# Inputs are scanned in list order; the first file that provides a key wins.
input_file_names = ["phi1.xz.h5", "phi2.xz.h5"]
output_file_name = "phi.latest.xz.h5"

# Entry whose "nioprocs" attribute is overwritten after copying, and whose
# repeats in later inputs are skipped without printing a message.
ATTRS_KEY = "Parameters and Global Attributes"

# Top-level keys already present in the output file.
exist_keys = set()

# The stock VisIt build uses v108. libver="latest" is faster, but reading the
# result requires a VisIt compiled from source.
with h5py.File(output_file_name, "w", libver="latest") as merged:
    for input_file_name in input_file_names:
        with h5py.File(input_file_name, "r") as source:
            for key in tqdm(source.keys(), desc=input_file_name):
                # Duplicate key: keep the first occurrence, report the rest.
                if key in exist_keys:
                    if key != ATTRS_KEY:
                        print("dataset '{}' already exist, skipping".format(key))
                    continue

                exist_keys.add(key)

                # Every new entry is physically copied into the output file.
                source.copy(key, merged)

                # Only the attributes entry gets its "nioprocs" overwritten.
                # NOTE(review): presumably 1 marks the output as a single,
                # unchunked file for the reader; confirm.
                if key == ATTRS_KEY:
                    merged[key].attrs["nioprocs"] = 1