1
vote

I have downloaded Caltech101. Its structure is:

#Caltech101 dir
    #class1 dir
        #images of class1 jpgs
    #class2 dir
        #images of class2 jpgs
    ...
    #class100 dir
        #images of class100 jpgs

My problem is that I can't keep two numpy arrays x and y, of shape (9144, 240, 180, 3) and (9144,), in memory at the same time. So my solution is to pre-allocate an h5py dataset, load the data in 2 chunks and write them to the file one after the other. Precisely:

from __future__ import print_function
import os
import glob
from scipy.misc import imread, imresize
from sklearn.utils import shuffle
import numpy as np
import h5py
from time import time


def load_chunk(images_dset, labels_dset, chunk_of_classes, counter, type_key, prev_chunk_length):
    # getting images and processing
    xtmp = []
    ytmp = []
    for label in chunk_of_classes:
        img_list = sorted(glob.glob(os.path.join(dir_name, label, "*.jpg")))
        for img in img_list:
            img = imread(img, mode='RGB')
            img = imresize(img, (240, 180))
            xtmp.append(img)
            ytmp.append(label)
        print(label, 'done')

    x = np.concatenate([arr[np.newaxis] for arr in xtmp])
    y = np.array(ytmp, dtype=type_key)
    print('x: ', type(x), np.shape(x), 'y: ', type(y), np.shape(y))

    # writing to dataset
    a = time()
    images_dset[prev_chunk_length:prev_chunk_length+x.shape[0], :, :, :] = x
    print(labels_dset.shape)
    print(y.shape, y.shape[0])
    print(type(y), y.dtype)
    print(prev_chunk_length)
    labels_dset[prev_chunk_length:prev_chunk_length+y.shape[0]] = y
    b = time()
    print('Chunk', counter, 'written in', b-a, 'seconds')
    return prev_chunk_length+x.shape[0]


def write_to_file(remove_DS_Store):
    if os.path.isfile('caltech101.h5'):
        print('File exists already')
        return
    else:
        # the name of each dir is the name of a class
        classes = os.listdir(dir_name)
        if remove_DS_Store:
            classes.pop(0)  # removes .DS_Store - may not be needed on other systems

        # need the dtype of y in order to initialize the h5 dataset
        key_type_y = 'S' + str(len(max(classes, key=len)))
        classes = np.array(classes, dtype=key_type_y)

        # number of chunks in which the dataset must be divided
        nb_chunks = 2
        nb_chunks_loaded = 0
        prev_chunk_length = 0
        # open the file and pre-allocate the datasets
        f = h5py.File('caltech101.h5', 'a')
        imgs = f.create_dataset('images', shape=(9144, 240, 180, 3), dtype='uint8')
        labels = f.create_dataset('labels', shape=(9144,), dtype=key_type_y)
        for class_sublist in np.array_split(classes, nb_chunks):
            # loading chunk by chunk in a function to avoid memory overhead
            prev_chunk_length = load_chunk(imgs, labels, class_sublist, nb_chunks_loaded, key_type_y, prev_chunk_length)
            nb_chunks_loaded += 1
        f.close()
        print('Images and labels saved to \'caltech101.h5\'')
    return

dir_name = '../Datasets/Caltech101'
write_to_file(remove_DS_Store=True)

This works quite well, and reading is also fast enough. The problem is that I need to shuffle the dataset.

Observations:

  • Shuffling the h5py dataset objects directly: obviously very slow because they're on disk.

  • Creating an array of shuffled indices and using advanced numpy indexing. This means slower reading from the file.

  • Shuffling before writing to the file would be nice; the problem is that I only have about half of the dataset in memory at a time, so I would get an improper shuffle.

Can you think of a way to shuffle before writing? I'm also open to solutions that rethink the writing process, as long as they don't use a lot of memory.


1 Answer

2
votes

You could shuffle the file paths before reading the image data.

Instead of shuffling the image data in memory, create a list of all file paths that belong to the dataset. Then shuffle the list of file paths. Now you can create your HDF5 database as before.

You could for example use glob to create the list of files for shuffling:

import glob
import random

files = glob.glob('../Datasets/Caltech101/*/*.jpg')
random.shuffle(files)  # shuffles the list in place (random.shuffle returns None)

You could then retrieve the class label and image name from the path:

import os

for file_path in files:
    label = os.path.basename(os.path.dirname(file_path))
    image_id = os.path.splitext(os.path.basename(file_path))[0]
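
Putting it together, here is a minimal sketch of how the shuffled file list could drive the chunked HDF5 write, assuming the same directory layout, image size and imread/imresize helpers as in your question (the chunk_size value is just an illustrative parameter to keep one chunk in memory at a time):

from __future__ import print_function
import glob
import os
import random

import numpy as np
import h5py
from scipy.misc import imread, imresize  # same helpers as in the question


dir_name = '../Datasets/Caltech101'

# collect and shuffle the file paths once, up front
files = glob.glob(os.path.join(dir_name, '*', '*.jpg'))
random.shuffle(files)  # in place

# labels come from the directory names, as in the question
labels = [os.path.basename(os.path.dirname(p)) for p in files]
key_type_y = 'S' + str(len(max(labels, key=len)))

n = len(files)       # 9144 for Caltech101
chunk_size = 1000    # tune so that one chunk of images fits in memory

with h5py.File('caltech101.h5', 'w') as f:
    imgs_dset = f.create_dataset('images', shape=(n, 240, 180, 3), dtype='uint8')
    labels_dset = f.create_dataset('labels', shape=(n,), dtype=key_type_y)

    for start in range(0, n, chunk_size):
        stop = min(start + chunk_size, n)
        # read only this chunk of (already shuffled) paths into memory
        x = np.empty((stop - start, 240, 180, 3), dtype='uint8')
        for i, path in enumerate(files[start:stop]):
            img = imread(path, mode='RGB')
            x[i] = imresize(img, (240, 180))
        imgs_dset[start:stop] = x
        labels_dset[start:stop] = np.array(labels[start:stop], dtype=key_type_y)
        print('rows', start, 'to', stop, 'written')

Because the paths are shuffled before anything is written, the rows on disk are already in random order, so reading contiguous slices later gives you randomly ordered samples without any fancy indexing.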