I am using Keras to train a model on ImageNet 2012. With a batch size of 256 on a single GPU it trains normally, but when I switch to 6 GPUs and set the batch size to 1024, I run into out-of-memory errors (full log below).
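As far as I understand, multi_gpu_model slices every global batch across the replicas, so 1024 images should come out to roughly 170 per GPU, which matches the shape[170, ...] tensors in the OOM log below. A minimal sketch of that arithmetic (my own assumption about the slicing, not taken from the docs):

global_batch = 1024
num_gpus = 6
per_gpu = global_batch // num_gpus                    # 170 -> matches shape[170, 12, 224, 224] in the log
last_gpu = global_batch - per_gpu * (num_gpus - 1)    # 174, assuming the last replica gets the remainder
print(per_gpu, last_gpu)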
My environment:
keras-2.2.4
tensorflow-gpu-1.14.0
python-3.6.8
CUDA-10.1
Code:
Reference: https://github.com/david8862/keras-YOLOv3-model-set
import os, sys, argparse
import numpy as np
from multiprocessing import cpu_count
import tensorflow as tf  # needed for tf.device below
from multi_gpu import ParallelModel
import tensorflow.keras.backend as K
from tensorflow.keras.optimizers import Adam, SGD, RMSprop
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications.resnet50 import preprocess_input, decode_predictions
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, ReduceLROnPlateau, LearningRateScheduler, TerminateOnNaN
from tensorflow.keras.utils import multi_gpu_model
from yolo3.models.yolo3_nano import NanoNet

os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4,5"
def get_model(model_type):
    if model_type == 'nanonet':
        # build the template model on the CPU so its weights live in host memory
        # and can be shared by the GPU replicas created by multi_gpu_model
        with tf.device('/cpu:0'):
            model = NanoNet(input_shape=(224, 224, 3), weights=None)
    else:
        raise ValueError('Unsupported model type')
    return model
def main(args):
    log_dir = args.log_dir  # e.g. 'logs/'

    # prepare model
    model = get_model("nanonet")

    # support multi-gpu training
    paralleled_model = multi_gpu_model(model, gpus=6)
    if args.weights_path:
        paralleled_model.load_weights(args.weights_path)

    # callbacks for training process
    checkpoint = ModelCheckpoint(args.model_save_dir + 'ep{epoch:03d}-val_loss{val_loss:.3f}-val_acc{val_acc:.3f}-val_top_k_categorical_accuracy{val_top_k_categorical_accuracy:.3f}.h5',
        monitor='val_acc',
        mode='max',
        verbose=1,
        save_weights_only=False,
        save_best_only=True,
        period=1)
    logging = TensorBoard(log_dir=args.model_save_dir, histogram_freq=0, write_graph=False, write_grads=False, write_images=False, update_freq='batch')
    terminate_on_nan = TerminateOnNaN()
    learn_rates = [0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001]
    lr_scheduler = LearningRateScheduler(lambda epoch: learn_rates[epoch // 30])
    # data generators
    train_datagen = ImageDataGenerator(preprocessing_function=preprocess_input,
        zoom_range=0.25,
        #shear_range=0.2,
        #channel_shift_range=0.1,
        #rotation_range=0.1,
        width_shift_range=0.05,
        height_shift_range=0.05,
        horizontal_flip=True)
    test_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
    train_generator = train_datagen.flow_from_directory(
        args.train_data_path,
        target_size=(224, 224),
        batch_size=1024)
    test_generator = test_datagen.flow_from_directory(
        args.val_data_path,
        target_size=(224, 224),
        batch_size=1024)
    # get optimizer
    optimizer = get_optimizer(args.optim_type, args.learning_rate)

    # start training
    paralleled_model.compile(
        optimizer=optimizer,
        metrics=['accuracy', 'top_k_categorical_accuracy'],
        loss='categorical_crossentropy')
    paralleled_model.summary()

    print('Train on {} samples, val on {} samples, with batch size {}.'.format(train_generator.samples, test_generator.samples, args.batch_size))
    paralleled_model.fit_generator(
        train_generator,
        steps_per_epoch=train_generator.samples // args.batch_size,
        epochs=args.total_epoch,
        workers=cpu_count() - 1,  # parallelize feeding image data but leave one CPU core idle
        initial_epoch=args.init_epoch,
        use_multiprocessing=True,
        validation_data=test_generator,
        validation_steps=test_generator.samples // args.batch_size,
        callbacks=[logging, checkpoint, lr_scheduler, terminate_on_nan])

    # finally store the template (single-GPU) model
    model.save(log_dir + 'trained_final.h5')
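One thing the script above does not do is configure the TF session, so the default allocator pre-allocates nearly all memory on every visible GPU. A minimal sketch of how I could enable allow_growth before the model is built, just to see the real per-GPU usage in nvidia-smi (standard TF 1.x ConfigProto; I have not verified it changes the OOM itself):

import tensorflow as tf
import tensorflow.keras.backend as K

# Sketch only: let the allocator grow on demand instead of pre-allocating,
# so nvidia-smi shows what each replica actually uses. Must run before the
# model and session are created.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
K.set_session(tf.Session(config=config))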
Error:
2019-11-22 05:06:34.936131: I tensorflow/core/common_runtime/bfc_allocator.cc:812] 8 Chunks of size 152785920 totalling 1.14GiB
2019-11-22 05:06:34.936139: I tensorflow/core/common_runtime/bfc_allocator.cc:812] 1 Chunks of size 156898816 totalling 149.63MiB
2019-11-22 05:06:34.936147: I tensorflow/core/common_runtime/bfc_allocator.cc:812] 1 Chunks of size 166076416 totalling 158.38MiB
2019-11-22 05:06:34.936153: I tensorflow/core/common_runtime/bfc_allocator.cc:812] 15 Chunks of size 204718080 totalling 2.86GiB
2019-11-22 05:06:34.936160: I tensorflow/core/common_runtime/bfc_allocator.cc:812] 5 Chunks of size 209534976 totalling 999.14MiB
2019-11-22 05:06:34.936168: I tensorflow/core/common_runtime/bfc_allocator.cc:812] 1 Chunks of size 239148800 totalling 228.07MiB
2019-11-22 05:06:34.936177: I tensorflow/core/common_runtime/bfc_allocator.cc:812] 1 Chunks of size 245235712 totalling 233.88MiB
2019-11-22 05:06:34.936186: I tensorflow/core/common_runtime/bfc_allocator.cc:812] 1 Chunks of size 259511808 totalling 247.49MiB
2019-11-22 05:06:34.936192: I tensorflow/core/common_runtime/bfc_allocator.cc:812] 3 Chunks of size 305571840 totalling 874.25MiB
2019-11-22 05:06:34.936201: I tensorflow/core/common_runtime/bfc_allocator.cc:812] 1 Chunks of size 316582656 totalling 301.92MiB
2019-11-22 05:06:34.936207: I tensorflow/core/common_runtime/bfc_allocator.cc:812] 19 Chunks of size 409436160 totalling 7.24GiB
2019-11-22 05:06:34.936216: I tensorflow/core/common_runtime/bfc_allocator.cc:812] 2 Chunks of size 416780288 totalling 794.95MiB
2019-11-22 05:06:34.936222: I tensorflow/core/common_runtime/bfc_allocator.cc:812] 6 Chunks of size 419069952 totalling 2.34GiB
2019-11-22 05:06:34.936230: I tensorflow/core/common_runtime/bfc_allocator.cc:812] 1 Chunks of size 426586880 totalling 406.82MiB
2019-11-22 05:06:34.936239: I tensorflow/core/common_runtime/bfc_allocator.cc:816] Sum Total of in-use chunks: 28.46GiB
2019-11-22 05:06:34.936245: I tensorflow/core/common_runtime/bfc_allocator.cc:818] total_region_allocated_bytes_: 30652445440 memory_limit_: 30652445491 available bytes: 51 curr_region_allocation_bytes_: 34359738368
2019-11-22 05:06:34.936267: I tensorflow/core/common_runtime/bfc_allocator.cc:824] Stats:
Limit: 30652445491
InUse: 30563432960
MaxInUse: 30652445440
NumAllocs: 5911
MaxAllocSize: 616562688
2019-11-22 05:06:34.936554: W tensorflow/core/common_runtime/bfc_allocator.cc:319] ****************************************************************************************************
2019-11-22 05:06:34.936604: W tensorflow/core/framework/op_kernel.cc:1622] OP_REQUIRES failed at constant_op.cc:172 : Resource exhausted: OOM when allocating tensor with shape[170,12,224,224] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
Epoch 1/150
Traceback (most recent call last):
  File "yolo3/models/backbones/imagenet_training/train_imagenet.py", line 211, in <module>
    main(args)
  File "yolo3/models/backbones/imagenet_training/train_imagenet.py", line 168, in main
    callbacks=[logging, checkpoint, lr_scheduler, terminate_on_nan])
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/training.py", line 1272, in fit_generator
    steps_name='steps_per_epoch')
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/training_generator.py", line 265, in model_iteration
    batch_outs = batch_function(*batch_data)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/training.py", line 997, in train_on_batch
    outputs = self.train_function(ins)  # pylint: disable=not-callable
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/backend.py", line 3343, in __call__
    run_metadata=self.run_metadata)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py", line 1459, in __call__
    run_metadata_ptr)
tensorflow.python.framework.errors_impl.ResourceExhaustedError: 2 root error(s) found.
(0) Resource exhausted: OOM when allocating tensor with shape[170,56,56,70] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
[[{{node training/SGD/gradients/gradients/zeros_2558-0-0-TransposeNCHWToNHWC-LayoutOptimizer}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
(1) Resource exhausted: OOM when allocating tensor with shape[170,56,56,70] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
[[{{node training/SGD/gradients/gradients/zeros_2558-0-0-TransposeNCHWToNHWC-LayoutOptimizer}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
[[replica_1/nano_net/pep_block_12_preproject_BN/cond/Merge_2/_21339]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
0 successful operations.
5 derived errors ignored.
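Following the hint in the traceback, this is how I understand report_tensor_allocations_upon_oom would be wired up, assuming compile() still forwards extra kwargs such as options/run_metadata to the backend train function in this Keras version (I have not confirmed that):

import tensorflow as tf

run_options = tf.RunOptions(report_tensor_allocations_upon_oom=True)
run_metadata = tf.RunMetadata()

# Sketch only: 'options' and 'run_metadata' are assumed to be passed through
# to Session.run so the OOM message lists the live tensors.
paralleled_model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy', 'top_k_categorical_accuracy'],
    options=run_options,
    run_metadata=run_metadata)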