# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================

import os
import math
import argparse
import numpy as np
import cntk as C

from InceptionV3 import inception_v3_norm_model

# default Paths relative to current python file.
abs_path    = os.path.dirname(os.path.abspath(__file__))
data_path   = os.path.join(abs_path, "..", "..", "..", "..", "DataSets", "ImageNet", "test_data")
config_path = abs_path
model_path  = os.path.join(abs_path, "Models")
log_dir     = None

# model dimensions
IMAGE_HEIGHT = 299
IMAGE_WIDTH  = 299
NUM_CHANNELS = 3  # RGB
NUM_CLASSES  = 1000
model_name   = "InceptionV3.model"

# Create a minibatch source.
def create_image_mb_source(map_file, is_training, total_number_of_samples):
    if not os.path.exists(map_file):
        raise RuntimeError("File '%s' does not exist." % (map_file))

    # transformation pipeline for the features has jitter/crop only when training
    transforms = []
    if is_training:
        transforms += [
            C.io.transforms.crop(crop_type='randomarea', area_ratio=(0.05, 1.0),
                                 aspect_ratio=(0.75, 1.0), jitter_type='uniratio'),  # train uses jitter
            C.io.transforms.scale(width=IMAGE_WIDTH, height=IMAGE_HEIGHT, channels=NUM_CHANNELS,
                                  interpolations='linear'),
            C.io.transforms.color(brightness_radius=0.125, contrast_radius=0.5, saturation_radius=0.5)
        ]
    else:
        transforms += [
            C.io.transforms.crop(crop_type='center', side_ratio=0.875),  # test has no jitter
            C.io.transforms.scale(width=IMAGE_WIDTH, height=IMAGE_HEIGHT, channels=NUM_CHANNELS,
                                  interpolations='linear')
        ]

    # deserializer
    return C.io.MinibatchSource(
        C.io.ImageDeserializer(map_file, C.io.StreamDefs(
            features=C.io.StreamDef(field='image', transforms=transforms),  # first column in map file is referred to as 'image'
            labels=C.io.StreamDef(field='label', shape=NUM_CLASSES))),      # and second as 'label'
        randomize=is_training,
        max_samples=total_number_of_samples,
        multithreaded_deserializer=True)
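
# Illustrative note (an assumption about usage, not part of the original sample): the map
# file read by ImageDeserializer is plain text with one "<image path><TAB><label index>"
# entry per line, e.g. (placeholder path):
#   /data/ImageNet/train/n01440764/n01440764_10026.JPEG    0
# A quick smoke test of the reader is to pull one small minibatch:
#
#   src = create_image_mb_source('train_map.txt', is_training=True, total_number_of_samples=64)
#   mb  = src.next_minibatch(4)
#   print(mb[src.streams.features].num_samples)   # -> 4; .asarray() exposes the dense pixel data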

# Create the network.
def create_inception_v3():
    # Input variables denoting the features and label data
    feature_var = C.ops.input_variable((NUM_CHANNELS, IMAGE_HEIGHT, IMAGE_WIDTH))
    label_var = C.ops.input_variable((NUM_CLASSES))

    drop_rate = 0.2
    bn_time_const = 4096
    out = inception_v3_norm_model(feature_var, NUM_CLASSES, drop_rate, bn_time_const)

    # loss and metric
    aux_weight = 0.3
    ce_aux = C.losses.cross_entropy_with_softmax(out['aux'], label_var)
    ce_z = C.losses.cross_entropy_with_softmax(out['z'], label_var)
    ce = C.ops.plus(C.ops.element_times(ce_aux, aux_weight), ce_z)
    pe = C.metrics.classification_error(out['z'], label_var)
    pe5 = C.metrics.classification_error(out['z'], label_var, topN=5)

    C.logging.log_number_of_parameters(out['z'])
    print()

    return {
        'feature'   : feature_var,
        'label'     : label_var,
        'ce'        : ce,
        'pe'        : pe,
        'pe5'       : pe5,
        'output'    : out['z'],
        'outputAux' : out['aux']
    }

# Create trainer
def create_trainer(network, epoch_size, num_epochs, minibatch_size):
    # CNTK weights the new gradient by (1-momentum) for unit gain,
    # thus we divide Caffe's learning rate by (1-momentum)
    initial_learning_rate = 0.45   # equal to 0.045 in caffe
    initial_learning_rate *= minibatch_size / 32
    learn_rate_adjust_interval = 2
    learn_rate_decrease_factor = 0.94

    # Set learning parameters
    lr_per_mb = []
    learning_rate = initial_learning_rate
    for i in range(0, num_epochs, learn_rate_adjust_interval):
        lr_per_mb.extend([learning_rate] * learn_rate_adjust_interval)
        learning_rate *= learn_rate_decrease_factor

    lr_schedule = C.learners.learning_parameter_schedule(lr_per_mb, epoch_size=epoch_size)
    mm_schedule = C.learners.momentum_schedule(0.9)
    l2_reg_weight = 0.0001   # CNTK L2 regularization is per sample, thus same as Caffe

    # Create learner
    learner = C.learners.nesterov(network['ce'].parameters, lr_schedule, mm_schedule,
                                  l2_regularization_weight=l2_reg_weight)

    # Create trainer
    return C.train.Trainer(network['output'], (network['ce'], network['pe']), learner)

# Train and test
def train_and_test(network, trainer, train_source, test_source, progress_printer, max_epochs,
                   minibatch_size, epoch_size, restore, profiler_dir, testing_parameters):
    # define mapping from input streams to network inputs
    input_map = {
        network['feature']: train_source.streams.features,
        network['label']: train_source.streams.labels
    }

    # perform model training
    if profiler_dir:
        C.debugging.start_profiler(profiler_dir, True)

    for epoch in range(max_epochs):                                          # loop over epochs
        sample_count = 0
        while sample_count < epoch_size:                                     # loop over minibatches in the epoch
            data = train_source.next_minibatch(min(minibatch_size, epoch_size - sample_count),
                                               input_map=input_map)          # fetch minibatch.
            trainer.train_minibatch(data)                                    # update model with it
            sample_count += trainer.previous_minibatch_sample_count          # count samples processed so far
            progress_printer.update_with_trainer(trainer, with_metric=True)  # log progress
        progress_printer.epoch_summary(with_metric=True)
        network['output'].save(os.path.join(model_path, "InceptionV3_{}.model".format(epoch)))
        if profiler_dir:
            C.debugging.enable_profiler()                                    # begin to collect profiler data after first epoch

    if profiler_dir:
        C.debugging.stop_profiler()
    # Finished

    # Evaluation parameters
    test_epoch_size, test_minibatch_size = testing_parameters

    # process minibatches and evaluate the model
    metric_numer = 0
    metric_denom = 0
    sample_count = 0
    minibatch_index = 0

    while sample_count < test_epoch_size:
        current_minibatch = min(test_minibatch_size, test_epoch_size - sample_count)
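        # Note: trainer.test_minibatch returns the average evaluation-criterion value
        # (here, top-1 classification error) over the samples in the minibatch, so it is
        # scaled by the minibatch size before being accumulated into metric_numer.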
        data = test_source.next_minibatch(current_minibatch, input_map=input_map)  # fetch the next test minibatch
        metric_numer += trainer.test_minibatch(data) * current_minibatch
        metric_denom += current_minibatch
        # Keep track of the number of samples processed so far.
        sample_count += data[network['label']].num_samples
        minibatch_index += 1

    print("")
    print("Final Results: Minibatch[1-{}]: errs = {:0.2f}% * {}".format(minibatch_index, (metric_numer*100.0)/metric_denom, metric_denom))
    print("")

    return metric_numer/metric_denom

# Train and evaluate the network.
def inception_v3_train_and_eval(train_data, test_data, minibatch_size=32, epoch_size=1281167, max_epochs=300,
                                restore=True, log_to_file=None, num_mbs_per_log=100, gen_heartbeat=False,
                                profiler_dir=None, testing_parameters=(5000, 32)):
    C.debugging.set_computation_network_trace_level(1)

    progress_printer = C.logging.ProgressPrinter(
        freq=num_mbs_per_log,
        tag='Training',
        log_to_file=log_to_file,
        gen_heartbeat=gen_heartbeat,
        num_epochs=max_epochs)

    network = create_inception_v3()
    trainer = create_trainer(network, epoch_size, max_epochs, minibatch_size)
    train_source = create_image_mb_source(train_data, True, total_number_of_samples=max_epochs * epoch_size)
    test_source = create_image_mb_source(test_data, False, total_number_of_samples=C.io.FULL_DATA_SWEEP)
    return train_and_test(network, trainer, train_source, test_source, progress_printer, max_epochs,
                          minibatch_size, epoch_size, restore, profiler_dir, testing_parameters)

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-datadir', '--datadir', help='Data directory where the ImageNet dataset is located', required=False, default=data_path)
    parser.add_argument('-configdir', '--configdir', help='Config directory where this python script is located', required=False, default=config_path)
    parser.add_argument('-outputdir', '--outputdir', help='Output directory for checkpoints and models', required=False, default=None)
    parser.add_argument('-profilerdir', '--profilerdir', help='Directory for saving profiler output', required=False, default=None)
    parser.add_argument('-logdir', '--logdir', help='Log file', required=False, default=None)
    parser.add_argument('-n', '--num_epochs', help='Total number of epochs to train', type=int, required=False, default=300)
    parser.add_argument('-m', '--minibatch_size', help='Minibatch size', type=int, required=False, default=32)
    parser.add_argument('-e', '--epoch_size', help='Epoch size', type=int, required=False, default=1281167)
    parser.add_argument('-r', '--restart',
                        help='Restart training from scratch (instead of restoring from a checkpoint file, which is the default)',
                        action='store_true')
    parser.add_argument('-device', '--device', type=int, help="Force to run the script on a specified device", required=False, default=None)

    args = vars(parser.parse_args())

    if args['outputdir'] is not None:
        model_path = args['outputdir'] + "/models"
    if args['logdir'] is not None:
        log_dir = args['logdir']
    if args['profilerdir'] is not None:
        profiler_dir = args['profilerdir']
    if args['device'] is not None:
        C.device.try_set_default_device(C.device.gpu(args['device']))

    data_path = args['datadir']
    if not os.path.isdir(data_path):
        raise RuntimeError("Directory %s does not exist" % data_path)
    os.chdir(data_path)

    train_data = os.path.join(data_path, 'train_map.txt')
    test_data = os.path.join(data_path, 'val_map.txt')
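    # Illustrative invocation (script name and paths below are placeholders, not from the
    # original sample); train_map.txt / val_map.txt must follow the ImageDeserializer
    # map-file format noted above:
    #   python <this_script>.py -datadir /data/ImageNet -m 64 -n 100 -logdir train.log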
    inception_v3_train_and_eval(train_data, test_data,
                                minibatch_size=args['minibatch_size'],
                                epoch_size=args['epoch_size'],
                                max_epochs=args['num_epochs'],
                                restore=not args['restart'],
                                log_to_file=args['logdir'],
                                num_mbs_per_log=100,
                                gen_heartbeat=True,
                                profiler_dir=args['profilerdir'])
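
# Illustrative sketch (an assumption, not part of the original sample): scoring a saved
# epoch checkpoint with the plain CNTK eval API. `batch` is assumed to be a float32
# numpy array of shape (N, 3, 299, 299) holding preprocessed 299x299 RGB images.
#
#   import numpy as np
#   import cntk as C
#   z = C.load_model(os.path.join(model_path, "InceptionV3_0.model"))  # model saved by train_and_test
#   probs = C.softmax(z).eval({z.arguments[0]: batch})                 # (N, NUM_CLASSES) class probabilities
#   top1 = np.argmax(probs, axis=-1)                                   # predicted class index per image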