# coding=utf-8
# Copyright 2017 The Tensor2Tensor Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""SliceNet."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

# Dependency imports

from six.moves import xrange  # pylint: disable=redefined-builtin
from six.moves import zip  # pylint: disable=redefined-builtin

from tensor2tensor.layers import common_attention
from tensor2tensor.layers import common_hparams
from tensor2tensor.layers import common_layers
from tensor2tensor.utils import registry
from tensor2tensor.utils import t2t_model

import tensorflow as tf


def attention(targets_shifted, inputs_encoded, norm_fn, hparams, bias=None):
  """Complete attention layer with preprocessing."""
  separabilities = [hparams.separability, hparams.separability]
  if hparams.separability < 0:
    separabilities = [hparams.separability - 1, hparams.separability]
  targets_timed = common_layers.subseparable_conv_block(
      common_layers.add_timing_signal(targets_shifted),
      hparams.hidden_size, [((1, 1), (5, 1)), ((4, 1), (5, 1))],
      normalizer_fn=norm_fn,
      padding="LEFT",
      separabilities=separabilities,
      name="targets_time")
  if hparams.attention_type == "transformer":
    targets_timed = tf.squeeze(targets_timed, 2)
    target_shape = tf.shape(targets_timed)
    targets_segment = tf.zeros([target_shape[0], target_shape[1]])
    target_attention_bias = common_attention.attention_bias(
        targets_segment, targets_segment, lower_triangular=True)
    inputs_attention_bias = tf.zeros([
        tf.shape(inputs_encoded)[0], hparams.num_heads,
        tf.shape(targets_segment)[1],
        tf.shape(inputs_encoded)[1]
    ])
    qv = common_attention.multihead_attention(
        targets_timed,
        None,
        target_attention_bias,
        hparams.hidden_size,
        hparams.hidden_size,
        hparams.hidden_size,
        hparams.num_heads,
        hparams.attention_dropout,
        name="self_attention")
    qv = common_attention.multihead_attention(
        qv,
        inputs_encoded,
        inputs_attention_bias,
        hparams.hidden_size,
        hparams.hidden_size,
        hparams.hidden_size,
        hparams.num_heads,
        hparams.attention_dropout,
        name="encdec_attention")
    return tf.expand_dims(qv, 2)
  elif hparams.attention_type == "simple":
    targets_with_attention = common_layers.simple_attention(
        targets_timed, inputs_encoded, bias=bias)
    return norm_fn(targets_shifted + targets_with_attention, name="attn_norm")
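

# Note on shapes (a reading of the code above, not a contract): activations in
# this file are kept 4D, [batch, length, 1, hidden_size]. The "transformer"
# branch squeezes targets to 3D for multihead_attention and expands back
# before returning; the "simple" branch stays 4D throughout. Any other
# attention_type falls through and returns None; callers below guard against
# attention_type == "none" before calling this function.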


def multi_conv_res(x, padding, name, layers, hparams, mask=None, source=None):
  """A stack of separable convolution blocks with residual connections."""
  with tf.variable_scope(name):
    padding_bias = None
    if mask is not None:
      padding_bias = (1.0 - mask) * -1e9  # Bias to not attend to padding.
      if padding == "LEFT":  # Do not mask anything when left-padding.
        mask = None
    if (hparams.kernel_scheme in _KERNEL_SCHEMES and
        hparams.dilation_scheme in _DILATION_SCHEMES):
      kernels = _KERNEL_SCHEMES[hparams.kernel_scheme]
      dilations = _DILATION_SCHEMES[hparams.dilation_scheme]
      dilations_and_kernels = list(zip(dilations, kernels))
      dilations_and_kernels1 = dilations_and_kernels[:2]
      dilations_and_kernels2 = dilations_and_kernels[2:]
    else:
      k = (hparams.kernel_height, hparams.kernel_width)
      k2 = (hparams.large_kernel_size, 1)
      dilations_and_kernels1 = [((1, 1), k), ((1, 1), k)]
      dilations_and_kernels2 = [((1, 1), k2), ((4, 4), k2)]
    separabilities1 = [hparams.separability, hparams.separability]
    separabilities2 = [hparams.separability] * len(dilations_and_kernels2)
    if hparams.separability < 0:
      separabilities1 = [hparams.separability - 1, hparams.separability]
      separabilities2 = [
          hparams.separability - i
          for i in reversed(range(len(dilations_and_kernels2)))
      ]

    def norm_fn(x, name):
      with tf.variable_scope(name, default_name="norm"):
        return common_layers.apply_norm(
            x, hparams.norm_type, hparams.hidden_size, hparams.norm_epsilon)

    for layer in xrange(layers):
      with tf.variable_scope("layer_%d" % layer):
        y = common_layers.subseparable_conv_block(
            x,
            hparams.hidden_size,
            dilations_and_kernels1,
            normalizer_fn=norm_fn,
            padding=padding,
            mask=mask,
            separabilities=separabilities1,
            name="residual1")
        x += common_layers.subseparable_conv_block(
            x + y,
            hparams.hidden_size,
            dilations_and_kernels2,
            normalizer_fn=norm_fn,
            padding=padding,
            mask=mask,
            separabilities=separabilities2,
            name="residual2") + y
        if source is not None and hparams.attention_type != "none":
          x += attention(x, source, norm_fn, hparams, bias=padding_bias)
        if mask is not None:
          x *= mask
    return tf.nn.dropout(x, 1.0 - hparams.dropout)


def rank_loss(sentence_emb, image_emb, margin=0.2):
  """Experimental rank loss, thanks to kkurach@ for the code."""
  with tf.name_scope("rank_loss"):
    # Normalize first, since cosine similarity is assumed below.
    sentence_emb = tf.nn.l2_normalize(sentence_emb, 1)
    image_emb = tf.nn.l2_normalize(image_emb, 1)
    # Both sentence_emb and image_emb have size [batch, depth].
    scores = tf.matmul(image_emb, tf.transpose(sentence_emb))  # [batch, batch]
    diagonal = tf.diag_part(scores)  # [batch]
    cost_s = tf.maximum(0.0, margin - diagonal + scores)  # [batch, batch]
    cost_im = tf.maximum(
        0.0, margin - tf.reshape(diagonal, [-1, 1]) + scores)  # [batch, batch]
    # Clear the diagonals so matched pairs incur no cost.
    batch_size = tf.shape(sentence_emb)[0]
    empty_diagonal_mat = tf.ones_like(cost_s) - tf.eye(batch_size)
    cost_s *= empty_diagonal_mat
    cost_im *= empty_diagonal_mat
    return tf.reduce_mean(cost_s) + tf.reduce_mean(cost_im)


def similarity_cost(inputs_encoded, targets_encoded):
  """Loss: inputs should be more similar to their own targets than to others'."""
  # A first, very simple version: handle variable lengths by padding both
  # tensors to the same length and flattening everything into the batch.
  # A better scheme is still needed.
  x, y = common_layers.pad_to_same_length(inputs_encoded, targets_encoded)
  depth = tf.shape(inputs_encoded)[3]
  x, y = tf.reshape(x, [-1, depth]), tf.reshape(y, [-1, depth])
  return rank_loss(x, y)
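

# Elementwise, the rank loss above reads: with S[i, j] = cos(image_i,
# sentence_j) and d = diag(S),
#   cost_s[i, j]  = max(0, margin - d[j] + S[i, j])   # wrong image for j
#   cost_im[i, j] = max(0, margin - d[i] + S[i, j])   # wrong sentence for i
# so each mismatched pair is pushed at least `margin` below its matched pair,
# in both retrieval directions.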


def slicenet_middle(inputs_encoded, targets, target_space_emb, mask, hparams):
  """Middle part of slicenet, connecting encoder and decoder."""

  def norm_fn(x, name):
    with tf.variable_scope(name, default_name="norm"):
      return common_layers.apply_norm(
          x, hparams.norm_type, hparams.hidden_size, hparams.norm_epsilon)

  # Flatten targets and embed target_space_id.
  targets_flat = tf.expand_dims(common_layers.flatten4d3d(targets), axis=2)
  target_space_emb = tf.tile(target_space_emb,
                             [tf.shape(targets_flat)[0], 1, 1, 1])

  # Calculate similarity loss (but don't run if not needed).
  if len(hparams.problems) > 1 and hparams.sim_loss_mult > 0.00001:
    targets_timed = common_layers.add_timing_signal(targets_flat)
    extra_layers = int(hparams.num_hidden_layers * 1.5)
    with tf.variable_scope(tf.get_variable_scope(), reuse=True):
      targets_encoded = multi_conv_res(targets_timed, "SAME", "encoder",
                                       extra_layers, hparams)
    with tf.variable_scope("similarity_loss"):
      similarity_loss = similarity_cost(inputs_encoded, targets_encoded)
      similarity_loss *= hparams.sim_loss_mult
  else:
    similarity_loss = 0.0

  # Use attention from each target to look at input and retrieve.
  targets_shifted = common_layers.shift_right(
      targets_flat, pad_value=target_space_emb)
  if hparams.attention_type == "none":
    targets_with_attention = tf.zeros_like(targets_shifted)
  else:
    inputs_padding_bias = (1.0 - mask) * -1e9  # Bias to not attend to padding.
    targets_with_attention = attention(
        targets_shifted,
        inputs_encoded,
        norm_fn,
        hparams,
        bias=inputs_padding_bias)

  # Positional targets: merge attention and raw.
  kernel = (hparams.kernel_height, hparams.kernel_width)
  targets_merged = common_layers.subseparable_conv_block(
      tf.concat([targets_with_attention, targets_shifted], axis=3),
      hparams.hidden_size, [((1, 1), kernel)],
      normalizer_fn=norm_fn,
      padding="LEFT",
      separability=4,
      name="targets_merge")

  return targets_merged, similarity_loss


def embed_target_space(target_space_id, hidden_size):
  """Embed target_space_id and reshape it to broadcast over 4D activations."""
  target_space_emb = common_layers.embedding(
      target_space_id, 32, hidden_size, name="target_space_embedding")
  return tf.reshape(target_space_emb, [1, 1, 1, -1])


def embedding_to_padding(emb):
  """Input embeddings -> is_padding."""
  emb_sum = tf.reduce_sum(tf.abs(emb), axis=-1, keep_dims=True)
  return tf.to_float(tf.equal(emb_sum, 0.0))
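

# Example of the padding check above: token ids that embed to all-zero vectors
# are treated as padding, so for emb of shape [batch, length, 1, hidden_size]
# the result is a [batch, length, 1, 1] tensor holding 1.0 at padding
# positions and 0.0 elsewhere; the encoder below uses
# `1.0 - embedding_to_padding(inputs)` as its non-padding mask.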


def slicenet_internal(inputs, targets, target_space, problem_idx, hparams):
  """The slicenet model, main step used for training."""
  with tf.variable_scope("slicenet"):
    # Flatten inputs and encode.
    inputs = tf.expand_dims(common_layers.flatten4d3d(inputs), axis=2)
    inputs_mask = 1.0 - embedding_to_padding(inputs)
    inputs = common_layers.add_timing_signal(inputs)  # Add position info.
    target_space_emb = embed_target_space(target_space, hparams.hidden_size)
    extra_layers = int(hparams.num_hidden_layers * 1.5)
    inputs_encoded = multi_conv_res(
        inputs, "SAME", "encoder", extra_layers, hparams, mask=inputs_mask)
    target_modality_name = hparams.problems[problem_idx].target_modality.name
    if "class_label_modality" in target_modality_name:
      # If we're just predicting a class, there is no use for a decoder.
      return inputs_encoded
    # Do the middle part.
    decoder_start, similarity_loss = slicenet_middle(
        inputs_encoded, targets, target_space_emb, inputs_mask, hparams)
    # Decode.
    decoder_final = multi_conv_res(
        decoder_start,
        "LEFT",
        "decoder",
        hparams.num_hidden_layers,
        hparams,
        mask=inputs_mask,
        source=inputs_encoded)
    return decoder_final, tf.reduce_mean(similarity_loss)


@registry.register_model
class SliceNet(t2t_model.T2TModel):

  def model_fn_body(self, features):
    return slicenet_internal(features["inputs"], features["targets"],
                             features["target_space_id"], self._problem_idx,
                             self._hparams)


_KERNEL_SCHEMES = {
    "3.3.3.3": [(3, 1), (3, 1), (3, 1), (3, 1)],
    "3.7.7.7": [(3, 1), (7, 1), (7, 1), (7, 1)],
    "3.7.15.15": [(3, 1), (7, 1), (15, 1), (15, 1)],
    "3.7.15.31": [(3, 1), (7, 1), (15, 1), (31, 1)],
    "3.7.15.31.63": [(3, 1), (7, 1), (15, 1), (31, 1), (63, 1)],
}
_DILATION_SCHEMES = {
    "1.1.1.1.1": [(1, 1), (1, 1), (1, 1), (1, 1), (1, 1)],
    "1.1.1.1": [(1, 1), (1, 1), (1, 1), (1, 1)],
    "1.1.1.2": [(1, 1), (1, 1), (1, 1), (2, 1)],
    "1.1.2.4": [(1, 1), (1, 1), (2, 1), (4, 1)],
    "1.2.4.8": [(1, 1), (2, 1), (4, 1), (8, 1)],
}
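

# How the schemes combine in multi_conv_res: dilations and kernels are zipped
# positionally (zip truncates to the shorter list), e.g. kernel_scheme
# "3.7.15.31" with dilation_scheme "1.1.2.4" yields
#   [((1, 1), (3, 1)), ((1, 1), (7, 1)), ((2, 1), (15, 1)), ((4, 1), (31, 1))]
# where the first two (dilation, kernel) pairs feed the "residual1" block and
# the remainder feed "residual2".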
hparams.add_hparam("attention_dropout", 0.2) hparams.shared_embedding_and_softmax_weights = int(True) return hparams @registry.register_hparams("slicenet_1noam") def slicenet_params1_noam(): """Version with Noam's decay scheme.""" hparams = slicenet_params1() hparams.learning_rate_decay_scheme = "noam" hparams.learning_rate = 1.0 hparams.learning_rate_warmup_steps = 4000 hparams.initializer = "uniform_unit_scaling" hparams.optimizer_adam_epsilon = 1e-9 hparams.optimizer_adam_beta1 = 0.9 hparams.optimizer_adam_beta2 = 0.98 return hparams @registry.register_hparams("slicenet_1tiny") def slicenet_params1_tiny(): """Version for fast local runs.""" hparams = slicenet_params1() hparams.attention_type = "simple" hparams.separability = 0 hparams.hidden_size = 128 hparams.num_hidden_layers = 2 hparams.batch_size = 512 hparams.learning_rate_warmup_steps = 200 return hparams @registry.register_ranged_hparams("slicenet1") def slicenet_range1(ranged_hparams): """Small range of hyperparameters.""" rhp = ranged_hparams hparams = slicenet_params1() common_hparams.fill_ranged_hparams_from_hparams(hparams, rhp) rhp.set_float("clip_grad_norm", 1.0, 10.0, scale=rhp.LOG_SCALE) rhp.set_float("learning_rate", 0.02, 1.0, scale=rhp.LOG_SCALE) rhp.set_float("optimizer_adam_beta2", 0.995, 0.998) rhp.set_float("weight_decay", 1.0, 5.0)