# Copyright 2018 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Convolution blocks for mobilenet.""" import contextlib import functools import tensorflow as tf slim = tf.contrib.slim def _fixed_padding(inputs, kernel_size, rate=1): """Pads the input along the spatial dimensions independently of input size. Pads the input such that if it was used in a convolution with 'VALID' padding, the output would have the same dimensions as if the unpadded input was used in a convolution with 'SAME' padding. Args: inputs: A tensor of size [batch, height_in, width_in, channels]. kernel_size: The kernel to be used in the conv2d or max_pool2d operation. rate: An integer, rate for atrous convolution. Returns: output: A tensor of size [batch, height_out, width_out, channels] with the input, either intact (if kernel_size == 1) or padded (if kernel_size > 1). """ kernel_size_effective = [kernel_size[0] + (kernel_size[0] - 1) * (rate - 1), kernel_size[0] + (kernel_size[0] - 1) * (rate - 1)] pad_total = [kernel_size_effective[0] - 1, kernel_size_effective[1] - 1] pad_beg = [pad_total[0] // 2, pad_total[1] // 2] pad_end = [pad_total[0] - pad_beg[0], pad_total[1] - pad_beg[1]] padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg[0], pad_end[0]], [pad_beg[1], pad_end[1]], [0, 0]]) return padded_inputs def _make_divisible(v, divisor, min_value=None): if min_value is None: min_value = divisor new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) # Make sure that round down does not go down by more than 10%. if new_v < 0.9 * v: new_v += divisor return new_v def _split_divisible(num, num_ways, divisible_by=8): """Evenly splits num, num_ways so each piece is a multiple of divisible_by.""" assert num % divisible_by == 0 assert num / num_ways >= divisible_by # Note: want to round down, we adjust each split to match the total. base = num // num_ways // divisible_by * divisible_by result = [] accumulated = 0 for i in range(num_ways): r = base while accumulated + r < num * (i + 1) / num_ways: r += divisible_by result.append(r) accumulated += r assert accumulated == num return result @contextlib.contextmanager def _v1_compatible_scope_naming(scope): if scope is None: # Create uniqified separable blocks. with tf.variable_scope(None, default_name='separable') as s, \ tf.name_scope(s.original_name_scope): yield '' else: # We use scope_depthwise, scope_pointwise for compatibility with V1 ckpts. # which provide numbered scopes. scope += '_' yield scope @slim.add_arg_scope def split_separable_conv2d(input_tensor, num_outputs, scope=None, normalizer_fn=None, stride=1, rate=1, endpoints=None, use_explicit_padding=False): """Separable mobilenet V1 style convolution. Depthwise convolution, with default non-linearity, followed by 1x1 depthwise convolution. This is similar to slim.separable_conv2d, but differs in tha it applies batch normalization and non-linearity to depthwise. This matches the basic building of Mobilenet Paper (https://arxiv.org/abs/1704.04861) Args: input_tensor: input num_outputs: number of outputs scope: optional name of the scope. Note if provided it will use scope_depthwise for deptwhise, and scope_pointwise for pointwise. normalizer_fn: which normalizer function to use for depthwise/pointwise stride: stride rate: output rate (also known as dilation rate) endpoints: optional, if provided, will export additional tensors to it. use_explicit_padding: Use 'VALID' padding for convolutions, but prepad inputs so that the output dimensions are the same as if 'SAME' padding were used. Returns: output tesnor """ with _v1_compatible_scope_naming(scope) as scope: dw_scope = scope + 'depthwise' endpoints = endpoints if endpoints is not None else {} kernel_size = [3, 3] padding = 'SAME' if use_explicit_padding: padding = 'VALID' input_tensor = _fixed_padding(input_tensor, kernel_size, rate) net = slim.separable_conv2d( input_tensor, None, kernel_size, depth_multiplier=1, stride=stride, rate=rate, normalizer_fn=normalizer_fn, padding=padding, scope=dw_scope) endpoints[dw_scope] = net pw_scope = scope + 'pointwise' net = slim.conv2d( net, num_outputs, [1, 1], stride=1, normalizer_fn=normalizer_fn, scope=pw_scope) endpoints[pw_scope] = net return net def expand_input_by_factor(n, divisible_by=8): return lambda num_inputs, **_: _make_divisible(num_inputs * n, divisible_by) @slim.add_arg_scope def expanded_conv(input_tensor, num_outputs, expansion_size=expand_input_by_factor(6), stride=1, rate=1, kernel_size=(3, 3), residual=True, normalizer_fn=None, split_projection=1, split_expansion=1, expansion_transform=None, depthwise_location='expansion', depthwise_channel_multiplier=1, endpoints=None, use_explicit_padding=False, scope=None): """Depthwise Convolution Block with expansion. Builds a composite convolution that has the following structure expansion (1x1) -> depthwise (kernel_size) -> projection (1x1) Args: input_tensor: input num_outputs: number of outputs in the final layer. expansion_size: the size of expansion, could be a constant or a callable. If latter it will be provided 'num_inputs' as an input. For forward compatibility it should accept arbitrary keyword arguments. Default will expand the input by factor of 6. stride: depthwise stride rate: depthwise rate kernel_size: depthwise kernel residual: whether to include residual connection between input and output. normalizer_fn: batchnorm or otherwise split_projection: how many ways to split projection operator (that is conv expansion->bottleneck) split_expansion: how many ways to split expansion op (that is conv bottleneck->expansion) ops will keep depth divisible by this value. expansion_transform: Optional function that takes expansion as a single input and returns output. depthwise_location: where to put depthwise covnvolutions supported values None, 'input', 'output', 'expansion' depthwise_channel_multiplier: depthwise channel multiplier: each input will replicated (with different filters) that many times. So if input had c channels, output will have c x depthwise_channel_multpilier. endpoints: An optional dictionary into which intermediate endpoints are placed. The keys "expansion_output", "depthwise_output", "projection_output" and "expansion_transform" are always populated, even if the corresponding functions are not invoked. use_explicit_padding: Use 'VALID' padding for convolutions, but prepad inputs so that the output dimensions are the same as if 'SAME' padding were used. scope: optional scope. Returns: Tensor of depth num_outputs Raises: TypeError: on inval """ with tf.variable_scope(scope, default_name='expanded_conv') as s, \ tf.name_scope(s.original_name_scope): prev_depth = input_tensor.get_shape().as_list()[3] if depthwise_location not in [None, 'input', 'output', 'expansion']: raise TypeError('%r is unknown value for depthwise_location' % depthwise_location) padding = 'SAME' if use_explicit_padding: padding = 'VALID' depthwise_func = functools.partial( slim.separable_conv2d, num_outputs=None, kernel_size=kernel_size, depth_multiplier=depthwise_channel_multiplier, stride=stride, rate=rate, normalizer_fn=normalizer_fn, padding=padding, scope='depthwise') # b1 -> b2 * r -> b2 # i -> (o * r) (bottleneck) -> o input_tensor = tf.identity(input_tensor, 'input') net = input_tensor if depthwise_location == 'input': if use_explicit_padding: net = _fixed_padding(net, kernel_size, rate) net = depthwise_func(net, activation_fn=None) if callable(expansion_size): inner_size = expansion_size(num_inputs=prev_depth) else: inner_size = expansion_size if inner_size > net.shape[3]: net = split_conv( net, inner_size, num_ways=split_expansion, scope='expand', stride=1, normalizer_fn=normalizer_fn) net = tf.identity(net, 'expansion_output') if endpoints is not None: endpoints['expansion_output'] = net if depthwise_location == 'expansion': if use_explicit_padding: net = _fixed_padding(net, kernel_size, rate) net = depthwise_func(net) net = tf.identity(net, name='depthwise_output') if endpoints is not None: endpoints['depthwise_output'] = net if expansion_transform: net = expansion_transform(expansion_tensor=net, input_tensor=input_tensor) # Note in contrast with expansion, we always have # projection to produce the desired output size. net = split_conv( net, num_outputs, num_ways=split_projection, stride=1, scope='project', normalizer_fn=normalizer_fn, activation_fn=tf.identity) if endpoints is not None: endpoints['projection_output'] = net if depthwise_location == 'output': if use_explicit_padding: net = _fixed_padding(net, kernel_size, rate) net = depthwise_func(net, activation_fn=None) if callable(residual): # custom residual net = residual(input_tensor=input_tensor, output_tensor=net) elif (residual and # stride check enforces that we don't add residuals when spatial # dimensions are None stride == 1 and # Depth matches net.get_shape().as_list()[3] == input_tensor.get_shape().as_list()[3]): net += input_tensor return tf.identity(net, name='output') def split_conv(input_tensor, num_outputs, num_ways, scope, divisible_by=8, **kwargs): """Creates a split convolution. Split convolution splits the input and output into 'num_blocks' blocks of approximately the same size each, and only connects $i$-th input to $i$ output. Args: input_tensor: input tensor num_outputs: number of output filters num_ways: num blocks to split by. scope: scope for all the operators. divisible_by: make sure that every part is divisiable by this. **kwargs: will be passed directly into conv2d operator Returns: tensor """ b = input_tensor.get_shape().as_list()[3] if num_ways == 1 or min(b // num_ways, num_outputs // num_ways) < divisible_by: # Don't do any splitting if we end up with less than 8 filters # on either side. return slim.conv2d(input_tensor, num_outputs, [1, 1], scope=scope, **kwargs) outs = [] input_splits = _split_divisible(b, num_ways, divisible_by=divisible_by) output_splits = _split_divisible( num_outputs, num_ways, divisible_by=divisible_by) inputs = tf.split(input_tensor, input_splits, axis=3, name='split_' + scope) base = scope for i, (input_tensor, out_size) in enumerate(zip(inputs, output_splits)): scope = base + '_part_%d' % (i,) n = slim.conv2d(input_tensor, out_size, [1, 1], scope=scope, **kwargs) n = tf.identity(n, scope + '_output') outs.append(n) return tf.concat(outs, 3, name=scope + '_concat')