# Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Provides data for the ImageNet ILSVRC 2012 Dataset plus some bounding boxes. Some images have one or more bounding boxes associated with the label of the image. See details here: http://image-net.org/download-bboxes ImageNet is based upon WordNet 3.0. To uniquely identify a synset, we use "WordNet ID" (wnid), which is a concatenation of POS ( i.e. part of speech ) and SYNSET OFFSET of WordNet. For more information, please refer to the WordNet documentation[http://wordnet.princeton.edu/wordnet/documentation/]. "There are bounding boxes for over 3000 popular synsets available. For each synset, there are on average 150 images with bounding boxes." WARNING: Don't use for object detection, in this case all the bounding boxes of the image belong to just one class. """ from __future__ import absolute_import from __future__ import division from __future__ import print_function import os from six.moves import urllib import tensorflow as tf from datasets import dataset_utils slim = tf.contrib.slim # TODO(nsilberman): Add tfrecord file type once the script is updated. _FILE_PATTERN = '%s-*' _SPLITS_TO_SIZES = { 'train': 1281167, 'validation': 50000, } _ITEMS_TO_DESCRIPTIONS = { 'image': 'A color image of varying height and width.', 'label': 'The label id of the image, integer between 0 and 999', 'label_text': 'The text of the label.', 'object/bbox': 'A list of bounding boxes.', 'object/label': 'A list of labels, one per each object.', } _NUM_CLASSES = 1001 def create_readable_names_for_imagenet_labels(): """Create a dict mapping label id to human readable string. Returns: labels_to_names: dictionary where keys are integers from to 1000 and values are human-readable names. We retrieve a synset file, which contains a list of valid synset labels used by ILSVRC competition. There is one synset one per line, eg. # n01440764 # n01443537 We also retrieve a synset_to_human_file, which contains a mapping from synsets to human-readable names for every synset in Imagenet. These are stored in a tsv format, as follows: # n02119247 black fox # n02119359 silver fox We assign each synset (in alphabetical order) an integer, starting from 1 (since 0 is reserved for the background class). Code is based on https://github.com/tensorflow/models/blob/master/research/inception/inception/data/build_imagenet_data.py#L463 """ # pylint: disable=g-line-too-long base_url = 'https://raw.githubusercontent.com/tensorflow/models/master/research/inception/inception/data/' synset_url = '{}/imagenet_lsvrc_2015_synsets.txt'.format(base_url) synset_to_human_url = '{}/imagenet_metadata.txt'.format(base_url) filename, _ = urllib.request.urlretrieve(synset_url) synset_list = [s.strip() for s in open(filename).readlines()] num_synsets_in_ilsvrc = len(synset_list) assert num_synsets_in_ilsvrc == 1000 filename, _ = urllib.request.urlretrieve(synset_to_human_url) synset_to_human_list = open(filename).readlines() num_synsets_in_all_imagenet = len(synset_to_human_list) assert num_synsets_in_all_imagenet == 21842 synset_to_human = {} for s in synset_to_human_list: parts = s.strip().split('\t') assert len(parts) == 2 synset = parts[0] human = parts[1] synset_to_human[synset] = human label_index = 1 labels_to_names = {0: 'background'} for synset in synset_list: name = synset_to_human[synset] labels_to_names[label_index] = name label_index += 1 return labels_to_names def get_split(split_name, dataset_dir, file_pattern=None, reader=None): """Gets a dataset tuple with instructions for reading ImageNet. Args: split_name: A train/test split name. dataset_dir: The base directory of the dataset sources. file_pattern: The file pattern to use when matching the dataset sources. It is assumed that the pattern contains a '%s' string so that the split name can be inserted. reader: The TensorFlow reader type. Returns: A `Dataset` namedtuple. Raises: ValueError: if `split_name` is not a valid train/test split. """ if split_name not in _SPLITS_TO_SIZES: raise ValueError('split name %s was not recognized.' % split_name) if not file_pattern: file_pattern = _FILE_PATTERN file_pattern = os.path.join(dataset_dir, file_pattern % split_name) # Allowing None in the signature so that dataset_factory can use the default. if reader is None: reader = tf.TFRecordReader keys_to_features = { 'image/encoded': tf.FixedLenFeature( (), tf.string, default_value=''), 'image/format': tf.FixedLenFeature( (), tf.string, default_value='jpeg'), 'image/class/label': tf.FixedLenFeature( [], dtype=tf.int64, default_value=-1), 'image/class/text': tf.FixedLenFeature( [], dtype=tf.string, default_value=''), 'image/object/bbox/xmin': tf.VarLenFeature( dtype=tf.float32), 'image/object/bbox/ymin': tf.VarLenFeature( dtype=tf.float32), 'image/object/bbox/xmax': tf.VarLenFeature( dtype=tf.float32), 'image/object/bbox/ymax': tf.VarLenFeature( dtype=tf.float32), 'image/object/class/label': tf.VarLenFeature( dtype=tf.int64), } items_to_handlers = { 'image': slim.tfexample_decoder.Image('image/encoded', 'image/format'), 'label': slim.tfexample_decoder.Tensor('image/class/label'), 'label_text': slim.tfexample_decoder.Tensor('image/class/text'), 'object/bbox': slim.tfexample_decoder.BoundingBox( ['ymin', 'xmin', 'ymax', 'xmax'], 'image/object/bbox/'), 'object/label': slim.tfexample_decoder.Tensor('image/object/class/label'), } decoder = slim.tfexample_decoder.TFExampleDecoder( keys_to_features, items_to_handlers) labels_to_names = None if dataset_utils.has_labels(dataset_dir): labels_to_names = dataset_utils.read_label_file(dataset_dir) else: labels_to_names = create_readable_names_for_imagenet_labels() dataset_utils.write_label_file(labels_to_names, dataset_dir) return slim.dataset.Dataset( data_sources=file_pattern, reader=reader, decoder=decoder, num_samples=_SPLITS_TO_SIZES[split_name], items_to_descriptions=_ITEMS_TO_DESCRIPTIONS, num_classes=_NUM_CLASSES, labels_to_names=labels_to_names)