#!/usr/bin/env python
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import argparse
import os
import glob
import multiprocessing
import json

import pandas as pd

from preprocessing_utils import parallel_preprocess

# Command-line interface. Parsing happens at import time, so this file is
# meant to be run as a script rather than imported.
parser = argparse.ArgumentParser(description='Preprocess LibriSpeech.')
parser.add_argument('--input_dir', type=str, required=True,
                    help='LibriSpeech collection input dir')
parser.add_argument('--dest_dir', type=str, required=True,
                    help='Output dir')
parser.add_argument('--output_json', type=str, default='./',
                    help='name of the output json file.')
parser.add_argument('-s', '--speed', type=float, nargs='*',
                    help='Speed perturbation ratio')
parser.add_argument('--target_sr', type=int, default=None,
                    help='Target sample rate. '
                         'defaults to the input sample rate')
parser.add_argument('--overwrite', action='store_true',
                    help='Overwrite file if exists')
parser.add_argument('--parallel', type=int, default=multiprocessing.cpu_count(),
                    help='Number of threads to use when processing audio files')
parser.add_argument('--subset_list', type=str, required=False, default=None,
                    help='A subset of files to include, otherwise all are included.')
args = parser.parse_args()

# The json manifest is written with open(args.output_json, 'w') at the very
# end of the run. The default value ('./') is a directory, so without this
# check a missing/incorrect --output_json would only be reported after all
# audio files have already been converted. Fail fast instead.
if os.path.isdir(args.output_json):
    parser.error('--output_json must be a file path, not a directory: %r'
                 % args.output_json)

# Drop trailing slashes from the directory arguments.
args.input_dir = args.input_dir.rstrip('/')
args.dest_dir = args.dest_dir.rstrip('/')

# Optional set of utterance ids to keep; None means "keep everything".
subset = None

def build_subset_dict(subset_file):
    """Read a subset list and return the set of ids it names.

    Every non-blank line of the file is treated as a file name or path. The
    directory part and the final extension are dropped, and what remains is
    added to the result.

    Args:
        subset_file: Path of a text file with one entry per line.

    Returns:
        A set of strings, one per distinct entry.
    """
    subset = set()
    with open(subset_file) as fp:
        for line in fp:
            # Strip first: otherwise the trailing newline stays attached to
            # entries that have no extension ("id\n"), and such an entry can
            # never match an id read from a transcript file.
            entry = line.strip()
            if not entry:
                # Blank lines carry no id.
                continue
            subset.add(os.path.splitext(os.path.basename(entry))[0])

    return subset

def build_input_arr(input_dir, subset):
    """Collect one record per transcript line found under ``input_dir``.

    All ``*.trans.txt`` files below ``input_dir`` (at any depth) are read.
    Each line is split at its first space into an id and a transcript.

    Args:
        input_dir: Root directory that is searched recursively.
        subset: Optional collection of ids to keep. When it is ``None`` every
            id is kept; otherwise lines whose id is not in it are skipped.

    Returns:
        A list of dicts with the keys ``input_relpath`` (directory of the
        transcript file, relative to ``input_dir``), ``input_fname`` (the id
        with ``.flac`` appended) and ``transcript`` (the rest of the line,
        unchanged, so it still ends with the line's newline).
    """
    txt_files = glob.glob(os.path.join(input_dir, '**', '*.trans.txt'),
                          recursive=True)
    input_data = []
    for txt_file in txt_files:
        # Same for every line of this transcript file, so compute it once.
        rel_dir = os.path.dirname(os.path.relpath(txt_file, input_dir))
        with open(txt_file) as fp:
            for line in fp:
                fname, _, transcript = line.partition(' ')
                # A line without a space leaves the newline inside ``fname``;
                # strip it so the generated file name stays clean.
                fname = fname.strip()
                if not fname:
                    # Blank line (or no id): nothing to record.
                    continue
                if subset is not None and fname not in subset:
                    continue
                input_data.append(dict(input_relpath=rel_dir,
                                       input_fname=fname + '.flac',
                                       transcript=transcript))
    return input_data

# Script body: optional subset filter -> scan transcripts -> convert audio
# -> write the json manifest.

# Only build the id filter when --subset_list was given; otherwise `subset`
# keeps its earlier value.
if args.subset_list is not None:
    subset_name = os.path.basename(args.subset_list)
    print("[%s] Reading subset list..." % subset_name)
    subset = build_subset_dict(args.subset_list)

# Every remaining progress message is tagged with the output json path.
log_tag = args.output_json

print("[%s] Scaning input dir..." % log_tag)
dataset = build_input_arr(input_dir=args.input_dir, subset=subset)

print("[%s] Converting audio files..." % log_tag)
dataset = parallel_preprocess(
    dataset=dataset,
    input_dir=args.input_dir,
    dest_dir=args.dest_dir,
    target_sr=args.target_sr,
    speed=args.speed,
    overwrite=args.overwrite,
    parallel=args.parallel,
)

print("[%s] Generating json..." % log_tag)
df = pd.DataFrame(dataset, dtype=object)

# Write the records with the json module: df.to_json() puts backslashes
# into file paths.
dataset = df.to_dict(orient='records')
with open(args.output_json, 'w') as fp:
    json.dump(dataset, fp, indent=2)