/** * @license * Copyright 2019 Google LLC. All Rights Reserved. * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ============================================================================= */ import * as tf from '@tensorflow/tfjs-core'; import * as tfl from '@tensorflow/tfjs-layers'; import * as fs from 'fs'; /// import * as wav from 'node-wav'; import * as path from 'path'; import {Dataset} from './dataset'; import {WavFileFeatureExtractor} from './wav_file_feature_extractor'; /** * Audio Model that creates tf.Model for a fix amount of labels. It requires a * feature extractor to convert the audio stream into input tensors for the * internal tf.Model. * It provide datasets loading, training, and model saving functions. */ export class AudioModel { private model: tfl.LayersModel; /** * * @param inputShape Input tensor shape. * @param labels Audio command label list * @param dataset Dataset class to store the loaded data. * @param featureExtractor converter to extractor features from audio stream * as input tensors */ constructor( inputShape: number[], private labels: string[], private dataset: Dataset, private featureExtractor: WavFileFeatureExtractor) { this.featureExtractor.config({ melCount: 40, bufferLength: 480, hopLength: 160, targetSr: 16000, isMfccEnabled: true, duration: 1.0 }); this.model = this.createModel(inputShape); } private createModel(inputShape: number[]): tfl.LayersModel { const model = tfl.sequential(); model.add(tfl.layers.conv2d( {filters: 8, kernelSize: [4, 2], activation: 'relu', inputShape})); model.add(tfl.layers.maxPooling2d({poolSize: [2, 2], strides: [2, 2]})); model.add(tfl.layers.conv2d( {filters: 32, kernelSize: [4, 2], activation: 'relu'})); model.add(tfl.layers.maxPooling2d({poolSize: [2, 2], strides: [2, 2]})); model.add(tfl.layers.conv2d( {filters: 32, kernelSize: [4, 2], activation: 'relu'})); model.add(tfl.layers.maxPooling2d({poolSize: [2, 2], strides: [2, 2]})); model.add(tfl.layers.conv2d( {filters: 32, kernelSize: [4, 2], activation: 'relu'})); model.add(tfl.layers.maxPooling2d({poolSize: [2, 2], strides: [1, 2]})); model.add(tfl.layers.flatten({})); model.add(tfl.layers.dropout({rate: 0.25})); model.add(tfl.layers.dense({units: 2000, activation: 'relu'})); model.add(tfl.layers.dropout({rate: 0.5})); model.add( tfl.layers.dense({units: this.labels.length, activation: 'softmax'})); model.compile({ loss: 'categoricalCrossentropy', optimizer: tf.train.sgd(0.01), metrics: ['accuracy'] }); model.summary(); return model; } /** * Load all dataset for the root directory, all the subdirectories that have * matching name to the entries in model label list, contained audio files * will be converted to input tensors and stored in the dataset for training. * @param dir The root directory of the audio dataset * @param callback Callback function for display training logs */ async loadAll(dir: string, callback: Function) { const promises = []; this.labels.forEach(async (label, index) => { callback(`loading label: ${label} (${index})`); promises.push( this.loadDataArray(path.resolve(dir, label), callback).then(v => { callback(`finished loading label: ${label} (${index})`, true); return [v, index]; })); }); let allSpecs = await Promise.all(promises); allSpecs = allSpecs .map((specs, i) => { const index = specs[1]; return specs[0].map(spec => [spec, index]); }) .reduce((acc, currentValue) => acc.concat(currentValue), []); tf.util.shuffle(allSpecs); const specs = allSpecs.map(spec => spec[0]); const labels = allSpecs.map(spec => spec[1]); this.dataset.addExamples( this.melSpectrogramToInput(specs), tf.oneHot(labels, this.labels.length)); } /** * Load one dataset from directory, all contained audio files * will be converted to input tensors and stored in the dataset for training. * @param dir The directory of the audio dataset * @param label The label for the audio dataset * @param callback Callback function for display training logs */ async loadData(dir: string, label: string, callback: Function) { const index = this.labels.indexOf(label); const specs = await this.loadDataArray(dir, callback); this.dataset.addExamples( this.melSpectrogramToInput(specs), tf.oneHot(tf.fill([specs.length], index, 'int32'), this.labels.length)); } private loadDataArray(dir: string, callback: Function) { return new Promise((resolve, reject) => { fs.readdir(dir, (err, filenames) => { if (err) { reject(err); } let specs: Float32Array[][] = []; filenames.forEach((filename) => { callback('decoding ' + dir + '/' + filename + '...'); const spec = this.splitSpecs(this.decode(dir + '/' + filename)); if (!!spec) { specs = specs.concat(spec); } callback('decoding ' + dir + '/' + filename + '...done'); }); resolve(specs); }); }); } private decode(filename: string) { const result = wav.decode(fs.readFileSync(filename)); return this.featureExtractor.start(result.channelData[0]); } /** * Train the model for stored dataset. The method call be called multiple * times. * @param epochs iteration of the training * @param trainCallback */ async train(epochs?: number, trainCallback?: tfl.CustomCallbackArgs) { return this.model.fit(this.dataset.xs, this.dataset.ys, { batchSize: 64, epochs: epochs || 100, shuffle: true, validationSplit: 0.1, callbacks: trainCallback }); } /** * Save the model to the specified directory. * @param dir Directory to store the model. */ save(dir: string): Promise { return this.model.save('file://' + dir); } /** * Return the size of the dataset in string. */ size(): string { return this.dataset.xs ? `xs: ${this.dataset.xs.shape} ys: ${this.dataset.ys.shape}` : '0'; } private splitSpecs(spec: Float32Array[]) { if (spec.length >= 98) { const output = []; for (let i = 0; i <= (spec.length - 98); i += 32) { output.push(spec.slice(i, i + 98)); } return output; } return undefined; } private melSpectrogramToInput(specs: Float32Array[][]): tf.Tensor { // Flatten this spectrogram into a 2D array. const batch = specs.length; const times = specs[0].length; const freqs = specs[0][0].length; const data = new Float32Array(batch * times * freqs); console.log(data.length); for (let j = 0; j < batch; j++) { const spec = specs[j]; for (let i = 0; i < times; i++) { const mel = spec[i]; const offset = j * freqs * times + i * freqs; data.set(mel, offset); } } // Normalize the whole input to be in [0, 1]. const shape: [number, number, number, number] = [batch, times, freqs, 1]; // this.normalizeInPlace(data, 0, 1); return tf.tensor4d(data, shape); } }