TensorFlow Simple Audio Recognition: Can not squeeze dim[1], expected a dimension of 1, got 2

Stack Overflow user
Asked on 2021-02-02 01:56:32
2 answers · 920 views · 0 followers · 4 votes

TensorFlow: 2.1.0

I'm trying to make my own audio classifier using TensorFlow's simple audio recognition example, found here. I'm slowly working my way through the example.

My goal is for my classifier to detect when someone says a particular word. Currently I have two labels: one with 15 samples of the word spoken in slightly different ways, and another with 15 noise samples, each lasting about 1 second. All of these recordings are .wav files.

I have gotten to a point where I keep getting the error

tensorflow.python.framework.errors_impl.InvalidArgumentError: Can not squeeze dim[1], expected a dimension of 1, got 2 [[{{node Squeeze}}]]

on this line (121):

for waveform, label in waveform_ds.take(1):
    label = label.numpy().decode('utf-8')
    spectrogram = get_spectrogram(waveform)

Printing waveform_ds yields:

I don't know what to do at this stage, so any nudge in the right direction on how I can solve this would be appreciated. The full code is shown below.

import os
import pathlib

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import tensorflow as tf

from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras import layers
from tensorflow.keras import models
from IPython import display

def decode_audio(audio_binary):
    audio, _ = tf.audio.decode_wav(audio_binary)
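    # decode_wav returns a tensor of shape (samples, channels); the squeeze
    # below assumes a mono file, i.e. shape (samples, 1), and fails on
    # stereo input.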
    return tf.squeeze(audio, axis=-1)

def get_label(file_path):
    parts = tf.strings.split(file_path, os.path.sep)

    # Note: You'll use indexing here instead of tuple unpacking to enable this 
    # to work in a TensorFlow graph.
    return parts[-2]

def get_waveform_and_label(file_path):
    label = get_label(file_path)
    audio_binary = tf.io.read_file(file_path)
    waveform = decode_audio(audio_binary)
    return waveform, label


def get_spectrogram(waveform):
    # Padding for files with less than 16000 samples
    zero_padding = tf.zeros([16000] - tf.shape(waveform), dtype=tf.float32)

    # Concatenate audio with padding so that all audio clips will be of the 
    # same length
    waveform = tf.cast(waveform, tf.float32)
    equal_length = tf.concat([waveform, zero_padding], 0)
    spectrogram = tf.signal.stft(
      equal_length, frame_length=255, frame_step=128)
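    # With 16000 input samples, frame_length=255 and frame_step=128 produce
    # 1 + (16000 - 255) // 128 = 124 frames; fft_length defaults to the next
    # power of two (256), giving 256 // 2 + 1 = 129 frequency bins, so the
    # spectrogram has shape (124, 129).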

    spectrogram = tf.abs(spectrogram)

    return spectrogram

def plot_spectrogram(spectrogram, ax):
    # Convert the frequencies to a log scale and transpose so that time is
    # represented on the x-axis (columns).
    log_spec = np.log(spectrogram.T)
    height = log_spec.shape[0]
    X = np.arange(16000, step=height + 1)
    Y = range(height)
    ax.pcolormesh(X, Y, log_spec)
    
def get_spectrogram_and_label_id(audio, label):
    spectrogram = get_spectrogram(audio)
    spectrogram = tf.expand_dims(spectrogram, -1)
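    # 'label == commands' broadcasts the string label against every command
    # name; argmax of the resulting boolean vector is the index of the match.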
    label_id = tf.argmax(label == commands)
    return spectrogram, label_id

def preprocess_dataset(files):
    files_ds = tf.data.Dataset.from_tensor_slices(files)
    output_ds = files_ds.map(get_waveform_and_label, num_parallel_calls=AUTOTUNE)
    output_ds = output_ds.map(
      get_spectrogram_and_label_id,  num_parallel_calls=AUTOTUNE)
    return output_ds

# Set seed for experiment reproducibility
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)

data_dir = pathlib.Path('D:\\WordAudioAI\\training_data') # have two folders inside this directory: noise and the word

commands = np.array(tf.io.gfile.listdir(str(data_dir)))
commands = commands[commands != 'README.md']
print('Commands:', commands)

filenames = tf.io.gfile.glob(str(data_dir) + '/*/*')
filenames = tf.random.shuffle(filenames)
num_samples = len(filenames)
print('Number of total examples:', num_samples)
print('Number of examples per label:', len(tf.io.gfile.listdir(str(data_dir/commands[0]))))
print('Example file tensor:', filenames[0])

# Split the files into training, validation and test sets using a 80:10:10 ratio, respectively.

train_ratio = int(len(filenames)*0.8)
val_ratio = int(len(filenames)*0.1)

train_files = filenames[:train_ratio]
val_files = filenames[train_ratio: train_ratio + val_ratio]
test_files = filenames[-val_ratio:]

print('Training set size', len(train_files))
print('Validation set size', len(val_files))
print('Test set size', len(test_files))

AUTOTUNE = tf.data.experimental.AUTOTUNE
files_ds = tf.data.Dataset.from_tensor_slices(train_files)
waveform_ds = files_ds.map(get_waveform_and_label, num_parallel_calls=AUTOTUNE)

# Let's examine a few audio waveforms with their corresponding labels

#rows = 3
#cols = 3
#n = rows*cols
#fig, axes = plt.subplots(rows, cols, figsize=(10, 12))
#for i, (audio, label) in enumerate(waveform_ds.take(n)):
    #r = i // cols
    #c = i % cols
    #ax = axes[r][c]
    #ax.plot(audio.numpy())
    #ax.set_yticks(np.arange(-1.2, 1.2, 0.2))
    #label = label.numpy().decode('utf-8')
    #ax.set_title(label)

#plt.show()

# ***PROBLEM AREA***
print(waveform_ds)
for waveform, label in waveform_ds.take(1):
    label = label.numpy().decode('utf-8')
    spectrogram = get_spectrogram(waveform)

print('Label:', label)
print('Waveform shape:', waveform.shape)
print('Spectrogram shape:', spectrogram.shape)
print('Audio playback')
display.display(display.Audio(waveform, rate=16000))

fig, axes = plt.subplots(2, figsize=(12, 8))
timescale = np.arange(waveform.shape[0])
axes[0].plot(timescale, waveform.numpy())
axes[0].set_title('Waveform')
axes[0].set_xlim([0, 16000])
plot_spectrogram(spectrogram.numpy(), axes[1])
axes[1].set_title('Spectrogram')
plt.show()

spectrogram_ds = waveform_ds.map(
    get_spectrogram_and_label_id, num_parallel_calls=AUTOTUNE)

# Examine the spectrogram "images" for different samples of the dataset.

#rows = 3
#cols = 3
#n = rows*cols
#fig, axes = plt.subplots(rows, cols, figsize=(10, 10))
#for i, (spectrogram, label_id) in enumerate(spectrogram_ds.take(n)):
    #r = i // cols
    #c = i % cols
    #ax = axes[r][c]
    #plot_spectrogram(np.squeeze(spectrogram.numpy()), ax)
    #ax.set_title(commands[label_id.numpy()])
    #ax.axis('off')

#plt.show()

train_ds = spectrogram_ds
val_ds = preprocess_dataset(val_files)
test_ds = preprocess_dataset(test_files)

#batch_size = 64
#train_ds = train_ds.batch(batch_size)
#val_ds = val_ds.batch(batch_size)

train_ds = train_ds.cache().prefetch(AUTOTUNE)
val_ds = val_ds.cache().prefetch(AUTOTUNE)

for spectrogram, _ in spectrogram_ds.take(1):
    input_shape = spectrogram.shape
print('Input shape:', input_shape)
num_labels = len(commands)

norm_layer = preprocessing.Normalization()
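# adapt() computes the mean and variance of the training spectrograms so the
# Normalization layer can standardize each input.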
norm_layer.adapt(spectrogram_ds.map(lambda x, _: x))

model = models.Sequential([
    layers.Input(shape=input_shape),
    preprocessing.Resizing(32, 32), 
    norm_layer,
    layers.Conv2D(32, 3, activation='relu'),
    layers.Conv2D(64, 3, activation='relu'),
    layers.MaxPooling2D(),
    layers.Dropout(0.25),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(num_labels),
])

model.summary()

model.compile(
    optimizer=tf.keras.optimizers.Adam(),
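    # from_logits=True matches the final Dense layer, which outputs raw
    # logits (no softmax activation).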
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

EPOCHS = 10
history = model.fit(
    train_ds, 
    validation_data=val_ds,  
    epochs=EPOCHS,
    callbacks=tf.keras.callbacks.EarlyStopping(verbose=1, patience=2),
)

metrics = history.history
plt.plot(history.epoch, metrics['loss'], metrics['val_loss'])
plt.legend(['loss', 'val_loss'])
plt.show()

test_audio = []
test_labels = []

for audio, label in test_ds:
    test_audio.append(audio.numpy())
    test_labels.append(label.numpy())

test_audio = np.array(test_audio)
test_labels = np.array(test_labels)

y_pred = np.argmax(model.predict(test_audio), axis=1)
y_true = test_labels

test_acc = sum(y_pred == y_true) / len(y_true)
print(f'Test set accuracy: {test_acc:.0%}')

confusion_mtx = tf.math.confusion_matrix(y_true, y_pred) 
plt.figure(figsize=(10, 8))
sns.heatmap(confusion_mtx, xticklabels=commands, yticklabels=commands, annot=True, fmt='g')
plt.xlabel('Prediction')
plt.ylabel('Label')
plt.show()

sample_file = data_dir/'no/01bb6a2a_nohash_0.wav'

sample_ds = preprocess_dataset([str(sample_file)])

for spectrogram, label in sample_ds.batch(1):
    prediction = model(spectrogram)
    plt.bar(commands, tf.nn.softmax(prediction[0]))
    plt.title(f'Predictions for "{commands[label[0]]}"')
    plt.show()

2 Answers

Stack Overflow user
Answered on 2021-03-02 18:34:03

You can try adding desired_channels=1, like this:

def decode_audio(audio_binary):
    audio, _ = tf.audio.decode_wav(audio_binary, desired_channels=1)
    return tf.squeeze(audio, axis=-1)
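
Here desired_channels=1 asks the decoder to return a single channel. If you would rather down-mix stereo to mono yourself (an untested sketch, not part of the answer above), averaging the channels also removes the need for the squeeze:

def decode_audio(audio_binary):
    audio, _ = tf.audio.decode_wav(audio_binary)
    # Average over the channel axis: (samples, channels) -> (samples,)
    return tf.reduce_mean(audio, axis=-1)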
Votes: 3

Stack Overflow user
Answered on 2021-02-02 19:35:37

After investigating, I found out that this was because my .wav files were dual channel (stereo) rather than single channel (mono).
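
To illustrate (a minimal sketch of the failure mode, not from the original post): tf.audio.decode_wav returns a tensor of shape (samples, channels), and tf.squeeze(audio, axis=-1) only succeeds when channels == 1:

import tensorflow as tf

mono = tf.zeros([16000, 1])
print(tf.squeeze(mono, axis=-1).shape)  # (16000,)

stereo = tf.zeros([16000, 2])  # what a 2-channel WAV decodes to
tf.squeeze(stereo, axis=-1)    # raises InvalidArgumentError: Can not squeeze
                               # dim[1], expected a dimension of 1, got 2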

If anyone knows an easier or better way to train an audio classifier, please let me know :)
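
If it helps, here is a quick way to find the offending files before training (a sketch using Python's standard wave module, assuming the directory layout from the question):

import pathlib
import wave

data_dir = pathlib.Path('D:\\WordAudioAI\\training_data')
for path in data_dir.glob('*/*.wav'):
    with wave.open(str(path), 'rb') as w:
        if w.getnchannels() != 1:
            print(path, '->', w.getnchannels(), 'channels')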

Votes: 1
Content from Stack Overflow. Original question: https://stackoverflow.com/questions/66002660
