Problem description

Speech Recognize was a Kaggle competition from one or two years ago. The task is to recognize simple spoken English words, such as "bed", "cat", and "right", recorded in a variety of acoustic environments. The training set is provided as .wav audio files and is roughly 2 GB after decompression.

Training set analysis

Speech recognition is a rather different domain from image recognition, so the first step is to understand the basic characteristics of the training set and work out what the different recordings have in common.
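The snippets below rely on a handful of libraries that the original post never shows being imported. A minimal sketch of the assumed imports (package choices and aliases are my assumption, inferred from the calls used later):

# Assumed imports for the analysis and modelling code below
import os
import gc
from os.path import join

import numpy as np
import pandas as pd
from scipy import signal
from scipy.io import wavfile

import matplotlib.pyplot as plt
import librosa
import librosa.display

import plotly.graph_objs as go
import plotly.offline as py

from keras import activations, losses, models, optimizers
from keras.layers import (Input, Dense, Dropout, Flatten,
                          BatchNormalization, Convolution2D, MaxPooling2D)
from sklearn.model_selection import train_test_split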

Amplitude and frequency

def log_specgram(audio, sample_rate, window_size=20,
                 step_size=10, eps=1e-10):
    # window and step sizes are given in milliseconds
    nperseg = int(round(window_size * sample_rate / 1e3))
    noverlap = int(round(step_size * sample_rate / 1e3))
    freqs, times, spec = signal.spectrogram(audio,
                                            fs=sample_rate,
                                            window='hann',
                                            nperseg=nperseg,
                                            noverlap=noverlap,
                                            detrend=False)
    return freqs, times, np.log(spec.T.astype(np.float32) + eps)

def plt_specgram(freqs, times, spectrogram):
    fig = plt.figure(figsize=(14, 8))
    # raw waveform, time axis in seconds
    ax1 = fig.add_subplot(211)
    ax1.set_title('Raw wave of ' + filename)
    ax1.set_ylabel('Amplitude')
    ax1.plot(np.linspace(0, len(samples) / sample_rate, num=len(samples)), samples)
    # log spectrogram
    ax2 = fig.add_subplot(212)
    ax2.imshow(spectrogram.T, aspect='auto', origin='lower',
               extent=[times.min(), times.max(), freqs.min(), freqs.max()])
    ax2.set_yticks(freqs[::16])
    ax2.set_xticks(times[::16])
    ax2.set_title('Spectrogram of ' + filename)
    ax2.set_ylabel('Freqs in Hz')
    ax2.set_xlabel('Seconds')
    plt.savefig('./output/yes_0a7c2a8d_nohash_0.png')
    plt.show()

train_audio_path = './input/train/audio/'
filename = '/yes/0a7c2a8d_nohash_0.wav'
sample_rate, samples = wavfile.read(str(train_audio_path) + filename)
# print(sample_rate, samples)

# spectrogram
freqs, times, spectrogram = log_specgram(samples, sample_rate)
plt_specgram(freqs, times, spectrogram)
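Before plotting, it can help to sanity-check a clip by ear. A small optional sketch using IPython's audio widget (only useful inside a Jupyter notebook):

# Optional: play the clip inline in a notebook to verify it by ear
from IPython.display import Audio
Audio(samples, rate=sample_rate)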

Sound intensity

def plt_spectrogram():
    plt.figure(figsize=(12, 4))
    librosa.display.specshow(log_S, sr=sample_rate, x_axis='time', y_axis='mel')
    plt.title('Mel power spectrogram')
    plt.colorbar(format='%+02.0f dB')
    plt.tight_layout()
    plt.savefig('./output/spectrogram.png')
    plt.show()

# Mel power spectrogram (librosa expects float audio, so normalize first)
sig = samples.astype(np.float32)
sig = sig / np.max(np.abs(sig))
S = librosa.feature.melspectrogram(y=sig, sr=sample_rate, n_mels=128)
log_S = librosa.power_to_db(S, ref=np.max)
plt_spectrogram()

3D view

def spectrogram_3d():
    data = [go.Surface(z=spectrogram.T)]
    layout = go.Layout(
        title='Spectrogram of "yes" in 3d',
        scene=dict(
            yaxis=dict(title='Frequencies'),
            xaxis=dict(title='Time'),
            zaxis=dict(title='Log amplitude'),
        ),
    )
    fig = go.Figure(data=data, layout=layout)
    py.plot(fig)

spectrogram_3d()

Recording counts per label

# dirs is not defined in the original snippet; list the label sub-directories here
dirs = [d for d in os.listdir(train_audio_path) if os.path.isdir(join(train_audio_path, d))]

def count_summary():
    dirs.sort()
    print('Number of labels: ' + str(len(dirs)))
    number_of_recordings = []
    for direct in dirs:
        waves = [f for f in os.listdir(join(train_audio_path, direct)) if f.endswith('.wav')]
        number_of_recordings.append(len(waves))
    speech_count = dict(zip(dirs, number_of_recordings))
    print(speech_count)

count_summary()

"""输出

{'_background_noise_': 6, 'bed': 1713, 'bird': 1731, 'cat': 1733, 'dog': 1746, 'down': 2359, 'eight': 2352, 'five': 2357, 'four': 2372, 'go': 2372, 'happy': 1742, 'house': 1750, 'left': 2353, 'marvin': 1746, 'nine': 2364, 'no': 2375, 'off': 2357, 'on': 2367, 'one': 2370, 'right': 2367, 'seven': 2377, 'sheila': 1734, 'six': 2369, 'stop': 2380, 'three': 2356, 'tree': 1733, 'two': 2373, 'up': 2375, 'wow': 1745, 'yes': 2377, 'zero': 2376}

"""

Feature profiles for each spoken word

def mean_fft():
    to_keep = 'yes no up down left right on off stop go'.split()
    dir = [d for d in dirs if d in to_keep]
    print(dir)
    for direct in dir:
        vals_all = []
        spec_all = []
        waves = [f for f in os.listdir(join(train_audio_path, direct)) if f.endswith('.wav')]
        for wav in waves:
            sample_rate, samples = wavfile.read(train_audio_path + direct + '/' + wav)
            if samples.shape[0] != 16000:
                continue
            xf, vals = custom_fft(samples, 16000)
            vals_all.append(vals)
            freqs, times, spec = log_specgram(samples, 16000)
            spec_all.append(spec)
        plt.figure(figsize=(14, 4))
        plt.subplot(121)
        plt.title('Mean fft of ' + direct)
        plt.plot(np.mean(np.array(vals_all), axis=0))
        plt.grid()
        plt.subplot(122)
        plt.title('Mean specgram of ' + direct)
        plt.imshow(np.mean(np.array(spec_all), axis=0).T, aspect='auto', origin='lower',
                   extent=[times.min(), times.max(), freqs.min(), freqs.max()])
        plt.yticks(freqs[::16])
        plt.xticks(times[::16])
        plt.savefig('./output/mean_fft_' + direct + '.png')
        plt.show()

mean_fft()

The code above outputs the feature plots for every one of these words; only the two plots for "down" and "yes" are shown here.
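mean_fft calls a custom_fft helper that the post does not show. A minimal sketch of what it likely does, returning the frequency axis and the magnitude of the one-sided FFT of a clip (the name matches the call above, but the exact scaling is an assumption):

# Assumed helper: one-sided FFT magnitude of a real-valued clip
def custom_fft(y, fs):
    T = 1.0 / fs
    N = y.shape[0]
    yf = np.fft.rfft(y)             # FFT of a real signal (positive frequencies only)
    xf = np.fft.rfftfreq(N, d=T)    # matching frequency axis in Hz
    vals = 2.0 / N * np.abs(yf)     # normalized magnitude spectrum
    return xf, vals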

Raw data preprocessing

for label, fname in zip(labels, fnames):
    sample_rate, samples = wavfile.read(os.path.join(train_data_path, label, fname))
    # pad short clips to one second; chop longer ones into 16000-sample pieces
    samples = pad_audio(samples)
    if len(samples) > 16000:
        n_samples = chop_audio(samples)
    else:
        n_samples = [samples]
    for samples in n_samples:
        # resample, then turn each clip into a log spectrogram
        resampled = signal.resample(samples, int(new_sample_rate / sample_rate * samples.shape[0]))
        _, _, specgram = log_specgram(resampled, sample_rate=new_sample_rate)
        y_train.append(label)
        x_train.append(specgram)

# stack the spectrograms into a 4-D tensor and one-hot encode the labels
x_train = np.array(x_train)
x_train = x_train.reshape(tuple(list(x_train.shape) + [1]))
y_train = label_transform(y_train)
label_index = y_train.columns.values
y_train = y_train.values
y_train = np.array(y_train)

del labels, fnames
gc.collect()
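The preprocessing loop depends on pad_audio, chop_audio, and label_transform, plus the constant new_sample_rate, none of which are shown in the post. A hedged sketch of plausible implementations, following the common pattern of forcing every clip to one second and one-hot encoding twelve target classes; the constants and exact behaviour are assumptions:

# Assumed constants and helpers for the preprocessing loop above
L = 16000                      # one second of audio at 16 kHz
new_sample_rate = 8000         # assumed; consistent with the 99x81 spectrograms the model expects
legal_labels = 'yes no up down left right on off stop go silence unknown'.split()

def pad_audio(samples):
    # pad short clips with leading zeros so every clip has exactly L samples
    if len(samples) >= L:
        return samples
    return np.pad(samples, pad_width=(L - len(samples), 0), mode='constant', constant_values=0)

def chop_audio(samples, num=20):
    # cut random L-sample windows out of long clips (e.g. the background-noise files)
    for _ in range(num):
        beg = np.random.randint(0, len(samples) - L)
        yield samples[beg: beg + L]

def label_transform(labels):
    # map raw folder names onto the 12 competition classes, then one-hot encode
    nlabels = []
    for label in labels:
        if label == '_background_noise_':
            nlabels.append('silence')
        elif label not in legal_labels:
            nlabels.append('unknown')
        else:
            nlabels.append(label)
    return pd.get_dummies(pd.Series(nlabels))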

CNN model

def model_cnn(x_train, y_train):
    input_shape = (99, 81, 1)
    nclass = 12
    inp = Input(shape=input_shape)
    norm_inp = BatchNormalization()(inp)
    img_1 = Convolution2D(8, kernel_size=2, activation=activations.relu)(norm_inp)
    img_1 = Convolution2D(8, kernel_size=2, activation=activations.relu)(img_1)
    img_1 = MaxPooling2D(pool_size=(2, 2))(img_1)
    img_1 = Dropout(rate=0.2)(img_1)
    img_1 = Convolution2D(16, kernel_size=3, activation=activations.relu)(img_1)
    img_1 = Convolution2D(16, kernel_size=3, activation=activations.relu)(img_1)
    img_1 = MaxPooling2D(pool_size=(2, 2))(img_1)
    img_1 = Dropout(rate=0.2)(img_1)
    img_1 = Convolution2D(32, kernel_size=3, activation=activations.relu)(img_1)
    img_1 = MaxPooling2D(pool_size=(2, 2))(img_1)
    img_1 = Dropout(rate=0.2)(img_1)
    img_1 = Flatten()(img_1)

    dense_1 = BatchNormalization()(Dense(128, activation=activations.relu)(img_1))
    dense_1 = BatchNormalization()(Dense(128, activation=activations.relu)(dense_1))
    dense_1 = Dense(nclass, activation=activations.softmax)(dense_1)

    model = models.Model(inputs=inp, outputs=dense_1)
    opt = optimizers.Adam()
    # Note: the original code trains with binary_crossentropy even though the output is a
    # 12-way softmax; categorical_crossentropy would be the conventional choice, but the
    # loss values in the log below were produced with binary_crossentropy, so it is kept.
    model.compile(optimizer=opt, loss=losses.binary_crossentropy)
    model.summary()

    x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.1, random_state=2017)
    model.fit(x_train, y_train, batch_size=16, validation_data=(x_valid, y_valid), epochs=3, shuffle=True, verbose=2)

    model.save(os.path.join(model_path, 'cnn.model'))
    return model

model = model_cnn(x_train, y_train)

"""输出

Using TensorFlow backend.

2019-06-15 20:59:45.453845 task begin

./input/train/audio

2019-06-15 20:59:46.634215 xy begin

/Users/user/Library/Python/3.6/lib/python/site-packages/scipy/io/wavfile.py:273: WavFileWarning: Chunk (non-data) not understood, skipping it.

WavFileWarning)

2019-06-15 21:02:35.116550 reshape begin

2019-06-15 21:02:46.166546 model begin

WARNING:tensorflow:From /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.

Instructions for updating:

Colocations handled automatically by placer.

2019-06-15 21:02:46.278603: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA

_________________________________________________________________

Layer (type) Output Shape Param #

=================================================================

input_1 (InputLayer) (None, 99, 81, 1) 0

_________________________________________________________________

batch_normalization_1 (Batch (None, 99, 81, 1) 4

_________________________________________________________________

conv2d_1 (Conv2D) (None, 98, 80, 8) 40

_________________________________________________________________

conv2d_2 (Conv2D) (None, 97, 79, 8) 264

_________________________________________________________________

max_pooling2d_1 (MaxPooling2 (None, 48, 39, 8) 0

_________________________________________________________________

dropout_1 (Dropout) (None, 48, 39, 8) 0

_________________________________________________________________

conv2d_3 (Conv2D) (None, 46, 37, 16) 1168

_________________________________________________________________

conv2d_4 (Conv2D) (None, 44, 35, 16) 2320

_________________________________________________________________

max_pooling2d_2 (MaxPooling2 (None, 22, 17, 16) 0

_________________________________________________________________

dropout_2 (Dropout) (None, 22, 17, 16) 0

_________________________________________________________________

conv2d_5 (Conv2D) (None, 20, 15, 32) 4640

_________________________________________________________________

max_pooling2d_3 (MaxPooling2 (None, 10, 7, 32) 0

_________________________________________________________________

dropout_3 (Dropout) (None, 10, 7, 32) 0

_________________________________________________________________

flatten_1 (Flatten) (None, 2240) 0

_________________________________________________________________

dense_1 (Dense) (None, 128) 286848

_________________________________________________________________

batch_normalization_2 (Batch (None, 128) 512

_________________________________________________________________

dense_2 (Dense) (None, 128) 16512

_________________________________________________________________

batch_normalization_3 (Batch (None, 128) 512

_________________________________________________________________

dense_3 (Dense) (None, 12) 1548

=================================================================

Total params: 314,368

Trainable params: 313,854

Non-trainable params: 514

_________________________________________________________________

Instructions for updating:

Use tf.cast instead.

Train on 58356 samples, validate on 6485 samples

Epoch 1/3

- 737s - loss: 0.1415 - val_loss: 0.0874

Epoch 2/3

- 608s - loss: 0.0807 - val_loss: 0.0577

Epoch 3/3

- 518s - loss: 0.0636 - val_loss: 0.0499

2019-06-15 21:33:58.518621 predict begin

"""

Prediction

# free the training tensors before loading the test data
del x_train, y_train
gc.collect()

index = []
results = []
for fnames, imgs in test_data_generator(batch=32):
    predicts = model.predict(imgs)
    predicts = np.argmax(predicts, axis=1)
    # map predicted class indices back to label names
    predicts = [label_index[p] for p in predicts]
    index.extend(fnames)
    results.extend(predicts)

# write the Kaggle submission file
df = pd.DataFrame(columns=['fname', 'label'])
df['fname'] = index
df['label'] = results
df.to_csv(os.path.join(out_path, 'sub.csv'), index=False)
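The loop above uses a test_data_generator helper that is not shown. A minimal sketch, assuming the test clips live under ./input/test/audio/ and reusing pad_audio, new_sample_rate, and log_specgram from the preprocessing section (the path and batching details are assumptions):

# Assumed helper: yield (filenames, spectrogram batch) pairs for the test clips
test_data_path = './input/test/audio/'

def test_data_generator(batch=32):
    fpaths = [f for f in os.listdir(test_data_path) if f.endswith('.wav')]
    imgs, fnames = [], []
    for fname in fpaths:
        rate, samples = wavfile.read(os.path.join(test_data_path, fname))
        samples = pad_audio(samples)
        resampled = signal.resample(samples, int(new_sample_rate / rate * samples.shape[0]))
        _, _, specgram = log_specgram(resampled, sample_rate=new_sample_rate)
        imgs.append(specgram)
        fnames.append(fname)
        if len(imgs) == batch:
            yield fnames, np.array(imgs)[..., np.newaxis]
            imgs, fnames = [], []
    if imgs:
        yield fnames, np.array(imgs)[..., np.newaxis]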

As for the test data, the archive Kaggle provides is 2-3 GB and unpacks to nearly 100,000 audio clips, which is more than a personal laptop can handle. I therefore picked only 100 clips for testing; comparing the predicted labels with what I heard when listening to the .wav files myself, the predictions were correct. I have not run prediction on the full test set, though, so the overall accuracy is unknown. I will predict on all of the data later, once I can train on a GPU.
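Since the competition test set is unlabeled (accuracy only comes from submitting sub.csv), a rough accuracy number can instead be read off a labeled hold-out split of the training data. A hedged sketch; it must run before the del x_train, y_train above, and because random_state matches the split inside model_cnn it simply reproduces the validation set used during training, so treat the figure as an optimistic estimate:

# Sketch: estimate accuracy on the labeled hold-out split instead of the unlabeled test set
_, x_hold, _, y_hold = train_test_split(x_train, y_train, test_size=0.1, random_state=2017)
pred = np.argmax(model.predict(x_hold), axis=1)
true = np.argmax(y_hold, axis=1)
print('hold-out accuracy: %.4f' % np.mean(pred == true))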

Complete code download
