AI之语音转写项目实践

最近项目需要，做了一个语音转写的小demo，结合了微信小程序

前端：微信小程序，录音后上传到服务器
服务器：express作为服务器，multer作为上传中间件，ffmpeg作为格式转换工具，使用百度nodejs的sdk进行语音的转换

语音输入

考虑到需要做语音输入，如果使用HTML5的getUserMedia，兼容性会有些问题；正好最近在学微信小程序，索性就直接拿这个项目开撸了。

微信小程序的步骤

注册，获取appid 步骤
下载微信小程序开发工具
新建模板项目

在pages/index/index.wxml增加录音按钮

<!-- Recording controls: each button's bindtap names the page handler it triggers; the text node renders the page's "text" data field. -->
<button class="start-record" bindtap="startRecord">开始录音</button>
<button class="stop-record" bindtap="stopRecord">停止录音</button>
<button class="play-record" bindtap="playRecord">播放录音</button>
<button class="upload-record" bindtap="uploadRecord">上传录音</button>
<text class="voice-text">语音转换后的文字：{{text}}</text>

在pages/index/index.js中增加业务逻辑

onLoad中增加init方法，初始化recorderManager和innerAudioContext，分别是录音的管理器和播放语音的管理器

onLoad: function () {this.init()// ....
}
init: function() {const recorderManager = wx.getRecorderManager()recorderManager.onStart(() => {console.log('recorder start')})recorderManager.onPause(() => {console.log('recorder pause')})recorderManager.onStop((res) => {console.log('recorder stop', res)const { tempFilePath } = resthis.setData({recordSrc: res.tempFilePath,})})recorderManager.onFrameRecorded((res) => {const { frameBuffer } = resconsole.log('frameBuffer.byteLength', frameBuffer.byteLength)})const innerAudioContext = wx.createInnerAudioContext()innerAudioContext.autoplay = falseinnerAudioContext.onPlay(() => {console.log('开始播放')})innerAudioContext.onError((res) => {console.log(res.errMsg)console.log(res.errCode)})this.setData({recorderManager: recorderManager,innerAudioContext: innerAudioContext});
},

录音和结束录音

startRecord: function() {const options = {duration: 600000,sampleRate: 44100,numberOfChannels: 1,encodeBitRate: 192000,format: 'aac',frameSize: 50}this.data.recorderManager.start(options);
},
stopRecord: function() {this.data.recorderManager.stop();
},

为了更好的验证，播放录音:

playRecord: function() {this.data.innerAudioContext.src = this.data.recordSrcthis.data.innerAudioContext.play()
},

语音上传

语音上传客户端使用的是wx.uploadFile接口

uploadRecord: function() {let that = this;wx.showLoading({title: '上传中...'});const uploadTask = wx.uploadFile({url: 'http://localhost:8080/upload', //仅为示例，非真实的接口地址filePath: this.data.recordSrc,name: 'voice',formData: {'token': '12fdfsdadf',},success: function(res){wx.hideLoading();console.log(res);var data = JSON.parse(res.data);that.setData({text: data.text});},error: function(res) {wx.hideLoading();wx.showToast(res.msg)console.log(res);}});uploadTask.onProgressUpdate((res) => {console.log('上传进度', res.progress)console.log('已经上传的数据长度', res.totalBytesSent)console.log('预期需要上传的数据总长度', res.totalBytesExpectedToSend)})
}

这里有个地方需要注意，我们的url写的是http开头的，所以需要设置一下
勾选工具-项目详情-不校验合法域名、web-view（业务域名）、TLS 版本以及 HTTPS 证书，否则会报错

上传服务器:

const convert = require('./convert');
const voice = require('./voice');
const express = require('express');
const co = require('co');
const fs = require('fs');
const app = express();
const multer = require('multer');
const upload = multer({ dest: 'uploads/' });

/**
 * Deletes a temporary file, logging (but otherwise ignoring) any failure
 * other than the file already being gone.
 */
function removeQuietly(filePath) {
  fs.unlink(filePath, function (err) {
    if (err && err.code !== 'ENOENT') {
      console.log(err);
    }
  });
}

/**
 * POST /upload
 *
 * Accepts a multipart upload in the "voice" field, converts the first file
 * with convert(), passes the converted file to voice(), and replies with
 * { text } on success. Failures reply with { msg }: 400 when no file was
 * sent, 500 when conversion or recognition fails.
 */
app.post('/upload', upload.array('voice'), function (req, res, next) {
  const file = req.files && req.files[0];
  if (!file) {
    res.status(400).json({ msg: 'no voice file uploaded' });
    return;
  }

  let convertedPath = null;

  convert(file.path, file.filename)
    .then(function (fpath) {
      convertedPath = fpath;
      return voice(fpath);
    })
    .then(function (result) {
      console.log(result.result[0]);
      res.status(200).json({
        text: result.result[0],
      });
    })
    .catch(function (err) {
      console.log(err);
      // An Error instance serialises to {} in JSON, so send its message instead.
      res.status(500).json({
        msg: err instanceof Error ? err.message : err,
      });
    })
    .then(function () {
      // Neither the upload nor the converted file is needed after the reply.
      removeQuietly(file.path);
      if (convertedPath !== null) {
        removeQuietly(convertedPath);
      }
    });
});

app.listen(8080);

语音格式转换

因为微信上传的语音只有mp3和aac格式的，而百度的restful的api只支持pcm和wav格式的语音，所以我们需要进行语音格式的转换，转换工具使用的是ffmpeg

首先安装ffmpeg（macOS下可以使用Homebrew）：brew install ffmpeg

然后在convert.js利用shell命令进行格式转换

// Named childProcess rather than process so Node's global `process` is not shadowed.
const childProcess = require('child_process');

/**
 * Converts an uploaded audio file to a raw PCM file by running ffmpeg.
 *
 * @param {string} fpath - Path of the input file; it is passed to ffmpeg as './' + fpath.
 * @param {string} fname - Base name for the output; the result is written to fname + '.pcm'.
 * @returns {Promise<string>} Resolves with the output path (fname + '.pcm');
 *   rejects with the error reported by child_process when ffmpeg cannot be
 *   run or exits with a failure.
 */
function convert(fpath, fname) {
  console.log('in convert');
  const outputPath = fname + '.pcm';
  // Arguments go to ffmpeg as an array (no shell), so characters in the
  // paths are never interpreted as shell syntax.
  const args = [
    '-y',
    '-i', './' + fpath,
    '-acodec', 'pcm_s16le',
    '-f', 's16le',
    '-ac', '1',
    '-ar', '16000',
    outputPath,
  ];

  return new Promise(function (resolve, reject) {
    console.log('ffmpeg ' + args.join(' '));
    childProcess.execFile('ffmpeg', args, function (error, stdout, stderr) {
      if (error !== null) {
        console.log('exec error: ' + error);
        reject(error);
        return;
      }
      resolve(outputPath);
    });
  });
}

module.exports = convert;

语音转文字

最后一步就是利用百度的sdk进行语音转文字啦，不过各位需要自己去百度的开放平台注册，然后新建应用，获取到APPID/AK/SK

const AipSpeechClient = require("baidu-aip-sdk").speech;
const fs = require('fs');

// Set APPID/AK/SK
const APP_ID = "...";
const API_KEY = "...";
const SECRET_KEY = "...";

// Create the client once and reuse it for every call to the service.
const client = new AipSpeechClient(APP_ID, API_KEY, SECRET_KEY);

/**
 * Reads a file without blocking the event loop.
 *
 * @param {string} fpath - Path of the file to read.
 * @returns {Promise<Buffer>} Resolves with the file contents.
 */
function readFileAsync(fpath) {
  return new Promise(function (resolve, reject) {
    fs.readFile(fpath, function (err, data) {
      if (err) {
        reject(err);
        return;
      }
      resolve(data);
    });
  });
}

/**
 * Sends a local PCM file to the speech recognition service.
 *
 * @param {string} fpath - Path of the PCM file to recognise.
 * @returns {Promise<Object>} Resolves with the service's result object when
 *   its err_no is 0. Rejects with that same result object when err_no is
 *   non-zero, or with the underlying error when the file cannot be read or
 *   the request fails.
 */
function voiceToText(fpath) {
  console.log('in voiceToText');
  return readFileAsync(fpath)
    .then(function (voiceBuffer) {
      // Recognise the local file; fs.readFile already yields a Buffer, so no
      // extra (deprecated) `new Buffer()` copy is needed.
      return client.recognize(voiceBuffer, 'pcm', 16000);
    })
    .then(function (result) {
      console.log('<recognize>: ' + JSON.stringify(result));
      if (result.err_no === 0) {
        return result;
      }
      // Keep rejecting with the raw result object, as callers already expect.
      return Promise.reject(result);
    }, function (err) {
      console.log(err);
      throw err;
    });
}

module.exports = voiceToText;

有一些注册的步骤，省略了，如果有疑问可以交流