unimrcp的voice activity detector

unimrcp本身提供了一个简单的VAD算法模块，这里并不讨论这个算法的好坏，只记录一下它的用法流程。

算法实现代码：libs/mpf/src/mpf_activity_detector.c

使用时，调用mpf_activity_detector_create()创建，以demo_recog_engine为例：

    /* Allocate the demo recognizer channel from the pool and initialise its fields. */
    demo_recog_channel_t *recog_channel = apr_palloc(pool,sizeof(demo_recog_channel_t));
    recog_channel->demo_engine = engine->obj;
    recog_channel->recog_request = NULL;
    recog_channel->stop_response = NULL;
    /* the channel's activity detector is created from the same pool */
    recog_channel->detector = mpf_activity_detector_create(pool);
    recog_channel->audio_out = NULL;

mpf_activity_detector_create的实现代码：

/**
 * Create an activity detector.
 *
 * The detector is allocated from @p pool and populated with the default
 * settings below; it starts in DETECTOR_STATE_INACTIVITY with no
 * accumulated duration.
 *
 * @param pool pool the detector is allocated from
 * @return the newly created detector
 */
MPF_DECLARE(mpf_activity_detector_t*) mpf_activity_detector_create(apr_pool_t *pool)
{
    mpf_activity_detector_t *vad = apr_palloc(pool,sizeof(*vad));

    /* initial state of the state machine */
    vad->state = DETECTOR_STATE_INACTIVITY;
    vad->duration = 0;

    /* default thresholds */
    vad->level_threshold = 2;    /* 0 .. 255 */
    vad->speech_timeout = 300;   /* 0.3 s */
    vad->silence_timeout = 300;  /* 0.3 s */
    vad->noinput_timeout = 5000; /* 5 s */

    return vad;
}

就是初始化detector的参数，包括能量阈值、状态转换的时间阈值。

这些值，都提供了设置的接口：

/**
 * Create activity detector.
 * @param pool pool the detector is allocated from
 * @return the created detector
 */
MPF_DECLARE(mpf_activity_detector_t*) mpf_activity_detector_create(apr_pool_t *pool);

/**
 * Reset activity detector.
 * @param detector the detector to reset
 */
MPF_DECLARE(void) mpf_activity_detector_reset(mpf_activity_detector_t *detector);

/**
 * Set threshold of voice activity (silence) level.
 * @param detector the detector to configure
 * @param level_threshold frames whose level is greater than or equal to this
 *        value are treated as activity
 */
MPF_DECLARE(void) mpf_activity_detector_level_set(mpf_activity_detector_t *detector, apr_size_t level_threshold);

/**
 * Set noinput timeout.
 * @param detector the detector to configure
 * @param noinput_timeout timeout value (the default of 5000 corresponds to 5 s)
 */
MPF_DECLARE(void) mpf_activity_detector_noinput_timeout_set(mpf_activity_detector_t *detector, apr_size_t noinput_timeout);

/**
 * Set timeout required to trigger speech (transition from inactive to active state).
 * @param detector the detector to configure
 * @param speech_timeout timeout value (the default of 300 corresponds to 0.3 s)
 */
MPF_DECLARE(void) mpf_activity_detector_speech_timeout_set(mpf_activity_detector_t *detector, apr_size_t speech_timeout);

/**
 * Set timeout required to trigger silence (transition from active to inactive state).
 * @param detector the detector to configure
 * @param silence_timeout timeout value (the default of 300 corresponds to 0.3 s)
 */
MPF_DECLARE(void) mpf_activity_detector_silence_timeout_set(mpf_activity_detector_t *detector, apr_size_t silence_timeout);

如果使用1.7里缺省的level_threshold 2，我的实测结果是收不到打断事件。

收到媒体包时，调用mpf_activity_detector_process处理，里面维护了一个状态机：

if(recog_channel->recog_request) {
    /* feed the frame to the detector and react to the event it reports */
    mpf_detector_event_e det_event = mpf_activity_detector_process(recog_channel->detector,frame);
    switch(det_event) {
        case MPF_DETECTOR_EVENT_ACTIVITY:
            apt_log(RECOG_LOG_MARK,APT_PRIO_INFO,"Detected Voice Activity " APT_SIDRES_FMT,
                MRCP_MESSAGE_SIDRES(recog_channel->recog_request));
            demo_recog_start_of_input(recog_channel);
            break;
        case MPF_DETECTOR_EVENT_INACTIVITY:
            apt_log(RECOG_LOG_MARK,APT_PRIO_INFO,"Detected Voice Inactivity " APT_SIDRES_FMT,
                MRCP_MESSAGE_SIDRES(recog_channel->recog_request));
            demo_recog_recognition_complete(recog_channel,RECOGNIZER_COMPLETION_CAUSE_SUCCESS);
            break;
        case MPF_DETECTOR_EVENT_NOINPUT:
            apt_log(RECOG_LOG_MARK,APT_PRIO_INFO,"Detected Noinput " APT_SIDRES_FMT,
                MRCP_MESSAGE_SIDRES(recog_channel->recog_request));
            /* the no-input completion is only reported once the timers have been started */
            if(recog_channel->timers_started == TRUE) {
                demo_recog_recognition_complete(recog_channel,RECOGNIZER_COMPLETION_CAUSE_NO_INPUT_TIMEOUT);
            }
            break;
        default:
            break;
    /* NOTE: the excerpt stops here; the closing braces of the switch and of the outer if are not shown. */

/** Process current frame */
MPF_DECLARE(mpf_detector_event_e) mpf_activity_detector_process(mpf_activity_detector_t *detector, const mpf_frame_t *frame)
{mpf_detector_event_e det_event = MPF_DETECTOR_EVENT_NONE;apr_size_t level = 0;if((frame->type & MEDIA_FRAME_TYPE_AUDIO) == MEDIA_FRAME_TYPE_AUDIO) {/* first, calculate current activity level of processed frame */level = mpf_activity_detector_level_calculate(frame);
#if 0apt_log(APT_LOG_MARK,APT_PRIO_INFO,"Activity Detector [%"APR_SIZE_T_FMT"]",level);
#endif}if(detector->state == DETECTOR_STATE_INACTIVITY) {if(level >= detector->level_threshold) {/* start to detect activity */mpf_activity_detector_state_change(detector,DETECTOR_STATE_ACTIVITY_TRANSITION);}else {detector->duration += CODEC_FRAME_TIME_BASE;if(detector->duration >= detector->noinput_timeout) {/* detected noinput */det_event = MPF_DETECTOR_EVENT_NOINPUT;}}}else if(detector->state == DETECTOR_STATE_ACTIVITY_TRANSITION) {if(level >= detector->level_threshold) {detector->duration += CODEC_FRAME_TIME_BASE;if(detector->duration >= detector->speech_timeout) {/* finally detected activity */det_event = MPF_DETECTOR_EVENT_ACTIVITY;mpf_activity_detector_state_change(detector,DETECTOR_STATE_ACTIVITY);}}else {/* fallback to inactivity */mpf_activity_detector_state_change(detector,DETECTOR_STATE_INACTIVITY);}}else if(detector->state == DETECTOR_STATE_ACTIVITY) {if(level >= detector->level_threshold) {detector->duration += CODEC_FRAME_TIME_BASE;}else {/* start to detect inactivity */mpf_activity_detector_state_change(detector,DETECTOR_STATE_INACTIVITY_TRANSITION);}}else if(detector->state == DETECTOR_STATE_INACTIVITY_TRANSITION) {if(level >= detector->level_threshold) {/* fallback to activity */mpf_activity_detector_state_change(detector,DETECTOR_STATE_ACTIVITY);}else {detector->duration += CODEC_FRAME_TIME_BASE;if(detector->duration >= detector->silence_timeout) {/* detected inactivity */det_event = MPF_DETECTOR_EVENT_INACTIVITY;mpf_activity_detector_state_change(detector,DETECTOR_STATE_INACTIVITY);}}}return det_event;
}

具体算法调用就是mpf_activity_detector_level_calculate()

/**
 * Calculate the activity level of a frame.
 *
 * Reads the codec frame buffer as 16-bit signed samples (size / 2 of them)
 * and returns the mean of their absolute values, using integer division.
 *
 * @param frame frame whose codec_frame.buffer and codec_frame.size are read
 * @return mean absolute sample value, or 0 when the frame does not contain
 *         a single complete 16-bit sample
 */
static apr_size_t mpf_activity_detector_level_calculate(const mpf_frame_t *frame)
{
    apr_size_t sum = 0;
    apr_size_t count = frame->codec_frame.size/2;
    const apr_int16_t *cur = frame->codec_frame.buffer;
    const apr_int16_t *end = cur + count;

    /* a frame shorter than one sample would make the division below divide by zero */
    if(count == 0) {
        return 0;
    }

    for(; cur < end; cur++) {
        /* accumulate |sample|: subtracting a negative value adds its magnitude */
        if(*cur < 0) {
            sum -= *cur;
        }
        else {
            sum += *cur;
        }
    }

    return sum / count;
}

对于这个算法，不需要太较真：把一帧内每个16位采样取绝对值后累加，再求平均值；如果平均值不小于阈值（level >= level_threshold），表示有声音，否则表示静音。并没有噪音检测。如果生产需要，肯定是需要修改的。

unimrcp的voice activity detector相关推荐

建建自学VoIP之VAD(Voice Activity Detector)和CNG(Comfort Noise Generator)
语音活动检测(Voice Activity Detection,VAD)又称语音端点检测.语音边界检测.目的是从声音信号流里识别和消除长时间的静音期,以达到在不降低业务质量的情况下节省网络资源的作用, ...
语音端点检测（Voice Activity Detection,VAD）
1.VAD的总体步骤:https://www.bbsmax.com/A/1O5EOo73z7/ 2.基于短时能量和过零率的简单实现(实际上精确度高的VAD会提取4种或更多的特征进行判断,这里只介绍两种 ...
unimrcp 实现阿里云的plugin
1. 环境说明 unimrcp版本V1.7.0 阿里SDK版本:V3.X 操作系统:CentOS7 GCC版本:V4.8.5,注意,使用高版本的GCC可能会有SDK编译兼容问题 2. 修改config ...
activity 生命周期_Activity 源码解析
Android 应用程序启动过程 Activity启动过程可以分为两种:一种是根activity的启动过程,另一种是普通activity启动过程.根activity指的是应用程序启动的第一个activ ...
有趣的Github项目万里挑一 !（附论文、项目链接）
来源:PaperWeekly 本文共1000字,建议阅读5分钟. 本文为你介绍9个最新机器学习开源项目. 本文带你快速 get 每个精选Github项目的亮点和痛点,时刻紧跟 AI 前沿成果. 01 ...
WebRTC详解-zz
1.WebRTC目的 WebRTC(Web Real-Time Communication)项目的最终目的主要是让Web开发者能够基于浏览器(Chrome\FireFox\...) 轻易快捷开发出丰富 ...
深度学习核心技术精讲100篇（二十七）-如何利用NLP技术对ASR的query文本进行预处理纠错？
前言语音系统中语音内容识别 ( ASR ) 的精准性,是影响智能语音产品发展的关键制约因素,用户query的文本,通常是由ASR系统将用户的语音命令转换而成,但由于技术上的原因,这些由ASR生成的文 ...
好看的论文千篇一律，有趣的Github项目万里挑一！
在碎片化阅读充斥眼球的时代,越来越少的人会去关注每篇论文背后的探索和思考. 在这个栏目里,你会快速 get 每篇精选论文的亮点和痛点,时刻紧跟 AI 前沿成果. 点击本文底部的「阅读原文」即刻加入社区 ...
lms自适应滤波器实现噪声干扰的语音恢复_ZLG深度解析语音识别技术
语音识别已成为人与机器通过自然语言交互重要方式之一,本文将从语音识别的原理以及语音识别算法的角度出发为大家介绍语音识别的方案及详细设计过程. 语言作为人类的一种基本交流方式,在数千年历史中得到持续传承 ...

unimrcp的voice activity detector

unimrcp的voice activity detector相关推荐

最新文章

热门文章