linux下lamealsa进行音频流操作（八）用ffmpeg将mp3转为wav

1. 利用ffmpeg将mp3解码为pcm，再在pcm数据前面加上wav头，就得到一个完整的wav文件

2. 代码

#include "utils.h"
#include <libavutil/avutil.h>
#include <libavutil/attributes.h>
#include <libavutil/opt.h>
#include <libavutil/mathematics.h>
#include <libavutil/imgutils.h>
#include <libavutil/samplefmt.h>
#include <libavutil/timestamp.h>
#include <libavformat/avformat.h>
#include <libavcodec/avcodec.h>
#include <libswscale/swscale.h>
#include <libavutil/mathematics.h>
#include <libswresample/swresample.h>
#include <libavutil/channel_layout.h>
#include <libavutil/common.h>
#include <libavformat/avio.h>
#include <libavutil/file.h>
/* Size of the scratch buffer unit used for decoded/resampled audio. */
#define AVCODEC_MAX_AUDIO_FRAME_SIZE 192000

/*
 * The three structs below describe the pieces of a canonical PCM WAV header.
 * Fixed-width types are used because these structs are written to disk
 * verbatim, so every field must have an exact size.
 */

/* RIFF container header: the first 12 bytes of the file. */
typedef struct {
    uint32_t magic;      /* 'RIFF' */
    uint32_t length;     /* filelen */
    uint32_t type;       /* 'WAVE' */
} WaveHeader;

/* Payload of the "fmt " chunk (16 bytes for plain PCM). */
typedef struct {
    uint16_t format;       /* see WAV_FMT_* */
    uint16_t channels;
    uint32_t sample_fq;    /* frequence of sample */
    uint32_t byte_p_sec;
    uint16_t byte_p_spl;   /* samplesize; 1 or 2 bytes */
    uint16_t bit_p_spl;    /* 8, 12 or 16 bit */
} WaveFmtBody;

/* Generic chunk header: a four-character tag followed by the payload size. */
typedef struct {
    uint32_t type;        /* 'data' */
    uint32_t length;      /* samplecount */
} WaveChunkHeader;

/*
 * Pack four characters into a 32-bit tag with `a` in the lowest byte, so that
 * on a little-endian host the bytes reach the file in the order a, b, c, d.
 * The arithmetic is done in uint32_t to keep the shifts out of signed int.
 */
#define COMPOSE_ID(a,b,c,d) ((uint32_t)(a) | ((uint32_t)(b)<<8) | ((uint32_t)(c)<<16) | ((uint32_t)(d)<<24))
#define WAV_RIFF COMPOSE_ID('R','I','F','F')
#define WAV_WAVE COMPOSE_ID('W','A','V','E')
#define WAV_FMT COMPOSE_ID('f','m','t',' ')
#define WAV_DATA COMPOSE_ID('d','a','t','a')
int insert_wave_header(FILE* fp, long data_len)
{int len;WaveHeader* header;WaveChunkHeader* chunk;WaveFmtBody* body;fseek(fp, 0, SEEK_SET);        //写到wav文件的开始处len = sizeof(WaveHeader)+sizeof(WaveFmtBody)+sizeof(WaveChunkHeader)*2;char* buf = (char*)malloc(len);header = (WaveHeader*)buf;header->magic = WAV_RIFF;header->length = data_len + sizeof(WaveFmtBody)+sizeof(WaveChunkHeader)*2 + 4;header->type = WAV_WAVE;chunk = buf+sizeof(WaveHeader);chunk->type = WAV_FMT;chunk->length = 16;body = buf+sizeof(WaveHeader)+sizeof(WaveChunkHeader);body->format = (u_short)0x0001;      //编码方式为pcmbody->channels = (u_short)0x02;      //声道数为2body->sample_fq = 44100;             //采样频率为44.1kbody->byte_p_sec = 176400;           //每秒所需字节数 44100*2*2=采样频率*声道*采样位数body->byte_p_spl = (u_short)0x4;     //对齐无意义body->bit_p_spl = (u_short)16;       //采样位数16bit=2Bytechunk = buf+sizeof(WaveHeader)+sizeof(WaveChunkHeader)+sizeof(WaveFmtBody);chunk->type = WAV_DATA;chunk->length = data_len;fwrite(buf, 1, len, fp);free(buf);return 0;
}typedef struct {int videoindex;int sndindex;AVFormatContext* pFormatCtx;AVCodecContext* sndCodecCtx;AVCodec* sndCodec;SwrContext *swr_ctx;DECLARE_ALIGNED(16,uint8_t,audio_buf) [AVCODEC_MAX_AUDIO_FRAME_SIZE * 4];
}AudioState;int init_ffmpeg(AudioState* is, char* filepath)
{int i=0;int ret;is->sndindex = -1;if(NULL == filepath){dbmsg("input file is NULL");return -1;}avcodec_register_all();avfilter_register_all();av_register_all();is->pFormatCtx = avformat_alloc_context();if(avformat_open_input(&is->pFormatCtx, filepath, NULL, NULL)!=0)return -1;if(avformat_find_stream_info(is->pFormatCtx, NULL)<0)return -1;av_dump_format(is->pFormatCtx,0, 0, 0);is->videoindex = av_find_best_stream(is->pFormatCtx, AVMEDIA_TYPE_VIDEO, is->videoindex, -1, NULL, 0);is->sndindex = av_find_best_stream(is->pFormatCtx, AVMEDIA_TYPE_AUDIO,is->sndindex, is->videoindex, NULL, 0);dbmsg("videoindex=%d, sndindex=%d", is->videoindex, is->sndindex);if(is->sndindex != -1){is->sndCodecCtx = is->pFormatCtx->streams[is->sndindex]->codec;is->sndCodec = avcodec_find_decoder(is->sndCodecCtx->codec_id);if(is->sndCodec == NULL){dbmsg("Codec not found");return -1;}if(avcodec_open2(is->sndCodecCtx, is->sndCodec, NULL) < 0)return -1;}return 0;
}int main(int argc, char **argv)
{int ret;FILE* fp;int file_data_size = 0;                //这儿注意一个问题: 变量用时一定要初始化,否则会出现异常int len1,len2, data_size, got_frame;AVPacket *packet = av_mallocz(sizeof(AVPacket));AVFrame *frame = av_frame_alloc();AudioState* is = (AudioState*) av_mallocz(sizeof(AudioState));uint8_t *out[] = { is->audio_buf };fp = fopen("./test.wav", "wb+");len1 = sizeof(WaveHeader)+sizeof(WaveFmtBody)+sizeof(WaveChunkHeader)*2;fseek(fp,len1, SEEK_SET);      //在写之前先预留出wav的header,即44个字节dbmsg("len1=%d",len1);//第1步初始化ffmpeg,并用ffmpeg解码,最后转为pcm格式if( (ret=init_ffmpeg(is, argv[1])) != 0)            //1.1 初始化ffmpeg{dbmsg("init_ffmpeg error");return -1;}while( (av_read_frame(is->pFormatCtx, packet)>=0) )    //1.2 循环读取mp3文件中的数据帧{if(packet->stream_index != is->sndindex)continue;if((ret=avcodec_decode_audio4(is->sndCodecCtx, frame, &got_frame, packet)) < 0) //1.3 解码数据帧{dbmsg("file eof");break;}if(got_frame <= 0) /* No data yet, get more frames */continue;data_size = av_samples_get_buffer_size(NULL, is->sndCodecCtx->channels, frame->nb_samples, is->sndCodecCtx->sample_fmt, 1);//1.4下面将ffmpeg解码后的数据帧转为我们需要的数据(关于"需要的数据"下面有解释)if(NULL==is->swr_ctx){if(is->swr_ctx != NULL)swr_free(&is->swr_ctx);dbmsg("frame: channnels=%d,format=%d, sample_rate=%d", frame->channels, frame->format, frame->sample_rate);is->swr_ctx = swr_alloc_set_opts(NULL, AV_CH_LAYOUT_STEREO, AV_SAMPLE_FMT_S16, 44100, av_get_default_channel_layout(frame->channels), frame->format, frame->sample_rate, 0, NULL);if(is->swr_ctx == NULL){dbmsg("swr_ctx == NULL");}swr_init(is->swr_ctx);}len2 = swr_convert(is->swr_ctx, out, 44100,(const uint8_t **)frame->extended_data, frame->nb_samples);file_data_size += len2;//1.5 数据格式转换完成后就写到文件中fwrite((short *)is->audio_buf, sizeof(short), (size_t) len2* 2, fp);}file_data_size *= 4;dbmsg("file_data_size=%d", file_data_size);//第2步添加上wav的头ret = insert_wave_header(fp, file_data_size);av_free_packet(packet);av_free(frame);avcodec_close(is->sndCodecCtx);avformat_close_input(&is->pFormatCtx);fclose(fp);return 0;
}

3.运行结果

cong@msi:/work/ffmpeg/test/alsa/testalsa/5mp3towav$ make run
export LD_LIBRARY_PATH=/work/ffmpeg/out/lib/ \
&& ./mp3towav /work/ffmpeg/test/resource//test.mp3
mp3towav.c:main[150]: len1=44
[mp3 @ 0x14d3620] Skipping 0 bytes of junk at 197687.
libavutil/crc.c:av_crc_init[313]:
[mp3 @ 0x14d3620] Estimating duration from bitrate, this may be inaccurate
Input #0, mp3, from '(null)':
  Metadata:
    artist : 佚名
    title : 法国国歌 马赛曲
    TYER : 2013-10-26
  Duration: 00:03:28.20, start: 0.000000, bitrate: 199 kb/s
    Stream #0:0: Audio: mp3, 44100 Hz, stereo, s16p, 192 kb/s
    Stream #0:1: Video: mjpeg, yuvj420p(pc, bt470bg/unknown/unknown), 600x600 [SAR 1:1 DAR 1:1], 90k tbr, 90k tbn, 90k tbc
    Metadata:
      title : e
      comment : Cover (front)
mp3towav.c:init_ffmpeg[120]: videoindex=-1381258232, sndindex=0
mp3towav.c:main[173]: frame: channnels=2,format=6, sample_rate=44100
mp3towav.c:main[186]: file_data_size=36725760

ls查看

cong@msi:/work/ffmpeg/test/alsa/testalsa/5mp3towav$ ls -l
total 36064
-rw-rw-r-- 1 cong cong 885 Sep 11 11:25 Makefile
-rwxrwxr-x 1 cong cong 64126 Sep 11 11:44 mp3towav
-rw-rw-r-- 1 cong cong 6183 Sep 11 11:24 mp3towav.c
-rw-rw-r-- 1 cong cong 115344 Sep 11 11:44 mp3towav.o
-rw-rw-r-- 1 cong cong 36725804 Sep 11 11:44 test.wav
-rw-rw-r-- 1 cong cong 333 Sep 9 11:31 utils.h

4. 说明

mp3towav.c:main[173]: AV_CH_LAYOUT_STEREO=3, AV_SAMPLE_FMT_S16=1, freq=44100
mp3towav.c:main[174]: frame: channnels=2, default_layout=3, format=6, sample_rate=44100

ffmpeg中:include/libavutil/samplefmt.h
enum AVSampleFormat {
    AV_SAMPLE_FMT_NONE = -1,
    AV_SAMPLE_FMT_U8,   ///< unsigned 8 bits
    AV_SAMPLE_FMT_S16,  ///< signed 16 bits          --> 1: the PCM data format
    AV_SAMPLE_FMT_S32,  ///< signed 32 bits
    AV_SAMPLE_FMT_FLT,  ///< float
    AV_SAMPLE_FMT_DBL,  ///< double
    AV_SAMPLE_FMT_U8P,  ///< unsigned 8 bits, planar
    AV_SAMPLE_FMT_S16P, ///< signed 16 bits, planar  --> 6: the data format after ffmpeg decoding
    AV_SAMPLE_FMT_S32P, ///< signed 32 bits, planar
    AV_SAMPLE_FMT_FLTP, ///< float, planar
    AV_SAMPLE_FMT_DBLP, ///< double, planar
    AV_SAMPLE_FMT_NB    ///< Number of sample formats. DO NOT USE if linking dynamically
};

interleaved -->理解为交叉存取 --> AV_SAMPLE_FMT_S16是两个声道的声音是交叉存储的
planar --> 理解为平面存取 --> AV_SAMPLE_FMT_S16P是先存1个声道的数据再存另一个声道的数据

AV_SAMPLE_FMT_S16P is planar signed 16 bit audio, i.e. 2 bytes for each sample which is same for AV_SAMPLE_FMT_S16.

The only difference is in AV_SAMPLE_FMT_S16 samples of each channel are interleaved i.e. if you have two channel audio then the samples buffer will look like

c1 c1 c2 c2 c1 c1 c2 c2… -->AV_SAMPLE_FMT_S16的数据组织方式

where c1 is a sample for channel1 and c2 is sample for channel2.

while for one frame of planar audio you will have something like

c1 c1 c1 c1 … c2 c2 c2 c2 … -->AV_SAMPLE_FMT_S16P的数据组织方式

now how is it stored in AVFrame:

for planar audio:

data[i] will contain the data of channel i (assuming channel 0 is first channel).

However, if you have more than 8 channels, the data for the remaining channels can be found in the extended_data attribute of AVFrame.

for non-planar audio

data[0] will contain the data for all channels in an interleaved manner.

参考文章:
What is the difference between AV_SAMPLE_FMT_S16P and AV_SAMPLE_FMT_S16?