RGB24 To Yuv420 C语言 +汇编实现(windows平台)

以下代码来自libyuv


#include <stdint.h>
#include <stdlib.h>
#include <string.h>#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1)))#define align_buffer_64(var, size)                                           \uint8_t* var##_mem = (uint8_t*)(malloc((size)+63));         /* NOLINT */ \uint8_t* var = (uint8_t*)(((intptr_t)(var##_mem) + 63) & ~63) /* NOLINT */#define free_aligned_buffer_64(var)                                            \free(var##_mem);                                                       \var = 0#define SIMD_ALIGNED(var) __declspec(align(16)) var#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))typedef __declspec(align(16)) uint8_t uvec8[16];static const uvec8 kShuffleMaskRGB24ToARGB = {0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u };typedef __declspec(align(16)) int8_t vec8[16];static const vec8 kARGBToY = { 13, 65, 33, 0, 13, 65, 33, 0,
13, 65, 33, 0, 13, 65, 33, 0 };static const uvec8 kAddY16 = { 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u };typedef __declspec(align(32)) uint8_t ulvec8[32];#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK)                      \void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, \uint8_t* dst_v, int width) {                                                \SIMD_ALIGNED(uint8_t temp[128 * 4]);                                       \memset(temp, 0, 128 * 2); /* for msan */                                   \int r = width & MASK;                                                     \int n = width & ~MASK;                                                        \
if (n > 0) {                                                                 \ANY_SIMD(src_ptr, src_stride, dst_u, dst_v, n);                                \
}                                                                               \memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP);      \memcpy(temp + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP, \SS(r, UVSHIFT) * BPP);                                            \
if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \BPP);                                                           \memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP);                  \
}                                                                        \ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1);                   \memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1));                          \memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1));                          \}#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)                  \void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) {        \SIMD_ALIGNED(uint8_t temp[128 * 2]);                                   \memset(temp, 0, 128); /* for YUY2 and msan */                          \int r = width & MASK;                                                 \int n = width & ~MASK;                                                    \
if (n > 0) {                                                             \ANY_SIMD(src_ptr, dst_ptr, n);                                      \
}                                                                     \memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ANY_SIMD(temp, temp + 128, MASK + 1);                                 \memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                       \}#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)                \void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) {      \SIMD_ALIGNED(uint8_t temp[128 * 2]);                                  \memset(temp, 0, 128); /* for YUY2 and msan */                         \int r = width & MASK;                                                 \int n = width & ~MASK;                                                \
if (n > 0) {                                                         \ANY_SIMD(src_ptr, dst_ptr, n);                                      \
}                                                                     \memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ANY_SIMD(temp, temp + 128, MASK + 1);                                 \memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                       \}static const ulvec8 kBiasUV128 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };__declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_argb, int width)
{__asm {mov       eax, [esp + 4]  // src_rgb24mov       edx, [esp + 8]  // dst_argbmov       ecx, [esp + 12]  // widthpcmpeqb   xmm5, xmm5  // generate mask 0xff000000pslld     xmm5, 24movdqa    xmm4, xmmword ptr kShuffleMaskRGB24ToARGBconvertloop :movdqu    xmm0, [eax]movdqu    xmm1, [eax + 16]movdqu    xmm3, [eax + 32]lea       eax, [eax + 48]movdqa    xmm2, xmm3palignr   xmm2, xmm1, 8  // xmm2 = { xmm3[0:3] xmm1[8:15]}pshufb    xmm2, xmm4por       xmm2, xmm5palignr   xmm1, xmm0, 12  // xmm1 = { xmm3[0:7] xmm0[12:15]}pshufb    xmm0, xmm4movdqu[edx + 32], xmm2por       xmm0, xmm5pshufb    xmm1, xmm4movdqu[edx], xmm0por       xmm1, xmm5palignr   xmm3, xmm3, 4  // xmm3 = { xmm3[4:15]}pshufb    xmm3, xmm4movdqu[edx + 16], xmm1por       xmm3, xmm5movdqu[edx + 48], xmm3lea       edx, [edx + 64]sub       ecx, 16jg        convertloopret}
}static const vec8 kARGBToU = { 112, -74, -38, 0, 112, -74, -38, 0,
112, -74, -38, 0, 112, -74, -38, 0 };static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb,int src_stride_argb,uint8_t* dst_u,uint8_t* dst_v,int width) {__asm {push       esipush       edimov        eax, [esp + 8 + 4]  // src_argbmov        esi, [esp + 8 + 8]  // src_stride_argbmov        edx, [esp + 8 + 12]  // dst_umov        edi, [esp + 8 + 16]  // dst_vmov        ecx, [esp + 8 + 20]  // widthmovdqa     xmm5, xmmword ptr kBiasUV128movdqa     xmm6, xmmword ptr kARGBToVmovdqa     xmm7, xmmword ptr kARGBToUsub        edi, edx  // stride from u to vconvertloop :/* step 1 - subsample 16x2 argb pixels to 8x1 */movdqu     xmm0, [eax]movdqu     xmm4, [eax + esi]pavgb      xmm0, xmm4movdqu     xmm1, [eax + 16]movdqu     xmm4, [eax + esi + 16]pavgb      xmm1, xmm4movdqu     xmm2, [eax + 32]movdqu     xmm4, [eax + esi + 32]pavgb      xmm2, xmm4movdqu     xmm3, [eax + 48]movdqu     xmm4, [eax + esi + 48]pavgb      xmm3, xmm4lea        eax, [eax + 64]movdqa     xmm4, xmm0shufps     xmm0, xmm1, 0x88shufps     xmm4, xmm1, 0xddpavgb      xmm0, xmm4movdqa     xmm4, xmm2shufps     xmm2, xmm3, 0x88shufps     xmm4, xmm3, 0xddpavgb      xmm2, xmm4// step 2 - convert to U and V// from here down is very similar to Y code except// instead of 16 different pixels, its 8 pixels of U and 8 of Vmovdqa     xmm1, xmm0movdqa     xmm3, xmm2pmaddubsw  xmm0, xmm7  // Upmaddubsw  xmm2, xmm7pmaddubsw  xmm1, xmm6  // Vpmaddubsw  xmm3, xmm6phaddw     xmm0, xmm2phaddw     xmm1, xmm3psraw      xmm0, 8psraw      xmm1, 8packsswb   xmm0, xmm1paddb      xmm0, xmm5  // -> unsigned// step 3 - store 8 U and 8 V valuesmovlps     qword ptr[edx], xmm0  // Umovhps     qword ptr[edx + edi], xmm0  // Vlea        edx, [edx + 8]sub        ecx, 16jg         convertlooppop        edipop        esiret}
}__declspec(naked) void ARGBToYRow_SSSE3(const uint8_t* src_argb,uint8_t* dst_y,int width) {__asm {mov        eax, [esp + 4] /* src_argb */mov        edx, [esp + 8] /* dst_y */mov        ecx, [esp + 12] /* width */movdqa     xmm4, xmmword ptr kARGBToYmovdqa     xmm5, xmmword ptr kAddY16convertloop :movdqu     xmm0, [eax]movdqu     xmm1, [eax + 16]movdqu     xmm2, [eax + 32]movdqu     xmm3, [eax + 48]pmaddubsw  xmm0, xmm4pmaddubsw  xmm1, xmm4pmaddubsw  xmm2, xmm4pmaddubsw  xmm3, xmm4lea        eax, [eax + 64]phaddw     xmm0, xmm1phaddw     xmm2, xmm3psrlw      xmm0, 7psrlw      xmm2, 7packuswb   xmm0, xmm2paddb      xmm0, xmm5movdqu[edx], xmm0lea        edx, [edx + 16]sub        ecx, 16jg         convertloopret}
}ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15)
ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15)
ANY11(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, 0, 4, 1, 15)int RGB24ToI420(const uint8_t* src_rgb24,
int src_stride_rgb24,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height)
{int y;void(*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width);void(*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width);void(*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width);if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0){return -1;}if (height < 0){height = -height;src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;src_stride_rgb24 = -src_stride_rgb24;}RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;//宽度不是16字节的倍数if (IS_ALIGNED(width, 16)){RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;}ARGBToUVRow = ARGBToUVRow_Any_SSSE3;ARGBToYRow = ARGBToYRow_Any_SSSE3;if (IS_ALIGNED(width, 16)) {ARGBToUVRow = ARGBToUVRow_SSSE3;ARGBToYRow = ARGBToYRow_SSSE3;}const int kRowSize = (width * 4 + 31) & ~31;align_buffer_64(row, kRowSize * 2);for (y = 0; y < height - 1; y += 2) {RGB24ToARGBRow(src_rgb24, row, width);RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);ARGBToYRow(row, dst_y, width);ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);src_rgb24 += src_stride_rgb24 * 2;dst_y += dst_stride_y * 2;dst_u += dst_stride_u;dst_v += dst_stride_v;}if (height & 1){RGB24ToARGBRow(src_rgb24, row, width);ARGBToUVRow(row, 0, dst_u, dst_v, width);ARGBToYRow(row, dst_y, width);}free_aligned_buffer_64(row);return 0;
}

RGB24 To Yuv420 C语言 +汇编实现(windows平台)相关推荐

RGB24 To Yuv420 C语言实现
RGB24 To Yuv420 C语言实现(非SIMD版本) 以下代码来自 libyuv #include <stdint.h>//下面三个函数为RGB-->>Yuv420的公 ...
Win32汇编学习——windows汇编语法（小甲鱼教程）
Win32汇编学习--windows汇编语法(小甲鱼教程) 1)指令集 .386 语句是汇编语句的伪指令,类似指令有:.8086 . .186 ..286 ..386/.386p . .486/ ...
Go语言环境搭建(Windows+Linux)
目录 1. Windows安装配置 2. Linux安装配置 1. Windows安装配置 1️⃣ 下载SDK SDK 的全称是Software Development Kit(软件开发工具包) ,包 ...
Windows平台搭建-----C语言
上期我们已经进行Linux的平台搭建,今期我们就来搭建下我们最常用的.最适合初学者的一种方式,那就是搭建Windows平台开发环境,只需要两种工具即可,一个就是编辑器(编辑代码的工具),另一个就是编译 ...
在Windows平台如何选择C语言编译器?
该博文为原创文章,未经博主同意不得转载,如同意转载请注明博文出处本文章博客地址:https://cplusplus.blog.csdn.net/article/details/104995539 选 ...
c语言迷宫求解毕业设计,毕业设计（论文）-基于Windows平台C语言实现迷宫游戏的设计.doc...
基于Windows平台C语言实现迷宫游戏的设计摘要随着科技的日益发展,计算机信息知识越来越被人们所认知和使用,在当今知识爆炸的时代计算机毫无疑问成为人们常用的日常工具,而Windows和C语言都 ...
大连东软c语言编程题,大连东软信息学院C语言实验一 windows基本操作和turboc使用答案.doc...
大连东软信息学院C语言实验一 windows基本操作和turboc使用答案实验一 Windows基本操作及Turbo C2.0编译环境一.实验目的 1.熟悉Windows操作系统的基本操作,如文件 ...
【转】此版本之魔兽争霸3需要特定语言版本之windows 解决办法
英文版的Windows 7/ Vista / XP / 2003系统下玩中文版魔兽争霸/冰封王座的方法运行程序后,弹出一个对话框:"此版本之Warcraft3需要特定语言版本之window ...
此版本之魔兽争霸3需要特定语言版本之windows 解决办法
Ulrtaedit编辑法方法1 引用原文如下: 英文版的Windows 7/ Vista / XP / 2003系统下玩中文版魔兽争霸/冰封王座的方法运行程序后,弹出一个对话框:"此版本 ...

RGB24 To Yuv420 C语言 +汇编实现(windows平台)

RGB24 To Yuv420 C语言 +汇编实现(windows平台)相关推荐

最新文章

热门文章