MMX Intrinsics各函数介绍
SIMD相关头文件包括:
//#include <ivec.h>//MMX
//#include <fvec.h>//SSE(also include ivec.h)
//#include <dvec.h>//SSE2(also include fvec.h)
#include <mmintrin.h> //MMX
#include <xmmintrin.h> //SSE(include mmintrin.h)
#include <emmintrin.h> //SSE2(include xmmintrin.h)
#include <pmmintrin.h> //SSE3(include emmintrin.h)
#include <tmmintrin.h>//SSSE3(include pmmintrin.h)
#include <smmintrin.h>//SSE4.1(include tmmintrin.h)
#include <nmmintrin.h>//SSE4.2(include smmintrin.h)
#include <wmmintrin.h>//AES(include nmmintrin.h)
#include <immintrin.h>//AVX(include wmmintrin.h)
#include <intrin.h>//(include immintrin.h)
mmintrin.h为MMX头文件,其中__m64的定义为:
/*
 * __m64: the 64-bit operand type used by the MMX intrinsics.
 *
 * Declared as a union, so every member below is an alternative view of the
 * same storage:
 *   m64_u64 / m64_i64   one unsigned / signed 64-bit integer
 *   m64_f32[2]          two floats
 *   m64_i8[8]  / m64_u8[8]    eight signed / unsigned 8-bit integers
 *   m64_i16[4] / m64_u16[4]   four signed / unsigned 16-bit integers
 *   m64_i32[2] / m64_u32[2]   two signed / unsigned 32-bit integers
 *
 * __declspec(intrin_type), _CRT_ALIGN and __int8/__int16/__int32/__int64 are
 * Microsoft compiler extensions.
 * NOTE(review): _CRT_ALIGN(8) presumably requests 8-byte alignment - confirm
 * against the CRT headers.
 */
typedef union __declspec(intrin_type) _CRT_ALIGN(8) __m64
{unsigned __int64 m64_u64;float m64_f32[2];__int8 m64_i8[8];__int16 m64_i16[4];__int32 m64_i32[2]; __int64 m64_i64;unsigned __int8 m64_u8[8];unsigned __int16 m64_u16[4];unsigned __int32 m64_u32[2];
} __m64;
mmintrin.h 文件中各函数的介绍
/* General support intrinsics *///Empties the multimedia state,清除MMX寄存器中的内容,即初始化(以避免和浮点数//操作发生冲突),详细说明见参考文献1void _m_empty(void);//_mm_empty//Converts the integer object _I to a 64-bit __m64 object, r0=_I, r1=0__m64 _m_from_int(int _I);//_mm_cvtsi32_si64//Converts the lower 32 bits of the __m64 object _M to an integer, r=_M0int _m_to_int(__m64 _M);//_mm_cvtsi64_si32//Packs the four 16-bit values from _MM1 into the lower four 8-bit values of//the result with signed saturation, and packs the four 16-bit values from _MM2//into the upper four 8-bit values of the result with signed saturation__m64 _m_packsswb(__m64 _MM1, __m64 _MM2);//_mm_packs_pi16//Packs the two 32-bit values from _MM1 into the lower two 16-bit values of the// result with signed saturation, and packs the two 32-bit values from _MM2 into// the upper two 16-bit values of the result with signed saturation__m64 _m_packssdw(__m64 _MM1, __m64 _MM2);//_mm_packs_pi32//Packs the four 16-bit values from _MM1 into the lower four 8-bit values of the//result with unsigned saturation, and packs the four 16-bit values from _MM2 into//the upper four 8-bit values of the result with unsigned saturation__m64 _m_packuswb(__m64 _MM1, __m64 _MM2);//_mm_packs_pu16//_MM1=(_MM10, _MM11, _MM12, _MM13, _MM14, _MM15, _MM16, _MM17),//_MM2=(_MM20, _MM21, _MM22, _MM23, _MM24, _MM25, _MM26, _MM27),//则r=(_MM14, _MM24, _MM15, _MM25, _MM16, _MM26, _MM17, _MM27)__m64 _m_punpckhbw(__m64 _MM1, __m64 _MM2);//_mm_unpackhi_pi8 //_MM1=(_MM10, _MM11, _MM12, _MM13),_MM10为低位,_MM2=(_MM20, _MM21, _MM22, _MM23),//则r=(_MM12, _MM22, _MM13, _MM23)__m64 _m_punpckhwd(__m64 _MM1, __m64 _MM2);//_mm_unpackhi_pi16//MM1=(_MM10, _MM11),_MM10为低位,_MM2=(_MM20, _MM21),则r=(_MM11, _MM21)__m64 _m_punpckhdq(__m64 _MM1, __m64 _MM2);//_mm_unpackhi_pi32//_MM1=(_MM10, _MM11, _MM12, _MM13, _MM14, _MM15, _MM16, _MM17),//_MM2=(_MM20, _MM21, _MM22, _MM23, _MM24, _MM25, _MM26, _MM27),//则r=(_MM10, _MM20, _MM11, _MM21, _MM12, _MM22, _MM13, _MM23)__m64 _m_punpcklbw(__m64 _MM1, 
__m64 _MM2);//_mm_unpacklo_pi8//_MM1=(_MM10, _MM11, _MM12, _MM13),_MM10为低位,_MM2=(_MM20, _MM21, _MM22, _MM23),//则r=(_MM10, _MM20, _MM11, _MM21)__m64 _m_punpcklwd(__m64 _MM1, __m64 _MM2);//_mm_unpacklo_pi16//MM1=(_MM10, _MM11),_MM10为低位,_MM2=(_MM20, _MM21),则r=(_MM10, _MM20)__m64 _m_punpckldq(__m64 _MM1, __m64 _MM2);//mm_unpacklo_pi32/* Packed arithmetic intrinsics *///Adds the eight 8-bit values in _MM1 to the eight 8-bit values in _MM2__m64 _m_paddb(__m64 _MM1, __m64 _MM2);//_mm_add_pi8//Adds the four 16-bit values in _MM1 to the four 16-bit values in _MM2__m64 _m_paddw(__m64 _MM1, __m64 _MM2);//_mm_add_pi16//Adds the two 32-bit values in _MM1 to the two 32-bit values in _MM2__m64 _m_paddd(__m64 _MM1, __m64 _MM2);//_mm_add_pi32//Adds the eight signed 8-bit values in _MM1 to the eight signed 8-bit values in _MM2//and saturates__m64 _m_paddsb(__m64 _MM1, __m64 _MM2);//_mm_adds_pi8//Adds the four signed 16-bit values in _MM1 to the four signed 16-bit values in _MM2//and saturates__m64 _m_paddsw(__m64 _MM1, __m64 _MM2);//_mm_adds_pi16//Adds the eight unsigned 8-bit values in _MM1 to the eight unsigned 8-bit values //in _MM2 and saturates__m64 _m_paddusb(__m64 _MM1, __m64 _MM2);//_mm_adds_pu8//Add the four unsigned 16-bit values in _MM1 to the four unsigned 16-bit values //in _MM2 and saturates__m64 _m_paddusw(__m64 _MM1, __m64 _MM2);//_mm_adds_pu16//Subtracts the eight 8-bit values in _MM2 from the eight 8-bit values in _MM1__m64 _m_psubb(__m64 _MM1, __m64 _MM2);//_mm_sub_pi8 //Subtracts the four 16-bit values in _MM2 from the four 16-bit values in _MM1__m64 _m_psubw(__m64 _MM1, __m64 _MM2);//_mm_sub_pi16//Subtracts the two 32-bit values in _MM2 from the two 32-bit values in _MM1__m64 _m_psubd(__m64 _MM1, __m64 _MM2);//_mm_sub_pi32//Subtracts the eight signed 8-bit values in _MM2 from the eight signed 8-bit//values in _MM1 and saturates__m64 _m_psubsb(__m64 _MM1, __m64 _MM2);//_mm_subs_pi8//Subtracts the four signed 16-bit values in _MM2 from the four signed 
16-bit//values in _MM1 and saturates__m64 _m_psubsw(__m64 _MM1, __m64 _MM2);//_mm_subs_pi16//Subtracts the eight unsigned 8-bit values in _MM2 from the eight unsigned 8-bit//values in _MM1 and saturates__m64 _m_psubusb(__m64 _MM1, __m64 _MM2);//_mm_subs_pu8//Subtracts the four unsigned 16-bit values in _MM2 from the four unsigned 16-bit//values in _MM1 and saturates__m64 _m_psubusw(__m64 _MM1, __m64 _MM2);//_mm_subs_pu16//Multiplies four 16-bit values in _MM1 by four 16-bit values in _MM2 to produce//four 32-bit intermediate results, which are then summed by pairs to produce two//32-bit results,r0=_MM10*_MM20+_MM11*_MM21, r1=_MM12*_MM22+_MM13*_MM23__m64 _m_pmaddwd(__m64 _MM1, __m64 _MM2);//_mm_madd_pi16//Multiplies four signed 16-bit values in _MM1 by four signed 16-bit values in _MM2//and produces the high 16 bits of the four results__m64 _m_pmulhw(__m64 _MM1, __m64 _MM2);//_mm_mulhi_pi16//Multiplies four 16-bit values in _MM1 by four 16-bit values in _MM2 and produces//the low 16 bits of the four results__m64 _m_pmullw(__m64 _MM1, __m64 _MM2);//_mm_mullo_pi16/* Shift intrinsics *///Shifts four 16-bit values in _M left the amount specified by _Count //while shifting in zeros,左移_Count位,移出位补0__m64 _m_psllw(__m64 _M, __m64 _Count);//_mm_sll_pi16//Shifts four 16-bit values in _M left the amount specified by _Ccount while //shifting in zeros,左移_Count位,移出位补0,_Count需是一个立即数//汇编语言中的立即数相当于高级语言中的常量(常数),它是直接出现在指令中的数,//不用存储在寄存器或存储器中的数__m64 _m_psllwi(__m64 _M, int _Count);//_mm_slli_pi16 //Shifts two 32-bit values in _M left the amount specified by _Count//while shifting in zeros__m64 _m_pslld(__m64 _M, __m64 _Count);//_mm_sll_pi32//Shifts two 32-bit values in _M left the amount specified by _Count//while shifting in zeros__m64 _m_pslldi(__m64 _M, int _Count);//_mm_slli_pi32//Shifts the 64-bit value in _M left the amount specified by _Count//while shifting in zeros__m64 _m_psllq(__m64 _M, __m64 _Count);//_mm_sll_si64//Shifts the 64-bit value in _M left the amount specified by 
_Count//while shifting in zeros__m64 _m_psllqi(__m64 _M, int _Count);//_mm_slli_si64//Shifts four 16-bit values in _M right the amount specified by _Count//while shifting in the sign bit__m64 _m_psraw(__m64 _M, __m64 _Count);//_mm_sra_pi16//Shifts four 16-bit values in _M right the amount specified by _Count//while shifting in the sign bit__m64 _m_psrawi(__m64 _M, int _Count);//_mm_srai_pi16//Shifts two 32-bit values in _M right the amount specified by _Count//while shifting in the sign bit__m64 _m_psrad(__m64 _M, __m64 _Count);//_mm_sra_pi32//Shifts two 32-bit values in _M right the amount specified by _Count//while shifting in the sign bit__m64 _m_psradi(__m64 _M, int _Count);//_mm_srai_pi32//Shifts four 16-bit values in _M right the amount specified by _Count//while shifting in zeros__m64 _m_psrlw(__m64 _M, __m64 _Count);//_mm_srl_pi16//Shifts four 16-bit values in _M right the amount specified by _Count//while shifting in zeros__m64 _m_psrlwi(__m64 _M, int _Count);//_mm_srli_pi16//Shifts two 32-bit values in _M right the amount specified by _Count//while shifting in zeros__m64 _m_psrld(__m64 _M, __m64 _Count);//_mm_srl_pi32//Shifts two 32-bit values in _M right the amount specified by _Count//while shifting in zeros__m64 _m_psrldi(__m64 _M, int _Count);//_mm_srli_pi32 //Shifts the 64-bit value in _M right the amount specified by _Count//while shifting in zeros__m64 _m_psrlq(__m64 _M, __m64 _Count);//_mm_srl_si64//Shifts the 64-bit value in _M right the amount specified by _Count//while shifting in zeros__m64 _m_psrlqi(__m64 _M, int _Count);//_mm_srli_si64/* Logical intrinsics *///Performs a bitwise AND of the 64-bit value in _MM1 with the 64-bit value in _MM2__m64 _m_pand(__m64 _MM1, __m64 _MM2);//_mm_and_si64//Performs a logical NOT on the 64-bit value in _MM1 and use the result in a //bitwise AND with the 64-bit value in _MM2__m64 _m_pandn(__m64 _MM1, __m64 _MM2);//_mm_andnot_si64//Performs a bitwise OR of the 64-bit value in _MM1 with the 64-bit value in 
_MM2__m64 _m_por(__m64 _MM1, __m64 _MM2);//_mm_or_si64//Performs a bitwise XOR of the 64-bit value in _MM1 with the 64-bit value in _MM2__m64 _m_pxor(__m64 _MM1, __m64 _MM2);//_mm_xor_si64/* Comparison intrinsics *///If the respective 8-bit values in _MM1 are equal to the respective //8-bit values in _MM2, sets the respective 8-bit resulting values to //all ones; otherwise, sets them to all zeros__m64 _m_pcmpeqb(__m64 _MM1, __m64 _MM2);//_mm_cmpeq_pi8//If the respective 16-bit values in _MM1 are equal to the respective //16-bit values in _MM2, sets the respective 16-bit resulting values //to all ones; otherwise, sets them to all zeros__m64 _m_pcmpeqw(__m64 _MM1, __m64 _MM2);//_mm_cmpeq_pi16//If the respective 32-bit values in _MM1 are equal to the respective //32-bit values in _MM2, sets the respective 32-bit resulting values//to all ones; otherwise, sets them to all zeros__m64 _m_pcmpeqd(__m64 _MM1, __m64 _MM2);//_mm_cmpeq_pi32 //If the respective 8-bit values in _MM1 are greater than the respective //8-bit values in _MM2, sets the respective 8-bit resulting values to all ones;//otherwise, sets them to all zeros__m64 _m_pcmpgtb(__m64 _MM1, __m64 _MM2);//_mm_cmpgt_pi8//If the respective 16-bit values in _MM1 are greater than the respective 16-bit//values in _MM2, sets the respective 16-bit resulting values to all ones;//otherwise, sets them to all zeros__m64 _m_pcmpgtw(__m64 _MM1, __m64 _MM2);//_mm_cmpgt_pi16//If the respective 32-bit values in _MM1 are greater than the respective 32-bit//values in _MM2, sets the respective 32-bit resulting values to all ones;//otherwise, sets them all to zeros__m64 _m_pcmpgtd(__m64 _MM1, __m64 _MM2);//_mm_cmpgt_pi32/* Utility intrinsics *///Sets the 64-bit value to zero__m64 _mm_setzero_si64(void);//Sets the two signed 32-bit integer values,r0=_I0, r1=_I1__m64 _mm_set_pi32(int _I1, int _I0);//r0=_S0, r1=_S1, r2=_S2, r3=_S3__m64 _mm_set_pi16(short _S3, short _S2, short _S1, short _S0);//r0=_B0, r1=_B1, r2=_B2, r3=_B3, r4=_B4, ..., 
r7=_B7__m64 _mm_set_pi8(char _B7, char _B6, char _B5, char _B4,char _B3, char _B2, char _B1, char _B0);//Sets the two signed 32-bit integer values to _I,r0=r1=_I__m64 _mm_set1_pi32(int _I);//Sets the four signed 16-bit integer values to _S, r0=r1=r2=r3=_S__m64 _mm_set1_pi16(short _S);//Sets the eight signed 8-bit integer values to _B, r0=r1...=r7=_B__m64 _mm_set1_pi8(char _B);//Sets the two signed 32-bit integer values in reverse order,r0=_I1, r1=_I0__m64 _mm_setr_pi32(int _I1, int _I0);//Sets the four signed 16-bit integer values in reverse order,//r0=_S3, r1=_S2, r2=_S1, r3=_S0__m64 _mm_setr_pi16(short _S3, short _S2, short _S1, short _S0);//Sets the eight signed 8-bit integer values in reverse order//r0=_B7, r1=_B6, r2=_B5, r3=_B4, r4=_B3, r5=_B2, r6=_B1, r7=_B0__m64 _mm_setr_pi8(char _B7, char _B6, char _B5, char _B4,char _B3, char _B2, char _B1, char _B0);/* Alternate intrinsic name definitions */#define _mm_empty _m_empty#define _mm_cvtsi32_si64 _m_from_int#define _mm_cvtsi64_si32 _m_to_int#define _mm_packs_pi16 _m_packsswb#define _mm_packs_pi32 _m_packssdw#define _mm_packs_pu16 _m_packuswb#define _mm_unpackhi_pi8 _m_punpckhbw#define _mm_unpackhi_pi16 _m_punpckhwd#define _mm_unpackhi_pi32 _m_punpckhdq#define _mm_unpacklo_pi8 _m_punpcklbw#define _mm_unpacklo_pi16 _m_punpcklwd#define _mm_unpacklo_pi32 _m_punpckldq#define _mm_add_pi8 _m_paddb#define _mm_add_pi16 _m_paddw#define _mm_add_pi32 _m_paddd#define _mm_adds_pi8 _m_paddsb#define _mm_adds_pi16 _m_paddsw#define _mm_adds_pu8 _m_paddusb#define _mm_adds_pu16 _m_paddusw#define _mm_sub_pi8 _m_psubb#define _mm_sub_pi16 _m_psubw#define _mm_sub_pi32 _m_psubd#define _mm_subs_pi8 _m_psubsb#define _mm_subs_pi16 _m_psubsw#define _mm_subs_pu8 _m_psubusb#define _mm_subs_pu16 _m_psubusw#define _mm_madd_pi16 _m_pmaddwd#define _mm_mulhi_pi16 _m_pmulhw#define _mm_mullo_pi16 _m_pmullw#define _mm_sll_pi16 _m_psllw#define _mm_slli_pi16 _m_psllwi#define _mm_sll_pi32 _m_pslld#define _mm_slli_pi32 _m_pslldi#define _mm_sll_si64 
_m_psllq#define _mm_slli_si64 _m_psllqi#define _mm_sra_pi16 _m_psraw#define _mm_srai_pi16 _m_psrawi#define _mm_sra_pi32 _m_psrad#define _mm_srai_pi32 _m_psradi#define _mm_srl_pi16 _m_psrlw#define _mm_srli_pi16 _m_psrlwi#define _mm_srl_pi32 _m_psrld#define _mm_srli_pi32 _m_psrldi#define _mm_srl_si64 _m_psrlq#define _mm_srli_si64 _m_psrlqi#define _mm_and_si64 _m_pand#define _mm_andnot_si64 _m_pandn#define _mm_or_si64 _m_por#define _mm_xor_si64 _m_pxor#define _mm_cmpeq_pi8 _m_pcmpeqb#define _mm_cmpeq_pi16 _m_pcmpeqw#define _mm_cmpeq_pi32 _m_pcmpeqd#define _mm_cmpgt_pi8 _m_pcmpgtb#define _mm_cmpgt_pi16 _m_pcmpgtw#define _mm_cmpgt_pi32 _m_pcmpgtd
参考文献:1、http://software.intel.com/sites/products/documentation/studio/composer/en-us/2011Update/compiler_c/intref_cls/common/intref_mmx_emms_usage.htm
MMX Intrinsics各函数介绍相关推荐
- SSE3 和 SSSE3 Intrinsics各函数介绍
[转载]:SSE3和SSSE3 Intrinsics各函数介绍 SIMD相关头文件包括: mmintrin.h为MMX 头文件,其中__m64的定义为: xmmintrin.h为SSE 头文件,此头文 ...
- SSE4.1和SSE4.2 Intrinsics各函数介绍
SIMD相关头文件包括: //#include <ivec.h>//MMX //#include <fvec.h>//SSE(also include ivec.h) //#i ...
- SSE2 Intrinsics各函数介绍
SIMD相关头文件包括: //#include <ivec.h>//MMX //#include <fvec.h>//SSE(also include ivec.h) //#i ...
- SSE Intrinsics各函数介绍
原文:http://blog.csdn.net/fengbingchun/article/details/19293081 SIMD相关头文件包括: [cpp] view plaincopy //#i ...
- SSE2 Intrinsics各函数介绍 及简单例子
转载地址 http://blog.csdn.net/fengbingchun/article/details/18460199 关于ARM上的SIMD可以参见网址,ARM上的SIMD技术叫NEON: ...
- AES(Advanced Encryption Standard) Intrinsics各函数介绍
AES为高级加密标准,是较流行的一种密码算法. SIMD相关头文件包括: //#include <ivec.h>//MMX //#include <fvec.h>//SSE(a ...
- SSE3和SSSE3 Intrinsics各函数介绍
SIMD相关头文件包括: //#include <ivec.h>//MMX //#include <fvec.h>//SSE(also include ivec.h) //#i ...
- Neon Intrinsics各函数介绍
#ifndef __ARM_NEON__ #error You must enable NEON instructions (e.g. -mfloat-abi=softfp -mfpu=neon) t ...
- ARM Neon Intrinsics各函数介绍
#ifndef __ARM_NEON__ #error You must enable NEON instructions (e.g. -mfloat-abi=softfp -mfpu=neon) t ...
最新文章
- VMware Coding Challenge: Possible Scores Summary: static
- JQUERY的appendappendTo
- bpcs uploader.php,linux 备份定时同步到百度云盘
- Java常用设计模式————建造者模式
- 职业人应该“这山望着那山高”
- 分布式 Spring Cloud 基于 Spring Boot 开发一整套
- Mac Postman app使用方法
- Qt之SQLite数据库可视化工具
- 使用硕正插件在strtus2框架下返回数据问题
- OpenVINO之链接库
- 蝴蝶效应、青蛙现象、鳄鱼法则、鲇鱼效应、羊群效应、刺猬法则、手表定律、破窗理论、二八定律、木桶理论、马太效应
- php保存微信用户头像到本地或者服务器的完美方案!
- 参数化建模类毕业论文文献有哪些?
- 禁止应用和adb安装APK
- Matlab中rgb2ind函数用法
- IOT网关开发受难记-(一) 2022/05/13
- 计算机怎样去掉语音,如何关闭word语音识别 (数据丢失 - 电脑使用小技巧 - 电子发烧友网...
- 7-32 寻找250 (10分)
- B站,牛啊。,java底层原理
- OpenCV:图像批量、任意比例裁剪
热门文章
- Python Qt GUI设计:QClipboard剪贴数据类(基础篇—19)
- Monitor CodeForces - 846D ——二维前缀和
- 【面向对象编程】(2) 类属性的定义及使用;__repr__()方法
- c++之openGL在VS中的配置及简单图形绘制
- Python计算机视觉——SIFT特征
- python3.7.2怎么用不了pillow_python 3.7.0 下pillow安装方法
- Udacity机器人软件工程师课程笔记(五)-样本搜索和找回-基于漫游者号模拟器-自主驾驶
- 强哥原创管理方法论之“掌纹管理学”
- 设置显示Git的修改历史History快捷键Alt+H,方便多人开发的时候快速查看谁修改了代码
- 在Ubuntu 16.04.1 LTS上安装ATS 6.2.1 LTS实录