阅读代码的时候遇到了__m128i、_mm_set1_epi8、_mm_loadu_si128、_mm_max_epu8、_mm_min_epu8、_mm_store_si128、_mm_unpackhi_epi8、_mm_adds_epi16、_mm_srli_si128等SIMD数据类型和intrinsics函数,所以想着作一个总结。


0. SIMD基础知识

SIMD是单指令多数据技术,目前Intel处理器支持的SIMD技术包括MMX、SSE以及AVX。

MMX是MultiMedia eXtensions(多媒体扩展)的缩写,是第六代CPU芯片的重要特点。它是继Intel386 ( TM ) 处理器(将体系结构扩展至32位)之后对Intel体系结构最重要的加强,这些指令集能够加速有关图形、影像、声音等的应用。其中,MMX提供了8个64bit的寄存器进行SIMD操作。

SSE是"流式单指令多数据扩展(Streaming SIMD Extensions)"的缩写,早期也被称为"因特网数据流单指令序列扩展(Internet Streaming SIMD Extensions,ISSE)"。SSE除保持原有的MMX指令外,又新添加了70条指令,在加快浮点运算的同时,改善了内存的使用效率,使内存速度更快。其中,SSE系列提供了8个128bit的寄存器(XMM0~XMM7,在64位模式下扩展为16个)进行SIMD操作。

AVX指令集是Sandy Bridge和Larrabee架构下的新指令集,在单指令多数据流计算性能增加的同时也沿用了MMX/SSE指令集,是在之前的128位扩展到256位的单指令多数据流,不过和MMX/SSE的不同点在于增强的AVX指令,从指令的格式上就发生了很大的变化。

1. 如何使用SIMD指令以及其相关头文件

使用SIMD指令有两种方式:一是直接在C/C++中嵌入(汇编)指令;二是使用Intel C++ Compiler或是Microsoft Visual C++提供的支持SIMD指令集的intrinsics内联函数。从代码可读和维护角度讲,推荐使用intrinsics内联函数的形式。intrinsics是对MMX、SSE等指令集的一种封装,以函数的形式提供,使得程序员更容易编写和使用这些高级指令,在编译的时候,这些函数会被内联为汇编,不会产生函数调用的开销。要想使用SIMD指令,则需要包含对应的头文件。

接下来介绍一下头文件之间的关系:

#include <mmintrin.h>  //MMX
#include <xmmintrin.h> //SSE(include mmintrin.h)
#include <emmintrin.h> //SSE2(include xmmintrin.h)
#include <pmmintrin.h> //SSE3(include emmintrin.h)
#include <tmmintrin.h> //SSSE3(include pmmintrin.h)
#include <smmintrin.h> //SSE4.1(include tmmintrin.h)
#include <nmmintrin.h> //SSE4.2(include smmintrin.h)
#include <wmmintrin.h> //AES(include nmmintrin.h)
#include <immintrin.h> //AVX(include wmmintrin.h)
#include <intrin.h>    //(include immintrin.h)

包含关系(A ⊂ B 表示头文件B包含了头文件A):mmintrin.h ⊂ xmmintrin.h ⊂ emmintrin.h ⊂ pmmintrin.h ⊂ tmmintrin.h ⊂ smmintrin.h ⊂ nmmintrin.h ⊂ wmmintrin.h ⊂ immintrin.h ⊂ intrin.h

2. 变量类型

>>>__m64

mmintrin.h为MMX头文件,__m64的定义就来自这个头文件:

/*
 * __m64 - 64-bit packed-data type from the MMX header.
 *
 * All members overlay the same storage, so a single value can be viewed as
 * one 64-bit integer, two floats, or a vector of 8-, 16- or 32-bit lanes.
 * _CRT_ALIGN(8) requests 8-byte alignment for the type.
 * NOTE(review): __declspec(intrin_type) is MSVC-specific; presumably it marks
 * the type as a compiler intrinsic type -- confirm in the MSVC documentation.
 */
typedef union __declspec(intrin_type) _CRT_ALIGN(8) __m64
{
    unsigned __int64    m64_u64;        /* whole value, unsigned 64-bit   */
    float               m64_f32[2];     /* two single-precision floats    */
    __int8              m64_i8[8];      /* eight signed 8-bit lanes       */
    __int16             m64_i16[4];     /* four signed 16-bit lanes       */
    __int32             m64_i32[2];     /* two signed 32-bit lanes        */
    __int64             m64_i64;        /* whole value, signed 64-bit     */
    unsigned __int8     m64_u8[8];      /* eight unsigned 8-bit lanes     */
    unsigned __int16    m64_u16[4];     /* four unsigned 16-bit lanes     */
    unsigned __int32    m64_u32[2];     /* two unsigned 32-bit lanes      */
} __m64;

可以看到,__m64为一个共用体(union)类型,union的特点为:所有成员占用同一段内存,在不同的时间保存不同的数据类型和不同长度的变量。在union中,所有的共用体成员共用一个空间,并且同一时间只能存储其中一个成员变量的值。__m64的大小为64位(8字节),由其最大的成员决定;其中,_CRT_ALIGN(8)(即__declspec(align(8)))用于设置内存对齐方式(8字节对齐)。__m64这种类型的变量可用作MMX指令的操作数,不应直接访问它的成员,它会被自动分配为8个字节的字长并按8字节对齐。

>>>__m128

xmmintrin.h为SSE头文件,__m128的定义就来自于这个头文件:

/*
 * __m128 - 128-bit packed-data type from the SSE header.
 *
 * All members overlay the same 16 bytes; m128_f32 (four floats) is the first
 * member, the remaining members expose the same bits as integer lanes.
 * _CRT_ALIGN(16) requests 16-byte alignment for the type.
 */
typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128
{
    float               m128_f32[4];    /* four single-precision floats   */
    unsigned __int64    m128_u64[2];    /* two unsigned 64-bit lanes      */
    __int8              m128_i8[16];    /* sixteen signed 8-bit lanes     */
    __int16             m128_i16[8];    /* eight signed 16-bit lanes      */
    __int32             m128_i32[4];    /* four signed 32-bit lanes       */
    __int64             m128_i64[2];    /* two signed 64-bit lanes        */
    unsigned __int8     m128_u8[16];    /* sixteen unsigned 8-bit lanes   */
    unsigned __int16    m128_u16[8];    /* eight unsigned 16-bit lanes    */
    unsigned __int32    m128_u32[4];    /* four unsigned 32-bit lanes     */
} __m128;

>>>__m128i 和 __m128d

emmintrin.h为SSE2头文件,其中__m128i和__m128d的定义就来自于这个头文件:

/*
 * __m128i - 128-bit packed-integer type from the SSE2 header.
 *
 * All members overlay the same 16 bytes and expose them as signed or
 * unsigned lanes of 8, 16, 32 or 64 bits.
 * _CRT_ALIGN(16) requests 16-byte alignment for the type.
 */
typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128i
{
    __int8              m128i_i8[16];   /* sixteen signed 8-bit lanes     */
    __int16             m128i_i16[8];   /* eight signed 16-bit lanes      */
    __int32             m128i_i32[4];   /* four signed 32-bit lanes       */
    __int64             m128i_i64[2];   /* two signed 64-bit lanes        */
    unsigned __int8     m128i_u8[16];   /* sixteen unsigned 8-bit lanes   */
    unsigned __int16    m128i_u16[8];   /* eight unsigned 16-bit lanes    */
    unsigned __int32    m128i_u32[4];   /* four unsigned 32-bit lanes     */
    unsigned __int64    m128i_u64[2];   /* two unsigned 64-bit lanes      */
} __m128i;

/*
 * __m128d - 128-bit packed double-precision type from the SSE2 header.
 *
 * Unlike __m128i this is a struct with a single member: two doubles.
 * _CRT_ALIGN(16) requests 16-byte alignment for the type.
 */
typedef struct __declspec(intrin_type) _CRT_ALIGN(16) __m128d
{
    double              m128d_f64[2];   /* two double-precision floats    */
} __m128d;

>>>__m256、__m256d 和 __m256i

immintrin.h为AVX头文件,其中__m256、__m256d和__m256i的定义就来自于这个头文件:

/*
 * __m256 - 256-bit packed single-precision type from the AVX header.
 * The only member is an array of eight floats.
 * _CRT_ALIGN(32) requests 32-byte alignment for the type.
 */
typedef union __declspec(intrin_type) _CRT_ALIGN(32) __m256
{
    float               m256_f32[8];    /* eight single-precision floats  */
} __m256;

/*
 * __m256d - 256-bit packed double-precision type from the AVX header.
 * Declared as an untagged struct holding four doubles.
 */
typedef struct __declspec(intrin_type) _CRT_ALIGN(32)
{
    double              m256d_f64[4];   /* four double-precision floats   */
} __m256d;

/*
 * __m256i - 256-bit packed-integer type from the AVX header.
 * All members overlay the same 32 bytes and expose them as signed or
 * unsigned lanes of 8, 16, 32 or 64 bits.
 */
typedef union __declspec(intrin_type) _CRT_ALIGN(32) __m256i
{
    __int8              m256i_i8[32];   /* thirty-two signed 8-bit lanes   */
    __int16             m256i_i16[16];  /* sixteen signed 16-bit lanes     */
    __int32             m256i_i32[8];   /* eight signed 32-bit lanes       */
    __int64             m256i_i64[4];   /* four signed 64-bit lanes        */
    unsigned __int8     m256i_u8[32];   /* thirty-two unsigned 8-bit lanes */
    unsigned __int16    m256i_u16[16];  /* sixteen unsigned 16-bit lanes   */
    unsigned __int32    m256i_u32[8];   /* eight unsigned 32-bit lanes     */
    unsigned __int64    m256i_u64[4];   /* four unsigned 64-bit lanes      */
} __m256i;

3. 函数接口

>>>mmintrin.h头文件中的函数接口

/* General support intrinsics */
void  _m_empty(void);
__m64 _m_from_int(int _I);
int   _m_to_int(__m64 _M);
__m64 _m_packsswb(__m64 _MM1, __m64 _MM2);
__m64 _m_packssdw(__m64 _MM1, __m64 _MM2);
__m64 _m_packuswb(__m64 _MM1, __m64 _MM2);
__m64 _m_punpckhbw(__m64 _MM1, __m64 _MM2);
__m64 _m_punpckhwd(__m64 _MM1, __m64 _MM2);
__m64 _m_punpckhdq(__m64 _MM1, __m64 _MM2);
__m64 _m_punpcklbw(__m64 _MM1, __m64 _MM2);
__m64 _m_punpcklwd(__m64 _MM1, __m64 _MM2);
__m64 _m_punpckldq(__m64 _MM1, __m64 _MM2);/* Packed arithmetic intrinsics */
__m64 _m_paddb(__m64 _MM1, __m64 _MM2);
__m64 _m_paddw(__m64 _MM1, __m64 _MM2);
__m64 _m_paddd(__m64 _MM1, __m64 _MM2);
__m64 _m_paddsb(__m64 _MM1, __m64 _MM2);
__m64 _m_paddsw(__m64 _MM1, __m64 _MM2);
__m64 _m_paddusb(__m64 _MM1, __m64 _MM2);
__m64 _m_paddusw(__m64 _MM1, __m64 _MM2);
__m64 _m_psubb(__m64 _MM1, __m64 _MM2);
__m64 _m_psubw(__m64 _MM1, __m64 _MM2);
__m64 _m_psubd(__m64 _MM1, __m64 _MM2);
__m64 _m_psubsb(__m64 _MM1, __m64 _MM2);
__m64 _m_psubsw(__m64 _MM1, __m64 _MM2);
__m64 _m_psubusb(__m64 _MM1, __m64 _MM2);
__m64 _m_psubusw(__m64 _MM1, __m64 _MM2);
__m64 _m_pmaddwd(__m64 _MM1, __m64 _MM2);
__m64 _m_pmulhw(__m64 _MM1, __m64 _MM2);
__m64 _m_pmullw(__m64 _MM1, __m64 _MM2);/* Shift intrinsics */
__m64 _m_psllw(__m64 _M, __m64 _Count);
__m64 _m_psllwi(__m64 _M, int _Count);
__m64 _m_pslld(__m64 _M, __m64 _Count);
__m64 _m_pslldi(__m64 _M, int _Count);
__m64 _m_psllq(__m64 _M, __m64 _Count);
__m64 _m_psllqi(__m64 _M, int _Count);
__m64 _m_psraw(__m64 _M, __m64 _Count);
__m64 _m_psrawi(__m64 _M, int _Count);
__m64 _m_psrad(__m64 _M, __m64 _Count);
__m64 _m_psradi(__m64 _M, int _Count);
__m64 _m_psrlw(__m64 _M, __m64 _Count);
__m64 _m_psrlwi(__m64 _M, int _Count);
__m64 _m_psrld(__m64 _M, __m64 _Count);
__m64 _m_psrldi(__m64 _M, int _Count);
__m64 _m_psrlq(__m64 _M, __m64 _Count);
__m64 _m_psrlqi(__m64 _M, int _Count);/* Logical intrinsics */
__m64 _m_pand(__m64 _MM1, __m64 _MM2);
__m64 _m_pandn(__m64 _MM1, __m64 _MM2);
__m64 _m_por(__m64 _MM1, __m64 _MM2);
__m64 _m_pxor(__m64 _MM1, __m64 _MM2);/* Comparison intrinsics */
__m64 _m_pcmpeqb(__m64 _MM1, __m64 _MM2);
__m64 _m_pcmpeqw(__m64 _MM1, __m64 _MM2);
__m64 _m_pcmpeqd(__m64 _MM1, __m64 _MM2);
__m64 _m_pcmpgtb(__m64 _MM1, __m64 _MM2);
__m64 _m_pcmpgtw(__m64 _MM1, __m64 _MM2);
__m64 _m_pcmpgtd(__m64 _MM1, __m64 _MM2);/* Utility intrinsics */
__m64 _mm_setzero_si64(void);
__m64 _mm_set_pi32(int _I1, int _I0);
__m64 _mm_set_pi16(short _S3, short _S2, short _S1, short _S0);
__m64 _mm_set_pi8(char _B7, char _B6, char _B5, char _B4,char _B3, char _B2, char _B1, char _B0);
__m64 _mm_set1_pi32(int _I);
__m64 _mm_set1_pi16(short _S);
__m64 _mm_set1_pi8(char _B);
__m64 _mm_setr_pi32(int _I1, int _I0);
__m64 _mm_setr_pi16(short _S3, short _S2, short _S1, short _S0);
__m64 _mm_setr_pi8(char _B7, char _B6, char _B5, char _B4,char _B3, char _B2, char _B1, char _B0);

>>>xmmintrin.h头文件中的函数接口

/** FP, arithmetic*/extern __m128 _mm_add_ss(__m128 _A, __m128 _B);
extern __m128 _mm_add_ps(__m128 _A, __m128 _B);
extern __m128 _mm_sub_ss(__m128 _A, __m128 _B);
extern __m128 _mm_sub_ps(__m128 _A, __m128 _B);
extern __m128 _mm_mul_ss(__m128 _A, __m128 _B);
extern __m128 _mm_mul_ps(__m128 _A, __m128 _B);
extern __m128 _mm_div_ss(__m128 _A, __m128 _B);
extern __m128 _mm_div_ps(__m128 _A, __m128 _B);
extern __m128 _mm_sqrt_ss(__m128 _A);
extern __m128 _mm_sqrt_ps(__m128 _A);
extern __m128 _mm_rcp_ss(__m128 _A);
extern __m128 _mm_rcp_ps(__m128 _A);
extern __m128 _mm_rsqrt_ss(__m128 _A);
extern __m128 _mm_rsqrt_ps(__m128 _A);
extern __m128 _mm_min_ss(__m128 _A, __m128 _B);
extern __m128 _mm_min_ps(__m128 _A, __m128 _B);
extern __m128 _mm_max_ss(__m128 _A, __m128 _B);
extern __m128 _mm_max_ps(__m128 _A, __m128 _B);/** FP, logical*/extern __m128 _mm_and_ps(__m128 _A, __m128 _B);
extern __m128 _mm_andnot_ps(__m128 _A, __m128 _B);
extern __m128 _mm_or_ps(__m128 _A, __m128 _B);
extern __m128 _mm_xor_ps(__m128 _A, __m128 _B);/** FP, comparison*/extern __m128 _mm_cmpeq_ss(__m128 _A, __m128 _B);
extern __m128 _mm_cmpeq_ps(__m128 _A, __m128 _B);
extern __m128 _mm_cmplt_ss(__m128 _A, __m128 _B);
extern __m128 _mm_cmplt_ps(__m128 _A, __m128 _B);
extern __m128 _mm_cmple_ss(__m128 _A, __m128 _B);
extern __m128 _mm_cmple_ps(__m128 _A, __m128 _B);
extern __m128 _mm_cmpgt_ss(__m128 _A, __m128 _B);
extern __m128 _mm_cmpgt_ps(__m128 _A, __m128 _B);
extern __m128 _mm_cmpge_ss(__m128 _A, __m128 _B);
extern __m128 _mm_cmpge_ps(__m128 _A, __m128 _B);
extern __m128 _mm_cmpneq_ss(__m128 _A, __m128 _B);
extern __m128 _mm_cmpneq_ps(__m128 _A, __m128 _B);
extern __m128 _mm_cmpnlt_ss(__m128 _A, __m128 _B);
extern __m128 _mm_cmpnlt_ps(__m128 _A, __m128 _B);
extern __m128 _mm_cmpnle_ss(__m128 _A, __m128 _B);
extern __m128 _mm_cmpnle_ps(__m128 _A, __m128 _B);
extern __m128 _mm_cmpngt_ss(__m128 _A, __m128 _B);
extern __m128 _mm_cmpngt_ps(__m128 _A, __m128 _B);
extern __m128 _mm_cmpnge_ss(__m128 _A, __m128 _B);
extern __m128 _mm_cmpnge_ps(__m128 _A, __m128 _B);
extern __m128 _mm_cmpord_ss(__m128 _A, __m128 _B);
extern __m128 _mm_cmpord_ps(__m128 _A, __m128 _B);
extern __m128 _mm_cmpunord_ss(__m128 _A, __m128 _B);
extern __m128 _mm_cmpunord_ps(__m128 _A, __m128 _B);
extern int _mm_comieq_ss(__m128 _A, __m128 _B);
extern int _mm_comilt_ss(__m128 _A, __m128 _B);
extern int _mm_comile_ss(__m128 _A, __m128 _B);
extern int _mm_comigt_ss(__m128 _A, __m128 _B);
extern int _mm_comige_ss(__m128 _A, __m128 _B);
extern int _mm_comineq_ss(__m128 _A, __m128 _B);
extern int _mm_ucomieq_ss(__m128 _A, __m128 _B);
extern int _mm_ucomilt_ss(__m128 _A, __m128 _B);
extern int _mm_ucomile_ss(__m128 _A, __m128 _B);
extern int _mm_ucomigt_ss(__m128 _A, __m128 _B);
extern int _mm_ucomige_ss(__m128 _A, __m128 _B);
extern int _mm_ucomineq_ss(__m128 _A, __m128 _B);/** FP, conversions*/extern int _mm_cvt_ss2si(__m128 _A);
extern __m64 _mm_cvt_ps2pi(__m128 _A);
extern int _mm_cvtt_ss2si(__m128 _A);
extern __m64 _mm_cvtt_ps2pi(__m128 _A);
extern __m128 _mm_cvt_si2ss(__m128, int);
extern __m128 _mm_cvt_pi2ps(__m128, __m64);
extern float _mm_cvtss_f32(__m128 _A);/** Support for 64-bit extension intrinsics*/
#if defined (_M_X64)
extern __int64 _mm_cvtss_si64(__m128 _A);
extern __int64 _mm_cvttss_si64(__m128 _A);
extern __m128  _mm_cvtsi64_ss(__m128 _A, __int64 _B);
#endif  /* defined (_M_X64) *//** FP, misc*/extern __m128 _mm_shuffle_ps(__m128 _A, __m128 _B, unsigned int _Imm8);
extern __m128 _mm_unpackhi_ps(__m128 _A, __m128 _B);
extern __m128 _mm_unpacklo_ps(__m128 _A, __m128 _B);
extern __m128 _mm_loadh_pi(__m128, __m64 const*);
extern __m128 _mm_movehl_ps(__m128, __m128);
extern __m128 _mm_movelh_ps(__m128, __m128);
extern void _mm_storeh_pi(__m64 *, __m128);
extern __m128 _mm_loadl_pi(__m128, __m64 const*);
extern void _mm_storel_pi(__m64 *, __m128);
extern int _mm_movemask_ps(__m128 _A);/** Integer extensions*/
extern int _m_pextrw(__m64, int);
extern __m64 _m_pinsrw(__m64, int, int);
extern __m64 _m_pmaxsw(__m64, __m64);
extern __m64 _m_pmaxub(__m64, __m64);
extern __m64 _m_pminsw(__m64, __m64);
extern __m64 _m_pminub(__m64, __m64);
extern int _m_pmovmskb(__m64);
extern __m64 _m_pmulhuw(__m64, __m64);
extern __m64 _m_pshufw(__m64, int);
extern void _m_maskmovq(__m64, __m64, char *);
extern __m64 _m_pavgb(__m64, __m64);
extern __m64 _m_pavgw(__m64, __m64);
extern __m64 _m_psadbw(__m64, __m64);/** memory & initialization*/extern __m128 _mm_set_ss(float _A);
extern __m128 _mm_set_ps1(float _A);
extern __m128 _mm_set_ps(float _A, float _B, float _C, float _D);
extern __m128 _mm_setr_ps(float _A, float _B, float _C, float _D);
extern __m128 _mm_setzero_ps(void);
extern __m128 _mm_load_ss(float const*_A);
extern __m128 _mm_load_ps1(float const*_A);
extern __m128 _mm_load_ps(float const*_A);
extern __m128 _mm_loadr_ps(float const*_A);
extern __m128 _mm_loadu_ps(float const*_A);
extern void _mm_store_ss(float *_V, __m128 _A);
extern void _mm_store_ps1(float *_V, __m128 _A);
extern void _mm_store_ps(float *_V, __m128 _A);
extern void _mm_storer_ps(float *_V, __m128 _A);
extern void _mm_storeu_ps(float *_V, __m128 _A);
extern void _mm_prefetch(char const*_A, int _Sel);
extern void _mm_stream_pi(__m64 *, __m64);
extern void _mm_stream_ps(float *, __m128);
extern __m128 _mm_move_ss(__m128 _A, __m128 _B);extern void _mm_sfence(void);
extern unsigned int _mm_getcsr(void);
extern void _mm_setcsr(unsigned int);

/******************************************************/
/* UTILITY INTRINSICS FUNCTION DEFINITIONS START HERE */
/******************************************************/

/*********************************************************/
/*  NAME : _mm_cvtpi16_ps                                */
/*  DESCRIPTION : Convert 4 16-bit signed integer values */
/*                to 4 single-precision float values     */
/*  IN : __m64 _A                                        */
/*  OUT : none                                           */
/*  RETURN : __m128 : (float)_A                          */
/*********************************************************/
__inline __m128 _mm_cvtpi16_ps(__m64 _A)
{
    /* Compare 0 > _A per 16-bit lane; the result supplies the upper half of
       each widened lane (the sign extension). */
    __m64  _Sign  = _mm_cmpgt_pi16(_mm_setzero_si64(), _A);

    /* Interleave values with their sign words to form 32-bit integers. */
    __m64  _Hi32  = _mm_unpackhi_pi16(_A, _Sign);
    __m64  _Lo32  = _mm_unpacklo_pi16(_A, _Sign);

    /* Convert the upper pair first, move it into the high half of the
       vector, then convert the lower pair into the low half. */
    __m128 _Upper = _mm_cvtpi32_ps(_mm_setzero_ps(), _Hi32);

    return _mm_cvtpi32_ps(_mm_movelh_ps(_Upper, _Upper), _Lo32);
}

/***********************************************************/
/*  NAME : _mm_cvtpu16_ps                                  */
/*  DESCRIPTION : Convert 4 16-bit unsigned integer values */
/*                to 4 single-precision float values       */
/*  IN : __m64 _A                                          */
/*  OUT : none                                             */
/*  RETURN : __m128 : (float)_A                            */
/***********************************************************/
__inline __m128 _mm_cvtpu16_ps(__m64 _A)
{
    /* Unsigned input: each 16-bit lane is widened with a zero upper half. */
    __m64  _Zero  = _mm_setzero_si64();
    __m64  _Hi32  = _mm_unpackhi_pi16(_A, _Zero);
    __m64  _Lo32  = _mm_unpacklo_pi16(_A, _Zero);

    /* Upper pair is converted first and moved to the high half; the lower
       pair is then converted into the low half. */
    __m128 _Upper = _mm_cvtpi32_ps(_mm_setzero_ps(), _Hi32);

    return _mm_cvtpi32_ps(_mm_movelh_ps(_Upper, _Upper), _Lo32);
}

/******************************************************/
/*  NAME : _mm_cvtps_pi16                             */
/*  DESCRIPTION : Convert 4 single-precision float    */
/*                values to 4 16-bit integer values   */
/*  IN : __m128 _A                                    */
/*  OUT : none                                        */
/*  RETURN : __m64 : (short)_A                        */
/******************************************************/
__inline __m64 _mm_cvtps_pi16(__m128 _A)
{
    /* Convert the low two floats, then the high two (brought down with
       movehl), and pack the four 32-bit results into 16-bit lanes. */
    __m64 _Lo32 = _mm_cvtps_pi32(_A);
    __m64 _Hi32 = _mm_cvtps_pi32(_mm_movehl_ps(_A, _A));

    return _mm_packs_pi32(_Lo32, _Hi32);
}

/******************************************************/
/*  NAME : _mm_cvtpi8_ps                              */
/*  DESCRIPTION : Convert 4 8-bit integer values to 4 */
/*                single-precision float values       */
/*  IN : __m64 _A                                     */
/*  OUT : none                                        */
/*  RETURN : __m128 : (float)_A                       */
/******************************************************/
__inline __m128 _mm_cvtpi8_ps(__m64 _A)
{
    /* Compare 0 > _A per byte to obtain the sign-extension bytes, widen the
       low four bytes to 16 bits, then reuse the 16-bit conversion. */
    __m64 _Sign  = _mm_cmpgt_pi8(_mm_setzero_si64(), _A);
    __m64 _Words = _mm_unpacklo_pi8(_A, _Sign);

    return _mm_cvtpi16_ps(_Words);
}

/******************************************************/
/*  NAME : _mm_cvtpu8_ps                              */
/*  DESCRIPTION : Convert 4 8-bit unsigned integer    */
/*                values to 4 single-precision float  */
/*                values                              */
/*  IN : __m64 _A                                     */
/*  OUT : none                                        */
/*  RETURN : __m128 : (float)_A                       */
/******************************************************/
__inline __m128 _mm_cvtpu8_ps(__m64 _A)
{
    /* Widen the low four bytes to 16 bits with zero upper halves, then
       reuse the unsigned 16-bit conversion. */
    __m64 _Words = _mm_unpacklo_pi8(_A, _mm_setzero_si64());

    return _mm_cvtpu16_ps(_Words);
}

/******************************************************/
/*  NAME : _mm_cvtps_pi8                              */
/*  DESCRIPTION : Convert 4 single-precision float    */
/*                values to 4 8-bit integer values    */
/*  IN : __m128 _A                                    */
/*  OUT : none                                        */
/*  RETURN : __m64 : (char)_A                         */
/******************************************************/
__inline __m64 _mm_cvtps_pi8(__m128 _A)
{
    /* Go through 16-bit lanes first, then pack to bytes; the second pack
       operand is zero, so only the first operand carries data. */
    __m64 _Words = _mm_cvtps_pi16(_A);

    return _mm_packs_pi16(_Words, _mm_setzero_si64());
}

/******************************************************/
/*  NAME : _mm_cvtpi32x2_ps                           */
/*  DESCRIPTION : Convert 4 32-bit integer values     */
/*                to 4 single-precision float values  */
/*  IN : __m64 _A : operand 1                         */
/*       __m64 _B : operand 2                         */
/*  OUT : none                                        */
/*  RETURN : __m128 : (float)_A,(float)_B             */
/******************************************************/
__inline __m128 _mm_cvtpi32x2_ps(__m64 _A, __m64 _B)
{
    /* Convert each operand into the low half of its own vector, then
       combine: _A's pair in the low half, _B's pair in the high half. */
    __m128 _FromA = _mm_cvt_pi2ps(_mm_setzero_ps(), _A);
    __m128 _FromB = _mm_cvt_pi2ps(_mm_setzero_ps(), _B);

    return _mm_movelh_ps(_FromA, _FromB);
}

>>>emmintrin.h头文件中的函数接口

/** DP, arithmetic*/extern __m128d _mm_add_sd(__m128d _A, __m128d _B);
extern __m128d _mm_add_pd(__m128d _A, __m128d _B);
extern __m128d _mm_sub_sd(__m128d _A, __m128d _B);
extern __m128d _mm_sub_pd(__m128d _A, __m128d _B);
extern __m128d _mm_mul_sd(__m128d _A, __m128d _B);
extern __m128d _mm_mul_pd(__m128d _A, __m128d _B);
extern __m128d _mm_sqrt_sd(__m128d _A, __m128d _B);
extern __m128d _mm_sqrt_pd(__m128d _A);
extern __m128d _mm_div_sd(__m128d _A, __m128d _B);
extern __m128d _mm_div_pd(__m128d _A, __m128d _B);
extern __m128d _mm_min_sd(__m128d _A, __m128d _B);
extern __m128d _mm_min_pd(__m128d _A, __m128d _B);
extern __m128d _mm_max_sd(__m128d _A, __m128d _B);
extern __m128d _mm_max_pd(__m128d _A, __m128d _B);/** DP, logicals*/extern __m128d _mm_and_pd(__m128d _A, __m128d _B);
extern __m128d _mm_andnot_pd(__m128d _A, __m128d _B);
extern __m128d _mm_or_pd(__m128d _A, __m128d _B);
extern __m128d _mm_xor_pd(__m128d _A, __m128d _B);/** DP, comparisons*/extern __m128d _mm_cmpeq_sd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpeq_pd(__m128d _A, __m128d _B);
extern __m128d _mm_cmplt_sd(__m128d _A, __m128d _B);
extern __m128d _mm_cmplt_pd(__m128d _A, __m128d _B);
extern __m128d _mm_cmple_sd(__m128d _A, __m128d _B);
extern __m128d _mm_cmple_pd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpgt_sd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpgt_pd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpge_sd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpge_pd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpneq_sd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpneq_pd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpnlt_sd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpnlt_pd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpnle_sd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpnle_pd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpngt_sd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpngt_pd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpnge_sd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpnge_pd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpord_pd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpord_sd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpunord_pd(__m128d _A, __m128d _B);
extern __m128d _mm_cmpunord_sd(__m128d _A, __m128d _B);
extern int _mm_comieq_sd(__m128d _A, __m128d _B);
extern int _mm_comilt_sd(__m128d _A, __m128d _B);
extern int _mm_comile_sd(__m128d _A, __m128d _B);
extern int _mm_comigt_sd(__m128d _A, __m128d _B);
extern int _mm_comige_sd(__m128d _A, __m128d _B);
extern int _mm_comineq_sd(__m128d _A, __m128d _B);
extern int _mm_ucomieq_sd(__m128d _A, __m128d _B);
extern int _mm_ucomilt_sd(__m128d _A, __m128d _B);
extern int _mm_ucomile_sd(__m128d _A, __m128d _B);
extern int _mm_ucomigt_sd(__m128d _A, __m128d _B);
extern int _mm_ucomige_sd(__m128d _A, __m128d _B);
extern int _mm_ucomineq_sd(__m128d _A, __m128d _B);/** DP, converts*/extern __m128d _mm_cvtepi32_pd(__m128i _A);
extern __m128i _mm_cvtpd_epi32(__m128d _A);
extern __m128i _mm_cvttpd_epi32(__m128d _A);
extern __m128 _mm_cvtepi32_ps(__m128i _A);
extern __m128i _mm_cvtps_epi32(__m128 _A);
extern __m128i _mm_cvttps_epi32(__m128 _A);
extern __m128 _mm_cvtpd_ps(__m128d _A);
extern __m128d _mm_cvtps_pd(__m128 _A);
extern __m128 _mm_cvtsd_ss(__m128 _A, __m128d _B);
extern __m128d _mm_cvtss_sd(__m128d _A, __m128 _B);extern int _mm_cvtsd_si32(__m128d _A);
extern int _mm_cvttsd_si32(__m128d _A);
extern __m128d _mm_cvtsi32_sd(__m128d _A, int _B);extern __m64 _mm_cvtpd_pi32(__m128d _A);
extern __m64 _mm_cvttpd_pi32(__m128d _A);
extern __m128d _mm_cvtpi32_pd(__m64 _A);/** DP, misc*/extern __m128d _mm_unpackhi_pd(__m128d _A, __m128d _B);
extern __m128d _mm_unpacklo_pd(__m128d _A, __m128d _B);
extern int _mm_movemask_pd(__m128d _A);
extern __m128d _mm_shuffle_pd(__m128d _A, __m128d _B, int _I);/** DP, loads*/extern __m128d _mm_load_pd(double const*_Dp);
extern __m128d _mm_load1_pd(double const*_Dp);
extern __m128d _mm_loadr_pd(double const*_Dp);
extern __m128d _mm_loadu_pd(double const*_Dp);
extern __m128d _mm_load_sd(double const*_Dp);
extern __m128d _mm_loadh_pd(__m128d _A, double const*_Dp);
extern __m128d _mm_loadl_pd(__m128d _A, double const*_Dp);/** DP, sets*/extern __m128d _mm_set_sd(double _W);
extern __m128d _mm_set1_pd(double _A);
extern __m128d _mm_set_pd(double _Z, double _Y);
extern __m128d _mm_setr_pd(double _Y, double _Z);
extern __m128d _mm_setzero_pd(void);
extern __m128d _mm_move_sd(__m128d _A, __m128d _B);/** DP, stores*/extern void _mm_store_sd(double *_Dp, __m128d _A);
extern void _mm_store1_pd(double *_Dp, __m128d _A);
extern void _mm_store_pd(double *_Dp, __m128d _A);
extern void _mm_storeu_pd(double *_Dp, __m128d _A);
extern void _mm_storer_pd(double *_Dp, __m128d _A);
extern void _mm_storeh_pd(double *_Dp, __m128d _A);
extern void _mm_storel_pd(double *_Dp, __m128d _A);/** Integer, arithmetic*/extern __m128i _mm_add_epi8(__m128i _A, __m128i _B);
extern __m128i _mm_add_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_add_epi32(__m128i _A, __m128i _B);
extern __m64 _mm_add_si64(__m64 _A, __m64 _B);
extern __m128i _mm_add_epi64(__m128i _A, __m128i _B);
extern __m128i _mm_adds_epi8(__m128i _A, __m128i _B);
extern __m128i _mm_adds_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_adds_epu8(__m128i _A, __m128i _B);
extern __m128i _mm_adds_epu16(__m128i _A, __m128i _B);
extern __m128i _mm_avg_epu8(__m128i _A, __m128i _B);
extern __m128i _mm_avg_epu16(__m128i _A, __m128i _B);
extern __m128i _mm_madd_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_max_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_max_epu8(__m128i _A, __m128i _B);
extern __m128i _mm_min_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_min_epu8(__m128i _A, __m128i _B);
extern __m128i _mm_mulhi_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_mulhi_epu16(__m128i _A, __m128i _B);
extern __m128i _mm_mullo_epi16(__m128i _A, __m128i _B);
extern __m64 _mm_mul_su32(__m64 _A, __m64 _B);
extern __m128i _mm_mul_epu32(__m128i _A, __m128i _B);
extern __m128i _mm_sad_epu8(__m128i _A, __m128i _B);
extern __m128i _mm_sub_epi8(__m128i _A, __m128i _B);
extern __m128i _mm_sub_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_sub_epi32(__m128i _A, __m128i _B);
extern __m64 _mm_sub_si64(__m64 _A, __m64 _B);
extern __m128i _mm_sub_epi64(__m128i _A, __m128i _B);
extern __m128i _mm_subs_epi8(__m128i _A, __m128i _B);
extern __m128i _mm_subs_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_subs_epu8(__m128i _A, __m128i _B);
extern __m128i _mm_subs_epu16(__m128i _A, __m128i _B);/** Integer, logicals*/extern __m128i _mm_and_si128(__m128i _A, __m128i _B);
extern __m128i _mm_andnot_si128(__m128i _A, __m128i _B);
extern __m128i _mm_or_si128(__m128i _A, __m128i _B);
extern __m128i _mm_xor_si128(__m128i _A, __m128i _B);/** Integer, shifts*/extern __m128i _mm_slli_si128(__m128i _A, int _Imm);
extern __m128i _mm_slli_epi16(__m128i _A, int _Count);
extern __m128i _mm_sll_epi16(__m128i _A, __m128i _Count);
extern __m128i _mm_slli_epi32(__m128i _A, int _Count);
extern __m128i _mm_sll_epi32(__m128i _A, __m128i _Count);
extern __m128i _mm_slli_epi64(__m128i _A, int _Count);
extern __m128i _mm_sll_epi64(__m128i _A, __m128i _Count);
extern __m128i _mm_srai_epi16(__m128i _A, int _Count);
extern __m128i _mm_sra_epi16(__m128i _A, __m128i _Count);
extern __m128i _mm_srai_epi32(__m128i _A, int _Count);
extern __m128i _mm_sra_epi32(__m128i _A, __m128i _Count);
extern __m128i _mm_srli_si128(__m128i _A, int _Imm);
extern __m128i _mm_srli_epi16(__m128i _A, int _Count);
extern __m128i _mm_srl_epi16(__m128i _A, __m128i _Count);
extern __m128i _mm_srli_epi32(__m128i _A, int _Count);
extern __m128i _mm_srl_epi32(__m128i _A, __m128i _Count);
extern __m128i _mm_srli_epi64(__m128i _A, int _Count);
extern __m128i _mm_srl_epi64(__m128i _A, __m128i _Count);/** Integer, comparisons*/extern __m128i _mm_cmpeq_epi8(__m128i _A, __m128i _B);
extern __m128i _mm_cmpeq_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_cmpeq_epi32(__m128i _A, __m128i _B);
extern __m128i _mm_cmpgt_epi8(__m128i _A, __m128i _B);
extern __m128i _mm_cmpgt_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_cmpgt_epi32(__m128i _A, __m128i _B);
extern __m128i _mm_cmplt_epi8(__m128i _A, __m128i _B);
extern __m128i _mm_cmplt_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_cmplt_epi32(__m128i _A, __m128i _B);/** Integer, converts*/extern __m128i _mm_cvtsi32_si128(int _A);
extern int _mm_cvtsi128_si32(__m128i _A);/** Integer, misc*/extern __m128i _mm_packs_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_packs_epi32(__m128i _A, __m128i _B);
extern __m128i _mm_packus_epi16(__m128i _A, __m128i _B);
extern int _mm_extract_epi16(__m128i _A, int _Imm);
extern __m128i _mm_insert_epi16(__m128i _A, int _B, int _Imm);
extern int _mm_movemask_epi8(__m128i _A);
extern __m128i _mm_shuffle_epi32(__m128i _A, int _Imm);
extern __m128i _mm_shufflehi_epi16(__m128i _A, int _Imm);
extern __m128i _mm_shufflelo_epi16(__m128i _A, int _Imm);
extern __m128i _mm_unpackhi_epi8(__m128i _A, __m128i _B);
extern __m128i _mm_unpackhi_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_unpackhi_epi32(__m128i _A, __m128i _B);
extern __m128i _mm_unpackhi_epi64(__m128i _A, __m128i _B);
extern __m128i _mm_unpacklo_epi8(__m128i _A, __m128i _B);
extern __m128i _mm_unpacklo_epi16(__m128i _A, __m128i _B);
extern __m128i _mm_unpacklo_epi32(__m128i _A, __m128i _B);
extern __m128i _mm_unpacklo_epi64(__m128i _A, __m128i _B);/** Integer, loads*/extern __m128i _mm_load_si128(__m128i const*_P);
extern __m128i _mm_loadu_si128(__m128i const*_P);
extern __m128i _mm_loadl_epi64(__m128i const*_P);/** Integer, sets*/extern __m128i _mm_set_epi64(__m64 _Q1, __m64 _Q0);
extern __m128i _mm_set_epi32(int _I3, int _I2, int _I1, int _I0);
extern __m128i _mm_set_epi16(short _W7, short _W6, short _W5, short _W4,short _W3, short _W2, short _W1, short _W0);
extern __m128i _mm_set_epi8(char _B15, char _B14, char _B13, char _B12,char _B11, char _B10, char _B9, char _B8,char _B7, char _B6, char _B5, char _B4,char _B3, char _B2, char _B1, char _B0);
extern __m128i _mm_set1_epi64(__m64 _Q);
extern __m128i _mm_set1_epi32(int _I);
extern __m128i _mm_set1_epi16(short _W);
extern __m128i _mm_set1_epi8(char _B);
extern __m128i _mm_setl_epi64(__m128i _Q);
extern __m128i _mm_setr_epi64(__m64 _Q0, __m64 _Q1);
extern __m128i _mm_setr_epi32(int _I0, int _I1, int _I2, int _I3);
extern __m128i _mm_setr_epi16(short _W0, short _W1, short _W2, short _W3,short _W4, short _W5, short _W6, short _W7);
extern __m128i _mm_setr_epi8(char _B15, char _B14, char _B13, char _B12,char _B11, char _B10, char _B9, char _B8,char _B7, char _B6, char _B5, char _B4,char _B3, char _B2, char _B1, char _B0);
extern __m128i _mm_setzero_si128(void);/** Integer, stores*/extern void _mm_store_si128(__m128i *_P, __m128i _B);
extern void _mm_storeu_si128(__m128i *_P, __m128i _B);
extern void _mm_storel_epi64(__m128i *_P, __m128i _Q);
extern void _mm_maskmoveu_si128(__m128i _D, __m128i _N, char *_P);/** Integer, moves*/extern __m128i _mm_move_epi64(__m128i _Q);
extern __m128i _mm_movpi64_epi64(__m64 _Q);
extern __m64 _mm_movepi64_pi64(__m128i _Q);/** Cacheability support*/extern void _mm_stream_pd(double *_Dp, __m128d _A);
extern void _mm_stream_si128(__m128i *_P, __m128i _A);
extern void _mm_clflush(void const*_P);
extern void _mm_lfence(void);
extern void _mm_mfence(void);
extern void _mm_stream_si32(int *_P, int _I);
extern void _mm_pause(void);/** New convert to float*/extern double _mm_cvtsd_f64(__m128d _A);/** Support for casting between various SP, DP, INT vector types.* Note that these do no conversion of values, they just change* the type.*/extern __m128  _mm_castpd_ps(__m128d);
extern __m128i _mm_castpd_si128(__m128d);
extern __m128d _mm_castps_pd(__m128);
extern __m128i _mm_castps_si128(__m128);
extern __m128  _mm_castsi128_ps(__m128i);
extern __m128d _mm_castsi128_pd(__m128i);

/*
 * Support for 64-bit extension intrinsics
 */
/* The directive must start its own line to be recognised by the preprocessor,
   and the conditional must be closed before the listing ends. */
#if defined (_M_X64)
extern __int64 _mm_cvtsd_si64(__m128d);
extern __int64 _mm_cvttsd_si64(__m128d);
extern __m128d _mm_cvtsi64_sd(__m128d, __int64);
extern __m128i _mm_cvtsi64_si128(__int64);
extern __int64 _mm_cvtsi128_si64(__m128i);
#endif  /* defined (_M_X64) */

>>>pmmintrin.h头文件中的函数接口

/** New Single precision vector instructions.*/extern __m128 _mm_addsub_ps(__m128 /* a */, __m128 /* b */);
extern __m128 _mm_hadd_ps(__m128 /* a */, __m128 /* b */);
extern __m128 _mm_hsub_ps(__m128 /* a */, __m128 /* b */);
extern __m128 _mm_movehdup_ps(__m128 /* a */);
extern __m128 _mm_moveldup_ps(__m128 /* a */);/** New double precision vector instructions.*/extern __m128d _mm_addsub_pd(__m128d /* a */, __m128d /* b */);
extern __m128d _mm_hadd_pd(__m128d /* a */, __m128d /* b */);
extern __m128d _mm_hsub_pd(__m128d /* a */, __m128d /* b */);
extern __m128d _mm_loaddup_pd(double const * /* dp */);
extern __m128d _mm_movedup_pd(__m128d /* a */);/** New unaligned integer vector load instruction.*/
extern __m128i _mm_lddqu_si128(__m128i const * /* p */);/** Miscellaneous new instructions.*/
/** For _mm_monitor p goes in eax, extensions goes in ecx, hints goes in edx.*/
extern void _mm_monitor(void const * /* p */, unsigned /* extensions */, unsigned /* hints */);/** For _mm_mwait, extensions goes in ecx, hints goes in eax.*/
extern void _mm_mwait(unsigned /* extensions */, unsigned /* hints */);

>>>tmmintrin.h头文件中的函数接口

    // Horizontal Add: add pairs of adjacent words or double words.// Each field in the result is the sum of two adjacent fields// from the arguments, with the lower result fields coming from// the first argument and the upper result fields coming from// the second argument. The "hadds" forms saturate the signed// addition rather than wrapping.extern __m128i _mm_hadd_epi16 (__m128i, __m128i);extern __m128i _mm_hadd_epi32 (__m128i, __m128i);extern __m128i _mm_hadds_epi16 (__m128i, __m128i);extern __m64 _mm_hadd_pi16 (__m64, __m64);extern __m64 _mm_hadd_pi32 (__m64, __m64);extern __m64 _mm_hadds_pi16 (__m64, __m64);// Horizontal Subtract: subtract pairs of adjacent words or double// words. Each field in the result is the difference of two adjacent// fields from the arguments, where the upper field is subtracted// from the lower field. The lower result fields come from// the first argument and the upper result fields come from// the second argument. The "hsubs" forms saturate the signed// subtraction rather than wrapping.extern __m128i _mm_hsub_epi16 (__m128i, __m128i);extern __m128i _mm_hsub_epi32 (__m128i, __m128i);extern __m128i _mm_hsubs_epi16 (__m128i, __m128i);extern __m64 _mm_hsub_pi16 (__m64, __m64);extern __m64 _mm_hsub_pi32 (__m64, __m64);extern __m64 _mm_hsubs_pi16 (__m64, __m64);// Multiply unsigned bytes by signed bytes and sum the word// results in pairs with saturation. Each byte of the first// argument is zero-extended to a word field and each byte// of the second argument is sign-extended to a word field,// then each pair of words is multiplied together to give// signed word intermediate results. 
Pairs of words from// that result are added horizontally with saturation// to give the final result.extern __m128i _mm_maddubs_epi16 (__m128i, __m128i);extern __m64 _mm_maddubs_pi16 (__m64, __m64);// Packed multiply high integers with round and scaling,// {X,}MM2/m{128,64} (b) to {X,}MM1 (a).extern __m128i _mm_mulhrs_epi16 (__m128i, __m128i);extern __m64 _mm_mulhrs_pi16 (__m64, __m64);// Packed shuffle bytes// {X,}MM2/m{128,64} (b) by {X,}MM1 (a).extern __m128i _mm_shuffle_epi8 (__m128i, __m128i);extern __m64 _mm_shuffle_pi8 (__m64, __m64);// Packed byte, word, double word sign, {X,}MM2/m{128,64} (b) to// {X,}MM1 (a).extern __m128i _mm_sign_epi8 (__m128i, __m128i);extern __m128i _mm_sign_epi16 (__m128i, __m128i);extern __m128i _mm_sign_epi32 (__m128i, __m128i);extern __m64 _mm_sign_pi8 (__m64, __m64);extern __m64 _mm_sign_pi16 (__m64, __m64);extern __m64 _mm_sign_pi32 (__m64, __m64);// Packed align and shift right by n*8 bits,// {X,}MM2/m{128,64} (b) to {X,}MM1 (a).extern __m128i _mm_alignr_epi8 (__m128i, __m128i, int);extern __m64 _mm_alignr_pi8 (__m64, __m64, int);// Packed byte, word, double word absolute value,// {X,}MM2/m{128,64} (b) to {X,}MM1 (a).extern __m128i _mm_abs_epi8 (__m128i);extern __m128i _mm_abs_epi16 (__m128i);extern __m128i _mm_abs_epi32 (__m128i);extern __m64 _mm_abs_pi8 (__m64);extern __m64 _mm_abs_pi16 (__m64);extern __m64 _mm_abs_pi32 (__m64);

>>>smmintrin.h头文件中的函数接口

        // Integer blend instructions - select data from 2 sources// using constant or variable maskextern __m128i _mm_blend_epi16 (__m128i, __m128i, const int /* mask */);extern __m128i _mm_blendv_epi8 (__m128i, __m128i, __m128i mask);// Float single precision blend instructions - select data// from 2 sources using constant/variable maskextern __m128  _mm_blend_ps (__m128, __m128, const int /* mask */);extern __m128  _mm_blendv_ps(__m128, __m128, __m128 /* mask */);// Float double precision blend instructions - select data// from 2 sources using constant/variable maskextern __m128d _mm_blend_pd (__m128d, __m128d, const int /* mask */);extern __m128d _mm_blendv_pd(__m128d, __m128d, __m128d /* mask */);// Dot product instructions with mask-defined summing and zeroing// of result's partsextern __m128  _mm_dp_ps(__m128, __m128, const int /* mask */);extern __m128d _mm_dp_pd(__m128d, __m128d, const int /* mask */);// Packed integer 64-bit comparison, zeroing or filling with ones// corresponding parts of resultextern __m128i _mm_cmpeq_epi64(__m128i, __m128i);// Min/max packed integer instructionsextern __m128i _mm_min_epi8 (__m128i, __m128i);extern __m128i _mm_max_epi8 (__m128i, __m128i);extern __m128i _mm_min_epu16(__m128i, __m128i);extern __m128i _mm_max_epu16(__m128i, __m128i);extern __m128i _mm_min_epi32(__m128i, __m128i);extern __m128i _mm_max_epi32(__m128i, __m128i);extern __m128i _mm_min_epu32(__m128i, __m128i);extern __m128i _mm_max_epu32(__m128i, __m128i);// Packed integer 32-bit multiplication with truncation// of upper halves of resultsextern __m128i _mm_mullo_epi32(__m128i, __m128i);// Packed integer 32-bit multiplication of 2 pairs of operands// producing two 64-bit resultsextern __m128i _mm_mul_epi32(__m128i, __m128i);// Packed integer 128-bit bitwise comparison.// return 1 if (val 'and' mask) == 0extern int _mm_testz_si128(__m128i /* mask */, __m128i /* val */);// Packed integer 128-bit bitwise comparison.// return 1 if (val 'and_not' mask) == 0extern int 
_mm_testc_si128(__m128i /* mask */, __m128i /* val */);// Packed integer 128-bit bitwise comparison// ZF = ((val 'and' mask) == 0)  CF = ((val 'and_not' mask) == 0)// return 1 if both ZF and CF are 0extern int _mm_testnzc_si128(__m128i /* mask */, __m128i /* val */);// Insert single precision float into packed single precision// array element selected by index.// The bits [7-6] of the 3d parameter define src index,// the bits [5-4] define dst index, and bits [3-0] define zeroing// mask for dstextern __m128 _mm_insert_ps(__m128 /* dst */, __m128 /* src */, const int /* index */);// Helper macro to create index-parameter value for _mm_insert_ps#define _MM_MK_INSERTPS_NDX(srcField, dstField, zeroMask) \(((srcField)<<6) | ((dstField)<<4) | (zeroMask))// Extract binary representation of single precision float from// packed single precision array element selected by indexextern int _mm_extract_ps(__m128 /* src */, const int /* index */);// Extract single precision float from packed single precision// array element selected by index into dest#define _MM_EXTRACT_FLOAT(dest, src, ndx) \*((int*)&(dest)) = _mm_extract_ps((src), (ndx))// Extract specified single precision float element// into the lower part of __m128#define _MM_PICK_OUT_PS(src, num) \_mm_insert_ps(_mm_setzero_ps(), (src), \_MM_MK_INSERTPS_NDX((num), 0, 0x0e))// Insert integer into packed integer array element// selected by indexextern __m128i _mm_insert_epi8 (__m128i /* dst */, int /* src */, const int /* index */);extern __m128i _mm_insert_epi32(__m128i /* dst */, int /* src */, const int /* index */);#if defined (_M_X64)extern __m128i _mm_insert_epi64(__m128i /* dst */, __int64 /* src */, const int /* index */);
#endif  /* defined (_M_X64) */// Extract integer from packed integer array element// selected by indexextern int   _mm_extract_epi8 (__m128i /* src */, const int /* index */);extern int   _mm_extract_epi32(__m128i /* src */, const int /* index */);#if defined (_M_X64)extern __int64 _mm_extract_epi64(__m128i /* src */, const int /* index */);
#endif  /* defined (_M_X64) */// Horizontal packed word minimum and its index in// result[15:0] and result[18:16] respectivelyextern __m128i _mm_minpos_epu16(__m128i);// Packed/single float double precision roundingextern __m128d _mm_round_pd(__m128d /* val */, int /* iRoundMode */);extern __m128d _mm_round_sd(__m128d /* dst */, __m128d /* val */, int /* iRoundMode */);// Packed/single float single precision roundingextern __m128  _mm_round_ps(__m128  /* val */, int /* iRoundMode */);extern __m128  _mm_round_ss(__m128 /* dst */, __m128  /* val */, int /* iRoundMode */);// Packed integer sign-extensionextern __m128i _mm_cvtepi8_epi32 (__m128i);extern __m128i _mm_cvtepi16_epi32(__m128i);extern __m128i _mm_cvtepi8_epi64 (__m128i);extern __m128i _mm_cvtepi32_epi64(__m128i);extern __m128i _mm_cvtepi16_epi64(__m128i);extern __m128i _mm_cvtepi8_epi16 (__m128i);// Packed integer zero-extensionextern __m128i _mm_cvtepu8_epi32 (__m128i);extern __m128i _mm_cvtepu16_epi32(__m128i);extern __m128i _mm_cvtepu8_epi64 (__m128i);extern __m128i _mm_cvtepu32_epi64(__m128i);extern __m128i _mm_cvtepu16_epi64(__m128i);extern __m128i _mm_cvtepu8_epi16 (__m128i);// Pack 8 double words from 2 operands into 8 words of result// with unsigned saturationextern __m128i _mm_packus_epi32(__m128i, __m128i);// Sum absolute 8-bit integer difference of adjacent groups of 4 byte// integers in operands. Starting offsets within operands are// determined by maskextern __m128i _mm_mpsadbw_epu8(__m128i /* s1 */, __m128i /* s2 */, const int /* mask */);/** Load double quadword using non-temporal aligned hint*/extern __m128i _mm_stream_load_si128(__m128i*);

>>>nmmintrin.h头文件中的函数接口

/** Intrinsics for text/string processing.*/extern __m128i _mm_cmpistrm (__m128i /* a */, __m128i /* b */, const int /* mode */);extern int     _mm_cmpistri (__m128i /* a */, __m128i /* b */, const int /* mode */);extern __m128i _mm_cmpestrm (__m128i /* a */, int /* la */, __m128i /* b */, int /* lb */, const int /* mode */);extern int     _mm_cmpestri (__m128i /* a */, int /* la */, __m128i /* b */, int /* lb */, const int /* mode */);/** Intrinsics for text/string processing and reading values of EFlags.*/extern int     _mm_cmpistrz (__m128i /* a */, __m128i /* b */, const int /* mode */);extern int     _mm_cmpistrc (__m128i /* a */, __m128i /* b */, const int /* mode */);extern int     _mm_cmpistrs (__m128i /* a */, __m128i /* b */, const int /* mode */);extern int     _mm_cmpistro (__m128i /* a */, __m128i /* b */, const int /* mode */);extern int     _mm_cmpistra (__m128i /* a */, __m128i /* b */, const int /* mode */);extern int     _mm_cmpestrz (__m128i /* a */, int /* la */, __m128i /* b */, int /* lb */, const int /* mode */);extern int     _mm_cmpestrc (__m128i /* a */, int /* la */, __m128i /* b */, int /* lb */, const int /* mode */);extern int     _mm_cmpestrs (__m128i /* a */, int /* la */, __m128i /* b */, int /* lb */, const int /* mode */);extern int     _mm_cmpestro (__m128i /* a */, int /* la */, __m128i /* b */, int /* lb */, const int /* mode */);extern int     _mm_cmpestra (__m128i /* a */, int /* la */, __m128i /* b */, int /* lb */, const int /* mode */);/** Packed integer 64-bit comparison, zeroing or filling with ones* corresponding parts of result*/extern __m128i _mm_cmpgt_epi64(__m128i /* val1 */, __m128i /* val2 */);/** Calculate a number of bits set to 1*/extern int _mm_popcnt_u32(unsigned int /* v */);#if defined (_M_X64)extern __int64 _mm_popcnt_u64(unsigned __int64 /* v */);
#endif  /* defined (_M_X64) *//** Accumulate CRC32 (polynomial 0x11EDC6F41) value*/extern unsigned int _mm_crc32_u8 (unsigned int /* crc */, unsigned char /* v */);extern unsigned int _mm_crc32_u16(unsigned int /* crc */, unsigned short /* v */);extern unsigned int _mm_crc32_u32(unsigned int /* crc */, unsigned int /* v */);#if defined (_M_X64)extern unsigned __int64 _mm_crc32_u64(unsigned __int64 /* crc */, unsigned __int64 /* v */);
#endif  /* defined (_M_X64) */

4.实际使用

由于上面头文件中的函数接口比较多,所以仅需要大致了解一下,在用到的时候则需要知道具体函数怎么调用以及功能。这里先来简单介绍一下SSE的运算指令。

SSE指令繁多,以上只是有代表性地列出一些,以便对其运算指令有一个概览。以上出现的指令名称在SSE指令集中都对应着具体的函数或函数集,如ADDPS对应的函数为_mm_add_ps,CMPSS对应的函数集为_mm_cmpeq_ss、_mm_cmpge_ss、_mm_cmpgt_ss、_mm_cmple_ss等。这里介绍一下SIMD指令集函数的命名规则:SIMD指令集的数据类型主要有__m128、__m128i、__m128d、__m256、__m256i、__m256d等等,具体参考上面,默认为单精度(d表示双精度,i表示整型),其函数的命名大致分为3个部分,3个部分之间由“ _ ”隔开,含义如下:

  • 第一部分为_mm或_mm256。_mm表示其为SSE指令,操作的向量长度为64位或128位。_mm256表示AVX指令,操作的向量长度为256位。
  • 第二部分为操作函数名称,如_add、_load、_mul等,一些函数操作会增加修饰符,如loadu表示不对齐到向量长度的存储器访问。
  • 第三部分为操作的对象名及数据类型,_ps表示操作向量中所有的单精度数据;_pd表示操作向量中所有的双精度数据;_pixx表示操作向量中所有的xx位的有符号整型数据,向量寄存器长度为64位;_epixx表示操作向量中所有的xx位的有符号整型数据,向量寄存器长度为128位;_epuxx表示操作向量中所有的xx位的无符号整型数据,向量寄存器长度为128位;_ss表示只操作向量中第一个单精度数据;si128表示操作向量寄存器中的第一个128位有符号整型。

3个部分组合起来,就形成了一条向量函数,如_mm_add_ps表示使用128位向量寄存器执行单精度浮点加法运算。在实际应用过程中,编程人员需要针对特定的算法,灵活地对不同的指令进行组合,用指令集运算代替一般函数运算,达到指令级并行优化的效果。一般来说,使用指令集必须经过三个步骤:

  1. 将运算数据从内存迁移至寄存器上。这一步用到的是SSE的数据迁移指令,主要目的是将位于内存的多个元数据排列后迁移至寄存器上,以便使用指令集在寄存器上进行向量化运算,比如灰度图像一个像素占8位空间,则可以将16个像素按次序排列后迁移至128位寄存器上,执行向量化数学运算(对8位整型像素可用整型加运算PADDB、减运算PSUBB等;加运算ADDPS、减运算SUBPS、乘运算MULPS则用于单精度浮点数据,一次处理4个),如此同时处理16个像素,大大提高运算效率。
  2. 运算。即调用SSE繁多的数学运算指令对寄存器上的数据进行向量化运算,因为是多个数据同时运算,所以可获得数倍的加速比。
  3. 将运算结果从寄存器迁移至内存。向量化运算完成后,将结果从寄存器传输至内存。

在优化之前,有一个非常关键的要求需要注意:使用SSE优化算法的输入数据,一般要求是16字节对齐的。这是由于SSE的对齐访问指令(如_mm_load_si128、_mm_store_si128)对数据进行内存与寄存器之间转换的时候,对不满足16字节对齐的地址将导致异常,使程序终止运行(_mm_loadu_si128等非对齐版本不受此限制)。因此在优化之前,所有的输入数据将被强制进行16字节对齐。

具体查找函数以及其功能参照此网站http://kntan.top/#!=undefined
这里把最上面说的自己阅读代码时遇到的函数接口做一个简单介绍:

// Sets the 16 signed 8-bit integer values to b.
//用提供的值设置dst中的压缩8位整数。
//将16个有符号8位整数值设置为b。
__m128i _mm_set1_epi8(char b)
//将128位(由4个压缩的32位整数组成)从内存加载到dst。
//mem_addr必须在16字节边界上对齐,否则可能会生成一般保护异常。
__m128i _mm_load_epi32(void const* mem_addr)
//将128位(由2个压缩的64位整数组成)从内存加载到dst。
__m128i _mm_load_epi64(void const* mem_addr)
//将 128 位整数数据从内存加载到 dst。
//mem_addr 不需要在任何特定边界上对齐。
__m128i _mm_loadu_si128 (__m128i *p);
//将未对齐的 16 位整数从内存加载到 dst 的第一个元素中。
__m128i _mm_loadu_si16 (void const* mem_addr)
//将未对齐的 32 位整数从内存加载到 dst 的第一个元素中。
__m128i _mm_loadu_si32 (void const* mem_addr)
//将未对齐的 64 位整数从内存加载到 dst 的第一个元素中。
__m128i _mm_loadu_si64 (void const* mem_addr)
//将 128 位整数数据从 a 存储到p所指定的变量中去。
//p 必须在 16 字节边界上对齐,否则可能会生成一般保护异常。
void _mm_store_si128 ( __m128i *p, __m128i a)
//将a和b的低64位数以8位为单位进行交错;
__m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
//将a和b的低64位数以16位为单位进行交错;
__m128i _mm_unpacklo_epi16 (__m128i a, __m128i b)
//将a和b的低64位数以32位为单位进行交错;
__m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
//将a和b的低64位数以64位为单位进行交错;
__m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
//将a和b中的压缩16位整数相加,并将结果存储在dst中。
__m128i _mm_add_epi16 (__m128i a, __m128i b);
//将a和b中的压缩32位整数相加,并将结果存储在dst中。
__m128i _mm_add_epi32 (__m128i a, __m128i b);
//将a和b中的压缩64位整数相加,并将结果存储在dst中。
__m128i _mm_add_epi64 (__m128i a, __m128i b);
//将a和b中的压缩8位整数相加,并将结果存储在dst中。
__m128i _mm_add_epi8 (__m128i a, __m128i b);
//将 a 右移 imm8 个字节,同时移入零,并将结果存储在 dst 中。
__m128i _mm_srli_si128 (__m128i a, int imm8)
//将64位整数a右移imm8位,同时移入零,并将结果存储在dst中。
__m64 _mm_srli_si64 (__m64 a, int imm8)
 //可用于计算汉明距离
//对无符号16位整数a中设置为1的位数进行计数,并在dst中返回该计数。
__m128i _mm_popcnt_epi16 (__m128i a)
//对无符号32位整数a中设置为1的位数进行计数,并在dst中返回该计数。
__m128i _mm_popcnt_epi32 (__m128i a)
//对无符号64位整数a中设置为1的位数进行计数,并在dst中返回该计数。
__m128i _mm_popcnt_epi64 (__m128i a)
//对无符号8位整数a中设置为1的位数进行计数,并在dst中返回该计数。
__m128i _mm_popcnt_epi8 (__m128i a)

SIMD指令集分析(C/C++)相关推荐

  1. OpenCV算法加速(2)使用SIMD指令集(MMX、SSE、AVX)和MIPP实现视觉算法优化

    一.概述 很多人觉得OpenCV速度比较慢,其实提升OpenCV运行速度,最常见的就是重新编译OpenCV,添加各种指令集优化支持. SIMD(Single Instruction Multiple ...

  2. SIMD指令集——一条指令操作多个数,SSE,AVX都是,例如:乘累加,Shuffle等

    SIMD指令集 from:https://zhuanlan.zhihu.com/p/31271788 SIMD,即Single Instruction, Multiple Data,一条指令操作多个数 ...

  3. 一文读懂SIMD指令集 目前最全SSE/AVX介绍

    SIMD指令集 SSE/AVX 概述 参考手册 Intel® Intrinsics Guide Tommesani.com Docs Intel® 64 and IA-32 Architectures ...

  4. MMX、SSE、AVX等SIMD指令集说明

    发展历程 SIMD指令集头文件对应查找表 头文件    指令集描述 intrin.h    All Architectures mmintrin.h    MMX xmmintrin.h    SSE ...

  5. ARM SIMD 指令集:NEON 简介

    ARM SIMD 指令集:NEON 简介 一.NEON 简介 1.1.NEON 简介 1.2.NEON 使用方式 1.3.编译器自动向量化的编译选项 1.3.1 Arm Compiler 中使能自动向 ...

  6. Intel 64/x86_64/IA-32/x86处理器 - SIMD指令集 - SSE扩展(9) - 64位整型指令(MMX指令集扩展)

    SSE 64-Bit SIMD Integer Instructions SSE扩展增加了几条64位组合的整型指令,这些指令操作MMX寄存器和64位的存储器操作数,这些指令可以看作是对MMX指令集的扩 ...

  7. Intel 64/x86_64/IA-32/x86处理器 - SIMD指令集 - SSE扩展(4) - 数据传输指令

    SSE Instruction Set SSE指令集大致可以分为4个功能组: 组合的与标量的单精度浮点指令 数据传输指令 算术指令 逻辑指令 比较指令 混洗shuffle指令 转换指令 64位SIMD ...

  8. Intel 64/x86_64/IA-32/x86处理器 - SIMD指令集 - SSE扩展(1) - 概述/历史/新数据类型/XMM寄存器组

    SSE Instructions SSE Overview & History Intel SSE技术的全称是Streaming SIMD Extension,中文译作流式单指令多数据指令扩展 ...

  9. Intel 64/x86_64/IA-32/x86处理器 - SIMD指令集 - MMX技术(1) - 概述 传输指令

    MMX™ Instructions IA-32架构引入了4个指令集扩展,使得IA-32处理器可以执行单指令多数据SIMD操作.这些扩展包括MMX技术,SSE扩展,SSE2扩展,SSE3扩展. MMX指 ...

最新文章

  1. 如何改变Redis用不好的误区
  2. TEASOFT 关于远程WEICHAT命令
  3. Microsoft Azure Site Recovery (1) 安装VMM服务器代理
  4. C语言 找数码是否存在
  5. Graph Destruction 并查集,图论(500)
  6. python3精要(61)-线性回归
  7. win32汇编寄存器汇总
  8. P3246 [HNOI2016]序列(莫队+单调栈+ST表)
  9. html中左侧播放器插件,简洁实用的html5音乐播放器插件
  10. 【转】重装系统后找不到硬盘
  11. 25.212---复用和信道编码
  12. c 语言编程游戏代码大全,C语言编程游戏代码
  13. AutoCAD2012从入门到精通中文视频教程 第46课 三维实体及创建实体命令1(个人收藏)
  14. 在Ubuntu20.04上安装Nsight Systems
  15. cad转shp 奥维_如何将地图数据及奥维对象导出到CAD
  16. 域——windows服务器域详解
  17. dw可以编辑java吗_用dreamweaver开发ASP图文教程。(修改资料篇)
  18. 512内存安装php7,《滴水石穿-php》虚拟机中安装php7内存错误
  19. fofa批量验证poc脚本笔记
  20. waf指纹识别工具WAFW00F的使用

热门文章

  1. php 原生 excel,关于原生php实现excel文件读写的方法
  2. 【转】寻找最好的笔记软件:海选篇 (v1.0)
  3. 驱动器使用 —— DS402状态切换(个人笔记)
  4. dnspod每步nat123及3322动态域名同类端口映射域名解析软件对比
  5. 2011年10月最新电脑城GhostXPSP3流金岁月装机版V11.10 舞
  6. 智慧城管数据普查与综合数据建库
  7. java_导出_word_[转载]java导出word的5种方式
  8. PGL 斯德哥尔摩Major相关物品介绍(通行证、印花等)
  9. 根据String类型日期算年龄
  10. Java中常用关键字总结(足以应对无聊面试官)