编译分析：提memcmp.c+gcc四十米大刀到C#世界中屠龙

在stackoverflow上有一个关于在C#中如何最快地比较两个byte数组的讨论：https://stackoverflow.com/questions/43289/comparing-two-byte-arrays-in-net
据讨论的网友统计，最快的方法是通过[DllImport]调用C运行库中的memcmp。下文会针对memcmp.c源码写一个C#版本进行比较测试，并进行编译分析。

在Unity内使用memcmp与同版本的C#方法进行测试比较

测试脚本：

using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using System.Runtime.InteropServices;

/// <summary>
/// Micro-benchmark that compares two equal byte arrays twice: once with a
/// hand-written C# loop and once with the C runtime's memcmp (via P/Invoke),
/// printing each result followed by the elapsed milliseconds.
/// </summary>
public class TestScript2 : MonoBehaviour {

    // Number of bytes compared in each run.
    const int Length = 10000000;

    // Use this for initialization
    void Start () {
        byte[] b1 = new byte[Length];
        byte[] b2 = new byte[Length];

        // Managed loop. Stopwatch is used instead of Environment.TickCount
        // because TickCount only advances every ~10-16 ms on typical systems,
        // which is too coarse for a call that finishes in a few milliseconds.
        FillIdentical(b1, b2);
        System.Diagnostics.Stopwatch watch = System.Diagnostics.Stopwatch.StartNew();
        int r = ByteArrayCompare(b1, b2, Length);
        watch.Stop();
        print(r);
        print(watch.ElapsedMilliseconds);

        // Native memcmp over freshly generated (still identical) data.
        FillIdentical(b1, b2);
        watch = System.Diagnostics.Stopwatch.StartNew();
        int r2 = memcmp(b1, b2, new System.UIntPtr((uint)Length));
        watch.Stop();
        print(r2);
        print(watch.ElapsedMilliseconds);
    }

    /// <summary>
    /// Fills both arrays with the same random bytes in [1, 63], so that a
    /// comparison has to scan the whole buffer before it can return 0.
    /// </summary>
    static void FillIdentical(byte[] a1, byte[] a2) {
        for (int i = 0; i < a1.Length; i++) {
            byte t = (byte)UnityEngine.Random.Range(1, 64);
            a1[i] = t;
            a2[i] = t;
        }
    }

    /// <summary>
    /// C# counterpart of memcmp: compares the first n bytes of a1 and a2.
    /// </summary>
    /// <returns>
    /// 0 when the first n bytes are equal, otherwise the difference
    /// a1[i] - a2[i] at the first mismatching index.
    /// </returns>
    int ByteArrayCompare(byte[] a1, byte[] a2, int n) {
        for (int i = 0; i < n; i++)
            if (a1[i] != a2[i])
                return a1[i] - a2[i];
        return 0;
    }

    // size_t is pointer-sized, so it is marshalled as UIntPtr; declaring it as
    // long passes an argument of the wrong width on 32-bit runtimes.
    // NOTE(review): msvcrt.dll exists only on Windows; other platforms need a
    // different library name.
    [DllImport("msvcrt.dll", CallingConvention = CallingConvention.Cdecl)]
    static extern int memcmp(byte[] b1, byte[] b2, System.UIntPtr count);
}

memcmp.c 源码：

/*
 * memcmp.c --
 *
 *    Source code for the "memcmp" library routine.
 *
 * Copyright (c) 1998 Sun Microsystems, Inc.
 *
 * See the file "license.terms" for information on usage and redistribution
 * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
 *
 * SCCS: @(#) memcmp.c 1.2 98/01/19 10:48:58
 */

#include "tcl.h"
#include "tclPort.h"

/*
 * Here is the prototype just in case it is not included
 * in tclPort.h.
 */

int     memcmp _ANSI_ARGS_((CONST VOID *s1,
                            CONST VOID *s2, size_t n));

/*
 *----------------------------------------------------------------------
 *
 * memcmp --
 *
 *   Compares two bytes sequences.
 *
 * Results:
 *     compares  its  arguments, looking at the first n
 *     bytes (each interpreted as an unsigned char), and  returns
 *     an integer less than, equal to, or greater than 0, accord-
 *     ing as s1 is less  than,  equal  to,  or
 *     greater than s2 when taken to be unsigned 8 bit numbers.
 *
 * Side effects:
 *     None.
 *
 *----------------------------------------------------------------------
 */

int
memcmp(s1, s2, n)
    CONST VOID *s1;         /* First string. */
    CONST VOID *s2;         /* Second string. */
    size_t      n;          /* Length to compare. */
{
    /*
     * Walk the buffers through unsigned char pointers: incrementing a
     * void pointer (s1++, s2++) is a GNU extension, not valid ISO C.
     */
    CONST unsigned char *p1 = (CONST unsigned char *) s1;
    CONST unsigned char *p2 = (CONST unsigned char *) s2;

    for ( ; n-- ; p1++, p2++) {
        if (*p1 != *p2) {
            /* Both bytes promote to int, so the difference keeps its sign. */
            return (*p1 - *p2);
        }
    }
    return 0;
}

一千万个byte比较，测试结果：
C#：120毫秒
C: 2毫秒

汇编代码分析

C#版本
mcs打包出.net的exe文件，mono再编译出的目标文件（AOT编译最高优化版本，不确定与实际运行的JIT版本完全相同），otool反汇编如下：

0000000000000a40 subq    $0x28, %rsp             // reserve 0x28 bytes of stack
0000000000000a44    movq    %r12, (%rsp)            // save r12-r15 (restored at 0xaba-0xac8)
0000000000000a48    movq    %r13, 0x8(%rsp)
0000000000000a4d    movq    %r14, 0x10(%rsp)
0000000000000a52    movq    %r15, 0x18(%rsp)
0000000000000a57    movq    %rdi, %r13              // r13 = 1st argument, indexed below as array a
0000000000000a5a    movq    %rsi, %r14              // r14 = 2nd argument, indexed below as array b
0000000000000a5d    movq    %rdx, %r15              // r15 = 3rd argument, compared against i at 0xab3 (n)
0000000000000a60    xorl    %r12d, %r12d            // i = 0
0000000000000a63    jmp 0xab3                       // enter the loop at its condition check (0xab3)
0000000000000a68    movslq  %r12d, %rax             // loop body: rax = sign-extended i
0000000000000a6b    cmpl    %eax, 0x18(%r13)        // compare the dword at a+0x18 with i (presumably the array length field; confirm against Mono's array layout)
0000000000000a6f    jbe 0xae1                       // bounds check on a: if that value <= i (unsigned), take the failure path at 0xae1
0000000000000a75    leaq    0x20(%r13,%rax), %rax   // rax = address of a[i] (a + 0x20 + i); address computation, not an increment
0000000000000a7a    movzbl  (%rax), %eax            // load byte a[i], zero-extended
0000000000000a7d    movslq  %r12d, %rcx             // rcx = sign-extended i
0000000000000a80    cmpl    %ecx, 0x18(%r14)        // same comparison for array b
0000000000000a84    jbe 0xad2                       // bounds check on b: failure path at 0xad2
0000000000000a8a    leaq    0x20(%r14,%rcx), %rcx   // rcx = address of b[i] (b + 0x20 + i)
0000000000000a8f    movzbl  (%rcx), %ecx            // load byte b[i], zero-extended
0000000000000a92    cmpl    %ecx, %eax              // compare the two bytes
0000000000000a94    je  0xab0                       // equal: jump forward to the increment at 0xab0
0000000000000a96    movslq  %r12d, %rax             // reached only when a mismatching pair was found
0000000000000a99    leaq    0x20(%r13,%rax), %rax   // both bytes are reloaded here instead of reusing eax/ecx (no bounds check this time)
0000000000000a9e    movzbl  (%rax), %eax
0000000000000aa1    movslq  %r12d, %rcx
0000000000000aa4    leaq    0x20(%r14,%rcx), %rcx
0000000000000aa9    movzbl  (%rcx), %ecx
0000000000000aac    subl    %ecx, %eax              // eax = a[i] - b[i], left in eax as the return value
0000000000000aae    jmp 0xaba                       // leave the loop, go to the epilogue
0000000000000ab0    incl    %r12d                   // i++
0000000000000ab3    cmpl    %r15d, %r12d            // compare i with n
0000000000000ab6    jl  0xa68                       // loop back to the body while i < n (signed)
0000000000000ab8    xorl    %eax, %eax              // loop finished without a mismatch: return 0
0000000000000aba    movq    (%rsp), %r12            // epilogue: restore r12-r15
0000000000000abe    movq    0x8(%rsp), %r13
0000000000000ac3    movq    0x10(%rsp), %r14
0000000000000ac8    movq    0x18(%rsp), %r15
0000000000000acd    addq    $0x28, %rsp             // release the stack frame
0000000000000ad1    retq
0000000000000ad2    movl    $0x5d, %esi             // failure path of the b bounds check (target of 0xa84)
0000000000000ad7    movl    $0x177, %edi
0000000000000adc    callq   0xb2e                   // callee not shown here; presumably raises the index-out-of-range exception
0000000000000ae1    movl    $0x72, %esi             // failure path of the a bounds check (target of 0xa6f)
0000000000000ae6    jmp 0xad7                       // shares the call sequence at 0xad7

主要关注循环内的指令。循环里面有6条数组边界检查指令，会影响循环的速度，最主要的副作用是夹在了两个数组的访存取值中间，目测会对CPU的智商产生考验（影响并行访存取值的判断），如果改为边界检查a->边界检查b->取a byte->取b byte可能会好一点。0000000000000a94处的je是一个大部分情况下都会符合条件的case，在循环中向前跳转非常不好，很容易出现分支预测错误。0000000000000a96处找到了两个不相等的数后重复了一遍访存取值，不确定原因，但是不太影响性能，因为只会运行一次。按index计算元素地址后访存取值这两行也有点问题：

0000000000000a75 leaq    0x20(%r13,%rax), %rax   //数组a index递增
0000000000000a7a    movzbl  (%rax), %eax            //访存，取出一个a数组的byte

movzbl必须等leaq把地址写入%rax之后才能执行，%rax造成了数据相关；而且leaq这行每次迭代都要从i重新计算元素地址（基址+0x20+i），相对于直接把指针递增+1这个操作来讲也有点复杂。总体来说数据相关太多，有难预测的跳转，某些细节有些多余指令。

C版本
memcmp.c gcc优化编译后(优化级别忘了)的汇编代码版本：

00000000000000b0 pushq   %rbp                            // prologue: save the caller's frame pointer
00000000000000b1    movq    %rsp, %rbp
00000000000000b4    jmp 0xc9                                // enter the loop at the n test (0xc9)
00000000000000b6    nopw    %cs:_main(%rax,%rax)            // padding no-op (presumably aligns the loop target at 0xc0)
00000000000000c0    decq    %rdx                            // back-edge target: n -= 1
00000000000000c3    incq    %rdi                            // s1++
00000000000000c6    incq    %rsi                            // s2++
00000000000000c9    testq   %rdx, %rdx                      // test whether n == 0
00000000000000cc    je  0xda                                // n == 0: jump to 0xda and return 0
00000000000000ce    movzbl  _main(%rdi), %eax               // load u1 = *s1 (the _main symbol is presumably otool's rendering of a zero displacement)
00000000000000d1    movzbl  _main(%rsi), %ecx               // load u2 = *s2
00000000000000d4    subl    %ecx, %eax                      // eax = u1 - u2; also sets ZF for the je below
00000000000000d6    je  0xc0                                // difference is 0 (u1 == u2): jump back to 0xc0 for the next iteration
00000000000000d8    jmp 0xdc                                // u1 != u2: jump to the epilogue with the difference already in eax
00000000000000da    xorl    %eax, %eax                      // loop ran to completion: return value 0
00000000000000dc    popq    %rbp                            // epilogue: restore the frame pointer (eax holds the return value)
00000000000000dd    retq

相对C#版本，指令明显精简许多，同样只关注循环内部。n--,s1++,s2++非常好，放在一起，cpu执行单元完全可以同时并发执行这三条指令；00000000000000cc的je向前跳转也非常好，cpu完全可以预测not taken并忽略testq；movzbl访存取u1,u2，这两条访存指令放在一起利于cpu执行访存的硬件单元进行并行取值。00000000000000d6循环向回跳转，对CPU分支预测比较友好，符合Backwards taken forwards not taken (BTFNT)的习惯，cpu基本会taken，不需要等待subl两个数相减的计算结果。理论上来讲整个循环结束前只会出现一次分支预测惩罚。再关注一个细节是00000000000000d4的subl %ecx, %eax，这条指令将计算结果直接存到了函数返回值寄存器%eax，并更改了ZF条件码(上一次操作是否得0)，相比C#版的cmp je后再sub可以看出gcc和mono智商的区别。另外memcmp源码选择的返回相减结果比返回一个true要快，返回true的话汇编层面还要多一条movl $0x1, %eax的指令。

看完了细节，其实这段汇编指令最NB的是它的结构，非常适合CPU的并行指令执行与流水线化指令发射。
分析一下它循环内的结构

3个递增计算—>1个判断跳转—>两个访存—>1个判断跳转—>3个递增计算—>…

上面分析过，由于两个跳转方向比较合理，cpu完全可以正确预测，忽略掉这两个环节，那么就成了

3个递增计算—>两个访存—>3个递增计算—>两个访存—>3个递增计算—>两个访存—>…

由于三个计算、两个访存之间没有数据和逻辑相关，对于较新的cpu来讲基本都是并行执行的，只有计算与访存、计算与计算之间有数据相关，所以实际就是

计算—>访存
             计算—>访存
                          计算—>访存
                                             …
利用数据转发机制，前一个计算结果在一个周期内就可以到达寄存器，接下来的访存和计算可以不间断进行，所以理论上来讲这段汇编指令cpu可以满负荷连续发射，只有在最后那一次循环(函数返回)才会踩刹车，比C#版本快了60倍也是理所当然。

————————————————————————————
参考：
https://stackoverflow.com/questions/43289/comparing-two-byte-arrays-in-net
https://docs.oracle.com/cd/E19455-01/806-3773/instructionset-23/index.html --Oracle
深入理解计算机系统 --R.E.Bryant,D.R.O’Hallaron
————————————————————————————
日志：
2017-7-4：修改了标题
2017-8-22：将“…最快的方法是memcmp.c。”改为“…最快的方法是通过[Dllimport]第三方调用的memcmp.c。”
2020-2-3：重写