加载中…
正文 字体大小:

在线汇编 for delphi(三):实战无符号128位大整数

(2013-02-08 13:55:55)
标签:

delphixe3

在线汇编

it

128位整数

分类: IT技术-在线汇编
为了便于测试和说明,我们将条件编译的段落分拆成单个函数,最后测试完毕后,整理到前面的《128位大整数扩展和操作符重载》 里去。Let's go baby!
  1. 加法(+ add):
    • 纯pascal:
      function U128AddPas(a, b: uint128): uint128;
      // Pure-Pascal unsigned 128-bit addition built from the Hi/Lo halves
      // exposed by Int128Rec. The halves are added independently and one is
      // added to the high half when the low-half sum is detected to have wrapped.
      var
        lhs: Int128Rec absolute a;       // view of a as Hi/Lo halves
        rhs: Int128Rec absolute b;       // view of b as Hi/Lo halves
        sum: Int128Rec absolute Result;  // view of the result as Hi/Lo halves
      begin
        sum.Hi := lhs.Hi + rhs.Hi;
        sum.Lo := lhs.Lo + rhs.Lo;
        // Wrap test: with both low halves non-zero, a sum that is not larger
        // than an operand means the low addition wrapped around.
        if (lhs.Lo > 0) and (rhs.Lo > 0) then
          if (sum.Lo <= lhs.Lo) or (sum.Lo <= rhs.Lo) then
            sum.Hi := sum.Hi + 1;
      end;

    • 汇编:
      function U128Add(x, y: uint128): uint128;
      // Unsigned 128-bit addition in inline assembly: Result := x + y.
      // The result is first initialised with x (one unaligned 16-byte MOVDQU
      // copy), then y is added limb by limb with ADD/ADC so the carry ripples
      // upwards. The carry out of the top limb is discarded.
      {$IFDEF CPUX86}
      asm
        // [eax]->x, [edx]->y, [ecx]->result
        MOVDQU xmm0,[eax]
        MOVDQU [ecx],xmm0   // result := x
        mov eax,[edx]
        add [ecx],eax       // limb 0
        mov eax,[edx+4]
        adc [ecx+4],eax     // limb 1 + carry
        mov eax,[edx+8]
        adc [ecx+8],eax     // limb 2 + carry
        mov eax,[edx+12]
        adc [ecx+12],eax    // limb 3 + carry
      end;
      {$ENDIF CPUX86}
      {$IFDEF CPUX64}
      asm
        // [rdx]->x, [r8]->y, [rcx]->result
        // Fixed: the copy previously went through [eax]/[ecx] (32-bit registers
        // that do not hold the operands here) and the addend was read from
        // [rdx] (x) instead of [r8] (y).
        MOVDQU xmm0,[rdx]
        MOVDQU [rcx],xmm0   // result := x
        mov rax,[r8]
        add [rcx],rax       // low qword
        mov rax,[r8+8]
        adc [rcx+8],rax     // high qword + carry
      end;
      {$ENDIF CPUX64}

    • 测试对比:
      ----------- 32位 程序 -------------
      --------------- add -------------
      开始1亿次 Asm add: U=U+(a + b)  |1180591620717411303423 + 423565324656768865676 =1604156945374180169099
      1亿次Asm add 耗费 3963 毫秒 | u= 160415694537418016909900000000
      ----------------------------
      开始1亿次 pascal add:U128addPas(A, B) | U128addPas(1180591620717411303423,423565324656768865676) =1604156945374180169099
      1亿次pascal add 耗费 4711 毫秒 | u= 160415694537418016909900000000

      ----------- 64位 程序 -------------
      --------------- add -------------
      开始1亿次 Asm add: U=U+(a + b)  |1180591620717411303423 + 423565324656768865676 =1604156945374180169099
      1亿次Asm add 耗费 1685 毫秒 | u= 160415694537418016909900000000
      ----------------------------
      开始1亿次 pascal add:U128addPas(A, B) | U128addPas(1180591620717411303423,423565324656768865676) =1604156945374180169099
      1亿次pascal add 耗费 2824 毫秒 | u= 160415694537418016909900000000

  2. 减法(- sub):
    • 纯pascal
      function U128subPas(a, b: uint128): uint128;
      // Pure-Pascal unsigned 128-bit subtraction: Result := a - b, computed on
      // the Hi/Lo halves exposed by Int128Rec.
      begin
        Int128Rec(Result).Lo := Int128Rec(a).Lo - Int128Rec(b).Lo;
        // The high half must be assigned BEFORE the borrow is applied.
        // Previously the borrow was subtracted from Result.Hi first and then
        // overwritten by this assignment, so it was lost.
        Int128Rec(Result).Hi := Int128Rec(a).Hi - Int128Rec(b).Hi;
        // A low-half difference larger than the minuend's low half means the
        // low subtraction wrapped, i.e. a borrow from the high half is needed.
        if CompareValue(Int128Rec(Result).Lo, Int128Rec(a).Lo) = GreaterThanValue then
          Int128Rec(Result).Hi := Int128Rec(Result).Hi - 1;
      end;
    • 汇编
      function U128sub(a, b: uint128): uint128;
      // Unsigned 128-bit subtraction in inline assembly: the result is first
      // initialised with a (16-byte MOVDQU copy), then b is subtracted limb by
      // limb with SUB/SBB so the borrow ripples upwards.
      {$IFDEF CPUX86}
      asm
        // [eax]->a, [edx]->b, [ecx]->result
        MOVDQU xmm0,[eax]
        MOVDQU [ecx],xmm0
        mov eax,[edx]
        sub [ecx],eax
        mov eax,[edx+4]
        sbb [ecx+4],eax
        mov eax,[edx+8]
        sbb [ecx+8],eax
        mov eax,[edx+12]
        sbb [ecx+12],eax
      end;
      {$ENDIF CPUX86}
      {$IFDEF CPUX64}
      asm
        // [rdx]->a, [r8]->b, [rcx]->result
        // (an earlier push/pop based copy was replaced by the MOVDQU pair below)
        MOVDQU xmm0,[rdx]
        MOVDQU [rcx],xmm0   // result := a
        mov rax,[r8]
        sub [rcx],rax
        mov rax,[r8+8]
        sbb [rcx+8],rax
      end;
      {$ENDIF CPUX64}
    • 测试对比:
      ----------- 32位 程序 -------------
      --------------- sub -------------
      开始1亿次 Asm 减法: U=a - b  | 1180591620717411303423 - 423565324656768865676 =757026296060642437747
      1亿次Asm 减法 耗费 1529 毫秒
      ----------------------------
      开始1亿次 pascal 减法:U128addPas(A, B) | U128subPas(1180591620717411303423,423565324656768865676) =757026296060642437747
      1亿次pascal 减法 耗费 5616 毫秒
      ----------- 64位 程序 -------------
      --------------- sub -------------
      开始1亿次 Asm 减法: U=a - b  | 1180591620717411303423 - 423565324656768865676 =757026296060642437747
      1亿次Asm 减法 耗费 796 毫秒
      ----------------------------
      开始1亿次 pascal 减法:U128addPas(A, B) | U128subPas(1180591620717411303423,423565324656768865676) =757026296060642437747
      1亿次pascal 减法 耗费 3401 毫秒

  3. AND
    • 纯pascal
      function U128BitAndPas(a, b: uint128): uint128;
      // Pure-Pascal bitwise AND of two 128-bit values, done half by half.
      var
        lhs: Int128Rec absolute a;
        rhs: Int128Rec absolute b;
        res: Int128Rec absolute Result;
      begin
        res.Lo := lhs.Lo and rhs.Lo;
        res.Hi := lhs.Hi and rhs.Hi;
      end;
    • 汇编
      function U128BitAnd(a, b: uint128): uint128;
      // Bitwise AND of two 128-bit values using general-purpose registers:
      // the result is initialised with a (copied through the stack), then
      // each limb of b is ANDed into it.
      {$IFDEF CPUX86}
      ASM
        // [eax]->a, [edx]->b, [ecx]->result
        push [eax]
        push [eax+4]
        push [eax+8]
        push [eax+12]
        pop [ecx+12]
        pop [ecx+8]
        pop [ecx+4]
        pop [ecx]      // result := a
        mov eax,[edx]
        and [ecx],eax
        mov eax,[edx+4]
        and [ecx+4],eax
        mov eax,[edx+8]
        and [ecx+8],eax
        mov eax,[edx+12]
        and [ecx+12],eax
      end;
      {$ENDIF CPUX86}
      {$IFDEF CPUX64}
      asm
        // [rdx]->a, [r8]->b, [rcx]->result
        push [rdx]
        push [rdx+8]
        pop [rcx+8]
        pop [rcx] // result := a
        mov rax,[r8]
        and [rcx],rax
        mov rax,[r8+8]
        and [rcx+8],rax
      end;
      {$ENDIF CPUX64}
    • 汇编sse
      function U128BitAndSSE(a, b: uint128): uint128;
      // Bitwise AND of two 128-bit values using one SSE2 PAND on the whole
      // 16-byte operands (unaligned loads/stores via MOVDQU).
      {$IFDEF CPUX86}
      // [eax]->a, [edx]->b, [ecx]->result
      ASM
        MOVDQU xmm0,[eax]
        MOVDQU xmm1,[edx]
        pand xmm0,xmm1
        MOVDQU [ecx],xmm0
      end;
      {$ENDIF CPUX86}
      {$IFDEF CPUX64}
      // [rdx]->a, [r8]->b, [rcx]->result
      asm
        MOVDQU xmm0,[rdx]
        MOVDQU xmm1,[r8]
        pand xmm0,xmm1
        // Fixed: store through the full 64-bit pointer. [ecx] used only the
        // low 32 bits of the result address.
        MOVDQU [rcx],xmm0
      end;
      {$ENDIF CPUX64}
    • 测试比较
      ----------- 32位 程序 -------------
      --------------- AND -------------
      开始1亿次 Asm shl SSE: U= a AND b  1180591620717411303423 AND 423565324656768865676 =423565324656768865676
      1亿次Asm AND SEE 耗费 468 毫秒
      ----------------------------
      开始1亿次 Asm and: U= a and b  1180591620717411303423 and 423565324656768865676 =423565324656768865676
      1亿次Asm and 耗费 1061 毫秒
      ----------------------------
      开始1亿次 pascal and:U128andPas(A, B) | U128andPas(1180591620717411303423,423565324656768865676) =423565324656768865676
      1亿次pascal and 耗费 1935 毫秒
      ----------- 64位 程序 -------------
      --------------- AND -------------
      开始1亿次 Asm shl SSE: U= a AND b  1180591620717411303423 AND 423565324656768865676 =423565324656768865676
      1亿次Asm AND SEE 耗费 390 毫秒
      ----------------------------
      开始1亿次 Asm and: U= a and b  1180591620717411303423 and 423565324656768865676 =423565324656768865676
      1亿次Asm and 耗费 765 毫秒
      ----------------------------
      开始1亿次 pascal and:U128andPas(A, B) | U128andPas(1180591620717411303423,423565324656768865676) =423565324656768865676
      1亿次pascal and 耗费 1217 毫秒
  4. OR
    • 纯pascal
      function U128BitORPas(a, b: uint128): uint128;
      // Pure-Pascal bitwise OR of two 128-bit values, done half by half.
      var
        lhs: Int128Rec absolute a;
        rhs: Int128Rec absolute b;
        res: Int128Rec absolute Result;
      begin
        res.Lo := lhs.Lo or rhs.Lo;
        res.Hi := lhs.Hi or rhs.Hi;
      end;
    • 汇编sse
      function U128BitORSSE(a, b: uint128): uint128;
      // Bitwise OR of two 128-bit values using one SSE2 POR on the whole
      // 16-byte operands (unaligned loads/stores via MOVDQU).
      {$IFDEF CPUX86}
      // [eax]->a, [edx]->b, [ecx]->result
      ASM
        MOVDQU xmm0,[eax]
        MOVDQU xmm1,[edx]
        por xmm0,xmm1
        MOVDQU [ecx],xmm0
      end;
      {$ENDIF CPUX86}
      {$IFDEF CPUX64}
      // [rdx]->a, [r8]->b, [rcx]->result
      asm
        MOVDQU xmm0,[rdx]
        MOVDQU xmm1,[r8]
        por xmm0,xmm1
        // Fixed: store through the full 64-bit pointer. [ecx] used only the
        // low 32 bits of the result address.
        MOVDQU [rcx],xmm0
      end;
      {$ENDIF CPUX64}
    • 汇编
      function U128BitOR(a, b: uint128): uint128;
      // Bitwise OR of two 128-bit values using general-purpose registers:
      // the result is initialised with a (copied through the stack), then
      // each limb of b is ORed into it.
      {$IFDEF CPUX86}
      ASM
        // the four pushes/pops move a's dwords from [eax] to [ecx]
        push [eax]
        push [eax+4]
        push [eax+8]
        push [eax+12]
        pop [ecx+12]
        pop [ecx+8]
        pop [ecx+4]
        pop [ecx]      // result := a
        mov eax,[edx]
        OR [ecx],eax
        mov eax,[edx+4]
        OR [ecx+4],eax
        mov eax,[edx+8]
        OR [ecx+8],eax
        mov eax,[edx+12]
        OR [ecx+12],eax
      end;
      {$ENDIF CPUX86}
      {$IFDEF CPUX64}
      asm
        // the two pushes/pops move a's qwords from [rdx] to [rcx]
        push [rdx]
        push [rdx+8]
        pop [rcx+8]
        pop [rcx] // result := a
        mov rax,[r8]
        OR [rcx],rax
        mov rax,[r8+8]
        OR [rcx+8],rax
      end;
      {$ENDIF CPUX64}
    • 测试比较
      ----------- 32位 程序 -------------
      --------------- OR -------------
      开始1亿次 Asm OR SSE: U= a AND b  1180591620717411303423 OR 423565324656768865676 =1180591620717411303423
      1亿次Asm OR SEE 耗费 468 毫秒
      ----------------------------
      开始1亿次 Asm OR: U= a and b  1180591620717411303423 OR 423565324656768865676 =1180591620717411303423
      1亿次Asm and 耗费 952 毫秒
      ----------------------------
      开始1亿次 pascal OR:U128ORPas(A, B) | U128shlPas(1180591620717411303423,423565324656768865676) =1180591620717411303423
      1亿次pascal OR 耗费 1809 毫秒
      ----------- 64位 程序 -------------
      --------------- OR -------------
      开始1亿次 Asm OR SSE: U= a AND b  1180591620717411303423 OR 423565324656768865676 =1180591620717411303423
      1亿次Asm OR SEE 耗费 375 毫秒
      ----------------------------
      开始1亿次 Asm OR: U= a and b  1180591620717411303423 OR 423565324656768865676 =1180591620717411303423
      1亿次Asm and 耗费 765 毫秒
      ----------------------------
      开始1亿次 pascal OR:U128ORPas(A, B) | U128shlPas(1180591620717411303423,423565324656768865676) =1180591620717411303423
      1亿次pascal OR 耗费 1233 毫秒

  5. XOR
    • 纯pascal
      function U128BitXORPas(a, b: uint128): uint128;
      // Pure-Pascal bitwise XOR of two 128-bit values, done half by half.
      var
        lhs: Int128Rec absolute a;
        rhs: Int128Rec absolute b;
        res: Int128Rec absolute Result;
      begin
        res.Lo := lhs.Lo xor rhs.Lo;
        res.Hi := lhs.Hi xor rhs.Hi;
      end;
    • 汇编
      function U128BitXOR(a, b: uint128): uint128;
      // Bitwise XOR of two 128-bit values using general-purpose registers:
      // the result is initialised with a (copied through the stack), then
      // each limb of b is XORed into it.
      {$IFDEF CPUX86}
      ASM
        // the four pushes/pops move a's dwords from [eax] to [ecx]
        push [eax]
        push [eax+4]
        push [eax+8]
        push [eax+12]
        pop [ecx+12]
        pop [ecx+8]
        pop [ecx+4]
        pop [ecx]      // result := a
        mov eax,[edx]
        XOR [ecx],eax
        mov eax,[edx+4]
        XOR [ecx+4],eax
        mov eax,[edx+8]
        XOR [ecx+8],eax
        mov eax,[edx+12]
        XOR [ecx+12],eax
      end;
      {$ENDIF CPUX86}
      {$IFDEF CPUX64}
      asm
        // the two pushes/pops move a's qwords from [rdx] to [rcx]
        push [rdx]
        push [rdx+8]
        pop [rcx+8]
        pop [rcx] // result := a
        mov rax,[r8]
        XOR [rcx],rax
        mov rax,[r8+8]
        XOR [rcx+8],rax
      end;
      {$ENDIF CPUX64}
    • 汇编sse
      function U128BitXORSSE(a, b: uint128): uint128;
      // Bitwise XOR of two 128-bit values using one SSE2 PXOR on the whole
      // 16-byte operands (unaligned loads/stores via MOVDQU).
      {$IFDEF CPUX86}
      // [eax]->a, [edx]->b, [ecx]->result
      ASM
        MOVDQU xmm0,[eax]
        MOVDQU xmm1,[edx]
        pxor xmm0,xmm1
        MOVDQU [ecx],xmm0
      end;
      {$ENDIF CPUX86}
      {$IFDEF CPUX64}
      // [rdx]->a, [r8]->b, [rcx]->result
      asm
        MOVDQU xmm0,[rdx]
        MOVDQU xmm1,[r8]
        pxor xmm0,xmm1
        // Fixed: store through the full 64-bit pointer. [ecx] used only the
        // low 32 bits of the result address.
        MOVDQU [rcx],xmm0
      end;
      {$ENDIF CPUX64}
    • 测试比较
      ----------- 32位 程序 -------------
      --------------- XOR -------------
      开始1亿次 Asm XOR SSE: U= a XOR b  1180591620717411303423 XOR 423565324656768865676 =757026296060642437747
      1亿次Asm AND SEE 耗费 452 毫秒
      ----------------------------
      开始1亿次 Asm shl: U= a and b  1180591620717411303423 XOR 423565324656768865676 =757026296060642437747
      1亿次Asm XOR 耗费 1030 毫秒
      ----------------------------
      开始1亿次 pascal XOR:U128XORPas(A, B) | U128XORPas(1180591620717411303423,423565324656768865676) =757026296060642437747
      1亿次pascal XOR 耗费 1779 毫秒

      ----------- 64位 程序 -------------
      --------------- XOR -------------
      开始1亿次 Asm XOR SSE: U= a XOR b  1180591620717411303423 XOR 423565324656768865676 =757026296060642437747
      1亿次Asm AND SEE 耗费 359 毫秒
      ----------------------------
      开始1亿次 Asm shl: U= a and b  1180591620717411303423 XOR 423565324656768865676 =757026296060642437747
      1亿次Asm XOR 耗费 780 毫秒
      ----------------------------
      开始1亿次 pascal XOR:U128XORPas(A, B) | U128XORPas(1180591620717411303423,423565324656768865676) =757026296060642437747
      1亿次pascal XOR 耗费 1170 毫秒

  6. shl
    • 纯pascal

      function U128shlPas(a: uint128; count: byte): uint128;
      // Pure-Pascal logical left shift of a 128-bit value by count bits.
      //   count = 0        -> a unchanged
      //   count >= 128     -> 0
      //   64 <= count <128 -> low half moves into the high half, low half is 0
      //   otherwise        -> both halves shift; the bits leaving the low half
      //                       are ORed into the high half
      var
        src: Int128Rec absolute a;
        dst: Int128Rec absolute Result;
        spill: uint64;   // bits carried from the low half into the high half
      begin
        if count = 0 then
          Result := a
        else if count >= 128 then
          Result := 0
        else if count >= 64 then
        begin
          dst.Hi := src.Lo shl (count - 64);
          dst.Lo := 0;
        end
        else
        begin
          spill := src.Lo;
          spill := spill shr (64 - count);
          dst.Hi := (src.Hi shl count) or spill;
          dst.Lo := src.Lo shl count;
        end;
      end;
    • 汇编
      function U128shl(x: uint128; count: byte): uint128;
      // Logical left shift of a 128-bit value by count bits, using
      // general-purpose registers. The result is first initialised with x and
      // then shifted in place; the count range selects one of several paths so
      // that whole-limb moves are combined with a sub-limb SHLD/SHL shift.
      // NOTE(review): the immediate stores ("mov [..],0") carry no explicit
      // operand size; confirm the assembler emits stores that exactly cover
      // the 16-byte result, in particular for the four stores of the x64 path.
      {$IFDEF CPUX86}
      asm
        // copy the four dwords of x ([eax]) into the result ([ecx]) via the stack
        push [eax]
        push [eax+4]
        push [eax+8]
        push [eax+12]
        pop [ecx+12]
        pop [ecx+8]
        pop [ecx+4]
        pop [ecx]      // result := x
        mov eax,ecx   // cl is needed for the shift count, so keep the result pointer in eax
        mov cl,dl     // cl := count
        cmp  cl, 0     // if count = 0 then exit (result is already x)
        je @@4
        cmp cl ,128     // count >= 128: result := 0
        jb @@96
        mov [eax],0
        mov [eax+4],0
        mov [eax+8],0
        mov [eax+12],0
        jmp @@4
      @@96:
        cmp cl,96      // count >= 96: dword 1 -> dword 4, all others become 0
        jb @@64
        mov edx,[eax]
        sub cl,96
        shl edx,cl
        mov [eax+12],edx
        mov [eax],0
        mov [eax+4],0
        mov [eax+8],0
        jmp @@4
      @@64:
        cmp cl,64      // count >= 64: dword 1 -> dword 3, dword 2 -> dword 4; dwords 1 and 2 <- 0
        jb @@32
        sub cl,64
        mov edx, [eax]
        shld [eax+4],edx,cl   // dword 2 shifted, filled from the top bits of dword 1
        shl edx,cl
        mov [eax+8],edx
        mov edx,[eax+4]
        mov [eax+12],edx
        mov [eax],0
        mov [eax+4],0
        jmp @@4
      @@32:cmp cl, 32     // count >= 32: dword 1 -> 2, dword 2 -> 3, dword 3 -> 4; dword 1 <- 0
        jb @@3
        sub cl,32
        // shift by the remaining (count - 32) bits in place, highest limb first
        mov edx,[eax+4]
        shld [eax+8], edx, cl
        mov edx,[eax]
        shld [eax+4],edx,cl
        shl [eax],cl
        // then move every limb up by one dword
        mov edx,[eax+8]
        mov [eax+12],edx
        mov edx,[eax+4]
        mov [eax+8],edx
        mov edx,[eax]
        mov [eax+4],edx
        mov [eax],0
        jmp @@4
      @@3:            // count < 32: shift within place, each limb filled from the one below
        mov edx, [eax+8]
        shld [eax+12],edx,cl
        mov edx,[eax+4]
        shld [eax+8], edx,cl
        mov edx,[eax]
        shld [eax+4],edx,cl
        shl [eax],cl
      @@4:
      end;
      {$ENDIF CPUX86}
      {$IFDEF CPUX64}
      asm
        mov rax,rcx  // [rax] -> result
        push [rdx]
        push [rdx+8]
        pop [rax+8]
        pop [rax] // result := x
        mov rcx,r8   // cl := count
        cmp cl,0
        je @@1
        cmp cl,128   // count >= 128: result := 0
        jb @@Higher64
        mov [rax],0
        mov [rax+4],0
        mov [rax+8],0
        mov [rax+12],0
        jmp @@1
      @@Higher64:
        cmp cl,64    // count >= 64: low qword moves into the high qword
        jl @@lower64
        sub cl,64
        mov rdx,[rax]
        shl rdx,cl
        mov [rax+8],rdx
        push 0
        pop [rax]    // low qword := 0
        jmp @@1
      @@lower64:     // count < 64: SHLD fills the high qword from the low qword
        mov rdx,[rax]
        shld [rax+8], rdx,cl
        shl rdx,cl
        mov [rax],rdx
      @@1:
      end;
      {$ENDIF CPUX64}
    • 汇编sse
      function U128shrSSE(x: uint128; count: byte): uint128;
      // Logical RIGHT shift of a 128-bit value by count bits using SSE2.
      // NOTE(review): this listing sits under the "shl" heading but is named
      // U128shrSSE and shifts right (PSRLQ/PSRLDQ); the same routine appears
      // again in the shr section. The left-shift SSE variant is not shown here.
      {$IFDEF CPUX86}
      asm
        MOVDQU xmm0,[eax]
        MOVDQU [ecx],xmm0   // result := x (xmm0 keeps a copy)
        mov eax,ecx         // [eax] -> result
        xor ecx,ecx
        mov cl,dl           // ecx := count, zero-extended
        cmp cl,0
        je @@4              // count = 0: result is already x
        cmp cl,128
        jb @@Higher64
        PXOR xmm0,xmm0      // count >= 128: result := 0
        MOVDQU [eax],xmm0
        jmp @@4
      @@Higher64:
        cmp cl,64
        jb @@lower64
        sub cl,64
        MOVD xmm1,ecx
        PSRLDQ xmm0,8       // byte shift: high qword moves into the low qword
        // (alternative left by the author: MOVQ xmm0,[eax+8])
        PSRLQ xmm0,xmm1     // then shift by the remaining count - 64 bits
        MOVDQU [eax],xmm0
        jmp @@4
      @@lower64:
        MOVD xmm1,ecx
        PSRLQ xmm0,xmm1     // both qwords shifted right by count independently
        mov edx,64
        sub edx,ecx
        MOVD xmm1,edx       // xmm1 := 64 - count
        MOVQ xmm2,[eax+8]   // xmm2 low qword := original high qword
        PSLLQ xmm2,xmm1     // bits that cross from the high into the low qword
        por xmm0,xmm2
        MOVDQU [eax],xmm0
      @@4:
      end;
      {$ENDIF CPUX86}
      {$IFDEF CPUX64}
      asm
        MOVDQU xmm0,[rdx]
        MOVDQU [rcx],xmm0 // result := x
        mov rax,rcx       // [rax] -> result
        xor rcx,rcx
        mov rdx,r8
        mov cl,dl         // rcx := count, zero-extended
        cmp cl,0
        je @@4
        cmp cl,128
        jb @@Higher64
        PXOR xmm0,xmm0    // count >= 128: result := 0
        MOVDQU [rax],xmm0
        jmp @@4
      @@Higher64:
        cmp cl,64
        jb @@lower64
        sub cl,64
        MOVD xmm1,ecx
        PSRLDQ xmm0,8     // high qword moves into the low qword
        // (alternative left by the author: MOVQ xmm0,[eax+8])
        PSRLQ xmm0,xmm1
        MOVDQU [rax],xmm0
        jmp @@4
      @@lower64:
        MOVD xmm1,ecx
        PSRLQ xmm0,xmm1   // both qwords shifted right by count independently
        mov edx,64
        sub edx,ecx
        MOVD xmm1,edx     // xmm1 := 64 - count
        MOVQ xmm2,[rax+8] // xmm2 low qword := original high qword
        PSLLQ xmm2,xmm1   // bits that cross from the high into the low qword
        por xmm0,xmm2
        MOVDQU [rax],xmm0
      @@4:
      end;
      {$ENDIF CPUX64}
    • 测试比较
      ----------- 32位 程序 -------------
      --------------- shl -------------
      开始1亿次 Asm shl SSE: U= a shl b  1180591620717411303423 shl 25 =39614081257132168796738420736
      1亿次Asm shl SEE 耗费 1872 毫秒
      ----------------------------
      开始1亿次 Asm shl: U= a shl b  1180591620717411303423 shl 25 =39614081257132168796738420736
      1亿次Asm shl 耗费 2449 毫秒
      ----------------------------
      开始1亿次 pascal shl:U128shlPas(A, B) | U128shlPas(1180591620717411303423,25) =39614081257132168796738420736
      1亿次pascal shl 耗费 3822 毫秒
      ----------- 64位 程序 -------------
      --------------- shl -------------
      开始1亿次 Asm shl SSE: U= a shl b  1180591620717411303423 shl 25 =39614081257132168796738420736
      1亿次Asm shl SEE 耗费 1498 毫秒
      ----------------------------
      开始1亿次 Asm shl: U= a shl b  1180591620717411303423 shl 25 =39614081257132168796738420736
      1亿次Asm shl 耗费 1529 毫秒
      ----------------------------
      开始1亿次 pascal shl:U128shlPas(A, B) | U128shlPas(1180591620717411303423,25) =39614081257132168796738420736
      1亿次pascal shl 耗费 2044 毫秒
  7. shr
    • 纯pascal
      function U128shrPas(a: uint128; count: byte): uint128;
      // Pure-Pascal logical right shift of a 128-bit value by count bits.
      //   count = 0        -> a unchanged
      //   count >= 128     -> 0
      //   64 <= count <128 -> high half moves into the low half, high half is 0
      //   otherwise        -> both halves shift; the bits leaving the high half
      //                       are ORed into the low half
      var
        src: Int128Rec absolute a;
        dst: Int128Rec absolute Result;
        spill: uint64;   // bits carried from the high half into the low half
      begin
        if count = 0 then
          Result := a
        else if count >= 128 then
          Result := 0
        else if count >= 64 then
        begin
          dst.Lo := src.Hi shr (count - 64);
          dst.Hi := 0;
        end
        else
        begin
          spill := src.Hi;
          spill := spill shl (64 - count);
          dst.Lo := (src.Lo shr count) or spill;
          dst.Hi := src.Hi shr count;
        end;
      end;
    • 汇编
      function U128shr(x: uint128; count: byte): uint128;
      // Logical right shift of a 128-bit value by count bits, using
      // general-purpose registers. The result is first initialised with x and
      // then shifted in place; the count range selects one of several paths so
      // that whole-limb moves are combined with a sub-limb SHRD/SHR shift.
      // NOTE(review): the immediate stores ("mov [..],0") carry no explicit
      // operand size; confirm the assembler emits stores that exactly cover
      // the 16-byte result, in particular for the four stores of the x64 path.
      {$IFDEF CPUX86}
      asm
        // copy the four dwords of x ([eax]) into the result ([ecx]) via the stack
        push [eax]
        push [eax+4]
        push [eax+8]
        push [eax+12]
        pop [ecx+12]
        pop [ecx+8]
        pop [ecx+4]
        pop [ecx]      // result := x
        mov eax,ecx   // cl is needed for the shift count, so keep the result pointer in eax
        mov cl,dl     // cl := count
        cmp  cl, 0     // if count = 0 then exit (result is already x)
        je @@4
        cmp cl ,128     // count >= 128: result := 0
        jb @@96
        mov [eax],0
        mov [eax+4],0
        mov [eax+8],0
        mov [eax+12],0
        jmp @@4
      @@96:
        cmp cl,96      // count >= 96: dword 4 -> dword 1, all others become 0
        jb @@64
        mov edx,[eax+12]
        sub cl,96
        shr edx,cl
        mov [eax],edx
        mov [eax+12],0
        mov [eax+4],0
        mov [eax+8],0
        jmp @@4
      @@64:
        cmp cl,64      // count >= 64: dword 3 -> dword 1, dword 4 -> dword 2; dwords 3 and 4 <- 0
        jb @@32
        sub cl,64
        mov edx, [eax+12]
        shrd [eax+8],edx,cl   // dword 3 shifted, filled from the low bits of dword 4
        shr edx,cl            // edx := dword 4 shifted
        // Fixed: the shifted dword 4 was previously stored over dword 3
        // (destroying the SHRD result) and dword 1 was never written.
        mov [eax+4],edx       // dword 2 := shifted dword 4
        mov edx,[eax+8]
        mov [eax],edx         // dword 1 := shifted dword 3
        mov [eax+12],0
        mov [eax+8],0
        jmp @@4
      @@32:cmp cl, 32     // count >= 32: dword 4 -> 3, dword 3 -> 2, dword 2 -> 1; dword 4 <- 0
        jb @@3
        sub cl,32
        // shift by the remaining (count - 32) bits in place, lowest affected limb first
        mov edx,[eax+8]
        shrd [eax+4], edx, cl
        mov edx,[eax+12]
        shrd [eax+8],edx,cl
        shr [eax+12],cl
        // then move every limb down by one dword
        mov edx,[eax+4]
        mov [eax],edx
        mov edx,[eax+8]
        mov [eax+4],edx
        mov edx,[eax+12]
        mov [eax+8],edx
        mov [eax+12],0
        jmp @@4
      @@3:            // count < 32: shift within place, each limb filled from the one above
        mov edx, [eax+4]
        shrd [eax],edx,cl
        mov edx,[eax+8]
        shrd [eax+4], edx,cl
        mov edx,[eax+12]
        shrd [eax+8],edx,cl
        shr [eax+12],cl
      @@4:
      end;
      {$ENDIF CPUX86}
      {$IFDEF CPUX64}
      asm
        mov rax,rcx  // [rax] -> result
        push [rdx]
        push [rdx+8]
        pop [rax+8]
        pop [rax] // result := x
        mov rcx,r8   // cl := count
        cmp cl,0
        je @@1
        cmp cl,128   // count >= 128: result := 0
        jb @@Higher64
        mov [rax],0
        mov [rax+4],0
        mov [rax+8],0
        mov [rax+12],0
        jmp @@1
      @@Higher64:
        cmp cl,64    // count >= 64: high qword moves into the low qword
        jl @@lower64
        sub cl,64
        mov rdx,[rax+8]
        shr rdx,cl
        mov [rax],rdx
        push 0
        pop [rax+8]  // high qword := 0
        jmp @@1
      @@lower64:     // count < 64: SHRD fills the low qword from the high qword
        mov rdx,[rax+8]
        shrd [rax], rdx,cl
        shr rdx,cl
        mov [rax+8],rdx
      @@1:
      end;
      {$ENDIF CPUX64}
    • 汇编sse
      function U128shrSSE(x: uint128; count: byte): uint128;
      // Logical right shift of a 128-bit value by count bits using SSE2.
      // PSRLQ shifts the two 64-bit lanes independently, so for count < 64 the
      // bits leaving the high qword are rebuilt separately and ORed into the
      // low qword.
      {$IFDEF CPUX86}
      asm
        MOVDQU xmm0,[eax]
        MOVDQU [ecx],xmm0   // result := x (xmm0 keeps a copy)
        mov eax,ecx         // [eax] -> result
        xor ecx,ecx
        mov cl,dl           // ecx := count, zero-extended
        cmp cl,0
        je @@4              // count = 0: result is already x
        cmp cl,128
        jb @@Higher64
        PXOR xmm0,xmm0      // count >= 128: result := 0
        MOVDQU [eax],xmm0
        jmp @@4
      @@Higher64:
        cmp cl,64
        jb @@lower64
        sub cl,64
        MOVD xmm1,ecx
        PSRLDQ xmm0,8       // byte shift: high qword moves into the low qword
        // (alternative left by the author: MOVQ xmm0,[eax+8])
        PSRLQ xmm0,xmm1     // then shift by the remaining count - 64 bits
        MOVDQU [eax],xmm0
        jmp @@4
      @@lower64:
        MOVD xmm1,ecx
        PSRLQ xmm0,xmm1     // both qwords shifted right by count independently
        mov edx,64
        sub edx,ecx
        MOVD xmm1,edx       // xmm1 := 64 - count
        MOVQ xmm2,[eax+8]   // xmm2 low qword := original high qword
        PSLLQ xmm2,xmm1     // bits that cross from the high into the low qword
        por xmm0,xmm2
        MOVDQU [eax],xmm0
      @@4:
      end;
      {$ENDIF CPUX86}
      {$IFDEF CPUX64}
      asm
        MOVDQU xmm0,[rdx]
        MOVDQU [rcx],xmm0 // result := x
        mov rax,rcx       // [rax] -> result
        xor rcx,rcx
        mov rdx,r8
        mov cl,dl         // rcx := count, zero-extended
        cmp cl,0
        je @@4
        cmp cl,128
        jb @@Higher64
        PXOR xmm0,xmm0    // count >= 128: result := 0
        MOVDQU [rax],xmm0
        jmp @@4
      @@Higher64:
        cmp cl,64
        jb @@lower64
        sub cl,64
        MOVD xmm1,ecx
        PSRLDQ xmm0,8     // high qword moves into the low qword
        // (alternative left by the author: MOVQ xmm0,[eax+8])
        PSRLQ xmm0,xmm1
        MOVDQU [rax],xmm0
        jmp @@4
      @@lower64:
        MOVD xmm1,ecx
        PSRLQ xmm0,xmm1   // both qwords shifted right by count independently
        mov edx,64
        sub edx,ecx
        MOVD xmm1,edx     // xmm1 := 64 - count
        MOVQ xmm2,[rax+8] // xmm2 low qword := original high qword
        PSLLQ xmm2,xmm1   // bits that cross from the high into the low qword
        por xmm0,xmm2
        MOVDQU [rax],xmm0
      @@4:
      end;
      {$ENDIF CPUX64}
    • 测试比较:由于 SSE4.1 才提供直接操作 xmm 高位的指令,如果不使用 SSE4.1,右移必须反复进行移位操作,效率已经没有优势了,特别是在 x64 下
      ----------- 32位 程序 -------------
      --------------- shr -------------
      开始1亿次 Asm SSE shr: U= a shl b  1180591620717411303423 shr 25 =35184372088831
      1亿次AsmSSE shr 耗费 2262 毫秒
      ----------------------------
      开始1亿次 Asm  shr: U= a shl b  1180591620717411303423 shr 25 =35184372088831
      1亿次Asm shr 耗费 2948 毫秒
      ----------------------------
      开始1亿次 pascal shr:U128shrPas(A, B) | U128shrPas(1180591620717411303423,25) =35184372088831
      1亿次pascal shr 耗费 3822 毫秒

      ----------- 64位 程序 -------------
      --------------- shr -------------
      开始1亿次 Asm SSE shr: U= a shl b  1180591620717411303423 shr 25 =35184372088831
      1亿次AsmSSE shr 耗费 1498 毫秒
      ----------------------------
      开始1亿次 Asm  shr: U= a shl b  1180591620717411303423 shr 25 =35184372088831
      1亿次Asm shr 耗费 1451 毫秒
      ----------------------------
      开始1亿次 pascal shr:U128shrPas(A, B) | U128shrPas(1180591620717411303423,25) =35184372088831
      1亿次pascal shr 耗费 1996 毫秒

  8. bsr:这是一个求数据所使用的有效位(bit)数的指令。这里直接条件编译了,不再做测试对比,主要是下面的 div 和 mod 其中一种算法要用到
    • function UInt128.BSR: Int32;
      // Position of the most significant set bit of the 128-bit value.
      // The assembly paths return the 1-based bit position (BSR index + limb
      // offset + 1) and 0 when every limb is zero.
      // NOTE(review): the PUREPASCAL path relies on UintsHighBit, which is not
      // shown here; confirm it uses the same 1-based / zero-returns-0 convention.
      {$IFDEF PUREPASCAL}
      begin
        if Int128Rec(self).Hi > 0 then result := 64 + UintsHighBit(Int128Rec(self).Hi)
        else result := UintsHighBit(Int128Rec(self).Lo);
      end;
      {$ELSE !PUREPASCAL}
      {$IFDEF CPUX86}
      asm
        mov edx,eax        // eax arrives holding the address of the value; keep it in edx
        xor ecx,ecx        // ecx := bit offset of the limb that is found (0 by default)
        bsr eax,[edx+12]   // scan the highest dword first; ZF=1 means it is zero
        jz @@4
        mov  ecx,96
        jmp  @@1
      @@4:
        bsr eax,[edx+8]
        jz @@3
        mov ecx,64
        jmp @@1
      @@3:
        bsr eax,[edx+4]
        jz @@2
        mov ecx,32
        jmp @@1
      @@2:
        bsr eax,[edx]
        jnz @@1
        mov eax,0          // all four dwords are zero
        jmp @@0
      @@1:
        add eax,ecx        // bit index within the limb + limb offset
        inc eax            // convert to a 1-based position
      @@0:
      end;
      {$ENDIF CPUX86}
      {$IFDEF CPUX64}
      asm
        mov rdx,0          // rdx := bit offset of the qword that is found
        bsr rax,[rcx+8]    // scan the high qword first; ZF=1 means it is zero
        jz @@2
        mov rdx,64
        jmp @@1
      @@2:
        bsr rax,[rcx]
        jnz @@1
        mov rax,0          // both qwords are zero
        jmp @@0
      @@1:
        add rax,rdx        // bit index within the qword + qword offset
        inc rax            // convert to a 1-based position
      @@0:
      end;
      {$ENDIF CPUX64}
      {$ENDIF !PUREPASCAL}
  9. 乘法(* mul)
    • 纯pascal
      function U128MUlPas(a, b: uint128): uint128;
      // Pure-Pascal unsigned 128-bit multiplication (low 128 bits of a*b),
      // composed from 64x64->128 partial products, which are themselves built
      // from 32x32->64 products combined with uint128 shifts and additions.

        // Zero-extends a 64-bit value into a uint128 (high half cleared).
        function Widen(v: uint64): uint128;
        begin
          Result := 0;
          Int128Rec(Result).Lo := v;
        end;

        // Full 128-bit product of two 64-bit values via four 32-bit partial
        // products, each shifted to its weight before being summed.
        function mul64To128(aa, bb: uint64): uint128;
        var
          hiA, loA, hiB, loB: uint64;   // 32-bit halves held in 64-bit variables
        begin
          hiA := int64Rec(aa).Hi;
          loA := int64Rec(aa).Lo;
          hiB := int64Rec(bb).Hi;
          loB := int64Rec(bb).Lo;
          Result := Widen(hiA * hiB) shl 64;             // hi*hi, weight 2^64
          Result := Result + (Widen(hiA * loB) shl 32);  // hi*lo, weight 2^32
          Result := Result + (Widen(loA * hiB) shl 32);  // lo*hi, weight 2^32
          Result := Result + Widen(loA * loB);           // lo*lo, weight 1
        end;

      var
        hiA, loA, hiB, loB: uint64;   // 64-bit halves of a and b
      begin
        hiA := Int128Rec(a).Hi;
        loA := Int128Rec(a).Lo;
        hiB := Int128Rec(b).Hi;
        loB := Int128Rec(b).Lo;
        // The hi*hi term has weight 2^128 and never reaches the result, so
        // only the two cross terms (weight 2^64) and lo*lo are accumulated.
        Result := mul64To128(hiA, loB) shl 64;
        Result := Result + (mul64To128(loA, hiB) shl 64);
        Result := Result + mul64To128(loA, loB);
      end;
    • 汇编
      function U128MUl(a, b: uint128): uint128;
      // Unsigned 128-bit multiplication in inline assembly, keeping the low
      // 128 bits of the product. Both paths read every operand limb before the
      // first store to the result, so correctness does not depend on the
      // result buffer being distinct from a or b.
      {$IFDEF CPUX86}
      // On entry: [eax]->a, [edx]->b, [ecx]->result
      //
      // With 32-bit limbs a1..a4 and b1..b4 (least significant first), the
      // partial product ai*bj has weight 2^(32*((i-1)+(j-1))). Dropping every
      // term of weight >= 2^128 and grouping equal weights gives:
      //   result = a1*b1
      //          + ((a1*b2 + a2*b1) shl 32)
      //          + ((a1*b3 + a2*b2 + a3*b1) shl 64)
      //          + ((a1*b4 + a2*b3 + a3*b2 + a4*b1) shl 96)
      asm
        push ebp
        push ebx
        // Fixed: reserve the scratch area by moving ESP. The previous code
        // used memory below ESP without allocating it, and zeroed the result
        // before a had been read.
        sub esp,32
        mov ebp,esp            // [ebp]    -> private copy of a
        MOVDQU xmm0,[eax]
        MOVDQU xmm1,[edx]
        MOVDQU [ebp],xmm0
        MOVDQU [ebp+16],xmm1
        lea ebx,[ebp+16]       // [ebx]    -> private copy of b
        mov [ecx+8],0
        mov [ecx+12],0         // upper half of the result := 0
        mov eax,[ebp]
        mul [ebx]
        mov [ecx],eax
        mov [ecx+4],edx        // a1*b1
        mov eax,[ebp]
        mul [ebx+4]
        add [ecx+4],eax
        adc [ecx+8],edx        // (a1*b2) shl 32
        mov eax,[ebp+4]
        mul [ebx]
        add [ecx+4],eax
        adc [ecx+8],edx        // (a2*b1) shl 32
        adc [ecx+12],0         // carry into the top limb
        mov eax,[ebp]
        mul [ebx+8]
        add [ecx+8],eax
        adc [ecx+12],edx       // (a1*b3) shl 64
        mov eax,[ebp+4]
        mul [ebx+4]
        add [ecx+8],eax
        adc [ecx+12],edx       // (a2*b2) shl 64
        mov eax,[ebp+8]
        mul [ebx]
        add [ecx+8],eax
        adc [ecx+12],edx       // (a3*b1) shl 64
        // weight 2^96: only the low dword of each product reaches the result
        mov eax,[ebp]
        mul [ebx+12]
        add [ecx+12],eax       // (a1*b4) shl 96
        mov eax,[ebp+4]
        mul [ebx+8]
        add [ecx+12],eax       // (a2*b3) shl 96
        mov eax,[ebp+8]
        mul [ebx+4]
        add [ecx+12],eax       // (a3*b2) shl 96
        mov eax,[ebp+12]
        mul [ebx]
        add [ecx+12],eax       // (a4*b1) shl 96
        add esp,32
        pop ebx
        pop ebp
      end;
      {$ENDIF CPUX86}
      {$IFDEF CPUX64}
      // On entry: [rdx]->a, [r8]->b, [rcx]->result
      //
      // With 64-bit limbs a1,a2 and b1,b2:
      //   result = a1*b1 + ((a1*b2 + a2*b1) shl 64)      (a2*b2 has weight 2^128)
      asm
        mov r9,rdx        // rdx is overwritten by MUL; [r9] -> a
        mov r10,[r9]      // r10 := a1
        mov r11,[r8]      // r11 := b1
        mov rax,[r8+8]    // rax := b2
        imul rax,r10      // rax := low 64 bits of a1*b2
        mov r9,[r9+8]     // r9  := a2
        imul r9,r11       // r9  := low 64 bits of a2*b1
        add r9,rax        // r9  := sum of the two cross terms (mod 2^64)
        mov rax,r10
        mul r11           // rdx:rax := a1*b1
        add rdx,r9        // high qword of the result
        mov [rcx],rax
        mov [rcx+8],rdx
      end;
      {$ENDIF CPUX64}
    • 测试比较:这时64位优势好明显.当然如果支持AVX,则可以直接使用rmmx进行128位乘除.
      ----------- 32位 程序 -------------
      --------------- mul -------------
      ----------------------------
      开始1亿次 Asm X: U= a X b  1180591620717411303 X 253452334 =299223701771670649183331202
      1亿次Asm X 耗费 4509 毫秒
      ----------------------------
      开始1亿次 pascal X:U128MUlPas(A, B) | U128shlPas(1180591620717411303,253452334) =299223701771670649183331202
      1亿次pascal X 耗费 45973 毫秒
      ----------- 64位 程序 -------------
      --------------- mul -------------
      ----------------------------
      开始1亿次 Asm X: U= a X b  1180591620717411303 X 253452334 =299223701771670649183331202
      1亿次Asm X 耗费 764 毫秒
      ----------------------------
      开始1亿次 pascal X:U128MUlPas(A, B) | U128shlPas(1180591620717411303,253452334) =299223701771670649183331202
      1亿次pascal X 耗费 28829 毫秒
  10. 整除(div intdiv):整除和求模基本上是一种运算得到的两个结果,这里有两种较快的算法,其实也是一种,说不上谁更快,针对不同的参数会有不同的结果
    • 纯pascal
      function U128IntdivPas(a, D: uint128): uint128;
      // Pure-Pascal unsigned 128-bit integer division (quotient of a by D),
      // implemented recursively: each call peels off the largest power-of-two
      // multiple of D that still fits into a, then recurses on what is left.
      var
        shiftBy: integer;   // difference of the highest-bit positions of a and D
        scaled: uint128;    // D shifted left so that it lines up with a
      begin
        Result := 0;
        case CompareValue(a, D) of
          LessThanValue: ;              // a < D: the quotient stays 0
          EqualsValue: Result := 1;     // a = D: the quotient is exactly 1
        else
          begin
            shiftBy := UintsHighBit(a) - UintsHighBit(D);
            Result := 1;
            Result := Result shl shiftBy;
            scaled := D shl shiftBy;
            // Lining up the top bits can overshoot a; back off by one bit then.
            if CompareValue(a, scaled) = LessThanValue then
            begin
              scaled := scaled shr 1;
              Result := Result shr 1;
            end;
            Result := Result + U128IntdivPas(a - scaled, D);
          end;
        end;
      end;

    • 汇编1:这是delphixe3里64位 整数所用的方法,特点是无论参数分布如何,能得到较稳定的速度
      // Unsigned 128-bit integer division: Result := a div b.
      //
      // Restoring shift-and-subtract division: the dividend is shifted left one
      // bit at a time into a 128-bit remainder, and the divisor is subtracted
      // whenever the remainder is >= the divisor. The loop always runs 128
      // times, so the running time depends little on the operand values.
      // When the operands are small enough a single hardware DIV is used.
      // A zero divisor is routed to the hardware DIV so that the CPU raises a
      // divide error, on both the x86 and the x64 branch.
      // a, b and Result are passed by reference (see the register notes below).
      function U128Intdiv(a, b: uint128): uint128;
      {$IFDEF CPUX86}
      // In: EAX -> a, EDX -> b, ECX -> Result
      // Scratch frame (64 bytes at EBP):
      //   [ebp]         @Result
      //   [ebp+16..31]  copy of a; becomes the quotient   (ESI)
      //   [ebp+32..47]  remainder accumulator             (EBX)
      asm
        push ebp
        push ebx
        push esi
        push edi
        sub esp,64
        mov ebp,esp
        mov [ebp],ecx               // save @Result
        mov ebx,ebp
        mov esi,ebp
        add ebx,32                  // EBX -> remainder buffer
        add esi,16                  // ESI -> working copy of a
        MOVDQU xmm0,[eax]
        MOVDQU [esi],xmm0           // [esi] := a
        mov edi,edx                 // EDI -> b
        // b = 0: go straight to the hardware DIV so a divide error is raised,
        // matching the x64 branch (the shift loop would silently return all ones).
        mov eax,[edi]
        or eax,[edi+4]
        or eax,[edi+8]
        or eax,[edi+12]
        jz @@quick_ldiv
        // Quick path only when both a and b fit in 32 bits.
        mov eax,[esi+4]
        or eax,[esi+8]
        jnz @@slow_ldiv             // a has bits in 32..95
        mov eax,[esi+12]
        or eax,[edi+12]
        jnz @@slow_ldiv             // a or b has bits in 96..127
        mov eax,[edi+4]
        or eax,[edi+8]
        jz @@quick_ldiv             // b has no bits in 32..95
      @@slow_ldiv:
        MOV   ecx,128                         // shift counter
        PXOR  xmm0,xmm0
        MOVDQU [ebx], xmm0                    // remainder := 0
      @@xloop:
        SHL     dword ptr [esi],1             // shift dividend left one bit ...
        RCL     dword ptr [esi+4],1
        RCL     dword ptr [esi+8],1
        RCL     dword ptr [esi+12],1
        RCL     dword ptr [ebx],1             // ... into the remainder
        RCL     dword ptr [ebx+4],1
        RCL     dword ptr [ebx+8],1
        RCL     dword ptr [ebx+12],1
        // Compare remainder with divisor, most significant dword first.
        mov     eax,[ebx+12]
        CMP     eax,[edi+12]
        JB      @@nosub
        JA      @@subtract
        mov     eax,[ebx+8]
        CMP     eax,[edi+8]
        JB      @@nosub
        JA      @@subtract
        mov     eax,[ebx+4]
        CMP     eax,[edi+4]
        JB      @@nosub
        JA      @@subtract
        mov     eax,[ebx]
        CMP     eax,[edi]
        JB      @@nosub                       // equal or above falls through
      @@subtract:
        mov     eax,[edi]
        SUB     [ebx],eax                     // remainder := remainder - divisor
        mov     eax,[edi+4]
        SBB     [ebx+4],eax
        mov     eax,[edi+8]
        SBB     [ebx+8],eax
        mov     eax,[edi+12]
        SBB     [ebx+12],eax
        INC     dword ptr [esi]               // set the new quotient bit (bit 0 is free after SHL)
      @@nosub:
        LOOP    @@xloop
        //
        // When the loop is done:
        //
        //|          [ebx]                    [esi]        |
        //|        remainder               quotient        |
        //
        MOVDQU xmm0,[esi]
        mov ecx,[ebp]
        MOVDQU [ecx],xmm0                     // Result := quotient
        jmp @@finish
      @@quick_ldiv:
        mov eax,[esi]
        mov edx,[esi+4]
        DIV dword ptr [edi]                   // unsigned divide (raises on b = 0)
        mov ecx,[ebp]
        PXOR xmm0,xmm0
        MOVDQU [ecx],xmm0                     // clear all 128 bits of Result first
        mov [ecx],eax                         // then store the 32-bit quotient
      @@finish:
        add esp,64
        pop edi
        pop esi
        pop ebx
        pop ebp
      end;
      {$ENDIF CPUX86}
      {$IFDEF CPUX64}
      asm
        // In: RDX -> a, R8 -> b, RCX -> Result
        // Working registers: RDX:RAX = a, then quotient; R8:RBX = b;
        // RDI:RSI = remainder; R9 = @Result
        PUSH    RBX
        PUSH    RSI
        PUSH    RDI
        mov     r9,rcx                  // R9 := @Result
        MOV     RBX,[r8]                // RBX := b.lo
        MOV     R8,[r8+8]               // R8  := b.hi
        mov     rax,[rdx]               // RAX := a.lo
        mov     rdx,[rdx+8]             // RDX := a.hi
        OR      R8,R8
        JNZ     @@slow_ldiv             // b.hi <> 0
        OR      rdx,rdx
        JZ      @@quick_ldiv            // both high qwords are zero
        OR      RBX,RBX
        JZ      @@quick_ldiv            // R8:RBX = 0: force a divide error
      @@slow_ldiv:
        MOV     RCX,128                 // shift counter
        XOR     RDI,RDI                 // remainder := 0
        XOR     RSI,RSI

      @@xloop:
        SHL     RAX,1                   // shift dividend left one bit ...
        RCL     RDX,1
        RCL     RSI,1                   // ... into the remainder
        RCL     RDI,1
        CMP     RDI,R8                  // remainder >= divisor?
        JB      @@nosub
        JA      @@subtract
        CMP     RSI,RBX
        JB      @@nosub
      @@subtract:
        SUB     RSI,RBX
        SBB     RDI,R8                  // remainder := remainder - divisor
        INC     RAX                     // set the new quotient bit
      @@nosub:
        LOOP    @@xloop
        //
        // When the loop is done:
        //
        //|     RDI      RSI       RDX       RAX     |
        //|        remainder               quotient        |
        //
        // (Returning RDI:RSI instead of RDX:RAX here would yield the remainder.)
        jmp @@finish

      @@quick_ldiv:
        DIV     RBX                     // unsigned divide (raises on b = 0)
        XOR     RDX,RDX                 // quotient fits in 64 bits: high qword := 0
      @@finish:
        mov [r9],rax
        mov [r9+8],rdx
        POP     RDI
        POP     RSI
        POP     RBX
      end;
      {$ENDIF CPUX64}

    • 汇编2:在某些参数分布情况下,速度会快很多,其它情况因为多出一些判断指令,比第一种稍慢.第二种可以看作第一种方法的变种,第一种方法不做任何判断直接依次位移"1"相减求值,第二种则判断计算后,直接位移 "N" 再相减求值,但某些参数范围,每次判断计算的结果大多数就是第一种,这时 速度就比第一种慢了

      // Unsigned 128-bit integer division, "shift by N" variant: Result := a div b.
      //
      // Instead of shifting one bit per iteration, each round aligns the divisor
      // with the current dividend using the difference of their highest set bits,
      // subtracts the aligned divisor, and adds the matching power of two to the
      // quotient. When the operands are small enough a single hardware DIV
      // finishes the job.
      // a, b and Result are passed by reference (see the register notes below).
      function U128IntdivBsky(a, b: uint128): uint128;
      {$IFDEF CPUX86}
      // In: EAX -> a, EDX -> b, ECX -> Result
      // Scratch frame (64 bytes at EBP):
      //   [ebp]         @Result
      //   [ebp+4]       a.bsr - b.bsr for the current round
      //   [ebp+8]       b.bsr
      //   [ebp+12]      @tmpresult (= ebp+48)
      //   [ebp+16..31]  tmpb: shifted copy of b                 (EBX)
      //   [ebp+32..47]  tmpa: what is left of the dividend      (ESI)
      //   [ebp+48..63]  tmpresult: partial quotient (power of two)
      // NOTE(review): uint128.bsr and U128shl are defined elsewhere; this code
      // presumably relies on uint128.bsr returning a bit count in EAX and on
      // U128shl taking (EAX = @value, EDX = count, ECX = @result) - confirm.
      asm
        Pxor xmm0,xmm0
        MOVDQU [ecx],xmm0           // Result := 0
        push ebp
        push ebx
        push esi
        push edi
        sub esp,64
        mov ebp,esp
        mov edi,edx                 // EDI -> b
        mov [ebp],ecx               // save @Result
        mov ebx,ebp
        add ebx, 16                 // EBX -> tmpb
        mov esi,ebp
        add esi,32                  // ESI -> tmpa
        MOVDQU xmm0,[eax]
        MOVDQU [esi],xmm0           // tmpa := a
        mov [ebp+12],ebp
        add dword ptr [ebp+12],48   // [ebp+12] := @tmpresult
        MOVDQU xmm0,[edi]
        MOVDQU [ebx],xmm0           // tmpb := b
        mov eax,edi
        call uint128.bsr            // record method call (class methods are called differently)
        mov [ebp+8],eax             // [ebp+8] := b.bsr
      @@a_bsr:
        MOV dword ptr [ebp+48],1    // tmpresult := 1
        MOV dword ptr [ebp+52],0
        MOV dword ptr [ebp+56],0
        MOV dword ptr [ebp+60],0
        mov eax,esi
        call uint128.bsr            // EAX := tmpa.bsr
        cmp eax,[ebp+8]
        jb  @@end                   // tmpa.bsr < b.bsr: nothing more to add
        sub eax,[ebp+8]
        mov [ebp+4],eax             // [ebp+4] := tmpa.bsr - b.bsr
        cmp dword ptr [ebp+8],32
        ja @@a_Compare_b            // b.bsr > 32: no hardware DIV
        cmp eax,32
        jnl @@a_Compare_b           // bit difference >= 32: no hardware DIV
        // b.bsr <= 32 and difference < 32: finish with one 64/32 DIV.
        mov eax, [esi]              // EDX:EAX := low 64 bits of tmpa
        mov edx,[esi+4]
        div dword ptr [edi]
        mov ecx,[ebp]
        add [ecx],eax               // Result := Result + quotient
        adc dword ptr [ecx+4],0
        adc dword ptr [ecx+8],0
        adc dword ptr [ecx+12],0
        jmp @@end
      @@a_Compare_b:
        // Compare tmpa with b, most significant dword first.
        mov eax,[esi+12]
        cmp eax,[edi+12]
        ja @@dayu
        jb @@end
        mov eax,[esi+8]
        cmp eax,[edi+8]
        ja @@dayu
        jb @@end
        mov eax,[esi+4]
        cmp eax,[edi+4]
        ja @@dayu
        jb @@end
        mov eax,[esi]
        cmp eax,[edi]
        ja @@dayu
        jb @@end
      @@dengyu:                     // "equal": Result := Result + tmpresult, done
        mov edx,[ebp]
        mov eax,[ebp+48]
        add [edx],eax
        mov eax,[ebp+52]
        adc [edx+4],eax
        mov eax,[ebp+56]
        adc [edx+8],eax
        mov eax,[ebp+60]
        adc [edx+12],eax
        jmp @@end
      @@dayu:                       // "greater": tmpa > b
        mov eax,edi                 // U128shl arguments: EAX = @b, EDX = count, ECX = @tmpb
        mov edx,[ebp+4]
        cmp edx,0
        je @@a_sub_tmpb             // bit difference = 0: just tmpa := tmpa - b
        mov ecx,ebx
        call U128shl                // tmpb := b shl (tmpa.bsr - b.bsr)
        mov eax,[ebp+12]
        mov edx,[ebp+4]
        mov ecx,[ebp+12]
        call U128shl                // tmpresult := tmpresult shl (tmpa.bsr - b.bsr)
        // Compare tmpa with the shifted tmpb.
        mov eax,[esi+12]
        cmp eax,[ebx+12]
        jb @@tmpb_shr_1
        ja @@a_sub_tmpb
        mov eax,[esi+8]
        cmp eax,[ebx+8]
        jb @@tmpb_shr_1
        ja @@a_sub_tmpb
        mov eax,[esi+4]
        cmp eax,[ebx+4]
        jb @@tmpb_shr_1
        ja @@a_sub_tmpb
        mov eax,[esi]
        cmp eax,[ebx]
        jb @@tmpb_shr_1
        ja @@a_sub_tmpb
        jmp @@dengyu
      @@tmpb_shr_1:                 // overshoot: step tmpb and tmpresult back one bit
        shr dword ptr [ebx+12],1
        rcr dword ptr [ebx+8],1
        rcr dword ptr [ebx+4],1
        rcr dword ptr [ebx],1
        shr dword ptr [ebp+60],1
        rcr dword ptr [ebp+56],1
        rcr dword ptr [ebp+52],1
        rcr dword ptr [ebp+48],1
      @@a_sub_tmpb:
        mov eax,[ebx]               // tmpa := tmpa - tmpb
        sub [esi], eax
        mov eax,[ebx+4]
        sbb [esi+4],eax
        mov eax,[ebx+8]
        sbb [esi+8],eax
        mov eax,[ebx+12]
        sbb [esi+12],eax
        mov edx,[ebp]               // Result := Result + tmpresult
        mov eax,[ebp+48]
        add [edx],eax
        mov eax,[ebp+52]
        adc [edx+4],eax
        mov eax,[ebp+56]
        adc [edx+8],eax
        mov eax,[ebp+60]
        adc [edx+12],eax
        MOVDQU xmm0,[edi]
        MOVDQU [ebx],xmm0           // restore tmpb := b for the next round
        jmp @@a_bsr
      @@end:
        add esp,64
        pop edi
        pop esi
        pop ebx
        pop ebp
      end;
      {$ENDIF CPUX86}
      {$IFDEF CPUX64}
      asm
        // In: RDX -> a, R8 -> b, RCX -> Result
        // Working registers:
        //   RDI:RSI  tmpa (what is left of the dividend)
        //   R8:RBX   b
        //   R11:RBP  tmpb (shifted copy of b)
        //   R13:R12  tmpresult (partial quotient, a power of two)
        //   R10      b.bsr (1-based index of b's highest set bit)
        //   RCX      tmpa.bsr, then tmpa.bsr - b.bsr
        //   R9       @Result
        // A zero divisor returns Result = 0 (see @@b_bsr_lo).
        Pxor xmm0,xmm0
        MOVDQU [rcx],xmm0           // Result := 0
        push rbp
        push rbx
        push rsi
        push rdi
        push r12
        push r13
        mov r9,rcx                  // R9 := @Result
        MOV rsi,[rdx]               // RSI := a.lo
        mov rdi,[rdx+8]             // RDI := a.hi
        mov rbx,[r8]                // RBX := b.lo
        mov r8,[r8+8]               // R8  := b.hi
      @@b_bsr:

        bsr r10,r8
        jz @@b_bsr_lo               // b.hi = 0: look in b.lo
        add r10,64
        jmp @@b_bsr_end
      @@b_bsr_lo:
        bsr r10,rbx
        jz @@end                    // b = 0: leave Result = 0
      @@b_bsr_end:
        Inc r10                     // R10 := b.bsr (1-based)

      @@a_bsr:
        mov rbp,rbx                 // tmpb := b
        mov r11,r8

        mov r12,1                   // tmpresult := 1
        mov r13,0

        bsr rcx,rdi
        jz @@a_bsr_lo               // tmpa.hi = 0: look in tmpa.lo
        add rcx,64
        jmp @@a_bsr_end
      @@a_bsr_lo:
        bsr rcx,rsi
        jz @@end                    // tmpa = 0: done
      @@a_bsr_end:
        inc rcx                     // RCX := tmpa.bsr (1-based)

        cmp rcx,r10
        jb  @@end                   // tmpa.bsr < b.bsr: nothing more to add
        sub rcx,r10                 // RCX := tmpa.bsr - b.bsr
        cmp rcx,64
        jb @@b_bsr_cmp_64
        // Difference >= 64: shift by (difference - 64), then move the low
        // qword into the high qword to complete the shift.
        sub rcx,64
        shl rbp,cl
        mov r11,rbp
        mov rbp,0
        shl r12,cl
        mov r13,r12
        mov r12,0
        jmp @@a_cmp_tmpb
      @@b_bsr_cmp_64:
        cmp r10,64
        jna @@dirct_div             // b.bsr <= 64 and difference < 64: one hardware DIV
      @@a_Compare_b:
        cmp rdi,r11                 // compare tmpa with b (tmpb is still unshifted)
        ja @@dayu
        jb @@end
        cmp rsi,rbp
        ja @@dayu
        jb @@end
      @@dengyu:                     // "equal": Result := Result + tmpresult, done
        add [r9],r12
        adc [r9+8],r13
        jmp @@end
      @@dayu:                       // "greater": tmpa > b
        JRCXZ @@a_sub_tmpb          // bit difference = 0: just tmpa := tmpa - b
      @@b_shl_rcx:
        shld r11 ,rbp,cl
        shl rbp,cl                  // tmpb := b shl (tmpa.bsr - b.bsr)
        shld r13,r12,cl
        shl r12,cl                  // tmpresult := 1 shl (tmpa.bsr - b.bsr)
      @@a_cmp_tmpb:
        cmp rdi,r11                 // compare tmpa with the shifted tmpb
        jb @@tmpb_shr_1
        ja @@a_sub_tmpb
        cmp rsi,rbp
        jb @@tmpb_shr_1
        ja @@a_sub_tmpb
        add [r9],r12                // equal: Result := Result + tmpresult, done
        adc [r9+8],r13
        jmp @@end
      @@tmpb_shr_1:                 // overshoot: step tmpb and tmpresult back one bit
        shr r11,1
        rcr rbp,1
        shr r13,1
        rcr r12,1
      @@a_sub_tmpb:
        sub rsi,rbp                 // tmpa := tmpa - tmpb
        sbb rdi, r11
        add [r9],r12                // Result := Result + tmpresult
        adc [r9+8],r13              // high qword takes register R13, not the immediate 13
        jmp @@a_bsr
      @@dirct_div:
        mov rax, rsi                // RDX:RAX := tmpa
        mov rdx, rdi
        div rbx                     // RAX := tmpa div b.lo
        add [r9],rax                // Result := Result + quotient
        adc qword ptr [r9+8],0
      @@end:
        pop r13
        pop r12
        pop rdi
        pop rsi
        pop rbx
        pop rbp
      end;
      {$ENDIF CPUX64}
    • 测试比较:汇编两种方法速度,在参数不同时 各有高低,但比纯pascal那是快太多了
      ----------- 32位 程序 -------------
      --------------- div -------------
      ----------------------------
      开始1千万次 Asm div: U= a div b  1180591620717411303 DIV 253452334 =4658042015
      1千万次Asm div 耗费 13852 毫秒
      ----------------------------
      开始1千万次 pascal DIV:U128divPas(A, B) | U128DIVPas(1180591620717411303,253452334) =4658042015
      1千万次pascal DIV 耗费 35272 毫秒
      ----------------------------
      开始1千万次 Asm besky div: U= a div b  1180591620717411303 DIV 253452334 =4658042015
      1千万次Asm besky div 耗费 1388 毫秒
      ----------- 64位 程序 -------------
      --------------- div -------------
      ----------------------------
      开始1千万次 Asm div: U= a div b  1180591620717411303 DIV 253452334 =4658042015
      1千万次Asm div 耗费 374 毫秒
      ----------------------------
      开始1千万次 pascal DIV:U128divPas(A, B) | U128DIVPas(1180591620717411303,253452334) =4658042015
      1千万次pascal DIV 耗费 19188 毫秒
      ----------------------------
      开始1千万次 Asm besky div: U= a div b  1180591620717411303 DIV 253452334 =4658042015
      1千万次Asm besky div 耗费 718 毫秒

  11. 模(mod):汇编情况和div 几乎一样的,代码也差别很小就不贴出来了
    • 纯pascal
      // Pure-Pascal unsigned 128-bit modulo: returns a mod M.
      // The work is done by the nested recursive helper below.
      function U128MODPas(a, M: uint128): uint128;

        // Computes aa mod mm by repeatedly subtracting a shifted copy of mm:
        // mm is shifted left until its top bit lines up with the top bit of aa
        // (and shifted back by one if that overshoots), subtracted from aa, and
        // the helper recurses on the difference. Once UintsHighBit(aa) is at
        // most 64 the result is taken with the native mod on the Lo fields.
        function ModU128(aa, mm: uint128): uint128;
        var
          order, shift, topA, topM: integer;
          chunk, rest: uint128;
        begin
          Result := 0;
          topM := UintsHighBit(mm);
          order := CompareValue(aa, mm);

          // aa = mm: the remainder is zero.
          if order = EqualsValue then
            exit;

          // aa < mm: aa is already the remainder.
          if order = LessThanValue then
          begin
            Result := aa;
            exit;
          end;

          // aa > mm from here on.
          topA := UintsHighBit(aa);
          if topA <= 64 then
          begin
            Result := Int128Rec(aa).Lo mod Int128Rec(mm).Lo;
            exit;
          end;

          // Align mm with aa; step back one bit if the aligned value exceeds aa.
          shift := topA - topM;
          chunk := mm shl shift;
          if CompareValue(aa, chunk) = LessThanValue then
            chunk := chunk shr 1;

          // Reduce and continue with what is left.
          rest := aa - chunk;
          Result := ModU128(rest, mm);
        end;

      begin
        Result := ModU128(a, M);
      end;
    • 测试比较

      ----------- 32位 程序 -------------
      --------------- mod -------------
      ----------------------------
      开始1千万次 Asm mod: U= a mod b  1180591620717411303 mod 253452334 =145598293
      1千万次Asm mod 耗费 1139 毫秒
      ----------------------------
      开始1千万次 pascal mod:U128modPas(A, B) | U128shlPas(1180591620717411303,253452334) =145598293
      1千万次pascal mod 耗费 3744 毫秒
      ----------------------------
      开始1千万次 Asm Bskymod: U= a mod b  1180591620717411303 mod 253452334 =145598293
      1千万次Asm Bskymod 耗费 1202 毫秒
      ----------- 64位 程序 -------------
      --------------- mod -------------
      ----------------------------
      开始1千万次 Asm mod: U= a mod b  1180591620717411303 mod 253452334 =145598293
      1千万次Asm mod 耗费 390 毫秒
      ----------------------------
      开始1千万次 pascal mod:U128modPas(A, B) | U128shlPas(1180591620717411303,253452334) =145598293
      1千万次pascal mod 耗费 983 毫秒
      ----------------------------
      开始1千万次 Asm Bskymod: U= a mod b  1180591620717411303 mod 253452334 =145598293
      1千万次Asm Bskymod 耗费 671 毫秒
  12. 总结:
    • 越复杂的运算,汇编的速度优势越明显,同时汇编调试越困难
    • X64总是比x86快,
    • sse的对齐问题,使得 sse不是每种情况都适用.
    • 算法很重要,好的算法可以百倍千倍的提高效率

一点感慨:在编写这些汇编代码时,仿佛又回到了大学在实验室写汇编的日子,那台80286,那些8051、8098单片开发机、示波仪。第一次通过232接口pc联通程控电话交换机主板上的51的喜悦和兴奋,第一次见到光隔时的神奇。操作毕业设计那套让我程序控制的随心所欲的4自由度的高频淬火数字机床的爽快。还有实验室门外的我最喜欢的足球场。哦!还有总是在那块草地的柏树下对我笑的姑娘。

附测试程序下载:FuncTestVcl20130208.rar

0

阅读 评论 收藏 转载 喜欢 打印举报
  • 评论加载中,请稍候...
发评论

    发评论

    以上网友发言只代表其个人观点,不代表新浪网的观点或立场。

      

    新浪BLOG意见反馈留言板 电话:4006900000 提示音后按1键(按当地市话标准计费) 欢迎批评指正

    新浪简介 | About Sina | 广告服务 | 联系我们 | 招聘信息 | 网站律师 | SINA English | 会员注册 | 产品答疑

    新浪公司 版权所有