// AVX dot product of two int32 slices — Go (Plan 9) assembly, amd64.
// func _dpavx_int32clang(a, b, gN, res *int32)
// Computes the dot product of two int32 vectors of length *gN and stores
// the (low 32 bits of the) result at *res. Vector path handles 16 elements
// per iteration with four independent partial sums; a scalar loop handles
// the remainder (or everything when n < 16).
//
// Register roles:
//   DI = a (first vector base)      SI = b (second vector base)
//   R8 = n (loaded from *gN)        R9 = n &^ 15 (vector-loop element count)
//   AX = vector index, then running scalar accumulator
//   X0-X3 = four 4-lane int32 partial sums
//
// Flag 4 = NOSPLIT; frame size 0, four 8-byte pointer args (32 bytes).
TEXT ·_dpavx_int32clang(SB), 4, $0-32
	MOVQ a+0(FP), DI
	MOVQ b+8(FP), SI
	MOVQ gN+16(FP), DX
	MOVQ res+24(FP), CX
	MOVQ (DX), R8              // n = *gN
	CMPQ R8, $0
	JLE  LBB0_1                // bug fix: was LBB0_8, which stored uninitialized AX for n <= 0
	CMPQ R8, $15
	JA   LBB0_4                // n >= 16: take the vector path
	XORQ R9, R9                // n < 16: scalar loop handles all elements
	XORQ AX, AX                // accumulator = 0
	JMP  LBB0_7                // bug fix: was LBB0_8, which skipped the scalar loop entirely

LBB0_1:
	XORQ AX, AX                // n <= 0: dot product is 0
	JMP  LBB0_8

LBB0_4:
	MOVQ  R8, R9
	ANDQ  $-16, R9             // R9 = n &^ 15: elements covered by the vector loop
	VPXOR X0, X0, X0           // zero the four partial-sum registers
	XORQ  AX, AX               // AX = element index
	VPXOR X1, X1, X1
	VPXOR X2, X2, X2
	VPXOR X3, X3, X3

LBB0_5:
	// Load 16 int32 from b, multiply element-wise with the matching 16
	// int32 from a, accumulate into four independent sums to hide latency.
	VMOVDQU (SI)(AX*4), X4     // b[i .. i+3]
	VMOVDQU 16(SI)(AX*4), X5   // b[i+4 .. i+7]
	VMOVDQU 32(SI)(AX*4), X6   // b[i+8 .. i+11]
	VMOVDQU 48(SI)(AX*4), X7   // b[i+12 .. i+15]

	VPMULLD (DI)(AX*4), X4, X4
	VPADDD  X0, X4, X0

	VPMULLD 16(DI)(AX*4), X5, X4   // bug fix: +16 byte offset was missing
	VPADDD  X1, X4, X1
	VPMULLD 32(DI)(AX*4), X6, X4   // bug fix: +32 byte offset was missing
	VPMULLD 48(DI)(AX*4), X7, X5   // bug fix: +48 byte offset was missing
	VPADDD  X2, X4, X2
	VPADDD  X3, X5, X3
	ADDQ    $16, AX
	CMPQ    AX, R9
	JNE     LBB0_5

	// Fold the four partial sums into one 128-bit register ...
	VPADDD X0, X1, X0
	VPADDD X0, X2, X0
	VPADDD X0, X3, X0

	// ... then horizontally reduce the four int32 lanes to a scalar.
	VPSHUFD $78, X0, X1        // swap 64-bit halves
	VPADDD  X1, X0, X0
	VPSHUFD $229, X0, X1       // replicate lane 1
	VPADDD  X1, X0, X0

	VMOVD X0, AX               // AX = vector-path partial result
	CMPQ  R8, R9
	JE    LBB0_8               // no remainder: done

LBB0_7:
	// Scalar tail: remaining n % 16 elements (or all elements when n < 16).
	MOVL  (SI)(R9*4), DX       // DX = b[i]
	IMULL (DI)(R9*4), DX       // DX = a[i] * b[i] (low 32 bits)
	ADDQ  DX, AX
	ADDQ  $1, R9
	CMPQ  R9, R8
	JNE   LBB0_7

LBB0_8:
	MOVL AX, (CX)              // bug fix: was MOVQ (CX),AX — a load, not the intended store of the result
	VZEROUPPER                 // defensive; only XMM regs used, but cheap
	RET