// _dpavx_int32clang: 32-bit integer dot product, res[0] = sum(a[i] * b[i]).
//
// Go (Plan 9) amd64 assembly ported from clang output; requires AVX
// (VEX-encoded 128-bit integer instructions).
//
// Arguments (four 8-byte words on the Go stack frame, $0-32):
//   a   +0(FP)   pointer to the first  int32 vector
//   b   +8(FP)   pointer to the second int32 vector
//   gN  +16(FP)  pointer to the element count
//   res +24(FP)  pointer to the int32 that receives the result
//
// NOTE(review): the Go-side declaration is not visible from this file.
// The count is read as a sign-extended 32-bit value and the result is
// written as 32 bits, matching the reference disassembly; confirm that
// the caller passes int32-compatible storage for gN and res.
//
// Products and sums wrap modulo 2^32. A count <= 0 stores 0.
//
// Register roles:
//   DI = a, SI = b, CX = res, R8 = n
//   R9 = n rounded down to a multiple of 16, then the scalar-tail index
//   AX = vector-loop index, then the running 32-bit sum
//   DX = gN pointer on entry, scalar-tail scratch afterwards
//   X0-X3 = four independent 4-lane accumulators, X4-X7 = temporaries
//
// Text flag 4 is NOSPLIT.
TEXT ·_dpavx_int32clang(SB), 4, $0-32
	MOVQ a+0(FP), DI
	MOVQ b+8(FP), SI
	MOVQ gN+16(FP), DX
	MOVQ res+24(FP), CX

	MOVLQSX (DX), R8              // n = (int64)(int32)*gN
	XORL    AX, AX                // sum = 0 (also the vector-loop index start)
	XORL    R9, R9                // tail index = 0
	TESTQ   R8, R8
	JLE     store                 // n <= 0: result is 0
	CMPQ    R8, $15
	JBE     tail                  // fewer than 16 elements: scalar loop only

	// Vector part: 16 elements per iteration, four accumulators.
	MOVQ    R8, R9
	ANDQ    $-16, R9              // R9 = n & ~15, number of vectorised elements
	VPXOR   X0, X0, X0
	VPXOR   X1, X1, X1
	VPXOR   X2, X2, X2
	VPXOR   X3, X3, X3

vec_loop:
	VMOVDQU (SI)(AX*4), X4        // b[i+0 .. i+3]
	VMOVDQU 16(SI)(AX*4), X5      // b[i+4 .. i+7]
	VMOVDQU 32(SI)(AX*4), X6      // b[i+8 .. i+11]
	VMOVDQU 48(SI)(AX*4), X7      // b[i+12 .. i+15]
	VPMULLD (DI)(AX*4), X4, X4    // multiply by the matching slice of a
	VPMULLD 16(DI)(AX*4), X5, X5
	VPMULLD 32(DI)(AX*4), X6, X6
	VPMULLD 48(DI)(AX*4), X7, X7
	VPADDD  X4, X0, X0
	VPADDD  X5, X1, X1
	VPADDD  X6, X2, X2
	VPADDD  X7, X3, X3
	ADDQ    $16, AX
	CMPQ    AX, R9
	JNE     vec_loop

	// Fold the four accumulators, then reduce the four lanes to lane 0.
	VPADDD  X1, X0, X0
	VPADDD  X2, X0, X0
	VPADDD  X3, X0, X0
	VPSHUFD $78, X0, X1           // swap the 64-bit halves
	VPADDD  X1, X0, X0
	VPSHUFD $229, X0, X1          // bring lane 1 down to lane 0
	VPADDD  X1, X0, X0
	VMOVD   X0, AX                // AX = horizontal sum of the vector part
	CMPQ    R9, R8
	JE      store                 // n was a multiple of 16: no tail

tail:
	// Scalar remainder: R9 runs from the first unprocessed index up to n.
	MOVL    (SI)(R9*4), DX
	IMULL   (DI)(R9*4), DX
	ADDL    DX, AX
	ADDQ    $1, R9
	CMPQ    R9, R8
	JNE     tail

store:
	MOVL    AX, (CX)              // *res = sum
	VZEROUPPER                    // kept from the compiler output
	RET