123456789101112131415161718192021222324252627282930313233343536373839404142434445 |
// func _dpavx_int32(a *int32, b *int32, gN *int32, res *int32)
//
// Stores in *res the dot product of the two int32 arrays at a and b, each
// holding *gN elements. Every multiply and add is a 32-bit operation, so
// the result wraps modulo 2^32. Requires AVX2 (256-bit VPMULLD / VPADDD).
//
// The arrays need no particular alignment (unaligned loads are used) and
// *gN need not be a multiple of 8: full groups of 8 elements go through the
// vector loop, the remaining 0..7 elements through a scalar loop. A length
// <= 0 yields *res = 0 without reading a or b.
//
// TEXT flag 4 is NOSPLIT. In $32-32 the first 32 is the size in bytes of the
// local stack frame (one 256-bit spill slot, d0); the second 32 is the size
// in bytes of the arguments (four 8-byte pointers).
//
// Register roles:
//   DI  = cursor into a            SI  = cursor into b
//   R8  = n (*gN)                  R10 = n rounded down to a multiple of 8
//   AX  = elements consumed so far DX  = scalar running sum
//   R9  = res                      Y4  = 8-lane vector accumulator
//   BX, CX, R11, Y1-Y3 = scratch
TEXT ·_dpavx_int32(SB), 4, $32-32
	MOVQ a+0(FP), DI
	MOVQ b+8(FP), SI
	MOVQ gN+16(FP), CX
	MOVQ res+24(FP), R9
	MOVL (CX), R8              // n = *gN; 32-bit load because gN points at an int32

	VPXOR Y4, Y4, Y4           // vector accumulator = 0
	XORL DX, DX                // scalar sum = 0
	XORL AX, AX                // consumed = 0

	MOVL R8, R10
	ANDL $-8, R10              // R10 = n with the low 3 bits cleared
	CMPL AX, R10
	JGE  tail                  // fewer than 8 elements (or n <= 0): no vector work

vecloop:
	// Invariant: AX < R10, so at least 8 unread elements remain in each array.
	VMOVDQU (SI), Y1
	VMOVDQU (DI), Y2
	VPMULLD Y1, Y2, Y3         // Y3 = low 32 bits of each lane product
	VPADDD  Y3, Y4, Y4         // Y4 += Y3
	ADDQ $32, SI
	ADDQ $32, DI
	ADDL $8, AX
	CMPL AX, R10
	JLT  vecloop

	// Horizontal reduction: spill the 8 lanes to the 32-byte (256-bit)
	// frame slot and add them into the scalar sum.
	VMOVDQU Y4, d0-32(SP)
	LEAQ d0-32(SP), BX
	MOVQ $8, CX                // number of int32 lanes in a YMM register
redux:
	ADDL (BX), DX
	ADDQ $4, BX
	DECQ CX
	JNZ  redux

tail:
	// Scalar loop for the elements the vector loop did not cover.
	CMPL AX, R8
	JGE  done
tailloop:
	MOVL  (SI), R11
	IMULL (DI), R11
	ADDL  R11, DX
	ADDQ $4, SI
	ADDQ $4, DI
	INCL AX
	CMPL AX, R8
	JLT  tailloop

done:
	MOVL DX, (R9)              // *res = sum
	VZEROUPPER                 // clear upper YMM state before returning to non-AVX code
	RET
|