| 123456789101112131415161718192021222324252627282930313233343536373839404142434445 | 
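// Definition of func _dpavx_int32(a *int32, b *int32, gN *int32, res *int32).
//
// In the TEXT directive below, "$32-32" reads as follows: the first 32 is the
// size in bytes of the stack frame, and the 32 after the dash is the size in
// bytes of the arguments passed in by the caller (four 8-byte pointers).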
 
// func _dpavx_int32(a *int32, b *int32, gN *int32, res *int32)
//
// Stores into *res the dot product of the int32 arrays a and b, each holding
// *gN elements. All arithmetic is 32-bit and wraps modulo 2^32. If *gN <= 0
// nothing is read from a or b and 0 is stored.
//
// Syntax: Go (Plan 9) assembler, AMD64, stack-based arguments via FP.
// Requires AVX2 (VPMULLD / VPADDD on YMM registers).
// Flag 4 is the value of NOSPLIT in Go's textflag.h.
// Frame: 32 bytes, used to spill the 8-lane accumulator before the
// horizontal sum. No alignment is required of a or b (VMOVDQU is unaligned).
//
// Register roles:
//   DI  = cursor into a            SI  = cursor into b
//   R8  = n (sign-extended *gN)    R9  = res
//   DX  = n rounded down to a multiple of 8 (vector-loop bound)
//   AX  = number of elements consumed so far
//   BX  = scalar running sum       CX, R10 = scratch
//   Y1, Y2, Y3 = vector temporaries, Y4 = 8-lane accumulator
TEXT ·_dpavx_int32(SB), 4, $32-32
	MOVQ a+0(FP), DI
	MOVQ b+8(FP), SI
	MOVQ gN+16(FP), CX
	MOVQ res+24(FP), R9
	MOVLQSX (CX), R8            // gN points at 4 bytes only; never load 8 here

	VPXOR Y4, Y4, Y4            // vector accumulator = 0
	XORL BX, BX                 // scalar sum = 0
	XORL AX, AX                 // consumed = 0

	MOVQ R8, DX
	ANDQ $-8, DX                // DX = n & ~7; stays negative when n < 0
	CMPQ AX, DX
	JGE reduce                  // fewer than 8 elements (or n <= 0): skip vector loop

vecloop:
	// Invariant: AX < DX, so at least 8 unread elements remain in a and b.
	VMOVDQU (SI), Y1
	VMOVDQU (DI), Y2
	VPMULLD Y1, Y2, Y3          // Y3 = low 32 bits of each lane product
	VPADDD Y3, Y4, Y4           // Y4 += Y3
	ADDQ $32, SI
	ADDQ $32, DI
	ADDQ $8, AX
	CMPQ AX, DX
	JLT vecloop

reduce:
	// Spill the 256-bit (32-byte) accumulator and add its 8 lanes into BX.
	VMOVDQU Y4, d0-32(SP)
	VZEROUPPER                  // leave the upper YMM halves clean for SSE code in the caller
	LEAQ d0-32(SP), CX
	MOVQ $8, R10                // lanes left to add

lanes:
	ADDL (CX), BX
	ADDQ $4, CX
	DECQ R10
	JNZ lanes

	// Scalar tail: the remaining n - AX elements (0..7).
	CMPQ AX, R8
	JGE done

tail:
	MOVL (DI), CX
	IMULL (SI), CX              // CX = a[i] * b[i], low 32 bits
	ADDL CX, BX
	ADDQ $4, SI
	ADDQ $4, DI
	INCQ AX
	CMPQ AX, R8
	JLT tail

done:
	MOVL BX, (R9)               // *res = dot product
	RET
 
 
  |