M2_IHPS
/
GLCS-CM10-2024


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445
							// definition of func _dpavx_int32(a *int32, b *int32, gN *int32, res *int32)
// Integer dot product of two int32 arrays using AVX2.
//
// Computes  *res = sum(a[i] * b[i])  for i in [0, *gN), using 32-bit
// wrap-around arithmetic (VPMULLD keeps the low 32 bits of each product and
// all additions are 32-bit).
//
// Frame declaration "$32-32":
//   $32  size in bytes of the local stack frame (scratch for one YMM register)
//   -32  size in bytes of the arguments passed by the caller (four pointers)
// The flag value 4 is NOSPLIT.
//
// Arguments (all 8-byte pointers on the caller's frame):
//   a+0(FP)    first input array
//   b+8(FP)    second input array
//   gN+16(FP)  pointer to the element count n
//   res+24(FP) pointer that receives the scalar result
//
// Register roles:
//   DI = cursor into a          SI = cursor into b
//   R8 = n                      R10 = n rounded down to a multiple of 8
//   AX = elements consumed      DX  = scalar sum
//   R9 = res                    Y4  = 8-lane vector accumulator
//
// Both arrays are read with unaligned loads, so no alignment is required.
// n <= 0 stores 0; an n that is not a multiple of 8 is finished by a scalar
// tail loop, so no element outside [0, n) is ever read.
TEXT ·_dpavx_int32(SB),4, $32-32
	MOVQ a+0(FP), DI
	MOVQ b+8(FP), SI
	MOVQ gN+16(FP), CX
	MOVQ res+24(FP), R9
	MOVLQSX (CX), R8            // n = *gN: 32-bit load (gN points at an int32), sign-extended

	VPXOR Y4, Y4, Y4            // vector accumulator = 0
	XORQ AX, AX                 // consumed = 0
	MOVQ R8, R10
	ANDQ $-8, R10               // vecEnd = n with the low 3 bits cleared

	// Skip the vector loop when there is no full group of 8 (also covers n <= 0).
	CMPQ AX, R10
	JGE reduce

// Vector loop: Y4 += a[i..i+7] * b[i..i+7], 8 elements per iteration.
vloop:
	VMOVDQU (SI), Y1
	VMOVDQU (DI), Y2
	VPMULLD Y1, Y2, Y3          // Y3 = Y2 * Y1, low 32 bits of each lane product
	VPADDD Y3, Y4, Y4           // Y4 += Y3
	ADDQ $32, SI
	ADDQ $32, DI
	ADDQ $8, AX
	CMPQ AX, R10
	JLT vloop                   // continue while consumed < vecEnd

// Horizontal reduction: spill the 32-byte (256-bit) accumulator to the local
// frame and add its eight int32 lanes into DX.
reduce:
	VMOVDQU Y4, d0-32(SP)
	VZEROUPPER                  // clear upper YMM halves before returning to non-AVX code
	LEAQ d0-32(SP), BX
	MOVQ $8, CX                 // number of lanes
	XORQ DX, DX                 // sum = 0
redux:
	ADDL (BX), DX
	ADDQ $4, BX
	DECQ CX
	JNZ redux

// Scalar tail: the remaining n - vecEnd elements (0..7 of them).
tail:
	CMPQ AX, R8
	JGE done
	MOVL (SI), R11
	MOVL (DI), R12
	IMULL R12, R11
	ADDL R11, DX
	ADDQ $4, SI
	ADDQ $4, DI
	INCQ AX
	JMP tail

done:
	MOVL DX, (R9)               // *res = sum
	RET