// dotproductavx_amd64.s -- Go (Plan 9 syntax) assembly for amd64, using 256-bit AVX integer instructions.
  // func _dpavx_int32(a *int32, b *int32, gN *int32, res *int32)
  //
  // Computes the int32 dot product of the arrays at a and b and stores it
  // in *res:  *res = a[0]*b[0] + a[1]*b[1] + ... + a[n-1]*b[n-1], n = *gN.
  // All arithmetic wraps modulo 2^32 (VPMULLD / IMULL keep the low 32 bits
  // of each product, VPADDD / ADDL wrap on overflow).
  //
  // TEXT header: flag 4 (NOSPLIT in Go's textflag.h -- confirm against the
  // toolchain in use), $32-32 = 32 bytes of local stack frame, 32 bytes of
  // arguments (four 8-byte pointers).
  //
  // Inputs:
  //   a, b  pointers to at least n int32 elements each. Loads use VMOVDQU,
  //         so no particular alignment is required.
  //   gN    pointer to the element count n (int32). n <= 0 yields *res = 0.
  //   res   pointer that receives the 32-bit result.
  //
  // Register roles:
  //   DI = cursor into a        SI = cursor into b
  //   R8 = n (sign-extended)    R10 = n rounded down to a multiple of 8
  //   AX = elements consumed    R9 = res pointer
  //   Y4 = 8-lane vector accumulator, DX = scalar accumulator
  //   Y1, Y2, Y3, BX, CX, R11 = scratch
  //
  // Requires a CPU with AVX2 (256-bit VPMULLD / VPADDD).
  TEXT ·_dpavx_int32(SB), 4, $32-32
      MOVQ a+0(FP), DI
      MOVQ b+8(FP), SI
      MOVQ gN+16(FP), CX
      MOVQ res+24(FP), R9
      // gN points at an int32: load exactly 4 bytes and sign-extend, so we
      // never read past the object and negative lengths stay negative.
      MOVLQSX (CX), R8

      VPXOR Y4, Y4, Y4                // vector accumulator = 0
      XORQ AX, AX                     // elements consumed = 0

      // R10 = number of elements that can be handled 8 at a time.
      MOVQ R8, R10
      ANDQ $-8, R10
      CMPQ R10, $0
      JLE reduce                      // fewer than 8 elements (or n <= 0)

  vecloop:
      // Y4 += a[AX..AX+7] * b[AX..AX+7], lane by lane.
      VMOVDQU (SI), Y1
      VMOVDQU (DI), Y2
      VPMULLD Y1, Y2, Y3
      VPADDD Y3, Y4, Y4
      ADDQ $32, SI
      ADDQ $32, DI
      ADDQ $8, AX
      CMPQ AX, R10
      JLT vecloop                     // loop while AX < R10

  reduce:
      // Spill the 32-byte (256-bit) accumulator to the local frame and add
      // its eight int32 lanes into the scalar accumulator DX.
      VMOVDQU Y4, d0-32(SP)
      // No YMM register is used past this point; clear the upper halves so
      // later SSE code does not pay an AVX/SSE transition penalty.
      VZEROUPPER
      LEAQ d0-32(SP), BX
      MOVQ $8, CX                     // lanes left to add
      XORL DX, DX                     // scalar accumulator = 0
  lanes:
      ADDL (BX), DX
      ADDQ $4, BX
      DECQ CX
      JNZ lanes

  tail:
      // Scalar loop for the n % 8 elements the vector loop did not cover.
      CMPQ AX, R8
      JGE done
      MOVL (DI), R11
      IMULL (SI), R11
      ADDL R11, DX
      ADDQ $4, DI
      ADDQ $4, SI
      INCQ AX
      JMP tail

  done:
      MOVL DX, (R9)                   // *res = dot product
      RET