
fixup! More vector optimizations.

Yawning Angel · 1 year ago · commit 383140faf9
1 changed file with 43 additions and 43 deletions:
  1. hwaccel_amd64.s (+43 −43)

hwaccel_amd64.s (+43 −43)
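
Every hunk in this fixup swaps an aligned AVX2 move (VMOVDQA) for its unaligned counterpart (VMOVDQU) in the pointwiseAccK2AVX2, pointwiseAccK3AVX2, and pointwiseAccK4AVX2 routines. VMOVDQA faults when its 32-byte memory operand is not 32-byte aligned, whereas VMOVDQU accepts any alignment and, on current x86-64 cores, costs essentially nothing extra when the data happens to be aligned anyway. Go only guarantees heap addresses up to the element type's own alignment requirement (2 bytes for uint16, at most 8 bytes on amd64), so the coefficient arrays these loops walk are not promised to sit on 32-byte boundaries; presumably that is what this fixup corrects. The sketch below is not part of the repository and uses a hypothetical isAligned32 helper; it only illustrates the alignment property in question.

// Minimal sketch (assumption, not repository code): checks whether a
// coefficient slice starts on the 32-byte boundary that VMOVDQA would
// require for a YMM memory operand.
package main

import (
	"fmt"
	"unsafe"
)

// isAligned32 reports whether the first element of coeffs lies on a
// 32-byte boundary.
func isAligned32(coeffs []uint16) bool {
	return uintptr(unsafe.Pointer(&coeffs[0]))%32 == 0
}

func main() {
	// Go only guarantees alignment up to the element type's own requirement
	// (unsafe.Alignof(uint16(0)) == 2), so nothing forces this address onto
	// a 32-byte boundary; an aligned AVX2 load from it could fault.
	coeffs := make([]uint16, 256)
	fmt.Printf("start %#x, 32-byte aligned: %v\n",
		uintptr(unsafe.Pointer(&coeffs[0])), isAligned32(coeffs))
	fmt.Println("Alignof(uint16):", unsafe.Alignof(uint16(0)))
}
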

@@ -2290,12 +2290,12 @@ TEXT ·pointwiseAccK2AVX2(SB), NOSPLIT, $0-24
 
 looptop2:
 	// load a
-	VMOVDQA (SI)(BX*1), Y4
-	VMOVDQA 32(SI)(BX*1), Y5
-	VMOVDQA 64(SI)(BX*1), Y6
-	VMOVDQA (R8)(BX*1), Y7
-	VMOVDQA 32(R8)(BX*1), Y8
-	VMOVDQA 64(R8)(BX*1), Y9
+	VMOVDQU (SI)(BX*1), Y4
+	VMOVDQU 32(SI)(BX*1), Y5
+	VMOVDQU 64(SI)(BX*1), Y6
+	VMOVDQU (R8)(BX*1), Y7
+	VMOVDQU 32(R8)(BX*1), Y8
+	VMOVDQU 64(R8)(BX*1), Y9
 
 	// mul montsq
 	VPMULLW Y2, Y4, Y3
@@ -2332,12 +2332,12 @@ looptop2:
 	VPSUBW  Y8, Y15, Y8
 
 	// load b
-	VMOVDQA (DX)(BX*1), Y9
-	VMOVDQA 32(DX)(BX*1), Y10
-	VMOVDQA 64(DX)(BX*1), Y11
-	VMOVDQA (R11)(BX*1), Y12
-	VMOVDQA 32(R11)(BX*1), Y13
-	VMOVDQA 64(R11)(BX*1), Y14
+	VMOVDQU (DX)(BX*1), Y9
+	VMOVDQU 32(DX)(BX*1), Y10
+	VMOVDQU 64(DX)(BX*1), Y11
+	VMOVDQU (R11)(BX*1), Y12
+	VMOVDQU 32(R11)(BX*1), Y13
+	VMOVDQU 64(R11)(BX*1), Y14
 
 	// mul
 	VPMULLW Y3, Y9, Y15
@@ -2394,9 +2394,9 @@ looptop2:
 	VPSUBW  Y10, Y7, Y7
 
 	// store
-	VMOVDQA Y5, (DI)(BX*1)
-	VMOVDQA Y6, 32(DI)(BX*1)
-	VMOVDQA Y7, 64(DI)(BX*1)
+	VMOVDQU Y5, (DI)(BX*1)
+	VMOVDQU Y6, 32(DI)(BX*1)
+	VMOVDQU Y7, 64(DI)(BX*1)
 
 	ADDQ $1, AX
 	ADDQ $96, BX
@@ -2404,10 +2404,10 @@ looptop2:
 	JB   looptop2
 
 	// load
-	VMOVDQA (SI)(BX*1), Y4
-	VMOVDQA (R8)(BX*1), Y7
-	VMOVDQA (DX)(BX*1), Y9
-	VMOVDQA (R11)(BX*1), Y12
+	VMOVDQU (SI)(BX*1), Y4
+	VMOVDQU (R8)(BX*1), Y7
+	VMOVDQU (DX)(BX*1), Y9
+	VMOVDQU (R11)(BX*1), Y12
 
 	// mul montsq
 	VPMULLW Y2, Y4, Y3
@@ -2448,7 +2448,7 @@ looptop2:
 	VPSUBW  Y8, Y5, Y5
 
 	// store
-	VMOVDQA Y5, (DI)(BX*1)
+	VMOVDQU Y5, (DI)(BX*1)
 
 	VZEROUPPER
 	RET
@@ -2475,12 +2475,12 @@ TEXT ·pointwiseAccK3AVX2(SB), NOSPLIT, $0-24
 
 looptop3:
 	// load a
-	VMOVDQA (SI)(BX*1), Y4
-	VMOVDQA 32(SI)(BX*1), Y5
-	VMOVDQA (R8)(BX*1), Y6
-	VMOVDQA 32(R8)(BX*1), Y7
-	VMOVDQA (R9)(BX*1), Y8
-	VMOVDQA 32(R9)(BX*1), Y9
+	VMOVDQU (SI)(BX*1), Y4
+	VMOVDQU 32(SI)(BX*1), Y5
+	VMOVDQU (R8)(BX*1), Y6
+	VMOVDQU 32(R8)(BX*1), Y7
+	VMOVDQU (R9)(BX*1), Y8
+	VMOVDQU 32(R9)(BX*1), Y9
 
 	// mul montsq
 	VPMULLW Y2, Y4, Y3
@@ -2517,12 +2517,12 @@ looptop3:
 	VPSUBW  Y8, Y15, Y8
 
 	// load b
-	VMOVDQA (DX)(BX*1), Y9
-	VMOVDQA 32(DX)(BX*1), Y10
-	VMOVDQA (R11)(BX*1), Y11
-	VMOVDQA 32(R11)(BX*1), Y12
-	VMOVDQA (R12)(BX*1), Y13
-	VMOVDQA 32(R12)(BX*1), Y14
+	VMOVDQU (DX)(BX*1), Y9
+	VMOVDQU 32(DX)(BX*1), Y10
+	VMOVDQU (R11)(BX*1), Y11
+	VMOVDQU 32(R11)(BX*1), Y12
+	VMOVDQU (R12)(BX*1), Y13
+	VMOVDQU 32(R12)(BX*1), Y14
 
 	// mul
 	VPMULLW Y3, Y9, Y15
@@ -2576,8 +2576,8 @@ looptop3:
 	VPSUBW  Y9, Y7, Y7
 
 	// store
-	VMOVDQA Y6, (DI)(BX*1)
-	VMOVDQA Y7, 32(DI)(BX*1)
+	VMOVDQU Y6, (DI)(BX*1)
+	VMOVDQU Y7, 32(DI)(BX*1)
 
 	ADDQ $1, AX
 	ADDQ $64, BX
@@ -2612,10 +2612,10 @@ TEXT ·pointwiseAccK4AVX2(SB), NOSPLIT, $0-24
 
 looptop4:
 	// load a
-	VMOVDQA (SI)(BX*1), Y6
-	VMOVDQA (R8)(BX*1), Y7
-	VMOVDQA (R9)(BX*1), Y8
-	VMOVDQA (R10)(BX*1), Y9
+	VMOVDQU (SI)(BX*1), Y6
+	VMOVDQU (R8)(BX*1), Y7
+	VMOVDQU (R9)(BX*1), Y8
+	VMOVDQU (R10)(BX*1), Y9
 
 	// mul montsq
 	VPMULLW Y2, Y6, Y5
@@ -2642,10 +2642,10 @@ looptop4:
 	VPSUBW  Y8, Y13, Y8
 
 	// load b
-	VMOVDQA (DX)(BX*1), Y9
-	VMOVDQA (R11)(BX*1), Y10
-	VMOVDQA (R12)(BX*1), Y11
-	VMOVDQA (R13)(BX*1), Y12
+	VMOVDQU (DX)(BX*1), Y9
+	VMOVDQU (R11)(BX*1), Y10
+	VMOVDQU (R12)(BX*1), Y11
+	VMOVDQU (R13)(BX*1), Y12
 
 	// mul
 	VPMULLW Y5, Y9, Y4
@@ -2683,7 +2683,7 @@ looptop4:
 	VPSUBW  Y8, Y7, Y8
 
 	// store
-	VMOVDQA Y8, (DI)(BX*1)
+	VMOVDQU Y8, (DI)(BX*1)
 
 	ADDQ $1, AX
 	ADDQ $32, BX