Browse Source

fixup! Low hanging fruit vector optimizations.

Yawning Angel 1 year ago
parent
commit
d91023c93a
1 changed file with 48 additions and 48 deletions
  1. 48 48
      hwaccel_amd64.s

+ 48 - 48
hwaccel_amd64.s

@@ -1438,10 +1438,10 @@ TEXT ·invnttAVX2(SB), NOSPLIT, $0-16
 	VPERM2I128 $0x13, Y9, Y10, Y9
 
 	// update
-	VMOVDQU Y11, Y12
-	VMOVDQU Y4, Y13
-	VMOVDQU Y6, Y14
-	VMOVDQU Y8, Y15
+	VMOVDQA Y11, Y12
+	VMOVDQA Y4, Y13
+	VMOVDQA Y6, Y14
+	VMOVDQA Y8, Y15
 	VPADDW  Y11, Y3, Y10
 	VPADDW  Y4, Y5, Y4
 	VPADDW  Y6, Y7, Y6
@@ -1483,10 +1483,10 @@ TEXT ·invnttAVX2(SB), NOSPLIT, $0-16
 
 	// level 5
 	// update
-	VMOVDQU Y10, Y12
-	VMOVDQU Y3, Y13
-	VMOVDQU Y6, Y14
-	VMOVDQU Y7, Y15
+	VMOVDQA Y10, Y12
+	VMOVDQA Y3, Y13
+	VMOVDQA Y6, Y14
+	VMOVDQA Y7, Y15
 	VPADDW  Y10, Y4, Y10
 	VPADDW  Y3, Y5, Y3
 	VPADDW  Y6, Y8, Y6
@@ -1536,10 +1536,10 @@ TEXT ·invnttAVX2(SB), NOSPLIT, $0-16
 
 	// level 6
 	// update
-	VMOVDQU Y10, Y12
-	VMOVDQU Y3, Y13
-	VMOVDQU Y4, Y14
-	VMOVDQU Y5, Y15
+	VMOVDQA Y10, Y12
+	VMOVDQA Y3, Y13
+	VMOVDQA Y4, Y14
+	VMOVDQA Y5, Y15
 	VPADDW  Y10, Y6, Y10
 	VPADDW  Y3, Y7, Y3
 	VPADDW  Y4, Y8, Y4
@@ -1643,10 +1643,10 @@ TEXT ·invnttAVX2(SB), NOSPLIT, $0-16
 
 	// level 0
 	// update
-	VMOVDQU Y4, Y12
-	VMOVDQU Y6, Y13
-	VMOVDQU Y8, Y14
-	VMOVDQU Y10, Y15
+	VMOVDQA Y4, Y12
+	VMOVDQA Y6, Y13
+	VMOVDQA Y8, Y14
+	VMOVDQA Y10, Y15
 	VPADDW  Y4, Y5, Y4
 	VPADDW  Y6, Y7, Y6
 	VPADDW  Y8, Y9, Y8
@@ -1707,10 +1707,10 @@ TEXT ·invnttAVX2(SB), NOSPLIT, $0-16
 	VPBLENDW $0xAA, Y11, Y14, Y11
 
 	// update
-	VMOVDQU Y4, Y12
-	VMOVDQU Y6, Y13
-	VMOVDQU Y8, Y14
-	VMOVDQU Y10, Y15
+	VMOVDQA Y4, Y12
+	VMOVDQA Y6, Y13
+	VMOVDQA Y8, Y14
+	VMOVDQA Y10, Y15
 	VPADDW  Y4, Y5, Y4
 	VPADDW  Y6, Y7, Y6
 	VPADDW  Y8, Y9, Y8
@@ -1788,10 +1788,10 @@ TEXT ·invnttAVX2(SB), NOSPLIT, $0-16
 	VPBLENDD $0xAA, Y11, Y14, Y11
 
 	// update
-	VMOVDQU Y4, Y12
-	VMOVDQU Y6, Y13
-	VMOVDQU Y8, Y14
-	VMOVDQU Y10, Y15
+	VMOVDQA Y4, Y12
+	VMOVDQA Y6, Y13
+	VMOVDQA Y8, Y14
+	VMOVDQA Y10, Y15
 	VPADDW  Y4, Y5, Y4
 	VPADDW  Y6, Y7, Y6
 	VPADDW  Y8, Y9, Y8
@@ -1843,10 +1843,10 @@ TEXT ·invnttAVX2(SB), NOSPLIT, $0-16
 	VSHUFPD $0x0F, Y11, Y10, Y10
 
 	// update
-	VMOVDQU Y3, Y12
-	VMOVDQU Y5, Y13
-	VMOVDQU Y7, Y14
-	VMOVDQU Y9, Y15
+	VMOVDQA Y3, Y12
+	VMOVDQA Y5, Y13
+	VMOVDQA Y7, Y14
+	VMOVDQA Y9, Y15
 	VPADDW  Y3, Y4, Y3
 	VPADDW  Y5, Y6, Y5
 	VPADDW  Y7, Y8, Y7
@@ -1916,10 +1916,10 @@ TEXT ·invnttAVX2(SB), NOSPLIT, $0-16
 	VPERM2I128 $0x13, Y9, Y10, Y9
 
 	// update
-	VMOVDQU Y11, Y12
-	VMOVDQU Y4, Y13
-	VMOVDQU Y6, Y14
-	VMOVDQU Y8, Y15
+	VMOVDQA Y11, Y12
+	VMOVDQA Y4, Y13
+	VMOVDQA Y6, Y14
+	VMOVDQA Y8, Y15
 	VPADDW  Y11, Y3, Y10
 	VPADDW  Y4, Y5, Y4
 	VPADDW  Y6, Y7, Y6
@@ -1961,10 +1961,10 @@ TEXT ·invnttAVX2(SB), NOSPLIT, $0-16
 
 	// level 5
 	// update
-	VMOVDQU Y10, Y12
-	VMOVDQU Y3, Y13
-	VMOVDQU Y6, Y14
-	VMOVDQU Y7, Y15
+	VMOVDQA Y10, Y12
+	VMOVDQA Y3, Y13
+	VMOVDQA Y6, Y14
+	VMOVDQA Y7, Y15
 	VPADDW  Y10, Y4, Y10
 	VPADDW  Y3, Y5, Y3
 	VPADDW  Y6, Y8, Y6
@@ -2014,10 +2014,10 @@ TEXT ·invnttAVX2(SB), NOSPLIT, $0-16
 
 	// level 6
 	// update
-	VMOVDQU Y10, Y12
-	VMOVDQU Y3, Y13
-	VMOVDQU Y4, Y14
-	VMOVDQU Y5, Y15
+	VMOVDQA Y10, Y12
+	VMOVDQA Y3, Y13
+	VMOVDQA Y4, Y14
+	VMOVDQA Y5, Y15
 	VPADDW  Y10, Y6, Y10
 	VPADDW  Y3, Y7, Y3
 	VPADDW  Y4, Y8, Y4
@@ -2088,10 +2088,10 @@ TEXT ·invnttAVX2(SB), NOSPLIT, $0-16
 
 	// level 7
 	// update
-	VMOVDQU Y4, Y12
-	VMOVDQU Y5, Y13
-	VMOVDQU Y6, Y14
-	VMOVDQU Y7, Y15
+	VMOVDQA Y4, Y12
+	VMOVDQA Y5, Y13
+	VMOVDQA Y6, Y14
+	VMOVDQA Y7, Y15
 	VPADDW  Y4, Y8, Y4
 	VPADDW  Y5, Y9, Y5
 	VPADDW  Y6, Y10, Y6
@@ -2188,10 +2188,10 @@ TEXT ·invnttAVX2(SB), NOSPLIT, $0-16
 
 	// level 7
 	// update
-	VMOVDQU Y4, Y12
-	VMOVDQU Y5, Y13
-	VMOVDQU Y6, Y14
-	VMOVDQU Y7, Y15
+	VMOVDQA Y4, Y12
+	VMOVDQA Y5, Y13
+	VMOVDQA Y6, Y14
+	VMOVDQA Y7, Y15
 	VPADDW  Y4, Y8, Y4
 	VPADDW  Y5, Y9, Y5
 	VPADDW  Y6, Y10, Y6