Clean up the assembly a bit (Removed useless parens).

Yawning Angel committed 1 year ago
commit 40fb2b7e5a

1 changed file with 239 additions and 239 deletions:

hwaccel_amd64.s  +239 -239
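
The whole change is mechanical: in the Go assembler, a memory operand written as (32)(DI) and one written as 32(DI) both mean "base register DI plus a 32-byte displacement", so the parenthesized constants add nothing. A minimal sketch of the two equivalent spellings follows; the routine itself is hypothetical and is not part of hwaccel_amd64.s.

	#include "textflag.h"

	// illustrativeLoad loads the 32 bytes at p+32 into Y0 twice, spelling the
	// same effective address (DI plus a 32-byte displacement) both ways.
	TEXT ·illustrativeLoad(SB), NOSPLIT, $0-8
		MOVQ    p+0(FP), DI
		VMOVDQU (32)(DI), Y0 // pre-commit style: constant offset wrapped in parens
		VMOVDQU 32(DI), Y0   // post-commit style: bare displacement, same encoding
		VZEROUPPER
		RET
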

@@ -114,13 +114,13 @@ TEXT ·nttAVX2(SB), NOSPLIT, $0-16
 	// first round
 	// load
 	VMOVDQU (DI), Y4
-	VMOVDQU (32)(DI), Y5
-	VMOVDQU (64)(DI), Y6
-	VMOVDQU (96)(DI), Y7
-	VMOVDQU (256)(DI), Y8
-	VMOVDQU (288)(DI), Y9
-	VMOVDQU (320)(DI), Y10
-	VMOVDQU (352)(DI), Y11
+	VMOVDQU 32(DI), Y5
+	VMOVDQU 64(DI), Y6
+	VMOVDQU 96(DI), Y7
+	VMOVDQU 256(DI), Y8
+	VMOVDQU 288(DI), Y9
+	VMOVDQU 320(DI), Y10
+	VMOVDQU 352(DI), Y11
 
 	// level 0
 	// mul
@@ -159,26 +159,26 @@ TEXT ·nttAVX2(SB), NOSPLIT, $0-16
 
 	// store
 	VMOVDQU Y4, (DI)
-	VMOVDQU Y5, (32)(DI)
-	VMOVDQU Y6, (64)(DI)
-	VMOVDQU Y7, (96)(DI)
-	VMOVDQU Y8, (256)(DI)
-	VMOVDQU Y9, (288)(DI)
-	VMOVDQU Y10, (320)(DI)
-	VMOVDQU Y11, (352)(DI)
+	VMOVDQU Y5, 32(DI)
+	VMOVDQU Y6, 64(DI)
+	VMOVDQU Y7, 96(DI)
+	VMOVDQU Y8, 256(DI)
+	VMOVDQU Y9, 288(DI)
+	VMOVDQU Y10, 320(DI)
+	VMOVDQU Y11, 352(DI)
 
 	ADDQ $128, DI
 
 	// second round
 	// load
 	VMOVDQU (DI), Y4
-	VMOVDQU (32)(DI), Y5
-	VMOVDQU (64)(DI), Y6
-	VMOVDQU (96)(DI), Y7
-	VMOVDQU (256)(DI), Y8
-	VMOVDQU (288)(DI), Y9
-	VMOVDQU (320)(DI), Y10
-	VMOVDQU (352)(DI), Y11
+	VMOVDQU 32(DI), Y5
+	VMOVDQU 64(DI), Y6
+	VMOVDQU 96(DI), Y7
+	VMOVDQU 256(DI), Y8
+	VMOVDQU 288(DI), Y9
+	VMOVDQU 320(DI), Y10
+	VMOVDQU 352(DI), Y11
 
 	// level 0
 	// mul
@@ -217,29 +217,29 @@ TEXT ·nttAVX2(SB), NOSPLIT, $0-16
 
 	// store
 	VMOVDQU Y4, (DI)
-	VMOVDQU Y5, (32)(DI)
-	VMOVDQU Y6, (64)(DI)
-	VMOVDQU Y7, (96)(DI)
-	VMOVDQU Y8, (256)(DI)
-	VMOVDQU Y9, (288)(DI)
-	VMOVDQU Y10, (320)(DI)
-	VMOVDQU Y11, (352)(DI)
+	VMOVDQU Y5, 32(DI)
+	VMOVDQU Y6, 64(DI)
+	VMOVDQU Y7, 96(DI)
+	VMOVDQU Y8, 256(DI)
+	VMOVDQU Y9, 288(DI)
+	VMOVDQU Y10, 320(DI)
+	VMOVDQU Y11, 352(DI)
 
 	SUBQ $128, DI
 
 	// first round
 	// zetas
-	VMOVDQU (32)(SI), Y3
+	VMOVDQU 32(SI), Y3
 
 	// load
 	VMOVDQU (DI), Y4
-	VMOVDQU (32)(DI), Y5
-	VMOVDQU (64)(DI), Y6
-	VMOVDQU (96)(DI), Y7
-	VMOVDQU (128)(DI), Y8
-	VMOVDQU (160)(DI), Y9
-	VMOVDQU (192)(DI), Y10
-	VMOVDQU (224)(DI), Y11
+	VMOVDQU 32(DI), Y5
+	VMOVDQU 64(DI), Y6
+	VMOVDQU 96(DI), Y7
+	VMOVDQU 128(DI), Y8
+	VMOVDQU 160(DI), Y9
+	VMOVDQU 192(DI), Y10
+	VMOVDQU 224(DI), Y11
 
 	// level 1
 	// mul
@@ -278,8 +278,8 @@ TEXT ·nttAVX2(SB), NOSPLIT, $0-16
 
 	// level 2
 	// zetas
-	VMOVDQU (96)(SI), Y15
-	VMOVDQU (128)(SI), Y3
+	VMOVDQU 96(SI), Y15
+	VMOVDQU 128(SI), Y3
 
 	// mul
 	VPMULLW Y15, Y6, Y12
@@ -317,10 +317,10 @@ TEXT ·nttAVX2(SB), NOSPLIT, $0-16
 
 	// level 3
 	// zetas
-	VMOVDQU (224)(SI), Y13
-	VMOVDQU (256)(SI), Y14
-	VMOVDQU (288)(SI), Y15
-	VMOVDQU (320)(SI), Y3
+	VMOVDQU 224(SI), Y13
+	VMOVDQU 256(SI), Y14
+	VMOVDQU 288(SI), Y15
+	VMOVDQU 320(SI), Y3
 
 	// mul
 	VPMULLW Y13, Y5, Y12
@@ -380,10 +380,10 @@ TEXT ·nttAVX2(SB), NOSPLIT, $0-16
 
 	// level 4
 	// zetas
-	VMOVDQU (480)(SI), Y12
-	VMOVDQU (512)(SI), Y13
-	VMOVDQU (544)(SI), Y14
-	VMOVDQU (576)(SI), Y15
+	VMOVDQU 480(SI), Y12
+	VMOVDQU 512(SI), Y13
+	VMOVDQU 544(SI), Y14
+	VMOVDQU 576(SI), Y15
 
 	// shuffle
 	VPERM2I128 $0x02, Y4, Y5, Y3
@@ -431,10 +431,10 @@ TEXT ·nttAVX2(SB), NOSPLIT, $0-16
 
 	// level 5
 	// zetas
-	VMOVDQU (736)(SI), Y12
-	VMOVDQU (768)(SI), Y13
-	VMOVDQU (800)(SI), Y14
-	VMOVDQU (832)(SI), Y15
+	VMOVDQU 736(SI), Y12
+	VMOVDQU 768(SI), Y13
+	VMOVDQU 800(SI), Y14
+	VMOVDQU 832(SI), Y15
 
 	// shuffle
 	VSHUFPD $0x00, Y4, Y3, Y11
@@ -500,10 +500,10 @@ TEXT ·nttAVX2(SB), NOSPLIT, $0-16
 	VPBLENDD $0xAA, Y9, Y14, Y9
 
 	// zetas
-	VMOVDQU (992)(SI), Y12
-	VMOVDQU (1024)(SI), Y13
-	VMOVDQU (1056)(SI), Y14
-	VMOVDQU (1088)(SI), Y15
+	VMOVDQU 992(SI), Y12
+	VMOVDQU 1024(SI), Y13
+	VMOVDQU 1056(SI), Y14
+	VMOVDQU 1088(SI), Y15
 
 	// mul
 	VPMULLW Y12, Y3, Y11
@@ -582,10 +582,10 @@ TEXT ·nttAVX2(SB), NOSPLIT, $0-16
 	VPBLENDW $0xAA, Y9, Y13, Y9
 
 	// zetas
-	VMOVDQU (1248)(SI), Y12
-	VMOVDQU (1280)(SI), Y13
-	VMOVDQU (1312)(SI), Y14
-	VMOVDQU (1344)(SI), Y15
+	VMOVDQU 1248(SI), Y12
+	VMOVDQU 1280(SI), Y13
+	VMOVDQU 1312(SI), Y14
+	VMOVDQU 1344(SI), Y15
 
 	// mul
 	VPMULLW Y12, Y3, Y11
@@ -660,29 +660,29 @@ TEXT ·nttAVX2(SB), NOSPLIT, $0-16
 
 	// store
 	VMOVDQU Y11, (DI)
-	VMOVDQU Y12, (32)(DI)
-	VMOVDQU Y13, (64)(DI)
-	VMOVDQU Y14, (96)(DI)
-	VMOVDQU Y15, (128)(DI)
-	VMOVDQU Y3, (160)(DI)
-	VMOVDQU Y4, (192)(DI)
-	VMOVDQU Y5, (224)(DI)
+	VMOVDQU Y12, 32(DI)
+	VMOVDQU Y13, 64(DI)
+	VMOVDQU Y14, 96(DI)
+	VMOVDQU Y15, 128(DI)
+	VMOVDQU Y3, 160(DI)
+	VMOVDQU Y4, 192(DI)
+	VMOVDQU Y5, 224(DI)
 
 	ADDQ $256, DI
 
 	// second round
 	// zetas
-	VMOVDQU (64)(SI), Y3
+	VMOVDQU 64(SI), Y3
 
 	// load
 	VMOVDQU (DI), Y4
-	VMOVDQU (32)(DI), Y5
-	VMOVDQU (64)(DI), Y6
-	VMOVDQU (96)(DI), Y7
-	VMOVDQU (128)(DI), Y8
-	VMOVDQU (160)(DI), Y9
-	VMOVDQU (192)(DI), Y10
-	VMOVDQU (224)(DI), Y11
+	VMOVDQU 32(DI), Y5
+	VMOVDQU 64(DI), Y6
+	VMOVDQU 96(DI), Y7
+	VMOVDQU 128(DI), Y8
+	VMOVDQU 160(DI), Y9
+	VMOVDQU 192(DI), Y10
+	VMOVDQU 224(DI), Y11
 
 	// level 1
 	// mul
@@ -721,8 +721,8 @@ TEXT ·nttAVX2(SB), NOSPLIT, $0-16
 
 	// level 2
 	// zetas
-	VMOVDQU (160)(SI), Y15
-	VMOVDQU (192)(SI), Y3
+	VMOVDQU 160(SI), Y15
+	VMOVDQU 192(SI), Y3
 
 	// mul
 	VPMULLW Y15, Y6, Y12
@@ -760,10 +760,10 @@ TEXT ·nttAVX2(SB), NOSPLIT, $0-16
 
 	// level 3
 	// zetas
-	VMOVDQU (352)(SI), Y13
-	VMOVDQU (384)(SI), Y14
-	VMOVDQU (416)(SI), Y15
-	VMOVDQU (448)(SI), Y3
+	VMOVDQU 352(SI), Y13
+	VMOVDQU 384(SI), Y14
+	VMOVDQU 416(SI), Y15
+	VMOVDQU 448(SI), Y3
 
 	// mul
 	VPMULLW Y13, Y5, Y12
@@ -823,10 +823,10 @@ TEXT ·nttAVX2(SB), NOSPLIT, $0-16
 
 	// level 4
 	// zetas
-	VMOVDQU (608)(SI), Y12
-	VMOVDQU (640)(SI), Y13
-	VMOVDQU (672)(SI), Y14
-	VMOVDQU (704)(SI), Y15
+	VMOVDQU 608(SI), Y12
+	VMOVDQU 640(SI), Y13
+	VMOVDQU 672(SI), Y14
+	VMOVDQU 704(SI), Y15
 
 	// shuffle
 	VPERM2I128 $0x02, Y4, Y5, Y3
@@ -874,10 +874,10 @@ TEXT ·nttAVX2(SB), NOSPLIT, $0-16
 
 	// level 5
 	// zetas
-	VMOVDQU (864)(SI), Y12
-	VMOVDQU (896)(SI), Y13
-	VMOVDQU (928)(SI), Y14
-	VMOVDQU (960)(SI), Y15
+	VMOVDQU 864(SI), Y12
+	VMOVDQU 896(SI), Y13
+	VMOVDQU 928(SI), Y14
+	VMOVDQU 960(SI), Y15
 
 	// shuffle
 	VSHUFPD $0x00, Y4, Y3, Y11
@@ -943,10 +943,10 @@ TEXT ·nttAVX2(SB), NOSPLIT, $0-16
 	VPBLENDD $0xAA, Y9, Y14, Y9
 
 	// zetas
-	VMOVDQU (1120)(SI), Y12
-	VMOVDQU (1152)(SI), Y13
-	VMOVDQU (1184)(SI), Y14
-	VMOVDQU (1216)(SI), Y15
+	VMOVDQU 1120(SI), Y12
+	VMOVDQU 1152(SI), Y13
+	VMOVDQU 1184(SI), Y14
+	VMOVDQU 1216(SI), Y15
 
 	// mul
 	VPMULLW Y12, Y3, Y11
@@ -1025,10 +1025,10 @@ TEXT ·nttAVX2(SB), NOSPLIT, $0-16
 	VPBLENDW $0xAA, Y9, Y13, Y9
 
 	// zetas
-	VMOVDQU (1376)(SI), Y12
-	VMOVDQU (1408)(SI), Y13
-	VMOVDQU (1440)(SI), Y14
-	VMOVDQU (1472)(SI), Y15
+	VMOVDQU 1376(SI), Y12
+	VMOVDQU 1408(SI), Y13
+	VMOVDQU 1440(SI), Y14
+	VMOVDQU 1472(SI), Y15
 
 	// mul
 	VPMULLW Y12, Y3, Y11
@@ -1103,13 +1103,13 @@ TEXT ·nttAVX2(SB), NOSPLIT, $0-16
 
 	// store
 	VMOVDQU Y11, (DI)
-	VMOVDQU Y12, (32)(DI)
-	VMOVDQU Y13, (64)(DI)
-	VMOVDQU Y14, (96)(DI)
-	VMOVDQU Y15, (128)(DI)
-	VMOVDQU Y3, (160)(DI)
-	VMOVDQU Y4, (192)(DI)
-	VMOVDQU Y5, (224)(DI)
+	VMOVDQU Y12, 32(DI)
+	VMOVDQU Y13, 64(DI)
+	VMOVDQU Y14, 96(DI)
+	VMOVDQU Y15, 128(DI)
+	VMOVDQU Y3, 160(DI)
+	VMOVDQU Y4, 192(DI)
+	VMOVDQU Y5, 224(DI)
 
 	VZEROUPPER
 	RET
@@ -1136,13 +1136,13 @@ TEXT ·invnttAVX2(SB), NOSPLIT, $0-16
 	// first round
 	// load
 	VMOVDQU (DI), Y4
-	VMOVDQU (32)(DI), Y5
-	VMOVDQU (64)(DI), Y6
-	VMOVDQU (96)(DI), Y7
-	VMOVDQU (128)(DI), Y8
-	VMOVDQU (160)(DI), Y9
-	VMOVDQU (192)(DI), Y10
-	VMOVDQU (224)(DI), Y11
+	VMOVDQU 32(DI), Y5
+	VMOVDQU 64(DI), Y6
+	VMOVDQU 96(DI), Y7
+	VMOVDQU 128(DI), Y8
+	VMOVDQU 160(DI), Y9
+	VMOVDQU 192(DI), Y10
+	VMOVDQU 224(DI), Y11
 
 	// reorder
 	VMOVDQU   ·lowdword<>(SB), Y3
@@ -1192,9 +1192,9 @@ TEXT ·invnttAVX2(SB), NOSPLIT, $0-16
 
 	// zetas
 	VMOVDQU (R8), Y7
-	VMOVDQU (32)(R8), Y9
-	VMOVDQU (64)(R8), Y11
-	VMOVDQU (96)(R8), Y3
+	VMOVDQU 32(R8), Y9
+	VMOVDQU 64(R8), Y11
+	VMOVDQU 96(R8), Y3
 
 	// mul
 	VPMULLW Y7, Y12, Y5
@@ -1251,10 +1251,10 @@ TEXT ·invnttAVX2(SB), NOSPLIT, $0-16
 	VPADDW Y10, Y11, Y10
 
 	// zetas
-	VMOVDQU (256)(R8), Y7
-	VMOVDQU (288)(R8), Y9
-	VMOVDQU (320)(R8), Y11
-	VMOVDQU (352)(R8), Y3
+	VMOVDQU 256(R8), Y7
+	VMOVDQU 288(R8), Y9
+	VMOVDQU 320(R8), Y11
+	VMOVDQU 352(R8), Y3
 
 	// mul
 	VPMULLW Y7, Y12, Y5
@@ -1328,10 +1328,10 @@ TEXT ·invnttAVX2(SB), NOSPLIT, $0-16
 	VPADDW Y10, Y11, Y10
 
 	// zetas
-	VMOVDQU (512)(R8), Y7
-	VMOVDQU (544)(R8), Y9
-	VMOVDQU (576)(R8), Y11
-	VMOVDQU (608)(R8), Y3
+	VMOVDQU 512(R8), Y7
+	VMOVDQU 544(R8), Y9
+	VMOVDQU 576(R8), Y11
+	VMOVDQU 608(R8), Y3
 
 	// mul
 	VPMULLW Y7, Y12, Y5
@@ -1379,10 +1379,10 @@ TEXT ·invnttAVX2(SB), NOSPLIT, $0-16
 	VPADDW Y9, Y10, Y9
 
 	// zetas
-	VMOVDQU (768)(R8), Y6
-	VMOVDQU (800)(R8), Y8
-	VMOVDQU (832)(R8), Y10
-	VMOVDQU (864)(R8), Y11
+	VMOVDQU 768(R8), Y6
+	VMOVDQU 800(R8), Y8
+	VMOVDQU 832(R8), Y10
+	VMOVDQU 864(R8), Y11
 
 	// mul
 	VPMULLW Y6, Y12, Y4
@@ -1452,10 +1452,10 @@ TEXT ·invnttAVX2(SB), NOSPLIT, $0-16
 	VPSUBW  Y9, Y15, Y9
 
 	// zetas
-	VMOVDQU (1024)(R8), Y12
-	VMOVDQU (1056)(R8), Y13
-	VMOVDQU (1088)(R8), Y14
-	VMOVDQU (1120)(R8), Y15
+	VMOVDQU 1024(R8), Y12
+	VMOVDQU 1056(R8), Y13
+	VMOVDQU 1088(R8), Y14
+	VMOVDQU 1120(R8), Y15
 
 	// mul
 	VPMULLW Y12, Y3, Y11
@@ -1497,8 +1497,8 @@ TEXT ·invnttAVX2(SB), NOSPLIT, $0-16
 	VPSUBW  Y9, Y15, Y9
 
 	// zetas
-	VMOVDQU (1280)(SI), Y14
-	VMOVDQU (1312)(SI), Y15
+	VMOVDQU 1280(SI), Y14
+	VMOVDQU 1312(SI), Y15
 
 	// mul
 	VPMULLW Y14, Y4, Y11
@@ -1550,7 +1550,7 @@ TEXT ·invnttAVX2(SB), NOSPLIT, $0-16
 	VPSUBW  Y9, Y15, Y9
 
 	// zetas
-	VMOVDQU (1408)(SI), Y15
+	VMOVDQU 1408(SI), Y15
 
 	// mul
 	VPMULLW Y15, Y6, Y11
@@ -1584,13 +1584,13 @@ TEXT ·invnttAVX2(SB), NOSPLIT, $0-16
 
 	// store
 	VMOVDQU Y10, (DI)
-	VMOVDQU Y3, (32)(DI)
-	VMOVDQU Y4, (64)(DI)
-	VMOVDQU Y5, (96)(DI)
-	VMOVDQU Y6, (128)(DI)
-	VMOVDQU Y7, (160)(DI)
-	VMOVDQU Y8, (192)(DI)
-	VMOVDQU Y9, (224)(DI)
+	VMOVDQU Y3, 32(DI)
+	VMOVDQU Y4, 64(DI)
+	VMOVDQU Y5, 96(DI)
+	VMOVDQU Y6, 128(DI)
+	VMOVDQU Y7, 160(DI)
+	VMOVDQU Y8, 192(DI)
+	VMOVDQU Y9, 224(DI)
 
 	ADDQ $256, DI
 	ADDQ $128, R8
@@ -1598,13 +1598,13 @@ TEXT ·invnttAVX2(SB), NOSPLIT, $0-16
 	// second round
 	// load
 	VMOVDQU (DI), Y4
-	VMOVDQU (32)(DI), Y5
-	VMOVDQU (64)(DI), Y6
-	VMOVDQU (96)(DI), Y7
-	VMOVDQU (128)(DI), Y8
-	VMOVDQU (160)(DI), Y9
-	VMOVDQU (192)(DI), Y10
-	VMOVDQU (224)(DI), Y11
+	VMOVDQU 32(DI), Y5
+	VMOVDQU 64(DI), Y6
+	VMOVDQU 96(DI), Y7
+	VMOVDQU 128(DI), Y8
+	VMOVDQU 160(DI), Y9
+	VMOVDQU 192(DI), Y10
+	VMOVDQU 224(DI), Y11
 
 	// reorder
 	VMOVDQU   ·lowdword<>(SB), Y3
@@ -1658,9 +1658,9 @@ TEXT ·invnttAVX2(SB), NOSPLIT, $0-16
 
 	// zetas
 	VMOVDQU (R8), Y13
-	VMOVDQU (32)(R8), Y14
-	VMOVDQU (64)(R8), Y15
-	VMOVDQU (96)(R8), Y3
+	VMOVDQU 32(R8), Y14
+	VMOVDQU 64(R8), Y15
+	VMOVDQU 96(R8), Y3
 
 	// mul
 	VPMULLW Y13, Y5, Y12
@@ -1721,10 +1721,10 @@ TEXT ·invnttAVX2(SB), NOSPLIT, $0-16
 	VPSUBW  Y11, Y15, Y11
 
 	// zetas
-	VMOVDQU (256)(R8), Y13
-	VMOVDQU (288)(R8), Y14
-	VMOVDQU (320)(R8), Y15
-	VMOVDQU (352)(R8), Y3
+	VMOVDQU 256(R8), Y13
+	VMOVDQU 288(R8), Y14
+	VMOVDQU 320(R8), Y15
+	VMOVDQU 352(R8), Y3
 
 	// mul
 	VPMULLW Y13, Y5, Y12
@@ -1802,10 +1802,10 @@ TEXT ·invnttAVX2(SB), NOSPLIT, $0-16
 	VPSUBW  Y11, Y15, Y11
 
 	// zetas
-	VMOVDQU (512)(R8), Y13
-	VMOVDQU (544)(R8), Y14
-	VMOVDQU (576)(R8), Y15
-	VMOVDQU (608)(R8), Y3
+	VMOVDQU 512(R8), Y13
+	VMOVDQU 544(R8), Y14
+	VMOVDQU 576(R8), Y15
+	VMOVDQU 608(R8), Y3
 
 	// mul
 	VPMULLW Y13, Y5, Y12
@@ -1857,10 +1857,10 @@ TEXT ·invnttAVX2(SB), NOSPLIT, $0-16
 	VPSUBW  Y10, Y15, Y10
 
 	// zetas
-	VMOVDQU (768)(R8), Y12
-	VMOVDQU (800)(R8), Y13
-	VMOVDQU (832)(R8), Y14
-	VMOVDQU (864)(R8), Y15
+	VMOVDQU 768(R8), Y12
+	VMOVDQU 800(R8), Y13
+	VMOVDQU 832(R8), Y14
+	VMOVDQU 864(R8), Y15
 
 	// mul
 	VPMULLW Y12, Y4, Y11
@@ -1930,10 +1930,10 @@ TEXT ·invnttAVX2(SB), NOSPLIT, $0-16
 	VPSUBW  Y9, Y15, Y9
 
 	// zetas
-	VMOVDQU (1024)(R8), Y12
-	VMOVDQU (1056)(R8), Y13
-	VMOVDQU (1088)(R8), Y14
-	VMOVDQU (1120)(R8), Y15
+	VMOVDQU 1024(R8), Y12
+	VMOVDQU 1056(R8), Y13
+	VMOVDQU 1088(R8), Y14
+	VMOVDQU 1120(R8), Y15
 
 	// mul
 	VPMULLW Y12, Y3, Y11
@@ -1975,8 +1975,8 @@ TEXT ·invnttAVX2(SB), NOSPLIT, $0-16
 	VPSUBW  Y9, Y15, Y9
 
 	// zetas
-	VMOVDQU (1344)(SI), Y14
-	VMOVDQU (1376)(SI), Y15
+	VMOVDQU 1344(SI), Y14
+	VMOVDQU 1376(SI), Y15
 
 	// mul
 	VPMULLW Y14, Y4, Y11
@@ -2028,7 +2028,7 @@ TEXT ·invnttAVX2(SB), NOSPLIT, $0-16
 	VPSUBW  Y9, Y15, Y9
 
 	// zetas
-	VMOVDQU (1440)(SI), Y15
+	VMOVDQU 1440(SI), Y15
 
 	// mul
 	VPMULLW Y15, Y6, Y11
@@ -2062,13 +2062,13 @@ TEXT ·invnttAVX2(SB), NOSPLIT, $0-16
 
 	// store
 	VMOVDQU Y10, (DI)
-	VMOVDQU Y3, (32)(DI)
-	VMOVDQU Y4, (64)(DI)
-	VMOVDQU Y5, (96)(DI)
-	VMOVDQU Y6, (128)(DI)
-	VMOVDQU Y7, (160)(DI)
-	VMOVDQU Y8, (192)(DI)
-	VMOVDQU Y9, (224)(DI)
+	VMOVDQU Y3, 32(DI)
+	VMOVDQU Y4, 64(DI)
+	VMOVDQU Y5, 96(DI)
+	VMOVDQU Y6, 128(DI)
+	VMOVDQU Y7, 160(DI)
+	VMOVDQU Y8, 192(DI)
+	VMOVDQU Y9, 224(DI)
 
 	SUBQ $256, DI
 
@@ -2078,13 +2078,13 @@ TEXT ·invnttAVX2(SB), NOSPLIT, $0-16
 	// first round
 	// load
 	VMOVDQU (DI), Y4
-	VMOVDQU (32)(DI), Y5
-	VMOVDQU (64)(DI), Y6
-	VMOVDQU (96)(DI), Y7
-	VMOVDQU (256)(DI), Y8
-	VMOVDQU (288)(DI), Y9
-	VMOVDQU (320)(DI), Y10
-	VMOVDQU (352)(DI), Y11
+	VMOVDQU 32(DI), Y5
+	VMOVDQU 64(DI), Y6
+	VMOVDQU 96(DI), Y7
+	VMOVDQU 256(DI), Y8
+	VMOVDQU 288(DI), Y9
+	VMOVDQU 320(DI), Y10
+	VMOVDQU 352(DI), Y11
 
 	// level 7
 	// update
@@ -2102,7 +2102,7 @@ TEXT ·invnttAVX2(SB), NOSPLIT, $0-16
 	VPSUBW  Y11, Y15, Y11
 
 	// zeta
-	VMOVDQU (1472)(SI), Y3
+	VMOVDQU 1472(SI), Y3
 
 	// mul
 	VPMULLW Y3, Y8, Y12
@@ -2162,29 +2162,29 @@ TEXT ·invnttAVX2(SB), NOSPLIT, $0-16
 
 	// store
 	VMOVDQU Y4, (DI)
-	VMOVDQU Y5, (32)(DI)
-	VMOVDQU Y6, (64)(DI)
-	VMOVDQU Y7, (96)(DI)
-	VMOVDQU Y8, (256)(DI)
-	VMOVDQU Y9, (288)(DI)
-	VMOVDQU Y10, (320)(DI)
-	VMOVDQU Y11, (352)(DI)
+	VMOVDQU Y5, 32(DI)
+	VMOVDQU Y6, 64(DI)
+	VMOVDQU Y7, 96(DI)
+	VMOVDQU Y8, 256(DI)
+	VMOVDQU Y9, 288(DI)
+	VMOVDQU Y10, 320(DI)
+	VMOVDQU Y11, 352(DI)
 
 	ADDQ $128, DI
 
 	// second round
 	// load
 	VMOVDQU (DI), Y4
-	VMOVDQU (32)(DI), Y5
-	VMOVDQU (64)(DI), Y6
-	VMOVDQU (96)(DI), Y7
-	VMOVDQU (256)(DI), Y8
-	VMOVDQU (288)(DI), Y9
-	VMOVDQU (320)(DI), Y10
-	VMOVDQU (352)(DI), Y11
+	VMOVDQU 32(DI), Y5
+	VMOVDQU 64(DI), Y6
+	VMOVDQU 96(DI), Y7
+	VMOVDQU 256(DI), Y8
+	VMOVDQU 288(DI), Y9
+	VMOVDQU 320(DI), Y10
+	VMOVDQU 352(DI), Y11
 
 	// zeta
-	VMOVDQU (1472)(SI), Y3
+	VMOVDQU 1472(SI), Y3
 
 	// level 7
 	// update
@@ -2259,13 +2259,13 @@ TEXT ·invnttAVX2(SB), NOSPLIT, $0-16
 
 	// store
 	VMOVDQU Y4, (DI)
-	VMOVDQU Y5, (32)(DI)
-	VMOVDQU Y6, (64)(DI)
-	VMOVDQU Y7, (96)(DI)
-	VMOVDQU Y8, (256)(DI)
-	VMOVDQU Y9, (288)(DI)
-	VMOVDQU Y10, (320)(DI)
-	VMOVDQU Y11, (352)(DI)
+	VMOVDQU Y5, 32(DI)
+	VMOVDQU Y6, 64(DI)
+	VMOVDQU Y7, 96(DI)
+	VMOVDQU Y8, 256(DI)
+	VMOVDQU Y9, 288(DI)
+	VMOVDQU Y10, 320(DI)
+	VMOVDQU Y11, 352(DI)
 
 	VZEROUPPER
 	RET
@@ -2283,19 +2283,19 @@ TEXT ·pointwiseAccK2AVX2(SB), NOSPLIT, $0-24
 	XORQ AX, AX
 	XORQ BX, BX
 
-	MOVQ (8)(SI), R8  // a[1]
-	MOVQ (SI), SI     // a[0]
-	MOVQ (8)(DX), R11 // b[1]
-	MOVQ (DX), DX     // b[0]
+	MOVQ 8(SI), R8  // a[1]
+	MOVQ (SI), SI   // a[0]
+	MOVQ 8(DX), R11 // b[1]
+	MOVQ (DX), DX   // b[0]
 
 looptop2:
 	// load a
 	VMOVDQA (SI)(BX*1), Y4
-	VMOVDQA (32)(SI)(BX*1), Y5
-	VMOVDQA (64)(SI)(BX*1), Y6
+	VMOVDQA 32(SI)(BX*1), Y5
+	VMOVDQA 64(SI)(BX*1), Y6
 	VMOVDQA (R8)(BX*1), Y7
-	VMOVDQA (32)(R8)(BX*1), Y8
-	VMOVDQA (64)(R8)(BX*1), Y9
+	VMOVDQA 32(R8)(BX*1), Y8
+	VMOVDQA 64(R8)(BX*1), Y9
 
 	// mul montsq
 	VPMULLW Y2, Y4, Y3
@@ -2333,11 +2333,11 @@ looptop2:
 
 	// load b
 	VMOVDQA (DX)(BX*1), Y9
-	VMOVDQA (32)(DX)(BX*1), Y10
-	VMOVDQA (64)(DX)(BX*1), Y11
+	VMOVDQA 32(DX)(BX*1), Y10
+	VMOVDQA 64(DX)(BX*1), Y11
 	VMOVDQA (R11)(BX*1), Y12
-	VMOVDQA (32)(R11)(BX*1), Y13
-	VMOVDQA (64)(R11)(BX*1), Y14
+	VMOVDQA 32(R11)(BX*1), Y13
+	VMOVDQA 64(R11)(BX*1), Y14
 
 	// mul
 	VPMULLW Y3, Y9, Y15
@@ -2395,8 +2395,8 @@ looptop2:
 
 	// store
 	VMOVDQA Y5, (DI)(BX*1)
-	VMOVDQA Y6, (32)(DI)(BX*1)
-	VMOVDQA Y7, (64)(DI)(BX*1)
+	VMOVDQA Y6, 32(DI)(BX*1)
+	VMOVDQA Y7, 64(DI)(BX*1)
 
 	ADDQ $1, AX
 	ADDQ $96, BX
@@ -2466,21 +2466,21 @@ TEXT ·pointwiseAccK3AVX2(SB), NOSPLIT, $0-24
 	XORQ AX, AX
 	XORQ BX, BX
 
-	MOVQ (16)(SI), R9  // a[2]
-	MOVQ (8)(SI), R8   // a[1]
-	MOVQ (SI), SI      // a[0]
-	MOVQ (16)(DX), R12 // b[2]
-	MOVQ (8)(DX), R11  // b[1]
-	MOVQ (DX), DX      // b[0]
+	MOVQ (16)(SI), R9 // a[2]
+	MOVQ 8(SI), R8    // a[1]
+	MOVQ (SI), SI     // a[0]
+	MOVQ 16(DX), R12  // b[2]
+	MOVQ 8(DX), R11   // b[1]
+	MOVQ (DX), DX     // b[0]
 
 looptop3:
 	// load a
 	VMOVDQA (SI)(BX*1), Y4
-	VMOVDQA (32)(SI)(BX*1), Y5
+	VMOVDQA 32(SI)(BX*1), Y5
 	VMOVDQA (R8)(BX*1), Y6
-	VMOVDQA (32)(R8)(BX*1), Y7
+	VMOVDQA 32(R8)(BX*1), Y7
 	VMOVDQA (R9)(BX*1), Y8
-	VMOVDQA (32)(R9)(BX*1), Y9
+	VMOVDQA 32(R9)(BX*1), Y9
 
 	// mul montsq
 	VPMULLW Y2, Y4, Y3
@@ -2518,11 +2518,11 @@ looptop3:
 
 	// load b
 	VMOVDQA (DX)(BX*1), Y9
-	VMOVDQA (32)(DX)(BX*1), Y10
+	VMOVDQA 32(DX)(BX*1), Y10
 	VMOVDQA (R11)(BX*1), Y11
-	VMOVDQA (32)(R11)(BX*1), Y12
+	VMOVDQA 32(R11)(BX*1), Y12
 	VMOVDQA (R12)(BX*1), Y13
-	VMOVDQA (32)(R12)(BX*1), Y14
+	VMOVDQA 32(R12)(BX*1), Y14
 
 	// mul
 	VPMULLW Y3, Y9, Y15
@@ -2577,7 +2577,7 @@ looptop3:
 
 	// store
 	VMOVDQA Y6, (DI)(BX*1)
-	VMOVDQA Y7, (32)(DI)(BX*1)
+	VMOVDQA Y7, 32(DI)(BX*1)
 
 	ADDQ $1, AX
 	ADDQ $64, BX
@@ -2601,14 +2601,14 @@ TEXT ·pointwiseAccK4AVX2(SB), NOSPLIT, $0-24
 	XORQ AX, AX
 	XORQ BX, BX
 
-	MOVQ (24)(SI), R10 // a[3]
-	MOVQ (16)(SI), R9  // a[2]
-	MOVQ (8)(SI), R8   // a[1]
-	MOVQ (SI), SI      // a[0]
-	MOVQ (24)(DX), R13 // b[3]
-	MOVQ (16)(DX), R12 // b[2]
-	MOVQ (8)(DX), R11  // b[1]
-	MOVQ (DX), DX      // b[0]
+	MOVQ 24(SI), R10 // a[3]
+	MOVQ 16(SI), R9  // a[2]
+	MOVQ 8(SI), R8   // a[1]
+	MOVQ (SI), SI    // a[0]
+	MOVQ 24(DX), R13 // b[3]
+	MOVQ 16(DX), R12 // b[2]
+	MOVQ 8(DX), R11  // b[1]
+	MOVQ (DX), DX    // b[0]
 
 looptop4:
 	// load a