Skip to content

Commit 3e44d96

Browse files
committed Aug 14, 2019
[X86] Use PSADBW for v8i8 addition reductions.
Improves the 8-byte case from PR42674.

Differential Revision: https://reviews.llvm.org/D66069
llvm-svn: 368864
1 parent bffa4a2 commit 3e44d96

File tree

2 files changed

+40
-104
lines changed

2 files changed

+40
-104
lines changed
 

‎llvm/lib/Target/X86/X86ISelLowering.cpp

+12-2
Original file line numberDiff line numberDiff line change
@@ -35440,13 +35440,23 @@ static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,
3544035440
if (VecVT.getScalarType() != VT)
3544135441
return SDValue();
3544235442

35443+
SDLoc DL(ExtElt);
35444+
35445+
if (VecVT == MVT::v8i8) {
35446+
// Pad with undef.
35447+
Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
35448+
DAG.getUNDEF(VecVT));
35449+
Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
35450+
DAG.getConstant(0, DL, MVT::v16i8));
35451+
Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
35452+
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
35453+
}
35454+
3544335455
// Must be a >=128-bit vector with pow2 elements.
3544435456
if ((VecVT.getSizeInBits() % 128) != 0 ||
3544535457
!isPowerOf2_32(VecVT.getVectorNumElements()))
3544635458
return SDValue();
3544735459

35448-
SDLoc DL(ExtElt);
35449-
3545035460
// vXi8 reduction - sum lo/hi halves then use PSADBW.
3545135461
if (VT == MVT::i8) {
3545235462
while (Rdx.getValueSizeInBits() > 128) {

‎llvm/test/CodeGen/X86/vector-reduce-add.ll

+28-102
Original file line numberDiff line numberDiff line change
@@ -1160,52 +1160,32 @@ define i8 @test_v4i8_load(<4 x i8>* %p) {
11601160
define i8 @test_v8i8(<8 x i8> %a0) {
11611161
; SSE2-LABEL: test_v8i8:
11621162
; SSE2: # %bb.0:
1163-
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1164-
; SSE2-NEXT: paddb %xmm0, %xmm1
1165-
; SSE2-NEXT: movdqa %xmm1, %xmm0
1166-
; SSE2-NEXT: psrld $16, %xmm0
1167-
; SSE2-NEXT: paddb %xmm1, %xmm0
1168-
; SSE2-NEXT: movdqa %xmm0, %xmm1
1169-
; SSE2-NEXT: psrlw $8, %xmm1
1170-
; SSE2-NEXT: paddb %xmm0, %xmm1
1163+
; SSE2-NEXT: pxor %xmm1, %xmm1
1164+
; SSE2-NEXT: psadbw %xmm0, %xmm1
11711165
; SSE2-NEXT: movd %xmm1, %eax
11721166
; SSE2-NEXT: # kill: def $al killed $al killed $eax
11731167
; SSE2-NEXT: retq
11741168
;
11751169
; SSE41-LABEL: test_v8i8:
11761170
; SSE41: # %bb.0:
1177-
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1178-
; SSE41-NEXT: paddb %xmm0, %xmm1
1179-
; SSE41-NEXT: movdqa %xmm1, %xmm0
1180-
; SSE41-NEXT: psrld $16, %xmm0
1181-
; SSE41-NEXT: paddb %xmm1, %xmm0
1182-
; SSE41-NEXT: movdqa %xmm0, %xmm1
1183-
; SSE41-NEXT: psrlw $8, %xmm1
1184-
; SSE41-NEXT: paddb %xmm0, %xmm1
1171+
; SSE41-NEXT: pxor %xmm1, %xmm1
1172+
; SSE41-NEXT: psadbw %xmm0, %xmm1
11851173
; SSE41-NEXT: pextrb $0, %xmm1, %eax
11861174
; SSE41-NEXT: # kill: def $al killed $al killed $eax
11871175
; SSE41-NEXT: retq
11881176
;
11891177
; AVX-LABEL: test_v8i8:
11901178
; AVX: # %bb.0:
1191-
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1192-
; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1193-
; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
1194-
; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1195-
; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
1196-
; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1179+
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
1180+
; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
11971181
; AVX-NEXT: vpextrb $0, %xmm0, %eax
11981182
; AVX-NEXT: # kill: def $al killed $al killed $eax
11991183
; AVX-NEXT: retq
12001184
;
12011185
; AVX512-LABEL: test_v8i8:
12021186
; AVX512: # %bb.0:
1203-
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1204-
; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1205-
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
1206-
; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1207-
; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
1208-
; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1187+
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
1188+
; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
12091189
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
12101190
; AVX512-NEXT: # kill: def $al killed $al killed $eax
12111191
; AVX512-NEXT: retq
@@ -1217,92 +1197,38 @@ define i8 @test_v8i8_load(<8 x i8>* %p) {
12171197
; SSE2-LABEL: test_v8i8_load:
12181198
; SSE2: # %bb.0:
12191199
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
1220-
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1221-
; SSE2-NEXT: paddb %xmm0, %xmm1
1222-
; SSE2-NEXT: movdqa %xmm1, %xmm0
1223-
; SSE2-NEXT: psrld $16, %xmm0
1224-
; SSE2-NEXT: paddb %xmm1, %xmm0
1225-
; SSE2-NEXT: movdqa %xmm0, %xmm1
1226-
; SSE2-NEXT: psrlw $8, %xmm1
1227-
; SSE2-NEXT: paddb %xmm0, %xmm1
1200+
; SSE2-NEXT: pxor %xmm1, %xmm1
1201+
; SSE2-NEXT: psadbw %xmm0, %xmm1
12281202
; SSE2-NEXT: movd %xmm1, %eax
12291203
; SSE2-NEXT: # kill: def $al killed $al killed $eax
12301204
; SSE2-NEXT: retq
12311205
;
12321206
; SSE41-LABEL: test_v8i8_load:
12331207
; SSE41: # %bb.0:
12341208
; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
1235-
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1236-
; SSE41-NEXT: paddb %xmm0, %xmm1
1237-
; SSE41-NEXT: movdqa %xmm1, %xmm0
1238-
; SSE41-NEXT: psrld $16, %xmm0
1239-
; SSE41-NEXT: paddb %xmm1, %xmm0
1240-
; SSE41-NEXT: movdqa %xmm0, %xmm1
1241-
; SSE41-NEXT: psrlw $8, %xmm1
1242-
; SSE41-NEXT: paddb %xmm0, %xmm1
1209+
; SSE41-NEXT: pxor %xmm1, %xmm1
1210+
; SSE41-NEXT: psadbw %xmm0, %xmm1
12431211
; SSE41-NEXT: pextrb $0, %xmm1, %eax
12441212
; SSE41-NEXT: # kill: def $al killed $al killed $eax
12451213
; SSE41-NEXT: retq
12461214
;
1247-
; AVX1-LABEL: test_v8i8_load:
1248-
; AVX1: # %bb.0:
1249-
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1250-
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1251-
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1252-
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
1253-
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1254-
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
1255-
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1256-
; AVX1-NEXT: vpextrb $0, %xmm0, %eax
1257-
; AVX1-NEXT: # kill: def $al killed $al killed $eax
1258-
; AVX1-NEXT: retq
1259-
;
1260-
; AVX2-LABEL: test_v8i8_load:
1261-
; AVX2: # %bb.0:
1262-
; AVX2-NEXT: movq (%rdi), %rax
1263-
; AVX2-NEXT: vmovq %rax, %xmm0
1264-
; AVX2-NEXT: shrq $32, %rax
1265-
; AVX2-NEXT: vmovd %eax, %xmm1
1266-
; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
1267-
; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1268-
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
1269-
; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1270-
; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
1271-
; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1272-
; AVX2-NEXT: vpextrb $0, %xmm0, %eax
1273-
; AVX2-NEXT: # kill: def $al killed $al killed $eax
1274-
; AVX2-NEXT: retq
1275-
;
1276-
; AVX512BW-LABEL: test_v8i8_load:
1277-
; AVX512BW: # %bb.0:
1278-
; AVX512BW-NEXT: movq (%rdi), %rax
1279-
; AVX512BW-NEXT: vmovq %rax, %xmm0
1280-
; AVX512BW-NEXT: shrq $32, %rax
1281-
; AVX512BW-NEXT: vmovd %eax, %xmm1
1282-
; AVX512BW-NEXT: vpbroadcastd %xmm1, %xmm1
1283-
; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1284-
; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1
1285-
; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1286-
; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1
1287-
; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1288-
; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
1289-
; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
1290-
; AVX512BW-NEXT: retq
1215+
; AVX-LABEL: test_v8i8_load:
1216+
; AVX: # %bb.0:
1217+
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1218+
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
1219+
; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
1220+
; AVX-NEXT: vpextrb $0, %xmm0, %eax
1221+
; AVX-NEXT: # kill: def $al killed $al killed $eax
1222+
; AVX-NEXT: retq
12911223
;
1292-
; AVX512VL-LABEL: test_v8i8_load:
1293-
; AVX512VL: # %bb.0:
1294-
; AVX512VL-NEXT: movq (%rdi), %rax
1295-
; AVX512VL-NEXT: vmovq %rax, %xmm0
1296-
; AVX512VL-NEXT: shrq $32, %rax
1297-
; AVX512VL-NEXT: vpbroadcastd %eax, %xmm1
1298-
; AVX512VL-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1299-
; AVX512VL-NEXT: vpsrld $16, %xmm0, %xmm1
1300-
; AVX512VL-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1301-
; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1
1302-
; AVX512VL-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1303-
; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax
1304-
; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
1305-
; AVX512VL-NEXT: retq
1224+
; AVX512-LABEL: test_v8i8_load:
1225+
; AVX512: # %bb.0:
1226+
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1227+
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
1228+
; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
1229+
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
1230+
; AVX512-NEXT: # kill: def $al killed $al killed $eax
1231+
; AVX512-NEXT: retq
13061232
%a0 = load <8 x i8>, <8 x i8>* %p
13071233
%1 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> %a0)
13081234
ret i8 %1

0 commit comments

Comments
 (0)