Commit ed19350

Committed Sep 28, 2017
[X86] Make use of vpmovwb when possible in LowerMULH
If we have BWI, we can truncate in a much simpler way by using vpmovwb. This even works without VLX by using the wider zmm->ymm truncate with a subvector extract.

Differential Revision: https://reviews.llvm.org/D38375

llvm-svn: 314457
1 parent de22fe5 commit ed19350
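
For context (this paragraph and the IR below are an illustrative sketch added for readability, not part of the commit): LowerMULH expands a multiply-high of byte vectors by extending each operand to 16-bit lanes, multiplying, shifting the products right by 8, and narrowing back to bytes. Written as plain IR for the unsigned v16i8 case (the function name is hypothetical), that shape is:

define <16 x i8> @mulhu_v16i8_sketch(<16 x i8> %a, <16 x i8> %b) {
  %xa = zext <16 x i8> %a to <16 x i16>
  %xb = zext <16 x i8> %b to <16 x i16>
  %mul = mul <16 x i16> %xa, %xb
  ; keep the high byte of each 16-bit product
  %hi = lshr <16 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  ; this narrowing step is where vpmovwb can now be used
  %res = trunc <16 x i16> %hi to <16 x i8>
  ret <16 x i8> %res
}

Before this change the final narrowing was built from subvector extracts and PACKUS (plus a cross-lane permute in the 32-byte case); with BWI the lowering now emits a plain ISD::TRUNCATE and lets instruction selection pick vpmovwb.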

File tree

5 files changed: +80, -64 lines

llvm/lib/Target/X86/X86ISelLowering.cpp

(+8, -15)

@@ -21631,17 +21631,7 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
       SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v32i16, ExA, ExB);
       Mul = DAG.getNode(ISD::SRL, dl, MVT::v32i16, Mul,
                         DAG.getConstant(8, dl, MVT::v32i16));
-      // The ymm variant of PACKUS treats the 128-bit lanes separately, so
-      // before using PACKUS we need to permute the inputs to the correct
-      // lo/hi xmm lane.
-      const int Mask[] = { 0, 1, 2, 3, 4, 5, 6, 7,
-                           16, 17, 18, 19, 20, 21, 22, 23,
-                           8, 9, 10, 11, 12, 13, 14, 15,
-                           24, 25, 26, 27, 28, 29, 30, 31};
-      Mul = DAG.getVectorShuffle(MVT::v32i16, dl, Mul, Mul, Mask);
-      Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i16, Mul, Lo);
-      Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i16, Mul, Hi);
-      return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
+      return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
     }
     SDValue ALo = extract128BitVector(A, 0, DAG, dl);
     SDValue BLo = extract128BitVector(B, 0, DAG, dl);
@@ -21671,10 +21661,13 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
     SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v16i16, A);
     SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v16i16, B);
     SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
-    SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
-                               DAG.getConstant(8, dl, MVT::v16i16));
-    Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Lo);
-    Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Hi);
+    Mul = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
+                      DAG.getConstant(8, dl, MVT::v16i16));
+    // If we have BWI we can use truncate instruction.
+    if (Subtarget.hasBWI())
+      return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
+    Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Lo);
+    Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Hi);
     return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
   }
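
As a rough illustration of what the new ISD::TRUNCATE lowers to (a sketch of mine, not from the commit; function names are made up): with +avx512bw, a v32i16-to-v32i8 truncate can select vpmovwb zmm->ymm directly, and a v16i16-to-v16i8 truncate should still work without +avx512vl because the input can be widened to a zmm, truncated to a ymm, and the low xmm kept — the subvector-extract trick the commit message describes. The test updates below check for exactly this vpmovwb form.

define <32 x i8> @trunc_v32i16_sketch(<32 x i16> %x) {
  ; expected to select a single vpmovwb %zmm, %ymm under +avx512bw
  %t = trunc <32 x i16> %x to <32 x i8>
  ret <32 x i8> %t
}

define <16 x i8> @trunc_v16i16_sketch(<16 x i16> %x) {
  ; without VLX this should widen to v32i16, use vpmovwb zmm->ymm,
  ; and keep only the low 128 bits of the result
  %t = trunc <16 x i16> %x to <16 x i8>
  ret <16 x i8> %t
}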

llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll

(+37, -20)

@@ -247,24 +247,42 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT: retq
 ;
-; AVX2-LABEL: test_div7_16i8:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpmovsxbw %xmm0, %ymm1
-; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpsrlw $2, %xmm0, %xmm1
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpsubb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpsrlw $7, %xmm0, %xmm0
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2NOBW-LABEL: test_div7_16i8:
+; AVX2NOBW: # BB#0:
+; AVX2NOBW-NEXT: vpmovsxbw %xmm0, %ymm1
+; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX2NOBW-NEXT: vpsrlw $2, %xmm0, %xmm1
+; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX2NOBW-NEXT: vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX2NOBW-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpsubb %xmm2, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpsrlw $7, %xmm0, %xmm0
+; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX2NOBW-NEXT: vzeroupper
+; AVX2NOBW-NEXT: retq
+;
+; AVX512BW-LABEL: test_div7_16i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm1
+; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vpsrlw $2, %xmm0, %xmm1
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512BW-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsubb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsrlw $7, %xmm0, %xmm0
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
 %res = sdiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
 ret <16 x i8> %res
 }
@@ -618,8 +636,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
 ; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm1
 ; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
 ; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
 ; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpsrlw $2, %xmm1, %xmm2
 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2

llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll

(+2, -6)

@@ -231,9 +231,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm1
 ; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm1, %zmm1
 ; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
-; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,4,5,2,3,6,7]
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512BW-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
+; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
 ; AVX512BW-NEXT: vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512BW-NEXT: vpsrlw $2, %ymm0, %ymm1
 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
@@ -560,9 +558,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm1
 ; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm1, %zmm1
 ; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
-; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,4,5,2,3,6,7]
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512BW-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
+; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
 ; AVX512BW-NEXT: vpaddb %ymm0, %ymm1, %ymm1
 ; AVX512BW-NEXT: vpsrlw $2, %ymm1, %ymm2
 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2

llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll

(+31, -17)

@@ -230,21 +230,36 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT: retq
 ;
-; AVX2-LABEL: test_div7_16i8:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $2, %xmm0, %xmm0
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2NOBW-LABEL: test_div7_16i8:
+; AVX2NOBW: # BB#0:
+; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX2NOBW-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX2NOBW-NEXT: vpsrlw $2, %xmm0, %xmm0
+; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2NOBW-NEXT: vzeroupper
+; AVX2NOBW-NEXT: retq
+;
+; AVX512BW-LABEL: test_div7_16i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsrlw $2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
 %res = udiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
 ret <16 x i8> %res
 }
@@ -586,8 +601,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
 ; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
 ; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm2
 ; AVX512BW-NEXT: vpsrlw $1, %xmm2, %xmm2
 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2

llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll

(+2, -6)

@@ -234,9 +234,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
 ; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm1, %zmm1
 ; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
-; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,4,5,2,3,6,7]
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512BW-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
+; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
 ; AVX512BW-NEXT: vpsubb %ymm1, %ymm0, %ymm0
 ; AVX512BW-NEXT: vpsrlw $1, %ymm0, %ymm0
 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
@@ -563,9 +561,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
 ; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm1, %zmm1
 ; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
-; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,4,5,2,3,6,7]
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512BW-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
+; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
 ; AVX512BW-NEXT: vpsubb %ymm1, %ymm0, %ymm2
 ; AVX512BW-NEXT: vpsrlw $1, %ymm2, %ymm2
 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
