
Commit ad13826

Committed Apr 30, 2017
[DAGCombiner] shrink/widen a vselect to match its condition operand size (PR14657)
We discussed shrinking/widening of selects in IR in D26556, and I'll try to get back to that patch eventually. But I'm hoping that this transform is less iffy in the DAG where we can check legality of the select that we want to produce.

A few things to note:

1. We can't wait until after legalization and do this generically because (at least in the x86 tests from PR14657) we'll have PACKSS and bitcasts in the pattern.
2. This might benefit more of the SSE codegen if we lifted the legal-or-custom requirement, but that requires a closer look to make sure we don't end up worse.
3. There's a 'vblendv' opportunity that we're missing that results in andn/and/or in some cases. That should be fixed next.
4. I'm assuming that AVX1 offers the worst of all worlds wrt uneven ISA support with multiple legal vector sizes, but if there are other targets like that, we should add more tests.
5. There's a codegen miracle in the multi-BB tests from PR14657 (the gcc auto-vectorization tests): despite IR that is terrible for the target, this patch allows us to generate the optimal loop code because something post-ISEL is hoisting the splat extends above the vector loops.

Differential Revision: https://reviews.llvm.org/D32620

llvm-svn: 301781
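As a concrete sketch of the transform, here is the IR shape from the sext test in llvm/test/CodeGen/X86/cast-vsel.ll (part of this patch). The %cmp and %sel lines appear in the test; the sext line and its result name are reconstructed here for illustration, and the DAG rewrite is written in the informal notation used by the code comment in the patch, not verbatim compiler output:

; IR: a cast of a one-use vector select whose condition is a same-size compare.
  %cmp = fcmp olt <8 x float> %a, %b
  %sel = select <8 x i1> %cmp, <8 x i16> %c, <8 x i16> %d
  %ext = sext <8 x i16> %sel to <8 x i32>

; DAG before: sext (vsel (setcc %a, %b), %c, %d)
; DAG after:  vsel (setcc %a, %b), (sext %c), (sext %d)

With the condition and the other operands now at matching vector widths, the AVX targets in the test lower this to a compare plus vblendvps instead of the packsswb/and/andn/or sequence.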
Parent: 57fa1de

2 files changed: +164 -135 lines

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

(+61 lines)
@@ -399,6 +399,7 @@ namespace {
     SDValue createBuildVecShuffle(const SDLoc &DL, SDNode *N,
                                   ArrayRef<int> VectorMask, SDValue VecIn1,
                                   SDValue VecIn2, unsigned LeftIdx);
+    SDValue matchVSelectOpSizesWithSetCC(SDNode *N);

     SDValue GetDemandedBits(SDValue V, const APInt &Mask);

@@ -6942,6 +6943,51 @@ SDValue DAGCombiner::CombineExtLoad(SDNode *N) {
   return SDValue(N, 0); // Return N so it doesn't get rechecked!
 }

+/// If we're narrowing or widening the result of a vector select and the final
+/// size is the same size as a setcc (compare) feeding the select, then try to
+/// apply the cast operation to the select's operands because matching vector
+/// sizes for a select condition and other operands should be more efficient.
+SDValue DAGCombiner::matchVSelectOpSizesWithSetCC(SDNode *Cast) {
+  unsigned CastOpcode = Cast->getOpcode();
+  assert((CastOpcode == ISD::SIGN_EXTEND || CastOpcode == ISD::ZERO_EXTEND ||
+          CastOpcode == ISD::TRUNCATE || CastOpcode == ISD::FP_EXTEND ||
+          CastOpcode == ISD::FP_ROUND) &&
+         "Unexpected opcode for vector select narrowing/widening");
+
+  // We only do this transform before legal ops because the pattern may be
+  // obfuscated by target-specific operations after legalization. Do not create
+  // an illegal select op, however, because that may be difficult to lower.
+  EVT VT = Cast->getValueType(0);
+  if (LegalOperations || !TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
+    return SDValue();
+
+  SDValue VSel = Cast->getOperand(0);
+  if (VSel.getOpcode() != ISD::VSELECT || !VSel.hasOneUse() ||
+      VSel.getOperand(0).getOpcode() != ISD::SETCC)
+    return SDValue();
+
+  // Does the setcc have the same vector size as the casted select?
+  SDValue SetCC = VSel.getOperand(0);
+  EVT SetCCVT = getSetCCResultType(SetCC.getOperand(0).getValueType());
+  if (SetCCVT.getSizeInBits() != VT.getSizeInBits())
+    return SDValue();
+
+  // cast (vsel (setcc X), A, B) --> vsel (setcc X), (cast A), (cast B)
+  SDValue A = VSel.getOperand(1);
+  SDValue B = VSel.getOperand(2);
+  SDValue CastA, CastB;
+  SDLoc DL(Cast);
+  if (CastOpcode == ISD::FP_ROUND) {
+    // FP_ROUND (fptrunc) has an extra flag operand to pass along.
+    CastA = DAG.getNode(CastOpcode, DL, VT, A, Cast->getOperand(1));
+    CastB = DAG.getNode(CastOpcode, DL, VT, B, Cast->getOperand(1));
+  } else {
+    CastA = DAG.getNode(CastOpcode, DL, VT, A);
+    CastB = DAG.getNode(CastOpcode, DL, VT, B);
+  }
+  return DAG.getNode(ISD::VSELECT, DL, VT, SetCC, CastA, CastB);
+}
+
 SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
@@ -7165,6 +7211,9 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
       DAG.SignBitIsZero(N0))
     return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0);

+  if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
+    return NewVSel;
+
   return SDValue();
 }

@@ -7498,6 +7547,9 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
                        ShAmt);
   }

+  if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
+    return NewVSel;
+
   return SDValue();
 }

@@ -8292,6 +8344,9 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
     return DAG.getNode(N0.getOpcode(), SL, VTs, X, Y, N0.getOperand(2));
   }

+  if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
+    return NewVSel;
+
   return SDValue();
 }

@@ -10243,6 +10298,9 @@ SDValue DAGCombiner::visitFP_ROUND(SDNode *N) {
                        Tmp, N0.getOperand(1));
   }

+  if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
+    return NewVSel;
+
   return SDValue();
 }

@@ -10309,6 +10367,9 @@ SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) {
     return SDValue(N, 0); // Return N so it doesn't get rechecked!
   }

+  if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
+    return NewVSel;
+
   return SDValue();
 }
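The test updates below exercise each of the hooked cast opcodes. One more informal sketch, using the IR from the fptrunc test in the updated file (DAG notation is again informal, not compiler output): the FP_ROUND case additionally forwards the cast's flag operand to both new casts.

  %cmp = fcmp olt <4 x float> %a, %b
  %sel = select <4 x i1> %cmp, <4 x double> %c, <4 x double> %d
  %tr = fptrunc <4 x double> %sel to <4 x float>

; DAG before: fp_round (vsel (setcc %a, %b), %c, %d), flag
; DAG after:  vsel (setcc %a, %b), (fp_round %c, flag), (fp_round %d, flag)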

llvm/test/CodeGen/X86/cast-vsel.ll

(+103 -135 lines)
@@ -49,26 +49,23 @@ define <8 x i32> @sext(<8 x float> %a, <8 x float> %b, <8 x i16> %c, <8 x i16> %
 ; AVX1-LABEL: sext:
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vcmpltps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpandn %xmm3, %xmm0, %xmm1
-; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpmovsxwd %xmm2, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vpmovsxwd %xmm3, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpmovsxwd %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT: vblendvps %ymm0, %ymm1, %ymm2, %ymm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: sext:
 ; AVX2: # BB#0:
 ; AVX2-NEXT: vcmpltps %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpandn %xmm3, %xmm0, %xmm1
-; AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT: vpmovsxwd %xmm2, %ymm1
+; AVX2-NEXT: vpmovsxwd %xmm3, %ymm2
+; AVX2-NEXT: vblendvps %ymm0, %ymm1, %ymm2, %ymm0
 ; AVX2-NEXT: retq
   %cmp = fcmp olt <8 x float> %a, %b
   %sel = select <8 x i1> %cmp, <8 x i16> %c, <8 x i16> %d
@@ -117,26 +114,23 @@ define <8 x i32> @zext(<8 x float> %a, <8 x float> %b, <8 x i16> %c, <8 x i16> %
 ; AVX1-LABEL: zext:
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vcmpltps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpandn %xmm3, %xmm0, %xmm1
-; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT: vblendvps %ymm0, %ymm1, %ymm2, %ymm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: zext:
 ; AVX2: # BB#0:
 ; AVX2-NEXT: vcmpltps %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpandn %xmm3, %xmm0, %xmm1
-; AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX2-NEXT: vblendvps %ymm0, %ymm1, %ymm2, %ymm0
 ; AVX2-NEXT: retq
   %cmp = fcmp olt <8 x float> %a, %b
   %sel = select <8 x i1> %cmp, <8 x i16> %c, <8 x i16> %d
@@ -173,10 +167,9 @@ define <4 x double> @fpext(<4 x double> %a, <4 x double> %b, <4 x float> %c, <4
 ; AVX-LABEL: fpext:
 ; AVX: # BB#0:
 ; AVX-NEXT: vcmpltpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
-; AVX-NEXT: vcvtps2pd %xmm0, %ymm0
+; AVX-NEXT: vcvtps2pd %xmm2, %ymm1
+; AVX-NEXT: vcvtps2pd %xmm3, %ymm2
+; AVX-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0
 ; AVX-NEXT: retq
   %cmp = fcmp olt <4 x double> %a, %b
   %sel = select <4 x i1> %cmp, <4 x float> %c, <4 x float> %d
@@ -188,64 +181,65 @@ define <8 x i16> @trunc(<8 x i16> %a, <8 x i16> %b, <8 x i32> %c, <8 x i32> %d)
 ; SSE2-LABEL: trunc:
 ; SSE2: # BB#0:
 ; SSE2-NEXT: pcmpeqw %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: pslld $16, %xmm5
+; SSE2-NEXT: psrad $16, %xmm5
+; SSE2-NEXT: pslld $16, %xmm4
+; SSE2-NEXT: psrad $16, %xmm4
+; SSE2-NEXT: packssdw %xmm5, %xmm4
+; SSE2-NEXT: pslld $16, %xmm3
+; SSE2-NEXT: psrad $16, %xmm3
+; SSE2-NEXT: pslld $16, %xmm2
+; SSE2-NEXT: psrad $16, %xmm2
+; SSE2-NEXT: packssdw %xmm3, %xmm2
 ; SSE2-NEXT: pand %xmm0, %xmm2
 ; SSE2-NEXT: pandn %xmm4, %xmm0
 ; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: pandn %xmm5, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: pslld $16, %xmm1
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: pslld $16, %xmm0
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: packssdw %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: trunc:
 ; SSE41: # BB#0:
 ; SSE41-NEXT: pcmpeqw %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE41-NEXT: pmovsxwd %xmm0, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm4
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm5
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSE41-NEXT: pshufb %xmm0, %xmm5
-; SSE41-NEXT: pshufb %xmm0, %xmm4
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE41-NEXT: pshufb %xmm1, %xmm5
+; SSE41-NEXT: pshufb %xmm1, %xmm4
 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: pshufb %xmm1, %xmm3
+; SSE41-NEXT: pshufb %xmm1, %xmm2
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE41-NEXT: pand %xmm0, %xmm2
+; SSE41-NEXT: pandn %xmm4, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: trunc:
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm3, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: trunc:
 ; AVX2: # BB#0:
 ; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm3, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm3
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vpandn %xmm3, %xmm0, %xmm3
+; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
   %cmp = icmp eq <8 x i16> %a, %b
@@ -258,61 +252,38 @@ define <4 x float> @fptrunc(<4 x float> %a, <4 x float> %b, <4 x double> %c, <4
 ; SSE2-LABEL: fptrunc:
 ; SSE2: # BB#0:
 ; SSE2-NEXT: cmpltps %xmm1, %xmm0
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: xorps %xmm6, %xmm6
-; SSE2-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movaps %xmm6, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSE2-NEXT: psrad $31, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: pandn %xmm5, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pandn %xmm4, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: cvtpd2ps %xmm0, %xmm0
-; SSE2-NEXT: cvtpd2ps %xmm1, %xmm1
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: cvtpd2ps %xmm5, %xmm1
+; SSE2-NEXT: cvtpd2ps %xmm4, %xmm4
+; SSE2-NEXT: unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm1[0]
+; SSE2-NEXT: cvtpd2ps %xmm3, %xmm1
+; SSE2-NEXT: cvtpd2ps %xmm2, %xmm2
+; SSE2-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE2-NEXT: andpd %xmm0, %xmm2
+; SSE2-NEXT: andnpd %xmm4, %xmm0
+; SSE2-NEXT: orpd %xmm2, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: fptrunc:
 ; SSE41: # BB#0:
 ; SSE41-NEXT: cmpltps %xmm1, %xmm0
-; SSE41-NEXT: xorps %xmm1, %xmm1
-; SSE41-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE41-NEXT: pmovsxdq %xmm0, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4
+; SSE41-NEXT: cvtpd2ps %xmm3, %xmm1
+; SSE41-NEXT: cvtpd2ps %xmm2, %xmm2
+; SSE41-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE41-NEXT: cvtpd2ps %xmm5, %xmm3
+; SSE41-NEXT: cvtpd2ps %xmm4, %xmm1
+; SSE41-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT: movaps %xmm1, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5
-; SSE41-NEXT: cvtpd2ps %xmm5, %xmm1
-; SSE41-NEXT: cvtpd2ps %xmm4, %xmm0
-; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE41-NEXT: retq
 ;
-; AVX1-LABEL: fptrunc:
-; AVX1: # BB#0:
-; AVX1-NEXT: vcmpltps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm3, %ymm0
-; AVX1-NEXT: vcvtpd2ps %ymm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: fptrunc:
-; AVX2: # BB#0:
-; AVX2-NEXT: vcmpltps %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
-; AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm3, %ymm0
-; AVX2-NEXT: vcvtpd2ps %ymm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX-LABEL: fptrunc:
+; AVX: # BB#0:
+; AVX-NEXT: vcmpltps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vcvtpd2ps %ymm2, %xmm1
+; AVX-NEXT: vcvtpd2ps %ymm3, %xmm2
+; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
   %cmp = fcmp olt <4 x float> %a, %b
   %sel = select <4 x i1> %cmp, <4 x double> %c, <4 x double> %d
   %tr = fptrunc <4 x double> %sel to <4 x float>
@@ -567,20 +538,20 @@ define void @example24(i16 signext %x, i16 signext %y) nounwind {
 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
 ; AVX1-NEXT: movq $-4096, %rax # imm = 0xF000
+; AVX1-NEXT: vpmovsxwd %xmm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vpmovsxwd %xmm1, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxwd %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX1-NEXT: .p2align 4, 0x90
 ; AVX1-NEXT: .LBB6_1: # %vector.body
 ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
 ; AVX1-NEXT: vmovups da+4096(%rax), %ymm2
 ; AVX1-NEXT: vcmpltps db+4096(%rax), %ymm2, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vpacksswb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpandn %xmm1, %xmm2, %xmm3
-; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpmovsxwd %xmm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm2
 ; AVX1-NEXT: vmovups %ymm2, dj+4096(%rax)
 ; AVX1-NEXT: addq $32, %rax
 ; AVX1-NEXT: jne .LBB6_1
@@ -595,18 +566,15 @@ define void @example24(i16 signext %x, i16 signext %y) nounwind {
 ; AVX2-NEXT: vmovd %esi, %xmm1
 ; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
 ; AVX2-NEXT: movq $-4096, %rax # imm = 0xF000
+; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT: vpmovsxwd %xmm1, %ymm1
 ; AVX2-NEXT: .p2align 4, 0x90
 ; AVX2-NEXT: .LBB6_1: # %vector.body
 ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
 ; AVX2-NEXT: vmovups da+4096(%rax), %ymm2
 ; AVX2-NEXT: vcmpltps db+4096(%rax), %ymm2, %ymm2
-; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX2-NEXT: vpacksswb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpandn %xmm1, %xmm2, %xmm3
-; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm2
-; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpmovsxwd %xmm2, %ymm2
-; AVX2-NEXT: vmovdqu %ymm2, dj+4096(%rax)
+; AVX2-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm2
+; AVX2-NEXT: vmovups %ymm2, dj+4096(%rax)
 ; AVX2-NEXT: addq $32, %rax
 ; AVX2-NEXT: jne .LBB6_1
 ; AVX2-NEXT: # BB#2: # %for.end
