Skip to content

Commit 15748d2

Browse files
committedJun 26, 2017
[x86] transform vector inc/dec to use -1 constant (PR33483)
Convert vector increment or decrement to sub/add with an all-ones constant:

    add X, <1, 1...> --> sub X, <-1, -1...>
    sub X, <1, 1...> --> add X, <-1, -1...>

The all-ones vector constant can be materialized using a pcmpeq instruction that is commonly recognized as an idiom (has no register dependency), so that's better than loading a splat 1 constant.

AVX512 uses 'vpternlogd' for 512-bit vectors because there is apparently no better way to produce 512 one-bits.

The general advantages of this lowering are:

1. pcmpeq has lower latency than a memop on every uarch I looked at in Agner's tables, so in theory, this could be better for perf, but...
2. That seems unlikely to affect any OOO implementation, and I can't measure any real perf difference from this transform on Haswell or Jaguar, but...
3. It doesn't look like it from the diffs, but this is an overall size win because we eliminate 16 - 64 constant bytes in the case of a vector load. If we're broadcasting a scalar load (which might itself be a bug), then we're replacing a scalar constant load + broadcast with a single cheap op, so that should always be smaller/better too.
4. This makes the DAG/isel output more consistent - we use pcmpeq already for padd x, -1 and psub x, -1, so we should use that form for +1 too because we can. If there's some reason to favor a constant load on some CPU, let's make the reverse transform for all of these cases (either here in the DAG or in a later machine pass).

This should fix: https://bugs.llvm.org/show_bug.cgi?id=33483

Differential Revision: https://reviews.llvm.org/D34336

llvm-svn: 306289
1 parent 918e6d7 commit 15748d2

21 files changed

+1746
-1605
lines changed
 

‎llvm/lib/Target/X86/X86ISelLowering.cpp

+32
Original file line numberDiff line numberDiff line change
@@ -35065,6 +35065,32 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
3506535065
return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
3506635066
}
3506735067

35068+
/// Convert vector increment or decrement to sub/add with an all-ones constant:
35069+
/// add X, <1, 1...> --> sub X, <-1, -1...>
35070+
/// sub X, <1, 1...> --> add X, <-1, -1...>
35071+
/// The all-ones vector constant can be materialized using a pcmpeq instruction
35072+
/// that is commonly recognized as an idiom (has no register dependency), so
35073+
/// that's better/smaller than loading a splat 1 constant.
35074+
static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
35075+
assert(N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB &&
35076+
"Unexpected opcode for increment/decrement transform");
35077+
35078+
// Pseudo-legality check: getOnesVector() expects one of these types, so bail
35079+
// out and wait for legalization if we have an unsupported vector length.
35080+
EVT VT = N->getValueType(0);
35081+
if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
35082+
return SDValue();
35083+
35084+
SDNode *N1 = N->getOperand(1).getNode();
35085+
APInt SplatVal;
35086+
if (!ISD::isConstantSplatVector(N1, SplatVal) || !SplatVal.isOneValue())
35087+
return SDValue();
35088+
35089+
SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));
35090+
unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
35091+
return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);
35092+
}
35093+
3506835094
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
3506935095
const X86Subtarget &Subtarget) {
3507035096
const SDNodeFlags Flags = N->getFlags();
@@ -35084,6 +35110,9 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
3508435110
isHorizontalBinOp(Op0, Op1, true))
3508535111
return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
3508635112

35113+
if (SDValue V = combineIncDecVector(N, DAG))
35114+
return V;
35115+
3508735116
return combineAddOrSubToADCOrSBB(N, DAG);
3508835117
}
3508935118

@@ -35117,6 +35146,9 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
3511735146
isHorizontalBinOp(Op0, Op1, false))
3511835147
return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
3511935148

35149+
if (SDValue V = combineIncDecVector(N, DAG))
35150+
return V;
35151+
3512035152
return combineAddOrSubToADCOrSBB(N, DAG);
3512135153
}
3512235154

‎llvm/test/CodeGen/X86/avg.ll

+1,042-1,055
Large diffs are not rendered by default.

‎llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll

+5-4
Original file line numberDiff line numberDiff line change
@@ -388,7 +388,8 @@ define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
388388
; CHECK-LABEL: test_x86_sse2_storeu_dq:
389389
; CHECK: ## BB#0:
390390
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
391-
; CHECK-NEXT: vpaddb LCPI34_0, %xmm0, %xmm0
391+
; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
392+
; CHECK-NEXT: vpsubb %xmm1, %xmm0, %xmm0
392393
; CHECK-NEXT: vmovdqu %xmm0, (%eax)
393394
; CHECK-NEXT: retl
394395
%a2 = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
@@ -434,9 +435,9 @@ define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
434435
; CHECK: ## BB#0:
435436
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
436437
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
437-
; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
438-
; CHECK-NEXT: vpaddb %xmm2, %xmm1, %xmm1
439-
; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0
438+
; CHECK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
439+
; CHECK-NEXT: vpsubb %xmm2, %xmm1, %xmm1
440+
; CHECK-NEXT: vpsubb %xmm2, %xmm0, %xmm0
440441
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
441442
; CHECK-NEXT: vmovups %ymm0, (%eax)
442443
; CHECK-NEXT: vzeroupper

‎llvm/test/CodeGen/X86/avx-intrinsics-x86.ll

+4-4
Original file line numberDiff line numberDiff line change
@@ -930,17 +930,17 @@ define void @movnt_dq(i8* %p, <2 x i64> %a1) nounwind {
930930
; AVX-LABEL: movnt_dq:
931931
; AVX: ## BB#0:
932932
; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
933-
; AVX-NEXT: vpaddq LCPI65_0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd4,0x05,A,A,A,A]
934-
; AVX-NEXT: ## fixup A - offset: 4, value: LCPI65_0, kind: FK_Data_4
933+
; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9]
934+
; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfb,0xc1]
935935
; AVX-NEXT: vmovntdq %ymm0, (%eax) ## encoding: [0xc5,0xfd,0xe7,0x00]
936936
; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
937937
; AVX-NEXT: retl ## encoding: [0xc3]
938938
;
939939
; AVX512VL-LABEL: movnt_dq:
940940
; AVX512VL: ## BB#0:
941941
; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
942-
; AVX512VL-NEXT: vpaddq LCPI65_0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0x05,A,A,A,A]
943-
; AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI65_0, kind: FK_Data_4
942+
; AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9]
943+
; AVX512VL-NEXT: vpsubq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfb,0xc1]
944944
; AVX512VL-NEXT: vmovntdq %ymm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe7,0x00]
945945
; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
946946
; AVX512VL-NEXT: retl ## encoding: [0xc3]

‎llvm/test/CodeGen/X86/avx-logic.ll

+4-2
Original file line numberDiff line numberDiff line change
@@ -247,7 +247,8 @@ entry:
247247
define <2 x i64> @vpandn(<2 x i64> %a, <2 x i64> %b) nounwind uwtable readnone ssp {
248248
; CHECK-LABEL: vpandn:
249249
; CHECK: # BB#0: # %entry
250-
; CHECK-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm1
250+
; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
251+
; CHECK-NEXT: vpsubq %xmm1, %xmm0, %xmm1
251252
; CHECK-NEXT: vpandn %xmm0, %xmm1, %xmm0
252253
; CHECK-NEXT: retq
253254
entry:
@@ -261,7 +262,8 @@ entry:
261262
define <2 x i64> @vpand(<2 x i64> %a, <2 x i64> %b) nounwind uwtable readnone ssp {
262263
; CHECK-LABEL: vpand:
263264
; CHECK: # BB#0: # %entry
264-
; CHECK-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
265+
; CHECK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
266+
; CHECK-NEXT: vpsubq %xmm2, %xmm0, %xmm0
265267
; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0
266268
; CHECK-NEXT: retq
267269
entry:

‎llvm/test/CodeGen/X86/avx-vperm2x128.ll

+20-12
Original file line numberDiff line numberDiff line change
@@ -97,14 +97,16 @@ define <32 x i8> @shuffle_v32i8_2323_domain(<32 x i8> %a, <32 x i8> %b) nounwind
9797
; AVX1-LABEL: shuffle_v32i8_2323_domain:
9898
; AVX1: ## BB#0: ## %entry
9999
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
100-
; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
100+
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
101+
; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
101102
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
102103
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
103104
; AVX1-NEXT: retq
104105
;
105106
; AVX2-LABEL: shuffle_v32i8_2323_domain:
106107
; AVX2: ## BB#0: ## %entry
107-
; AVX2-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0
108+
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
109+
; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
108110
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
109111
; AVX2-NEXT: retq
110112
entry:
@@ -127,14 +129,15 @@ entry:
127129
define <4 x i64> @shuffle_v4i64_6701_domain(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
128130
; AVX1-LABEL: shuffle_v4i64_6701_domain:
129131
; AVX1: ## BB#0: ## %entry
130-
; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
132+
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
133+
; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
131134
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
132135
; AVX1-NEXT: retq
133136
;
134137
; AVX2-LABEL: shuffle_v4i64_6701_domain:
135138
; AVX2: ## BB#0: ## %entry
136-
; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
137-
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
139+
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
140+
; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
138141
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
139142
; AVX2-NEXT: retq
140143
entry:
@@ -148,15 +151,16 @@ define <8 x i32> @shuffle_v8i32_u5u7cdef(<8 x i32> %a, <8 x i32> %b) nounwind uw
148151
; AVX1-LABEL: shuffle_v8i32_u5u7cdef:
149152
; AVX1: ## BB#0: ## %entry
150153
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
151-
; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
154+
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
155+
; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
152156
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
153157
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
154158
; AVX1-NEXT: retq
155159
;
156160
; AVX2-LABEL: shuffle_v8i32_u5u7cdef:
157161
; AVX2: ## BB#0: ## %entry
158-
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
159-
; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
162+
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
163+
; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0
160164
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
161165
; AVX2-NEXT: retq
162166
entry:
@@ -169,13 +173,15 @@ entry:
169173
define <16 x i16> @shuffle_v16i16_4501(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone ssp {
170174
; AVX1-LABEL: shuffle_v16i16_4501:
171175
; AVX1: ## BB#0: ## %entry
172-
; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
176+
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
177+
; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm0
173178
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
174179
; AVX1-NEXT: retq
175180
;
176181
; AVX2-LABEL: shuffle_v16i16_4501:
177182
; AVX2: ## BB#0: ## %entry
178-
; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
183+
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
184+
; AVX2-NEXT: vpsubw %ymm2, %ymm0, %ymm0
179185
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
180186
; AVX2-NEXT: retq
181187
entry:
@@ -189,14 +195,16 @@ define <16 x i16> @shuffle_v16i16_4501_mem(<16 x i16>* %a, <16 x i16>* %b) nounw
189195
; AVX1-LABEL: shuffle_v16i16_4501_mem:
190196
; AVX1: ## BB#0: ## %entry
191197
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
192-
; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
198+
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
199+
; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
193200
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[0,1],ymm0[0,1]
194201
; AVX1-NEXT: retq
195202
;
196203
; AVX2-LABEL: shuffle_v16i16_4501_mem:
197204
; AVX2: ## BB#0: ## %entry
198205
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
199-
; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
206+
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
207+
; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0
200208
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = mem[0,1],ymm0[0,1]
201209
; AVX2-NEXT: retq
202210
entry:

‎llvm/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll

+2-1
Original file line numberDiff line numberDiff line change
@@ -382,7 +382,8 @@ define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
382382
; CHECK-LABEL: test_x86_avx_storeu_dq_256:
383383
; CHECK: ## BB#0:
384384
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
385-
; CHECK-NEXT: vpaddb LCPI34_0, %ymm0, %ymm0
385+
; CHECK-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
386+
; CHECK-NEXT: vpsubb %ymm1, %ymm0, %ymm0
386387
; CHECK-NEXT: vmovdqu %ymm0, (%eax)
387388
; CHECK-NEXT: vzeroupper
388389
; CHECK-NEXT: retl

‎llvm/test/CodeGen/X86/avx2-logic.ll

+16-12
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,15 @@
55
define <4 x i64> @vpandn(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
66
; X32-LABEL: vpandn:
77
; X32: ## BB#0: ## %entry
8-
; X32-NEXT: vpaddq LCPI0_0, %ymm0, %ymm1
8+
; X32-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
9+
; X32-NEXT: vpsubq %ymm1, %ymm0, %ymm1
910
; X32-NEXT: vpandn %ymm0, %ymm1, %ymm0
1011
; X32-NEXT: retl
1112
;
1213
; X64-LABEL: vpandn:
1314
; X64: ## BB#0: ## %entry
14-
; X64-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1
15-
; X64-NEXT: vpaddq %ymm1, %ymm0, %ymm1
15+
; X64-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
16+
; X64-NEXT: vpsubq %ymm1, %ymm0, %ymm1
1617
; X64-NEXT: vpandn %ymm0, %ymm1, %ymm0
1718
; X64-NEXT: retq
1819
entry:
@@ -26,14 +27,15 @@ entry:
2627
define <4 x i64> @vpand(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
2728
; X32-LABEL: vpand:
2829
; X32: ## BB#0: ## %entry
29-
; X32-NEXT: vpaddq LCPI1_0, %ymm0, %ymm0
30+
; X32-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
31+
; X32-NEXT: vpsubq %ymm2, %ymm0, %ymm0
3032
; X32-NEXT: vpand %ymm1, %ymm0, %ymm0
3133
; X32-NEXT: retl
3234
;
3335
; X64-LABEL: vpand:
3436
; X64: ## BB#0: ## %entry
35-
; X64-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
36-
; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0
37+
; X64-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
38+
; X64-NEXT: vpsubq %ymm2, %ymm0, %ymm0
3739
; X64-NEXT: vpand %ymm1, %ymm0, %ymm0
3840
; X64-NEXT: retq
3941
entry:
@@ -46,14 +48,15 @@ entry:
4648
define <4 x i64> @vpor(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
4749
; X32-LABEL: vpor:
4850
; X32: ## BB#0: ## %entry
49-
; X32-NEXT: vpaddq LCPI2_0, %ymm0, %ymm0
51+
; X32-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
52+
; X32-NEXT: vpsubq %ymm2, %ymm0, %ymm0
5053
; X32-NEXT: vpor %ymm1, %ymm0, %ymm0
5154
; X32-NEXT: retl
5255
;
5356
; X64-LABEL: vpor:
5457
; X64: ## BB#0: ## %entry
55-
; X64-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
56-
; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0
58+
; X64-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
59+
; X64-NEXT: vpsubq %ymm2, %ymm0, %ymm0
5760
; X64-NEXT: vpor %ymm1, %ymm0, %ymm0
5861
; X64-NEXT: retq
5962
entry:
@@ -66,14 +69,15 @@ entry:
6669
define <4 x i64> @vpxor(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
6770
; X32-LABEL: vpxor:
6871
; X32: ## BB#0: ## %entry
69-
; X32-NEXT: vpaddq LCPI3_0, %ymm0, %ymm0
72+
; X32-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
73+
; X32-NEXT: vpsubq %ymm2, %ymm0, %ymm0
7074
; X32-NEXT: vpxor %ymm1, %ymm0, %ymm0
7175
; X32-NEXT: retl
7276
;
7377
; X64-LABEL: vpxor:
7478
; X64: ## BB#0: ## %entry
75-
; X64-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
76-
; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0
79+
; X64-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
80+
; X64-NEXT: vpsubq %ymm2, %ymm0, %ymm0
7781
; X64-NEXT: vpxor %ymm1, %ymm0, %ymm0
7882
; X64-NEXT: retq
7983
entry:

‎llvm/test/CodeGen/X86/select.ll

+6-4
Original file line numberDiff line numberDiff line change
@@ -321,8 +321,9 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2)
321321
; GENERIC-NEXT: LBB7_6:
322322
; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
323323
; GENERIC-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
324-
; GENERIC-NEXT: psubd {{.*}}(%rip), %xmm1
325-
; GENERIC-NEXT: psubd {{.*}}(%rip), %xmm0
324+
; GENERIC-NEXT: pcmpeqd %xmm2, %xmm2
325+
; GENERIC-NEXT: paddd %xmm2, %xmm1
326+
; GENERIC-NEXT: paddd %xmm2, %xmm0
326327
; GENERIC-NEXT: movq %xmm0, 16(%rsi)
327328
; GENERIC-NEXT: movdqa %xmm1, (%rsi)
328329
; GENERIC-NEXT: retq
@@ -361,8 +362,9 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2)
361362
; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
362363
; ATOM-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
363364
; ATOM-NEXT: LBB7_6:
364-
; ATOM-NEXT: psubd {{.*}}(%rip), %xmm0
365-
; ATOM-NEXT: psubd {{.*}}(%rip), %xmm1
365+
; ATOM-NEXT: pcmpeqd %xmm2, %xmm2
366+
; ATOM-NEXT: paddd %xmm2, %xmm0
367+
; ATOM-NEXT: paddd %xmm2, %xmm1
366368
; ATOM-NEXT: movq %xmm0, 16(%rsi)
367369
; ATOM-NEXT: movdqa %xmm1, (%rsi)
368370
; ATOM-NEXT: retq

‎llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll

+2-1
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,8 @@ define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
8383
; CHECK-LABEL: test_x86_sse2_storeu_dq:
8484
; CHECK: ## BB#0:
8585
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
86-
; CHECK-NEXT: paddb LCPI7_0, %xmm0
86+
; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
87+
; CHECK-NEXT: psubb %xmm1, %xmm0
8788
; CHECK-NEXT: movdqu %xmm0, (%eax)
8889
; CHECK-NEXT: retl
8990
%a2 = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>

‎llvm/test/CodeGen/X86/vec_ctbits.ll

+20-18
Original file line numberDiff line numberDiff line change
@@ -12,20 +12,21 @@ define <2 x i64> @footz(<2 x i64> %a) nounwind {
1212
; CHECK-NEXT: pxor %xmm2, %xmm2
1313
; CHECK-NEXT: psubq %xmm0, %xmm2
1414
; CHECK-NEXT: pand %xmm0, %xmm2
15-
; CHECK-NEXT: psubq {{.*}}(%rip), %xmm2
16-
; CHECK-NEXT: movdqa %xmm2, %xmm0
15+
; CHECK-NEXT: pcmpeqd %xmm3, %xmm3
16+
; CHECK-NEXT: paddq %xmm2, %xmm3
17+
; CHECK-NEXT: movdqa %xmm3, %xmm0
1718
; CHECK-NEXT: psrlq $1, %xmm0
1819
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
19-
; CHECK-NEXT: psubq %xmm0, %xmm2
20+
; CHECK-NEXT: psubq %xmm0, %xmm3
2021
; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
21-
; CHECK-NEXT: movdqa %xmm2, %xmm3
22-
; CHECK-NEXT: pand %xmm0, %xmm3
23-
; CHECK-NEXT: psrlq $2, %xmm2
22+
; CHECK-NEXT: movdqa %xmm3, %xmm2
2423
; CHECK-NEXT: pand %xmm0, %xmm2
25-
; CHECK-NEXT: paddq %xmm3, %xmm2
26-
; CHECK-NEXT: movdqa %xmm2, %xmm0
24+
; CHECK-NEXT: psrlq $2, %xmm3
25+
; CHECK-NEXT: pand %xmm0, %xmm3
26+
; CHECK-NEXT: paddq %xmm2, %xmm3
27+
; CHECK-NEXT: movdqa %xmm3, %xmm0
2728
; CHECK-NEXT: psrlq $4, %xmm0
28-
; CHECK-NEXT: paddq %xmm2, %xmm0
29+
; CHECK-NEXT: paddq %xmm3, %xmm0
2930
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
3031
; CHECK-NEXT: psadbw %xmm1, %xmm0
3132
; CHECK-NEXT: retq
@@ -115,20 +116,21 @@ define <2 x i32> @promtz(<2 x i32> %a) nounwind {
115116
; CHECK-NEXT: pxor %xmm2, %xmm2
116117
; CHECK-NEXT: psubq %xmm0, %xmm2
117118
; CHECK-NEXT: pand %xmm0, %xmm2
118-
; CHECK-NEXT: psubq {{.*}}(%rip), %xmm2
119-
; CHECK-NEXT: movdqa %xmm2, %xmm0
119+
; CHECK-NEXT: pcmpeqd %xmm3, %xmm3
120+
; CHECK-NEXT: paddq %xmm2, %xmm3
121+
; CHECK-NEXT: movdqa %xmm3, %xmm0
120122
; CHECK-NEXT: psrlq $1, %xmm0
121123
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
122-
; CHECK-NEXT: psubq %xmm0, %xmm2
124+
; CHECK-NEXT: psubq %xmm0, %xmm3
123125
; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
124-
; CHECK-NEXT: movdqa %xmm2, %xmm3
125-
; CHECK-NEXT: pand %xmm0, %xmm3
126-
; CHECK-NEXT: psrlq $2, %xmm2
126+
; CHECK-NEXT: movdqa %xmm3, %xmm2
127127
; CHECK-NEXT: pand %xmm0, %xmm2
128-
; CHECK-NEXT: paddq %xmm3, %xmm2
129-
; CHECK-NEXT: movdqa %xmm2, %xmm0
128+
; CHECK-NEXT: psrlq $2, %xmm3
129+
; CHECK-NEXT: pand %xmm0, %xmm3
130+
; CHECK-NEXT: paddq %xmm2, %xmm3
131+
; CHECK-NEXT: movdqa %xmm3, %xmm0
130132
; CHECK-NEXT: psrlq $4, %xmm0
131-
; CHECK-NEXT: paddq %xmm2, %xmm0
133+
; CHECK-NEXT: paddq %xmm3, %xmm0
132134
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
133135
; CHECK-NEXT: psadbw %xmm1, %xmm0
134136
; CHECK-NEXT: retq

‎llvm/test/CodeGen/X86/vector-tzcnt-128.ll

+406-352
Large diffs are not rendered by default.

‎llvm/test/CodeGen/X86/vector-tzcnt-256.ll

+92-70
Large diffs are not rendered by default.

‎llvm/test/CodeGen/X86/vector-tzcnt-512.ll

+64-44
Large diffs are not rendered by default.

‎llvm/test/CodeGen/X86/widen_arith-1.ll

+2-2
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ define void @update(<3 x i8>* %dst, <3 x i8>* %src, i32 %n) nounwind {
66
; CHECK: # BB#0: # %entry
77
; CHECK-NEXT: subl $12, %esp
88
; CHECK-NEXT: movl $0, (%esp)
9-
; CHECK-NEXT: movdqa {{.*#+}} xmm0 = <1,1,1,u>
9+
; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
1010
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
1111
; CHECK-NEXT: jmp .LBB0_1
1212
; CHECK-NEXT: .p2align 4, 0x90
@@ -16,7 +16,7 @@ define void @update(<3 x i8>* %dst, <3 x i8>* %src, i32 %n) nounwind {
1616
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
1717
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
1818
; CHECK-NEXT: pmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
19-
; CHECK-NEXT: paddd %xmm0, %xmm2
19+
; CHECK-NEXT: psubd %xmm0, %xmm2
2020
; CHECK-NEXT: pextrb $8, %xmm2, 2(%ecx,%eax,4)
2121
; CHECK-NEXT: pshufb %xmm1, %xmm2
2222
; CHECK-NEXT: pextrw $0, %xmm2, (%ecx,%eax,4)

‎llvm/test/CodeGen/X86/widen_arith-2.ll

+2-2
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind {
88
; CHECK: # BB#0: # %entry
99
; CHECK-NEXT: subl $12, %esp
1010
; CHECK-NEXT: movl $0, (%esp)
11-
; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1]
11+
; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
1212
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4]
1313
; CHECK-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1414
; CHECK-NEXT: jmp .LBB0_1
@@ -26,7 +26,7 @@ define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind {
2626
; CHECK-NEXT: movl (%esp), %ecx
2727
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
2828
; CHECK-NEXT: pmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
29-
; CHECK-NEXT: paddw %xmm0, %xmm3
29+
; CHECK-NEXT: psubw %xmm0, %xmm3
3030
; CHECK-NEXT: pand %xmm1, %xmm3
3131
; CHECK-NEXT: pshufb %xmm2, %xmm3
3232
; CHECK-NEXT: movq %xmm3, (%edx,%ecx,8)

‎llvm/test/CodeGen/X86/widen_arith-3.ll

+2-2
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,8 @@ define void @update(<3 x i16>* %dst, <3 x i16>* %src, i32 %n) nounwind {
1414
; CHECK-NEXT: andl $-8, %esp
1515
; CHECK-NEXT: subl $40, %esp
1616
; CHECK-NEXT: movl {{\.LCPI.*}}, %eax
17-
; CHECK-NEXT: movdqa {{.*#+}} xmm0 = <1,1,1,u>
1817
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
18+
; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
1919
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
2020
; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp)
2121
; CHECK-NEXT: movw $1, {{[0-9]+}}(%esp)
@@ -29,7 +29,7 @@ define void @update(<3 x i16>* %dst, <3 x i16>* %src, i32 %n) nounwind {
2929
; CHECK-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
3030
; CHECK-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
3131
; CHECK-NEXT: pinsrd $2, 4(%edx,%eax,8), %xmm2
32-
; CHECK-NEXT: paddd %xmm0, %xmm2
32+
; CHECK-NEXT: psubd %xmm0, %xmm2
3333
; CHECK-NEXT: pextrw $4, %xmm2, 4(%ecx,%eax,8)
3434
; CHECK-NEXT: pshufb %xmm1, %xmm2
3535
; CHECK-NEXT: movd %xmm2, (%ecx,%eax,8)

‎llvm/test/CodeGen/X86/widen_cast-2.ll

+9-10
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,7 @@ define void @convert(<7 x i32>* %dst, <14 x i16>* %src) nounwind {
77
; CHECK: # BB#0: # %entry
88
; CHECK-NEXT: pushl %eax
99
; CHECK-NEXT: movl $0, (%esp)
10-
; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1]
11-
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <1,1,1,1,1,1,u,u>
10+
; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
1211
; CHECK-NEXT: cmpl $3, (%esp)
1312
; CHECK-NEXT: jg .LBB0_3
1413
; CHECK-NEXT: .p2align 4, 0x90
@@ -18,14 +17,14 @@ define void @convert(<7 x i32>* %dst, <14 x i16>* %src) nounwind {
1817
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
1918
; CHECK-NEXT: shll $5, %eax
2019
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
21-
; CHECK-NEXT: movdqa (%edx,%eax), %xmm2
22-
; CHECK-NEXT: paddw %xmm0, %xmm2
23-
; CHECK-NEXT: movdqa 16(%edx,%eax), %xmm3
24-
; CHECK-NEXT: paddw %xmm1, %xmm3
25-
; CHECK-NEXT: pextrd $2, %xmm3, 24(%ecx,%eax)
26-
; CHECK-NEXT: pextrd $1, %xmm3, 20(%ecx,%eax)
27-
; CHECK-NEXT: movd %xmm3, 16(%ecx,%eax)
28-
; CHECK-NEXT: movdqa %xmm2, (%ecx,%eax)
20+
; CHECK-NEXT: movdqa (%edx,%eax), %xmm1
21+
; CHECK-NEXT: movdqa 16(%edx,%eax), %xmm2
22+
; CHECK-NEXT: psubw %xmm0, %xmm1
23+
; CHECK-NEXT: psubw %xmm0, %xmm2
24+
; CHECK-NEXT: pextrd $2, %xmm2, 24(%ecx,%eax)
25+
; CHECK-NEXT: pextrd $1, %xmm2, 20(%ecx,%eax)
26+
; CHECK-NEXT: movd %xmm2, 16(%ecx,%eax)
27+
; CHECK-NEXT: movdqa %xmm1, (%ecx,%eax)
2928
; CHECK-NEXT: incl (%esp)
3029
; CHECK-NEXT: cmpl $3, (%esp)
3130
; CHECK-NEXT: jle .LBB0_2

‎llvm/test/CodeGen/X86/widen_cast-3.ll

+4-2
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,17 @@ define void @convert(<12 x i8>* %dst.addr, <3 x i32> %src) nounwind {
88
; X86-LABEL: convert:
99
; X86: # BB#0:
1010
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
11-
; X86-NEXT: paddd {{\.LCPI.*}}, %xmm0
11+
; X86-NEXT: pcmpeqd %xmm1, %xmm1
12+
; X86-NEXT: psubd %xmm1, %xmm0
1213
; X86-NEXT: pextrd $2, %xmm0, 8(%eax)
1314
; X86-NEXT: pextrd $1, %xmm0, 4(%eax)
1415
; X86-NEXT: movd %xmm0, (%eax)
1516
; X86-NEXT: retl
1617
;
1718
; X64-LABEL: convert:
1819
; X64: # BB#0:
19-
; X64-NEXT: paddd {{.*}}(%rip), %xmm0
20+
; X64-NEXT: pcmpeqd %xmm1, %xmm1
21+
; X64-NEXT: psubd %xmm1, %xmm0
2022
; X64-NEXT: pextrd $2, %xmm0, 8(%rdi)
2123
; X64-NEXT: movq %xmm0, (%rdi)
2224
; X64-NEXT: retq

‎llvm/test/CodeGen/X86/widen_cast-4.ll

+4-4
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind {
99
; NARROW: # BB#0: # %entry
1010
; NARROW-NEXT: subl $12, %esp
1111
; NARROW-NEXT: movl $0, (%esp)
12-
; NARROW-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1]
12+
; NARROW-NEXT: pcmpeqd %xmm0, %xmm0
1313
; NARROW-NEXT: movdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1414
; NARROW-NEXT: jmp .LBB0_1
1515
; NARROW-NEXT: .p2align 4, 0x90
@@ -26,7 +26,7 @@ define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind {
2626
; NARROW-NEXT: movl (%esp), %ecx
2727
; NARROW-NEXT: movl {{[0-9]+}}(%esp), %edx
2828
; NARROW-NEXT: pmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
29-
; NARROW-NEXT: paddw %xmm0, %xmm2
29+
; NARROW-NEXT: psubw %xmm0, %xmm2
3030
; NARROW-NEXT: psllw $8, %xmm2
3131
; NARROW-NEXT: psraw $8, %xmm2
3232
; NARROW-NEXT: psraw $2, %xmm2
@@ -46,7 +46,7 @@ define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind {
4646
; WIDE: # BB#0: # %entry
4747
; WIDE-NEXT: subl $12, %esp
4848
; WIDE-NEXT: movl $0, (%esp)
49-
; WIDE-NEXT: movdqa {{.*#+}} xmm0 = <1,1,1,1,1,1,1,1,u,u,u,u,u,u,u,u>
49+
; WIDE-NEXT: pcmpeqd %xmm0, %xmm0
5050
; WIDE-NEXT: movdqa {{.*#+}} xmm1 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
5151
; WIDE-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
5252
; WIDE-NEXT: jmp .LBB0_1
@@ -65,7 +65,7 @@ define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind {
6565
; WIDE-NEXT: movl {{[0-9]+}}(%esp), %edx
6666
; WIDE-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
6767
; WIDE-NEXT: pinsrd $1, 4(%eax,%ecx,8), %xmm3
68-
; WIDE-NEXT: paddb %xmm0, %xmm3
68+
; WIDE-NEXT: psubb %xmm0, %xmm3
6969
; WIDE-NEXT: psrlw $2, %xmm3
7070
; WIDE-NEXT: pand %xmm1, %xmm3
7171
; WIDE-NEXT: pxor %xmm2, %xmm3

‎llvm/test/CodeGen/X86/widen_conv-1.ll

+8-4
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,8 @@ define void @convert_v3i32_to_v3i8(<3 x i8>* %dst.addr, <3 x i32>* %src.addr) no
3535
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
3636
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
3737
; X86-NEXT: movdqa (%ecx), %xmm0
38-
; X86-NEXT: paddd {{\.LCPI.*}}, %xmm0
38+
; X86-NEXT: pcmpeqd %xmm1, %xmm1
39+
; X86-NEXT: psubd %xmm1, %xmm0
3940
; X86-NEXT: pextrb $8, %xmm0, 2(%eax)
4041
; X86-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
4142
; X86-NEXT: pextrw $0, %xmm0, (%eax)
@@ -45,7 +46,8 @@ define void @convert_v3i32_to_v3i8(<3 x i8>* %dst.addr, <3 x i32>* %src.addr) no
4546
; X64-LABEL: convert_v3i32_to_v3i8:
4647
; X64: # BB#0: # %entry
4748
; X64-NEXT: movdqa (%rsi), %xmm0
48-
; X64-NEXT: paddd {{.*}}(%rip), %xmm0
49+
; X64-NEXT: pcmpeqd %xmm1, %xmm1
50+
; X64-NEXT: psubd %xmm1, %xmm0
4951
; X64-NEXT: pextrb $8, %xmm0, 2(%rdi)
5052
; X64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
5153
; X64-NEXT: pextrw $0, %xmm0, (%rdi)
@@ -70,7 +72,8 @@ define void @convert_v5i16_to_v5i8(<5 x i8>* %dst.addr, <5 x i16>* %src.addr) no
7072
; X86-NEXT: movl 8(%ebp), %eax
7173
; X86-NEXT: movl 12(%ebp), %ecx
7274
; X86-NEXT: movdqa (%ecx), %xmm0
73-
; X86-NEXT: paddw {{\.LCPI.*}}, %xmm0
75+
; X86-NEXT: pcmpeqd %xmm1, %xmm1
76+
; X86-NEXT: psubw %xmm1, %xmm0
7477
; X86-NEXT: pextrb $8, %xmm0, 4(%eax)
7578
; X86-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
7679
; X86-NEXT: movd %xmm0, (%eax)
@@ -81,7 +84,8 @@ define void @convert_v5i16_to_v5i8(<5 x i8>* %dst.addr, <5 x i16>* %src.addr) no
8184
; X64-LABEL: convert_v5i16_to_v5i8:
8285
; X64: # BB#0: # %entry
8386
; X64-NEXT: movdqa (%rsi), %xmm0
84-
; X64-NEXT: paddw {{.*}}(%rip), %xmm0
87+
; X64-NEXT: pcmpeqd %xmm1, %xmm1
88+
; X64-NEXT: psubw %xmm1, %xmm0
8589
; X64-NEXT: pextrb $8, %xmm0, 4(%rdi)
8690
; X64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
8791
; X64-NEXT: movd %xmm0, (%rdi)

0 commit comments

Comments
 (0)
Please sign in to comment.