
Commit b356d04

Committed Nov 20, 2018
[TargetLowering] Improve SimplifyDemandedVectorElts/SimplifyDemandedBits support
For bitcast nodes from larger element types, add the ability for SimplifyDemandedVectorElts to call SimplifyDemandedBits by merging the elts mask to a bits mask.

I've raised https://bugs.llvm.org/show_bug.cgi?id=39689 to deal with the few places where SimplifyDemandedBits's lack of vector handling is a problem.

Differential Revision: https://reviews.llvm.org/D54679

llvm-svn: 347301
1 parent a6fb85f commit b356d04

7 files changed: +96, -518 lines


llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 17 additions & 0 deletions

@@ -1460,6 +1460,23 @@ bool TargetLowering::SimplifyDemandedVectorElts(
                                      TLO, Depth + 1))
         return true;
 
+      // Try calling SimplifyDemandedBits, converting demanded elts to the bits
+      // of the large element.
+      // TODO - bigendian once we have test coverage.
+      if (TLO.DAG.getDataLayout().isLittleEndian()) {
+        unsigned SrcEltSizeInBits = SrcVT.getScalarSizeInBits();
+        APInt SrcDemandedBits = APInt::getNullValue(SrcEltSizeInBits);
+        for (unsigned i = 0; i != NumElts; ++i)
+          if (DemandedElts[i]) {
+            unsigned Ofs = (i % Scale) * EltSizeInBits;
+            SrcDemandedBits.setBits(Ofs, Ofs + EltSizeInBits);
+          }
+
+        KnownBits Known;
+        if (SimplifyDemandedBits(Src, SrcDemandedBits, Known, TLO, Depth + 1))
+          return true;
+      }
+
       // If the src element is zero/undef then all the output elements will be -
       // only demanded elements are guaranteed to be correct.
       for (unsigned i = 0; i != NumSrcElts; ++i) {
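
As a rough standalone illustration of the conversion above (a sketch only: plain uint64_t masks stand in for APInt, the shapes are hard-coded to a little-endian v2i64 -> v4i32 bitcast, and none of this is the LLVM API), demanding elements of the narrow result becomes demanding 32-bit ranges of each wide source element:

    #include <cassert>
    #include <cstdint>

    // Widen a v4i32 demanded-elts mask (4-bit mask) into a demanded-bits mask
    // for the i64 source elements of a v2i64 -> v4i32 bitcast. As in the patch,
    // one per-scalar mask is accumulated across all narrow elements via i % Scale.
    uint64_t widenDemandedEltsToBits(unsigned DemandedElts) {
      const unsigned NumElts = 4, NumSrcElts = 2, EltSizeInBits = 32;
      const unsigned Scale = NumElts / NumSrcElts; // narrow elts per wide elt
      uint64_t SrcDemandedBits = 0;                // bits of one i64 source elt
      for (unsigned i = 0; i != NumElts; ++i)
        if (DemandedElts & (1u << i)) {
          unsigned Ofs = (i % Scale) * EltSizeInBits;
          SrcDemandedBits |= 0xFFFFFFFFULL << Ofs;
        }
      return SrcDemandedBits;
    }

    int main() {
      // Demanding only the even i32 elements (0 and 2) maps to the low half of
      // every i64 source element; the high 32 bits are free to be simplified.
      assert(widenDemandedEltsToBits(/*0b0101*/ 0x5u) == 0x00000000FFFFFFFFULL);
      // Demanding all four i32 elements demands every source bit.
      assert(widenDemandedEltsToBits(/*0b1111*/ 0xFu) == 0xFFFFFFFFFFFFFFFFULL);
      return 0;
    }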

llvm/test/CodeGen/X86/known-bits-vector.ll

Lines changed: 2 additions & 10 deletions

@@ -158,20 +158,12 @@ define <4 x float> @knownbits_mask_shuffle_uitofp(<4 x i32> %a0) nounwind {
 define <4 x float> @knownbits_mask_or_shuffle_uitofp(<4 x i32> %a0) nounwind {
 ; X32-LABEL: knownbits_mask_or_shuffle_uitofp:
 ; X32: # %bb.0:
-; X32-NEXT: vpor {{\.LCPI.*}}, %xmm0, %xmm0
-; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X32-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6]
-; X32-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X32-NEXT: vcvtdq2ps %xmm0, %xmm0
+; X32-NEXT: vmovaps {{.*#+}} xmm0 = [6.5535E+4,6.5535E+4,6.5535E+4,6.5535E+4]
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: knownbits_mask_or_shuffle_uitofp:
 ; X64: # %bb.0:
-; X64-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X64-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6]
-; X64-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X64-NEXT: vcvtdq2ps %xmm0, %xmm0
+; X64-NEXT: vmovaps {{.*#+}} xmm0 = [6.5535E+4,6.5535E+4,6.5535E+4,6.5535E+4]
 ; X64-NEXT: retq
 %1 = and <4 x i32> %a0, <i32 -1, i32 -1, i32 255, i32 4085>
 %2 = or <4 x i32> %1, <i32 65535, i32 65535, i32 65535, i32 65535>
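
The new constant splat checked above is plain bit arithmetic: in the lanes masked with 255 and 4085, OR-ing with 65535 pins every low bit, so those lanes are exactly 65535 and uitofp turns them into the 6.5535E+4 splat. A minimal standalone check of that reasoning (not part of the test file):

    #include <cassert>
    #include <cstdint>

    int main() {
      // Any value masked down to 255 or 4085 and then OR'd with 65535 is 65535,
      // because both AND masks are subsets of the low 16 bits.
      for (uint32_t x = 0; x < (1u << 16); ++x) {
        assert(((x & 255u) | 65535u) == 65535u);
        assert(((x & 4085u) | 65535u) == 65535u);
      }
      // 65535 is exactly representable as a float: the 6.5535E+4 in the asm.
      assert(static_cast<float>(65535u) == 65535.0f);
      return 0;
    }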

llvm/test/CodeGen/X86/known-signbits-vector.ll

Lines changed: 4 additions & 9 deletions

@@ -230,6 +230,7 @@ define <4 x double> @signbits_sext_shuffle_sitofp(<4 x i32> %a0, <4 x i64> %a1)
 ret <4 x double> %3
 }
 
+; TODO: Fix vpshufd+vpsrlq -> vpshufd/vpermilps
 define <2 x double> @signbits_ashr_concat_ashr_extract_sitofp(<2 x i64> %a0, <4 x i64> %a1) nounwind {
 ; X32-LABEL: signbits_ashr_concat_ashr_extract_sitofp:
 ; X32: # %bb.0:
@@ -239,7 +240,8 @@ define <2 x double> @signbits_ashr_concat_ashr_extract_sitofp(<2 x i64> %a0, <4
 ;
 ; X64-LABEL: signbits_ashr_concat_ashr_extract_sitofp:
 ; X64: # %bb.0:
-; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X64-NEXT: vpsrlq $32, %xmm0, %xmm0
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-NEXT: vcvtdq2pd %xmm0, %xmm0
 ; X64-NEXT: retq
 %1 = ashr <2 x i64> %a0, <i64 16, i64 16>
@@ -255,20 +257,13 @@ define float @signbits_ashr_sext_sextinreg_and_extract_sitofp(<2 x i64> %a0, <2
 ; X32-LABEL: signbits_ashr_sext_sextinreg_and_extract_sitofp:
 ; X32: # %bb.0:
 ; X32-NEXT: pushl %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: vpsrlq $60, %xmm0, %xmm2
 ; X32-NEXT: vpsrlq $61, %xmm0, %xmm0
 ; X32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
 ; X32-NEXT: vmovdqa {{.*#+}} xmm2 = [4,0,0,0,8,0,0,0]
 ; X32-NEXT: vpxor %xmm2, %xmm0, %xmm0
 ; X32-NEXT: vpsubq %xmm2, %xmm0, %xmm0
-; X32-NEXT: vpinsrd $0, %eax, %xmm1, %xmm1
-; X32-NEXT: sarl $31, %eax
-; X32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
-; X32-NEXT: vpsllq $20, %xmm1, %xmm1
-; X32-NEXT: vpsrad $20, %xmm1, %xmm2
-; X32-NEXT: vpsrlq $20, %xmm1, %xmm1
-; X32-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; X32-NEXT: vpinsrd $0, {{[0-9]+}}(%esp), %xmm1, %xmm1
 ; X32-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; X32-NEXT: vmovd %xmm0, %eax
 ; X32-NEXT: vcvtsi2ssl %eax, %xmm3, %xmm0

llvm/test/CodeGen/X86/reduce-trunc-shl.ll

Lines changed: 1 addition & 5 deletions

@@ -72,11 +72,7 @@ define <8 x i16> @trunc_shl_17_v8i16_v8i32(<8 x i32> %a) {
 ;
 ; AVX2-LABEL: trunc_shl_17_v8i16_v8i32:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpslld $17, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT: retq
 %shl = shl <8 x i32> %a, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
 %conv = trunc <8 x i32> %shl to <8 x i16>
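
The vxorps fold in this test is the observation that a left shift by 17 leaves nothing in the low 16 bits, so truncating the i32 lanes to i16 always yields zero. A minimal standalone check:

    #include <cassert>
    #include <cstdint>

    int main() {
      // shl by 17 clears bits 0..16 of the i32, and the i16 truncation keeps
      // only bits 0..15, so every lane of the result is zero.
      for (uint64_t x = 0; x <= 0xFFFFFFFFull; x += 0x10001ull)
        assert(static_cast<uint16_t>(static_cast<uint32_t>(x) << 17) == 0);
      return 0;
    }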

llvm/test/CodeGen/X86/shrink_vmul.ll

Lines changed: 10 additions & 40 deletions

@@ -1154,10 +1154,7 @@ define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readon
 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
 ; X86-SSE-NEXT: pmuludq %xmm0, %xmm1
-; X86-SSE-NEXT: pmuludq %xmm0, %xmm2
-; X86-SSE-NEXT: psllq $32, %xmm2
-; X86-SSE-NEXT: paddq %xmm1, %xmm2
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4)
 ; X86-SSE-NEXT: popl %esi
 ; X86-SSE-NEXT: retl
@@ -1191,10 +1188,7 @@ define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readon
 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
 ; X64-SSE-NEXT: pmuludq %xmm0, %xmm1
-; X64-SSE-NEXT: pmuludq %xmm0, %xmm2
-; X64-SSE-NEXT: psllq $32, %xmm2
-; X64-SSE-NEXT: paddq %xmm1, %xmm2
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4)
 ; X64-SSE-NEXT: retq
 ;
@@ -1952,15 +1946,7 @@ define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) {
 ; X86-SSE-NEXT: pxor %xmm1, %xmm1
 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = <0,u,65536,u>
-; X86-SSE-NEXT: pmuludq %xmm2, %xmm1
-; X86-SSE-NEXT: movdqa %xmm2, %xmm3
-; X86-SSE-NEXT: psrlq $32, %xmm3
-; X86-SSE-NEXT: pmuludq %xmm0, %xmm3
-; X86-SSE-NEXT: paddq %xmm1, %xmm3
-; X86-SSE-NEXT: psllq $32, %xmm3
-; X86-SSE-NEXT: pmuludq %xmm2, %xmm0
-; X86-SSE-NEXT: paddq %xmm3, %xmm0
+; X86-SSE-NEXT: pmuludq {{\.LCPI.*}}, %xmm0
 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
 ; X86-SSE-NEXT: retl
@@ -1986,13 +1972,10 @@ define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) {
 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
 ; X64-SSE-NEXT: movl $65536, %ecx # imm = 0x10000
-; X64-SSE-NEXT: movq %rcx, %xmm2
-; X64-SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
-; X64-SSE-NEXT: pmuludq %xmm2, %xmm0
-; X64-SSE-NEXT: pmuludq %xmm1, %xmm2
-; X64-SSE-NEXT: psllq $32, %xmm2
-; X64-SSE-NEXT: paddq %xmm0, %xmm2
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; X64-SSE-NEXT: movq %rcx, %xmm1
+; X64-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; X64-SSE-NEXT: pmuludq %xmm0, %xmm1
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
 ; X64-SSE-NEXT: retq
 ;
@@ -2037,16 +2020,7 @@ define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) {
 ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
 ; X86-SSE-NEXT: psrad $16, %xmm0
 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,u,32768,u>
-; X86-SSE-NEXT: pxor %xmm2, %xmm2
-; X86-SSE-NEXT: pmuludq %xmm1, %xmm2
-; X86-SSE-NEXT: movdqa %xmm1, %xmm3
-; X86-SSE-NEXT: psrlq $32, %xmm3
-; X86-SSE-NEXT: pmuludq %xmm0, %xmm3
-; X86-SSE-NEXT: paddq %xmm2, %xmm3
-; X86-SSE-NEXT: psllq $32, %xmm3
-; X86-SSE-NEXT: pmuludq %xmm1, %xmm0
-; X86-SSE-NEXT: paddq %xmm3, %xmm0
+; X86-SSE-NEXT: pmuludq {{\.LCPI.*}}, %xmm0
 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
 ; X86-SSE-NEXT: retl
@@ -2072,12 +2046,8 @@ define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) {
 ; X64-SSE-NEXT: movl $32768, %ecx # imm = 0x8000
 ; X64-SSE-NEXT: movq %rcx, %xmm1
 ; X64-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
-; X64-SSE-NEXT: pmuludq %xmm1, %xmm0
-; X64-SSE-NEXT: pxor %xmm2, %xmm2
-; X64-SSE-NEXT: pmuludq %xmm1, %xmm2
-; X64-SSE-NEXT: psllq $32, %xmm2
-; X64-SSE-NEXT: paddq %xmm0, %xmm2
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; X64-SSE-NEXT: pmuludq %xmm0, %xmm1
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
 ; X64-SSE-NEXT: retq
 ;
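
The psllq/paddq sequences dropped in these multiplies (and in the trunc_mul tests below) all hinge on the same fact: once only the low 32 bits of each 64-bit product are demanded, the cross terms those instructions add land entirely in the discarded upper half, so a single pmuludq suffices. A quick standalone check of that identity:

    #include <cassert>
    #include <cstdint>

    int main() {
      // The low 32 bits of a 64-bit product depend only on the low 32 bits of
      // each operand, which is exactly what pmuludq computes.
      const uint64_t samples[] = {1ull, 0xFFFFFFFFull, 0x123456789ABCDEF0ull,
                                  0xFFFFFFFFFFFFFFFFull};
      for (uint64_t a : samples)
        for (uint64_t b : samples) {
          uint64_t full = a * b;                                    // wrapping i64 mul
          uint64_t low = (a & 0xFFFFFFFFull) * (b & 0xFFFFFFFFull); // pmuludq
          assert(static_cast<uint32_t>(full) == static_cast<uint32_t>(low));
        }
      return 0;
    }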

llvm/test/CodeGen/X86/vector-trunc-math-widen.ll

Lines changed: 31 additions & 227 deletions

@@ -1823,26 +1823,8 @@ define <16 x i8> @trunc_ext_sub_const_lhs_v16i16_v16i8(<16 x i8> %x) {
 define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; SSE-LABEL: trunc_mul_v4i64_v4i32:
 ; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm1, %xmm4
-; SSE-NEXT: psrlq $32, %xmm4
-; SSE-NEXT: pmuludq %xmm3, %xmm4
-; SSE-NEXT: movdqa %xmm3, %xmm5
-; SSE-NEXT: psrlq $32, %xmm5
-; SSE-NEXT: pmuludq %xmm1, %xmm5
-; SSE-NEXT: paddq %xmm4, %xmm5
-; SSE-NEXT: psllq $32, %xmm5
 ; SSE-NEXT: pmuludq %xmm3, %xmm1
-; SSE-NEXT: paddq %xmm5, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: psrlq $32, %xmm3
-; SSE-NEXT: pmuludq %xmm2, %xmm3
-; SSE-NEXT: movdqa %xmm2, %xmm4
-; SSE-NEXT: psrlq $32, %xmm4
-; SSE-NEXT: pmuludq %xmm0, %xmm4
-; SSE-NEXT: paddq %xmm3, %xmm4
-; SSE-NEXT: psllq $32, %xmm4
 ; SSE-NEXT: pmuludq %xmm2, %xmm0
-; SSE-NEXT: paddq %xmm4, %xmm0
 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
 ; SSE-NEXT: retq
 ;
@@ -2089,94 +2071,14 @@ define <8 x i16> @trunc_mul_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
 define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
 ; SSE-LABEL: trunc_mul_v16i64_v16i8:
 ; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
-; SSE-NEXT: movdqa %xmm0, %xmm9
-; SSE-NEXT: psrlq $32, %xmm9
-; SSE-NEXT: pmuludq %xmm8, %xmm9
-; SSE-NEXT: movdqa %xmm8, %xmm10
-; SSE-NEXT: psrlq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm0, %xmm10
-; SSE-NEXT: paddq %xmm9, %xmm10
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
-; SSE-NEXT: psllq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm8, %xmm0
-; SSE-NEXT: paddq %xmm10, %xmm0
-; SSE-NEXT: movdqa %xmm1, %xmm8
-; SSE-NEXT: psrlq $32, %xmm8
-; SSE-NEXT: pmuludq %xmm9, %xmm8
-; SSE-NEXT: movdqa %xmm9, %xmm10
-; SSE-NEXT: psrlq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm1, %xmm10
-; SSE-NEXT: paddq %xmm8, %xmm10
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
-; SSE-NEXT: psllq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm9, %xmm1
-; SSE-NEXT: paddq %xmm10, %xmm1
-; SSE-NEXT: movdqa %xmm2, %xmm9
-; SSE-NEXT: psrlq $32, %xmm9
-; SSE-NEXT: pmuludq %xmm8, %xmm9
-; SSE-NEXT: movdqa %xmm8, %xmm10
-; SSE-NEXT: psrlq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm2, %xmm10
-; SSE-NEXT: paddq %xmm9, %xmm10
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
-; SSE-NEXT: psllq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm8, %xmm2
-; SSE-NEXT: paddq %xmm10, %xmm2
-; SSE-NEXT: movdqa %xmm3, %xmm8
-; SSE-NEXT: psrlq $32, %xmm8
-; SSE-NEXT: pmuludq %xmm9, %xmm8
-; SSE-NEXT: movdqa %xmm9, %xmm10
-; SSE-NEXT: psrlq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm3, %xmm10
-; SSE-NEXT: paddq %xmm8, %xmm10
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
-; SSE-NEXT: psllq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm9, %xmm3
-; SSE-NEXT: paddq %xmm10, %xmm3
-; SSE-NEXT: movdqa %xmm4, %xmm9
-; SSE-NEXT: psrlq $32, %xmm9
-; SSE-NEXT: pmuludq %xmm8, %xmm9
-; SSE-NEXT: movdqa %xmm8, %xmm10
-; SSE-NEXT: psrlq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm4, %xmm10
-; SSE-NEXT: paddq %xmm9, %xmm10
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
-; SSE-NEXT: psllq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm8, %xmm4
-; SSE-NEXT: paddq %xmm10, %xmm4
-; SSE-NEXT: movdqa %xmm5, %xmm8
-; SSE-NEXT: psrlq $32, %xmm8
-; SSE-NEXT: pmuludq %xmm9, %xmm8
-; SSE-NEXT: movdqa %xmm9, %xmm10
-; SSE-NEXT: psrlq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm5, %xmm10
-; SSE-NEXT: paddq %xmm8, %xmm10
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
-; SSE-NEXT: psllq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm9, %xmm5
-; SSE-NEXT: paddq %xmm10, %xmm5
-; SSE-NEXT: movdqa %xmm6, %xmm9
-; SSE-NEXT: psrlq $32, %xmm9
-; SSE-NEXT: pmuludq %xmm8, %xmm9
-; SSE-NEXT: movdqa %xmm8, %xmm10
-; SSE-NEXT: psrlq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm6, %xmm10
-; SSE-NEXT: paddq %xmm9, %xmm10
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
-; SSE-NEXT: psllq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm8, %xmm6
-; SSE-NEXT: paddq %xmm10, %xmm6
-; SSE-NEXT: movdqa %xmm7, %xmm8
-; SSE-NEXT: psrlq $32, %xmm8
-; SSE-NEXT: pmuludq %xmm9, %xmm8
-; SSE-NEXT: movdqa %xmm9, %xmm10
-; SSE-NEXT: psrlq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm7, %xmm10
-; SSE-NEXT: paddq %xmm8, %xmm10
-; SSE-NEXT: pmuludq %xmm9, %xmm7
-; SSE-NEXT: psllq $32, %xmm10
-; SSE-NEXT: paddq %xmm10, %xmm7
+; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm2
+; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm3
+; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm4
+; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm5
+; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm6
+; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm7
 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
 ; SSE-NEXT: pand %xmm8, %xmm7
 ; SSE-NEXT: pand %xmm8, %xmm6
@@ -2601,22 +2503,11 @@ define <8 x i16> @trunc_mul_v8i32_v8i16_zext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
 define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
 ; SSE-LABEL: trunc_mul_const_v4i64_v4i32:
 ; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2,3]
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: pmuludq %xmm2, %xmm3
-; SSE-NEXT: psrlq $32, %xmm1
-; SSE-NEXT: pmuludq %xmm2, %xmm1
-; SSE-NEXT: psllq $32, %xmm1
-; SSE-NEXT: paddq %xmm3, %xmm1
 ; SSE-NEXT: movl $1, %eax
 ; SSE-NEXT: movq %rax, %xmm2
 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: pmuludq %xmm2, %xmm3
-; SSE-NEXT: psrlq $32, %xmm0
 ; SSE-NEXT: pmuludq %xmm2, %xmm0
-; SSE-NEXT: psllq $32, %xmm0
-; SSE-NEXT: paddq %xmm3, %xmm0
+; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm1
 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
 ; SSE-NEXT: retq
 ;
@@ -2773,61 +2664,14 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; SSE-NEXT: movl $1, %eax
 ; SSE-NEXT: movq %rax, %xmm8
 ; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
-; SSE-NEXT: movdqa %xmm0, %xmm9
-; SSE-NEXT: pmuludq %xmm8, %xmm9
-; SSE-NEXT: psrlq $32, %xmm0
 ; SSE-NEXT: pmuludq %xmm8, %xmm0
-; SSE-NEXT: psllq $32, %xmm0
-; SSE-NEXT: paddq %xmm9, %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [2,3]
-; SSE-NEXT: movdqa %xmm1, %xmm9
-; SSE-NEXT: pmuludq %xmm8, %xmm9
-; SSE-NEXT: psrlq $32, %xmm1
-; SSE-NEXT: pmuludq %xmm8, %xmm1
-; SSE-NEXT: psllq $32, %xmm1
-; SSE-NEXT: paddq %xmm9, %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [4,5]
-; SSE-NEXT: movdqa %xmm2, %xmm9
-; SSE-NEXT: pmuludq %xmm8, %xmm9
-; SSE-NEXT: psrlq $32, %xmm2
-; SSE-NEXT: pmuludq %xmm8, %xmm2
-; SSE-NEXT: psllq $32, %xmm2
-; SSE-NEXT: paddq %xmm9, %xmm2
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [6,7]
-; SSE-NEXT: movdqa %xmm3, %xmm9
-; SSE-NEXT: pmuludq %xmm8, %xmm9
-; SSE-NEXT: psrlq $32, %xmm3
-; SSE-NEXT: pmuludq %xmm8, %xmm3
-; SSE-NEXT: psllq $32, %xmm3
-; SSE-NEXT: paddq %xmm9, %xmm3
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [8,9]
-; SSE-NEXT: movdqa %xmm4, %xmm9
-; SSE-NEXT: pmuludq %xmm8, %xmm9
-; SSE-NEXT: psrlq $32, %xmm4
-; SSE-NEXT: pmuludq %xmm8, %xmm4
-; SSE-NEXT: psllq $32, %xmm4
-; SSE-NEXT: paddq %xmm9, %xmm4
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [10,11]
-; SSE-NEXT: movdqa %xmm5, %xmm9
-; SSE-NEXT: pmuludq %xmm8, %xmm9
-; SSE-NEXT: psrlq $32, %xmm5
-; SSE-NEXT: pmuludq %xmm8, %xmm5
-; SSE-NEXT: psllq $32, %xmm5
-; SSE-NEXT: paddq %xmm9, %xmm5
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [12,13]
-; SSE-NEXT: movdqa %xmm6, %xmm9
-; SSE-NEXT: pmuludq %xmm8, %xmm9
-; SSE-NEXT: psrlq $32, %xmm6
-; SSE-NEXT: pmuludq %xmm8, %xmm6
-; SSE-NEXT: psllq $32, %xmm6
-; SSE-NEXT: paddq %xmm9, %xmm6
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [14,15]
-; SSE-NEXT: movdqa %xmm7, %xmm9
-; SSE-NEXT: pmuludq %xmm8, %xmm9
-; SSE-NEXT: psrlq $32, %xmm7
-; SSE-NEXT: pmuludq %xmm8, %xmm7
-; SSE-NEXT: psllq $32, %xmm7
-; SSE-NEXT: paddq %xmm9, %xmm7
+; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm1
+; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm2
+; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm3
+; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm4
+; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm5
+; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm6
+; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm7
 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
 ; SSE-NEXT: pand %xmm8, %xmm7
 ; SSE-NEXT: pand %xmm8, %xmm6
@@ -5538,17 +5382,10 @@ define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwi
 ; SSE: # %bb.0:
 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
+; SSE-NEXT: pmuludq %xmm2, %xmm0
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
-; SSE-NEXT: pmuludq %xmm1, %xmm3
-; SSE-NEXT: pxor %xmm0, %xmm0
-; SSE-NEXT: pmuludq %xmm0, %xmm1
-; SSE-NEXT: psllq $32, %xmm1
-; SSE-NEXT: paddq %xmm3, %xmm1
-; SSE-NEXT: pmuludq %xmm4, %xmm2
-; SSE-NEXT: pmuludq %xmm4, %xmm0
-; SSE-NEXT: psllq $32, %xmm0
-; SSE-NEXT: paddq %xmm2, %xmm0
+; SSE-NEXT: pmuludq %xmm3, %xmm1
 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
 ; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
 ; SSE-NEXT: retq
@@ -5569,40 +5406,14 @@ define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwi
 define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
 ; SSE-LABEL: mul_add_self_v4i64_v4i32:
 ; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE-NEXT: pxor %xmm8, %xmm8
-; SSE-NEXT: pxor %xmm3, %xmm3
-; SSE-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE-NEXT: pxor %xmm7, %xmm7
-; SSE-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; SSE-NEXT: pxor %xmm6, %xmm6
-; SSE-NEXT: pcmpgtd %xmm4, %xmm6
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
-; SSE-NEXT: pxor %xmm5, %xmm5
-; SSE-NEXT: pcmpgtd %xmm1, %xmm5
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
-; SSE-NEXT: pmuludq %xmm1, %xmm7
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1]
-; SSE-NEXT: pmuludq %xmm0, %xmm5
-; SSE-NEXT: paddq %xmm7, %xmm5
-; SSE-NEXT: psllq $32, %xmm5
-; SSE-NEXT: pmuludq %xmm0, %xmm1
-; SSE-NEXT: paddq %xmm5, %xmm1
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
-; SSE-NEXT: pmuludq %xmm4, %xmm3
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1]
-; SSE-NEXT: pmuludq %xmm2, %xmm6
-; SSE-NEXT: paddq %xmm3, %xmm6
-; SSE-NEXT: psllq $32, %xmm6
-; SSE-NEXT: pmuludq %xmm2, %xmm4
-; SSE-NEXT: paddq %xmm6, %xmm4
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
-; SSE-NEXT: paddd %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
+; SSE-NEXT: pmuludq %xmm2, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
+; SSE-NEXT: pmuludq %xmm3, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: paddd %xmm0, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: mul_add_self_v4i64_v4i32:
@@ -5624,18 +5435,11 @@ define <4 x i32> @mul_add_multiuse_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nou
 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3]
+; SSE-NEXT: pmuludq %xmm2, %xmm4
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
-; SSE-NEXT: pmuludq %xmm1, %xmm3
-; SSE-NEXT: pxor %xmm5, %xmm5
-; SSE-NEXT: pmuludq %xmm5, %xmm1
-; SSE-NEXT: psllq $32, %xmm1
-; SSE-NEXT: paddq %xmm3, %xmm1
-; SSE-NEXT: pmuludq %xmm4, %xmm2
-; SSE-NEXT: pmuludq %xmm4, %xmm5
-; SSE-NEXT: psllq $32, %xmm5
-; SSE-NEXT: paddq %xmm2, %xmm5
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm1[0,2]
-; SSE-NEXT: paddd %xmm5, %xmm0
+; SSE-NEXT: pmuludq %xmm3, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[0,2]
+; SSE-NEXT: paddd %xmm4, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: mul_add_multiuse_v4i64_v4i32:

llvm/test/CodeGen/X86/vector-trunc-math.ll

Lines changed: 31 additions & 227 deletions

@@ -1823,26 +1823,8 @@ define <16 x i8> @trunc_ext_sub_const_lhs_v16i16_v16i8(<16 x i8> %x) {
 define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; SSE-LABEL: trunc_mul_v4i64_v4i32:
 ; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm1, %xmm4
-; SSE-NEXT: psrlq $32, %xmm4
-; SSE-NEXT: pmuludq %xmm3, %xmm4
-; SSE-NEXT: movdqa %xmm3, %xmm5
-; SSE-NEXT: psrlq $32, %xmm5
-; SSE-NEXT: pmuludq %xmm1, %xmm5
-; SSE-NEXT: paddq %xmm4, %xmm5
-; SSE-NEXT: psllq $32, %xmm5
 ; SSE-NEXT: pmuludq %xmm3, %xmm1
-; SSE-NEXT: paddq %xmm5, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: psrlq $32, %xmm3
-; SSE-NEXT: pmuludq %xmm2, %xmm3
-; SSE-NEXT: movdqa %xmm2, %xmm4
-; SSE-NEXT: psrlq $32, %xmm4
-; SSE-NEXT: pmuludq %xmm0, %xmm4
-; SSE-NEXT: paddq %xmm3, %xmm4
-; SSE-NEXT: psllq $32, %xmm4
 ; SSE-NEXT: pmuludq %xmm2, %xmm0
-; SSE-NEXT: paddq %xmm4, %xmm0
 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
 ; SSE-NEXT: retq
 ;
@@ -2089,94 +2071,14 @@ define <8 x i16> @trunc_mul_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
 define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
 ; SSE-LABEL: trunc_mul_v16i64_v16i8:
 ; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
-; SSE-NEXT: movdqa %xmm0, %xmm9
-; SSE-NEXT: psrlq $32, %xmm9
-; SSE-NEXT: pmuludq %xmm8, %xmm9
-; SSE-NEXT: movdqa %xmm8, %xmm10
-; SSE-NEXT: psrlq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm0, %xmm10
-; SSE-NEXT: paddq %xmm9, %xmm10
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
-; SSE-NEXT: psllq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm8, %xmm0
-; SSE-NEXT: paddq %xmm10, %xmm0
-; SSE-NEXT: movdqa %xmm1, %xmm8
-; SSE-NEXT: psrlq $32, %xmm8
-; SSE-NEXT: pmuludq %xmm9, %xmm8
-; SSE-NEXT: movdqa %xmm9, %xmm10
-; SSE-NEXT: psrlq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm1, %xmm10
-; SSE-NEXT: paddq %xmm8, %xmm10
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
-; SSE-NEXT: psllq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm9, %xmm1
-; SSE-NEXT: paddq %xmm10, %xmm1
-; SSE-NEXT: movdqa %xmm2, %xmm9
-; SSE-NEXT: psrlq $32, %xmm9
-; SSE-NEXT: pmuludq %xmm8, %xmm9
-; SSE-NEXT: movdqa %xmm8, %xmm10
-; SSE-NEXT: psrlq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm2, %xmm10
-; SSE-NEXT: paddq %xmm9, %xmm10
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
-; SSE-NEXT: psllq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm8, %xmm2
-; SSE-NEXT: paddq %xmm10, %xmm2
-; SSE-NEXT: movdqa %xmm3, %xmm8
-; SSE-NEXT: psrlq $32, %xmm8
-; SSE-NEXT: pmuludq %xmm9, %xmm8
-; SSE-NEXT: movdqa %xmm9, %xmm10
-; SSE-NEXT: psrlq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm3, %xmm10
-; SSE-NEXT: paddq %xmm8, %xmm10
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
-; SSE-NEXT: psllq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm9, %xmm3
-; SSE-NEXT: paddq %xmm10, %xmm3
-; SSE-NEXT: movdqa %xmm4, %xmm9
-; SSE-NEXT: psrlq $32, %xmm9
-; SSE-NEXT: pmuludq %xmm8, %xmm9
-; SSE-NEXT: movdqa %xmm8, %xmm10
-; SSE-NEXT: psrlq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm4, %xmm10
-; SSE-NEXT: paddq %xmm9, %xmm10
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
-; SSE-NEXT: psllq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm8, %xmm4
-; SSE-NEXT: paddq %xmm10, %xmm4
-; SSE-NEXT: movdqa %xmm5, %xmm8
-; SSE-NEXT: psrlq $32, %xmm8
-; SSE-NEXT: pmuludq %xmm9, %xmm8
-; SSE-NEXT: movdqa %xmm9, %xmm10
-; SSE-NEXT: psrlq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm5, %xmm10
-; SSE-NEXT: paddq %xmm8, %xmm10
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
-; SSE-NEXT: psllq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm9, %xmm5
-; SSE-NEXT: paddq %xmm10, %xmm5
-; SSE-NEXT: movdqa %xmm6, %xmm9
-; SSE-NEXT: psrlq $32, %xmm9
-; SSE-NEXT: pmuludq %xmm8, %xmm9
-; SSE-NEXT: movdqa %xmm8, %xmm10
-; SSE-NEXT: psrlq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm6, %xmm10
-; SSE-NEXT: paddq %xmm9, %xmm10
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
-; SSE-NEXT: psllq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm8, %xmm6
-; SSE-NEXT: paddq %xmm10, %xmm6
-; SSE-NEXT: movdqa %xmm7, %xmm8
-; SSE-NEXT: psrlq $32, %xmm8
-; SSE-NEXT: pmuludq %xmm9, %xmm8
-; SSE-NEXT: movdqa %xmm9, %xmm10
-; SSE-NEXT: psrlq $32, %xmm10
-; SSE-NEXT: pmuludq %xmm7, %xmm10
-; SSE-NEXT: paddq %xmm8, %xmm10
-; SSE-NEXT: pmuludq %xmm9, %xmm7
-; SSE-NEXT: psllq $32, %xmm10
-; SSE-NEXT: paddq %xmm10, %xmm7
+; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm2
+; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm3
+; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm4
+; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm5
+; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm6
+; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm7
 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
 ; SSE-NEXT: pand %xmm8, %xmm7
 ; SSE-NEXT: pand %xmm8, %xmm6
@@ -2601,22 +2503,11 @@ define <8 x i16> @trunc_mul_v8i32_v8i16_zext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
 define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
 ; SSE-LABEL: trunc_mul_const_v4i64_v4i32:
 ; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2,3]
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: pmuludq %xmm2, %xmm3
-; SSE-NEXT: psrlq $32, %xmm1
-; SSE-NEXT: pmuludq %xmm2, %xmm1
-; SSE-NEXT: psllq $32, %xmm1
-; SSE-NEXT: paddq %xmm3, %xmm1
 ; SSE-NEXT: movl $1, %eax
 ; SSE-NEXT: movq %rax, %xmm2
 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: pmuludq %xmm2, %xmm3
-; SSE-NEXT: psrlq $32, %xmm0
 ; SSE-NEXT: pmuludq %xmm2, %xmm0
-; SSE-NEXT: psllq $32, %xmm0
-; SSE-NEXT: paddq %xmm3, %xmm0
+; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm1
 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
 ; SSE-NEXT: retq
 ;
@@ -2773,61 +2664,14 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; SSE-NEXT: movl $1, %eax
 ; SSE-NEXT: movq %rax, %xmm8
 ; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
-; SSE-NEXT: movdqa %xmm0, %xmm9
-; SSE-NEXT: pmuludq %xmm8, %xmm9
-; SSE-NEXT: psrlq $32, %xmm0
 ; SSE-NEXT: pmuludq %xmm8, %xmm0
-; SSE-NEXT: psllq $32, %xmm0
-; SSE-NEXT: paddq %xmm9, %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [2,3]
-; SSE-NEXT: movdqa %xmm1, %xmm9
-; SSE-NEXT: pmuludq %xmm8, %xmm9
-; SSE-NEXT: psrlq $32, %xmm1
-; SSE-NEXT: pmuludq %xmm8, %xmm1
-; SSE-NEXT: psllq $32, %xmm1
-; SSE-NEXT: paddq %xmm9, %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [4,5]
-; SSE-NEXT: movdqa %xmm2, %xmm9
-; SSE-NEXT: pmuludq %xmm8, %xmm9
-; SSE-NEXT: psrlq $32, %xmm2
-; SSE-NEXT: pmuludq %xmm8, %xmm2
-; SSE-NEXT: psllq $32, %xmm2
-; SSE-NEXT: paddq %xmm9, %xmm2
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [6,7]
-; SSE-NEXT: movdqa %xmm3, %xmm9
-; SSE-NEXT: pmuludq %xmm8, %xmm9
-; SSE-NEXT: psrlq $32, %xmm3
-; SSE-NEXT: pmuludq %xmm8, %xmm3
-; SSE-NEXT: psllq $32, %xmm3
-; SSE-NEXT: paddq %xmm9, %xmm3
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [8,9]
-; SSE-NEXT: movdqa %xmm4, %xmm9
-; SSE-NEXT: pmuludq %xmm8, %xmm9
-; SSE-NEXT: psrlq $32, %xmm4
-; SSE-NEXT: pmuludq %xmm8, %xmm4
-; SSE-NEXT: psllq $32, %xmm4
-; SSE-NEXT: paddq %xmm9, %xmm4
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [10,11]
-; SSE-NEXT: movdqa %xmm5, %xmm9
-; SSE-NEXT: pmuludq %xmm8, %xmm9
-; SSE-NEXT: psrlq $32, %xmm5
-; SSE-NEXT: pmuludq %xmm8, %xmm5
-; SSE-NEXT: psllq $32, %xmm5
-; SSE-NEXT: paddq %xmm9, %xmm5
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [12,13]
-; SSE-NEXT: movdqa %xmm6, %xmm9
-; SSE-NEXT: pmuludq %xmm8, %xmm9
-; SSE-NEXT: psrlq $32, %xmm6
-; SSE-NEXT: pmuludq %xmm8, %xmm6
-; SSE-NEXT: psllq $32, %xmm6
-; SSE-NEXT: paddq %xmm9, %xmm6
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [14,15]
-; SSE-NEXT: movdqa %xmm7, %xmm9
-; SSE-NEXT: pmuludq %xmm8, %xmm9
-; SSE-NEXT: psrlq $32, %xmm7
-; SSE-NEXT: pmuludq %xmm8, %xmm7
-; SSE-NEXT: psllq $32, %xmm7
-; SSE-NEXT: paddq %xmm9, %xmm7
+; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm1
+; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm2
+; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm3
+; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm4
+; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm5
+; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm6
+; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm7
 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
 ; SSE-NEXT: pand %xmm8, %xmm7
 ; SSE-NEXT: pand %xmm8, %xmm6
@@ -5538,17 +5382,10 @@ define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwi
 ; SSE: # %bb.0:
 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
+; SSE-NEXT: pmuludq %xmm2, %xmm0
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
-; SSE-NEXT: pmuludq %xmm1, %xmm3
-; SSE-NEXT: pxor %xmm0, %xmm0
-; SSE-NEXT: pmuludq %xmm0, %xmm1
-; SSE-NEXT: psllq $32, %xmm1
-; SSE-NEXT: paddq %xmm3, %xmm1
-; SSE-NEXT: pmuludq %xmm4, %xmm2
-; SSE-NEXT: pmuludq %xmm4, %xmm0
-; SSE-NEXT: psllq $32, %xmm0
-; SSE-NEXT: paddq %xmm2, %xmm0
+; SSE-NEXT: pmuludq %xmm3, %xmm1
 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
 ; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
 ; SSE-NEXT: retq
@@ -5569,40 +5406,14 @@ define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwi
 define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
 ; SSE-LABEL: mul_add_self_v4i64_v4i32:
 ; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE-NEXT: pxor %xmm8, %xmm8
-; SSE-NEXT: pxor %xmm3, %xmm3
-; SSE-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE-NEXT: pxor %xmm7, %xmm7
-; SSE-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; SSE-NEXT: pxor %xmm6, %xmm6
-; SSE-NEXT: pcmpgtd %xmm4, %xmm6
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
-; SSE-NEXT: pxor %xmm5, %xmm5
-; SSE-NEXT: pcmpgtd %xmm1, %xmm5
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
-; SSE-NEXT: pmuludq %xmm1, %xmm7
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1]
-; SSE-NEXT: pmuludq %xmm0, %xmm5
-; SSE-NEXT: paddq %xmm7, %xmm5
-; SSE-NEXT: psllq $32, %xmm5
-; SSE-NEXT: pmuludq %xmm0, %xmm1
-; SSE-NEXT: paddq %xmm5, %xmm1
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
-; SSE-NEXT: pmuludq %xmm4, %xmm3
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1]
-; SSE-NEXT: pmuludq %xmm2, %xmm6
-; SSE-NEXT: paddq %xmm3, %xmm6
-; SSE-NEXT: psllq $32, %xmm6
-; SSE-NEXT: pmuludq %xmm2, %xmm4
-; SSE-NEXT: paddq %xmm6, %xmm4
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
-; SSE-NEXT: paddd %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
+; SSE-NEXT: pmuludq %xmm2, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
+; SSE-NEXT: pmuludq %xmm3, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: paddd %xmm0, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: mul_add_self_v4i64_v4i32:
@@ -5624,18 +5435,11 @@ define <4 x i32> @mul_add_multiuse_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nou
 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3]
+; SSE-NEXT: pmuludq %xmm2, %xmm4
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
-; SSE-NEXT: pmuludq %xmm1, %xmm3
-; SSE-NEXT: pxor %xmm5, %xmm5
-; SSE-NEXT: pmuludq %xmm5, %xmm1
-; SSE-NEXT: psllq $32, %xmm1
-; SSE-NEXT: paddq %xmm3, %xmm1
-; SSE-NEXT: pmuludq %xmm4, %xmm2
-; SSE-NEXT: pmuludq %xmm4, %xmm5
-; SSE-NEXT: psllq $32, %xmm5
-; SSE-NEXT: paddq %xmm2, %xmm5
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm1[0,2]
-; SSE-NEXT: paddd %xmm5, %xmm0
+; SSE-NEXT: pmuludq %xmm3, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[0,2]
+; SSE-NEXT: paddd %xmm4, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: mul_add_multiuse_v4i64_v4i32:
