Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -7580,7 +7580,7 @@
   auto *BV1 = dyn_cast<BuildVectorSDNode>(N1);
   auto *BV00 = dyn_cast<BuildVectorSDNode>(N00);
   auto *BV01 = dyn_cast<BuildVectorSDNode>(N01);
-
+
   // Check 1: Make sure that the first operand of the inner multiply is NOT
   // a constant. Otherwise, we may induce infinite looping.
   if (!(isConstOrConstSplatFP(N00) || (BV00 && BV00->isConstant()))) {
@@ -12002,6 +12002,48 @@
     return V;
   }
 
+  // Attempt to combine a shuffle of 2 inputs of 'scalar sources' -
+  // BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR or
+  // SCALAR_TO_VECTOR operation.
+  if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT)) {
+    int NumDefElts = 0;
+    SmallVector<SDValue, 8> Ops;
+    for (int M : SVN->getMask()) {
+      SDValue Op = DAG.getUNDEF(VT.getScalarType());
+      if (M >= 0) {
+        int Idx = M % NumElts;
+        SDValue &S = (M < (int)NumElts ? N0 : N1);
+        if (S.getOpcode() == ISD::BUILD_VECTOR && S.hasOneUse()) {
+          Op = S.getOperand(Idx);
+        } else if (S.getOpcode() == ISD::SCALAR_TO_VECTOR && S.hasOneUse()) {
+          if (Idx == 0)
+            Op = S.getOperand(0);
+        } else {
+          break;
+        }
+      }
+      if (Op.getOpcode() != ISD::UNDEF)
+        NumDefElts++;
+      Ops.push_back(Op);
+    }
+    if (Ops.size() == VT.getVectorNumElements()) {
+      // Create SCALAR_TO_VECTOR if the only defined input is input[0].
+      if (1 == NumDefElts && Ops[0].getOpcode() != ISD::UNDEF)
+        return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Ops[0]);
+
+      // BUILD_VECTOR requires all inputs to be of the same type, find the
+      // maximum type and extend them all.
+      EVT SVT = VT.getScalarType();
+      if (SVT.isInteger())
+        for (SDValue &Op : Ops)
+          SVT = (SVT.bitsLT(Op.getValueType()) ? Op.getValueType() : SVT);
+      if (SVT != VT.getScalarType())
+        for (SDValue &Op : Ops)
+          Op = DAG.getSExtOrTrunc(Op, SDLoc(N), SVT);
+      return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), VT, Ops);
+    }
+  }
+
   // If this shuffle only has a single input that is a bitcasted shuffle,
   // attempt to merge the 2 shuffles and suitably bitcast the inputs/output
   // back to their original types.
@@ -12271,7 +12313,7 @@
   SDValue RHS = N->getOperand(1);
   SDLoc dl(N);
 
-  // Make sure we're not running after operation legalization where it 
+  // Make sure we're not running after operation legalization where it
   // may have custom lowered the vector shuffles.
   if (LegalOperations)
     return SDValue();
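For readers following the DAGCombiner change above: the new combine fires when every defined lane of the shuffle comes from a BUILD_VECTOR or SCALAR_TO_VECTOR operand, so the whole pattern collapses into a single vector build (or a SCALAR_TO_VECTOR when only lane 0 is defined). The IR below is a minimal illustrative sketch, not part of the patch (the function and value names are made up); it mirrors the updated AArch64/X86 tests, which now select an insert/build sequence instead of a shuffle:

define <2 x i32> @concat_two_scalars(i32 %a, i32 %b) {
entry:
  ; Each input defines only lane 0, i.e. a SCALAR_TO_VECTOR node in the DAG.
  %v0 = insertelement <2 x i32> undef, i32 %a, i32 0
  %v1 = insertelement <2 x i32> undef, i32 %b, i32 0
  ; The shuffle reads lane 0 of each input; with this combine the DAG folds
  ; the whole sequence into one BUILD_VECTOR of %a and %b.
  %r = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> <i32 0, i32 2>
  ret <2 x i32> %r
}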
Index: test/CodeGen/AArch64/arm64-neon-copy.ll
===================================================================
--- test/CodeGen/AArch64/arm64-neon-copy.ll
+++ test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -1083,13 +1083,13 @@
 }
 
 define <2 x i32> @test_concat_diff_v1i32_v1i32(i32 %a, i32 %b) {
-; CHECK-LABEL: test_concat_diff_v1i32_v1i32:
-; CHECK: sqabs s{{[0-9]+}}, s{{[0-9]+}}
-; CHECK: sqabs s{{[0-9]+}}, s{{[0-9]+}}
-; CHECK-NEXT: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-entry:
- %c = tail call i32 @llvm.aarch64.neon.sqabs.i32(i32 %a)
- %d = insertelement <2 x i32> undef, i32 %c, i32 0
+; CHECK-LABEL: test_concat_diff_v1i32_v1i32:
+; CHECK: sqabs s{{[0-9]+}}, s{{[0-9]+}}
+; CHECK: sqabs s{{[0-9]+}}, s{{[0-9]+}}
+; CHECK: ins {{v[0-9]+}}.s[1], w{{[0-9]+}}
+entry:
+ %c = tail call i32 @llvm.aarch64.neon.sqabs.i32(i32 %a)
+ %d = insertelement <2 x i32> undef, i32 %c, i32 0
  %e = tail call i32 @llvm.aarch64.neon.sqabs.i32(i32 %b)
  %f = insertelement <2 x i32> undef, i32 %e, i32 0
  %h = shufflevector <2 x i32> %d, <2 x i32> %f, <2 x i32>
Index: test/CodeGen/AArch64/arm64-vshuffle.ll
===================================================================
--- test/CodeGen/AArch64/arm64-vshuffle.ll
+++ test/CodeGen/AArch64/arm64-vshuffle.ll
@@ -1,25 +1,11 @@
-; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -mcpu=cyclone | FileCheck %s
-
-
-; The mask:
-; CHECK: lCPI0_0:
-; CHECK: .byte 2 ; 0x2
-; CHECK: .byte 255 ; 0xff
-; CHECK: .byte 6 ; 0x6
-; CHECK: .byte 255 ; 0xff
-; The second vector is legalized to undef and the elements of the first vector
-; are used instead.
-; CHECK: .byte 2 ; 0x2
-; CHECK: .byte 4 ; 0x4
-; CHECK: .byte 6 ; 0x6
-; CHECK: .byte 0 ; 0x0
-; CHECK: test1
-; CHECK: ldr d[[REG0:[0-9]+]], [{{.*}}, lCPI0_0
-; CHECK: movi.8h v[[REG1:[0-9]+]], #0x1, lsl #8
-; CHECK: tbl.8b v{{[0-9]+}}, { v[[REG1]] }, v[[REG0]]
-define <8 x i1> @test1() {
-entry:
- %Shuff = shufflevector <8 x i1>
+define <8 x i1> @test1() {
+entry:
+ %Shuff = shufflevector <8 x i1> , <8 x i1> ,
@@ -27,86 +13,61 @@
                                 i32 12, i32 14, i32 0>
  ret <8 x i1> %Shuff
 }
-
-; CHECK: lCPI1_0:
-; CHECK: .byte 0 ; 0x0
-; CHECK: .byte 255 ; 0xff
-; CHECK: .byte 2 ; 0x2
-; CHECK: .byte 255 ; 0xff
-; CHECK: .byte 10 ; 0xa
-; CHECK: .byte 12 ; 0xc
-; CHECK: .byte 14 ; 0xe
-; CHECK: .byte 7 ; 0x7
-; CHECK: test2
-; CHECK: ldr d[[REG0:[0-9]+]], [{{.*}}, lCPI1_0@PAGEOFF]
-; CHECK: adrp x[[REG2:[0-9]+]], lCPI1_1@PAGE
-; CHECK: ldr q[[REG1:[0-9]+]], [x[[REG2]], lCPI1_1@PAGEOFF]
-; CHECK: tbl.8b v{{[0-9]+}}, { v[[REG1]] }, v[[REG0]]
-define <8 x i1>@test2() {
-bb:
- %Shuff = shufflevector <8 x i1> zeroinitializer,
+
+; CHECK: lCPI1_0:
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 1 ; 0x1
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: test2
+; CHECK: adrp x[[REG2:[0-9]+]], lCPI1_0@PAGE
+; CHECK: ldr d[[REG1:[0-9]+]], [x[[REG2]], lCPI1_0@PAGEOFF]
+define <8 x i1>@test2() {
+bb:
+ %Shuff = shufflevector <8 x i1> zeroinitializer,
                         <8 x i1> ,
                         <8 x i32>
- ret <8 x i1> %Shuff
-}
-
-; CHECK: lCPI2_0:
-; CHECK: .byte 2 ; 0x2
-; CHECK: .byte 255 ; 0xff
-; CHECK: .byte 6 ; 0x6
-; CHECK: .byte 255 ; 0xff
-; CHECK: .byte 10 ; 0xa
-; CHECK: .byte 12 ; 0xc
-; CHECK: .byte 14 ; 0xe
-; CHECK: .byte 0 ; 0x0
-; CHECK: .byte 2 ; 0x2
-; CHECK: .byte 255 ; 0xff
-; CHECK: .byte 6 ; 0x6
-; CHECK: .byte 255 ; 0xff
-; CHECK: .byte 10 ; 0xa
-; CHECK: .byte 12 ; 0xc
-; CHECK: .byte 14 ; 0xe
-; CHECK: .byte 0 ; 0x0
-; CHECK: test3
-; CHECK: adrp x[[REG3:[0-9]+]], lCPI2_0@PAGE
-; CHECK: ldr q[[REG0:[0-9]+]], [x[[REG3]], lCPI2_0@PAGEOFF]
-; CHECK: ldr q[[REG1:[0-9]+]], [x[[REG3]], lCPI2_1@PAGEOFF]
-; CHECK: tbl.16b v{{[0-9]+}}, { v[[REG1]] }, v[[REG0]]
-define <16 x i1> @test3(i1* %ptr, i32 %v) {
-bb:
- %Shuff = shufflevector <16 x i1> , <16 x i1> undef,
+ ret <8 x i1> %Shuff
+}
+
+; CHECK: test3
+; CHECK: movi.4s v{{[0-9]+}}, #0x1
+define <16 x i1> @test3(i1* %ptr, i32 %v) {
+bb:
+ %Shuff = shufflevector <16 x i1> , <16 x i1> undef,
                         <16 x i32>
- ret <16 x i1> %Shuff
-}
-; CHECK: lCPI3_1:
-; CHECK: .byte 0 ; 0x0
-; CHECK: .byte 1 ; 0x1
-; CHECK: .byte 2 ; 0x2
-; CHECK: .byte 18 ; 0x12
-; CHECK: .byte 4 ; 0x4
-; CHECK: .byte 5 ; 0x5
-; CHECK: .byte 6 ; 0x6
-; CHECK: .byte 7 ; 0x7
-; CHECK: .byte 8 ; 0x8
-; CHECK: .byte 31 ; 0x1f
-; CHECK: .byte 10 ; 0xa
-; CHECK: .byte 30 ; 0x1e
-; CHECK: .byte 12 ; 0xc
-; CHECK: .byte 13 ; 0xd
-; CHECK: .byte 14 ; 0xe
-; CHECK: .byte 15 ; 0xf
-; CHECK: _test4:
-; CHECK: ldr q[[REG1:[0-9]+]]
-; CHECK: movi.2d v[[REG0:[0-9]+]], #0000000000000000
-; CHECK: adrp x[[REG3:[0-9]+]], lCPI3_1@PAGE
-; CHECK: ldr q[[REG2:[0-9]+]], [x[[REG3]], lCPI3_1@PAGEOFF]
-; CHECK: tbl.16b v{{[0-9]+}}, { v[[REG0]], v[[REG1]] }, v[[REG2]]
-define <16 x i1> @test4(i1* %ptr, i32 %v) {
-bb:
- %Shuff = shufflevector <16 x i1> zeroinitializer,
+                        i32 14, i32 0>
+ ret <16 x i1> %Shuff
+}
+; CHECK: lCPI3_0:
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 1 ; 0x1
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: _test4:
+; CHECK: adrp x[[REG3:[0-9]+]], lCPI3_0@PAGE
+; CHECK: ldr q[[REG2:[0-9]+]], [x[[REG3]], lCPI3_0@PAGEOFF]
+define <16 x i1> @test4(i1* %ptr, i32 %v) {
+bb:
+ %Shuff = shufflevector <16 x i1> zeroinitializer,
+                        <16 x i1> ,
+                        <16 x i32>
-define <16 x i8> @foo() nounwind ssp {
- %1 = shufflevector <16 x i8> , <16 x i8> , <16 x i32>
- ret <16 x i8> %1
-}
-
-; CHECK: .LCPI0_0:
-; CHECK: .byte 31
-; CHECK: .byte 26
-; CHECK: .byte 21
-; CHECK: .byte 16
-; CHECK: .byte 11
-; CHECK: .byte 6
-; CHECK: .byte 1
-; CHECK: .byte 28
-; CHECK: .byte 23
-; CHECK: .byte 18
-; CHECK: .byte 13
-; CHECK: .byte 8
-; CHECK: .byte 3
-; CHECK: .byte 30
-; CHECK: .byte 25
-; CHECK: .byte 20
-; CHECK: .LCPI0_1:
-; CHECK: .byte 0
-; CHECK: .byte 1
-; CHECK: .byte 2
-; CHECK: .byte 3
-; CHECK: .byte 4
-; CHECK: .byte 5
-; CHECK: .byte 6
-; CHECK: .byte 7
-; CHECK: .byte 8
-; CHECK: .byte 9
-; CHECK: .byte 10
-; CHECK: .byte 11
-; CHECK: .byte 12
-; CHECK: .byte 13
-; CHECK: .byte 14
-; CHECK: .byte 15
-; CHECK: .LCPI0_2:
-; CHECK: .byte 16
-; CHECK: .byte 17
-; CHECK: .byte 18
-; CHECK: .byte 19
-; CHECK: .byte 20
-; CHECK: .byte 21
-; CHECK: .byte 22
-; CHECK: .byte 23
-; CHECK: .byte 24
-; CHECK: .byte 25
-; CHECK: .byte 26
-; CHECK: .byte 27
-; CHECK: .byte 28
-; CHECK: .byte 29
-; CHECK: .byte 30
-; CHECK: .byte 31
-; CHECK: foo:
-; CHECK: addis [[REG1:[0-9]+]], 2, .LCPI0_2@toc@ha
-; CHECK: addi [[REG2:[0-9]+]], [[REG1]], .LCPI0_2@toc@l
-; CHECK: lvx [[REG3:[0-9]+]], 0, [[REG2]]
-; CHECK: vperm {{[0-9]+}}, [[REG3]], {{[0-9]+}}, {{[0-9]+}}
+define <16 x i8> @foo() nounwind ssp {
+ %1 = shufflevector <16 x i8> , <16 x i8> , <16 x i32>
+ ret <16 x i8> %1
+}
+
+; CHECK: .LCPI0_0:
+; CHECK: .byte 0
+; CHECK: .byte 5
+; CHECK: .byte 10
+; CHECK: .byte 15
+; CHECK: .byte 20
+; CHECK: .byte 25
+; CHECK: .byte 30
+; CHECK: .byte 3
+; CHECK: .byte 8
+; CHECK: .byte 13
+; CHECK: .byte 18
+; CHECK: .byte 23
+; CHECK: .byte 28
+; CHECK: .byte 1
+; CHECK: .byte 6
+; CHECK: .byte 11
+; CHECK: foo:
+; CHECK: addis [[REG1:[0-9]+]], 2, .LCPI0_0@toc@ha
+; CHECK: addi [[REG2:[0-9]+]], [[REG1]], .LCPI0_0@toc@l
+; CHECK: lvx [[REG3:[0-9]+]], 0, [[REG2]]
Index: test/CodeGen/X86/mmx-bitcast.ll
===================================================================
--- test/CodeGen/X86/mmx-bitcast.ll
+++ test/CodeGen/X86/mmx-bitcast.ll
@@ -69,20 +69,19 @@
   ret void
 }
 
-define i64 @t5(i32 %a, i32 %b) nounwind readnone {
-; CHECK-LABEL: t5:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movd
-; CHECK-NEXT: movd
-; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
-; CHECK-NEXT: movd %xmm0, %rax
-; CHECK-NEXT: retq
-  %v0 = insertelement <2 x i32> undef, i32 %a, i32 0
-  %v1 = insertelement <2 x i32> %v0, i32 %b, i32 1
-  %conv = bitcast <2 x i32> %v1 to i64
-  ret i64 %conv
-}
+define i64 @t5(i32 %a, i32 %b) nounwind readnone {
+; CHECK-LABEL: t5:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movd
+; CHECK-NEXT: movd
+; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT: movd %xmm1, %rax
+; CHECK-NEXT: retq
+  %v0 = insertelement <2 x i32> undef, i32 %a, i32 0
+  %v1 = insertelement <2 x i32> %v0, i32 %b, i32 1
+  %conv = bitcast <2 x i32> %v1 to i64
+  ret i64 %conv
+}
 
 declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32)
Index: test/CodeGen/X86/sse41.ll
===================================================================
--- test/CodeGen/X86/sse41.ll
+++ test/CodeGen/X86/sse41.ll
@@ -1019,45 +1019,18 @@
 ; X64-LABEL: pr20087:
 ; X64: ## BB#0:
 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[2]
-; X64-NEXT: retq
-  %load = load <4 x float> , <4 x float> *%ptr
-  %ret = shufflevector <4 x float> %load, <4 x float> %a, <4 x i32>
-  ret <4 x float> %ret
-}
-
-; Edge case for insertps where we end up with a shuffle with mask=<0, 7, -1, -1>
-define void @insertps_pr20411(i32* noalias nocapture %RET) #1 {
-; X32-LABEL: insertps_pr20411:
-; X32: ## BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pshufd {{.*#+}} xmm0 = mem[2,3,0,1]
-; X32-NEXT: pshufd {{.*#+}} xmm1 = mem[3,1,2,3]
-; X32-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
-; X32-NEXT: movdqu %xmm1, (%eax)
-; X32-NEXT: retl
-;
-; X64-LABEL: insertps_pr20411:
-; X64: ## BB#0:
-; X64-NEXT: pshufd {{.*#+}} xmm0 = mem[2,3,0,1]
-; X64-NEXT: pshufd {{.*#+}} xmm1 = mem[3,1,2,3]
-; X64-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
-; X64-NEXT: movdqu %xmm1, (%rdi)
-; X64-NEXT: retq
-  %gather_load = shufflevector <8 x i32> , <8 x i32> undef, <8 x i32>
-  %shuffle109 = shufflevector <4 x i32> , <4 x i32> undef, <4 x i32>  ; 4 5 6 7
-  %shuffle116 = shufflevector <8 x i32> %gather_load, <8 x i32> undef, <4 x i32>  ; 3 x x x
-  %shuffle117 = shufflevector <4 x i32> %shuffle109, <4 x i32> %shuffle116, <4 x i32>  ; 3 7 x x
-  %ptrcast = bitcast i32* %RET to <4 x i32>*
-  store <4 x i32> %shuffle117, <4 x i32>* %ptrcast, align 4
-  ret void
-}
-
-define <4 x float> @insertps_4(<4 x float> %A, <4 x float> %B) {
-; X32-LABEL: insertps_4:
-; X32: ## BB#0: ## %entry
-; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
-; X32-NEXT: retl
-;
+; X64-NEXT: retq
+  %load = load <4 x float> , <4 x float> *%ptr
+  %ret = shufflevector <4 x float> %load, <4 x float> %a, <4 x i32>
+  ret <4 x float> %ret
+}
+
+define <4 x float> @insertps_4(<4 x float> %A, <4 x float> %B) {
+; X32-LABEL: insertps_4:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
+; X32-NEXT: retl
+;
 ; X64-LABEL: insertps_4:
 ; X64: ## BB#0: ## %entry
 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
Index: test/CodeGen/X86/vec_insert-5.ll
===================================================================
--- test/CodeGen/X86/vec_insert-5.ll
+++ test/CodeGen/X86/vec_insert-5.ll
@@ -2,19 +2,19 @@
 
 ; There are no MMX operations in @t1
 define void @t1(i32 %a, x86_mmx* %P) nounwind {
-; CHECK-LABEL: t1:
-; CHECK: # BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: shll $12, %ecx
-; CHECK-NEXT: movd %ecx, %xmm0
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,0,1]
-; CHECK-NEXT: movlpd %xmm0, (%eax)
-; CHECK-NEXT: retl
-  %tmp12 = shl i32 %a, 12
-  %tmp21 = insertelement <2 x i32> undef, i32 %tmp12, i32 1
-  %tmp22 = insertelement <2 x i32> %tmp21, i32 0, i32 0
-  %tmp23 = bitcast <2 x i32> %tmp22 to x86_mmx
+; CHECK-LABEL: t1:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: shll $12, %ecx
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; CHECK-NEXT: movlpd %xmm0, (%eax)
+; CHECK-NEXT: retl
+  %tmp12 = shl i32 %a, 12
+  %tmp21 = insertelement <2 x i32> undef, i32 %tmp12, i32 1
+  %tmp22 = insertelement <2 x i32> %tmp21, i32 0, i32 0
+  %tmp23 = bitcast <2 x i32> %tmp22 to x86_mmx
   store x86_mmx %tmp23, x86_mmx* %P
   ret void
 }
Index: test/CodeGen/X86/vec_insert-mmx.ll
===================================================================
--- test/CodeGen/X86/vec_insert-mmx.ll
+++ test/CodeGen/X86/vec_insert-mmx.ll
@@ -1,18 +1,18 @@
 ; RUN: llc < %s -mtriple=i686-darwin -mattr=+mmx,+sse2 | FileCheck %s -check-prefix=X86-32
 ; RUN: llc < %s -mtriple=x86_64-darwin -mattr=+mmx,+sse4.1 | FileCheck %s -check-prefix=X86-64
-
-; This is not an MMX operation; promoted to XMM.
-define x86_mmx @t0(i32 %A) nounwind {
-; X86-32-LABEL: t0:
-; X86-32: ## BB#0:
-; X86-32: movd {{[0-9]+}}(%esp), %xmm0
-; X86-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,0,1]
-; X86-32-NEXT: movlpd %xmm0, (%esp)
-; X86-32-NEXT: movq (%esp), %mm0
-; X86-32-NEXT: addl $12, %esp
-; X86-32-NEXT: retl
-  %tmp3 = insertelement <2 x i32> < i32 0, i32 undef >, i32 %A, i32 1
-  %tmp4 = bitcast <2 x i32> %tmp3 to x86_mmx
+
+; This is not an MMX operation; promoted to XMM.
+define x86_mmx @t0(i32 %A) nounwind {
+; X86-32-LABEL: t0:
+; X86-32: ## BB#0:
+; X86-32: movd {{[0-9]+}}(%esp), %xmm0
+; X86-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; X86-32-NEXT: movlpd %xmm0, (%esp)
+; X86-32-NEXT: movq (%esp), %mm0
+; X86-32-NEXT: addl $12, %esp
+; X86-32-NEXT: retl
+  %tmp3 = insertelement <2 x i32> < i32 0, i32 undef >, i32 %A, i32 1
+  %tmp4 = bitcast <2 x i32> %tmp3 to x86_mmx
   ret x86_mmx %tmp4
 }
Index: test/CodeGen/X86/vec_zero_cse.ll
===================================================================
--- test/CodeGen/X86/vec_zero_cse.ll
+++ test/CodeGen/X86/vec_zero_cse.ll
@@ -14,13 +14,13 @@
   store <2 x i32> zeroinitializer, <2 x i32>* @M2
   ret void
 }
-
-define void @test2() {
-;CHECK-LABEL: @test2
-;CHECK: pshufd
-  store <1 x i64> < i64 -1 >, <1 x i64>* @M1
-  store <2 x i32> < i32 -1, i32 -1 >, <2 x i32>* @M2
-  ret void
+
+define void @test2() {
+;CHECK-LABEL: @test2
+;CHECK: pcmpeqd
+  store <1 x i64> < i64 -1 >, <1 x i64>* @M1
+  store <2 x i32> < i32 -1, i32 -1 >, <2 x i32>* @M2
+  ret void
 }
 
 define void @test3() {
Index: test/CodeGen/X86/vector-shuffle-128-v16.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -628,120 +628,85 @@
 ; AVX-LABEL: PR20540:
 ; AVX: # BB#0:
 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: retq
-  %shuffle = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32>
-  ret <16 x i8> %shuffle
-}
-
-define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
-; SSE2-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSE2: # BB#0:
-; SSE2-NEXT: movzbl %dil, %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSSE3: # BB#0:
-; SSSE3-NEXT: movd %edi, %xmm0
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSE41: # BB#0:
-; SSE41-NEXT: movd %edi, %xmm0
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; AVX: # BB#0:
-; AVX-NEXT: vmovd %edi, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: retq
-  %a = insertelement <16 x i8> undef, i8 %i, i32 0
-  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32>
-  ret <16 x i8> %shuffle
-}
-
-define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
-; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSE2: # BB#0:
-; SSE2-NEXT: movzbl %dil, %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSSE3: # BB#0:
-; SSSE3-NEXT: movd %edi, %xmm0
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSE41: # BB#0:
-; SSE41-NEXT: movd %edi, %xmm0
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; AVX: # BB#0:
-; AVX-NEXT: vmovd %edi, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: retq
-  %a = insertelement <16 x i8> undef, i8 %i, i32 0
-  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32>
-  ret <16 x i8> %shuffle
-}
-
-define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16(i8 %i) {
-; SSE-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
-; SSE: # BB#0:
-; SSE-NEXT: movd %edi, %xmm0
-; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; SSE-NEXT: retq
-;
-; AVX-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
-; AVX: # BB#0:
-; AVX-NEXT: vmovd %edi, %xmm0
-; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; AVX-NEXT: retq
-  %a = insertelement <16 x i8> undef, i8 %i, i32 0
-  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32>
-  ret <16 x i8> %shuffle
-}
-
-define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
-; SSE2-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSE2: # BB#0:
-; SSE2-NEXT: movzbl %dil, %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSSE3: # BB#0:
-; SSSE3-NEXT: movd %edi, %xmm0
-; SSSE3-NEXT: pslld $24, %xmm0
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSE41: # BB#0:
-; SSE41-NEXT: movd %edi, %xmm0
-; SSE41-NEXT: pslld $24, %xmm0
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; AVX: # BB#0:
-; AVX-NEXT: vmovd %edi, %xmm0
-; AVX-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: retq
-  %a = insertelement <16 x i8> undef, i8 %i, i32 3
-  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32>
-  ret <16 x i8> %shuffle
-}
-
+; AVX-NEXT: retq
+  %shuffle = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
+; SSE-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE: # BB#0:
+; SSE-NEXT: movzbl %dil, %eax
+; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX: # BB#0:
+; AVX-NEXT: movzbl %dil, %eax
+; AVX-NEXT: movd %eax, %xmm0
+; AVX-NEXT: retq
+  %a = insertelement <16 x i8> undef, i8 %i, i32 0
+  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
+; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE: # BB#0:
+; SSE-NEXT: shll $8, %edi
+; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: pinsrw $2, %edi, %xmm0
+; SSE-NEXT: retq
+
+; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX: # BB#0:
+; AVX-NEXT: shll $8, %edi
+; AVX-NEXT: vpxor %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $2, %edi, %xmm0
+; AVX-NEXT: retq
+  %a = insertelement <16 x i8> undef, i8 %i, i32 0
+  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16(i8 %i) {
+; SSE-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
+; SSE: # BB#0:
+; SSE-NEXT: shll $8, %edi
+; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: pinsrw $7, %edi, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
+; AVX: # BB#0:
+; AVX-NEXT: shll $8, %edi
+; AVX-NEXT: vpxor %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $7, %edi, %xmm0
+; AVX-NEXT: retq
+  %a = insertelement <16 x i8> undef, i8 %i, i32 0
+  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
+; SSE-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE: # BB#0:
+; SSE-NEXT: movzbl %dil, %eax
+; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: pinsrw $1, %eax, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX: # BB#0:
+; AVX-NEXT: movzbl %dil, %eax
+; AVX-NEXT: vpxor %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $1, %eax, %xmm0
+; AVX-NEXT: retq
+  %a = insertelement <16 x i8> undef, i8 %i, i32 3
+  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32>
+  ret <16 x i8> %shuffle
+}
+
 define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu(<16 x i8> %a) {
 ; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu:
 ; SSE: # BB#0:
Index: test/CodeGen/X86/vector-shuffle-128-v8.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -1378,83 +1378,77 @@
 ; AVX-NEXT: retq
   %a = insertelement <8 x i16> undef, i16 %i, i32 0
   %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32>
-  ret <8 x i16> %shuffle
-}
-
-define <8 x i16> @shuffle_v8i16_z8zzzzzz(i16 %i) {
-; SSE-LABEL: shuffle_v8i16_z8zzzzzz:
-; SSE: # BB#0:
-; SSE-NEXT: movzwl %di, %eax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; SSE-NEXT: retq
-;
-; AVX-LABEL: shuffle_v8i16_z8zzzzzz:
-; AVX: # BB#0:
-; AVX-NEXT: movzwl %di, %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; AVX-NEXT: retq
-  %a = insertelement <8 x i16> undef, i16 %i, i32 0
-  %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32>
-  ret <8 x i16> %shuffle
-}
-
-define <8 x i16> @shuffle_v8i16_zzzzz8zz(i16 %i) {
-; SSE-LABEL: shuffle_v8i16_zzzzz8zz:
-; SSE: # BB#0:
-; SSE-NEXT: movzwl %di, %eax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
-; SSE-NEXT: retq
-;
-; AVX-LABEL: shuffle_v8i16_zzzzz8zz:
-; AVX: # BB#0:
-; AVX-NEXT: movzwl %di, %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
-; AVX-NEXT: retq
-  %a = insertelement <8 x i16> undef, i16 %i, i32 0
-  %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32>
-  ret <8 x i16> %shuffle
-}
-
-define <8 x i16> @shuffle_v8i16_zuuzuuz8(i16 %i) {
-; SSE-LABEL: shuffle_v8i16_zuuzuuz8:
-; SSE: # BB#0:
-; SSE-NEXT: movd %edi, %xmm0
-; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
-; SSE-NEXT: retq
-;
-; AVX-LABEL: shuffle_v8i16_zuuzuuz8:
-; AVX: # BB#0:
-; AVX-NEXT: vmovd %edi, %xmm0
-; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
-; AVX-NEXT: retq
-  %a = insertelement <8 x i16> undef, i16 %i, i32 0
-  %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32>
-  ret <8 x i16> %shuffle
-}
-
-define <8 x i16> @shuffle_v8i16_zzBzzzzz(i16 %i) {
-; SSE-LABEL: shuffle_v8i16_zzBzzzzz:
-; SSE: # BB#0:
-; SSE-NEXT: movzwl %di, %eax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
-; SSE-NEXT: retq
-;
-; AVX-LABEL: shuffle_v8i16_zzBzzzzz:
-; AVX: # BB#0:
-; AVX-NEXT: movzwl %di, %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX-NEXT: retq
-  %a = insertelement <8 x i16> undef, i16 %i, i32 3
-  %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32>
-  ret <8 x i16> %shuffle
-}
-
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_z8zzzzzz(i16 %i) {
+; SSE-LABEL: shuffle_v8i16_z8zzzzzz:
+; SSE: # BB#0:
+; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: pinsrw $1, %edi, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_z8zzzzzz:
+; AVX: # BB#0:
+; AVX-NEXT: vpxor %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $1, %edi, %xmm0
+; AVX-NEXT: retq
+  %a = insertelement <8 x i16> undef, i16 %i, i32 0
+  %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_zzzzz8zz(i16 %i) {
+; SSE-LABEL: shuffle_v8i16_zzzzz8zz:
+; SSE: # BB#0:
+; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: pinsrw $5, %edi, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_zzzzz8zz:
+; AVX: # BB#0:
+; AVX-NEXT: vpxor %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $5, %edi, %xmm0
+; AVX-NEXT: retq
+  %a = insertelement <8 x i16> undef, i16 %i, i32 0
+  %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_zuuzuuz8(i16 %i) {
+; SSE-LABEL: shuffle_v8i16_zuuzuuz8:
+; SSE: # BB#0:
+; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: pinsrw $7, %edi, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_zuuzuuz8:
+; AVX: # BB#0:
+; AVX-NEXT: vpxor %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $7, %edi, %xmm0
+; AVX-NEXT: retq
+  %a = insertelement <8 x i16> undef, i16 %i, i32 0
+  %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_zzBzzzzz(i16 %i) {
+; SSE-LABEL: shuffle_v8i16_zzBzzzzz:
+; SSE: # BB#0:
+; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: pinsrw $2, %edi, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_zzBzzzzz:
+; AVX: # BB#0:
+; AVX-NEXT: vpxor %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $2, %edi, %xmm0
+; AVX-NEXT: retq
+  %a = insertelement <8 x i16> undef, i16 %i, i32 3
+  %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32>
+  ret <8 x i16> %shuffle
+}
+
 define <8 x i16> @shuffle_v8i16_def01234(<8 x i16> %a, <8 x i16> %b) {
 ; SSE2-LABEL: shuffle_v8i16_def01234:
 ; SSE2: # BB#0: