Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -7580,7 +7580,7 @@
     auto *BV1 = dyn_cast<BuildVectorSDNode>(N1);
     auto *BV00 = dyn_cast<BuildVectorSDNode>(N00);
     auto *BV01 = dyn_cast<BuildVectorSDNode>(N01);
-    
+
     // Check 1: Make sure that the first operand of the inner multiply is NOT
     // a constant. Otherwise, we may induce infinite looping.
     if (!(isConstOrConstSplatFP(N00) || (BV00 && BV00->isConstant()))) {
@@ -12004,6 +12004,51 @@
       return V;
   }
 
+  // Attempt to combine a shuffle of 2 inputs of 'scalar sources' -
+  // BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR or
+  // SCALAR_TO_VECTOR operation.
+  if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT)) {
+    int NumDefElts = 0;
+    SmallVector<SDValue, 8> Ops;
+    for (int M : SVN->getMask()) {
+      SDValue Op = DAG.getUNDEF(VT.getScalarType());
+      if (M >= 0) {
+        int Idx = M % NumElts;
+        SDValue &S = (M < (int)NumElts ? N0 : N1);
+        if (S.getOpcode() == ISD::BUILD_VECTOR && S.hasOneUse()) {
+          Op = S.getOperand(Idx);
+        } else if (S.getOpcode() == ISD::SCALAR_TO_VECTOR && S.hasOneUse()) {
+          if (Idx == 0)
+            Op = S.getOperand(0);
+        } else {
+          // Operand can't be combined into shuffle - bail out.
+          break;
+        }
+      }
+      if (Op.getOpcode() != ISD::UNDEF)
+        NumDefElts++;
+      Ops.push_back(Op);
+    }
+    if (Ops.size() == VT.getVectorNumElements()) {
+      // Create SCALAR_TO_VECTOR if the only defined input is input[0].
+      if (1 == NumDefElts && Ops[0].getOpcode() != ISD::UNDEF)
+        return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Ops[0]);
+
+      // BUILD_VECTOR requires all inputs to be of the same type, find the
+      // maximum type and extend them all.
+      EVT SVT = VT.getScalarType();
+      if (SVT.isInteger())
+        for (SDValue &Op : Ops)
+          SVT = (SVT.bitsLT(Op.getValueType()) ? Op.getValueType() : SVT);
+      if (SVT != VT.getScalarType())
+        for (SDValue &Op : Ops)
+          Op = TLI.isZExtFree(Op.getValueType(), SVT)
+                   ? DAG.getZExtOrTrunc(Op, SDLoc(N), SVT)
+                   : DAG.getSExtOrTrunc(Op, SDLoc(N), SVT);
+      return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), VT, Ops);
+    }
+  }
+
   // If this shuffle only has a single input that is a bitcasted shuffle,
   // attempt to merge the 2 shuffles and suitably bitcast the inputs/output
   // back to their original types.
@@ -12273,7 +12318,7 @@
   SDValue RHS = N->getOperand(1);
   SDLoc dl(N);
 
-  // Make sure we're not running after operation legalization where it 
+  // Make sure we're not running after operation legalization where it
   // may have custom lowered the vector shuffles.
   if (LegalOperations)
    return SDValue();
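[Editor's note] For readers of the patch, a minimal LLVM IR sketch of the pattern the new shuffle combine targets; the function and value names below are illustrative only and are not taken from the patch or its tests:

; Both shuffle operands are vectors built from a single scalar each, so
; before vector-op legalization the shuffle can be rebuilt as one
; BUILD_VECTOR of %a and %b (or a SCALAR_TO_VECTOR when only element 0 is
; defined) instead of surviving as a vector shuffle node.
define <4 x i32> @shuffle_of_scalars(i32 %a, i32 %b) {
  %v0 = insertelement <4 x i32> undef, i32 %a, i32 0
  %v1 = insertelement <4 x i32> undef, i32 %b, i32 0
  %s = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 0, i32 4, i32 undef, i32 undef>
  ret <4 x i32> %s
}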
Index: test/CodeGen/AArch64/arm64-neon-copy.ll
===================================================================
--- test/CodeGen/AArch64/arm64-neon-copy.ll
+++ test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -1086,7 +1086,7 @@
 ; CHECK-LABEL: test_concat_diff_v1i32_v1i32:
 ; CHECK: sqabs s{{[0-9]+}}, s{{[0-9]+}}
 ; CHECK: sqabs s{{[0-9]+}}, s{{[0-9]+}}
-; CHECK-NEXT: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: ins {{v[0-9]+}}.s[1], w{{[0-9]+}}
 entry:
   %c = tail call i32 @llvm.aarch64.neon.sqabs.i32(i32 %a)
   %d = insertelement <2 x i32> undef, i32 %c, i32 0
Index: test/CodeGen/AArch64/arm64-vshuffle.ll
===================================================================
--- test/CodeGen/AArch64/arm64-vshuffle.ll
+++ test/CodeGen/AArch64/arm64-vshuffle.ll
@@ -1,22 +1,8 @@
 ; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -mcpu=cyclone | FileCheck %s
-; The mask:
-; CHECK: lCPI0_0:
-; CHECK: .byte 2 ; 0x2
-; CHECK: .byte 255 ; 0xff
-; CHECK: .byte 6 ; 0x6
-; CHECK: .byte 255 ; 0xff
-; The second vector is legalized to undef and the elements of the first vector
-; are used instead.
-; CHECK: .byte 2 ; 0x2
-; CHECK: .byte 4 ; 0x4
-; CHECK: .byte 6 ; 0x6
-; CHECK: .byte 0 ; 0x0
 ; CHECK: test1
-; CHECK: ldr d[[REG0:[0-9]+]], [{{.*}}, lCPI0_0
-; CHECK: movi.8h v[[REG1:[0-9]+]], #0x1, lsl #8
-; CHECK: tbl.8b v{{[0-9]+}}, { v[[REG1]] }, v[[REG0]]
+; CHECK: movi d[[REG0:[0-9]+]], #0000000000000000
 define <8 x i1> @test1() {
 entry:
   %Shuff = shufflevector <8 x i1>
 define <8 x i1> @test2() {
 bb:
   %Shuff = shufflevector <8 x i1> zeroinitializer,
@@ -51,28 +35,8 @@
   ret <8 x i1> %Shuff
 }
-; CHECK: lCPI2_0:
-; CHECK: .byte 2 ; 0x2
-; CHECK: .byte 255 ; 0xff
-; CHECK: .byte 6 ; 0x6
-; CHECK: .byte 255 ; 0xff
-; CHECK: .byte 10 ; 0xa
-; CHECK: .byte 12 ; 0xc
-; CHECK: .byte 14 ; 0xe
-; CHECK: .byte 0 ; 0x0
-; CHECK: .byte 2 ; 0x2
-; CHECK: .byte 255 ; 0xff
-; CHECK: .byte 6 ; 0x6
-; CHECK: .byte 255 ; 0xff
-; CHECK: .byte 10 ; 0xa
-; CHECK: .byte 12 ; 0xc
-; CHECK: .byte 14 ; 0xe
-; CHECK: .byte 0 ; 0x0
 ; CHECK: test3
-; CHECK: adrp x[[REG3:[0-9]+]], lCPI2_0@PAGE
-; CHECK: ldr q[[REG0:[0-9]+]], [x[[REG3]], lCPI2_0@PAGEOFF]
-; CHECK: ldr q[[REG1:[0-9]+]], [x[[REG3]], lCPI2_1@PAGEOFF]
-; CHECK: tbl.16b v{{[0-9]+}}, { v[[REG1]] }, v[[REG0]]
+; CHECK: movi.4s v{{[0-9]+}}, #0x1
 define <16 x i1> @test3(i1* %ptr, i32 %v) {
 bb:
   %Shuff = shufflevector <16 x i1> , <16 x i1> undef,
@@ -81,29 +45,26 @@
                          i32 14, i32 0>
   ret <16 x i1> %Shuff
 }
-; CHECK: lCPI3_1:
+; CHECK: lCPI3_0:
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
 ; CHECK: .byte 0 ; 0x0
 ; CHECK: .byte 1 ; 0x1
-; CHECK: .byte 2 ; 0x2
-; CHECK: .byte 18 ; 0x12
-; CHECK: .byte 4 ; 0x4
-; CHECK: .byte 5 ; 0x5
-; CHECK: .byte 6 ; 0x6
-; CHECK: .byte 7 ; 0x7
-; CHECK: .byte 8 ; 0x8
-; CHECK: .byte 31 ; 0x1f
-; CHECK: .byte 10 ; 0xa
-; CHECK: .byte 30 ; 0x1e
-; CHECK: .byte 12 ; 0xc
-; CHECK: .byte 13 ; 0xd
-; CHECK: .byte 14 ; 0xe
-; CHECK: .byte 15 ; 0xf
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
 ; CHECK: _test4:
-; CHECK: ldr q[[REG1:[0-9]+]]
-; CHECK: movi.2d v[[REG0:[0-9]+]], #0000000000000000
-; CHECK: adrp x[[REG3:[0-9]+]], lCPI3_1@PAGE
-; CHECK: ldr q[[REG2:[0-9]+]], [x[[REG3]], lCPI3_1@PAGEOFF]
-; CHECK: tbl.16b v{{[0-9]+}}, { v[[REG0]], v[[REG1]] }, v[[REG2]]
+; CHECK: adrp x[[REG3:[0-9]+]], lCPI3_0@PAGE
+; CHECK: ldr q[[REG2:[0-9]+]], [x[[REG3]], lCPI3_0@PAGEOFF]
 define <16 x i1> @test4(i1* %ptr, i32 %v) {
 bb:
   %Shuff = shufflevector <16 x i1> zeroinitializer,
Index: test/CodeGen/PowerPC/vperm-lowering.ll
===================================================================
--- test/CodeGen/PowerPC/vperm-lowering.ll
+++ test/CodeGen/PowerPC/vperm-lowering.ll
@@ -6,61 +6,26 @@
 define <16 x i8> @foo() nounwind ssp {
   %1 = shufflevector <16 x i8> , <16 x i8> , <16 x i32>
   ret <16 x i8> %1
-}
-
-; CHECK: .LCPI0_0:
-; CHECK: .byte 31
-; CHECK: .byte 26
-; CHECK: .byte 21
-; CHECK: .byte 16
-; CHECK: .byte 11
-; CHECK: .byte 6
-; CHECK: .byte 1
-; CHECK: .byte 28
-; CHECK: .byte 23
-; CHECK: .byte 18
-; CHECK: .byte 13
-; CHECK: .byte 8
-; CHECK: .byte 3
-; CHECK: .byte 30
-; CHECK: .byte 25
-; CHECK: .byte 20
-; CHECK: .LCPI0_1:
-; CHECK: .byte 0
-; CHECK: .byte 1
-; CHECK: .byte 2
-; CHECK: .byte 3
-; CHECK: .byte 4
-; CHECK: .byte 5
-; CHECK: .byte 6
-; CHECK: .byte 7
-; CHECK: .byte 8
-; CHECK: .byte 9
-; CHECK: .byte 10
-; CHECK: .byte 11
-; CHECK: .byte 12
-; CHECK: .byte 13
-; CHECK: .byte 14
-; CHECK: .byte 15
-; CHECK: .LCPI0_2:
-; CHECK: .byte 16
-; CHECK: .byte 17
-; CHECK: .byte 18
-; CHECK: .byte 19
-; CHECK: .byte 20
-; CHECK: .byte 21
-; CHECK: .byte 22
-; CHECK: .byte 23
-; CHECK: .byte 24
-; CHECK: .byte 25
-; CHECK: .byte 26
-; CHECK: .byte 27
-; CHECK: .byte 28
-; CHECK: .byte 29
-; CHECK: .byte 30
-; CHECK: .byte 31
-; CHECK: foo:
-; CHECK: addis [[REG1:[0-9]+]], 2, .LCPI0_2@toc@ha
-; CHECK: addi [[REG2:[0-9]+]], [[REG1]], .LCPI0_2@toc@l
-; CHECK: lvx [[REG3:[0-9]+]], 0, [[REG2]]
-; CHECK: vperm {{[0-9]+}}, [[REG3]], {{[0-9]+}}, {{[0-9]+}}
+}
+
+; CHECK: .LCPI0_0:
+; CHECK: .byte 0
+; CHECK: .byte 5
+; CHECK: .byte 10
+; CHECK: .byte 15
+; CHECK: .byte 20
+; CHECK: .byte 25
+; CHECK: .byte 30
+; CHECK: .byte 3
+; CHECK: .byte 8
+; CHECK: .byte 13
+; CHECK: .byte 18
+; CHECK: .byte 23
+; CHECK: .byte 28
+; CHECK: .byte 1
+; CHECK: .byte 6
+; CHECK: .byte 11
+; CHECK: foo:
+; CHECK: addis [[REG1:[0-9]+]], 2, .LCPI0_0@toc@ha
+; CHECK: addi [[REG2:[0-9]+]], [[REG1]], .LCPI0_0@toc@l
+; CHECK: lvx [[REG3:[0-9]+]], 0, [[REG2]]
Index: test/CodeGen/X86/mmx-bitcast.ll
===================================================================
--- test/CodeGen/X86/mmx-bitcast.ll
+++ test/CodeGen/X86/mmx-bitcast.ll
@@ -72,14 +72,13 @@
 define i64 @t5(i32 %a, i32 %b) nounwind readnone {
 ; CHECK-LABEL: t5:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: movd
-; CHECK-NEXT: movd
-; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
-; CHECK-NEXT: movd %xmm0, %rax
-; CHECK-NEXT: retq
-  %v0 = insertelement <2 x i32> undef, i32 %a, i32 0
-  %v1 = insertelement <2 x i32> %v0, i32 %b, i32 1
+; CHECK-NEXT: movd
+; CHECK-NEXT: movd
+; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT: movd %xmm1, %rax
+; CHECK-NEXT: retq
+  %v0 = insertelement <2 x i32> undef, i32 %a, i32 0
+  %v1 = insertelement <2 x i32> %v0, i32 %b, i32 1
   %conv = bitcast <2 x i32> %v1 to i64
   ret i64 %conv
 }
Index: test/CodeGen/X86/sse41.ll
===================================================================
--- test/CodeGen/X86/sse41.ll
+++ test/CodeGen/X86/sse41.ll
@@ -1022,39 +1022,12 @@
 ; X64-NEXT: retq
   %load = load <4 x float>, <4 x float>* %ptr
   %ret = shufflevector <4 x float> %load, <4 x float> %a, <4 x i32>
-  ret <4 x float> %ret
-}
-
-; Edge case for insertps where we end up with a shuffle with mask=<0, 7, -1, -1>
-define void @insertps_pr20411(i32* noalias nocapture %RET) #1 {
-; X32-LABEL: insertps_pr20411:
-; X32: ## BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pshufd {{.*#+}} xmm0 = mem[2,3,0,1]
-; X32-NEXT: pshufd {{.*#+}} xmm1 = mem[3,1,2,3]
-; X32-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
-; X32-NEXT: movdqu %xmm1, (%eax)
-; X32-NEXT: retl
-;
-; X64-LABEL: insertps_pr20411:
-; X64: ## BB#0:
-; X64-NEXT: pshufd {{.*#+}} xmm0 = mem[2,3,0,1]
-; X64-NEXT: pshufd {{.*#+}} xmm1 = mem[3,1,2,3]
-; X64-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
-; X64-NEXT: movdqu %xmm1, (%rdi)
-; X64-NEXT: retq
-  %gather_load = shufflevector <8 x i32> , <8 x i32> undef, <8 x i32>
-  %shuffle109 = shufflevector <4 x i32> , <4 x i32> undef, <4 x i32> ; 4 5 6 7
-  %shuffle116 = shufflevector <8 x i32> %gather_load, <8 x i32> undef, <4 x i32> ; 3 x x x
-  %shuffle117 = shufflevector <4 x i32> %shuffle109, <4 x i32> %shuffle116, <4 x i32> ; 3 7 x x
-  %ptrcast = bitcast i32* %RET to <4 x i32>*
-  store <4 x i32> %shuffle117, <4 x i32>* %ptrcast, align 4
-  ret void
-}
-
-define <4 x float> @insertps_4(<4 x float> %A, <4 x float> %B) {
-; X32-LABEL: insertps_4:
-; X32: ## BB#0: ## %entry
+  ret <4 x float> %ret
+}
+
+define <4 x float> @insertps_4(<4 x float> %A, <4 x float> %B) {
+; X32-LABEL: insertps_4:
+; X32: ## BB#0: ## %entry
 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
 ; X32-NEXT: retl
 ;
Index: test/CodeGen/X86/vec_insert-5.ll
===================================================================
--- test/CodeGen/X86/vec_insert-5.ll
+++ test/CodeGen/X86/vec_insert-5.ll
@@ -5,13 +5,13 @@
 ; CHECK-LABEL: t1:
 ; CHECK: # BB#0:
 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: shll $12, %ecx
-; CHECK-NEXT: movd %ecx, %xmm0
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,0,1]
-; CHECK-NEXT: movlpd %xmm0, (%eax)
-; CHECK-NEXT: retl
-  %tmp12 = shl i32 %a, 12
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: shll $12, %ecx
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; CHECK-NEXT: movlpd %xmm0, (%eax)
+; CHECK-NEXT: retl
+  %tmp12 = shl i32 %a, 12
   %tmp21 = insertelement <2 x i32> undef, i32 %tmp12, i32 1
   %tmp22 = insertelement <2 x i32> %tmp21, i32 0, i32 0
   %tmp23 = bitcast <2 x i32> %tmp22 to x86_mmx
Index: test/CodeGen/X86/vec_insert-mmx.ll
===================================================================
--- test/CodeGen/X86/vec_insert-mmx.ll
+++ test/CodeGen/X86/vec_insert-mmx.ll
@@ -3,13 +3,13 @@
 ; This is not an MMX operation; promoted to XMM.
 define x86_mmx @t0(i32 %A) nounwind {
-; X86-32-LABEL: t0:
-; X86-32: ## BB#0:
-; X86-32: movd {{[0-9]+}}(%esp), %xmm0
-; X86-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,0,1]
-; X86-32-NEXT: movlpd %xmm0, (%esp)
-; X86-32-NEXT: movq (%esp), %mm0
-; X86-32-NEXT: addl $12, %esp
+; X86-32-LABEL: t0:
+; X86-32: ## BB#0:
+; X86-32: movd {{[0-9]+}}(%esp), %xmm0
+; X86-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; X86-32-NEXT: movlpd %xmm0, (%esp)
+; X86-32-NEXT: movq (%esp), %mm0
+; X86-32-NEXT: addl $12, %esp
 ; X86-32-NEXT: retl
   %tmp3 = insertelement <2 x i32> < i32 0, i32 undef >, i32 %A, i32 1
   %tmp4 = bitcast <2 x i32> %tmp3 to x86_mmx
Index: test/CodeGen/X86/vec_zero_cse.ll
===================================================================
--- test/CodeGen/X86/vec_zero_cse.ll
+++ test/CodeGen/X86/vec_zero_cse.ll
@@ -17,7 +17,7 @@
 define void @test2() {
 ;CHECK-LABEL: @test2
-;CHECK: pshufd
+;CHECK: pcmpeqd
   store <1 x i64> < i64 -1 >, <1 x i64>* @M1
   store <2 x i32> < i32 -1, i32 -1 >, <2 x i32>* @M2
   ret void
Index: test/CodeGen/X86/vector-shuffle-128-v16.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -631,114 +631,79 @@
 ; AVX-NEXT: retq
   %shuffle = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32>
   ret <16 x i8> %shuffle
-}
-
-define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
-; SSE2-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSE2: # BB#0:
-; SSE2-NEXT: movzbl %dil, %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSSE3: # BB#0:
-; SSSE3-NEXT: movd %edi, %xmm0
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSE41: # BB#0:
-; SSE41-NEXT: movd %edi, %xmm0
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; AVX: # BB#0:
-; AVX-NEXT: vmovd %edi, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: retq
-  %a = insertelement <16 x i8> undef, i8 %i, i32 0
-  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32>
-  ret <16 x i8> %shuffle
-}
-
-define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
-; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSE2: # BB#0:
-; SSE2-NEXT: movzbl %dil, %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSSE3: # BB#0:
-; SSSE3-NEXT: movd %edi, %xmm0
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSE41: # BB#0:
-; SSE41-NEXT: movd %edi, %xmm0
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; AVX: # BB#0:
-; AVX-NEXT: vmovd %edi, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: retq
-  %a = insertelement <16 x i8> undef, i8 %i, i32 0
-  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32>
-  ret <16 x i8> %shuffle
-}
-
-define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16(i8 %i) {
-; SSE-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
-; SSE: # BB#0:
-; SSE-NEXT: movd %edi, %xmm0
-; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; SSE-NEXT: retq
-;
-; AVX-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
-; AVX: # BB#0:
-; AVX-NEXT: vmovd %edi, %xmm0
-; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; AVX-NEXT: retq
-  %a = insertelement <16 x i8> undef, i8 %i, i32 0
-  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32>
-  ret <16 x i8> %shuffle
-}
-
-define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
-; SSE2-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSE2: # BB#0:
-; SSE2-NEXT: movzbl %dil, %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSSE3: # BB#0:
-; SSSE3-NEXT: movd %edi, %xmm0
-; SSSE3-NEXT: pslld $24, %xmm0
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSE41: # BB#0:
-; SSE41-NEXT: movd %edi, %xmm0
-; SSE41-NEXT: pslld $24, %xmm0
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; AVX: # BB#0:
-; AVX-NEXT: vmovd %edi, %xmm0
-; AVX-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: retq
-  %a = insertelement <16 x i8> undef, i8 %i, i32 3
-  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32>
+}
+
+define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
+; SSE-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE: # BB#0:
+; SSE-NEXT: movzbl %dil, %eax
+; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX: # BB#0:
+; AVX-NEXT: movzbl %dil, %eax
+; AVX-NEXT: movd %eax, %xmm0
+; AVX-NEXT: retq
+  %a = insertelement <16 x i8> undef, i8 %i, i32 0
+  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
+; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE: # BB#0:
+; SSE-NEXT: shll $8, %edi
+; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: pinsrw $2, %edi, %xmm0
+; SSE-NEXT: retq
+
+; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX: # BB#0:
+; AVX-NEXT: shll $8, %edi
+; AVX-NEXT: vpxor %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $2, %edi, %xmm0
+; AVX-NEXT: retq
+  %a = insertelement <16 x i8> undef, i8 %i, i32 0
+  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16(i8 %i) {
+; SSE-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
+; SSE: # BB#0:
+; SSE-NEXT: shll $8, %edi
+; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: pinsrw $7, %edi, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
+; AVX: # BB#0:
+; AVX-NEXT: shll $8, %edi
+; AVX-NEXT: vpxor %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $7, %edi, %xmm0
+; AVX-NEXT: retq
+  %a = insertelement <16 x i8> undef, i8 %i, i32 0
+  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
+; SSE-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE: # BB#0:
+; SSE-NEXT: movzbl %dil, %eax
+; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: pinsrw $1, %eax, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX: # BB#0:
+; AVX-NEXT: movzbl %dil, %eax
+; AVX-NEXT: vpxor %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $1, %eax, %xmm0
+; AVX-NEXT: retq
+  %a = insertelement <16 x i8> undef, i8 %i, i32 3
+  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32>
   ret <16 x i8> %shuffle
 }
Index: test/CodeGen/X86/vector-shuffle-128-v8.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -1381,77 +1381,71 @@
   ret <8 x i16> %shuffle
 }
-define <8 x i16> @shuffle_v8i16_z8zzzzzz(i16 %i) {
-; SSE-LABEL: shuffle_v8i16_z8zzzzzz:
-; SSE: # BB#0:
-; SSE-NEXT: movzwl %di, %eax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; SSE-NEXT: retq
-;
-; AVX-LABEL: shuffle_v8i16_z8zzzzzz:
-; AVX: # BB#0:
-; AVX-NEXT: movzwl %di, %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; AVX-NEXT: retq
-  %a = insertelement <8 x i16> undef, i16 %i, i32 0
-  %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32>
-  ret <8 x i16> %shuffle
-}
-
-define <8 x i16> @shuffle_v8i16_zzzzz8zz(i16 %i) {
-; SSE-LABEL: shuffle_v8i16_zzzzz8zz:
-; SSE: # BB#0:
-; SSE-NEXT: movzwl %di, %eax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
-; SSE-NEXT: retq
-;
-; AVX-LABEL: shuffle_v8i16_zzzzz8zz:
-; AVX: # BB#0:
-; AVX-NEXT: movzwl %di, %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
-; AVX-NEXT: retq
-  %a = insertelement <8 x i16> undef, i16 %i, i32 0
-  %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32>
-  ret <8 x i16> %shuffle
-}
-
-define <8 x i16> @shuffle_v8i16_zuuzuuz8(i16 %i) {
-; SSE-LABEL: shuffle_v8i16_zuuzuuz8:
-; SSE: # BB#0:
-; SSE-NEXT: movd %edi, %xmm0
-; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
-; SSE-NEXT: retq
-;
-; AVX-LABEL: shuffle_v8i16_zuuzuuz8:
-; AVX: # BB#0:
-; AVX-NEXT: vmovd %edi, %xmm0
-; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
-; AVX-NEXT: retq
-  %a = insertelement <8 x i16> undef, i16 %i, i32 0
-  %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32>
-  ret <8 x i16> %shuffle
-}
-
-define <8 x i16> @shuffle_v8i16_zzBzzzzz(i16 %i) {
-; SSE-LABEL: shuffle_v8i16_zzBzzzzz:
-; SSE: # BB#0:
-; SSE-NEXT: movzwl %di, %eax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
-; SSE-NEXT: retq
-;
-; AVX-LABEL: shuffle_v8i16_zzBzzzzz:
-; AVX: # BB#0:
-; AVX-NEXT: movzwl %di, %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX-NEXT: retq
-  %a = insertelement <8 x i16> undef, i16 %i, i32 3
-  %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32>
+define <8 x i16> @shuffle_v8i16_z8zzzzzz(i16 %i) {
+; SSE-LABEL: shuffle_v8i16_z8zzzzzz:
+; SSE: # BB#0:
+; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: pinsrw $1, %edi, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_z8zzzzzz:
+; AVX: # BB#0:
+; AVX-NEXT: vpxor %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $1, %edi, %xmm0
+; AVX-NEXT: retq
+  %a = insertelement <8 x i16> undef, i16 %i, i32 0
+  %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_zzzzz8zz(i16 %i) {
+; SSE-LABEL: shuffle_v8i16_zzzzz8zz:
+; SSE: # BB#0:
+; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: pinsrw $5, %edi, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_zzzzz8zz:
+; AVX: # BB#0:
+; AVX-NEXT: vpxor %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $5, %edi, %xmm0
+; AVX-NEXT: retq
+  %a = insertelement <8 x i16> undef, i16 %i, i32 0
+  %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_zuuzuuz8(i16 %i) {
+; SSE-LABEL: shuffle_v8i16_zuuzuuz8:
+; SSE: # BB#0:
+; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: pinsrw $7, %edi, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_zuuzuuz8:
+; AVX: # BB#0:
+; AVX-NEXT: vpxor %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $7, %edi, %xmm0
+; AVX-NEXT: retq
+  %a = insertelement <8 x i16> undef, i16 %i, i32 0
+  %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_zzBzzzzz(i16 %i) {
+; SSE-LABEL: shuffle_v8i16_zzBzzzzz:
+; SSE: # BB#0:
+; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: pinsrw $2, %edi, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_zzBzzzzz:
+; AVX: # BB#0:
+; AVX-NEXT: vpxor %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $2, %edi, %xmm0
+; AVX-NEXT: retq
+  %a = insertelement <8 x i16> undef, i16 %i, i32 3
+  %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32>
   ret <8 x i16> %shuffle
 }