Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -9281,15 +9281,6 @@
   if (isShuffleEquivalent(V1, V2, Mask, {5, 1, 7, 3}))
     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V2, V1);
 
-  // If we have a single input to the zero element, insert that into V1 if we
-  // can do so cheaply.
-  int NumV2Elements =
-      std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
-  if (NumV2Elements == 1 && Mask[0] >= 4)
-    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
-            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
-      return Insertion;
-
   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
                                                 Subtarget, DAG))
     return Blend;
@@ -9432,15 +9423,6 @@
   ArrayRef<int> Mask = SVOp->getMask();
   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
 
-  // If we have a single input to the zero element, insert that into V1 if we
-  // can do so cheaply.
-  int NumV2Elements =
-      std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 8; });
-  if (NumV2Elements == 1 && Mask[0] >= 8)
-    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
-            DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
-      return Insertion;
-
   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
                                                 Subtarget, DAG))
     return Blend;
@@ -9811,6 +9793,18 @@
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   ArrayRef<int> Mask = SVOp->getMask();
 
+  // If we have a single input to the zero element, insert that into V1 if we
+  // can do so cheaply.
+  int NumElts = VT.getVectorNumElements();
+  int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [NumElts](int M) {
+    return M >= NumElts;
+  });
+
+  if (NumV2Elements == 1 && Mask[0] >= NumElts)
+    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+            DL, VT, V1, V2, Mask, Subtarget, DAG))
+      return Insertion;
+
   // There is a really nice hard cut-over between AVX1 and AVX2 that means we can
   // check for those subtargets here and avoid much of the subtarget querying in
   // the per-vector-type lowering routines. With AVX1 we have essentially *zero*
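
Reviewer note (not part of the patch): the hunks above hoist the "single V2 input at the zero element" check out of the v4f64- and v8f32-specific routines and into the common 256-bit shuffle lowering entry point, generalized over the element count. A minimal standalone sketch of that check, with a hypothetical helper name and std::vector standing in for ArrayRef so it compiles on its own, looks like this:

#include <algorithm>
#include <vector>

// Mask entries >= NumElts select from the second shuffle input (V2). The cheap
// element-insertion lowering is only attempted when V2 contributes exactly one
// element and that element lands in lane 0 of the result.
static bool shouldTryElementInsertion(const std::vector<int> &Mask, int NumElts) {
  int NumV2Elements = std::count_if(Mask.begin(), Mask.end(),
                                    [NumElts](int M) { return M >= NumElts; });
  return NumV2Elements == 1 && Mask[0] >= NumElts;
}

int main() {
  // A v4f64-style shuffle taking lane 0 from V2 and lanes 1-3 from V1, i.e. the
  // "insert one element into a vector" pattern the tests below exercise.
  std::vector<int> Mask = {4, 1, 2, 3};
  return shouldTryElementInsertion(Mask, /*NumElts=*/4) ? 0 : 1;
}
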
Index: llvm/trunk/test/CodeGen/X86/2012-1-10-buildvector.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/2012-1-10-buildvector.ll
+++ llvm/trunk/test/CodeGen/X86/2012-1-10-buildvector.ll
@@ -3,25 +3,26 @@
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:128:128-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S32"
 target triple = "i686-pc-win32"
 
-;CHECK-LABEL: bad_cast:
+; CHECK-LABEL: bad_cast:
 define void @bad_cast() {
 entry:
   %vext.i = shufflevector <2 x i64> undef, <2 x i64> undef, <3 x i32>
   %vecinit8.i = shufflevector <3 x i64> zeroinitializer, <3 x i64> %vext.i, <3 x i32>
   store <3 x i64> %vecinit8.i, <3 x i64>* undef, align 32
-;CHECK: ret
+; CHECK: ret
   ret void
 }
 
-;CHECK-LABEL: bad_insert:
+; CHECK-LABEL: bad_insert:
 define void @bad_insert(i32 %t) {
 entry:
-;CHECK: vxorps %ymm1, %ymm1, %ymm1
-;CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; CHECK: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: vmovaps %ymm0
+; CHECK: ret
+
   %v2 = insertelement <8 x i32> zeroinitializer, i32 %t, i32 0
   store <8 x i32> %v2, <8 x i32> addrspace(1)* undef, align 32
-;CHECK: ret
   ret void
 }
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -813,15 +813,11 @@
 ; AVX1-LABEL: insert_reg_and_zero_v4i64:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vmovq %rdi, %xmm0
-; AVX1-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: insert_reg_and_zero_v4i64:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vmovq %rdi, %xmm0
-; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX2-NEXT:    retq
   %v = insertelement <4 x i64> undef, i64 %a, i64 0
   %shuffle = shufflevector <4 x i64> %v, <4 x i64> zeroinitializer, <4 x i32>
@@ -831,16 +827,12 @@
 define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) {
 ; AVX1-LABEL: insert_mem_and_zero_v4i64:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; AVX1-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: insert_mem_and_zero_v4i64:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX2-NEXT:    retq
   %a = load i64, i64* %ptr
   %v = insertelement <4 x i64> undef, i64 %a, i64 0
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -133,8 +133,8 @@
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    movl $7, %eax
 ; AVX2-NEXT:    vmovd %eax, %xmm1
-; AVX2-NEXT:    vpxor %ymm2, %ymm2, %ymm2
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
+; AVX2-NEXT:    vxorps %ymm2, %ymm2, %ymm2
+; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
 ; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32>
@@ -962,8 +962,8 @@
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    movl $7, %eax
 ; AVX2-NEXT:    vmovd %eax, %xmm1
-; AVX2-NEXT:    vpxor %ymm2, %ymm2, %ymm2
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
+; AVX2-NEXT:    vxorps %ymm2, %ymm2, %ymm2
+; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
 ; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32>
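
Reviewer note (not part of the patch): the updated CHECK lines expect the scalar move itself (vmovq/vmovsd/vmovss) to leave the remaining lanes zero, so the explicit vxorps/vpxor + vblendps/vpblendd pair drops out of these tests. A hypothetical C-level analogue of the bad_insert pattern above, assumed to be built with AVX enabled, is:

#include <immintrin.h>

// Scalar t goes into element 0 of an otherwise-zero <8 x i32> vector, the same
// shape as the insertelement-into-zeroinitializer IR in bad_insert.
__m256i insert_into_zero(int t) {
  return _mm256_insert_epi32(_mm256_setzero_si256(), t, 0);
}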