diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -2638,6 +2638,39 @@ } break; } + case ISD::BITCAST: { + SDValue Src = V.getOperand(0); + EVT SrcVT = Src.getValueType(); + unsigned SrcBitWidth = SrcVT.getScalarSizeInBits(); + unsigned BitWidth = VT.getScalarSizeInBits(); + + // Ignore bitcasts from unsupported types. + // TODO: Add fp support? + if (!SrcVT.isVector() || !SrcVT.isInteger() || !VT.isInteger()) + break; + + // Bitcast 'small element' vector to 'large element' vector. + if ((BitWidth % SrcBitWidth) == 0) { + // See if each sub element is a splat. + unsigned Scale = BitWidth / SrcBitWidth; + unsigned NumSrcElts = SrcVT.getVectorNumElements(); + APInt ScaledDemandedElts = + APIntOps::ScaleBitMask(DemandedElts, NumSrcElts); + for (unsigned I = 0; I != Scale; ++I) { + APInt SubUndefElts; + APInt SubDemandedElt = APInt::getOneBitSet(Scale, I); + APInt SubDemandedElts = APInt::getSplat(NumSrcElts, SubDemandedElt); + SubDemandedElts &= ScaledDemandedElts; + if (!isSplatValue(Src, SubDemandedElts, SubUndefElts, Depth + 1)) + return false; + // TODO: Add support for merging sub undef elements. 
+ if (SubDemandedElts.isSubsetOf(SubUndefElts)) + return false; + } + return true; + } + break; + } } return false; diff --git a/llvm/test/CodeGen/X86/pr15296.ll b/llvm/test/CodeGen/X86/pr15296.ll --- a/llvm/test/CodeGen/X86/pr15296.ll +++ b/llvm/test/CodeGen/X86/pr15296.ll @@ -62,11 +62,11 @@ define <4 x i64> @shiftInput___64in32bitmode(<4 x i64> %input, i64 %shiftval) nounwind { ; X86-LABEL: shiftInput___64in32bitmode: ; X86: # %bb.0: # %allocas -; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X86-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; X86-NEXT: vpsrlq %xmm2, %xmm1, %xmm1 -; X86-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 -; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X86-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; X86-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X86-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 +; X86-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; X86-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: shiftInput___64in32bitmode: @@ -87,26 +87,20 @@ define <4 x i64> @shiftInput___2x32bitcast(<4 x i64> %input, i32 %shiftval) nounwind { ; X86-LABEL: shiftInput___2x32bitcast: ; X86: # %bb.0: # %allocas -; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X86-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X86-NEXT: vpsrlq %xmm2, %xmm1, %xmm1 -; X86-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 -; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X86-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 +; X86-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; X86-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: shiftInput___2x32bitcast: ; X64: # %bb.0: # %allocas -; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X64-NEXT: vmovd %edi, %xmm2 -; X64-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],zero,xmm2[0],zero -; X64-NEXT: vpsrlq %xmm2, %xmm1, %xmm3 -; X64-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,1] -; X64-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 -; X64-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] 
-; X64-NEXT: vpsrlq %xmm2, %xmm0, %xmm2 -; X64-NEXT: vpsrlq %xmm4, %xmm0, %xmm0 -; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X64-NEXT: vmovd %edi, %xmm1 +; X64-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X64-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 +; X64-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; X64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; X64-NEXT: retq allocas: %smear.0 = insertelement <8 x i32> zeroinitializer, i32 %shiftval, i32 0 diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll --- a/llvm/test/CodeGen/X86/vector-fshl-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll @@ -1036,23 +1036,14 @@ ; ; X86-SSE2-LABEL: splatvar_funnnel_v2i64: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,1] -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [63,0,63,0] -; X86-SSE2-NEXT: movdqa %xmm3, %xmm5 -; X86-SSE2-NEXT: pandn %xmm4, %xmm5 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,0,63,0] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE2-NEXT: pandn %xmm3, %xmm4 ; X86-SSE2-NEXT: psrlq $1, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE2-NEXT: psrlq %xmm5, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; X86-SSE2-NEXT: psrlq %xmm5, %xmm1 -; X86-SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],xmm1[1] -; X86-SSE2-NEXT: pand %xmm4, %xmm3 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psllq %xmm3, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; X86-SSE2-NEXT: psllq %xmm3, %xmm0 -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; X86-SSE2-NEXT: orpd %xmm2, %xmm0 +; X86-SSE2-NEXT: psrlq %xmm4, %xmm1 +; X86-SSE2-NEXT: pand %xmm3, %xmm2 +; X86-SSE2-NEXT: psllq %xmm2, %xmm0 +; X86-SSE2-NEXT: por %xmm1, %xmm0 ; X86-SSE2-NEXT: retl %splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %splat) diff --git 
a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll --- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll @@ -802,24 +802,15 @@ ; ; X86-SSE2-LABEL: splatvar_funnnel_v2i64: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [63,0,63,0] ; X86-SSE2-NEXT: pxor %xmm3, %xmm3 ; X86-SSE2-NEXT: psubq %xmm1, %xmm3 ; X86-SSE2-NEXT: pand %xmm2, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 ; X86-SSE2-NEXT: psllq %xmm1, %xmm4 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 -; X86-SSE2-NEXT: psllq %xmm1, %xmm5 -; X86-SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] ; X86-SSE2-NEXT: pand %xmm2, %xmm3 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrlq %xmm3, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] -; X86-SSE2-NEXT: psrlq %xmm2, %xmm0 -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; X86-SSE2-NEXT: orpd %xmm5, %xmm0 +; X86-SSE2-NEXT: psrlq %xmm3, %xmm0 +; X86-SSE2-NEXT: por %xmm4, %xmm0 ; X86-SSE2-NEXT: retl %splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> %splat) diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll --- a/llvm/test/CodeGen/X86/vector-fshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll @@ -1126,23 +1126,14 @@ ; ; X86-SSE2-LABEL: splatvar_funnnel_v2i64: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,1] -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [63,0,63,0] -; X86-SSE2-NEXT: movdqa %xmm3, %xmm5 -; X86-SSE2-NEXT: pand %xmm4, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE2-NEXT: psrlq %xmm5, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; X86-SSE2-NEXT: psrlq %xmm5, %xmm1 -; X86-SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],xmm1[1] -; 
X86-SSE2-NEXT: pandn %xmm4, %xmm3 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,0,63,0] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE2-NEXT: pand %xmm3, %xmm4 +; X86-SSE2-NEXT: psrlq %xmm4, %xmm1 +; X86-SSE2-NEXT: pandn %xmm3, %xmm2 ; X86-SSE2-NEXT: psllq $1, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psllq %xmm3, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; X86-SSE2-NEXT: psllq %xmm3, %xmm0 -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; X86-SSE2-NEXT: orpd %xmm2, %xmm0 +; X86-SSE2-NEXT: psllq %xmm2, %xmm0 +; X86-SSE2-NEXT: por %xmm1, %xmm0 ; X86-SSE2-NEXT: retl %splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %splat) diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll --- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll @@ -828,24 +828,15 @@ ; ; X86-SSE2-LABEL: splatvar_funnnel_v2i64: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [63,0,63,0] ; X86-SSE2-NEXT: pxor %xmm3, %xmm3 ; X86-SSE2-NEXT: psubq %xmm1, %xmm3 ; X86-SSE2-NEXT: pand %xmm2, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 ; X86-SSE2-NEXT: psrlq %xmm1, %xmm4 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 -; X86-SSE2-NEXT: psrlq %xmm1, %xmm5 -; X86-SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] ; X86-SSE2-NEXT: pand %xmm2, %xmm3 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psllq %xmm3, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] -; X86-SSE2-NEXT: psllq %xmm2, %xmm0 -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; X86-SSE2-NEXT: orpd %xmm5, %xmm0 +; X86-SSE2-NEXT: psllq %xmm3, %xmm0 +; X86-SSE2-NEXT: por %xmm4, %xmm0 ; X86-SSE2-NEXT: retl %splat = shufflevector <2 x i64> %amt, <2 x i64> undef, 
<2 x i32> zeroinitializer %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> %splat) diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll --- a/llvm/test/CodeGen/X86/vector-rotate-128.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll @@ -788,17 +788,12 @@ ; ; X86-SSE2-LABEL: splatvar_rotate_v2i64: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,1] -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [64,0,64,0] -; X86-SSE2-NEXT: psubq %xmm2, %xmm3 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psllq %xmm1, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrlq %xmm3, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; X86-SSE2-NEXT: psrlq %xmm3, %xmm0 -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; X86-SSE2-NEXT: orpd %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [64,0,0,0] +; X86-SSE2-NEXT: psubq %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: psllq %xmm1, %xmm3 +; X86-SSE2-NEXT: psrlq %xmm2, %xmm0 +; X86-SSE2-NEXT: por %xmm3, %xmm0 ; X86-SSE2-NEXT: retl %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer %splat64 = sub <2 x i64> <i64 64, i64 64>, %splat