diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5302,19 +5302,6 @@
     if (TLI->isCommutativeBinOp(Opcode))
       if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Ops[1]))
         return FoldSymbolOffset(Opcode, VT, GA, Ops[0].getNode());
-
-    // If this is a bitwise logic opcode see if we can fold bitcasted ops.
-    // TODO: Can we generalize this and fold any bitcasted constant data?
-    if (ISD::isBitwiseLogicOp(Opcode) && Ops[0].getOpcode() == ISD::BITCAST &&
-        Ops[1].getOpcode() == ISD::BITCAST) {
-      SDValue InnerN1 = peekThroughBitcasts(Ops[0].getOperand(0));
-      SDValue InnerN2 = peekThroughBitcasts(Ops[1].getOperand(0));
-      EVT InnerVT = InnerN1.getValueType();
-      if (InnerVT == InnerN2.getValueType() && InnerVT.isInteger())
-        if (SDValue C =
-                FoldConstantArithmetic(Opcode, DL, InnerVT, {InnerN1, InnerN2}))
-          return getBitcast(VT, C);
-    }
   }
 
   // This is for vector folding only from here on.
@@ -5323,6 +5310,54 @@
 
   ElementCount NumElts = VT.getVectorElementCount();
 
+  // See if we can fold through bitcasted integer ops.
+  // TODO: Can we handle undef elements?
+  if (NumOps == 2 && VT.isFixedLengthVector() && VT.isInteger() &&
+      Ops[0].getValueType() == VT && Ops[1].getValueType() == VT &&
+      Ops[0].getOpcode() == ISD::BITCAST &&
+      Ops[1].getOpcode() == ISD::BITCAST) {
+    SDValue N1 = peekThroughBitcasts(Ops[0]);
+    SDValue N2 = peekThroughBitcasts(Ops[1]);
+    auto *BV1 = dyn_cast<BuildVectorSDNode>(N1);
+    auto *BV2 = dyn_cast<BuildVectorSDNode>(N2);
+    EVT BVVT = N1.getValueType();
+    if (BV1 && BV2 && BVVT.isInteger() && BVVT == N2.getValueType()) {
+      bool IsLE = getDataLayout().isLittleEndian();
+      unsigned EltBits = VT.getScalarSizeInBits();
+      SmallVector<APInt> RawBits1, RawBits2;
+      BitVector UndefElts1, UndefElts2;
+      if (BV1->getConstantRawBits(IsLE, EltBits, RawBits1, UndefElts1) &&
+          BV2->getConstantRawBits(IsLE, EltBits, RawBits2, UndefElts2) &&
+          UndefElts1.none() && UndefElts2.none()) {
+        SmallVector<APInt> RawBits;
+        for (unsigned I = 0, E = NumElts.getFixedValue(); I != E; ++I) {
+          Optional<APInt> Fold = FoldValue(Opcode, RawBits1[I], RawBits2[I]);
+          if (!Fold)
+            break;
+          RawBits.push_back(Fold.getValue());
+        }
+        if (RawBits.size() == NumElts.getFixedValue()) {
+          // We have constant folded, but we need to cast this again back to
+          // the original (possibly legalized) type.
+          SmallVector<APInt> DstBits;
+          BitVector DstUndefs;
+          BuildVectorSDNode::recastRawBits(IsLE, BVVT.getScalarSizeInBits(),
+                                           DstBits, RawBits, DstUndefs,
+                                           BitVector(RawBits.size(), false));
+          EVT BVEltVT = BV1->getOperand(0).getValueType();
+          unsigned BVEltBits = BVEltVT.getSizeInBits();
+          SmallVector<SDValue> Ops(DstBits.size(), getUNDEF(BVEltVT));
+          for (unsigned I = 0, E = DstBits.size(); I != E; ++I) {
+            if (DstUndefs[I])
+              continue;
+            Ops[I] = getConstant(DstBits[I].sextOrSelf(BVEltBits), DL, BVEltVT);
+          }
+          return getBitcast(VT, getBuildVector(BVVT, DL, Ops));
+        }
+      }
+    }
+  }
+
   auto IsScalarOrSameVectorSize = [NumElts](const SDValue &Op) {
     return !Op.getValueType().isVector() ||
            Op.getValueType().getVectorElementCount() == NumElts;
diff --git a/llvm/test/CodeGen/X86/packss.ll b/llvm/test/CodeGen/X86/packss.ll
--- a/llvm/test/CodeGen/X86/packss.ll
+++ b/llvm/test/CodeGen/X86/packss.ll
@@ -159,7 +159,7 @@
 ; X86-SSE-NEXT: psllq $63, %xmm1
 ; X86-SSE-NEXT: psllq $63, %xmm0
 ; X86-SSE-NEXT: psrlq $63, %xmm0
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,0,0,0]
+; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,0,0,2147483648]
 ; X86-SSE-NEXT: pxor %xmm2, %xmm0
 ; X86-SSE-NEXT: psubq %xmm2, %xmm0
 ; X86-SSE-NEXT: psrlq $63, %xmm1
@@ -196,11 +196,11 @@
 ; X86-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [63,0,0,0,63,0,0,0]
 ; X86-AVX2-NEXT: # ymm1 = mem[0,1,0,1]
 ; X86-AVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
-; X86-AVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm2
 ; X86-AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,0,0,2147483648,1,0,0,2147483648]
+; X86-AVX2-NEXT: # ymm1 = mem[0,1,0,1]
+; X86-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
 ; X86-AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; X86-AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
@@ -1460,20 +1460,16 @@
 ;
 ; X86-SSE2-LABEL: constant_funnnel_v2i64:
 ; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pxor %xmm1, %xmm1
-; X86-SSE2-NEXT: psubq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrlq $60, %xmm1
 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE2-NEXT: psrlq %xmm1, %xmm2
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
-; X86-SSE2-NEXT: psrlq %xmm1, %xmm3
-; X86-SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1]
+; X86-SSE2-NEXT: psrlq $50, %xmm2
+; X86-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
 ; X86-SSE2-NEXT: psllq $4, %xmm1
 ; X86-SSE2-NEXT: psllq $14, %xmm0
 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X86-SSE2-NEXT: orpd %xmm3, %xmm0
+; X86-SSE2-NEXT: orpd %xmm2, %xmm0
 ; X86-SSE2-NEXT: retl
 %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> <i64 4, i64 14>)
 ret <2 x i64> %res
@@ -1928,10 +1924,9 @@
 ; X86-SSE2-LABEL: splatconstant_funnnel_v2i64:
 ; X86-SSE2: # %bb.0:
 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT: psllq $14, %xmm1
-; X86-SSE2-NEXT: psrlq $50, %xmm0
-; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm0[0,1]
-; X86-SSE2-NEXT: orpd %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlq $50, %xmm1
+; X86-SSE2-NEXT: psllq $14, %xmm0
+; X86-SSE2-NEXT: por %xmm1, %xmm0
 ; X86-SSE2-NEXT: retl
 %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> <i64 14, i64 14>)
 ret <2 x i64> %res
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
@@ -1548,20 +1548,16 @@
 ;
 ; X86-SSE2-LABEL: constant_funnnel_v2i64:
 ; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pxor %xmm1, %xmm1
-; X86-SSE2-NEXT: psubq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psllq $60, %xmm1
 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE2-NEXT: psllq %xmm1, %xmm2
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
-; X86-SSE2-NEXT: psllq %xmm1, %xmm3
-; X86-SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1]
+; X86-SSE2-NEXT: psllq $50, %xmm2
+; X86-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
 ; X86-SSE2-NEXT: psrlq $4, %xmm1
 ; X86-SSE2-NEXT: psrlq $14, %xmm0
 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X86-SSE2-NEXT: orpd %xmm3, %xmm0
+; X86-SSE2-NEXT: orpd %xmm2, %xmm0
 ; X86-SSE2-NEXT: retl
 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> <i64 4, i64 14>)
 ret <2 x i64> %res
@@ -2016,10 +2012,9 @@
 ; X86-SSE2-LABEL: splatconstant_funnnel_v2i64:
 ; X86-SSE2: # %bb.0:
 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT: psrlq $14, %xmm1
-; X86-SSE2-NEXT: psllq $50, %xmm0
-; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm0[0,1]
-; X86-SSE2-NEXT: orpd %xmm1, %xmm0
+; X86-SSE2-NEXT: psllq $50, %xmm1
+; X86-SSE2-NEXT: psrlq $14, %xmm0
+; X86-SSE2-NEXT: por %xmm1, %xmm0
 ; X86-SSE2-NEXT: retl
 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> <i64 14, i64 14>)
 ret <2 x i64> %res
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -990,7 +990,7 @@
 ; X86-SSE-NEXT: psrlq $1, %xmm1
 ; X86-SSE-NEXT: psrlq $7, %xmm0
 ; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X86-SSE-NEXT: movapd {{.*#+}} xmm1 = [2.0E+0,7.2911220195563975E-304]
+; X86-SSE-NEXT: movapd {{.*#+}} xmm1 = [0,1073741824,0,16777216]
 ; X86-SSE-NEXT: xorpd %xmm1, %xmm0
 ; X86-SSE-NEXT: psubq %xmm1, %xmm0
 ; X86-SSE-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
@@ -1080,12 +1080,10 @@
 ;
 ; X86-AVX2-LABEL: constant_shift_v4i64:
 ; X86-AVX2: # %bb.0:
-; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,0,31,0,62,0]
-; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
-; X86-AVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm2
-; X86-AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
-; X86-AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1073741824,0,16777216,0,1,2,0]
+; X86-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
 ; X86-AVX2-NEXT: retl
 %shift = ashr <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62>
 ret <4 x i64> %shift
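
Not part of the patch itself: a minimal standalone C++ sketch of the idea the new FoldConstantArithmetic block relies on, namely folding two bitcasted build-vector constants at the outer integer element width and then recasting the folded bits back to the original build-vector element width. The helper names widenTo64 and narrowTo32 are made up for illustration; LLVM's real equivalents are BuildVectorSDNode::getConstantRawBits and BuildVectorSDNode::recastRawBits, which additionally handle undef lanes and arbitrary element widths.

// Illustration only (not LLVM code): fold an XOR of two "v2i64" constants
// that are really bitcasts of "v4i32" build vectors, by packing the raw bits
// into 64-bit lanes, folding per lane, then splitting back into 32-bit lanes.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// Pack little-endian 32-bit lanes into 64-bit lanes (stand-in for the
// getConstantRawBits step, specialised to 32 -> 64 bits).
static std::vector<uint64_t> widenTo64(const std::vector<uint32_t> &Src) {
  std::vector<uint64_t> Dst;
  for (std::size_t I = 0; I + 1 < Src.size(); I += 2)
    Dst.push_back(uint64_t(Src[I]) | (uint64_t(Src[I + 1]) << 32));
  return Dst;
}

// Split 64-bit lanes back into little-endian 32-bit lanes (stand-in for the
// recastRawBits step), so the folded constant can be rebuilt in the original
// build-vector element type.
static std::vector<uint32_t> narrowTo32(const std::vector<uint64_t> &Src) {
  std::vector<uint32_t> Dst;
  for (uint64_t V : Src) {
    Dst.push_back(uint32_t(V));
    Dst.push_back(uint32_t(V >> 32));
  }
  return Dst;
}

int main() {
  // Two v4i32 constants that the DAG only sees through bitcasts to v2i64.
  std::vector<uint32_t> A = {1, 0, 0, 0x80000000u};
  std::vector<uint32_t> B = {0, 0x40000000u, 0, 0x01000000u};

  // Fold at v2i64 granularity, the way FoldValue folds one element at a time.
  std::vector<uint64_t> WideA = widenTo64(A), WideB = widenTo64(B);
  std::vector<uint64_t> Folded;
  for (std::size_t I = 0; I != WideA.size(); ++I)
    Folded.push_back(WideA[I] ^ WideB[I]);

  // Recast back to v4i32 and print the lanes of the folded constant.
  for (uint32_t V : narrowTo32(Folded))
    std::printf("0x%08x ", V);
  std::printf("\n");
  return 0;
}

On a big-endian target the narrow lanes pack into each wide lane in the opposite order, which is why the patched code threads the data layout's endianness (IsLE) through both the getConstantRawBits and recastRawBits calls.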