Index: llvm/include/llvm/ADT/APInt.h =================================================================== --- llvm/include/llvm/ADT/APInt.h +++ llvm/include/llvm/ADT/APInt.h @@ -2239,12 +2239,16 @@ /// Splat/Merge neighboring bits to widen/narrow the bitmask represented /// by \param A to \param NewBitWidth bits. /// +/// MatchAnyBits: (Default) /// e.g. ScaleBitMask(0b0101, 8) -> 0b00110011 /// e.g. ScaleBitMask(0b00011011, 4) -> 0b0111 -/// A.getBitwidth() or NewBitWidth must be a whole multiples of the other. /// -/// TODO: Do we need a mode where all bits must be set when merging down? -APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth); +/// MatchAllBits: +/// e.g. ScaleBitMask(0b0101, 8) -> 0b00110011 +/// e.g. ScaleBitMask(0b00011011, 4) -> 0b0001 +/// A.getBitwidth() or NewBitWidth must be a whole multiple of the other. +APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, + bool MatchAllBits = false); } // namespace APIntOps // See friend declaration above. This additional declaration is required in Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -2718,7 +2718,15 @@ SubDemandedElts &= ScaledDemandedElts; if (!isSplatValue(Src, SubDemandedElts, SubUndefElts, Depth + 1)) return false; - UndefElts |= APIntOps::ScaleBitMask(SubUndefElts, NumElts); + + // Here we can't do "OR" operation merge for undef bits. + // Because some operations only use part of the source value. 
+ // Take llvm.fshl.* for example: + // t1: v4i32 = Constant:i32<12>, undef:i32, Constant:i32<12>, undef:i32 + // t2: v2i64 = bitcast t1 + // t5: v2i64 = fshl t3, t4, t2 + // We cannot convert t2 to {i64 undef, i64 undef} + UndefElts |= APIntOps::ScaleBitMask(SubUndefElts, NumElts, true); } return true; } Index: llvm/lib/Support/APInt.cpp =================================================================== --- llvm/lib/Support/APInt.cpp +++ llvm/lib/Support/APInt.cpp @@ -2968,7 +2968,8 @@ return A.getBitWidth() - ((A ^ B).countLeadingZeros() + 1); } -APInt llvm::APIntOps::ScaleBitMask(const APInt &A, unsigned NewBitWidth) { +APInt llvm::APIntOps::ScaleBitMask(const APInt &A, unsigned NewBitWidth, + bool MatchAllBits) { unsigned OldBitWidth = A.getBitWidth(); assert((((OldBitWidth % NewBitWidth) == 0) || ((NewBitWidth % OldBitWidth) == 0)) && @@ -2992,11 +2993,16 @@ if (A[i]) NewA.setBits(i * Scale, (i + 1) * Scale); } else { - // Merge bits - if any old bit is set, then set scale equivalent new bit. unsigned Scale = OldBitWidth / NewBitWidth; - for (unsigned i = 0; i != NewBitWidth; ++i) - if (!A.extractBits(Scale, i * Scale).isZero()) - NewA.setBit(i); + for (unsigned i = 0; i != NewBitWidth; ++i) { + if (MatchAllBits) { + if (A.extractBits(Scale, i * Scale).isAllOnes()) + NewA.setBit(i); + } else { + if (!A.extractBits(Scale, i * Scale).isZero()) + NewA.setBit(i); + } + } } return NewA; Index: llvm/test/CodeGen/X86/fshl-splat-undef.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/fshl-splat-undef.ll @@ -0,0 +1,46 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=cannonlake | FileCheck %s + +; Check the correctness of the following test. +; For this case: +; In 32-bit targets, the <i64 12, i64 12, ...> shift amount will convert to <i32 12, i32 0, ...> in +; type legalization, and turn to <i32 12, i32 undef, ...> in combining because fshl +; only uses the low i32 bits. 
+; But the fshl is <8 x i64> fshl, so the <i32 12, i32 undef, ...> will bitcast to +; <8 x i64> back. Something like: +; ============================================================================== +; // t1: v16i32 = Constant:i32<12>, undef:i32, Constant:i32<12>, undef:i32, ... +; // t2: v8i64 = bitcast t1 +; // t5: v8i64 = fshl t3, t4, t2 +; ============================================================================== +; We should make sure not to "merge" the <i32 12, i32 undef> pairs to <i64 undef>. +; (We cannot convert t2 to {i64 undef, i64 undef, ...}; +; that is not equal to the original result.) +; +define void @test_fshl(<8 x i64> %lo, <8 x i64> %hi, <8 x i64>* %arr) { +; CHECK-LABEL: test_fshl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl $63, %eax +; CHECK-NEXT: vmovd %eax, %xmm2 +; CHECK-NEXT: movl $12, %eax +; CHECK-NEXT: vmovd %eax, %xmm3 +; CHECK-NEXT: vpand %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vpsllq %xmm2, %zmm1, %zmm1 +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: vpsrlq $52, %zmm0, %zmm0 +; CHECK-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}, %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm0, (%eax) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retl +entry: + %fshl = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %hi, <8 x i64> %lo, <8 x i64> <i64 12, i64 12, i64 12, i64 12, i64 12, i64 12, i64 12, i64 12>) + %res = shufflevector <8 x i64> %fshl, <8 x i64> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> + store <8 x i64> %res, <8 x i64>* %arr, align 64 + ret void +} + + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare <8 x i64> @llvm.fshl.v8i64(<8 x i64>, <8 x i64>, <8 x i64>) #0 + +attributes #0 = { nocallback nofree nosync nounwind readnone speculatable willreturn } Index: llvm/unittests/ADT/APIntTest.cpp =================================================================== --- llvm/unittests/ADT/APIntTest.cpp +++ llvm/unittests/ADT/APIntTest.cpp @@ -3115,6 +3115,15 @@ APInt::getAllOnes(256)); EXPECT_EQ(APIntOps::ScaleBitMask(APInt::getOneBitSet(4096, 32), 256), APInt::getOneBitSet(256, 2)); + + EXPECT_EQ(APIntOps::ScaleBitMask(APInt(2, 0x00), 8, true), APInt(8, 0x00)); 
+ EXPECT_EQ(APIntOps::ScaleBitMask(APInt(2, 0x01), 8, true), APInt(8, 0x0F)); + EXPECT_EQ(APIntOps::ScaleBitMask(APInt(2, 0x02), 8, true), APInt(8, 0xF0)); + EXPECT_EQ(APIntOps::ScaleBitMask(APInt(2, 0x03), 8, true), APInt(8, 0xFF)); + + EXPECT_EQ(APIntOps::ScaleBitMask(APInt(8, 0x00), 4, true), APInt(4, 0x00)); + EXPECT_EQ(APIntOps::ScaleBitMask(APInt(8, 0xFF), 4, true), APInt(4, 0x0F)); + EXPECT_EQ(APIntOps::ScaleBitMask(APInt(8, 0xE4), 4, true), APInt(4, 0x08)); } } // end anonymous namespace