Index: llvm/include/llvm/ADT/APInt.h =================================================================== --- llvm/include/llvm/ADT/APInt.h +++ llvm/include/llvm/ADT/APInt.h @@ -2239,12 +2239,16 @@ /// Splat/Merge neighboring bits to widen/narrow the bitmask represented /// by \param A to \param NewBitWidth bits. /// +/// MatchAnyBits: (Default) /// e.g. ScaleBitMask(0b0101, 8) -> 0b00110011 /// e.g. ScaleBitMask(0b00011011, 4) -> 0b0111 -/// A.getBitwidth() or NewBitWidth must be a whole multiples of the other. /// -/// TODO: Do we need a mode where all bits must be set when merging down? -APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth); +/// MatchAllBits: +/// e.g. ScaleBitMask(0b0101, 8) -> 0b00110011 +/// e.g. ScaleBitMask(0b00011011, 4) -> 0b0001 +/// A.getBitwidth() or NewBitWidth must be a whole multiple of the other. +APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, + bool MatchAllBits = false); } // namespace APIntOps // See friend declaration above. This additional declaration is required in Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -2718,7 +2718,15 @@ SubDemandedElts &= ScaledDemandedElts; if (!isSplatValue(Src, SubDemandedElts, SubUndefElts, Depth + 1)) return false; - UndefElts |= APIntOps::ScaleBitMask(SubUndefElts, NumElts); + + // Here we can't do "OR" operation merge for undef bits. + // Because some operations only use part of the source value. 
+ // Take llvm.fshl.* for example: + // t1: v4i32 = Constant:i32<12>, undef:i32, Constant:i32<12>, undef:i32 + // t2: v2i64 = bitcast t1 + // t5: v2i64 = fshl t3, t4, t2 + // We cannot convert t2 to {i64 undef, i64 undef} + UndefElts |= APIntOps::ScaleBitMask(SubUndefElts, NumElts, true); } return true; } Index: llvm/lib/Support/APInt.cpp =================================================================== --- llvm/lib/Support/APInt.cpp +++ llvm/lib/Support/APInt.cpp @@ -2968,7 +2968,8 @@ return A.getBitWidth() - ((A ^ B).countLeadingZeros() + 1); } -APInt llvm::APIntOps::ScaleBitMask(const APInt &A, unsigned NewBitWidth) { +APInt llvm::APIntOps::ScaleBitMask(const APInt &A, unsigned NewBitWidth, + bool MatchAllBits) { unsigned OldBitWidth = A.getBitWidth(); assert((((OldBitWidth % NewBitWidth) == 0) || ((NewBitWidth % OldBitWidth) == 0)) && @@ -2992,11 +2993,16 @@ if (A[i]) NewA.setBits(i * Scale, (i + 1) * Scale); } else { - // Merge bits - if any old bit is set, then set scale equivalent new bit. unsigned Scale = OldBitWidth / NewBitWidth; - for (unsigned i = 0; i != NewBitWidth; ++i) - if (!A.extractBits(Scale, i * Scale).isZero()) - NewA.setBit(i); + for (unsigned i = 0; i != NewBitWidth; ++i) { + if (MatchAllBits) { + if (A.extractBits(Scale, i * Scale).isAllOnes()) + NewA.setBit(i); + } else { + if (!A.extractBits(Scale, i * Scale).isZero()) + NewA.setBit(i); + } + } } return NewA; Index: llvm/test/CodeGen/X86/fshl-splat-undef.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/fshl-splat-undef.ll @@ -0,0 +1,46 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=cannonlake | FileCheck %s + +; Check the correctness of the following test. +; For this case: +; In 32-bit targets, the <i64 12, i64 12, ...> shift amount will convert to <i32 12, i32 0, ...> in +; type legalization, and turn to <i32 12, i32 undef, ...> in combining because fshl +; only uses the low i32 bits. 
+; But the fshl is <8 x i64> fshl, so the <i32 12, i32 undef, ...> will bitcast to +; <8 x i64> back. Something like: +; ============================================================================== +; // t1: v16i32 = Constant:i32<12>, undef:i32, Constant:i32<12>, undef:i32, ... +; // t2: v8i64 = bitcast t1 +; // t5: v8i64 = fshl t3, t4, t2 +; ============================================================================== +; We should make sure not to "merge" the <i32 12, i32 undef> pairs to <i64 undef>. +; (We cannot convert t2 to {i64 undef, i64 undef, ...}; +; that is not equal to the original result.) +; +define void @test_fshl(<8 x i64> %lo, <8 x i64> %hi, <8 x i64>* %arr) { +; CHECK-LABEL: test_fshl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl $63, %eax +; CHECK-NEXT: vmovd %eax, %xmm2 +; CHECK-NEXT: movl $12, %eax +; CHECK-NEXT: vmovd %eax, %xmm3 +; CHECK-NEXT: vpand %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vpsllq %xmm2, %zmm1, %zmm1 +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: vpsrlq $52, %zmm0, %zmm0 +; CHECK-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}, %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm0, (%eax) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retl +entry: + %fshl = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %hi, <8 x i64> %lo, <8 x i64> <i64 12, i64 12, i64 12, i64 12, i64 12, i64 12, i64 12, i64 12>) + %res = shufflevector <8 x i64> %fshl, <8 x i64> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> + store <8 x i64> %res, <8 x i64>* %arr, align 64 + ret void +} + + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare <8 x i64> @llvm.fshl.v8i64(<8 x i64>, <8 x i64>, <8 x i64>) #0 + +attributes #0 = { nocallback nofree nosync nounwind readnone speculatable willreturn } Index: llvm/unittests/ADT/APIntTest.cpp =================================================================== --- llvm/unittests/ADT/APIntTest.cpp +++ llvm/unittests/ADT/APIntTest.cpp @@ -3115,6 +3115,15 @@ APInt::getAllOnes(256)); EXPECT_EQ(APIntOps::ScaleBitMask(APInt::getOneBitSet(4096, 32), 256), APInt::getOneBitSet(256, 2)); + + EXPECT_EQ(APIntOps::ScaleBitMask(APInt(2, 0x00), 8, true), APInt(8, 0x00)); 
+ EXPECT_EQ(APIntOps::ScaleBitMask(APInt(2, 0x01), 8, true), APInt(8, 0x0F)); + EXPECT_EQ(APIntOps::ScaleBitMask(APInt(2, 0x02), 8, true), APInt(8, 0xF0)); + EXPECT_EQ(APIntOps::ScaleBitMask(APInt(2, 0x03), 8, true), APInt(8, 0xFF)); + + EXPECT_EQ(APIntOps::ScaleBitMask(APInt(8, 0x00), 4, true), APInt(4, 0x00)); + EXPECT_EQ(APIntOps::ScaleBitMask(APInt(8, 0xFF), 4, true), APInt(4, 0x0F)); + EXPECT_EQ(APIntOps::ScaleBitMask(APInt(8, 0xE4), 4, true), APInt(4, 0x08)); } } // end anonymous namespace