Index: lib/CodeGen/InterleavedAccessPass.cpp
===================================================================
--- lib/CodeGen/InterleavedAccessPass.cpp
+++ lib/CodeGen/InterleavedAccessPass.cpp
@@ -156,12 +156,17 @@
   return false;
 }
 
-/// \brief Check if the mask is RE-interleave mask for an interleaved store.
-///
-/// I.e. <0, NumSubElts, ... , NumSubElts*(Factor - 1), 1, NumSubElts + 1, ...>
+/// \brief Check if the mask can be used in an interleaved store.
+///
+/// It checks for a more general pattern than the RE-interleave mask.
+/// I.e. <x, y, ... z, x+1, y+1, ... z+1, x+2, y+2, ... z+2, ...>
+/// E.g. For a Factor of 2 (LaneLen=4): <4, 32, 5, 33, 6, 34, 7, 35>
+/// E.g. For a Factor of 3 (LaneLen=4): <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
+/// E.g. For a Factor of 4 (LaneLen=2): <8, 2, 12, 4, 9, 3, 13, 5>
 ///
-/// E.g. The RE-interleave mask (Factor = 2) could be:
-/// <0, 4, 1, 5, 2, 6, 3, 7>
+/// The particular case of an RE-interleave mask is:
+/// I.e. <0, LaneLen, ... , LaneLen*(Factor - 1), 1, LaneLen + 1, ...>
+/// E.g. For a Factor of 2 (LaneLen=4): <0, 4, 1, 5, 2, 6, 3, 7>
 static bool isReInterleaveMask(ArrayRef<int> Mask, unsigned &Factor) {
   unsigned NumElts = Mask.size();
   if (NumElts < 4)
@@ -172,21 +177,74 @@
     if (NumElts % Factor)
       continue;
 
-    unsigned NumSubElts = NumElts / Factor;
-    if (!isPowerOf2_32(NumSubElts))
+    unsigned LaneLen = NumElts / Factor;
+    if (!isPowerOf2_32(LaneLen))
       continue;
 
-    // Check whether each element matchs the RE-interleaved rule. Ignore undef
-    // elements.
-    unsigned i = 0;
-    for (; i < NumElts; i++)
-      if (Mask[i] >= 0 &&
-          static_cast<unsigned>(Mask[i]) !=
-              (i % Factor) * NumSubElts + i / Factor)
+    // Check whether each element matches the general interleaved rule.
+    // Ignore undef elements, as long as the defined elements match the rule.
+    // Outer loop processes all factors (x, y, z in the above example)
+    unsigned I = 0, J;
+    for (; I < Factor; I++) {
+      unsigned SavedLaneValue;
+      unsigned SavedNoUndefs = 0;
+
+      // Inner loop processes all consecutive accesses (x, x+1, ... in the example)
+      for (J = 0; J < LaneLen - 1; J++) {
+        // Lane computes x's position in the Mask
+        unsigned Lane = J * Factor + I;
+        unsigned NextLane = Lane + Factor;
+        int LaneValue = Mask[Lane];
+        int NextLaneValue = Mask[NextLane];
+
+        // If both are defined, values must be sequential
+        if (LaneValue >= 0 && NextLaneValue >= 0 &&
+            LaneValue + 1 != NextLaneValue)
+          break;
+
+        // If the next value is undef, save the current one as reference
+        if (LaneValue >= 0 && NextLaneValue < 0) {
+          SavedLaneValue = LaneValue;
+          SavedNoUndefs = 1;
+        }
+
+        // Undefs are allowed, but defined elements must still be consecutive:
+        // e.g.: x, ..., undef, ..., x + 2, ..., undef, ..., undef, ..., x + 5, ...
+        // Verify this by storing the last non-undef followed by an undef, and
+        // checking that subsequent non-undef values are incremented by the
+        // corresponding distance.
+        if (SavedNoUndefs > 0 && LaneValue < 0) {
+          SavedNoUndefs++;
+          if (NextLaneValue >= 0 &&
+              SavedLaneValue + SavedNoUndefs != (unsigned)NextLaneValue)
+            break;
+        }
+      }
+
+      if (J < LaneLen - 1)
         break;
 
-    // Find a RE-interleaved mask of current factor.
-    if (i == NumElts)
+      int StartMask = 0;
+      if (Mask[I] >= 0) {
+        // StartMask defined by the first value in the lane (J = 0)
+        StartMask = Mask[I];
+      }
+      else if (Mask[(LaneLen - 1) * Factor + I] >= 0) {
+        // StartMask defined by the last value in the lane
+        StartMask = Mask[(LaneLen - 1) * Factor + I] - J;
+      }
+      else if (SavedNoUndefs > 0) {
+        // StartMask defined by some non-undef value seen in the J loop
+        StartMask = SavedLaneValue - (LaneLen - 1 - SavedNoUndefs);
+      }
+      // else StartMask remains set to 0, i.e. all elements are undefs
+
+      if (StartMask < 0)
+        break;
+    }
+
+    // Found an interleaved mask of current factor.
+    if (I == Factor)
       return true;
   }
 
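For intuition, the rule above can be restated per lane: a mask is accepted for some Factor if, within each lane I, every defined element equals StartMask + J for a single non-negative StartMask, with undef elements ignored and an all-undef lane defaulting to StartMask = 0. The standalone sketch below (the function name is mine; it is not part of the patch) checks that reformulation directly, including the pass's power-of-two lane-length requirement:

#include <cstdio>
#include <vector>

// Equivalent restatement of isReInterleaveMask's per-lane rule for one given
// Factor; undef mask elements are encoded as -1.
static bool lanesAreSequential(const std::vector<int> &Mask, unsigned Factor) {
  unsigned NumElts = Mask.size();
  if (NumElts < 4 || Factor == 0 || NumElts % Factor != 0)
    return false;
  unsigned LaneLen = NumElts / Factor;
  if (LaneLen & (LaneLen - 1))
    return false; // The pass also requires a power-of-two lane length.
  for (unsigned I = 0; I < Factor; I++) {
    int StartMask = 0; // Default when the whole lane is undef.
    bool Seen = false;
    for (unsigned J = 0; J < LaneLen; J++) {
      int V = Mask[J * Factor + I];
      if (V < 0)
        continue; // Undef elements are ignored.
      int Candidate = V - static_cast<int>(J);
      if (!Seen) {
        StartMask = Candidate;
        Seen = true;
      } else if (Candidate != StartMask) {
        return false; // Defined elements in the lane are not consecutive.
      }
    }
    if (StartMask < 0)
      return false; // The lane would have to start below element 0.
  }
  return true;
}

int main() {
  // The Factor = 3 (LaneLen = 4) example from the comment above.
  std::vector<int> M = {4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19};
  // The same mask with several undefs (-1) punched in; still accepted.
  std::vector<int> U = {4, -1, 16, -1, 33, 17, 6, -1, -1, 7, 35, 19};
  std::printf("defined: %d, with undefs: %d\n",
              lanesAreSequential(M, 3), lanesAreSequential(U, 3)); // 1, 1
  return 0;
}

The pass itself reaches the same verdict in a single forward scan per lane, tracking runs of undefs as it goes, rather than recomputing a candidate start per element.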
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7152,7 +7152,7 @@
 ///
 /// E.g. Lower an interleaved store (Factor = 3):
 ///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
-///                                     <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
+///                 <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
 ///        store <12 x i32> %i.vec, <12 x i32>* %ptr
 ///
 /// Into:
@@ -7163,6 +7163,17 @@
 ///
 /// Note that the new shufflevectors will be removed and we'll only generate one
 /// st3 instruction in CodeGen.
+///
+/// Example for a more general valid mask (Factor 3). Lower:
+///        %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
+///                 <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
+///        store <12 x i32> %i.vec, <12 x i32>* %ptr
+///
+/// Into:
+///        %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> %v1, <4, 5, 6, 7>
+///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> %v1, <32, 33, 34, 35>
+///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> %v1, <16, 17, 18, 19>
+///        call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
 bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
                                                   ShuffleVectorInst *SVI,
                                                   unsigned Factor) const {
@@ -7173,9 +7184,9 @@
   assert(VecTy->getVectorNumElements() % Factor == 0 &&
          "Invalid interleaved store");
 
-  unsigned NumSubElts = VecTy->getVectorNumElements() / Factor;
+  unsigned LaneLen = VecTy->getVectorNumElements() / Factor;
   Type *EltTy = VecTy->getVectorElementType();
-  VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts);
+  VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);
 
   const DataLayout &DL = SI->getModule()->getDataLayout();
   unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
@@ -7200,7 +7211,7 @@
     Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
     Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
 
-    SubVecTy = VectorType::get(IntTy, NumSubElts);
+    SubVecTy = VectorType::get(IntTy, LaneLen);
   }
 
   Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace());
@@ -7214,9 +7225,28 @@
   SmallVector<Value *, 5> Ops;
 
   // Split the shufflevector operands into sub vectors for the new stN call.
-  for (unsigned i = 0; i < Factor; i++)
-    Ops.push_back(Builder.CreateShuffleVector(
-        Op0, Op1, getSequentialMask(Builder, NumSubElts * i, NumSubElts)));
+  auto Mask = SVI->getShuffleMask();
+  for (unsigned i = 0; i < Factor; i++) {
+    if (Mask[i] >= 0) {
+      Ops.push_back(Builder.CreateShuffleVector(
+          Op0, Op1, getSequentialMask(Builder, Mask[i], LaneLen)));
+    } else {
+      unsigned StartMask = 0;
+      for (unsigned j = 1; j < LaneLen; j++) {
+        if (Mask[j * Factor + i] >= 0) {
+          StartMask = Mask[j * Factor + i] - j;
+          break;
+        }
+      }
+      // Note: If all elements in a chunk are undefs, StartMask = 0!
+      // Note: Filling undef gaps with random elements is ok, since those
+      // elements were being written anyway (with undefs). In the case of
+      // all undefs we default to using elements from 0.
+      // Note: StartMask cannot be negative; that is checked in isReInterleaveMask.
+      Ops.push_back(Builder.CreateShuffleVector(
+          Op0, Op1, getSequentialMask(Builder, StartMask, LaneLen)));
+    }
+  }
 
   Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), PtrTy));
   Builder.CreateCall(StNFunc, Ops);
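The lowering above leans on the preexisting getSequentialMask helper, which is not shown in this diff. For reference, here is a minimal stand-in with the behaviour the call sites assume (the real helper in the backend files may differ in detail): it materialises the constant mask <Start, Start+1, ..., Start+NumElts-1> used for each per-lane sub-shuffle.

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Builds the sequential shuffle mask <Start, ..., Start + NumElts - 1>.
static Constant *getSequentialMaskSketch(IRBuilder<> &Builder, unsigned Start,
                                         unsigned NumElts) {
  SmallVector<Constant *, 16> Mask;
  for (unsigned i = 0; i < NumElts; i++)
    Mask.push_back(Builder.getInt32(Start + i));
  return ConstantVector::get(Mask);
}

So getSequentialMask(Builder, Mask[i], LaneLen) extracts the LaneLen consecutive source elements starting at Mask[i], which is exactly the lane that the stN instruction writes.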
Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -13099,6 +13099,17 @@
 ///
 /// Note that the new shufflevectors will be removed and we'll only generate one
 /// vst3 instruction in CodeGen.
+///
+/// Example for a more general valid mask (Factor 3). Lower:
+///        %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
+///                 <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
+///        store <12 x i32> %i.vec, <12 x i32>* %ptr
+///
+/// Into:
+///        %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> %v1, <4, 5, 6, 7>
+///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> %v1, <32, 33, 34, 35>
+///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> %v1, <16, 17, 18, 19>
+///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
 bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
                                               ShuffleVectorInst *SVI,
                                               unsigned Factor) const {
@@ -13109,9 +13120,9 @@
   assert(VecTy->getVectorNumElements() % Factor == 0 &&
          "Invalid interleaved store");
 
-  unsigned NumSubElts = VecTy->getVectorNumElements() / Factor;
+  unsigned LaneLen = VecTy->getVectorNumElements() / Factor;
   Type *EltTy = VecTy->getVectorElementType();
-  VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts);
+  VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);
 
   const DataLayout &DL = SI->getModule()->getDataLayout();
   unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
@@ -13138,7 +13149,7 @@
     Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
     Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
 
-    SubVecTy = VectorType::get(IntTy, NumSubElts);
+    SubVecTy = VectorType::get(IntTy, LaneLen);
   }
 
   static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
@@ -13154,9 +13165,28 @@
       SI->getModule(), StoreInts[Factor - 2], Tys);
 
   // Split the shufflevector operands into sub vectors for the new vstN call.
-  for (unsigned i = 0; i < Factor; i++)
-    Ops.push_back(Builder.CreateShuffleVector(
-        Op0, Op1, getSequentialMask(Builder, NumSubElts * i, NumSubElts)));
+  auto Mask = SVI->getShuffleMask();
+  for (unsigned i = 0; i < Factor; i++) {
+    if (Mask[i] >= 0) {
+      Ops.push_back(Builder.CreateShuffleVector(
+          Op0, Op1, getSequentialMask(Builder, Mask[i], LaneLen)));
+    } else {
+      unsigned StartMask = 0;
+      for (unsigned j = 1; j < LaneLen; j++) {
+        if (Mask[j * Factor + i] >= 0) {
+          StartMask = Mask[j * Factor + i] - j;
+          break;
+        }
+      }
+      // Note: If all elements in a chunk are undefs, StartMask = 0!
+      // Note: Filling undef gaps with random elements is ok, since those
+      // elements were being written anyway (with undefs). In the case of
+      // all undefs we default to using elements from 0.
+      // Note: StartMask cannot be negative; that is checked in isReInterleaveMask.
+      Ops.push_back(Builder.CreateShuffleVector(
+          Op0, Op1, getSequentialMask(Builder, StartMask, LaneLen)));
+    }
+  }
 
   Ops.push_back(Builder.getInt32(SI->getAlignment()));
   Builder.CreateCall(VstNFunc, Ops);
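The else branch above is worth tracing once by hand. When lane i starts with an undef, the loop scans forward to the first defined element and subtracts its offset j to recover where the lane must begin. A self-contained sketch (the mask values are illustrative only):

#include <cstdio>

int main() {
  const unsigned Factor = 3, LaneLen = 4, i = 1; // Lane 1 of a factor-3 mask.
  // Lane 1 reads positions 1, 4, 7, 10: <undef, 33, 34, 35>.
  const int Mask[12] = {4, -1, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19};
  unsigned StartMask = 0; // Default if the whole lane were undef.
  for (unsigned j = 1; j < LaneLen; j++) {
    if (Mask[j * Factor + i] >= 0) {
      StartMask = Mask[j * Factor + i] - j; // First hit: 33 - 1 == 32.
      break;
    }
  }
  std::printf("StartMask = %u\n", StartMask); // Prints 32.
  return 0;
}

The resulting sub-shuffle mask is then <32, 33, 34, 35>, matching %sub.v1 in the doc comment even though element 32 itself was undef in this variant of the mask.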
Index: test/CodeGen/AArch64/aarch64-interleaved-accesses.ll
===================================================================
--- test/CodeGen/AArch64/aarch64-interleaved-accesses.ll
+++ test/CodeGen/AArch64/aarch64-interleaved-accesses.ll
@@ -280,3 +280,114 @@
   %3 = extractelement <8 x i32> %1, i32 2
   ret i32 %3
 }
+
+; NEON-LABEL: store_general_mask_factor4:
+; NEON: st4 { v3.2s, v4.2s, v5.2s, v6.2s }, [x0]
+; NONEON-LABEL: store_general_mask_factor4:
+; NONEON-NOT: st4
+define void @store_general_mask_factor4(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+  %base = bitcast i32* %ptr to <8 x i32>*
+  %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32>
+  store <8 x i32> %i.vec, <8 x i32>* %base, align 4
+  ret void
+}
+
+; NEON-LABEL: store_general_mask_factor4_undefbeg:
+; NEON: st4 { v3.2s, v4.2s, v5.2s, v6.2s }, [x0]
+; NONEON-LABEL: store_general_mask_factor4_undefbeg:
+; NONEON-NOT: st4
+define void @store_general_mask_factor4_undefbeg(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+  %base = bitcast i32* %ptr to <8 x i32>*
+  %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32>
+  store <8 x i32> %i.vec, <8 x i32>* %base, align 4
+  ret void
+}
+
+; NEON-LABEL: store_general_mask_factor4_undefend:
+; NEON: st4 { v3.2s, v4.2s, v5.2s, v6.2s }, [x0]
+; NONEON-LABEL: store_general_mask_factor4_undefend:
+; NONEON-NOT: st4
+define void @store_general_mask_factor4_undefend(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+  %base = bitcast i32* %ptr to <8 x i32>*
+  %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32>
+  store <8 x i32> %i.vec, <8 x i32>* %base, align 4
+  ret void
+}
+
+; NEON-LABEL: store_general_mask_factor4_undefmid:
+; NEON: st4 { v3.2s, v4.2s, v5.2s, v6.2s }, [x0]
+; NONEON-LABEL: store_general_mask_factor4_undefmid:
+; NONEON-NOT: st4
+define void @store_general_mask_factor4_undefmid(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+  %base = bitcast i32* %ptr to <8 x i32>*
+  %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32>
+  store <8 x i32> %i.vec, <8 x i32>* %base, align 4
+  ret void
+}
+
+; NEON-LABEL: store_general_mask_factor4_undefmulti:
+; NEON: st4 { v2.2s, v3.2s, v4.2s, v5.2s }, [x0]
+; NONEON-LABEL: store_general_mask_factor4_undefmulti:
+; NONEON-NOT: st4
+define void @store_general_mask_factor4_undefmulti(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+  %base = bitcast i32* %ptr to <8 x i32>*
+  %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32>
+  store <8 x i32> %i.vec, <8 x i32>* %base, align 4
+  ret void
+}
+
+; NEON-LABEL: store_general_mask_factor3:
+; NEON: st3 { v2.4s, v3.4s, v4.4s }, [x0]
+; NONEON-LABEL: store_general_mask_factor3:
+; NONEON-NOT: st3
+define void @store_general_mask_factor3(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+  %base = bitcast i32* %ptr to <12 x i32>*
+  %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32>
+  store <12 x i32> %i.vec, <12 x i32>* %base, align 4
+  ret void
+}
+
+; NEON-LABEL: store_general_mask_factor3_undefmultimid:
+; NEON: st3 { v2.4s, v3.4s, v4.4s }, [x0]
+; NONEON-LABEL: store_general_mask_factor3_undefmultimid:
+; NONEON-NOT: st3
+define void @store_general_mask_factor3_undefmultimid(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+  %base = bitcast i32* %ptr to <12 x i32>*
+  %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32>
+  store <12 x i32> %i.vec, <12 x i32>* %base, align 4
+  ret void
+}
+
+; NEON-LABEL: store_general_mask_factor3_undef_fail:
+; NEON-NOT: st3
+; NONEON-LABEL: store_general_mask_factor3_undef_fail:
+; NONEON-NOT: st3
+define void @store_general_mask_factor3_undef_fail(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+  %base = bitcast i32* %ptr to <12 x i32>*
+  %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32>
+  store <12 x i32> %i.vec, <12 x i32>* %base, align 4
+  ret void
+}
+
+; NEON-LABEL: store_general_mask_factor3_undeflane:
+; NEON: st3 { v1.4s, v2.4s, v3.4s }, [x0]
+; NONEON-LABEL: store_general_mask_factor3_undeflane:
+; NONEON-NOT: st3
+define void @store_general_mask_factor3_undeflane(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+  %base = bitcast i32* %ptr to <12 x i32>*
+  %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32>
+  store <12 x i32> %i.vec, <12 x i32>* %base, align 4
+  ret void
+}
+
+; NEON-LABEL: store_general_mask_factor3_negativestart:
+; NEON-NOT: st3
+; NONEON-LABEL: store_general_mask_factor3_negativestart:
+; NONEON-NOT: st3
+define void @store_general_mask_factor3_negativestart(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+  %base = bitcast i32* %ptr to <12 x i32>*
+  %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32>
+  store <12 x i32> %i.vec, <12 x i32>* %base, align 4
+  ret void
+}
+
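The *_undef_fail and *_negativestart tests above pin down the rejection path: if the only defined elements of a lane force it to begin before element 0, StartMask goes negative and isReInterleaveMask must refuse the mask rather than let lowering build a sub-shuffle starting at a negative index. A minimal sketch of the failing shape (lane values illustrative, not the tests' actual masks):

#include <cstdio>

int main() {
  const unsigned LaneLen = 4;
  // A factor-3 lane read at j = 0..3: <undef, 0, 1, 2>. The last defined
  // value anchors the start: 2 - (LaneLen - 1) == -1.
  const int Lane[4] = {-1, 0, 1, 2};
  int StartMask = Lane[LaneLen - 1] - static_cast<int>(LaneLen - 1);
  std::printf("StartMask = %d -> %s\n", StartMask,
              StartMask < 0 ? "reject" : "accept"); // Prints -1 -> reject.
  return 0;
}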
Index: test/CodeGen/ARM/arm-interleaved-accesses.ll
===================================================================
--- test/CodeGen/ARM/arm-interleaved-accesses.ll
+++ test/CodeGen/ARM/arm-interleaved-accesses.ll
@@ -316,3 +316,147 @@
   %3 = extractelement <8 x i32> %1, i32 2
   ret i32 %3
 }
+
+; NEON-LABEL: store_general_mask_factor4:
+; NEON: vst4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
+; NONEON-LABEL: store_general_mask_factor4:
+; NONEON-NOT: vst4.32
+define void @store_general_mask_factor4(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+  %base = bitcast i32* %ptr to <8 x i32>*
+  %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32>
+  store <8 x i32> %i.vec, <8 x i32>* %base, align 4
+  ret void
+}
+
+; NEON-LABEL: store_general_mask_factor4_undefbeg:
+; NEON: vst4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
+; NONEON-LABEL: store_general_mask_factor4_undefbeg:
+; NONEON-NOT: vst4.32
+define void @store_general_mask_factor4_undefbeg(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+  %base = bitcast i32* %ptr to <8 x i32>*
+  %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32>
+  store <8 x i32> %i.vec, <8 x i32>* %base, align 4
+  ret void
+}
+
+; NEON-LABEL: store_general_mask_factor4_undefend:
+; NEON: vst4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
+; NONEON-LABEL: store_general_mask_factor4_undefend:
+; NONEON-NOT: vst4.32
+define void @store_general_mask_factor4_undefend(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+  %base = bitcast i32* %ptr to <8 x i32>*
+  %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32>
+  store <8 x i32> %i.vec, <8 x i32>* %base, align 4
+  ret void
+}
+
+; NEON-LABEL: store_general_mask_factor4_undefmid:
+; NEON: vst4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
+; NONEON-LABEL: store_general_mask_factor4_undefmid:
+; NONEON-NOT: vst4.32
+define void @store_general_mask_factor4_undefmid(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+  %base = bitcast i32* %ptr to <8 x i32>*
+  %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32>
+  store <8 x i32> %i.vec, <8 x i32>* %base, align 4
+  ret void
+}
+
+; NEON-LABEL: store_general_mask_factor4_undefmulti:
+; NEON: vst4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
+; NONEON-LABEL: store_general_mask_factor4_undefmulti:
+; NONEON-NOT: vst4.32
+define void @store_general_mask_factor4_undefmulti(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+  %base = bitcast i32* %ptr to <8 x i32>*
+  %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32>
+  store <8 x i32> %i.vec, <8 x i32>* %base, align 4
+  ret void
+}
+
+; NEON-LABEL: store_general_mask_factor3:
+; NEON: vst3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
+; NONEON-LABEL: store_general_mask_factor3:
+; NONEON-NOT: vst3.32
+define void @store_general_mask_factor3(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+  %base = bitcast i32* %ptr to <12 x i32>*
+  %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32>
+  store <12 x i32> %i.vec, <12 x i32>* %base, align 4
+  ret void
+}
+
+; NEON-LABEL: store_general_mask_factor3_undefmultimid:
+; NEON: vst3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
+; NONEON-LABEL: store_general_mask_factor3_undefmultimid:
+; NONEON-NOT: vst3.32
+define void @store_general_mask_factor3_undefmultimid(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+  %base = bitcast i32* %ptr to <12 x i32>*
+  %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32>
+  store <12 x i32> %i.vec, <12 x i32>* %base, align 4
+  ret void
+}
+
+; NEON-LABEL: store_general_mask_factor3_undef_fail:
+; NEON-NOT: vst3.32
+; NONEON-LABEL: store_general_mask_factor3_undef_fail:
+; NONEON-NOT: vst3.32
+define void @store_general_mask_factor3_undef_fail(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+  %base = bitcast i32* %ptr to <12 x i32>*
+  %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32>
+  store <12 x i32> %i.vec, <12 x i32>* %base, align 4
+  ret void
+}
+
+; NEON-LABEL: store_general_mask_factor3_undeflane:
+; NEON: vst3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
+; NONEON-LABEL: store_general_mask_factor3_undeflane:
+; NONEON-NOT: vst3.32
+define void @store_general_mask_factor3_undeflane(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+  %base = bitcast i32* %ptr to <12 x i32>*
+  %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32>
+  store <12 x i32> %i.vec, <12 x i32>* %base, align 4
+  ret void
+}
+
+; NEON-LABEL: store_general_mask_factor3_endstart_fail:
+; NEON-NOT: vst3.32
+; NONEON-LABEL: store_general_mask_factor3_endstart_fail:
+; NONEON-NOT: vst3.32
+define void @store_general_mask_factor3_endstart_fail(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+  %base = bitcast i32* %ptr to <12 x i32>*
+  %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32>
+  store <12 x i32> %i.vec, <12 x i32>* %base, align 4
+  ret void
+}
+
+; NEON-LABEL: store_general_mask_factor3_endstart_pass:
+; NEON: vst3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
+; NONEON-LABEL: store_general_mask_factor3_endstart_pass:
+; NONEON-NOT: vst3.32
+define void @store_general_mask_factor3_endstart_pass(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+  %base = bitcast i32* %ptr to <12 x i32>*
+  %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32>
+  store <12 x i32> %i.vec, <12 x i32>* %base, align 4
+  ret void
+}
+
+; NEON-LABEL: store_general_mask_factor3_midstart_fail:
+; NEON-NOT: vst3.32
+; NONEON-LABEL: store_general_mask_factor3_midstart_fail:
+; NONEON-NOT: vst3.32
+define void @store_general_mask_factor3_midstart_fail(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+  %base = bitcast i32* %ptr to <12 x i32>*
+  %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32>
+  store <12 x i32> %i.vec, <12 x i32>* %base, align 4
+  ret void
+}
+
+; NEON-LABEL: store_general_mask_factor3_midstart_pass:
+; NEON: vst3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
+; NONEON-LABEL: store_general_mask_factor3_midstart_pass:
+; NONEON-NOT: vst3.32
+define void @store_general_mask_factor3_midstart_pass(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+  %base = bitcast i32* %ptr to <12 x i32>*
+  %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32>
+  store <12 x i32> %i.vec, <12 x i32>* %base, align 4
+  ret void
+}
+