Index: lib/CodeGen/InterleavedAccessPass.cpp
===================================================================
--- lib/CodeGen/InterleavedAccessPass.cpp
+++ lib/CodeGen/InterleavedAccessPass.cpp
@@ -178,15 +178,36 @@
     // Check whether each element matchs the RE-interleaved rule. Ignore undef
     // elements.
-    unsigned i = 0;
-    for (; i < NumElts; i++)
-      if (Mask[i] >= 0 &&
-          static_cast<unsigned>(Mask[i]) !=
-              (i % Factor) * NumSubElts + i / Factor)
+    unsigned i = 0, j;
+    for (; i < Factor; i++) {
+      int PreviousMask = -1;
+      int PreviousPos = -1;
+      //TODO: check that Mask[i] (if it exists) is aligned (TLI).
+      //TODO: add additional tests (+ARM)
+      for (j = 0; j < NumSubElts-1; j++) {
+        unsigned ij = j*Factor + i;
+        if (Mask[ij] >= 0 && Mask[ij + Factor] >= 0 &&
+            static_cast<unsigned>(Mask[ij]) + 1 !=
+                static_cast<unsigned>(Mask[ij + Factor]))
+          break;
+
+        // With undefined mask, we can have: 2, undef, 7, undef, 32.
+        // Compare the next mask, with value from further back to avoid this.
+        if (PreviousMask > 0 && Mask[ij] < 0 && Mask[ij + Factor] >= 0 &&
+            static_cast<unsigned>(PreviousMask) + (j - PreviousPos) + 1 !=
+                static_cast<unsigned>(Mask[ij + Factor]))
+          break;
+        if (Mask[ij] >= 0 && Mask[ij + Factor] < 0) {
+          PreviousMask = Mask[ij];
+          PreviousPos = j;
+        }
+      }
+      if (j < NumSubElts-1)
         break;
+    }
 
     // Find a RE-interleaved mask of current factor.
-    if (i == NumElts)
+    if (i == Factor)
       return true;
   }
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7233,9 +7233,25 @@
   SmallVector<Value *, 2> Ops;
 
   // Split the shufflevector operands into sub vectors for the new stN call.
-  for (unsigned i = 0; i < Factor; i++)
-    Ops.push_back(Builder.CreateShuffleVector(
-        Op0, Op1, getSequentialMask(Builder, NumSubElts * i, NumSubElts)));
+  ArrayRef<int> Mask = SVI->getShuffleMask();
+  for (unsigned i = 0; i < Factor; i++) {
+    if (Mask[i] >= 0)
+      Ops.push_back(Builder.CreateShuffleVector(
+          Op0, Op1, getSequentialMask(Builder, Mask[i], NumSubElts)));
+      //Op0, Op1, getSequentialMask(Builder, Prefix + Skip * i, NumSubElts)));
+    else {
+      unsigned StartMask = 0;
+      for (unsigned j = 1; j < NumSubElts; j++) {
+        if (Mask[j*Factor + i] >= 0) {
+          StartMask = Mask[j*Factor + i] - j;
+          break;
+        }
+      }
+      //TODO: StartMask can end up being negative!
+      Ops.push_back(Builder.CreateShuffleVector(
+          Op0, Op1, getSequentialMask(Builder, StartMask, NumSubElts)));
+    }
+  }
   Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), PtrTy));
   Builder.CreateCall(StNFunc, Ops);
Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -12759,9 +12759,23 @@
       SI->getModule(), StoreInts[Factor - 2], Tys);
 
   // Split the shufflevector operands into sub vectors for the new vstN call.
-  for (unsigned i = 0; i < Factor; i++)
-    Ops.push_back(Builder.CreateShuffleVector(
-        Op0, Op1, getSequentialMask(Builder, NumSubElts * i, NumSubElts)));
+  ArrayRef<int> Mask = SVI->getShuffleMask();
+  for (unsigned i = 0; i < Factor; i++) {
+    if (Mask[i] >= 0)
+      Ops.push_back(Builder.CreateShuffleVector(
+          Op0, Op1, getSequentialMask(Builder, Mask[i], NumSubElts)));
+    else {
+      unsigned StartMask = 0;
+      for (unsigned j = 1; j < NumSubElts; j++) {
+        if (Mask[j*Factor + i] >= 0) {
+          StartMask = Mask[j*Factor + i] - j;
+          break;
+        }
+      }
+      Ops.push_back(Builder.CreateShuffleVector(
+          Op0, Op1, getSequentialMask(Builder, StartMask, NumSubElts)));
+    }
+  }
   Ops.push_back(Builder.getInt32(SI->getAlignment()));
   Builder.CreateCall(VstNFunc, Ops);
Index: test/CodeGen/AArch64/aarch64-interleaved-accesses.ll
===================================================================
--- test/CodeGen/AArch64/aarch64-interleaved-accesses.ll
+++ test/CodeGen/AArch64/aarch64-interleaved-accesses.ll
@@ -280,3 +280,14 @@
   %3 = extractelement <8 x i32> %1, i32 2
   ret i32 %3
 }
+
+; NEON-LABEL: store_general_mask_factor4:
+; NEON: st4 { v3.2s, v4.2s, v5.2s, v6.2s }, [x0]
+; NONEON-LABEL: store_general_mask_factor4:
+; NONEON-NOT: st4
+define void @store_general_mask_factor4(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
+  %base = bitcast i32* %ptr to <8 x i32>*
+  %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32>
+  store <8 x i32> %i.vec, <8 x i32>* %base, align 4
+  ret void
+}
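
Illustration, not part of the patch: the first hunk generalizes the isReInterleaveMask check so that each of the Factor sub-sequences only has to consist of consecutive source indices (with undef entries tolerated), instead of requiring the fixed pattern (i % Factor) * NumSubElts + i / Factor. Below is a minimal standalone sketch of that property, assuming plain std::vector masks; the name isReInterleaveMaskSketch and the sample masks are hypothetical and chosen only for the example.

// Sketch only: recognizes a factor-F re-interleave mask where every lane
// group is a run of consecutive source indices; -1 models an undef lane.
#include <cstdio>
#include <vector>

static bool isReInterleaveMaskSketch(const std::vector<int> &Mask,
                                     unsigned Factor) {
  unsigned NumElts = Mask.size();
  if (Factor < 2 || NumElts % Factor != 0)
    return false;
  unsigned NumSubElts = NumElts / Factor;

  // For each lane group i, any two defined neighbours in the group must
  // differ by one; across an undef gap, the difference must equal the number
  // of skipped slots.
  for (unsigned i = 0; i < Factor; i++) {
    int PrevVal = -1, PrevPos = -1;
    for (unsigned j = 0; j + 1 < NumSubElts; j++) {
      int Cur = Mask[j * Factor + i], Next = Mask[(j + 1) * Factor + i];
      if (Cur >= 0 && Next >= 0 && Cur + 1 != Next)
        return false;
      if (Cur < 0 && Next >= 0 && PrevVal >= 0 &&
          PrevVal + (int)(j - PrevPos) + 1 != Next)
        return false;
      if (Cur >= 0 && Next < 0) {
        PrevVal = Cur;
        PrevPos = (int)j;
      }
    }
  }
  return true;
}

int main() {
  // Factor-2 interleave of two sub-vectors starting at 0 and 4: {0,4,1,5}.
  std::vector<int> A = {0, 4, 1, 5};
  std::vector<int> B = {0, 4, -1, 5}; // undef lane, still recognized
  std::vector<int> C = {0, 4, 2, 5};  // 0 -> 2 is not consecutive
  printf("%d %d %d\n", isReInterleaveMaskSketch(A, 2),
         isReInterleaveMaskSketch(B, 2), isReInterleaveMaskSketch(C, 2));
  return 0;
}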
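
Illustration, not part of the patch: the AArch64 and ARM hunks derive the start of each stN/vstN sub-vector from the shuffle mask itself, using Mask[i] when the group's first element is defined and otherwise back-computing the start from the first defined element of the group; the patch's own TODO notes that this back-computed start can end up negative. A standalone sketch of that start computation, with a hypothetical subVectorStarts helper:

// Sketch only: compute the starting source index of each of the Factor
// sub-vectors that would feed an stN/vstN intrinsic.
#include <cstdio>
#include <vector>

static std::vector<int> subVectorStarts(const std::vector<int> &Mask,
                                        unsigned Factor) {
  unsigned NumSubElts = Mask.size() / Factor;
  std::vector<int> Starts(Factor, 0);
  for (unsigned i = 0; i < Factor; i++) {
    if (Mask[i] >= 0) {
      Starts[i] = Mask[i]; // group leads with a defined element
      continue;
    }
    // Leading element is undef: back-compute the start from the first
    // defined element of this group (this is where the value may go negative).
    for (unsigned j = 1; j < NumSubElts; j++) {
      if (Mask[j * Factor + i] >= 0) {
        Starts[i] = Mask[j * Factor + i] - (int)j;
        break;
      }
    }
  }
  return Starts;
}

int main() {
  // Factor-2 interleave of groups starting at 4 and 12, with the leading
  // element of the second group left undef.
  std::vector<int> Mask = {4, -1, 5, 13, 6, 14, 7, 15};
  std::vector<int> S = subVectorStarts(Mask, 2);
  printf("start[0]=%d start[1]=%d\n", S[0], S[1]); // prints 4 and 12
  return 0;
}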