Index: lib/CodeGen/InterleavedAccessPass.cpp =================================================================== --- lib/CodeGen/InterleavedAccessPass.cpp +++ lib/CodeGen/InterleavedAccessPass.cpp @@ -156,12 +156,17 @@ return false; } -/// \brief Check if the mask is RE-interleave mask for an interleaved store. -/// -/// I.e. <0, NumSubElts, ... , NumSubElts*(Factor - 1), 1, NumSubElts + 1, ...> +/// \brief Check if the mask can be used in an interleaved store. +/// +/// It checks for a more general pattern than the RE-interleave mask. +/// I.e. <x, y, ..., z, x+1, y+1, ..., z+1, x+2, y+2, ..., z+2, ...> +/// E.g. For a Factor of 2 (LaneLen=4): <4, 32, 5, 33, 6, 34, 7, 35> +/// E.g. For a Factor of 3 (LaneLen=4): <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19> +/// E.g. For a Factor of 4 (LaneLen=2): <8, 2, 12, 4, 9, 3, 13, 5> /// -/// E.g. The RE-interleave mask (Factor = 2) could be: -/// <0, 4, 1, 5, 2, 6, 3, 7> +/// The particular case of an RE-interleave mask is: +/// I.e. <0, LaneLen, ... , LaneLen*(Factor - 1), 1, LaneLen + 1, ...> +/// E.g. For a Factor of 2 (LaneLen=4): <0, 4, 1, 5, 2, 6, 3, 7> static bool isReInterleaveMask(ArrayRef Mask, unsigned &Factor) { unsigned NumElts = Mask.size(); if (NumElts < 4) @@ -172,21 +177,72 @@ if (NumElts % Factor) continue; - unsigned NumSubElts = NumElts / Factor; - if (!isPowerOf2_32(NumSubElts)) + unsigned LaneLen = NumElts / Factor; + if (!isPowerOf2_32(LaneLen)) continue; - // Check whether each element matchs the RE-interleaved rule. Ignore undef - // elements. - unsigned i = 0; - for (; i < NumElts; i++) - if (Mask[i] >= 0 && - static_cast(Mask[i]) != - (i % Factor) * NumSubElts + i / Factor) + // Check whether each element matches the general interleaved rule. + // Ignore undef elements, as long as the defined elements match the rule. + // Outer loop processes all factors (x, y, z in the above example) + unsigned I = 0, J; + for (; I < Factor; I++) { + unsigned SavedLaneValue; + unsigned SavedNoUndefs = 0; + + // Inner loop processes consecutive accesses (x, x+1... 
in the example) + for (J = 0; J < LaneLen - 1; J++) { + // Lane computes x's position in the Mask + unsigned Lane = J * Factor + I; + unsigned NextLane = Lane + Factor; + int LaneValue = Mask[Lane]; + int NextLaneValue = Mask[NextLane]; + + // If both are defined, values must be sequential + if (LaneValue >= 0 && NextLaneValue >= 0 && + LaneValue + 1 != NextLaneValue) + break; + + // If the next value is undef, save the current one as reference + if (LaneValue >= 0 && NextLaneValue < 0) { + SavedLaneValue = LaneValue; + SavedNoUndefs = 1; + } + + // Undefs are allowed, but defined elements must still be consecutive: + // i.e.: x,..., undef,..., x + 2,..., undef,..., undef,..., x + 5, .... + // Verify this by storing the last non-undef followed by an undef, then + // check that following non-undef masks are incremented with the + // corresponding distance. + if (SavedNoUndefs > 0 && LaneValue < 0) { + SavedNoUndefs++; + if (NextLaneValue >= 0 && + SavedLaneValue + SavedNoUndefs != (unsigned)NextLaneValue) + break; + } + } + + if (J < LaneLen - 1) break; - // Find a RE-interleaved mask of current factor. - if (i == NumElts) + int StartMask = 0; + if (Mask[I] >= 0) { + // StartMask defined by the first value in lane (J=0) + StartMask = Mask[I]; + } else if (Mask[(LaneLen - 1) * Factor + I] >= 0) { + // StartMask defined by the last value in lane + StartMask = Mask[(LaneLen - 1) * Factor + I] - J; + } else if (SavedNoUndefs > 0) { + // StartMask defined by the last non-undef value saved in the J loop + StartMask = SavedLaneValue - (LaneLen - 1 - SavedNoUndefs); + } + // else StartMask remains set to 0, i.e. all elements are undefs + + if (StartMask < 0) + break; + } + + // Found an interleaved mask of current factor. 
+ if (I == Factor) return true; } Index: lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- lib/Target/AArch64/AArch64ISelLowering.cpp +++ lib/Target/AArch64/AArch64ISelLowering.cpp @@ -7152,7 +7152,7 @@ /// /// E.g. Lower an interleaved store (Factor = 3): /// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1, -/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> +/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> /// store <12 x i32> %i.vec, <12 x i32>* %ptr /// /// Into: @@ -7163,6 +7163,17 @@ /// /// Note that the new shufflevectors will be removed and we'll only generate one /// st3 instruction in CodeGen. +/// +/// Example for a more general valid mask (Factor 3). Lower: +/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1, +/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19> +/// store <12 x i32> %i.vec, <12 x i32>* %ptr +/// +/// Into: +/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> %v1, <4, 5, 6, 7> +/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> %v1, <32, 33, 34, 35> +/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> %v1, <16, 17, 18, 19> +/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr) bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const { @@ -7173,9 +7184,9 @@ assert(VecTy->getVectorNumElements() % Factor == 0 && "Invalid interleaved store"); - unsigned NumSubElts = VecTy->getVectorNumElements() / Factor; + unsigned LaneLen = VecTy->getVectorNumElements() / Factor; Type *EltTy = VecTy->getVectorElementType(); - VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts); + VectorType *SubVecTy = VectorType::get(EltTy, LaneLen); const DataLayout &DL = SI->getModule()->getDataLayout(); unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); @@ -7200,7 +7211,7 @@ Op0 = Builder.CreatePtrToInt(Op0, IntVecTy); Op1 = Builder.CreatePtrToInt(Op1, IntVecTy); - SubVecTy = VectorType::get(IntTy, NumSubElts); + SubVecTy = VectorType::get(IntTy, 
LaneLen); } Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace()); @@ -7214,9 +7225,28 @@ SmallVector Ops; // Split the shufflevector operands into sub vectors for the new stN call. - for (unsigned i = 0; i < Factor; i++) - Ops.push_back(Builder.CreateShuffleVector( - Op0, Op1, getSequentialMask(Builder, NumSubElts * i, NumSubElts))); + auto Mask = SVI->getShuffleMask(); + for (unsigned i = 0; i < Factor; i++) { + if (Mask[i] >= 0) { + Ops.push_back(Builder.CreateShuffleVector( + Op0, Op1, getSequentialMask(Builder, Mask[i], LaneLen))); + } else { + unsigned StartMask = 0; + for (unsigned j = 1; j < LaneLen; j++) { + if (Mask[j*Factor + i] >= 0) { + StartMask = Mask[j*Factor + i] - j; + break; + } + } + // Note: If all elements in a chunk are undefs, StartMask=0! + // Note: Filling undef gaps with random elements is ok, since + // those elements were being written anyway (with undefs). + // In the case of all undefs we're defaulting to using elems from 0 + // Note: StartMask cannot be negative, it's checked in isReInterleaveMask + Ops.push_back(Builder.CreateShuffleVector( + Op0, Op1, getSequentialMask(Builder, StartMask, LaneLen))); + } + } Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), PtrTy)); Builder.CreateCall(StNFunc, Ops); Index: lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- lib/Target/ARM/ARMISelLowering.cpp +++ lib/Target/ARM/ARMISelLowering.cpp @@ -13099,6 +13099,17 @@ /// /// Note that the new shufflevectors will be removed and we'll only generate one /// vst3 instruction in CodeGen. +/// +/// Example for a more general valid mask (Factor 3). 
Lower: +/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1, +/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19> +/// store <12 x i32> %i.vec, <12 x i32>* %ptr +/// +/// Into: +/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> %v1, <4, 5, 6, 7> +/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> %v1, <32, 33, 34, 35> +/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> %v1, <16, 17, 18, 19> +/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4) bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const { @@ -13109,9 +13120,9 @@ assert(VecTy->getVectorNumElements() % Factor == 0 && "Invalid interleaved store"); - unsigned NumSubElts = VecTy->getVectorNumElements() / Factor; + unsigned LaneLen = VecTy->getVectorNumElements() / Factor; Type *EltTy = VecTy->getVectorElementType(); - VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts); + VectorType *SubVecTy = VectorType::get(EltTy, LaneLen); const DataLayout &DL = SI->getModule()->getDataLayout(); unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); @@ -13138,7 +13149,7 @@ Op0 = Builder.CreatePtrToInt(Op0, IntVecTy); Op1 = Builder.CreatePtrToInt(Op1, IntVecTy); - SubVecTy = VectorType::get(IntTy, NumSubElts); + SubVecTy = VectorType::get(IntTy, LaneLen); } static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2, @@ -13154,9 +13165,28 @@ SI->getModule(), StoreInts[Factor - 2], Tys); // Split the shufflevector operands into sub vectors for the new vstN call. 
- for (unsigned i = 0; i < Factor; i++) - Ops.push_back(Builder.CreateShuffleVector( - Op0, Op1, getSequentialMask(Builder, NumSubElts * i, NumSubElts))); + auto Mask = SVI->getShuffleMask(); + for (unsigned i = 0; i < Factor; i++) { + if (Mask[i] >= 0) { + Ops.push_back(Builder.CreateShuffleVector( + Op0, Op1, getSequentialMask(Builder, Mask[i], LaneLen))); + } else { + unsigned StartMask = 0; + for (unsigned j = 1; j < LaneLen; j++) { + if (Mask[j*Factor + i] >= 0) { + StartMask = Mask[j*Factor + i] - j; + break; + } + } + // Note: If all elements in a chunk are undefs, StartMask=0! + // Note: Filling undef gaps with random elements is ok, since + // those elements were being written anyway (with undefs). + // In the case of all undefs we're defaulting to using elems from 0 + // Note: StartMask cannot be negative, it's checked in isReInterleaveMask + Ops.push_back(Builder.CreateShuffleVector( + Op0, Op1, getSequentialMask(Builder, StartMask, LaneLen))); + } + } Ops.push_back(Builder.getInt32(SI->getAlignment())); Builder.CreateCall(VstNFunc, Ops); Index: test/CodeGen/AArch64/aarch64-interleaved-accesses.ll =================================================================== --- test/CodeGen/AArch64/aarch64-interleaved-accesses.ll +++ test/CodeGen/AArch64/aarch64-interleaved-accesses.ll @@ -280,3 +280,114 @@ %3 = extractelement <8 x i32> %1, i32 2 ret i32 %3 } + +; NEON-LABEL: store_general_mask_factor4: +; NEON: st4 { v3.2s, v4.2s, v5.2s, v6.2s }, [x0] +; NONEON-LABEL: store_general_mask_factor4: +; NONEON-NOT: st4 +define void @store_general_mask_factor4(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) { + %base = bitcast i32* %ptr to <8 x i32>* + %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> + store <8 x i32> %i.vec, <8 x i32>* %base, align 4 + ret void +} + +; NEON-LABEL: store_general_mask_factor4_undefbeg: +; NEON: st4 { v3.2s, v4.2s, v5.2s, v6.2s }, [x0] +; NONEON-LABEL: store_general_mask_factor4_undefbeg: +; NONEON-NOT: st4 +define void 
@store_general_mask_factor4_undefbeg(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) { + %base = bitcast i32* %ptr to <8 x i32>* + %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> + store <8 x i32> %i.vec, <8 x i32>* %base, align 4 + ret void +} + +; NEON-LABEL: store_general_mask_factor4_undefend: +; NEON: st4 { v3.2s, v4.2s, v5.2s, v6.2s }, [x0] +; NONEON-LABEL: store_general_mask_factor4_undefend: +; NONEON-NOT: st4 +define void @store_general_mask_factor4_undefend(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) { + %base = bitcast i32* %ptr to <8 x i32>* + %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> + store <8 x i32> %i.vec, <8 x i32>* %base, align 4 + ret void +} + +; NEON-LABEL: store_general_mask_factor4_undefmid: +; NEON: st4 { v3.2s, v4.2s, v5.2s, v6.2s }, [x0] +; NONEON-LABEL: store_general_mask_factor4_undefmid: +; NONEON-NOT: st4 +define void @store_general_mask_factor4_undefmid(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) { + %base = bitcast i32* %ptr to <8 x i32>* + %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> + store <8 x i32> %i.vec, <8 x i32>* %base, align 4 + ret void +} + +; NEON-LABEL: store_general_mask_factor4_undefmulti: +; NEON: st4 { v2.2s, v3.2s, v4.2s, v5.2s }, [x0] +; NONEON-LABEL: store_general_mask_factor4_undefmulti: +; NONEON-NOT: st4 +define void @store_general_mask_factor4_undefmulti(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) { + %base = bitcast i32* %ptr to <8 x i32>* + %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> + store <8 x i32> %i.vec, <8 x i32>* %base, align 4 + ret void +} + +; NEON-LABEL: store_general_mask_factor3: +; NEON: st3 { v2.4s, v3.4s, v4.4s }, [x0] +; NONEON-LABEL: store_general_mask_factor3: +; NONEON-NOT: st3 +define void @store_general_mask_factor3(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) { + %base = bitcast i32* %ptr to <12 x i32>* + %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> + store <12 x i32> %i.vec, <12 x i32>* %base, 
align 4 + ret void +} + +; NEON-LABEL: store_general_mask_factor3_undefmultimid: +; NEON: st3 { v2.4s, v3.4s, v4.4s }, [x0] +; NONEON-LABEL: store_general_mask_factor3_undefmultimid: +; NONEON-NOT: st3 +define void @store_general_mask_factor3_undefmultimid(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) { + %base = bitcast i32* %ptr to <12 x i32>* + %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> + store <12 x i32> %i.vec, <12 x i32>* %base, align 4 + ret void +} + +; NEON-LABEL: store_general_mask_factor3_undef_fail: +; NEON-NOT: st3 +; NONEON-LABEL: store_general_mask_factor3_undef_fail: +; NONEON-NOT: st3 +define void @store_general_mask_factor3_undef_fail(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) { + %base = bitcast i32* %ptr to <12 x i32>* + %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> + store <12 x i32> %i.vec, <12 x i32>* %base, align 4 + ret void +} + +; NEON-LABEL: store_general_mask_factor3_undeflane: +; NEON: st3 { v1.4s, v2.4s, v3.4s }, [x0] +; NONEON-LABEL: store_general_mask_factor3_undeflane: +; NONEON-NOT: st3 +define void @store_general_mask_factor3_undeflane(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) { + %base = bitcast i32* %ptr to <12 x i32>* + %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> + store <12 x i32> %i.vec, <12 x i32>* %base, align 4 + ret void +} + +; NEON-LABEL: store_general_mask_factor3_negativestart: +; NEON-NOT: st3 +; NONEON-LABEL: store_general_mask_factor3_negativestart: +; NONEON-NOT: st3 +define void @store_general_mask_factor3_negativestart(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) { + %base = bitcast i32* %ptr to <12 x i32>* + %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> + store <12 x i32> %i.vec, <12 x i32>* %base, align 4 + ret void +} + Index: test/CodeGen/ARM/arm-interleaved-accesses.ll =================================================================== --- test/CodeGen/ARM/arm-interleaved-accesses.ll +++ 
test/CodeGen/ARM/arm-interleaved-accesses.ll @@ -316,3 +316,147 @@ %3 = extractelement <8 x i32> %1, i32 2 ret i32 %3 } + +; NEON-LABEL: store_general_mask_factor4: +; NEON: vst4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r0] +; NONEON-LABEL: store_general_mask_factor4: +; NONEON-NOT: vst4.32 +define void @store_general_mask_factor4(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) { + %base = bitcast i32* %ptr to <8 x i32>* + %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> + store <8 x i32> %i.vec, <8 x i32>* %base, align 4 + ret void +} + +; NEON-LABEL: store_general_mask_factor4_undefbeg: +; NEON: vst4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r0] +; NONEON-LABEL: store_general_mask_factor4_undefbeg: +; NONEON-NOT: vst4.32 +define void @store_general_mask_factor4_undefbeg(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) { + %base = bitcast i32* %ptr to <8 x i32>* + %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> + store <8 x i32> %i.vec, <8 x i32>* %base, align 4 + ret void +} + +; NEON-LABEL: store_general_mask_factor4_undefend: +; NEON: vst4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r0] +; NONEON-LABEL: store_general_mask_factor4_undefend: +; NONEON-NOT: vst4.32 +define void @store_general_mask_factor4_undefend(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) { + %base = bitcast i32* %ptr to <8 x i32>* + %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> + store <8 x i32> %i.vec, <8 x i32>* %base, align 4 + ret void +} + +; NEON-LABEL: store_general_mask_factor4_undefmid: +; NEON: vst4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r0] +; NONEON-LABEL: store_general_mask_factor4_undefmid: +; NONEON-NOT: vst4.32 +define void @store_general_mask_factor4_undefmid(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) { + %base = bitcast i32* %ptr to <8 x i32>* + %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> + store <8 x i32> %i.vec, <8 x i32>* %base, align 4 + ret void 
+} + +; NEON-LABEL: store_general_mask_factor4_undefmulti: +; NEON: vst4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r0] +; NONEON-LABEL: store_general_mask_factor4_undefmulti: +; NONEON-NOT: vst4.32 +define void @store_general_mask_factor4_undefmulti(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) { + %base = bitcast i32* %ptr to <8 x i32>* + %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <8 x i32> + store <8 x i32> %i.vec, <8 x i32>* %base, align 4 + ret void +} + +; NEON-LABEL: store_general_mask_factor3: +; NEON: vst3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r0] +; NONEON-LABEL: store_general_mask_factor3: +; NONEON-NOT: vst3.32 +define void @store_general_mask_factor3(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) { + %base = bitcast i32* %ptr to <12 x i32>* + %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> + store <12 x i32> %i.vec, <12 x i32>* %base, align 4 + ret void +} + +; NEON-LABEL: store_general_mask_factor3_undefmultimid: +; NEON: vst3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r0] +; NONEON-LABEL: store_general_mask_factor3_undefmultimid: +; NONEON-NOT: vst3.32 +define void @store_general_mask_factor3_undefmultimid(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) { + %base = bitcast i32* %ptr to <12 x i32>* + %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> + store <12 x i32> %i.vec, <12 x i32>* %base, align 4 + ret void +} + +; NEON-LABEL: store_general_mask_factor3_undef_fail: +; NEON-NOT: vst3.32 +; NONEON-LABEL: store_general_mask_factor3_undef_fail: +; NONEON-NOT: vst3.32 +define void @store_general_mask_factor3_undef_fail(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) { + %base = bitcast i32* %ptr to <12 x i32>* + %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> + store <12 x i32> %i.vec, <12 x i32>* %base, align 4 + ret void +} + +; NEON-LABEL: store_general_mask_factor3_undeflane: +; NEON: vst3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r0] +; NONEON-LABEL: 
store_general_mask_factor3_undeflane: +; NONEON-NOT: vst3.32 +define void @store_general_mask_factor3_undeflane(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) { + %base = bitcast i32* %ptr to <12 x i32>* + %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> + store <12 x i32> %i.vec, <12 x i32>* %base, align 4 + ret void +} + +; NEON-LABEL: store_general_mask_factor3_endstart_fail: +; NEON-NOT: vst3.32 +; NONEON-LABEL: store_general_mask_factor3_endstart_fail: +; NONEON-NOT: vst3.32 +define void @store_general_mask_factor3_endstart_fail(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) { + %base = bitcast i32* %ptr to <12 x i32>* + %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> + store <12 x i32> %i.vec, <12 x i32>* %base, align 4 + ret void +} + +; NEON-LABEL: store_general_mask_factor3_endstart_pass: +; NEON: vst3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r0] +; NONEON-LABEL: store_general_mask_factor3_endstart_pass: +; NONEON-NOT: vst3.32 +define void @store_general_mask_factor3_endstart_pass(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) { + %base = bitcast i32* %ptr to <12 x i32>* + %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> + store <12 x i32> %i.vec, <12 x i32>* %base, align 4 + ret void +} + +; NEON-LABEL: store_general_mask_factor3_midstart_fail: +; NEON-NOT: vst3.32 +; NONEON-LABEL: store_general_mask_factor3_midstart_fail: +; NONEON-NOT: vst3.32 +define void @store_general_mask_factor3_midstart_fail(i32* %ptr, <32 x i32> %v0, <32 x i32> %v1) { + %base = bitcast i32* %ptr to <12 x i32>* + %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> + store <12 x i32> %i.vec, <12 x i32>* %base, align 4 + ret void +} + +; NEON-LABEL: store_general_mask_factor3_midstart_pass: +; NEON: vst3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r0] +; NONEON-LABEL: store_general_mask_factor3_midstart_pass: +; NONEON-NOT: vst3.32 +define void @store_general_mask_factor3_midstart_pass(i32* %ptr, <32 x i32> %v0, <32 x i32> 
%v1) { + %base = bitcast i32* %ptr to <12 x i32>* + %i.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> + store <12 x i32> %i.vec, <12 x i32>* %base, align 4 + ret void +} +