Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7275,9 +7275,16 @@
   unsigned VecSize = DL.getTypeSizeInBits(VecTy);
 
   // Skip if we do not have NEON and skip illegal vector types.
-  if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128))
+  if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize % 128 != 0))
     return false;
 
+  // We can "legalize" wide vector types into multiple interleaved loads as
+  // long as the vector types are divisible by 128. The code below determines
+  // the number of loads we will generate.
+  unsigned NumLoads = 1;
+  if (VecSize > 128)
+    NumLoads = VecSize / 128;
+
   // A pointer vector can not be the return type of the ldN intrinsics. Need to
   // load integer vectors first and then convert to pointer vectors.
   Type *EltTy = VecTy->getVectorElementType();
@@ -7285,6 +7292,25 @@
     VecTy = VectorType::get(DL.getIntPtrType(EltTy),
                             VecTy->getVectorNumElements());
 
+  IRBuilder<> Builder(LI);
+
+  // The base address of the load.
+  Value *BaseAddr = LI->getPointerOperand();
+
+  if (NumLoads > 1) {
+    // If we're going to generate more than one load, reset the sub-vector type
+    // to something legal.
+    VecTy = VectorType::get(VecTy->getVectorElementType(),
+                            VecTy->getVectorNumElements() / NumLoads);
+
+    // We will compute the pointer operand of each load from the original base
+    // address using GEPs. Cast the base address to a pointer to the scalar
+    // element type.
+    BaseAddr = Builder.CreateBitCast(
+        BaseAddr, VecTy->getVectorElementType()->getPointerTo(
+                      LI->getPointerAddressSpace()));
+  }
+
   Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace());
   Type *Tys[2] = {VecTy, PtrTy};
   static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2,
@@ -7293,24 +7319,46 @@
   Function *LdNFunc =
       Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
 
-  IRBuilder<> Builder(LI);
-  Value *Ptr = Builder.CreateBitCast(LI->getPointerOperand(), PtrTy);
+  // Holds sub-vectors extracted from the load intrinsic return values. The
+  // sub-vectors are associated with the shufflevector instructions they will
+  // replace.
+  DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
 
-  CallInst *LdN = Builder.CreateCall(LdNFunc, Ptr, "ldN");
+  for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
 
-  // Replace uses of each shufflevector with the corresponding vector loaded
-  // by ldN.
-  for (unsigned i = 0; i < Shuffles.size(); i++) {
-    ShuffleVectorInst *SVI = Shuffles[i];
-    unsigned Index = Indices[i];
+    // If we're generating more than one load, compute the base address of
+    // subsequent loads as an offset from the previous.
+    if (LoadCount > 0)
+      BaseAddr = Builder.CreateConstGEP1_32(
+          BaseAddr, VecTy->getVectorNumElements() * Factor);
 
-    Value *SubVec = Builder.CreateExtractValue(LdN, Index);
+    CallInst *LdN = Builder.CreateCall(
+        LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN");
 
-    // Convert the integer vector to pointer vector if the element is pointer.
-    if (EltTy->isPointerTy())
-      SubVec = Builder.CreateIntToPtr(SubVec, SVI->getType());
+    // Extract and store the sub-vectors returned by the load intrinsic.
+    for (unsigned i = 0; i < Shuffles.size(); i++) {
+      ShuffleVectorInst *SVI = Shuffles[i];
+      unsigned Index = Indices[i];
 
-    SVI->replaceAllUsesWith(SubVec);
+      Value *SubVec = Builder.CreateExtractValue(LdN, Index);
+
+      // Convert the integer vector to pointer vector if the element is pointer.
+      if (EltTy->isPointerTy())
+        SubVec = Builder.CreateIntToPtr(SubVec, SVI->getType());
+
+      SubVecs[SVI].push_back(SubVec);
+    }
+  }
+
+  // Replace uses of the shufflevector instructions with the sub-vectors
+  // returned by the load intrinsic. If a shufflevector instruction is
+  // associated with more than one sub-vector, those sub-vectors will be
+  // concatenated into a single wide vector.
+  for (ShuffleVectorInst *SVI : Shuffles) {
+    auto &SubVec = SubVecs[SVI];
+    auto *WideVec =
+        SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
+    SVI->replaceAllUsesWith(WideVec);
   }
 
   return true;
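The chunking arithmetic introduced above is easiest to follow with concrete numbers. Below is a minimal standalone sketch (plain C++, not part of the patch; all names and the worked example are illustrative) of how the number of ldN calls and the element offset between them fall out of the 128-bit splitting for the <16 x i32>, factor-2 load exercised by the tests further down. The store hunk that follows uses the same arithmetic with NumStores in place of NumLoads.

// Standalone model (not LLVM code) of the splitting arithmetic above, worked
// for the <16 x i32> factor-2 load from the tests: each de-interleaved result
// is <8 x i32> (256 bits), so two ld2 calls are emitted, and the second one
// starts NumSubElts * Factor scalar elements past the base pointer.
#include <cassert>

int main() {
  unsigned Factor = 2;   // ld2
  unsigned NumElts = 8;  // lanes in each de-interleaved result (<8 x i32>)
  unsigned EltBits = 32; // i32 elements
  unsigned VecSize = NumElts * EltBits;                  // 256 bits

  unsigned NumLoads = VecSize > 128 ? VecSize / 128 : 1; // 2 chunks
  unsigned NumSubElts = NumElts / NumLoads;              // <4 x i32> per ld2

  // Element offset applied by the GEP before every chunk after the first.
  unsigned ChunkStride = NumSubElts * Factor;            // 8 i32s = 32 bytes

  assert(NumLoads == 2 && NumSubElts == 4 && ChunkStride == 8);
  return 0;
}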
@@ -7360,9 +7408,16 @@
   unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
 
   // Skip if we do not have NEON and skip illegal vector types.
-  if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128))
+  if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize % 128 != 0))
     return false;
 
+  // We can "legalize" wide vector types into multiple interleaved stores as
+  // long as the vector types are divisible by 128. The code below determines
+  // the number of stores we will generate.
+  unsigned NumStores = 1;
+  if (SubVecSize > 128)
+    NumStores = SubVecSize / 128;
+
   Value *Op0 = SVI->getOperand(0);
   Value *Op1 = SVI->getOperand(1);
   IRBuilder<> Builder(SI);
@@ -7382,6 +7437,25 @@
     SubVecTy = VectorType::get(IntTy, LaneLen);
   }
 
+  // The base address of the store.
+  Value *BaseAddr = SI->getPointerOperand();
+
+  if (NumStores > 1) {
+    // If we're going to generate more than one store, reset the lane length
+    // and sub-vector type to something legal.
+    LaneLen /= NumStores;
+    SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen);
+
+    // We will compute the pointer operand of each store from the original base
+    // address using GEPs. Cast the base address to a pointer to the scalar
+    // element type.
+    BaseAddr = Builder.CreateBitCast(
+        BaseAddr, SubVecTy->getVectorElementType()->getPointerTo(
+                      SI->getPointerAddressSpace()));
+  }
+
+  auto Mask = SVI->getShuffleMask();
+
   Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace());
   Type *Tys[2] = {SubVecTy, PtrTy};
   static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2,
@@ -7390,34 +7464,43 @@
   Function *StNFunc =
       Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);
 
-  SmallVector<Value *, 5> Ops;
+  for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
 
-  // Split the shufflevector operands into sub vectors for the new stN call.
-  auto Mask = SVI->getShuffleMask();
-  for (unsigned i = 0; i < Factor; i++) {
-    if (Mask[i] >= 0) {
-      Ops.push_back(Builder.CreateShuffleVector(
-          Op0, Op1, createSequentialMask(Builder, Mask[i], LaneLen, 0)));
-    } else {
-      unsigned StartMask = 0;
-      for (unsigned j = 1; j < LaneLen; j++) {
-        if (Mask[j*Factor + i] >= 0) {
-          StartMask = Mask[j*Factor + i] - j;
-          break;
+    SmallVector<Value *, 5> Ops;
+
+    // Split the shufflevector operands into sub vectors for the new stN call.
+    for (unsigned i = 0; i < Factor; i++) {
+      unsigned IdxI = StoreCount * LaneLen * Factor + i;
+      if (Mask[IdxI] >= 0) {
+        Ops.push_back(Builder.CreateShuffleVector(
+            Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
+      } else {
+        unsigned StartMask = 0;
+        for (unsigned j = 1; j < LaneLen; j++) {
+          unsigned IdxJ = StoreCount * LaneLen * Factor + j;
+          if (Mask[IdxJ * Factor + IdxI] >= 0) {
+            StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
+            break;
+          }
         }
+        // Note: If all elements in a chunk are undefs, StartMask=0!
+        // Note: Filling undef gaps with random elements is ok, since those
+        // elements were being written anyway (with undefs). In the case of
+        // all undefs we're defaulting to using elems from 0
+        // Note: StartMask cannot be negative, it's checked in
+        // isReInterleaveMask
+        Ops.push_back(Builder.CreateShuffleVector(
+            Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
       }
-      // Note: If all elements in a chunk are undefs, StartMask=0!
-      // Note: Filling undef gaps with random elements is ok, since
-      // those elements were being written anyway (with undefs).
-      // In the case of all undefs we're defaulting to using elems from 0
-      // Note: StartMask cannot be negative, it's checked in isReInterleaveMask
-      Ops.push_back(Builder.CreateShuffleVector(
-          Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
-    }
 
-  Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), PtrTy));
-  Builder.CreateCall(StNFunc, Ops);
+    // If we're generating more than one store, compute the base address of
+    // subsequent stores as an offset from the previous.
+    if (StoreCount > 0)
+      BaseAddr = Builder.CreateConstGEP1_32(BaseAddr, LaneLen * Factor);
+
+    Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
+    Builder.CreateCall(StNFunc, Ops);
+  }
 
   return true;
 }
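The store side slices the original interleave mask into one sequential mask per stN operand and per 128-bit chunk. The following standalone sketch (plain C++, not part of the patch; names are illustrative, and the undef fallback is simplified to its common case) reproduces the index math: for chunk StoreCount and field i, the operand is a LaneLen-wide slice of the concatenated (Op0, Op1) value starting at Mask[StoreCount * LaneLen * Factor + i].

// Standalone model of how the store path picks the operands of each stN call.
#include <cstdio>
#include <vector>

int main() {
  // <16 x i32> store interleaved with factor 2: mask = 0,8,1,9,...,7,15.
  unsigned Factor = 2, LaneLenWide = 8, NumStores = 2;
  unsigned LaneLen = LaneLenWide / NumStores; // 4 after splitting
  std::vector<int> Mask;
  for (unsigned j = 0; j < LaneLenWide; ++j) {
    Mask.push_back(j);     // lane taken from Op0
    Mask.push_back(8 + j); // lane taken from Op1
  }

  for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount)
    for (unsigned i = 0; i < Factor; ++i) {
      unsigned IdxI = StoreCount * LaneLen * Factor + i;
      // The patch scans later lanes when this one is undef (-1); simplified
      // here to the defined-lane case.
      int Start = Mask[IdxI] >= 0 ? Mask[IdxI] : 0;
      // Sequential mask <Start, Start+1, ..., Start+LaneLen-1>, as produced
      // by createSequentialMask in the patch.
      std::printf("st%u chunk %u, operand %u: lanes %d..%d\n", Factor,
                  StoreCount, i, Start, Start + (int)LaneLen - 1);
    }
  return 0;
}

For this factor-2 example the sketch prints lane ranges 0..3 and 8..11 for the first st2 and 4..7 and 12..15 for the second, i.e. the two halves of each input vector are written by separate stN calls.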
Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -13300,44 +13300,94 @@
   // Skip if we do not have NEON and skip illegal vector types and vector types
   // with i64/f64 elements (vldN doesn't support i64/f64 elements).
-  if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128) || EltIs64Bits)
+  if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize % 128 != 0) ||
+      EltIs64Bits)
     return false;
 
+  // We can "legalize" wide vector types into multiple interleaved loads as
+  // long as the vector types are divisible by 128. The code below determines
+  // the number of loads we will generate.
+  unsigned NumLoads = 1;
+  if (VecSize > 128)
+    NumLoads = VecSize / 128;
+
   // A pointer vector can not be the return type of the ldN intrinsics. Need to
   // load integer vectors first and then convert to pointer vectors.
   if (EltTy->isPointerTy())
     VecTy = VectorType::get(DL.getIntPtrType(EltTy),
                             VecTy->getVectorNumElements());
 
+  IRBuilder<> Builder(LI);
+
+  // The base address of the load.
+  Value *BaseAddr = LI->getPointerOperand();
+
+  if (NumLoads > 1) {
+    // If we're going to generate more than one load, reset the sub-vector type
+    // to something legal.
+    VecTy = VectorType::get(VecTy->getVectorElementType(),
+                            VecTy->getVectorNumElements() / NumLoads);
+
+    // We will compute the pointer operand of each load from the original base
+    // address using GEPs. Cast the base address to a pointer to the scalar
+    // element type.
+    BaseAddr = Builder.CreateBitCast(
+        BaseAddr, VecTy->getVectorElementType()->getPointerTo(
+                      LI->getPointerAddressSpace()));
+  }
+
+  Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
+  Type *Tys[] = {VecTy, Int8Ptr};
   static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
                                             Intrinsic::arm_neon_vld3,
                                             Intrinsic::arm_neon_vld4};
+  Function *VldnFunc =
+      Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
 
-  IRBuilder<> Builder(LI);
-  SmallVector<Value *, 2> Ops;
+  // Holds sub-vectors extracted from the load intrinsic return values. The
+  // sub-vectors are associated with the shufflevector instructions they will
+  // replace.
+  DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
 
-  Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
-  Ops.push_back(Builder.CreateBitCast(LI->getPointerOperand(), Int8Ptr));
-  Ops.push_back(Builder.getInt32(LI->getAlignment()));
+  for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
 
-  Type *Tys[] = { VecTy, Int8Ptr };
-  Function *VldnFunc =
-      Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
-  CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");
+    // If we're generating more than one load, compute the base address of
+    // subsequent loads as an offset from the previous.
+    if (LoadCount > 0)
+      BaseAddr = Builder.CreateConstGEP1_32(
+          BaseAddr, VecTy->getVectorNumElements() * Factor);
 
-  // Replace uses of each shufflevector with the corresponding vector loaded
-  // by ldN.
-  for (unsigned i = 0; i < Shuffles.size(); i++) {
-    ShuffleVectorInst *SV = Shuffles[i];
-    unsigned Index = Indices[i];
+    SmallVector<Value *, 2> Ops;
+    Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
+    Ops.push_back(Builder.getInt32(LI->getAlignment()));
 
-    Value *SubVec = Builder.CreateExtractValue(VldN, Index);
+    CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");
 
-    // Convert the integer vector to pointer vector if the element is pointer.
-    if (EltTy->isPointerTy())
-      SubVec = Builder.CreateIntToPtr(SubVec, SV->getType());
+    // Replace uses of each shufflevector with the corresponding vector loaded
+    // by ldN.
+    for (unsigned i = 0; i < Shuffles.size(); i++) {
+      ShuffleVectorInst *SV = Shuffles[i];
+      unsigned Index = Indices[i];
+
+      Value *SubVec = Builder.CreateExtractValue(VldN, Index);
+
+      // Convert the integer vector to pointer vector if the element is pointer.
+      if (EltTy->isPointerTy())
+        SubVec = Builder.CreateIntToPtr(SubVec, SV->getType());
+
+      SubVecs[SV].push_back(SubVec);
+    }
+  }
 
-    SV->replaceAllUsesWith(SubVec);
+  // Replace uses of the shufflevector instructions with the sub-vectors
+  // returned by the load intrinsic. If a shufflevector instruction is
+  // associated with more than one sub-vector, those sub-vectors will be
+  // concatenated into a single wide vector.
+  for (ShuffleVectorInst *SVI : Shuffles) {
+    auto &SubVec = SubVecs[SVI];
+    auto *WideVec =
+        SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
+    SVI->replaceAllUsesWith(WideVec);
   }
 
   return true;
@@ -13389,10 +13439,17 @@
   // Skip if we do not have NEON and skip illegal vector types and vector types
   // with i64/f64 elements (vstN doesn't support i64/f64 elements).
-  if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128) ||
+  if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize % 128 != 0) ||
       EltIs64Bits)
     return false;
 
+  // We can "legalize" wide vector types into multiple interleaved stores as
+  // long as the vector types are divisible by 128. The code below determines
+  // the number of stores we will generate.
+  unsigned NumStores = 1;
+  if (SubVecSize > 128)
+    NumStores = SubVecSize / 128;
+
   Value *Op0 = SVI->getOperand(0);
   Value *Op1 = SVI->getOperand(1);
   IRBuilder<> Builder(SI);
@@ -13411,44 +13468,72 @@
     SubVecTy = VectorType::get(IntTy, LaneLen);
   }
 
+  // The base address of the store.
+  Value *BaseAddr = SI->getPointerOperand();
+
+  if (NumStores > 1) {
+    // If we're going to generate more than one store, reset the lane length
+    // and sub-vector type to something legal.
+    LaneLen /= NumStores;
+    SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen);
+
+    // We will compute the pointer operand of each store from the original base
+    // address using GEPs. Cast the base address to a pointer to the scalar
+    // element type.
+    BaseAddr = Builder.CreateBitCast(
+        BaseAddr, SubVecTy->getVectorElementType()->getPointerTo(
+                      SI->getPointerAddressSpace()));
+  }
+
+  auto Mask = SVI->getShuffleMask();
+
+  Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
+  Type *Tys[] = {Int8Ptr, SubVecTy};
   static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
                                              Intrinsic::arm_neon_vst3,
                                              Intrinsic::arm_neon_vst4};
 
-  SmallVector<Value *, 6> Ops;
-  Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
-  Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), Int8Ptr));
+  for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
 
-  Type *Tys[] = { Int8Ptr, SubVecTy };
-  Function *VstNFunc = Intrinsic::getDeclaration(
-      SI->getModule(), StoreInts[Factor - 2], Tys);
+    // If we're generating more than one store, compute the base address of
+    // subsequent stores as an offset from the previous.
+    if (StoreCount > 0)
+      BaseAddr = Builder.CreateConstGEP1_32(BaseAddr, LaneLen * Factor);
 
-  // Split the shufflevector operands into sub vectors for the new vstN call.
-  auto Mask = SVI->getShuffleMask();
-  for (unsigned i = 0; i < Factor; i++) {
-    if (Mask[i] >= 0) {
-      Ops.push_back(Builder.CreateShuffleVector(
-          Op0, Op1, createSequentialMask(Builder, Mask[i], LaneLen, 0)));
-    } else {
-      unsigned StartMask = 0;
-      for (unsigned j = 1; j < LaneLen; j++) {
-        if (Mask[j*Factor + i] >= 0) {
-          StartMask = Mask[j*Factor + i] - j;
-          break;
+    SmallVector<Value *, 6> Ops;
+    Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
+
+    Function *VstNFunc =
+        Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);
+
+    // Split the shufflevector operands into sub vectors for the new vstN call.
+    for (unsigned i = 0; i < Factor; i++) {
+      unsigned IdxI = StoreCount * LaneLen * Factor + i;
+      if (Mask[IdxI] >= 0) {
+        Ops.push_back(Builder.CreateShuffleVector(
+            Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
+      } else {
+        unsigned StartMask = 0;
+        for (unsigned j = 1; j < LaneLen; j++) {
+          unsigned IdxJ = StoreCount * LaneLen * Factor + j;
+          if (Mask[IdxJ * Factor + IdxI] >= 0) {
+            StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
+            break;
+          }
        }
+        // Note: If all elements in a chunk are undefs, StartMask=0!
+        // Note: Filling undef gaps with random elements is ok, since those
+        // elements were being written anyway (with undefs). In the case of
+        // all undefs we're defaulting to using elems from 0
+        // Note: StartMask cannot be negative, it's checked in
+        // isReInterleaveMask
+        Ops.push_back(Builder.CreateShuffleVector(
+            Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
       }
-      // Note: If all elements in a chunk are undefs, StartMask=0!
-      // Note: Filling undef gaps with random elements is ok, since
-      // those elements were being written anyway (with undefs).
-      // In the case of all undefs we're defaulting to using elems from 0
-      // Note: StartMask cannot be negative, it's checked in isReInterleaveMask
-      Ops.push_back(Builder.CreateShuffleVector(
-          Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
-    }
-  }
 
-  Ops.push_back(Builder.getInt32(SI->getAlignment()));
-  Builder.CreateCall(VstNFunc, Ops);
+    Ops.push_back(Builder.getInt32(SI->getAlignment()));
+    Builder.CreateCall(VstNFunc, Ops);
+  }
 
   return true;
 }
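On the load side, the per-chunk results collected in SubVecs are glued back together by concatenateVectors, which emits a single shufflevector whose mask is simply 0..N-1 over the two chunk results; that is the final pair of <8 x i32> shuffles checked in each of the wide-load tests below. A small standalone model (plain C++, illustrative only, not LLVM code):

// Two <4 x i32> sub-vectors taken from consecutive ldN/vldN calls are joined
// back into the original <8 x i32> de-interleaved value with a sequential
// shuffle mask (0..7). The element values are made up for illustration.
#include <array>
#include <cstdio>

int main() {
  std::array<int, 4> Lo = {0, 2, 4, 6};    // chunk 0 of a factor-2 field
  std::array<int, 4> Hi = {8, 10, 12, 14}; // chunk 1 of the same field

  std::array<int, 8> Wide;
  for (unsigned i = 0; i < 8; ++i) // mask element i is simply i
    Wide[i] = i < 4 ? Lo[i] : Hi[i - 4];

  for (int V : Wide)
    std::printf("%d ", V); // 0 2 4 6 8 10 12 14
  std::printf("\n");
  return 0;
}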
Index: test/Transforms/InterleavedAccess/AArch64/interleaved-accesses.ll
===================================================================
--- test/Transforms/InterleavedAccess/AArch64/interleaved-accesses.ll
+++ test/Transforms/InterleavedAccess/AArch64/interleaved-accesses.ll
@@ -565,3 +565,164 @@
   store <4 x float> %v0, <4 x float>* @g, align 16
   ret void
 }
+
+define void @load_factor2_wide(<16 x i32>* %ptr) {
+; NEON-LABEL: @load_factor2_wide(
+; NEON-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i32*
+; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
+; NEON-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 1
+; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 0
+; NEON-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
+; NEON-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
+; NEON-NEXT: [[LDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP6]])
+; NEON-NEXT: [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN1]], 1
+; NEON-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN1]], 0
+; NEON-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP7]], <8 x i32>
+; NEON-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP8]], <8 x i32>
+; NEON-NEXT: ret void
+; NO_NEON-LABEL: @load_factor2_wide(
+; NO_NEON-NOT: @llvm.aarch64.neon
+; NO_NEON: ret void
+;
+  %interleaved.vec = load <16 x i32>, <16 x i32>* %ptr, align 4
+  %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <8 x i32>
+  %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <8 x i32>
+  ret void
+}
+
+define void @load_factor3_wide(<24 x i32>* %ptr) {
+; NEON-LABEL: @load_factor3_wide(
+; NEON-NEXT: [[TMP1:%.*]] = bitcast <24 x i32>* %ptr to i32*
+; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
+; NEON-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 2
+; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 1
+; NEON-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 0
+; NEON-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12
+; NEON-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; NEON-NEXT: [[LDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0v4i32(<4 x i32>* [[TMP7]])
+; NEON-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 2
+; NEON-NEXT: [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 1
+; NEON-NEXT: [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 0
+; NEON-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP8]], <8 x i32>
+; NEON-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP9]], <8 x i32>
+; NEON-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32>
+; NEON-NEXT: ret void
+; NO_NEON-LABEL: @load_factor3_wide(
+; NO_NEON-NOT: @llvm.aarch64.neon
+; NO_NEON: ret void
+;
+  %interleaved.vec = load <24 x i32>, <24 x i32>* %ptr, align 4
+  %v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32>
+  %v1 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32>
+  %v2 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32>
+  ret void
+}
+
+define void @load_factor4_wide(<32 x i32>* %ptr) {
+; NEON-LABEL: @load_factor4_wide(
+; NEON-NEXT: [[TMP1:%.*]] = bitcast <32 x i32>* %ptr to i32*
+; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
+; NEON-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 3
+; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 2
+; NEON-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 1
+; NEON-NEXT: [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 0
+; NEON-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16
+; NEON-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
+; NEON-NEXT: [[LDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0v4i32(<4 x i32>* [[TMP8]])
+; NEON-NEXT: [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 3
+; NEON-NEXT: [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 2
+; NEON-NEXT: [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 1
+; NEON-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 0
+; NEON-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP9]], <8 x i32>
+; NEON-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP10]], <8 x i32>
+; NEON-NEXT: [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP11]], <8 x i32>
+; NEON-NEXT: [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP12]], <8 x i32>
+; NEON-NEXT: ret void
+; NO_NEON-LABEL: @load_factor4_wide(
+; NO_NEON-NOT: @llvm.aarch64.neon
+; NO_NEON: ret void
+;
+  %interleaved.vec = load <32 x i32>, <32 x i32>* %ptr, align 4
+  %v0 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32>
+  %v1 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32>
+  %v2 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32>
+  %v3 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32>
+  ret void
+}
+
+define void @store_factor2_wide(<16 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1) {
+; NEON-LABEL: @store_factor2_wide(
+; NEON-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i32*
+; NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32>
+; NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32>
+; NEON-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
+; NEON-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]])
+; NEON-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32>
+; NEON-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32>
+; NEON-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
+; NEON-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
+; NEON-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0v4i32(<4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32>* [[TMP8]])
+; NEON-NEXT: ret void
+; NO_NEON-LABEL: @store_factor2_wide(
+; NO_NEON: ret void
+;
+  %interleaved.vec = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32>
+  store <16 x i32> %interleaved.vec, <16 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @store_factor3_wide(<24 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2) {
+; NEON-LABEL: @store_factor3_wide(
+; NEON: [[TMP1:%.*]] = bitcast <24 x i32>* %ptr to i32*
+; NEON-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32>
+; NEON-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32>
+; NEON-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32>
+; NEON-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
+; NEON-NEXT: call void @llvm.aarch64.neon.st3.v4i32.p0v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]])
+; NEON-NEXT: [[TMP6:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32>
+; NEON-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32>
+; NEON-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32>
+; NEON-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12
+; NEON-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
+; NEON-NEXT: call void @llvm.aarch64.neon.st3.v4i32.p0v4i32(<4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], <4 x i32>* [[TMP10]])
+; NEON-NEXT: ret void
+; NO_NEON-LABEL: @store_factor3_wide(
+; NO_NEON: ret void
+;
+  %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32>
+  %s1 = shufflevector <8 x i32> %v2, <8 x i32> undef, <16 x i32>
+  %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <24 x i32>
+  store <24 x i32> %interleaved.vec, <24 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @store_factor4_wide(<32 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2, <8 x i32> %v3) {
+; NEON-LABEL: @store_factor4_wide(
+; NEON: [[TMP1:%.*]] = bitcast <32 x i32>* %ptr to i32*
+; NEON-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32>
+; NEON-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32>
+; NEON-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32>
+; NEON-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32>
+; NEON-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
+; NEON-NEXT: call void @llvm.aarch64.neon.st4.v4i32.p0v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]])
+; NEON-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32>
+; NEON-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32>
+; NEON-NEXT: [[TMP9:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32>
+; NEON-NEXT: [[TMP10:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32>
+; NEON-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16
+; NEON-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; NEON-NEXT: call void @llvm.aarch64.neon.st4.v4i32.p0v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]])
+; NEON-NEXT: ret void
+; NO_NEON-LABEL: @store_factor4_wide(
+; NO_NEON-NOT: @llvm.aarch64.neon
+; NO_NEON: ret void
+;
+  %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32>
+  %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32>
+  %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32>
+  store <32 x i32> %interleaved.vec, <32 x i32>* %ptr, align 4
+  ret void
+}
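The ARM checks that follow encode the same splitting as the AArch64 ones above, only through i8* pointers and an explicit alignment operand on the vldN/vstN intrinsics. The invariant behind all of the wide-load tests can be stated, and sanity-checked, in scalar form: de-interleaving each 128-bit chunk separately and concatenating the per-field results is equivalent to one wide stride-Factor de-interleave. A standalone sketch (plain C++, not part of the test suite; values are illustrative):

// Scalar model of the equivalence the wide-load tests rely on.
#include <cassert>
#include <vector>

int main() {
  const unsigned Factor = 2, Lanes = 8, NumChunks = 2;
  const unsigned SubLanes = Lanes / NumChunks;

  // Interleaved "memory": element i holds the value i.
  std::vector<int> Mem(Lanes * Factor);
  for (unsigned i = 0; i < Mem.size(); ++i)
    Mem[i] = i;

  for (unsigned j = 0; j < Factor; ++j) {
    // Reference: one wide stride-Factor de-interleave (the original shuffle).
    std::vector<int> Ref;
    for (unsigned k = 0; k < Lanes; ++k)
      Ref.push_back(Mem[k * Factor + j]);

    // Chunked lowering: ldN on each 128-bit chunk, then concatenate field j.
    std::vector<int> Cat;
    for (unsigned c = 0; c < NumChunks; ++c)
      for (unsigned k = 0; k < SubLanes; ++k)
        Cat.push_back(Mem[c * SubLanes * Factor + k * Factor + j]);

    assert(Cat == Ref);
  }
  return 0;
}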
Index: test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll
===================================================================
--- test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll
+++ test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll
@@ -644,3 +644,166 @@
   store <4 x float> %v0, <4 x float>* @g, align 16
   ret void
 }
+
+define void @load_factor2_wide(<16 x i32>* %ptr) {
+; NEON-LABEL: @load_factor2_wide(
+; NEON-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i32*
+; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
+; NEON-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP2]], i32 4)
+; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1
+; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
+; NEON-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
+; NEON-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to i8*
+; NEON-NEXT: [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP6]], i32 4)
+; NEON-NEXT: [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 1
+; NEON-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 0
+; NEON-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP7]], <8 x i32>
+; NEON-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP8]], <8 x i32>
+; NEON-NEXT: ret void
+; NO_NEON-LABEL: @load_factor2_wide(
+; NO_NEON-NOT: @llvm.arm.neon
+; NO_NEON: ret void
+;
+  %interleaved.vec = load <16 x i32>, <16 x i32>* %ptr, align 4
+  %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <8 x i32>
+  %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <8 x i32>
+  ret void
+}
+
+define void @load_factor3_wide(<24 x i32>* %ptr) {
+; NEON-LABEL: @load_factor3_wide(
+; NEON-NEXT: [[TMP1:%.*]] = bitcast <24 x i32>* %ptr to i32*
+; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
+; NEON-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* [[TMP2]], i32 4)
+; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
+; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
+; NEON-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
+; NEON-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12
+; NEON-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to i8*
+; NEON-NEXT: [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* [[TMP7]], i32 4)
+; NEON-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 2
+; NEON-NEXT: [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 1
+; NEON-NEXT: [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 0
+; NEON-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP8]], <8 x i32>
+; NEON-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP9]], <8 x i32>
+; NEON-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32>
+; NEON-NEXT: ret void
+; NO_NEON-LABEL: @load_factor3_wide(
+; NO_NEON-NOT: @llvm.arm.neon
+; NO_NEON: ret void
+;
+  %interleaved.vec = load <24 x i32>, <24 x i32>* %ptr, align 4
+  %v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32>
+  %v1 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32>
+  %v2 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32>
+  ret void
+}
+
+define void @load_factor4_wide(<32 x i32>* %ptr) {
+; NEON-LABEL: @load_factor4_wide(
+; NEON-NEXT: [[TMP1:%.*]] = bitcast <32 x i32>* %ptr to i32*
+; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
+; NEON-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP2]], i32 4)
+; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 3
+; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
+; NEON-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
+; NEON-NEXT: [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
+; NEON-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16
+; NEON-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
+; NEON-NEXT: [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP8]], i32 4)
+; NEON-NEXT: [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 3
+; NEON-NEXT: [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 2
+; NEON-NEXT: [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 1
+; NEON-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 0
+; NEON-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP9]], <8 x i32>
+; NEON-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP10]], <8 x i32>
+; NEON-NEXT: [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP11]], <8 x i32>
+; NEON-NEXT: [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP12]], <8 x i32>
+; NEON-NEXT: ret void
+; NO_NEON-LABEL: @load_factor4_wide(
+; NO_NEON-NOT: @llvm.arm.neon
+; NO_NEON: ret void
+;
+  %interleaved.vec = load <32 x i32>, <32 x i32>* %ptr, align 4
+  %v0 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32>
+  %v1 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32>
+  %v2 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32>
+  %v3 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32>
+  ret void
+}
+
+define void @store_factor2_wide(<16 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1) {
+; NEON-LABEL: @store_factor2_wide(
+; NEON-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i32*
+; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
+; NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32>
+; NEON-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32>
+; NEON-NEXT: call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
+; NEON-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
+; NEON-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to i8*
+; NEON-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32>
+; NEON-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32>
+; NEON-NEXT: call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32 4)
+; NEON-NEXT: ret void
+; NO_NEON-LABEL: @store_factor2_wide(
+; NO_NEON-NOT: @llvm.arm.neon
+; NO_NEON: ret void
+;
+  %interleaved.vec = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32>
+  store <16 x i32> %interleaved.vec, <16 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @store_factor3_wide(<24 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2) {
+; NEON-LABEL: @store_factor3_wide(
+; NEON: [[TMP1:%.*]] = bitcast <24 x i32>* %ptr to i32*
+; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
+; NEON-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32>
+; NEON-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32>
+; NEON-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32>
+; NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 4)
+; NEON-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12
+; NEON-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to i8*
+; NEON-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32>
+; NEON-NEXT: [[TMP9:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32>
+; NEON-NEXT: [[TMP10:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32>
+; NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP7]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], i32 4)
+; NEON-NEXT: ret void
+; NO_NEON-LABEL: @store_factor3_wide(
+; NO_NEON-NOT: @llvm.arm.neon
+; NO_NEON: ret void
+;
+  %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32>
+  %s1 = shufflevector <8 x i32> %v2, <8 x i32> undef, <16 x i32>
+  %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <24 x i32>
+  store <24 x i32> %interleaved.vec, <24 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @store_factor4_wide(<32 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2, <8 x i32> %v3) {
+; NEON-LABEL: @store_factor4_wide(
+; NEON: [[TMP1:%.*]] = bitcast <32 x i32>* %ptr to i32*
+; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
+; NEON-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32>
+; NEON-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32>
+; NEON-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32>
+; NEON-NEXT: [[TMP6:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32>
+; NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], i32 4)
+; NEON-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16
+; NEON-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
+; NEON-NEXT: [[TMP9:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32>
+; NEON-NEXT: [[TMP10:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32>
+; NEON-NEXT: [[TMP11:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32>
+; NEON-NEXT: [[TMP12:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32>
+; NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 4)
+; NEON-NEXT: ret void
+; NO_NEON-LABEL: @store_factor4_wide(
+; NO_NEON-NOT: @llvm.arm.neon
+; NO_NEON: ret void
+;
+  %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32>
+  %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32>
+  %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32>
+  store <32 x i32> %interleaved.vec, <32 x i32>* %ptr, align 4
+  ret void
+}