Index: llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7248,6 +7248,13 @@
   return NumBits == 32 || NumBits == 64;
 }
 
+/// A helper function for determining the number of interleaved accesses we
+/// will generate when lowering accesses of the given type.
+static unsigned getNumInterleavedAccesses(VectorType *VecTy,
+                                          const DataLayout &DL) {
+  return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
+}
+
 /// \brief Lower an interleaved load into a ldN intrinsic.
 ///
 /// E.g. Lower an interleaved load (Factor = 2):
@@ -7273,10 +7280,14 @@
   VectorType *VecTy = Shuffles[0]->getType();
   unsigned VecSize = DL.getTypeSizeInBits(VecTy);
 
-  // Skip if we do not have NEON and skip illegal vector types.
-  if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128))
+  // Skip if we do not have NEON and skip illegal vector types. We can
+  // "legalize" wide vector types into multiple interleaved accesses as long as
+  // the vector types are divisible by 128.
+  if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize % 128 != 0))
     return false;
 
+  unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
+
   // A pointer vector can not be the return type of the ldN intrinsics. Need to
   // load integer vectors first and then convert to pointer vectors.
   Type *EltTy = VecTy->getVectorElementType();
@@ -7284,6 +7295,25 @@
     VecTy = VectorType::get(DL.getIntPtrType(EltTy),
                             VecTy->getVectorNumElements());
 
+  IRBuilder<> Builder(LI);
+
+  // The base address of the load.
+  Value *BaseAddr = LI->getPointerOperand();
+
+  if (NumLoads > 1) {
+    // If we're going to generate more than one load, reset the sub-vector type
+    // to something legal.
+    VecTy = VectorType::get(VecTy->getVectorElementType(),
+                            VecTy->getVectorNumElements() / NumLoads);
+
+    // We will compute the pointer operand of each load from the original base
+    // address using GEPs. Cast the base address to a pointer to the scalar
+    // element type.
+    BaseAddr = Builder.CreateBitCast(
+        BaseAddr, VecTy->getVectorElementType()->getPointerTo(
+                      LI->getPointerAddressSpace()));
+  }
+
   Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace());
   Type *Tys[2] = {VecTy, PtrTy};
   static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2,
@@ -7292,24 +7322,46 @@
   Function *LdNFunc =
       Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
 
-  IRBuilder<> Builder(LI);
-  Value *Ptr = Builder.CreateBitCast(LI->getPointerOperand(), PtrTy);
-
-  CallInst *LdN = Builder.CreateCall(LdNFunc, Ptr, "ldN");
-
-  // Replace uses of each shufflevector with the corresponding vector loaded
-  // by ldN.
-  for (unsigned i = 0; i < Shuffles.size(); i++) {
-    ShuffleVectorInst *SVI = Shuffles[i];
-    unsigned Index = Indices[i];
-
-    Value *SubVec = Builder.CreateExtractValue(LdN, Index);
-
-    // Convert the integer vector to pointer vector if the element is pointer.
-    if (EltTy->isPointerTy())
-      SubVec = Builder.CreateIntToPtr(SubVec, SVI->getType());
+  // Holds sub-vectors extracted from the load intrinsic return values. The
+  // sub-vectors are associated with the shufflevector instructions they will
+  // replace.
+  DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
+
+  for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
+
+    // If we're generating more than one load, compute the base address of
+    // subsequent loads as an offset from the previous.
+    if (LoadCount > 0)
+      BaseAddr = Builder.CreateConstGEP1_32(
+          BaseAddr, VecTy->getVectorNumElements() * Factor);
+
+    CallInst *LdN = Builder.CreateCall(
+        LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN");
+
+    // Extract and store the sub-vectors returned by the load intrinsic.
+    for (unsigned i = 0; i < Shuffles.size(); i++) {
+      ShuffleVectorInst *SVI = Shuffles[i];
+      unsigned Index = Indices[i];
+
+      Value *SubVec = Builder.CreateExtractValue(LdN, Index);
+
+      // Convert the integer vector to pointer vector if the element is pointer.
+      if (EltTy->isPointerTy())
+        SubVec = Builder.CreateIntToPtr(SubVec, SVI->getType());
+
+      SubVecs[SVI].push_back(SubVec);
+    }
+  }
 
-    SVI->replaceAllUsesWith(SubVec);
+  // Replace uses of the shufflevector instructions with the sub-vectors
+  // returned by the load intrinsic. If a shufflevector instruction is
+  // associated with more than one sub-vector, those sub-vectors will be
+  // concatenated into a single wide vector.
+  for (ShuffleVectorInst *SVI : Shuffles) {
+    auto &SubVec = SubVecs[SVI];
+    auto *WideVec =
+        SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
+    SVI->replaceAllUsesWith(WideVec);
   }
 
   return true;
@@ -7358,10 +7410,14 @@
   const DataLayout &DL = SI->getModule()->getDataLayout();
   unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
 
-  // Skip if we do not have NEON and skip illegal vector types.
-  if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128))
+  // Skip if we do not have NEON and skip illegal vector types. We can
+  // "legalize" wide vector types into multiple interleaved accesses as long as
+  // the vector types are divisible by 128.
+  if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize % 128 != 0))
     return false;
 
+  unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
+
   Value *Op0 = SVI->getOperand(0);
   Value *Op1 = SVI->getOperand(1);
   IRBuilder<> Builder(SI);
@@ -7381,6 +7437,25 @@
     SubVecTy = VectorType::get(IntTy, LaneLen);
   }
 
+  // The base address of the store.
+  Value *BaseAddr = SI->getPointerOperand();
+
+  if (NumStores > 1) {
+    // If we're going to generate more than one store, reset the lane length
+    // and sub-vector type to something legal.
+    LaneLen /= NumStores;
+    SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen);
+
+    // We will compute the pointer operand of each store from the original base
+    // address using GEPs. Cast the base address to a pointer to the scalar
+    // element type.
+    BaseAddr = Builder.CreateBitCast(
+        BaseAddr, SubVecTy->getVectorElementType()->getPointerTo(
+                      SI->getPointerAddressSpace()));
+  }
+
+  auto Mask = SVI->getShuffleMask();
+
   Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace());
   Type *Tys[2] = {SubVecTy, PtrTy};
   static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2,
@@ -7389,34 +7464,43 @@
   Function *StNFunc =
       Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);
 
-  SmallVector<Value *, 5> Ops;
+  for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
 
-  // Split the shufflevector operands into sub vectors for the new stN call.
-  auto Mask = SVI->getShuffleMask();
-  for (unsigned i = 0; i < Factor; i++) {
-    if (Mask[i] >= 0) {
-      Ops.push_back(Builder.CreateShuffleVector(
-          Op0, Op1, createSequentialMask(Builder, Mask[i], LaneLen, 0)));
-    } else {
-      unsigned StartMask = 0;
-      for (unsigned j = 1; j < LaneLen; j++) {
-        if (Mask[j*Factor + i] >= 0) {
-          StartMask = Mask[j*Factor + i] - j;
-          break;
+    SmallVector<Value *, 5> Ops;
+
+    // Split the shufflevector operands into sub vectors for the new stN call.
+    for (unsigned i = 0; i < Factor; i++) {
+      unsigned IdxI = StoreCount * LaneLen * Factor + i;
+      if (Mask[IdxI] >= 0) {
+        Ops.push_back(Builder.CreateShuffleVector(
+            Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
+      } else {
+        unsigned StartMask = 0;
+        for (unsigned j = 1; j < LaneLen; j++) {
+          unsigned IdxJ = StoreCount * LaneLen * Factor + j;
+          if (Mask[IdxJ * Factor + IdxI] >= 0) {
+            StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
+            break;
+          }
         }
+        // Note: Filling undef gaps with random elements is ok, since
+        // those elements were being written anyway (with undefs).
+        // In the case of all undefs we're defaulting to using elems from 0
+        // Note: StartMask cannot be negative, it's checked in
+        // isReInterleaveMask
+        Ops.push_back(Builder.CreateShuffleVector(
+            Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
       }
-      // Note: If all elements in a chunk are undefs, StartMask=0!
-      // Note: Filling undef gaps with random elements is ok, since
-      // those elements were being written anyway (with undefs).
-      // In the case of all undefs we're defaulting to using elems from 0
-      // Note: StartMask cannot be negative, it's checked in isReInterleaveMask
-      Ops.push_back(Builder.CreateShuffleVector(
-          Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
-    }
-  }
+    }
 
-  Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), PtrTy));
-  Builder.CreateCall(StNFunc, Ops);
+    // If we're generating more than one store, compute the base address of
+    // subsequent stores as an offset from the previous.
+    if (StoreCount > 0)
+      BaseAddr = Builder.CreateConstGEP1_32(BaseAddr, LaneLen * Factor);
+
+    Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
+    Builder.CreateCall(StNFunc, Ops);
+  }
 
   return true;
 }
Index: llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
@@ -13282,6 +13282,13 @@
                           Addr});
 }
 
+/// A helper function for determining the number of interleaved accesses we
+/// will generate when lowering accesses of the given type.
+static unsigned getNumInterleavedAccesses(VectorType *VecTy,
+                                          const DataLayout &DL) {
+  return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
+}
+
 /// \brief Lower an interleaved load into a vldN intrinsic.
 ///
 /// E.g. Lower an interleaved load (Factor = 2):
@@ -13310,8 +13317,11 @@
   bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64;
 
   // Skip if we do not have NEON and skip illegal vector types and vector types
-  // with i64/f64 elements (vldN doesn't support i64/f64 elements).
-  if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128) || EltIs64Bits)
+  // with i64/f64 elements (vldN doesn't support i64/f64 elements). We can
+  // "legalize" wide vector types into multiple interleaved accesses as long as
+  // the vector types are divisible by 128.
+  if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize % 128 != 0) ||
+      EltIs64Bits)
     return false;
 
   // Skip if the vector has f16 elements: even though we could do an i16 vldN,
@@ -13319,43 +13329,87 @@
   if (EltTy->isHalfTy())
     return false;
 
+  unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
+
   // A pointer vector can not be the return type of the ldN intrinsics. Need to
   // load integer vectors first and then convert to pointer vectors.
   if (EltTy->isPointerTy())
     VecTy = VectorType::get(DL.getIntPtrType(EltTy),
                             VecTy->getVectorNumElements());
 
-  static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
-                                            Intrinsic::arm_neon_vld3,
-                                            Intrinsic::arm_neon_vld4};
-
-  IRBuilder<> Builder(LI);
-  SmallVector<Value *, 2> Ops;
-
-  Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
-  Ops.push_back(Builder.CreateBitCast(LI->getPointerOperand(), Int8Ptr));
-  Ops.push_back(Builder.getInt32(LI->getAlignment()));
+  IRBuilder<> Builder(LI);
+
+  // The base address of the load.
+  Value *BaseAddr = LI->getPointerOperand();
+
+  if (NumLoads > 1) {
+    // If we're going to generate more than one load, reset the sub-vector type
+    // to something legal.
+    VecTy = VectorType::get(VecTy->getVectorElementType(),
+                            VecTy->getVectorNumElements() / NumLoads);
+
+    // We will compute the pointer operand of each load from the original base
+    // address using GEPs. Cast the base address to a pointer to the scalar
+    // element type.
+    BaseAddr = Builder.CreateBitCast(
+        BaseAddr, VecTy->getVectorElementType()->getPointerTo(
+                      LI->getPointerAddressSpace()));
+  }
 
   assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");
 
-  Type *Tys[] = { VecTy, Int8Ptr };
+  Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
+  Type *Tys[] = {VecTy, Int8Ptr};
+  static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
+                                            Intrinsic::arm_neon_vld3,
+                                            Intrinsic::arm_neon_vld4};
   Function *VldnFunc =
       Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
 
-  CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");
-
-  // Replace uses of each shufflevector with the corresponding vector loaded
-  // by ldN.
-  for (unsigned i = 0; i < Shuffles.size(); i++) {
-    ShuffleVectorInst *SV = Shuffles[i];
-    unsigned Index = Indices[i];
-
-    Value *SubVec = Builder.CreateExtractValue(VldN, Index);
-
-    // Convert the integer vector to pointer vector if the element is pointer.
-    if (EltTy->isPointerTy())
-      SubVec = Builder.CreateIntToPtr(SubVec, SV->getType());
+  // Holds sub-vectors extracted from the load intrinsic return values. The
+  // sub-vectors are associated with the shufflevector instructions they will
+  // replace.
+  DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
+
+  for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
+
+    // If we're generating more than one load, compute the base address of
+    // subsequent loads as an offset from the previous.
+    if (LoadCount > 0)
+      BaseAddr = Builder.CreateConstGEP1_32(
+          BaseAddr, VecTy->getVectorNumElements() * Factor);
+
+    SmallVector<Value *, 2> Ops;
+    Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
+    Ops.push_back(Builder.getInt32(LI->getAlignment()));
+
+    CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");
+
+    // Replace uses of each shufflevector with the corresponding vector loaded
+    // by ldN.
+    for (unsigned i = 0; i < Shuffles.size(); i++) {
+      ShuffleVectorInst *SV = Shuffles[i];
+      unsigned Index = Indices[i];
+
+      Value *SubVec = Builder.CreateExtractValue(VldN, Index);
+
+      // Convert the integer vector to pointer vector if the element is pointer.
+      if (EltTy->isPointerTy())
+        SubVec = Builder.CreateIntToPtr(SubVec, SV->getType());
+
+      SubVecs[SV].push_back(SubVec);
+    }
+  }
 
-    SV->replaceAllUsesWith(SubVec);
+  // Replace uses of the shufflevector instructions with the sub-vectors
+  // returned by the load intrinsic. If a shufflevector instruction is
+  // associated with more than one sub-vector, those sub-vectors will be
+  // concatenated into a single wide vector.
+  for (ShuffleVectorInst *SVI : Shuffles) {
+    auto &SubVec = SubVecs[SVI];
+    auto *WideVec =
+        SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
+    SVI->replaceAllUsesWith(WideVec);
   }
 
   return true;
@@ -13406,8 +13460,10 @@
   bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64;
 
   // Skip if we do not have NEON and skip illegal vector types and vector types
-  // with i64/f64 elements (vstN doesn't support i64/f64 elements).
-  if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128) ||
+  // with i64/f64 elements (vstN doesn't support i64/f64 elements). We can
+  // "legalize" wide vector types into multiple interleaved accesses as long as
+  // the vector types are divisible by 128.
+  if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize % 128 != 0) ||
       EltIs64Bits)
     return false;
 
@@ -13416,6 +13472,8 @@
   if (EltTy->isHalfTy())
     return false;
 
+  unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
+
   Value *Op0 = SVI->getOperand(0);
   Value *Op1 = SVI->getOperand(1);
   IRBuilder<> Builder(SI);
@@ -13434,46 +13492,75 @@
     SubVecTy = VectorType::get(IntTy, LaneLen);
   }
 
-  static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
-                                             Intrinsic::arm_neon_vst3,
-                                             Intrinsic::arm_neon_vst4};
-  SmallVector<Value *, 6> Ops;
+  // The base address of the store.
+  Value *BaseAddr = SI->getPointerOperand();
 
-  Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
-  Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), Int8Ptr));
+  if (NumStores > 1) {
+    // If we're going to generate more than one store, reset the lane length
+    // and sub-vector type to something legal.
+    LaneLen /= NumStores;
+    SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen);
+
+    // We will compute the pointer operand of each store from the original base
+    // address using GEPs. Cast the base address to a pointer to the scalar
+    // element type.
+    BaseAddr = Builder.CreateBitCast(
+        BaseAddr, SubVecTy->getVectorElementType()->getPointerTo(
+                      SI->getPointerAddressSpace()));
+  }
 
   assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");
 
-  Type *Tys[] = { Int8Ptr, SubVecTy };
-  Function *VstNFunc = Intrinsic::getDeclaration(
-      SI->getModule(), StoreInts[Factor - 2], Tys);
-
-  // Split the shufflevector operands into sub vectors for the new vstN call.
   auto Mask = SVI->getShuffleMask();
-  for (unsigned i = 0; i < Factor; i++) {
-    if (Mask[i] >= 0) {
-      Ops.push_back(Builder.CreateShuffleVector(
-          Op0, Op1, createSequentialMask(Builder, Mask[i], LaneLen, 0)));
-    } else {
-      unsigned StartMask = 0;
-      for (unsigned j = 1; j < LaneLen; j++) {
-        if (Mask[j*Factor + i] >= 0) {
-          StartMask = Mask[j*Factor + i] - j;
-          break;
+
+  Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
+  Type *Tys[] = {Int8Ptr, SubVecTy};
+  static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
+                                             Intrinsic::arm_neon_vst3,
+                                             Intrinsic::arm_neon_vst4};
+
+  for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
+
+    // If we're generating more than one store, compute the base address of
+    // subsequent stores as an offset from the previous.
+    if (StoreCount > 0)
+      BaseAddr = Builder.CreateConstGEP1_32(BaseAddr, LaneLen * Factor);
+
+    SmallVector<Value *, 6> Ops;
+    Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
+
+    Function *VstNFunc =
+        Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);
+
+    // Split the shufflevector operands into sub vectors for the new vstN call.
+    for (unsigned i = 0; i < Factor; i++) {
+      unsigned IdxI = StoreCount * LaneLen * Factor + i;
+      if (Mask[IdxI] >= 0) {
+        Ops.push_back(Builder.CreateShuffleVector(
+            Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
+      } else {
+        unsigned StartMask = 0;
+        for (unsigned j = 1; j < LaneLen; j++) {
+          unsigned IdxJ = StoreCount * LaneLen * Factor + j;
+          if (Mask[IdxJ * Factor + IdxI] >= 0) {
+            StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
+            break;
+          }
         }
+        // Note: If all elements in a chunk are undefs, StartMask=0!
+        // Note: Filling undef gaps with random elements is ok, since
+        // those elements were being written anyway (with undefs).
- // In the case of all undefs we're defaulting to using elems from 0 - // Note: StartMask cannot be negative, it's checked in isReInterleaveMask - Ops.push_back(Builder.CreateShuffleVector( - Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0))); } - } - Ops.push_back(Builder.getInt32(SI->getAlignment())); - Builder.CreateCall(VstNFunc, Ops); + Ops.push_back(Builder.getInt32(SI->getAlignment())); + Builder.CreateCall(VstNFunc, Ops); + } return true; } Index: llvm/trunk/test/Transforms/InterleavedAccess/AArch64/interleaved-accesses.ll =================================================================== --- llvm/trunk/test/Transforms/InterleavedAccess/AArch64/interleaved-accesses.ll +++ llvm/trunk/test/Transforms/InterleavedAccess/AArch64/interleaved-accesses.ll @@ -565,3 +565,198 @@ store <4 x float> %v0, <4 x float>* @g, align 16 ret void } + +define void @load_factor2_wide2(<16 x i32>* %ptr) { +; NEON-LABEL: @load_factor2_wide2( +; NEON-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i32* +; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>* +; NEON-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP2]]) +; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 1 +; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 0 +; NEON-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8 +; NEON-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; NEON-NEXT: [[LDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP6]]) +; NEON-NEXT: [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN1]], 1 +; NEON-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN1]], 0 +; NEON-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP7]], <8 x i32> +; NEON-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP8]], <8 x i32> +; NEON-NEXT: ret void +; NO_NEON-LABEL: @load_factor2_wide2( +; NO_NEON-NOT: @llvm.aarch64.neon +; NO_NEON: ret void +; + %interleaved.vec = load <16 x i32>, <16 x i32>* %ptr, align 4 + %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <8 x i32> + %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <8 x i32> + ret void +} + +define void @load_factor2_wide3(<24 x i32>* %ptr) { +; NEON-LABEL: @load_factor2_wide3( +; NEON-NEXT: [[TMP1:%.*]] = bitcast <24 x i32>* [[PTR:%.*]] to i32* +; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>* +; NEON-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP2]]) +; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 1 +; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 0 +; NEON-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8 +; NEON-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; NEON-NEXT: [[LDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP6]]) +; NEON-NEXT: [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN1]], 1 +; NEON-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN1]], 0 +; NEON-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP5]], i32 8 +; NEON-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* +; NEON-NEXT: [[LDN2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP10]]) +; NEON-NEXT: [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN2]], 1 +; 
NEON-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN2]], 0 +; NEON-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP7]], <8 x i32> +; NEON-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> undef, <8 x i32> +; NEON-NEXT: [[TMP15:%.*]] = shufflevector <8 x i32> [[TMP13]], <8 x i32> [[TMP14]], <12 x i32> +; NEON-NEXT: [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP8]], <8 x i32> +; NEON-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> undef, <8 x i32> +; NEON-NEXT: [[TMP18:%.*]] = shufflevector <8 x i32> [[TMP16]], <8 x i32> [[TMP17]], <12 x i32> +; NEON-NEXT: ret void +; NO_NEON-LABEL: @load_factor2_wide3( +; NO_NEON-NOT: @llvm.aarch64.neon +; NO_NEON: ret void +; + %interleaved.vec = load <24 x i32>, <24 x i32>* %ptr, align 4 + %v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <12 x i32> + %v1 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <12 x i32> + ret void +} + +define void @load_factor3_wide(<24 x i32>* %ptr) { +; NEON-LABEL: @load_factor3_wide( +; NEON-NEXT: [[TMP1:%.*]] = bitcast <24 x i32>* %ptr to i32* +; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>* +; NEON-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0v4i32(<4 x i32>* [[TMP2]]) +; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 2 +; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 1 +; NEON-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 0 +; NEON-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12 +; NEON-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; NEON-NEXT: [[LDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0v4i32(<4 x i32>* [[TMP7]]) +; NEON-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 2 +; NEON-NEXT: [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 1 +; NEON-NEXT: [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 0 +; NEON-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP8]], <8 x i32> +; NEON-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP9]], <8 x i32> +; NEON-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> +; NEON-NEXT: ret void +; NO_NEON-LABEL: @load_factor3_wide( +; NO_NEON-NOT: @llvm.aarch64.neon +; NO_NEON: ret void +; + %interleaved.vec = load <24 x i32>, <24 x i32>* %ptr, align 4 + %v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32> + %v1 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32> + %v2 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32> + ret void +} + +define void @load_factor4_wide(<32 x i32>* %ptr) { +; NEON-LABEL: @load_factor4_wide( +; NEON-NEXT: [[TMP1:%.*]] = bitcast <32 x i32>* %ptr to i32* +; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>* +; NEON-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0v4i32(<4 x i32>* [[TMP2]]) +; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 3 +; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 2 +; NEON-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 1 +; NEON-NEXT: [[TMP6:%.*]] = 
extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 0 +; NEON-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16 +; NEON-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>* +; NEON-NEXT: [[LDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0v4i32(<4 x i32>* [[TMP8]]) +; NEON-NEXT: [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 3 +; NEON-NEXT: [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 2 +; NEON-NEXT: [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 1 +; NEON-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 0 +; NEON-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP9]], <8 x i32> +; NEON-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP10]], <8 x i32> +; NEON-NEXT: [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP11]], <8 x i32> +; NEON-NEXT: [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP12]], <8 x i32> +; NEON-NEXT: ret void +; NO_NEON-LABEL: @load_factor4_wide( +; NO_NEON-NOT: @llvm.aarch64.neon +; NO_NEON: ret void +; + %interleaved.vec = load <32 x i32>, <32 x i32>* %ptr, align 4 + %v0 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> + %v1 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> + %v2 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> + %v3 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> + ret void +} + +define void @store_factor2_wide(<16 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1) { +; NEON-LABEL: @store_factor2_wide( +; NEON-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i32* +; NEON-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> +; NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> +; NEON-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>* +; NEON-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]]) +; NEON-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> +; NEON-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> +; NEON-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8 +; NEON-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>* +; NEON-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0v4i32(<4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32>* [[TMP8]]) +; NEON-NEXT: ret void +; NO_NEON-LABEL: @store_factor2_wide( +; NO_NEON: ret void +; + %interleaved.vec = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> + store <16 x i32> %interleaved.vec, <16 x i32>* %ptr, align 4 + ret void +} + +define void @store_factor3_wide(<24 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2) { +; NEON-LABEL: @store_factor3_wide( +; NEON: [[TMP1:%.*]] = bitcast <24 x i32>* %ptr to i32* +; NEON-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> +; NEON-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> +; NEON-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> +; NEON-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>* +; NEON-NEXT: call void @llvm.aarch64.neon.st3.v4i32.p0v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]]) +; NEON-NEXT: [[TMP6:%.*]] = shufflevector <16 x i32> 
%s0, <16 x i32> %s1, <4 x i32> +; NEON-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> +; NEON-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> +; NEON-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12 +; NEON-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* +; NEON-NEXT: call void @llvm.aarch64.neon.st3.v4i32.p0v4i32(<4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], <4 x i32>* [[TMP10]]) +; NEON-NEXT: ret void +; NO_NEON-LABEL: @store_factor3_wide( +; NO_NEON: ret void +; + %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> + %s1 = shufflevector <8 x i32> %v2, <8 x i32> undef, <16 x i32> + %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <24 x i32> + store <24 x i32> %interleaved.vec, <24 x i32>* %ptr, align 4 + ret void +} + +define void @store_factor4_wide(<32 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2, <8 x i32> %v3) { +; NEON-LABEL: @store_factor4_wide( +; NEON: [[TMP1:%.*]] = bitcast <32 x i32>* %ptr to i32* +; NEON-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> +; NEON-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> +; NEON-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> +; NEON-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> +; NEON-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>* +; NEON-NEXT: call void @llvm.aarch64.neon.st4.v4i32.p0v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]]) +; NEON-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> +; NEON-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> +; NEON-NEXT: [[TMP9:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> +; NEON-NEXT: [[TMP10:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> +; NEON-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16 +; NEON-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* +; NEON-NEXT: call void @llvm.aarch64.neon.st4.v4i32.p0v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]]) +; NEON-NEXT: ret void +; NO_NEON-LABEL: @store_factor4_wide( +; NO_NEON-NOT: @llvm.aarch64.neon +; NO_NEON: ret void +; + %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> + %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> + %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> + store <32 x i32> %interleaved.vec, <32 x i32>* %ptr, align 4 + ret void +} Index: llvm/trunk/test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll =================================================================== --- llvm/trunk/test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll +++ llvm/trunk/test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll @@ -646,3 +646,200 @@ store <4 x float> %v0, <4 x float>* @g, align 16 ret void } + +define void @load_factor2_wide2(<16 x i32>* %ptr) { +; NEON-LABEL: @load_factor2_wide2( +; NEON-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i32* +; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8* +; NEON-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP2]], i32 4) +; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1 +; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0 +; NEON-NEXT: 
[[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8 +; NEON-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to i8* +; NEON-NEXT: [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP6]], i32 4) +; NEON-NEXT: [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 1 +; NEON-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 0 +; NEON-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP7]], <8 x i32> +; NEON-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP8]], <8 x i32> +; NEON-NEXT: ret void +; NO_NEON-LABEL: @load_factor2_wide2( +; NO_NEON-NOT: @llvm.arm.neon +; NO_NEON: ret void +; + %interleaved.vec = load <16 x i32>, <16 x i32>* %ptr, align 4 + %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <8 x i32> + %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <8 x i32> + ret void +} + +define void @load_factor2_wide3(<24 x i32>* %ptr) { +; NEON-LABEL: @load_factor2_wide3( +; NEON-NEXT: [[TMP1:%.*]] = bitcast <24 x i32>* [[PTR:%.*]] to i32* +; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8* +; NEON-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP2]], i32 4) +; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1 +; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0 +; NEON-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8 +; NEON-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to i8* +; NEON-NEXT: [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP6]], i32 4) +; NEON-NEXT: [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 1 +; NEON-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 0 +; NEON-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP5]], i32 8 +; NEON-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to i8* +; NEON-NEXT: [[VLDN2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP10]], i32 4) +; NEON-NEXT: [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN2]], 1 +; NEON-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN2]], 0 +; NEON-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP7]], <8 x i32> +; NEON-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> undef, <8 x i32> +; NEON-NEXT: [[TMP15:%.*]] = shufflevector <8 x i32> [[TMP13]], <8 x i32> [[TMP14]], <12 x i32> +; NEON-NEXT: [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP8]], <8 x i32> +; NEON-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> undef, <8 x i32> +; NEON-NEXT: [[TMP18:%.*]] = shufflevector <8 x i32> [[TMP16]], <8 x i32> [[TMP17]], <12 x i32> +; NEON-NEXT: ret void +; NO_NEON-LABEL: @load_factor2_wide3( +; NO_NEON-NOT: @llvm.arm.neon +; NO_NEON: ret void +; + %interleaved.vec = load <24 x i32>, <24 x i32>* %ptr, align 4 + %v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <12 x i32> + %v1 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <12 x i32> + ret void +} + +define void @load_factor3_wide(<24 x i32>* %ptr) { +; NEON-LABEL: @load_factor3_wide( +; NEON-NEXT: [[TMP1:%.*]] = bitcast <24 x i32>* %ptr to i32* +; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8* +; NEON-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* [[TMP2]], i32 4) +; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 
2 +; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1 +; NEON-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0 +; NEON-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12 +; NEON-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to i8* +; NEON-NEXT: [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* [[TMP7]], i32 4) +; NEON-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 2 +; NEON-NEXT: [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 1 +; NEON-NEXT: [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 0 +; NEON-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP8]], <8 x i32> +; NEON-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP9]], <8 x i32> +; NEON-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> +; NEON-NEXT: ret void +; NO_NEON-LABEL: @load_factor3_wide( +; NO_NEON-NOT: @llvm.arm.neon +; NO_NEON: ret void +; + %interleaved.vec = load <24 x i32>, <24 x i32>* %ptr, align 4 + %v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32> + %v1 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32> + %v2 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32> + ret void +} + +define void @load_factor4_wide(<32 x i32>* %ptr) { +; NEON-LABEL: @load_factor4_wide( +; NEON-NEXT: [[TMP1:%.*]] = bitcast <32 x i32>* %ptr to i32* +; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8* +; NEON-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP2]], i32 4) +; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 3 +; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2 +; NEON-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1 +; NEON-NEXT: [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0 +; NEON-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16 +; NEON-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* +; NEON-NEXT: [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP8]], i32 4) +; NEON-NEXT: [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 3 +; NEON-NEXT: [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 2 +; NEON-NEXT: [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 1 +; NEON-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 0 +; NEON-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP9]], <8 x i32> +; NEON-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP10]], <8 x i32> +; NEON-NEXT: [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP11]], <8 x i32> +; NEON-NEXT: [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP12]], <8 x i32> +; NEON-NEXT: ret void +; NO_NEON-LABEL: @load_factor4_wide( +; NO_NEON-NOT: @llvm.arm.neon +; NO_NEON: ret void +; + %interleaved.vec = load <32 x i32>, <32 x i32>* %ptr, align 4 + %v0 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> + %v1 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x 
i32> + %v2 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> + %v3 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> + ret void +} + +define void @store_factor2_wide(<16 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1) { +; NEON-LABEL: @store_factor2_wide( +; NEON-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i32* +; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8* +; NEON-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> +; NEON-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> +; NEON-NEXT: call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4) +; NEON-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8 +; NEON-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to i8* +; NEON-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> +; NEON-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> +; NEON-NEXT: call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32 4) +; NEON-NEXT: ret void +; NO_NEON-LABEL: @store_factor2_wide( +; NO_NEON-NOT: @llvm.arm.neon +; NO_NEON: ret void +; + %interleaved.vec = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> + store <16 x i32> %interleaved.vec, <16 x i32>* %ptr, align 4 + ret void +} + +define void @store_factor3_wide(<24 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2) { +; NEON-LABEL: @store_factor3_wide( +; NEON: [[TMP1:%.*]] = bitcast <24 x i32>* %ptr to i32* +; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8* +; NEON-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> +; NEON-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> +; NEON-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> +; NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 4) +; NEON-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12 +; NEON-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to i8* +; NEON-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> +; NEON-NEXT: [[TMP9:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> +; NEON-NEXT: [[TMP10:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> +; NEON-NEXT: call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP7]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], i32 4) +; NEON-NEXT: ret void +; NO_NEON-LABEL: @store_factor3_wide( +; NO_NEON-NOT: @llvm.arm.neon +; NO_NEON: ret void +; + %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> + %s1 = shufflevector <8 x i32> %v2, <8 x i32> undef, <16 x i32> + %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <24 x i32> + store <24 x i32> %interleaved.vec, <24 x i32>* %ptr, align 4 + ret void +} + +define void @store_factor4_wide(<32 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2, <8 x i32> %v3) { +; NEON-LABEL: @store_factor4_wide( +; NEON: [[TMP1:%.*]] = bitcast <32 x i32>* %ptr to i32* +; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8* +; NEON-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> +; NEON-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> +; NEON-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> +; NEON-NEXT: [[TMP6:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> +; 
NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], i32 4) +; NEON-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16 +; NEON-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* +; NEON-NEXT: [[TMP9:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> +; NEON-NEXT: [[TMP10:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> +; NEON-NEXT: [[TMP11:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> +; NEON-NEXT: [[TMP12:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> +; NEON-NEXT: call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 4) +; NEON-NEXT: ret void +; NO_NEON-LABEL: @store_factor4_wide( +; NO_NEON-NOT: @llvm.arm.neon +; NO_NEON: ret void +; + %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> + %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> + %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> + store <32 x i32> %interleaved.vec, <32 x i32>* %ptr, align 4 + ret void +}
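For readers tracing the arithmetic above: below is a minimal standalone C++ sketch (not part of the patch, and not LLVM API) of the size check and access count implemented by getNumInterleavedAccesses and the relaxed legality tests, assuming 128-bit NEON vector registers. The function names are illustrative only.

#include <cassert>

// An interleaved access is still lowered if its sub-vector is exactly 64 bits
// or a multiple of 128 bits; a multiple of 128 bits is split into
// ceil(bits / 128) NEON-sized accesses, mirroring (bits + 127) / 128 above.
static unsigned numInterleavedAccesses(unsigned VecSizeInBits) {
  return (VecSizeInBits + 127) / 128;
}

static bool isLegalWideInterleavedType(unsigned VecSizeInBits) {
  return VecSizeInBits == 64 || VecSizeInBits % 128 == 0;
}

int main() {
  // load_factor2_wide2: each <8 x i32> shuffle result is 256 bits, so the
  // pass emits two ld2/vld2 calls on <4 x i32> sub-vectors.
  assert(isLegalWideInterleavedType(256) && numInterleavedAccesses(256) == 2);
  // load_factor2_wide3: each <12 x i32> result is 384 bits -> three accesses.
  assert(isLegalWideInterleavedType(384) && numInterleavedAccesses(384) == 3);
  // A 96-bit sub-vector is still rejected.
  assert(!isLegalWideInterleavedType(96));
  return 0;
}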
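The wide loads are split by stepping the base pointer forward between ldN/vldN calls. Here is a small sketch of that offset arithmetic, using illustrative constants taken from the load_factor2_wide2 test; none of the names below are LLVM API.

#include <cstdio>

// Load number k begins k * (NumSubElts * Factor) scalar elements past the
// original base pointer, where NumSubElts is the element count of the
// legalized sub-vector. This is the value passed to CreateConstGEP1_32 above.
int main() {
  const unsigned Factor = 2;      // ld2 / vld2
  const unsigned NumSubElts = 4;  // <4 x i32> after legalization
  const unsigned NumLoads = 2;    // a <16 x i32> interleaved load

  for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount)
    std::printf("ldN #%u reads elements [%u, %u)\n", LoadCount,
                LoadCount * NumSubElts * Factor,
                (LoadCount + 1) * NumSubElts * Factor);
  // Prints offsets 0 and 8, matching the "getelementptr i32, i32* ..., i32 8"
  // between the two intrinsic calls in the factor-2 tests. Each shufflevector
  // then receives NumLoads sub-vectors, which concatenateVectors() glues back
  // into the original wide result.
  return 0;
}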
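On the store side, each stN/vstN call consumes a LaneLen-sized slice of the original shuffle mask. The following standalone sketch shows how the starting element for each operand is found, using the factor-2 interleaving mask from store_factor2_wide; the values and variable names are illustrative only.

#include <cstdio>

// For store chunk StoreCount and interleaved field i, the pass carves a
// sequential run of LaneLen elements out of the wide shuffle mask, starting
// at Mask[StoreCount * LaneLen * Factor + i] when that entry is defined.
int main() {
  const unsigned Factor = 2;
  const unsigned LaneLen = 4;    // legalized lane length (originally 8)
  const unsigned NumStores = 2;  // store_factor2_wide

  // The factor-2 re-interleaving mask for <16 x i32>: 0,8,1,9,...,7,15.
  int Mask[16];
  for (unsigned j = 0; j < 8; ++j) {
    Mask[2 * j] = j;
    Mask[2 * j + 1] = 8 + j;
  }

  for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount)
    for (unsigned i = 0; i < Factor; ++i) {
      unsigned IdxI = StoreCount * LaneLen * Factor + i;
      std::printf("stN #%u operand %u starts at source element %d\n",
                  StoreCount, i, Mask[IdxI]);
    }
  // Prints 0 and 8 for the first st2/vst2 and 4 and 12 for the second, which
  // is where the four-element runs fed to the two calls in the
  // store_factor2_wide tests begin.
  return 0;
}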