Index: lib/Target/AArch64/AArch64ISelLowering.h =================================================================== --- lib/Target/AArch64/AArch64ISelLowering.h +++ lib/Target/AArch64/AArch64ISelLowering.h @@ -441,6 +441,17 @@ /// Returns the size of the platform's va_list object. unsigned getVaListSizeInBits(const DataLayout &DL) const override; + /// Returns true if \p VecTy is a legal interleaved access type. This + /// function checks the vector element type and the overall width of the + /// vector. + bool isLegalInterleavedAccessType(VectorType *VecTy, + const DataLayout &DL) const; + + /// Returns the number of interleaved accesses that will be generated when + /// lowering accesses of the given type. + unsigned getNumInterleavedAccesses(VectorType *VecTy, + const DataLayout &DL) const; + private: bool isExtFreeImpl(const Instruction *Ext) const override; Index: lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- lib/Target/AArch64/AArch64ISelLowering.cpp +++ lib/Target/AArch64/AArch64ISelLowering.cpp @@ -7239,11 +7239,34 @@ /// A helper function for determining the number of interleaved accesses we /// will generate when lowering accesses of the given type. -static unsigned getNumInterleavedAccesses(VectorType *VecTy, - const DataLayout &DL) { +unsigned +AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy, + const DataLayout &DL) const { return (DL.getTypeSizeInBits(VecTy) + 127) / 128; } +bool AArch64TargetLowering::isLegalInterleavedAccessType( + VectorType *VecTy, const DataLayout &DL) const { + + unsigned VecSize = DL.getTypeSizeInBits(VecTy); + unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType()); + + // Ensure the number of vector elements is greater than 1. + if (VecTy->getNumElements() < 2) + return false; + + // Ensure the element type is legal. + if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64) + return false; + + // Ensure the total vector size is 64 or a multiple of 128. Types larger than + // 128 will be split into multiple interleaved accesses. + if (VecSize != 64 && VecSize % 128 != 0) + return false; + + return true; +} + /// \brief Lower an interleaved load into a ldN intrinsic. /// /// E.g. Lower an interleaved load (Factor = 2): @@ -7267,12 +7290,11 @@ const DataLayout &DL = LI->getModule()->getDataLayout(); VectorType *VecTy = Shuffles[0]->getType(); - unsigned VecSize = DL.getTypeSizeInBits(VecTy); // Skip if we do not have NEON and skip illegal vector types. We can // "legalize" wide vector types into multiple interleaved accesses as long as // the vector types are divisible by 128. - if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize % 128 != 0)) + if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL)) return false; unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL); @@ -7397,12 +7419,11 @@ VectorType *SubVecTy = VectorType::get(EltTy, LaneLen); const DataLayout &DL = SI->getModule()->getDataLayout(); - unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); // Skip if we do not have NEON and skip illegal vector types. We can // "legalize" wide vector types into multiple interleaved accesses as long as // the vector types are divisible by 128. - if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize % 128 != 0)) + if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL)) return false; unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL); Index: lib/Target/AArch64/AArch64TargetTransformInfo.cpp =================================================================== --- lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -505,14 +505,14 @@ if (Factor <= TLI->getMaxSupportedInterleaveFactor()) { unsigned NumElts = VecTy->getVectorNumElements(); - Type *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor); - unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); + auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor); // ldN/stN only support legal vector types of size 64 or 128 in bits. // Accesses having vector types that are a multiple of 128 bits can be // matched to more than one ldN/stN instruction. - if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize % 128 == 0)) - return Factor * ((SubVecSize + 127) / 128); + if (NumElts % Factor == 0 && + TLI->isLegalInterleavedAccessType(SubVecTy, DL)) + return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL); } return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, Index: lib/Target/ARM/ARMISelLowering.h =================================================================== --- lib/Target/ARM/ARMISelLowering.h +++ lib/Target/ARM/ARMISelLowering.h @@ -530,6 +530,17 @@ CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool isVarArg) const; CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool isVarArg) const; + /// Returns true if \p VecTy is a legal interleaved access type. This + /// function checks the vector element type and the overall width of the + /// vector. + bool isLegalInterleavedAccessType(VectorType *VecTy, + const DataLayout &DL) const; + + /// Returns the number of interleaved accesses that will be generated when + /// lowering accesses of the given type. + unsigned getNumInterleavedAccesses(VectorType *VecTy, + const DataLayout &DL) const; + protected: std::pair findRepresentativeClass(const TargetRegisterInfo *TRI, Index: lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- lib/Target/ARM/ARMISelLowering.cpp +++ lib/Target/ARM/ARMISelLowering.cpp @@ -13593,11 +13593,40 @@ /// A helper function for determining the number of interleaved accesses we /// will generate when lowering accesses of the given type. -static unsigned getNumInterleavedAccesses(VectorType *VecTy, - const DataLayout &DL) { +unsigned +ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy, + const DataLayout &DL) const { return (DL.getTypeSizeInBits(VecTy) + 127) / 128; } +bool ARMTargetLowering::isLegalInterleavedAccessType( + VectorType *VecTy, const DataLayout &DL) const { + + unsigned VecSize = DL.getTypeSizeInBits(VecTy); + unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType()); + + // Ensure the vector doesn't have f16 elements. Even though we could do an + // i16 vldN, we can't hold the f16 vectors and will end up converting via + // f32. + if (VecTy->getElementType()->isHalfTy()) + return false; + + // Ensure the number of vector elements is greater than 1. + if (VecTy->getNumElements() < 2) + return false; + + // Ensure the element type is legal. + if (ElSize != 8 && ElSize != 16 && ElSize != 32) + return false; + + // Ensure the total vector size is 64 or a multiple of 128. Types larger than + // 128 will be split into multiple interleaved accesses. + if (VecSize != 64 && VecSize % 128 != 0) + return false; + + return true; +} + /// \brief Lower an interleaved load into a vldN intrinsic. /// /// E.g. Lower an interleaved load (Factor = 2): @@ -13622,20 +13651,11 @@ Type *EltTy = VecTy->getVectorElementType(); const DataLayout &DL = LI->getModule()->getDataLayout(); - unsigned VecSize = DL.getTypeSizeInBits(VecTy); - bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64; - // Skip if we do not have NEON and skip illegal vector types and vector types - // with i64/f64 elements (vldN doesn't support i64/f64 elements). We can + // Skip if we do not have NEON and skip illegal vector types. We can // "legalize" wide vector types into multiple interleaved accesses as long as // the vector types are divisible by 128. - if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize % 128 != 0) || - EltIs64Bits) - return false; - - // Skip if the vector has f16 elements: even though we could do an i16 vldN, - // we can't hold the f16 vectors and will end up converting via f32. - if (EltTy->isHalfTy()) + if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL)) return false; unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL); @@ -13765,20 +13785,11 @@ VectorType *SubVecTy = VectorType::get(EltTy, LaneLen); const DataLayout &DL = SI->getModule()->getDataLayout(); - unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); - bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64; - // Skip if we do not have NEON and skip illegal vector types and vector types - // with i64/f64 elements (vldN doesn't support i64/f64 elements). We can + // Skip if we do not have NEON and skip illegal vector types. We can // "legalize" wide vector types into multiple interleaved accesses as long as // the vector types are divisible by 128. - if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize % 128 != 0) || - EltIs64Bits) - return false; - - // Skip if the vector has f16 elements: even though we could do an i16 vldN, - // we can't hold the f16 vectors and will end up converting via f32. - if (EltTy->isHalfTy()) + if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL)) return false; unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL); Index: lib/Target/ARM/ARMTargetTransformInfo.cpp =================================================================== --- lib/Target/ARM/ARMTargetTransformInfo.cpp +++ lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -529,15 +529,14 @@ if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits) { unsigned NumElts = VecTy->getVectorNumElements(); - Type *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor); - unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); + auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor); // vldN/vstN only support legal vector types of size 64 or 128 in bits. // Accesses having vector types that are a multiple of 128 bits can be // matched to more than one vldN/vstN instruction. - if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize % 128 == 0) && - !VecTy->getScalarType()->isHalfTy()) - return Factor * ((SubVecSize + 127) / 128); + if (NumElts % Factor == 0 && + TLI->isLegalInterleavedAccessType(SubVecTy, DL)) + return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL); } return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, Index: test/Transforms/InterleavedAccess/AArch64/interleaved-accesses.ll =================================================================== --- test/Transforms/InterleavedAccess/AArch64/interleaved-accesses.ll +++ test/Transforms/InterleavedAccess/AArch64/interleaved-accesses.ll @@ -760,3 +760,17 @@ store <32 x i32> %interleaved.vec, <32 x i32>* %ptr, align 4 ret void } + +define void @load_factor2_fp128(<4 x fp128>* %ptr) { +; NEON-LABEL: @load_factor2_fp128( +; NEON-NOT: @llvm.aarch64.neon +; NEON: ret void +; NO_NEON-LABEL: @load_factor2_fp128( +; NO_NEON-NOT: @llvm.aarch64.neon +; NO_NEON: ret void +; + %interleaved.vec = load <4 x fp128>, <4 x fp128>* %ptr, align 16 + %v0 = shufflevector <4 x fp128> %interleaved.vec, <4 x fp128> undef, <2 x i32> + %v1 = shufflevector <4 x fp128> %interleaved.vec, <4 x fp128> undef, <2 x i32> + ret void +} Index: test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll =================================================================== --- test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll +++ test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll @@ -843,3 +843,14 @@ store <32 x i32> %interleaved.vec, <32 x i32>* %ptr, align 4 ret void } + +define void @load_factor2_fp128(<4 x fp128>* %ptr) { +; ALL-LABEL: @load_factor2_fp128( +; ALL-NOT: @llvm.arm.neon +; ALL: ret void +; + %interleaved.vec = load <4 x fp128>, <4 x fp128>* %ptr, align 16 + %v0 = shufflevector <4 x fp128> %interleaved.vec, <4 x fp128> undef, <2 x i32> + %v1 = shufflevector <4 x fp128> %interleaved.vec, <4 x fp128> undef, <2 x i32> + ret void +}