diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1183,6 +1183,10 @@
   /// split during legalization. Zero is returned when the answer is unknown.
   unsigned getNumberOfParts(Type *Tp) const;
 
+  /// \returns The type of the piece into which the provided type must be
+  /// split during legalization.
+  Type *getLegalizedPartType(Type *Tp) const;
+
   /// \returns The cost of the address computation. For most targets this can be
   /// merged into the instruction indexing mode. Some targets might want to
   /// distinguish between address computation for memory operations on vector
@@ -1632,6 +1636,7 @@
                                            ArrayRef<Type *> Tys,
                                            TTI::TargetCostKind CostKind) = 0;
   virtual unsigned getNumberOfParts(Type *Tp) = 0;
+  virtual Type *getLegalizedPartType(Type *Tp) = 0;
   virtual InstructionCost
   getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr) = 0;
   virtual InstructionCost
@@ -2145,6 +2150,9 @@
   unsigned getNumberOfParts(Type *Tp) override {
     return Impl.getNumberOfParts(Tp);
   }
+  Type *getLegalizedPartType(Type *Tp) override {
+    return Impl.getLegalizedPartType(Tp);
+  }
   InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
                                             const SCEV *Ptr) override {
     return Impl.getAddressComputationCost(Ty, SE, Ptr);
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -615,6 +615,7 @@
   }
 
   unsigned getNumberOfParts(Type *Tp) const { return 0; }
+  Type *getLegalizedPartType(Type *Tp) const { return nullptr; }
 
   InstructionCost getAddressComputationCost(Type *Tp, ScalarEvolution *,
                                             const SCEV *) const {
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1984,6 +1984,12 @@
     return *LT.first.getValue();
   }
 
+  Type *getLegalizedPartType(Type *Tp) {
+    std::pair<InstructionCost, MVT> LT =
+        getTLI()->getTypeLegalizationCost(DL, Tp);
+    return EVT(LT.second).getTypeForEVT(Tp->getContext());
+  }
+
   InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
                                             const SCEV *) {
     return 0;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -879,6 +879,10 @@
   return TTIImpl->getNumberOfParts(Tp);
 }
 
+Type *TargetTransformInfo::getLegalizedPartType(Type *Tp) const {
+  return TTIImpl->getLegalizedPartType(Tp);
+}
+
 InstructionCost
 TargetTransformInfo::getAddressComputationCost(Type *Tp, ScalarEvolution *SE,
                                                const SCEV *Ptr) const {
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -36,6 +36,7 @@
 #define DEBUG_TYPE "vector-combine"
 
 STATISTIC(NumVecLoad, "Number of vector loads formed");
+STATISTIC(NumVecLoadWiden, "Number of vector loads widened");
 STATISTIC(NumVecCmp, "Number of vector compares formed");
 STATISTIC(NumVecBO, "Number of vector binops formed");
 STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
@@ -75,6 +76,7 @@
   AssumptionCache &AC;
 
   bool vectorizeLoadInsert(Instruction &I);
+  bool widenPartialVectorLoad(Instruction &I);
   ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
                                         ExtractElementInst *Ext1,
                                         unsigned PreferredExtractIndex) const;
@@ -231,6 +233,88 @@
   return true;
 }
 
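+// Example of the rewrite performed by widenPartialVectorLoad() below. This is
+// only an illustration: the legalized width is target-dependent, and the
+// sketch assumes a target where <3 x float> legalizes into a single
+// <4 x float> register and %p is known dereferenceable out to the widened
+// width (the value names are illustrative).
+//
+//   %r = load <3 x float>, <3 x float>* %p, align 16
+//
+// is rewritten into
+//
+//   %wide.ptr = bitcast <3 x float>* %p to <4 x float>*
+//   %wide = load <4 x float>, <4 x float>* %wide.ptr, align 16
+//   %r = shufflevector <4 x float> %wide, <4 x float> poison,
+//                      <3 x i32> <i32 0, i32 1, i32 2>
+//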
+bool VectorCombine::widenPartialVectorLoad(Instruction &I) {
+  const DataLayout &DL = I.getModule()->getDataLayout();
+
+  auto *Load = dyn_cast<LoadInst>(&I);
+  if (!Load)
+    return false;
+
+  Value *OrigPtr = Load->getPointerOperand();
+  Align Alignment = Load->getAlign();
+  unsigned AS = Load->getPointerAddressSpace();
+
+  // What vector type do we currently load?
+  auto *OrigVecTy = dyn_cast<FixedVectorType>(Load->getType());
+  if (!OrigVecTy)
+    return false;
+
+  Type *ScalarEltTy = OrigVecTy->getScalarType();
+  unsigned OrigNumElts = OrigVecTy->getNumElements();
+  unsigned NumBitsPerElt = DL.getTypeSizeInBits(ScalarEltTy);
+
+  // How will that type be legalized? I.e. into what vector register
+  // will it be loaded, and how many registers will be occupied?
+  auto *LegalizedPartVecTy =
+      dyn_cast_or_null<FixedVectorType>(TTI.getLegalizedPartType(OrigVecTy));
+  unsigned NumOfLegalizedVecParts = TTI.getNumberOfParts(OrigVecTy);
+
+  // If it doesn't legalize into (a number of) vector registers, don't bother.
+  if (!LegalizedPartVecTy || !NumOfLegalizedVecParts)
+    return false;
+
+  unsigned OrigBitCount = DL.getTypeSizeInBits(OrigVecTy);
+  unsigned LegalizedVecBitCount =
+      NumOfLegalizedVecParts * DL.getTypeSizeInBits(LegalizedPartVecTy);
+  assert(LegalizedVecBitCount >= OrigBitCount &&
+         "Number of bits-to-be-loaded shouldn't decrease!");
+
+  // Do we already load exactly the full legalized width?
+  if (OrigBitCount == LegalizedVecBitCount)
+    return false;
+
+  // How many more elements would we need to load?
+  unsigned NumExtraBits = LegalizedVecBitCount - OrigBitCount;
+  if (NumExtraBits % NumBitsPerElt != 0)
+    return false; // Not a multiple of the element size.
+  // FIXME: might be able to handle some cases if they are a multiple of a byte.
+
+  unsigned NumExtraElts = NumExtraBits / NumBitsPerElt;
+
+  auto *WideVecTy =
+      FixedVectorType::get(ScalarEltTy, OrigNumElts + NumExtraElts);
+  assert(DL.getTypeSizeInBits(WideVecTy) == LegalizedVecBitCount &&
+         "Failed to properly widen OrigVecTy to match the total legalized "
+         "vector size?");
+
+  // Okay, we currently load less than the full width of the legalized vectors.
+  // If we'd widen the load, would that be more costly than the current load?
+  InstructionCost OldLoadCost =
+      TTI.getMemoryOpCost(Instruction::Load, OrigVecTy, Alignment, AS);
+  InstructionCost NewLoadCost =
+      TTI.getMemoryOpCost(Instruction::Load, WideVecTy, Alignment, AS);
+  if (NewLoadCost > OldLoadCost)
+    return false;
+
+  // It would not be more costly. But can we perform such a wide load?
+  if (!isSafeToLoadUnconditionally(OrigPtr, WideVecTy, Align(1), DL, Load, &DT,
+                                   /*TLI=*/nullptr))
+    return false;
+
+  IRBuilder<> Builder(Load);
+  Value *CastedPtr =
+      Builder.CreateBitCast(OrigPtr, WideVecTy->getPointerTo(AS));
+  Value *WideVecLd = Builder.CreateAlignedLoad(WideVecTy, CastedPtr, Alignment);
+  // We loaded some extra elements; we only need the low OrigNumElts ones.
+  // This is endianness-insensitive.
+  SmallVector<int> Mask(OrigNumElts);
+  std::iota(Mask.begin(), Mask.end(), 0);
+  Value *ExtractedLowSubvector = Builder.CreateShuffleVector(WideVecLd, Mask);
+  replaceValue(I, *ExtractedLowSubvector);
+  ++NumVecLoadWiden;
+  return true;
+}
+
 /// Determine which, if any, of the inputs should be replaced by a shuffle
 /// followed by extract from a different index.
 ExtractElementInst *VectorCombine::getShuffleExtract(
@@ -967,6 +1051,7 @@
       continue;
     Builder.SetInsertPoint(&I);
     MadeChange |= vectorizeLoadInsert(I);
+    MadeChange |= widenPartialVectorLoad(I);
    MadeChange |= foldExtractExtract(I);
    MadeChange |= foldBitcastShuf(I);
    MadeChange |= scalarizeBinopOrCmp(I);
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
--- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
@@ -587,8 +587,10 @@
 define <4 x float> @load_v2f32_extract_insert_v4f32(<2 x float>* align 16 dereferenceable(16) %p) nofree nosync {
 ; CHECK-LABEL: @load_v2f32_extract_insert_v4f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float>* [[P:%.*]] to <4 x float>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[L:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[S:%.*]] = extractelement <2 x float> [[L]], i32 0
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> poison, float [[S]], i32 0
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %l = load <2 x float>, <2 x float>* %p, align 4
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
--- a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
@@ -29,7 +29,9 @@
 
 define <2 x float> @vec_with_2elts(<2 x float>* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @vec_with_2elts(
-; CHECK-NEXT:    [[R:%.*]] = load <2 x float>, <2 x float>* [[P:%.*]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float>* [[P:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    ret <2 x float> [[R]]
 ;
   %r = load <2 x float>, <2 x float>* %p, align 16
@@ -38,7 +40,9 @@
 
 define <3 x float> @vec_with_3elts(<3 x float>* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @vec_with_3elts(
-; CHECK-NEXT:    [[R:%.*]] = load <3 x float>, <3 x float>* [[P:%.*]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <3 x float>* [[P:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
 ; CHECK-NEXT:    ret <3 x float> [[R]]
 ;
   %r = load <3 x float>, <3 x float>* %p, align 16
@@ -70,7 +74,9 @@
 ; We can load 128 bits, and the fact that it's underaligned isn't relevant.
 define <3 x float> @vec_with_3elts_underaligned(<3 x float>* align 8 dereferenceable(16) %p) {
 ; CHECK-LABEL: @vec_with_3elts_underaligned(
-; CHECK-NEXT:    [[R:%.*]] = load <3 x float>, <3 x float>* [[P:%.*]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <3 x float>* [[P:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 8
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
 ; CHECK-NEXT:    ret <3 x float> [[R]]
 ;
   %r = load <3 x float>, <3 x float>* %p, align 8
@@ -112,7 +118,9 @@
 
 define <2 x float> @vec_with_2elts_256bits(<2 x float>* align 32 dereferenceable(32) %p) {
 ; CHECK-LABEL: @vec_with_2elts_256bits(
-; CHECK-NEXT:    [[R:%.*]] = load <2 x float>, <2 x float>* [[P:%.*]], align 32
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float>* [[P:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 32
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    ret <2 x float> [[R]]
 ;
   %r = load <2 x float>, <2 x float>* %p, align 32
@@ -121,7 +129,9 @@
 
 define <3 x float> @vec_with_3elts_256bits(<3 x float>* align 32 dereferenceable(32) %p) {
 ; CHECK-LABEL: @vec_with_3elts_256bits(
-; CHECK-NEXT:    [[R:%.*]] = load <3 x float>, <3 x float>* [[P:%.*]], align 32
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <3 x float>* [[P:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 32
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
 ; CHECK-NEXT:    ret <3 x float> [[R]]
 ;
   %r = load <3 x float>, <3 x float>* %p, align 32
@@ -139,7 +149,9 @@
 
 define <5 x float> @vec_with_5elts_256bits(<5 x float>* align 32 dereferenceable(32) %p) {
 ; CHECK-LABEL: @vec_with_5elts_256bits(
-; CHECK-NEXT:    [[R:%.*]] = load <5 x float>, <5 x float>* [[P:%.*]], align 32
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <5 x float>* [[P:%.*]] to <8 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* [[TMP1]], align 32
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4>
 ; CHECK-NEXT:    ret <5 x float> [[R]]
 ;
   %r = load <5 x float>, <5 x float>* %p, align 32
@@ -148,7 +160,9 @@
 
 define <6 x float> @vec_with_6elts_256bits(<6 x float>* align 32 dereferenceable(32) %p) {
 ; CHECK-LABEL: @vec_with_6elts_256bits(
-; CHECK-NEXT:    [[R:%.*]] = load <6 x float>, <6 x float>* [[P:%.*]], align 32
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <6 x float>* [[P:%.*]] to <8 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* [[TMP1]], align 32
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
 ; CHECK-NEXT:    ret <6 x float> [[R]]
 ;
   %r = load <6 x float>, <6 x float>* %p, align 32
@@ -157,7 +171,9 @@
 
 define <7 x float> @vec_with_7elts_256bits(<7 x float>* align 32 dereferenceable(32) %p) {
 ; CHECK-LABEL: @vec_with_7elts_256bits(
-; CHECK-NEXT:    [[R:%.*]] = load <7 x float>, <7 x float>* [[P:%.*]], align 32
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <7 x float>* [[P:%.*]] to <8 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* [[TMP1]], align 32
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> poison, <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6>
 ; CHECK-NEXT:    ret <7 x float> [[R]]
 ;
   %r = load <7 x float>, <7 x float>* %p, align 32
@@ -187,6 +203,7 @@
 ;-------------------------------------------------------------------------------
 ; Weird types we don't deal with
+
 define <2 x i7> @vec_with_two_subbyte_elts(<2 x i7>* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @vec_with_two_subbyte_elts(
 ; CHECK-NEXT:    [[R:%.*]] = load <2 x i7>, <2 x i7>* [[P:%.*]], align 16
@@ -216,7 +233,9 @@
 
 define <2 x float> @vec_with_2elts_addressspace(<2 x float> addrspace(2)* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @vec_with_2elts_addressspace(
-; CHECK-NEXT:    [[R:%.*]] = load <2 x float>, <2 x float> addrspace(2)* [[P:%.*]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> addrspace(2)* [[P:%.*]] to <4 x float> addrspace(2)*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float> addrspace(2)* [[TMP1]], align 16
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    ret <2 x float> [[R]]
 ;
   %r = load <2 x float>, <2 x float> addrspace(2)* %p, align 16
@@ -225,11 +244,13 @@
 ;-------------------------------------------------------------------------------
-; Widening these would change the legalized type, so leave them alone.
+; Weird types we do deal with
 
 define <2 x i1> @vec_with_2elts_128bits_i1(<2 x i1>* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @vec_with_2elts_128bits_i1(
-; CHECK-NEXT:    [[R:%.*]] = load <2 x i1>, <2 x i1>* [[P:%.*]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i1>* [[P:%.*]] to <128 x i1>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <128 x i1>, <128 x i1>* [[TMP1]], align 16
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <128 x i1> [[TMP2]], <128 x i1> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    ret <2 x i1> [[R]]
 ;
   %r = load <2 x i1>, <2 x i1>* %p, align 16
@@ -237,7 +258,9 @@
 }
 
 define <2 x i2> @vec_with_2elts_128bits_i2(<2 x i2>* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @vec_with_2elts_128bits_i2(
-; CHECK-NEXT:    [[R:%.*]] = load <2 x i2>, <2 x i2>* [[P:%.*]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i2>* [[P:%.*]] to <64 x i2>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <64 x i2>, <64 x i2>* [[TMP1]], align 16
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <64 x i2> [[TMP2]], <64 x i2> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    ret <2 x i2> [[R]]
 ;
   %r = load <2 x i2>, <2 x i2>* %p, align 16
@@ -245,7 +268,9 @@
 }
 
 define <2 x i4> @vec_with_2elts_128bits_i4(<2 x i4>* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @vec_with_2elts_128bits_i4(
-; CHECK-NEXT:    [[R:%.*]] = load <2 x i4>, <2 x i4>* [[P:%.*]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i4>* [[P:%.*]] to <32 x i4>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i4>, <32 x i4>* [[TMP1]], align 16
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <32 x i4> [[TMP2]], <32 x i4> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    ret <2 x i4> [[R]]
 ;
   %r = load <2 x i4>, <2 x i4>* %p, align 16
diff --git a/llvm/test/Transforms/VectorCombine/X86/load.ll b/llvm/test/Transforms/VectorCombine/X86/load.ll
--- a/llvm/test/Transforms/VectorCombine/X86/load.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load.ll
@@ -587,8 +587,10 @@
 define <4 x float> @load_v2f32_extract_insert_v4f32(<2 x float>* align 16 dereferenceable(16) %p) nofree nosync {
 ; CHECK-LABEL: @load_v2f32_extract_insert_v4f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float>* [[P:%.*]] to <4 x float>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[L:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[S:%.*]] = extractelement <2 x float> [[L]], i32 0
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %l = load <2 x float>, <2 x float>* %p, align 4