diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -86,6 +86,7 @@
   InstructionWorklist Worklist;
 
   bool vectorizeLoadInsert(Instruction &I);
+  bool widenSubvectorLoad(Instruction &I);
   ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
                                         ExtractElementInst *Ext1,
                                         unsigned PreferredExtractIndex) const;
@@ -265,6 +266,66 @@
   return true;
 }
 
+/// If we are loading a vector and then inserting it into a larger vector with
+/// undefined elements, try to load the larger vector and eliminate the insert.
+/// This removes a shuffle in IR and may allow combining of other loaded values.
+bool VectorCombine::widenSubvectorLoad(Instruction &I) {
+  // Match subvector insert of fixed vector.
+  auto *Ty = dyn_cast<FixedVectorType>(I.getType());
+  auto *Shuf = dyn_cast<ShuffleVectorInst>(&I);
+  if (!Ty || !Shuf || !Shuf->isIdentityWithPadding())
+    return false;
+
+  // Allow a non-canonical shuffle mask that is choosing elements from op1.
+  unsigned NumOpElts =
+      cast<FixedVectorType>(Shuf->getOperand(0)->getType())->getNumElements();
+  unsigned OpIndex = any_of(Shuf->getShuffleMask(), [&NumOpElts](int M) {
+    return M >= (int)(NumOpElts);
+  });
+
+  auto *Load = dyn_cast<LoadInst>(Shuf->getOperand(OpIndex));
+  if (!canWidenLoad(Load, TTI))
+    return false;
+
+  // We use minimal alignment (maximum flexibility) because we only care about
+  // the dereferenceable region. When calculating cost and creating a new op,
+  // we may use a larger value based on alignment attributes.
+  const DataLayout &DL = I.getModule()->getDataLayout();
+  Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
+  assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
+  Align Alignment = Load->getAlign();
+  if (!isSafeToLoadUnconditionally(SrcPtr, Ty, Align(1), DL, Load, &AC, &DT))
+    return false;
+
+  Alignment = std::max(SrcPtr->getPointerAlignment(DL), Alignment);
+  Type *LoadTy = Load->getType();
+  unsigned AS = Load->getPointerAddressSpace();
+
+  // Original pattern: insert_subvector (load PtrOp)
+  // This conservatively assumes that the cost of a subvector insert into an
+  // undef value is 0. We could add that cost if the cost model accurately
+  // reflects the real cost of that operation.
+  InstructionCost OldCost =
+      TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS);
+
+  // New pattern: load PtrOp
+  InstructionCost NewCost =
+      TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS);
+
+  // We can aggressively convert to the vector form because the backend can
+  // invert this transform if it does not result in a performance win.
+  if (OldCost < NewCost || !NewCost.isValid())
+    return false;
+
+  IRBuilder<> Builder(Load);
+  Value *CastedPtr =
+      Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Ty->getPointerTo(AS));
+  Value *VecLd = Builder.CreateAlignedLoad(Ty, CastedPtr, Alignment);
+  replaceValue(I, *VecLd);
+  ++NumVecLoad;
+  return true;
+}
+
 /// Determine which, if any, of the inputs should be replaced by a shuffle
 /// followed by extract from a different index.
 ExtractElementInst *VectorCombine::getShuffleExtract(
@@ -1646,6 +1707,7 @@
     Builder.SetInsertPoint(&I);
     if (!ScalarizationOnly) {
       MadeChange |= vectorizeLoadInsert(I);
+      MadeChange |= widenSubvectorLoad(I);
       MadeChange |= foldExtractExtract(I);
      MadeChange |= foldInsExtFNeg(I);
       MadeChange |= foldBitcastShuf(I);
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
--- a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 --data-layout="e" | FileCheck %s --check-prefixes=CHECK
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 --data-layout="e" | FileCheck %s --check-prefixes=CHECK
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 --data-layout="E" | FileCheck %s --check-prefixes=CHECK
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 --data-layout="E" | FileCheck %s --check-prefixes=CHECK
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 --data-layout="e" | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 --data-layout="e" | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 --data-layout="E" | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 --data-layout="E" | FileCheck %s --check-prefixes=CHECK,AVX
 
 ;-------------------------------------------------------------------------------
 ; Here we know we can load 128 bits as per dereferenceability and alignment.
@@ -252,10 +252,11 @@
   ret <2 x i4> %r
 }
 
+; Load the 128-bit vector because there is no additional cost.
+
 define <4 x float> @load_v1f32_v4f32(ptr dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_v1f32_v4f32(
-; CHECK-NEXT:    [[L:%.*]] = load <1 x float>, ptr [[P:%.*]], align 16
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <1 x float> [[L]], <1 x float> poison, <4 x i32>
+; CHECK-NEXT:    [[S:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16
 ; CHECK-NEXT:    ret <4 x float> [[S]]
 ;
   %l = load <1 x float>, ptr %p, align 16
@@ -263,10 +264,12 @@
   ret <4 x float> %s
 }
 
+; Load the 128-bit vector because there is no additional cost.
+; Alignment is taken from param attr.
+
 define <4 x float> @load_v2f32_v4f32(ptr align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_v2f32_v4f32(
-; CHECK-NEXT:    [[L:%.*]] = load <2 x float>, ptr [[P:%.*]], align 1
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <2 x float> [[L]], <2 x float> poison, <4 x i32>
+; CHECK-NEXT:    [[S:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16
 ; CHECK-NEXT:    ret <4 x float> [[S]]
 ;
   %l = load <2 x float>, ptr %p, align 1
@@ -274,10 +277,11 @@
   ret <4 x float> %s
 }
 
+; Load the 128-bit vector because there is no additional cost.
+
 define <4 x float> @load_v3f32_v4f32(ptr dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_v3f32_v4f32(
-; CHECK-NEXT:    [[L:%.*]] = load <3 x float>, ptr [[P:%.*]], align 1
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <3 x float> [[L]], <3 x float> poison, <4 x i32>
+; CHECK-NEXT:    [[S:%.*]] = load <4 x float>, ptr [[P:%.*]], align 1
 ; CHECK-NEXT:    ret <4 x float> [[S]]
 ;
   %l = load <3 x float>, ptr %p, align 1
@@ -285,6 +289,8 @@
   ret <4 x float> %s
 }
 
+; Negative test - the shuffle must be a simple subvector insert.
+
 define <4 x float> @load_v3f32_v4f32_wrong_mask(ptr dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_v3f32_v4f32_wrong_mask(
 ; CHECK-NEXT:    [[L:%.*]] = load <3 x float>, ptr [[P:%.*]], align 1
@@ -296,6 +302,8 @@
   ret <4 x float> %s
 }
 
+; Negative test - must be dereferenceable to vector width.
+
 define <4 x float> @load_v3f32_v4f32_not_deref(ptr dereferenceable(15) %p) {
 ; CHECK-LABEL: @load_v3f32_v4f32_not_deref(
 ; CHECK-NEXT:    [[L:%.*]] = load <3 x float>, ptr [[P:%.*]], align 16
@@ -307,21 +315,28 @@
   ret <4 x float> %s
 }
 
+; Without AVX, the cost of loading 256-bits would be greater.
+
 define <8 x float> @load_v2f32_v8f32(ptr dereferenceable(32) %p) {
-; CHECK-LABEL: @load_v2f32_v8f32(
-; CHECK-NEXT:    [[L:%.*]] = load <2 x float>, ptr [[P:%.*]], align 1
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <2 x float> [[L]], <2 x float> poison, <8 x i32>
-; CHECK-NEXT:    ret <8 x float> [[S]]
+; SSE-LABEL: @load_v2f32_v8f32(
+; SSE-NEXT:    [[L:%.*]] = load <2 x float>, ptr [[P:%.*]], align 1
+; SSE-NEXT:    [[S:%.*]] = shufflevector <2 x float> [[L]], <2 x float> poison, <8 x i32>
+; SSE-NEXT:    ret <8 x float> [[S]]
+;
+; AVX-LABEL: @load_v2f32_v8f32(
+; AVX-NEXT:    [[S:%.*]] = load <8 x float>, ptr [[P:%.*]], align 1
+; AVX-NEXT:    ret <8 x float> [[S]]
 ;
   %l = load <2 x float>, ptr %p, align 1
   %s = shufflevector <2 x float> %l, <2 x float> poison, <8 x i32>
   ret <8 x float> %s
 }
 
+; Integer type is ok too.
+
 define <4 x i32> @load_v2i32_v4i32(ptr dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_v2i32_v4i32(
-; CHECK-NEXT:    [[L:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 1
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <2 x i32> [[L]], <2 x i32> poison, <4 x i32>
+; CHECK-NEXT:    [[S:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 1
 ; CHECK-NEXT:    ret <4 x i32> [[S]]
 ;
   %l = load <2 x i32>, ptr %p, align 1
@@ -329,6 +344,8 @@
   ret <4 x i32> %s
 }
 
+; TODO: We assumed the shuffle mask is canonical.
+
 define <4 x i32> @load_v2i32_v4i32_non_canonical_mask(ptr dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_v2i32_v4i32_non_canonical_mask(
 ; CHECK-NEXT:    [[L:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 1
@@ -340,10 +357,11 @@
   ret <4 x i32> %s
 }
 
+; Allow non-canonical commuted shuffle.
+
 define <4 x i32> @load_v2i32_v4i32_non_canonical_mask_commute(ptr dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_v2i32_v4i32_non_canonical_mask_commute(
-; CHECK-NEXT:    [[L:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 1
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <2 x i32> poison, <2 x i32> [[L]], <4 x i32>
+; CHECK-NEXT:    [[S:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 1
 ; CHECK-NEXT:    ret <4 x i32> [[S]]
 ;
   %l = load <2 x i32>, ptr %p, align 1
@@ -351,11 +369,12 @@
   ret <4 x i32> %s
 }
 
+; The wide load must be in the same addrspace as the original load.
+
 define <4 x i32> @load_v2i32_v4i32_addrspacecast(ptr addrspace(5) align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_v2i32_v4i32_addrspacecast(
-; CHECK-NEXT:    [[ASC:%.*]] = addrspacecast ptr addrspace(5) [[P:%.*]] to ptr addrspace(42)
-; CHECK-NEXT:    [[L:%.*]] = load <2 x i32>, ptr addrspace(42) [[ASC]], align 4
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <2 x i32> [[L]], <2 x i32> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[P:%.*]] to ptr addrspace(42)
+; CHECK-NEXT:    [[S:%.*]] = load <4 x i32>, ptr addrspace(42) [[TMP1]], align 16
 ; CHECK-NEXT:    ret <4 x i32> [[S]]
 ;
   %asc = addrspacecast ptr addrspace(5) %p to ptr addrspace(42)
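
For reference, and not part of the patch itself: a minimal IR sketch of the transform that widenSubvectorLoad performs, assuming a pointer that is dereferenceable to the full vector width. The function name @widen_example and the exact mask padding (undef vs. poison) are illustrative, not taken from the test file.

; Before vector-combine: a narrow load feeding an identity-with-padding shuffle.
define <4 x float> @widen_example(ptr dereferenceable(16) %p) {
  %l = load <2 x float>, ptr %p, align 1
  %s = shufflevector <2 x float> %l, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ret <4 x float> %s
}
; After the transform, both instructions are expected to collapse into a single
; wide load of the full vector, e.g.:
;   %s = load <4 x float>, ptr %p, align 1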