Index: llvm/lib/Transforms/Vectorize/VectorCombine.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -93,6 +93,7 @@
 
 bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   // Match insert into fixed vector of scalar value.
+  // TODO: Handle non-zero insert index.
   auto *Ty = dyn_cast<FixedVectorType>(I.getType());
   Value *Scalar;
   if (!Ty || !match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())) ||
@@ -115,7 +116,6 @@
       mustSuppressSpeculation(*Load))
     return false;
 
-  // TODO: Extend this to match GEP with constant offsets.
   const DataLayout &DL = I.getModule()->getDataLayout();
   Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
   assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
@@ -137,8 +137,25 @@
   unsigned MinVecNumElts = MinVectorSize / ScalarSize;
   auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
   Align Alignment = Load->getAlign();
-  if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Alignment, DL, Load, &DT))
-    return false;
+  unsigned OffsetInBits = 0;
+  if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Alignment, DL, Load,
+                                   &DT)) {
+    // It is not safe to load from the pointer, but we can still peek through
+    // gep offsets to see if it is safe to load from a base address.
+    // If it is, we can shuffle the element(s) into place after loading.
+    unsigned OffsetBitWidth = DL.getIndexTypeSizeInBits(SrcPtr->getType());
+    APInt Offset(OffsetBitWidth, 0);
+    SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(DL, Offset);
+    if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Alignment, DL, Load,
+                                     &DT))
+      return false;
+
+    // The offset must be within a vector-length to allow shuffling into place.
+    if (Offset.uge(MinVectorSize / 8))
+      return false;
+
+    OffsetInBits = Offset.getZExtValue() * 8;
+  }
 
   // Original pattern: insertelt undef, load [free casts of] PtrOp, 0
   Type *LoadTy = Load->getType();
@@ -149,6 +166,9 @@
 
   // New pattern: load VecPtr
   int NewCost = TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS);
+  // Optionally, we are shuffling the loaded vector element(s) into place.
+  if (OffsetInBits)
+    NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy);
 
   // We can aggressively convert to the vector form because the backend can
   // invert this transform if it does not result in a performance win.
@@ -160,6 +180,13 @@
   IRBuilder<> Builder(Load);
   Value *CastedPtr = Builder.CreateBitCast(SrcPtr, MinVecTy->getPointerTo(AS));
   Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
+  if (OffsetInBits) {
+    // If we peeked through a gep to a base pointer, translate the address
+    // offset into a vector element and shift it over to element 0.
+    SmallVector<int> Mask(MinVecNumElts, UndefMaskElem);
+    Mask[0] = OffsetInBits / ScalarSize;
+    VecLd = Builder.CreateShuffleVector(VecLd, Mask);
+  }
 
   // If the insert type does not match the target's minimum vector type,
   // use an identity shuffle to shrink/grow the vector.
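For illustration, here is a small hand-written LLVM IR sketch of the transform that the VectorCombine change above enables; the function name, types, and cost-model outcome are assumptions for this example and are not taken from the patch. A scalar load through an inbounds gep with a constant offset is widened to a vector load of the stripped base pointer, and the addressed element is shuffled into lane 0.

; Before: scalar load of element 1, then insert into lane 0 of a new vector.
define <4 x i32> @load_gep_example(<4 x i32>* align 16 dereferenceable(16) %p) {
  %gep = getelementptr inbounds <4 x i32>, <4 x i32>* %p, i64 0, i64 1
  %s = load i32, i32* %gep, align 4
  %r = insertelement <4 x i32> undef, i32 %s, i64 0
  ret <4 x i32> %r
}

; After -vector-combine (assuming the cost model accepts the extra shuffle):
; the whole 128-bit vector is loaded from the dereferenceable base pointer and
; element 1 is shuffled into lane 0; the gep and the scalar load disappear.
define <4 x i32> @load_gep_example(<4 x i32>* align 16 dereferenceable(16) %p) {
  %v = load <4 x i32>, <4 x i32>* %p, align 4
  %r = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  ret <4 x i32> %r
}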
Index: llvm/test/Transforms/VectorCombine/X86/load.ll
===================================================================
--- llvm/test/Transforms/VectorCombine/X86/load.ll
+++ llvm/test/Transforms/VectorCombine/X86/load.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s
-; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s
+; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE2
+; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX2
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
@@ -262,14 +262,19 @@
   ret <8 x i16> %r
 }
 
-; Negative test - can't safely load the offset vector, but could load+shuffle.
+; Can't safely load the offset vector, but can load+shuffle if it is profitable.
 
 define <8 x i16> @gep01_load_i16_insert_v8i16_deref(<8 x i16>* align 16 dereferenceable(17) %p) {
-; CHECK-LABEL: @gep01_load_i16_insert_v8i16_deref(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 2
-; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
-; CHECK-NEXT:    ret <8 x i16> [[R]]
+; SSE2-LABEL: @gep01_load_i16_insert_v8i16_deref(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 2
+; SSE2-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
+; SSE2-NEXT:    ret <8 x i16> [[R]]
+;
+; AVX2-LABEL: @gep01_load_i16_insert_v8i16_deref(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[P:%.*]], align 2
+; AVX2-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:    ret <8 x i16> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1
   %s = load i16, i16* %gep, align 2
@@ -536,15 +541,21 @@
   ret <8 x i32> %r
 }
 
-; TODO: Can't safely load the offset vector, but can load+shuffle if it is profitable.
+; Can't safely load the offset vector, but can load+shuffle if it is profitable.
 
 define <8 x i16> @gep1_load_v2i16_extract_insert_v8i16(<2 x i16>* align 16 dereferenceable(16) %p) {
-; CHECK-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <2 x i16>, <2 x i16>* [[P:%.*]], i64 1
-; CHECK-NEXT:    [[L:%.*]] = load <2 x i16>, <2 x i16>* [[GEP]], align 2
-; CHECK-NEXT:    [[S:%.*]] = extractelement <2 x i16> [[L]], i32 0
-; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
-; CHECK-NEXT:    ret <8 x i16> [[R]]
+; SSE2-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <2 x i16>, <2 x i16>* [[P:%.*]], i64 1
+; SSE2-NEXT:    [[L:%.*]] = load <2 x i16>, <2 x i16>* [[GEP]], align 2
+; SSE2-NEXT:    [[S:%.*]] = extractelement <2 x i16> [[L]], i32 0
+; SSE2-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
+; SSE2-NEXT:    ret <8 x i16> [[R]]
+;
+; AVX2-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
+; AVX2-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16>* [[P:%.*]] to <8 x i16>*
+; AVX2-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 2
+; AVX2-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> undef, <8 x i32> <i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:    ret <8 x i16> [[R]]
 ;
   %gep = getelementptr inbounds <2 x i16>, <2 x i16>* %p, i64 1
   %l = load <2 x i16>, <2 x i16>* %gep, align 2