diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -100,36 +100,36 @@
   Type *ScalarTy = Scalar->getType();
   if (!Load || !Load->isSimple())
     return false;
+  auto *Ty = dyn_cast<FixedVectorType>(I.getType());
+  if (!Ty)
+    return false;
 
   // TODO: Extend this to match GEP with constant offsets.
   Value *PtrOp = Load->getPointerOperand()->stripPointerCasts();
   assert(isa<PointerType>(PtrOp->getType()) && "Expected a pointer type");
-  unsigned VectorSize = TTI.getMinVectorRegisterBitWidth();
+  unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
   uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
-  if (!ScalarSize || !VectorSize || VectorSize % ScalarSize != 0)
+  if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0)
     return false;
 
   // Check safety of replacing the scalar load with a larger vector load.
-  unsigned VecNumElts = VectorSize / ScalarSize;
-  auto *VectorTy = VectorType::get(ScalarTy, VecNumElts, false);
-  // TODO: Allow insert/extract subvector if the type does not match.
-  if (VectorTy != I.getType())
-    return false;
+  unsigned MinVecNumElts = MinVectorSize / ScalarSize;
+  auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
 
   Align Alignment = Load->getAlign();
   const DataLayout &DL = I.getModule()->getDataLayout();
-  if (!isSafeToLoadUnconditionally(PtrOp, VectorTy, Alignment, DL, Load, &DT))
+  if (!isSafeToLoadUnconditionally(PtrOp, MinVecTy, Alignment, DL, Load, &DT))
     return false;
 
   unsigned AS = Load->getPointerAddressSpace();
 
   // Original pattern: insertelt undef, load [free casts of] ScalarPtr, 0
   int OldCost = TTI.getMemoryOpCost(Instruction::Load, ScalarTy, Alignment, AS);
-  APInt DemandedElts = APInt::getOneBitSet(VecNumElts, 0);
-  OldCost += TTI.getScalarizationOverhead(VectorTy, DemandedElts, true, false);
+  APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
+  OldCost += TTI.getScalarizationOverhead(MinVecTy, DemandedElts, true, false);
 
   // New pattern: load VecPtr
-  int NewCost = TTI.getMemoryOpCost(Instruction::Load, VectorTy, Alignment, AS);
+  int NewCost = TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS);
 
   // We can aggressively convert to the vector form because the backend can
   // invert this transform if it does not result in a performance win.
@@ -139,8 +139,18 @@
   // It is safe and potentially profitable to load a vector directly:
   // inselt undef, load Scalar, 0 --> load VecPtr
   IRBuilder<> Builder(Load);
-  Value *CastedPtr = Builder.CreateBitCast(PtrOp, VectorTy->getPointerTo(AS));
-  LoadInst *VecLd = Builder.CreateAlignedLoad(VectorTy, CastedPtr, Alignment);
+  Value *CastedPtr = Builder.CreateBitCast(PtrOp, MinVecTy->getPointerTo(AS));
+  Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
+
+  // If the insert type does not match the target's minimum vector type,
+  // use an identity shuffle to shrink/grow the vector.
+  if (Ty != MinVecTy) {
+    unsigned OutputNumElts = Ty->getNumElements();
+    SmallVector<int, 16> Mask(OutputNumElts, UndefMaskElem);
+    for (unsigned i = 0; i < OutputNumElts && i < MinVecNumElts; ++i)
+      Mask[i] = i;
+    VecLd = Builder.CreateShuffleVector(VecLd, UndefValue::get(MinVecTy), Mask);
+  }
   replaceValue(I, *VecLd);
   ++NumVecLoad;
   return true;
diff --git a/llvm/test/Transforms/VectorCombine/X86/load.ll b/llvm/test/Transforms/VectorCombine/X86/load.ll
--- a/llvm/test/Transforms/VectorCombine/X86/load.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load.ll
@@ -346,12 +346,11 @@
   ret <4 x float> %r
 }
 
-; TODO: Should load v4i32.
-
 define <8 x i32> @load_i32_insert_v8i32(i32* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_i32_insert_v8i32(
-; CHECK-NEXT:    [[S:%.*]] = load i32, i32* [[P:%.*]], align 4
-; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i32> undef, i32 [[S]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <8 x i32> [[R]]
 ;
   %s = load i32, i32* %p, align 4
@@ -359,13 +358,10 @@
   ret <8 x i32> %r
 }
 
-; TODO: Should load v4i32.
-
 define <8 x i32> @casted_load_i32_insert_v8i32(<4 x i32>* align 4 dereferenceable(16) %p) {
 ; CHECK-LABEL: @casted_load_i32_insert_v8i32(
-; CHECK-NEXT:    [[B:%.*]] = bitcast <4 x i32>* [[P:%.*]] to i32*
-; CHECK-NEXT:    [[S:%.*]] = load i32, i32* [[B]], align 4
-; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i32> undef, i32 [[S]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[P:%.*]], align 4
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <8 x i32> [[R]]
 ;
   %b = bitcast <4 x i32>* %p to i32*
@@ -374,12 +370,11 @@
   ret <8 x i32> %r
 }
 
-; TODO: Should load v4f32.
-
 define <16 x float> @load_f32_insert_v16f32(float* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_f32_insert_v16f32(
-; CHECK-NEXT:    [[S:%.*]] = load float, float* [[P:%.*]], align 4
-; CHECK-NEXT:    [[R:%.*]] = insertelement <16 x float> undef, float [[S]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <16 x float> [[R]]
 ;
   %s = load float, float* %p, align 4
@@ -387,12 +382,11 @@
   ret <16 x float> %r
 }
 
-; TODO: Should load v4f32.
-
 define <2 x float> @load_f32_insert_v2f32(float* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_f32_insert_v2f32(
-; CHECK-NEXT:    [[S:%.*]] = load float, float* [[P:%.*]], align 4
-; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x float> undef, float [[S]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    ret <2 x float> [[R]]
 ;
   %s = load float, float* %p, align 4