Index: llvm/lib/Transforms/Vectorize/VectorCombine.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -29,6 +29,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Vectorize.h" +#include <numeric> using namespace llvm; using namespace llvm::PatternMatch; @@ -100,6 +101,9 @@ Type *ScalarTy = Scalar->getType(); if (!Load || !Load->isSimple()) return false; + auto *Ty = dyn_cast<FixedVectorType>(I.getType()); + if (!Ty) + return false; // TODO: Extend this to match GEP with constant offsets. Value *PtrOp = Load->getPointerOperand()->stripPointerCasts(); @@ -113,9 +117,6 @@ // Check safety of replacing the scalar load with a larger vector load. unsigned VecNumElts = VectorSize / ScalarSize; auto *VectorTy = VectorType::get(ScalarTy, VecNumElts, false); - // TODO: Allow insert/extract subvector if the type does not match. - if (VectorTy != I.getType()) - return false; Align Alignment = Load->getAlign(); const DataLayout &DL = I.getModule()->getDataLayout(); if (!isSafeToLoadUnconditionally(PtrOp, VectorTy, Alignment, DL, Load, &DT)) @@ -140,7 +141,15 @@ // inselt undef, load Scalar, 0 --> load VecPtr IRBuilder<> Builder(Load); Value *CastedPtr = Builder.CreateBitCast(PtrOp, VectorTy->getPointerTo(AS)); - LoadInst *VecLd = Builder.CreateAlignedLoad(VectorTy, CastedPtr, Alignment); + Value *VecLd = Builder.CreateAlignedLoad(VectorTy, CastedPtr, Alignment); + + // If the insert type does not match the target's minimum vector type, + // use an identity shuffle to shrink/grow the vector. 
+ if (Ty != VectorTy) { + SmallVector<int, 16> Mask(Ty->getNumElements()); + std::iota(Mask.begin(), Mask.end(), 0); + VecLd = Builder.CreateShuffleVector(VecLd, UndefValue::get(VectorTy), Mask); + } replaceValue(I, *VecLd); ++NumVecLoad; return true; Index: llvm/test/Transforms/VectorCombine/X86/load.ll =================================================================== --- llvm/test/Transforms/VectorCombine/X86/load.ll +++ llvm/test/Transforms/VectorCombine/X86/load.ll @@ -346,12 +346,11 @@ ret <4 x float> %r } -; TODO: Should load v4i32. - define <8 x i32> @load_i32_insert_v8i32(i32* align 16 dereferenceable(16) %p) { ; CHECK-LABEL: @load_i32_insert_v8i32( -; CHECK-NEXT: [[S:%.*]] = load i32, i32* [[P:%.*]], align 4 -; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i32> undef, i32 [[S]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; CHECK-NEXT: ret <8 x i32> [[R]] ; %s = load i32, i32* %p, align 4 @@ -359,13 +358,10 @@ ret <8 x i32> %r } -; TODO: Should load v4i32. - define <8 x i32> @casted_load_i32_insert_v8i32(<4 x i32>* align 4 dereferenceable(16) %p) { ; CHECK-LABEL: @casted_load_i32_insert_v8i32( -; CHECK-NEXT: [[B:%.*]] = bitcast <4 x i32>* [[P:%.*]] to i32* -; CHECK-NEXT: [[S:%.*]] = load i32, i32* [[B]], align 4 -; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i32> undef, i32 [[S]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[P:%.*]], align 4 +; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; CHECK-NEXT: ret <8 x i32> [[R]] ; %b = bitcast <4 x i32>* %p to i32* @@ -374,12 +370,11 @@ ret <8 x i32> %r } -; TODO: Should load v4f32. 
- define <8 x float> @load_f32_insert_v8f32(float* align 16 dereferenceable(16) %p) { ; CHECK-LABEL: @load_f32_insert_v8f32( -; CHECK-NEXT: [[S:%.*]] = load float, float* [[P:%.*]], align 4 -; CHECK-NEXT: [[R:%.*]] = insertelement <8 x float> undef, float [[S]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 +; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; CHECK-NEXT: ret <8 x float> [[R]] ; %s = load float, float* %p, align 4 @@ -387,12 +382,11 @@ ret <8 x float> %r } -; TODO: Should load v4f32. - define <2 x float> @load_f32_insert_v2f32(float* align 16 dereferenceable(16) %p) { ; CHECK-LABEL: @load_f32_insert_v2f32( -; CHECK-NEXT: [[S:%.*]] = load float, float* [[P:%.*]], align 4 -; CHECK-NEXT: [[R:%.*]] = insertelement <2 x float> undef, float [[S]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 +; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <2 x i32> <i32 0, i32 1> ; CHECK-NEXT: ret <2 x float> [[R]] ; %s = load float, float* %p, align 4