diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -161,15 +161,17 @@ Value *CastedPtr = Builder.CreateBitCast(SrcPtr, MinVecTy->getPointerTo(AS)); Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment); - // If the insert type does not match the target's minimum vector type, - // use an identity shuffle to shrink/grow the vector. - if (Ty != MinVecTy) { - unsigned OutputNumElts = Ty->getNumElements(); - SmallVector Mask(OutputNumElts, UndefMaskElem); - for (unsigned i = 0; i < OutputNumElts && i < MinVecNumElts; ++i) - Mask[i] = i; - VecLd = Builder.CreateShuffleVector(VecLd, Mask); - } + // Set everything but element 0 to undef to prevent poison from propagating + // from the extra loaded memory. This will also optionally shrink/grow the + // vector from the loaded size to the output size. + // We assume this operation has no cost in codegen. + // Note that we could use freeze to avoid poison problems, but then we might + // still need a shuffle to change the vector size. + unsigned OutputNumElts = Ty->getNumElements(); + SmallVector Mask(OutputNumElts, UndefMaskElem); + Mask[0] = 0; + VecLd = Builder.CreateShuffleVector(VecLd, Mask); + replaceValue(I, *VecLd); ++NumVecLoad; return true; diff --git a/llvm/test/Transforms/VectorCombine/X86/load.ll b/llvm/test/Transforms/VectorCombine/X86/load.ll --- a/llvm/test/Transforms/VectorCombine/X86/load.ll +++ b/llvm/test/Transforms/VectorCombine/X86/load.ll @@ -175,7 +175,8 @@ define <4 x float> @load_f32_insert_v4f32(float* align 16 dereferenceable(16) %p) { ; CHECK-LABEL: @load_f32_insert_v4f32( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>* -; CHECK-NEXT: [[R:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 +; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: ret <4 x float> [[R]] ; %s = load float, float* %p, align 4 @@ -185,7 +186,8 @@ define <4 x float> @casted_load_f32_insert_v4f32(<4 x float>* align 4 dereferenceable(16) %p) { ; CHECK-LABEL: @casted_load_f32_insert_v4f32( -; CHECK-NEXT: [[R:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 4 +; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: ret <4 x float> [[R]] ; %b = bitcast <4 x float>* %p to float* @@ -199,7 +201,8 @@ define <4 x i32> @load_i32_insert_v4i32(i32* align 16 dereferenceable(16) %p) { ; CHECK-LABEL: @load_i32_insert_v4i32( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>* -; CHECK-NEXT: [[R:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[R]] ; %s = load i32, i32* %p, align 4 @@ -212,7 +215,8 @@ define <4 x i32> @casted_load_i32_insert_v4i32(<16 x i8>* align 4 dereferenceable(16) %p) { ; CHECK-LABEL: @casted_load_i32_insert_v4i32( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8>* [[P:%.*]] to <4 x i32>* -; CHECK-NEXT: [[R:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[R]] ; %b = bitcast <16 x i8>* %p to i32* @@ -225,7 +229,8 @@ define <4 x float> @gep00_load_f32_insert_v4f32(<4 x float>* align 16 dereferenceable(16) %p) { ; CHECK-LABEL: @gep00_load_f32_insert_v4f32( -; CHECK-NEXT: [[R:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 16 +; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: ret <4 x float> [[R]] ; %gep = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 0, i64 0 @@ -238,7 +243,8 @@ define <4 x float> @gep00_load_f32_insert_v4f32_addrspace(<4 x float> addrspace(44)* align 16 dereferenceable(16) %p) { ; CHECK-LABEL: @gep00_load_f32_insert_v4f32_addrspace( -; CHECK-NEXT: [[R:%.*]] = load <4 x float>, <4 x float> addrspace(44)* [[P:%.*]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float> addrspace(44)* [[P:%.*]], align 16 +; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: ret <4 x float> [[R]] ; %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(44)* %p, i64 0, i64 0 @@ -253,7 +259,8 @@ ; CHECK-LABEL: @gep01_load_i16_insert_v8i16( ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[GEP]] to <8 x i16>* -; CHECK-NEXT: [[R:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 2 +; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> undef, <8 x i32> ; CHECK-NEXT: ret <8 x i16> [[R]] ; %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1 @@ -283,7 +290,8 @@ ; CHECK-LABEL: @gep10_load_i16_insert_v8i16( ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[GEP]] to <8 x i16>* -; CHECK-NEXT: [[R:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 16 +; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> undef, <8 x i32> ; CHECK-NEXT: ret <8 x i16> [[R]] ; %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0 @@ -410,7 +418,7 @@ ; CHECK-LABEL: @load_i32_insert_v8i32( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>* ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <8 x i32> +; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[R]] ; %s = load i32, i32* %p, align 4 @@ -421,7 +429,7 @@ define <8 x i32> @casted_load_i32_insert_v8i32(<4 x i32>* align 4 dereferenceable(16) %p) { ; CHECK-LABEL: @casted_load_i32_insert_v8i32( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[P:%.*]], align 4 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32> +; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[R]] ; %b = bitcast <4 x i32>* %p to i32* @@ -434,7 +442,7 @@ ; CHECK-LABEL: @load_f32_insert_v16f32( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>* ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <16 x i32> +; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <16 x i32> ; CHECK-NEXT: ret <16 x float> [[R]] ; %s = load float, float* %p, align 4 @@ -446,7 +454,7 @@ ; CHECK-LABEL: @load_f32_insert_v2f32( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>* ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <2 x i32> +; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <2 x i32> ; CHECK-NEXT: ret <2 x float> [[R]] ; %s = load float, float* %p, align 4 @@ -500,7 +508,8 @@ define <4 x float> @load_v2f32_extract_insert_v4f32(<2 x float>* align 16 dereferenceable(16) %p) { ; CHECK-LABEL: @load_v2f32_extract_insert_v4f32( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float>* [[P:%.*]] to <4 x float>* -; CHECK-NEXT: [[R:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 +; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: ret <4 x float> [[R]] ; %l = load <2 x float>, <2 x float>* %p, align 4 @@ -512,7 +521,8 @@ define <4 x float> @load_v8f32_extract_insert_v4f32(<8 x float>* align 16 dereferenceable(16) %p) { ; CHECK-LABEL: @load_v8f32_extract_insert_v4f32( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x float>* [[P:%.*]] to <4 x float>* -; CHECK-NEXT: [[R:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 +; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: ret <4 x float> [[R]] ; %l = load <8 x float>, <8 x float>* %p, align 4