diff --git a/llvm/lib/Transforms/Utils/VNCoercion.cpp b/llvm/lib/Transforms/Utils/VNCoercion.cpp --- a/llvm/lib/Transforms/Utils/VNCoercion.cpp +++ b/llvm/lib/Transforms/Utils/VNCoercion.cpp @@ -4,6 +4,7 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/Debug.h" +#include #define DEBUG_TYPE "vncoerce" @@ -312,6 +313,74 @@ if (SrcVal->getType()->isPtrOrPtrVectorTy()) SrcVal = Builder.CreatePtrToInt(SrcVal, DL.getIntPtrType(SrcVal->getType())); + if (LoadTy->isPtrOrPtrVectorTy()) + LoadTy = DL.getIntPtrType(LoadTy); + + // If SrcVal is a vector, use vector ops to get the loaded value. + if (SrcVal->getType()->isVectorTy()) { + if (StoreSize > LoadSize) { + auto *SrcValTyV = cast(SrcVal->getType()); + unsigned LoadSizeInBits = DL.getTypeSizeInBits(LoadTy).getFixedValue(); + unsigned SrcValEltSizeInBits = SrcValTyV->getScalarSizeInBits(); + + // Compute how many elements of SrcVal are necessary. We need to ensure + // that all bitcasts are between vector types to prevent poison + // propagation. + unsigned NumEltsRequiredFromVec = + std::lcm(SrcValEltSizeInBits, LoadSizeInBits) / SrcValEltSizeInBits; + if (isa(LoadTy)) { + auto *LoadTyV = cast(LoadTy); + unsigned LoadEltSizeInBits = LoadTy->getScalarSizeInBits(); + + SmallVector Mask(NumEltsRequiredFromVec, PoisonMaskElem); + std::iota(Mask.begin(), Mask.begin() + SrcValTyV->getNumElements(), 0); + SrcVal = Builder.CreateShuffleVector(SrcVal, Mask); + + // A subvector is loaded so we extract it from SrcVal. + if (SrcVal->getType() != LoadTy) { + // Bitcast to ensure that SrcVal has the same element type as the + // loaded vector. + unsigned NumElems = (NumEltsRequiredFromVec * SrcValEltSizeInBits) / + LoadEltSizeInBits; + + auto *DestTy = VectorType::get(LoadTy->getScalarType(), NumElems, + /* Scalable */ false); + SrcVal = Builder.CreateBitCast(SrcVal, DestTy); + if (SrcVal->getType() != LoadTy) { + // Extract the subvector such that SrcVal and the loaded vector + // have the same size. + SmallVector Mask2(LoadTyV->getNumElements()); + std::iota(Mask2.begin(), Mask2.end(), 0); + SrcVal = Builder.CreateShuffleVector(SrcVal, Mask2); + } + } + } else { + if (SrcVal->getType()->getScalarType() != LoadTy) { + // Requires a bitcast before the extract. + auto *DestTy = VectorType::get(LoadTy, NumEltsRequiredFromVec, + /* Scalable */ false); + if (SrcValTyV->getPrimitiveSizeInBits() != + DestTy->getPrimitiveSizeInBits()) { + // Extract the subvector to ensure a legal bitcast. + SmallVector Mask(NumEltsRequiredFromVec, PoisonMaskElem); + std::iota(Mask.begin(), Mask.begin() + SrcValTyV->getNumElements(), + 0); + SrcVal = Builder.CreateShuffleVector(SrcVal, Mask); + // Compute the new vector type from which we can extract the loaded + // element. + unsigned NumElems = + (NumEltsRequiredFromVec * SrcValEltSizeInBits) / LoadSizeInBits; + DestTy = VectorType::get(LoadTy, NumElems, /* Scalable */ false); + } + SrcVal = Builder.CreateBitCast(SrcVal, DestTy); + } + unsigned Idx = Offset / LoadSize; + SrcVal = Builder.CreateExtractElement(SrcVal, Idx); + } + } + return SrcVal; + } + if (!SrcVal->getType()->isIntegerTy()) SrcVal = Builder.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize * 8)); diff --git a/llvm/test/Transforms/GVN/pr63059.ll b/llvm/test/Transforms/GVN/pr63059.ll --- a/llvm/test/Transforms/GVN/pr63059.ll +++ b/llvm/test/Transforms/GVN/pr63059.ll @@ -8,17 +8,12 @@ ; CHECK-NEXT: [[LOAD_VEC:%.*]] = load <4 x float>, ptr [[LOC]], align 16 ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[LOAD_VEC]], <4 x float> poison, <4 x i32> ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds [4 x float], ptr [[LOC]], i64 0, i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[LOAD_VEC]] to i128 -; CHECK-NEXT: [[TMP2:%.*]] = lshr i128 [[TMP1]], 32 -; CHECK-NEXT: [[TMP3:%.*]] = trunc i128 [[TMP2]] to i32 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32 [[TMP3]] to float -; CHECK-NEXT: [[INS1:%.*]] = insertelement <4 x float> [[SHUF]], float [[TMP4]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[LOAD_VEC]], i64 1 +; CHECK-NEXT: [[INS1:%.*]] = insertelement <4 x float> [[SHUF]], float [[TMP1]], i64 1 ; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds [4 x float], ptr [[LOC]], i64 0, i64 2 -; CHECK-NEXT: [[TMP5:%.*]] = lshr i128 [[TMP1]], 64 -; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i32 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32 [[TMP6]] to float -; CHECK-NEXT: [[INS2:%.*]] = insertelement <4 x float> [[INS1]], float [[TMP7]], i64 2 -; CHECK-NEXT: [[INS3:%.*]] = insertelement <4 x float> [[INS2]], float [[TMP7]], i64 3 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[LOAD_VEC]], i64 2 +; CHECK-NEXT: [[INS2:%.*]] = insertelement <4 x float> [[INS1]], float [[TMP2]], i64 2 +; CHECK-NEXT: [[INS3:%.*]] = insertelement <4 x float> [[INS2]], float [[TMP2]], i64 3 ; CHECK-NEXT: ret <4 x float> [[INS3]] ; %load_vec = load <4 x float>, ptr %loc, align 16 @@ -39,10 +34,9 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: store <4 x i32> [[V]], ptr [[LOC]], align 16 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [4 x i32], ptr [[LOC]], i64 0, i64 2 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V]] to i128 -; CHECK-NEXT: [[TMP1:%.*]] = lshr i128 [[TMP0]], 64 -; CHECK-NEXT: [[TMP2:%.*]] = trunc i128 [[TMP1]] to i64 -; CHECK-NEXT: ret i64 [[TMP2]] +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V]] to <2 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i64 1 +; CHECK-NEXT: ret i64 [[TMP1]] ; entry: store <4 x i32> %v, ptr %loc @@ -76,10 +70,9 @@ ; CHECK-NEXT: store <4 x i32> [[V]], ptr [[LOC]], align 16 ; CHECK-NEXT: call void @f_no_mem(<4 x i32> [[V]]) ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [4 x i32], ptr [[LOC]], i64 0, i64 2 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V]] to i128 -; CHECK-NEXT: [[TMP1:%.*]] = lshr i128 [[TMP0]], 64 -; CHECK-NEXT: [[TMP2:%.*]] = trunc i128 [[TMP1]] to i64 -; CHECK-NEXT: ret i64 [[TMP2]] +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V]] to <2 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i64 1 +; CHECK-NEXT: ret i64 [[TMP1]] ; entry: store <4 x i32> %v, ptr %loc @@ -95,11 +88,10 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: store <4 x i32> [[V]], ptr [[LOC]], align 16 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [4 x i32], ptr [[LOC]], i64 0, i64 2 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V]] to i128 -; CHECK-NEXT: [[TMP1:%.*]] = lshr i128 [[TMP0]], 64 -; CHECK-NEXT: [[TMP2:%.*]] = trunc i128 [[TMP1]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V]] to <2 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i64 1 ; CHECK-NEXT: call void @f(<4 x i32> [[V]]) -; CHECK-NEXT: ret i64 [[TMP2]] +; CHECK-NEXT: ret i64 [[TMP1]] ; entry: store <4 x i32> %v, ptr %loc @@ -114,11 +106,9 @@ ; CHECK-SAME: (ptr [[LOC:%.*]], <4 x float> [[V:%.*]]) { ; CHECK-NEXT: store <4 x float> [[V]], ptr [[LOC]], align 16 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [4 x float], ptr [[LOC]], i64 0, i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[V]] to i128 -; CHECK-NEXT: [[TMP2:%.*]] = lshr i128 [[TMP1]], 64 -; CHECK-NEXT: [[TMP3:%.*]] = trunc i128 [[TMP2]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i64 [[TMP3]] to double -; CHECK-NEXT: ret double [[TMP4]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[V]] to <2 x double> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i64 1 +; CHECK-NEXT: ret double [[TMP2]] ; store <4 x float> %v, ptr %loc %gep = getelementptr inbounds [4 x float], ptr %loc, i64 0, i64 2 @@ -144,9 +134,9 @@ ; CHECK-LABEL: define i9 @load_as_scalar_larger ; CHECK-SAME: (ptr [[LOC:%.*]], <4 x i6> [[V:%.*]]) { ; CHECK-NEXT: store <4 x i6> [[V]], ptr [[LOC]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i6> [[V]] to i24 -; CHECK-NEXT: [[TMP2:%.*]] = trunc i24 [[TMP1]] to i16 -; CHECK-NEXT: [[TMP3:%.*]] = trunc i16 [[TMP2]] to i9 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i6> [[V]], <4 x i6> poison, <3 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <3 x i6> [[TMP1]] to <2 x i9> +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i9> [[TMP2]], i64 0 ; CHECK-NEXT: ret i9 [[TMP3]] ; store <4 x i6> %v, ptr %loc @@ -160,9 +150,9 @@ ; CHECK-LABEL: define i4 @load_as_scalar_smaller ; CHECK-SAME: (ptr [[LOC:%.*]], <4 x i6> [[V:%.*]]) { ; CHECK-NEXT: store <4 x i6> [[V]], ptr [[LOC]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i6> [[V]] to i24 -; CHECK-NEXT: [[TMP2:%.*]] = trunc i24 [[TMP1]] to i8 -; CHECK-NEXT: [[TMP3:%.*]] = trunc i8 [[TMP2]] to i4 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i6> [[V]], <4 x i6> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i6> [[TMP1]] to <3 x i4> +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <3 x i4> [[TMP2]], i64 0 ; CHECK-NEXT: ret i4 [[TMP3]] ; store <4 x i6> %v, ptr %loc @@ -193,9 +183,8 @@ ; CHECK-SAME: (ptr [[LOC:%.*]], <4 x i32> [[V:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: store <4 x i32> [[V]], ptr [[LOC]], align 16 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V]] to i128 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[TMP0]] to <2 x i64> -; CHECK-NEXT: [[R:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V]] to <2 x i64> +; CHECK-NEXT: [[R:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 ; CHECK-NEXT: ret i64 [[R]] ; entry: @@ -211,9 +200,8 @@ ; CHECK-SAME: (ptr [[LOC:%.*]], <4 x i32> [[V:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: store <4 x i32> [[V]], ptr [[LOC]], align 16 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V]] to i128 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[TMP0]] to <2 x double> -; CHECK-NEXT: [[R:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V]] to <2 x double> +; CHECK-NEXT: [[R:%.*]] = extractelement <2 x double> [[TMP0]], i32 1 ; CHECK-NEXT: ret double [[R]] ; entry: @@ -229,10 +217,8 @@ ; CHECK-SAME: (ptr [[LOC:%.*]], <4 x i32> [[V:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: store <4 x i32> [[V]], ptr [[LOC]], align 16 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V]] to i128 -; CHECK-NEXT: [[TMP1:%.*]] = trunc i128 [[TMP0]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> -; CHECK-NEXT: [[R:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[R:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1 ; CHECK-NEXT: ret i32 [[R]] ; entry: @@ -248,10 +234,9 @@ ; CHECK-SAME: (ptr [[LOC:%.*]], <8 x i32> [[V:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: store <8 x i32> [[V]], ptr [[LOC]], align 32 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i32> [[V]] to i256 -; CHECK-NEXT: [[TMP1:%.*]] = trunc i256 [[TMP0]] to i128 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i128 [[TMP1]] to <2 x i64> -; CHECK-NEXT: [[R:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i32> [[V]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <2 x i64> +; CHECK-NEXT: [[R:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 ; CHECK-NEXT: ret i64 [[R]] ; entry: @@ -267,10 +252,9 @@ ; CHECK-SAME: (ptr [[LOC:%.*]], <8 x i32> [[V:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: store <8 x i32> [[V]], ptr [[LOC]], align 32 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i32> [[V]] to i256 -; CHECK-NEXT: [[TMP1:%.*]] = trunc i256 [[TMP0]] to i32 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[TMP1]] to <2 x i16> -; CHECK-NEXT: [[R:%.*]] = extractelement <2 x i16> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i32> [[V]], <8 x i32> poison, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i32> [[TMP0]] to <2 x i16> +; CHECK-NEXT: [[R:%.*]] = extractelement <2 x i16> [[TMP1]], i32 1 ; CHECK-NEXT: ret i16 [[R]] ; entry: @@ -286,11 +270,10 @@ ; CHECK-SAME: (ptr [[LOC:%.*]], <8 x i8> [[V:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: store <8 x i8> [[V]], ptr [[LOC]], align 8 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[V]] to i64 -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i16 -; CHECK-NEXT: [[TMP2:%.*]] = trunc i16 [[TMP1]] to i12 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i12 [[TMP2]] to <3 x i4> -; CHECK-NEXT: [[R:%.*]] = extractelement <3 x i4> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[V]], <8 x i8> poison, <3 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <3 x i8> [[TMP0]] to <6 x i4> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <6 x i4> [[TMP1]], <6 x i4> poison, <3 x i32> +; CHECK-NEXT: [[R:%.*]] = extractelement <3 x i4> [[TMP2]], i32 1 ; CHECK-NEXT: ret i4 [[R]] ; entry: @@ -306,10 +289,9 @@ ; CHECK-SAME: (ptr [[LOC:%.*]], <8 x i8> [[V:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: store <8 x i8> [[V]], ptr [[LOC]], align 8 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[V]] to i64 -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i24 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i24 [[TMP1]] to <2 x i12> -; CHECK-NEXT: [[R:%.*]] = extractelement <2 x i12> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[V]], <8 x i8> poison, <3 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <3 x i8> [[TMP0]] to <2 x i12> +; CHECK-NEXT: [[R:%.*]] = extractelement <2 x i12> [[TMP1]], i32 1 ; CHECK-NEXT: ret i12 [[R]] ; entry: @@ -325,11 +307,10 @@ ; CHECK-SAME: (ptr [[LOC:%.*]], <8 x i8> [[V:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: store <8 x i8> [[V]], ptr [[LOC]], align 8 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[V]] to i64 -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i16 -; CHECK-NEXT: [[TMP2:%.*]] = trunc i16 [[TMP1]] to i12 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i12 [[TMP2]] to <2 x i6> -; CHECK-NEXT: [[R:%.*]] = extractelement <2 x i6> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[V]], <8 x i8> poison, <3 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <3 x i8> [[TMP0]] to <4 x i6> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i6> [[TMP1]], <4 x i6> poison, <2 x i32> +; CHECK-NEXT: [[R:%.*]] = extractelement <2 x i6> [[TMP2]], i32 1 ; CHECK-NEXT: ret i6 [[R]] ; entry: @@ -340,5 +321,19 @@ ret i6 %r } +define i11 @load_greater_than_vector_type(ptr %loc, <4 x i6> %v) { +; CHECK-LABEL: define i11 @load_greater_than_vector_type +; CHECK-SAME: (ptr [[LOC:%.*]], <4 x i6> [[V:%.*]]) { +; CHECK-NEXT: store <4 x i6> [[V]], ptr [[LOC]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i6> [[V]], <4 x i6> poison, <11 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <11 x i6> [[TMP1]] to <6 x i11> +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <6 x i11> [[TMP2]], i64 0 +; CHECK-NEXT: ret i11 [[TMP3]] +; + store <4 x i6> %v, ptr %loc + %ref = load i11, ptr %loc + ret i11 %ref +} + declare void @f(<4 x i32>) declare void @f_no_mem(<4 x i32>) memory(none)