diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -152,12 +152,7 @@
   Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
   assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
 
-  // If original AS != Load's AS, we can't bitcast the original pointer and have
-  // to use Load's operand instead. Ideally we would want to strip pointer casts
-  // without changing AS, but there's no API to do that ATM.
   unsigned AS = Load->getPointerAddressSpace();
-  if (AS != SrcPtr->getType()->getPointerAddressSpace())
-    SrcPtr = Load->getPointerOperand();
 
   // We are potentially transforming byte-sized (8-bit) memory accesses, so make
   // sure we have all of our type-based constraints in place for this target.
@@ -245,7 +240,8 @@
   // It is safe and potentially profitable to load a vector directly:
   // inselt undef, load Scalar, 0 --> load VecPtr
   IRBuilder<> Builder(Load);
-  Value *CastedPtr = Builder.CreateBitCast(SrcPtr, MinVecTy->getPointerTo(AS));
+  Value *CastedPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      SrcPtr, MinVecTy->getPointerTo(AS));
   Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
   VecLd = Builder.CreateShuffleVector(VecLd, Mask);
 
diff --git a/llvm/test/Transforms/VectorCombine/AMDGPU/as-transition-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/AMDGPU/as-transition-inseltpoison.ll
--- a/llvm/test/Transforms/VectorCombine/AMDGPU/as-transition-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/AMDGPU/as-transition-inseltpoison.ll
@@ -11,9 +11,7 @@
 ; CHECK-LABEL: @load_from_other_as(
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT_HOGE:%.*]], align 4, addrspace(5)
-; CHECK-NEXT:    [[B:%.*]] = addrspacecast [[STRUCT_HOGE]] addrspace(5)* [[A]] to %struct.hoge*
-; CHECK-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT_HOGE]], %struct.hoge* [[B]], i64 0, i32 0
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[C]] to <1 x float>*
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast [[STRUCT_HOGE]] addrspace(5)* [[A]] to <1 x float>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <1 x float>, <1 x float>* [[TMP0]], align 4
 ; CHECK-NEXT:    [[E:%.*]] = shufflevector <1 x float> [[TMP1]], <1 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    store <4 x float> [[E]], <4 x float>* [[RESULTPTR:%.*]], align 16
diff --git a/llvm/test/Transforms/VectorCombine/AMDGPU/as-transition.ll b/llvm/test/Transforms/VectorCombine/AMDGPU/as-transition.ll
--- a/llvm/test/Transforms/VectorCombine/AMDGPU/as-transition.ll
+++ b/llvm/test/Transforms/VectorCombine/AMDGPU/as-transition.ll
@@ -11,9 +11,7 @@
 ; CHECK-LABEL: @load_from_other_as(
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT_HOGE:%.*]], align 4, addrspace(5)
-; CHECK-NEXT:    [[B:%.*]] = addrspacecast [[STRUCT_HOGE]] addrspace(5)* [[A]] to %struct.hoge*
-; CHECK-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT_HOGE]], %struct.hoge* [[B]], i64 0, i32 0
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[C]] to <1 x float>*
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast [[STRUCT_HOGE]] addrspace(5)* [[A]] to <1 x float>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <1 x float>, <1 x float>* [[TMP0]], align 4
 ; CHECK-NEXT:    [[E:%.*]] = shufflevector <1 x float> [[TMP1]], <1 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    store <4 x float> [[E]], <4 x float>* [[RESULTPTR:%.*]], align 16
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
--- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
@@ -253,6 +253,23 @@
   ret <4 x float> %r
 }
 
+; Should work with addrspace even when peeking past unsafe loads through geps
+
+define <4 x i32> @unsafe_load_i32_insert_v4i32_addrspace(i32* align 16 dereferenceable(16) %v3) {
+; CHECK-LABEL: @unsafe_load_i32_insert_v4i32_addrspace(
+; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast i32* [[V3:%.*]] to <4 x i32> addrspace(42)*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32> addrspace(42)* [[TMP1]], align 16
+; CHECK-NEXT:    [[INSELT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    ret <4 x i32> [[INSELT]]
+;
+  %t0 = getelementptr inbounds i32, i32* %v3, i32 1
+  %t1 = addrspacecast i32* %t0 to i32 addrspace(42)*
+  %t2 = getelementptr inbounds i32, i32 addrspace(42)* %t1, i64 1
+  %val = load i32, i32 addrspace(42)* %t2, align 4
+  %inselt = insertelement <4 x i32> poison, i32 %val, i32 0
+  ret <4 x i32> %inselt
+}
+
 ; If there are enough dereferenceable bytes, we can offset the vector load.
 
 define <8 x i16> @gep01_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceable(18) %p) nofree nosync {
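
Note on the functional change: CreatePointerBitCastOrAddrSpaceCast emits a plain bitcast when the stripped source pointer and the target vector pointer type share an address space, and an addrspacecast when they differ, which is why the "use Load's operand when the address spaces mismatch" workaround can be deleted. The standalone sketch below (not part of the patch; the file name, function name, and address-space numbers are illustrative, and it assumes an LLVM version contemporary with this patch, i.e. typed pointers and Type::getPointerTo still available) reproduces the AMDGPU test situation: an addrspace(5) alloca cast to a <1 x float> pointer in the load's address space.

// pbcoras_demo.cpp -- hypothetical demo, build against LLVM libraries.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("demo", Ctx);
  IRBuilder<> B(Ctx);

  // void @demo() with a single basic block to hold the example IR.
  auto *FTy = FunctionType::get(B.getVoidTy(), /*isVarArg=*/false);
  auto *F = Function::Create(FTy, Function::ExternalLinkage, "demo", M);
  B.SetInsertPoint(BasicBlock::Create(Ctx, "bb", F));

  // Source pointer in addrspace(5), like the AMDGPU alloca in the tests.
  Value *SrcPtr = B.CreateAlloca(B.getFloatTy(), /*AddrSpace=*/5);

  // Target type: <1 x float>* in addrspace(0), the load's address space.
  auto *VecTy = FixedVectorType::get(B.getFloatTy(), 1);
  Type *DstTy = VecTy->getPointerTo(/*AddrSpace=*/0);

  // A plain bitcast between pointers in different address spaces is invalid
  // IR, so CreateBitCast cannot be used here; this helper emits an
  // addrspacecast when the address spaces differ and a bitcast otherwise.
  Value *Casted = B.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, DstTy);
  B.CreateAlignedLoad(VecTy, Casted, Align(4));
  B.CreateRetVoid();

  // Prints an addrspacecast followed by the <1 x float> load, mirroring the
  // CHECK lines in the as-transition tests above.
  M.print(outs(), nullptr);
  return 0;
}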