Index: lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -339,14 +339,13 @@
                                         const APInt &PtrDelta,
                                         unsigned Depth) const {
   unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());
-  unsigned PtrAS = PtrA->getType()->getPointerAddressSpace();
   APInt OffsetA(PtrBitWidth, 0);
   APInt OffsetB(PtrBitWidth, 0);
   PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
   PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);

-  if (PtrA->getType()->getPointerAddressSpace() != PtrAS ||
-      PtrB->getType()->getPointerAddressSpace() != PtrAS)
+  if (DL.getTypeStoreSizeInBits(PtrA->getType()) != PtrBitWidth ||
+      DL.getTypeStoreSizeInBits(PtrB->getType()) != PtrBitWidth)
     return false;

   APInt OffsetDelta = OffsetB - OffsetA;
Index: test/Transforms/LoadStoreVectorizer/AMDGPU/vect-ptr-ptr-size-mismatch.ll
===================================================================
--- test/Transforms/LoadStoreVectorizer/AMDGPU/vect-ptr-ptr-size-mismatch.ll
+++ test/Transforms/LoadStoreVectorizer/AMDGPU/vect-ptr-ptr-size-mismatch.ll
@@ -1,11 +1,11 @@
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S < %s | FileCheck %s
+; RUN: opt -load-store-vectorizer -S < %s | FileCheck %s

-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32"
+target datalayout = "e-p:64:64-p1:64:64-p5:32:32"

-; CHECK-LABEL: @test
+; CHECK-LABEL: @cast_to_ptr
 ; CHECK: store i32* undef, i32** %tmp9, align 8
 ; CHECK: store i32* undef, i32** %tmp7, align 8
-define amdgpu_kernel void @test() {
+define void @cast_to_ptr() {
 entry:
   %a10.ascast.i = addrspacecast i32* addrspace(5)* null to i32**
   %tmp4 = icmp eq i32 undef, 0
@@ -16,3 +16,38 @@
   store i32* undef, i32** %tmp7, align 8
   unreachable
 }
+
+; CHECK-LABEL: @cast_to_cast
+; CHECK: %tmp4 = load i32*, i32** %tmp1, align 8
+; CHECK: %tmp5 = load i32*, i32** %tmp3, align 8
+define void @cast_to_cast() {
+entry:
+  %a10.ascast.i = addrspacecast i32* addrspace(5)* undef to i32**
+  %b14.ascast.i = addrspacecast i32* addrspace(5)* null to i32**
+  %tmp1 = select i1 false, i32** %a10.ascast.i, i32** undef
+  %tmp3 = select i1 false, i32** %b14.ascast.i, i32** undef
+  %tmp4 = load i32*, i32** %tmp1, align 8
+  %tmp5 = load i32*, i32** %tmp3, align 8
+  unreachable
+}
+
+; CHECK-LABEL: @all_to_cast
+; CHECK: load <4 x float>
+define void @all_to_cast(i8* nocapture readonly align 16 dereferenceable(16) %alloc1) {
+entry:
+  %alloc16 = addrspacecast i8* %alloc1 to i8 addrspace(1)*
+  %tmp = bitcast i8 addrspace(1)* %alloc16 to float addrspace(1)*
+  %tmp1 = load float, float addrspace(1)* %tmp, align 16, !invariant.load !0
+  %tmp6 = getelementptr inbounds i8, i8 addrspace(1)* %alloc16, i64 4
+  %tmp7 = bitcast i8 addrspace(1)* %tmp6 to float addrspace(1)*
+  %tmp8 = load float, float addrspace(1)* %tmp7, align 4, !invariant.load !0
+  %tmp15 = getelementptr inbounds i8, i8 addrspace(1)* %alloc16, i64 8
+  %tmp16 = bitcast i8 addrspace(1)* %tmp15 to float addrspace(1)*
+  %tmp17 = load float, float addrspace(1)* %tmp16, align 8, !invariant.load !0
+  %tmp24 = getelementptr inbounds i8, i8 addrspace(1)* %alloc16, i64 12
+  %tmp25 = bitcast i8 addrspace(1)* %tmp24 to float addrspace(1)*
+  %tmp26 = load float, float addrspace(1)* %tmp25, align 4, !invariant.load !0
+  ret void
+}
+
+!0 = !{}