Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4840,7 +4840,7 @@
         continue;
       if (GEP->getType()->isVectorTy())
         continue;
-      GEPs[GetUnderlyingObject(GEP->getPointerOperand(), *DL)].push_back(GEP);
+      GEPs[GEP->getPointerOperand()].push_back(GEP);
     }
   }
 }
Index: test/Transforms/SLPVectorizer/AMDGPU/address-space-ptr-sze-gep-index-assert.ll
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/AMDGPU/address-space-ptr-sze-gep-index-assert.ll
@@ -0,0 +1,103 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -slp-threshold=-18 < %s | FileCheck %s
+
+; Make sure there's no SCEV assert when the indexes are for different
+; sized address spaces
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
+; CHECK-LABEL: @slp_scev_assert(
+; CHECK: %tmp = addrspacecast i8 addrspace(5)* undef to i8*
+; CHECK-NEXT: %tmp2 = getelementptr inbounds i8, i8 addrspace(5)* undef, i32 %idx
+; CHECK-NEXT: %tmp4 = getelementptr inbounds i8, i8* %tmp, i64 %tmp3
+; CHECK-NEXT: store i8 0, i8 addrspace(5)* %tmp2
+; CHECK-NEXT: store i8 0, i8* %tmp4
+define void @slp_scev_assert(i32 %idx, i64 %tmp3) #0 {
+bb:
+  %tmp = addrspacecast i8 addrspace(5)* undef to i8*
+  %tmp2 = getelementptr inbounds i8, i8 addrspace(5)* undef, i32 %idx
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i64 %tmp3
+  store i8 0, i8 addrspace(5)* %tmp2
+  store i8 0, i8* %tmp4
+  ret void
+}
+
+; CHECK-LABEL: @multi_as_reduction_different_sized(
+; CHECK: %add0 = add i32 %idx0, 2
+; CHECK: %add1 = add i64 %idx1, 1
+define void @multi_as_reduction_different_sized(i32 addrspace(3)* %lds, i32 %idx0, i64 %idx1) #0 {
+bb:
+  %flat = addrspacecast i32 addrspace(3)* %lds to i32*
+  %add0 = add i32 %idx0, 2
+  %add1 = add i64 %idx1, 1
+
+  %lds.1 = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 %add0
+  %flat.1 = getelementptr inbounds i32, i32* %flat, i64 %add1
+
+  %load.lds.0 = load i32, i32 addrspace(3)* %lds, align 4
+  %load.lds.1 = load i32, i32 addrspace(3)* %lds.1, align 4
+
+  %load.flat.0 = load i32, i32* %flat, align 4
+  %load.flat.1 = load i32, i32* %flat.1, align 4
+
+  %sub0 = sub i32 %load.flat.0, %load.lds.0
+  %sub1 = sub i32 %load.flat.1, %load.lds.1
+
+  store i32 %sub0, i32* undef
+  store i32 %sub1, i32* undef
+  ret void
+}
+
+; This should vectorize if using GetUnderlyingObject
+; CHECK-LABEL: @multi_as_reduction_same_size(
+; CHECK: %add0 = add i64 %idx0, 2
+; CHECK: %add1 = add i64 %idx1, 1
+define void @multi_as_reduction_same_size(i32 addrspace(1)* %global, i64 %idx0, i64 %idx1) #0 {
+bb:
+  %flat = addrspacecast i32 addrspace(1)* %global to i32*
+  %add0 = add i64 %idx0, 2
+  %add1 = add i64 %idx1, 1
+
+  %global.1 = getelementptr inbounds i32, i32 addrspace(1)* %global, i64 %add0
+  %flat.1 = getelementptr inbounds i32, i32* %flat, i64 %add1
+
+  %load.global.0 = load i32, i32 addrspace(1)* %global, align 4
+  %load.global.1 = load i32, i32 addrspace(1)* %global.1, align 4
+
+  %load.flat.0 = load i32, i32* %flat, align 4
+  %load.flat.1 = load i32, i32* %flat.1, align 4
+
+  %sub0 = sub i32 %load.flat.0, %load.global.0
+  %sub1 = sub i32 %load.flat.1, %load.global.1
+
+  store i32 %sub0, i32* undef
+  store i32 %sub1, i32* undef
+  ret void
+}
+
+; This should vectorize if using GetUnderlyingObject
+; The add is done in the same width, even though the address space size is smaller
+; CHECK-LABEL: @multi_as_reduction_different_sized_noncanon(
+; CHECK: %add0 = add i64 %idx0, 2
+; CHECK: %add1 = add i64 %idx1, 1
define void @multi_as_reduction_different_sized_noncanon(i32 addrspace(3)* %lds, i64 %idx0, i64 %idx1) #0 {
+bb:
+  %flat = addrspacecast i32 addrspace(3)* %lds to i32*
+  %add0 = add i64 %idx0, 2
+  %add1 = add i64 %idx1, 1
+
+  %lds.1 = getelementptr inbounds i32, i32 addrspace(3)* %lds, i64 %add0
+  %flat.1 = getelementptr inbounds i32, i32* %flat, i64 %add1
+
+  %load.lds.0 = load i32, i32 addrspace(3)* %lds, align 4
+  %load.lds.1 = load i32, i32 addrspace(3)* %lds.1, align 4
+
+  %load.flat.0 = load i32, i32* %flat, align 4
+  %load.flat.1 = load i32, i32* %flat.1, align 4
+
+  %sub0 = sub i32 %load.flat.0, %load.lds.0
+  %sub1 = sub i32 %load.flat.1, %load.lds.1
+
+  store i32 %sub0, i32* undef
+  store i32 %sub1, i32* undef
+  ret void
+}