Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6399,7 +6399,8 @@
       for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
         auto *GEPJ = cast<GetElementPtrInst>(GEPList[J]);
         auto *SCEVJ = SE->getSCEV(GEPList[J]);
-        if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
+        if (SCEVJ->getType() == SCEVI->getType() &&
+            isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
           Candidates.remove(GEPList[I]);
           Candidates.remove(GEPList[J]);
         } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
Index: test/Transforms/SLPVectorizer/AMDGPU/address-space-ptr-sze-gep-index-assert.ll
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/AMDGPU/address-space-ptr-sze-gep-index-assert.ll
@@ -0,0 +1,74 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -slp-threshold=-18 < %s | FileCheck %s
+
+; Make sure there's no SCEV assert when the indexes are for different
+; sized address spaces
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
+
+; CHECK-LABEL: @slp_scev_assert(
+; CHECK: %tmp = addrspacecast i8 addrspace(5)* undef to i8*
+; CHECK-NEXT: %tmp2 = getelementptr inbounds i8, i8 addrspace(5)* undef, i32 %idx
+; CHECK-NEXT: %tmp4 = getelementptr inbounds i8, i8* %tmp, i64 %tmp3
+; CHECK-NEXT: store i8 0, i8 addrspace(5)* %tmp2
+; CHECK-NEXT: store i8 0, i8* %tmp4
+define void @slp_scev_assert(i32 %idx, i64 %tmp3) #0 {
+bb:
+  %tmp = addrspacecast i8 addrspace(5)* undef to i8*
+  %tmp2 = getelementptr inbounds i8, i8 addrspace(5)* undef, i32 %idx
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i64 %tmp3
+  store i8 0, i8 addrspace(5)* %tmp2
+  store i8 0, i8* %tmp4
+  ret void
+}
+
+; CHECK-LABEL: @getelementptr_4x32_same_size_as(
+; CHECK: add nsw <2 x i32>
+; CHECK: getelementptr inbounds i32, i32* %g, i32
+; CHECK: getelementptr inbounds i32, i32* %g, i32
+
+; CHECK: add nsw <2 x i32>
+; CHECK: getelementptr inbounds i32, i32 addrspace(1)* %g1, i32
+; CHECK: getelementptr inbounds i32, i32 addrspace(1)* %g1, i32
+define i32 @getelementptr_4x32_same_size_as(i32* nocapture readonly %g, i32 addrspace(1)* nocapture readonly %g1, i32 %n, i32 %x, i32 %y, i32 %z) {
+entry:
+  %cmp31 = icmp sgt i32 %n, 0
+  br i1 %cmp31, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add16, %for.cond.cleanup.loopexit ]
+  ret i32 %sum.0.lcssa
+
+for.body:
+  %indvars.iv = phi i32 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %sum.032 = phi i32 [ 0, %for.body.preheader ], [ %add16, %for.body ]
+  %t4 = shl nsw i32 %indvars.iv, 1
+  %t5 = add nsw i32 %t4, 0
+  %arrayidx = getelementptr inbounds i32, i32* %g, i32 %t5
+  %t6 = load i32, i32* %arrayidx, align 4
+  %add1 = add nsw i32 %t6, %sum.032
+  %t7 = add nsw i32 %t4, %x
+  %arrayidx5 = getelementptr inbounds i32, i32* %g, i32 %t7
+  %t8 = load i32, i32* %arrayidx5, align 4
+  %add6 = add nsw i32 %add1, %t8
+  %t9 = add nsw i32 %t4, %y
+  %arrayidx10 = getelementptr inbounds i32, i32 addrspace(1)* %g1, i32 %t9
+  %t10 = load i32, i32 addrspace(1)* %arrayidx10, align 4
+  %add11 = add nsw i32 %add6, %t10
+  %t11 = add nsw i32 %t4, %z
+  %arrayidx15 = getelementptr inbounds i32, i32 addrspace(1)* %g1, i32 %t11
+  %t12 = load i32, i32 addrspace(1)* %arrayidx15, align 4
+  %add16 = add nsw i32 %add11, %t12
+  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next, %n
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+attributes #0 = { nounwind }
+
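Note on the failure mode: with the data layout in the test, an addrspace(5) pointer is 32 bits wide (p5:32:32) while a flat pointer is 64 bits (p:64:64), so the SCEVs for the two getelementptrs have different effective types and the getMinusSCEV call asserts when asked to subtract them. The following is a minimal sketch of the guarded comparison, not code from the patch; the helper name differByConstant is hypothetical.

// Sketch only: shows why the type check must come before getMinusSCEV.
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Returns true when two GEPs provably differ by a constant offset.
static bool differByConstant(ScalarEvolution &SE, GetElementPtrInst *GEPI,
                             GetElementPtrInst *GEPJ) {
  const SCEV *SCEVI = SE.getSCEV(GEPI);
  const SCEV *SCEVJ = SE.getSCEV(GEPJ);
  // Mismatched types (e.g. a 32-bit addrspace(5) pointer vs. a 64-bit flat
  // pointer) would trip an assert inside getMinusSCEV, so bail out first.
  if (SCEVI->getType() != SCEVJ->getType())
    return false;
  return isa<SCEVConstant>(SE.getMinusSCEV(SCEVI, SCEVJ));
}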