Index: llvm/trunk/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ llvm/trunk/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -587,6 +587,10 @@
     // Trace into subexpressions for more hoisting opportunities.
     if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative))
       ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended);
+  } else if (isa<TruncInst>(V)) {
+    ConstantOffset =
+        find(U->getOperand(0), SignExtended, ZeroExtended, NonNegative)
+            .trunc(BitWidth);
   } else if (isa<SExtInst>(V)) {
     ConstantOffset = find(U->getOperand(0), /* SignExtended */ true,
                           ZeroExtended, NonNegative).sext(BitWidth);
@@ -651,8 +655,9 @@
   }
 
   if (CastInst *Cast = dyn_cast<CastInst>(U)) {
-    assert((isa<SExtInst>(Cast) || isa<ZExtInst>(Cast)) &&
-           "We only traced into two types of CastInst: sext and zext");
+    assert(
+        (isa<SExtInst>(Cast) || isa<ZExtInst>(Cast) || isa<TruncInst>(Cast)) &&
+        "Only the following instructions can be traced: sext, zext & trunc");
     ExtInsts.push_back(Cast);
     UserChain[ChainIndex] = nullptr;
     return distributeExtsAndCloneChain(ChainIndex - 1);
Index: llvm/trunk/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll
===================================================================
--- llvm/trunk/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll
+++ llvm/trunk/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll
@@ -1,5 +1,8 @@
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX
-; RUN: opt < %s -S -separate-const-offset-from-gep -reassociate-geps-verify-no-dead-code -gvn | FileCheck %s --check-prefix=IR
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_20 \
+; RUN:   | FileCheck %s --check-prefix=PTX
+; RUN: opt < %s -mtriple=nvptx64-nvidia-cuda -S -separate-const-offset-from-gep \
+; RUN:   -reassociate-geps-verify-no-dead-code -gvn \
+; RUN:   | FileCheck %s --check-prefix=IR
 
 ; Verifies the SeparateConstOffsetFromGEP pass.
 ; The following code computes
@@ -12,9 +15,6 @@
 ;
 ; so the backend can emit PTX that uses fewer virtual registers.
 
-target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
-target triple = "nvptx64-unknown-unknown"
-
 @array = internal addrspace(3) constant [32 x [32 x float]] zeroinitializer, align 4
 
 define void @sum_of_array(i32 %x, i32 %y, float* nocapture %output) {
Index: llvm/trunk/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll
===================================================================
--- llvm/trunk/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll
+++ llvm/trunk/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll
@@ -1,14 +1,10 @@
-; RUN: opt < %s -separate-const-offset-from-gep -reassociate-geps-verify-no-dead-code -S | FileCheck %s
+; RUN: opt < %s -mtriple=nvptx64-nvidia-cuda -separate-const-offset-from-gep \
+; RUN:   -reassociate-geps-verify-no-dead-code -S | FileCheck %s
 
 ; Several unit tests for -separate-const-offset-from-gep. The transformation
 ; heavily relies on TargetTransformInfo, so we put these tests under
 ; target-specific folders.
 
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-; target triple is necessary; otherwise TargetTransformInfo rejects any
-; addressing mode.
-target triple = "nvptx64-unknown-unknown"
-
 %struct.S = type { float, double }
 
 @struct_array = global [1024 x %struct.S] zeroinitializer, align 16
@@ -271,9 +267,34 @@
 ; CHECK-NOT: add
   %ptr2 = getelementptr inbounds %struct0, %struct0* %ptr, i64 0, i32 3, i64 %arrayidx, i32 1
 ; CHECK: [[PTR:%[a-zA-Z0-9]+]] = getelementptr %struct0, %struct0* %ptr, i64 0, i32 3, i64 %idx, i32 1
-; CHECK: [[PTR1:%[a-zA-Z0-9]+]] = bitcast %struct2* [[PTR]] to i8*
-; CHECK: getelementptr inbounds i8, i8* [[PTR1]], i64 -64
-; CHECK: bitcast
+; CHECK: getelementptr inbounds %struct2, %struct2* [[PTR]], i64 -3
+  ret %struct2* %ptr2
+; CHECK-NEXT: ret
+}
+
+; Check that we can see through an explicit trunc() instruction.
+define %struct2* @trunk_explicit(%struct0* %ptr, i64 %idx) {
+; CHECK-LABEL: @trunk_explicit(
+entry:
+  %idx0 = trunc i64 1 to i32
+  %ptr2 = getelementptr inbounds %struct0, %struct0* %ptr, i32 %idx0, i32 3, i64 %idx, i32 1
+; CHECK-NOT: trunc
+; CHECK: [[PTR:%[a-zA-Z0-9]+]] = getelementptr %struct0, %struct0* %ptr, i64 0, i32 3, i64 %idx, i32 1
+; CHECK: getelementptr inbounds %struct2, %struct2* %0, i64 151
+  ret %struct2* %ptr2
+; CHECK-NEXT: ret
+}
+
+; Check that we can deal with a trunc inserted by
+; canonicalizeArrayIndicesToPointerSize() if the size of an index is larger
+; than that of the pointer.
+define %struct2* @trunk_long_idx(%struct0* %ptr, i64 %idx) {
+; CHECK-LABEL: @trunk_long_idx(
+entry:
+  %ptr2 = getelementptr inbounds %struct0, %struct0* %ptr, i65 1, i32 3, i64 %idx, i32 1
+; CHECK-NOT: trunc
+; CHECK: [[PTR:%[a-zA-Z0-9]+]] = getelementptr %struct0, %struct0* %ptr, i64 0, i32 3, i64 %idx, i32 1
+; CHECK: getelementptr inbounds %struct2, %struct2* %0, i64 151
   ret %struct2* %ptr2
 ; CHECK-NEXT: ret
 }
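
A minimal before/after sketch of the new behavior, distilled from the
@trunk_explicit test above. Illustrative only: %struct0 and %struct2 are the
types defined earlier in split-gep.ll (not shown in this diff), the value
names are assumed, and the element offset 151 is specific to that layout.

  ; Before the pass: the constant index 1 is hidden behind an explicit trunc,
  ; so it stays glued to the variadic part of the GEP.
  %idx0 = trunc i64 1 to i32
  %ptr2 = getelementptr inbounds %struct0, %struct0* %ptr, i32 %idx0, i32 3, i64 %idx, i32 1

  ; After the pass: find() traces through the trunc (truncating the constant
  ; it finds to BitWidth), so the constant part is split out into a trailing
  ; constant-offset GEP that later passes (e.g. -gvn, as exercised by
  ; split-gep-and-gvn.ll) can commonize.
  %0 = getelementptr %struct0, %struct0* %ptr, i64 0, i32 3, i64 %idx, i32 1
  %ptr2 = getelementptr inbounds %struct2, %struct2* %0, i64 151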