Index: include/llvm/Transforms/Scalar.h
===================================================================
--- include/llvm/Transforms/Scalar.h
+++ include/llvm/Transforms/Scalar.h
@@ -395,7 +395,8 @@
 //
 // SeparateConstOffsetFromGEP - Split GEPs for better CSE
 //
-FunctionPass *createSeparateConstOffsetFromGEPPass();
+FunctionPass *
+createSeparateConstOffsetFromGEPPass(bool TransformComplexGEPsToAdds = false);
 
 //===----------------------------------------------------------------------===//
 //
Index: lib/Target/AArch64/AArch64Subtarget.h
===================================================================
--- lib/Target/AArch64/AArch64Subtarget.h
+++ lib/Target/AArch64/AArch64Subtarget.h
@@ -115,7 +115,7 @@
   bool isCortexA57() const { return CPUString == "cortex-a57"; }
   bool isCortexA53() const { return CPUString == "cortex-a53"; }
 
-  bool useAA() const override { return isCortexA53() || isCortexA57(); }
+  bool useAA() const override { return isCortexA53(); }
 
   /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
   /// that still makes it profitable to inline the call.
Index: lib/Target/AArch64/AArch64TargetMachine.cpp
===================================================================
--- lib/Target/AArch64/AArch64TargetMachine.cpp
+++ lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -85,6 +85,11 @@
                 cl::desc("Work around Cortex-A53 erratum 835769"),
                 cl::init(false));
 
+static cl::opt<bool>
+EnableGEPOpt("aarch64-gep-opt", cl::Hidden,
+             cl::desc("Enable optimizations on GEPs for better address CSE"),
+             cl::init(true));
+
 extern "C" void LLVMInitializeAArch64Target() {
   // Register the target.
   RegisterTargetMachine<AArch64leTargetMachine> X(TheAArch64leTarget);
@@ -204,6 +209,21 @@
     addPass(createCFGSimplificationPass());
 
   TargetPassConfig::addIRPasses();
+
+  // If alias analysis is used in CodeGen, transforming GEPs into an arithmetic
+  // form may cause regressions, as BasicAA currently does not analyze
+  // inttoptr; only run these passes when the subtarget does not use AA.
+  if (TM->getOptLevel() == CodeGenOpt::Aggressive && EnableGEPOpt &&
+      !TM->getSubtarget<AArch64Subtarget>().useAA()) {
+    // Call the SeparateConstOffsetFromGEP pass to separate constant offsets
+    // from GEPs and transform complex GEPs into a ptrtoint + arithmetic +
+    // inttoptr form.
+    addPass(createSeparateConstOffsetFromGEPPass(true));
+    // Call the EarlyCSE pass to find and remove subexpressions among the GEPs
+    // transformed by SeparateConstOffsetFromGEP.
+    addPass(createEarlyCSEPass());
+    // Do loop invariant code motion in case any of the address calculations
+    // are actually invariant.
+    addPass(createLICMPass());
+  }
 }
 
 // Pass Pipeline Configuration
Index: lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
===================================================================
--- lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -81,6 +81,65 @@
 //
 //===----------------------------------------------------------------------===//
 
+//===----------------------------------------------------------------------===//
+//
+// Even though we can extract constant offsets from GEPs, we are sometimes
+// still unable to fully CSE complex GEPs. There are two reasons:
+// (1) The above algorithm cannot extract the constants hidden in struct type
+// indices. The indices into struct types in a GEP are always constant and can
+// always be extracted by transforming the GEP into an arithmetic form.
+// (2) GEPs with variable indices can often not be matched by the addressing
+// mode matcher. Such GEPs are kept as-is for CodeGen, which can only see GEPs
+// in the same basic block.
+// If two GEPs are in different basic blocks, or a GEP is used across basic
+// block boundaries, CodeGen will not be able to tell whether there are common
+// subexpressions between them.
+//
+// For the above reasons, if TransformComplexGEPsToAdds is true, this pass will
+// transform complex GEPs into a "ptrtoint + arithmetic + inttoptr" form for
+// better CSE.
+// E.g. the following two GEPs are used in two basic blocks:
+//   BB1:
+//     %p = getelementptr inbounds [240 x %struct]* %in, i64 0, i64 %idx, i32 3
+//     load %p
+//     ...
+//   BB2:
+//     %p2 = getelementptr inbounds [240 x %struct]* %in, i64 0, i64 %idx, i32 2
+//     load %p2
+//     ...
+//
+// This pass will transform them into:
+//   BB1:
+//     %1 = ptrtoint [240 x %struct]* %in to i64
+//     %2 = mul i64 %idx, length_Of_struct
+//     %3 = add i64 %1, %2
+//     %4 = add i64 %3, 12
+//     %p = inttoptr i64 %4 to i32*
+//     load %p
+//     ...
+//   BB2:
+//     %5 = ptrtoint [240 x %struct]* %in to i64
+//     %6 = mul i64 %idx, length_Of_struct
+//     %7 = add i64 %5, %6
+//     %8 = add i64 %7, 8
+//     %p2 = inttoptr i64 %8 to i32*
+//     load %p2
+//     ...
+//
+// In this way, the constant in the last struct type index is extracted, and it
+// becomes easy to remove the following common subexpressions in BB2:
+//     %5 = ptrtoint [240 x %struct]* %in to i64
+//     %6 = mul i64 %idx, length_Of_struct
+//     %7 = add i64 %5, %6
+//
+// This can also improve the address sinking logic in CodeGenPrepare.
+// CodeGenPrepare tries to sink address calculations that match the target's
+// addressing modes. Complex GEPs may not match and will not be sunk in that
+// case. Even though the whole transformed ptrtoint + arithmetic + inttoptr
+// chain won't be matched by the addressing modes, part of the arithmetic can
+// be matched and should be sunk, so we end up with better address
+// calculations.
+//
+//===----------------------------------------------------------------------===//
+
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Constants.h"
@@ -92,6 +151,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/IR/IRBuilder.h"
 
 using namespace llvm;
 
@@ -241,7 +301,9 @@
 class SeparateConstOffsetFromGEP : public FunctionPass {
 public:
   static char ID;
-  SeparateConstOffsetFromGEP() : FunctionPass(ID) {
+  SeparateConstOffsetFromGEP(bool TransformComplexGEPsToAdds = false)
+      : FunctionPass(ID),
+        TransformComplexGEPsToAdds(TransformComplexGEPsToAdds) {
     initializeSeparateConstOffsetFromGEPPass(*PassRegistry::getPassRegistry());
   }
 
@@ -264,11 +326,22 @@
   /// Tries to split the given GEP into a variadic base and a constant offset,
   /// and returns true if the splitting succeeds.
   bool splitGEP(GetElementPtrInst *GEP);
-  /// Finds the constant offset within each index, and accumulates them. This
-  /// function only inspects the GEP without changing it. The output
-  /// NeedsExtraction indicates whether we can extract a non-zero constant
-  /// offset from any index.
-  int64_t accumulateByteOffset(GetElementPtrInst *GEP, bool &NeedsExtraction);
+  /// Tries to separate the constant offsets from the given GEP and transform
+  /// it into a ptrtoint + arithmetic + inttoptr form. Returns true if it
+  /// succeeds.
+  bool transformGEPToAdd(GetElementPtrInst *GEP);
+  /// Finds the constant offset within each index for sequential types, and
+  /// accumulates them. This function only inspects the GEP without changing
+  /// it. The output NeedsExtraction indicates whether we can extract a
+  /// constant offset.
+  int64_t accumulateByteOffsetForSeq(GetElementPtrInst *GEP,
+                                     bool &NeedsExtraction);
+  /// Finds the constant offset within each index for sequential and struct
+  /// types, and accumulates them. This function only inspects the GEP without
+  /// changing it. The output NeedsExtraction indicates whether we can extract
+  /// a constant offset.
+  int64_t accumulateByteOffsetForSeqAndStruct(GetElementPtrInst *GEP,
+                                              bool &NeedsExtraction);
   /// Canonicalize array indices to pointer-size integers. This helps to
   /// simplify the logic of splitting a GEP. For example, if a + b is a
   /// pointer-size integer, we have
@@ -286,6 +359,7 @@
   /// Verified in @i32_add in split-gep.ll
   bool canonicalizeArrayIndicesToPointerSize(GetElementPtrInst *GEP);
 
+  bool TransformComplexGEPsToAdds;
   const DataLayout *DL;
 };
 } // anonymous namespace
@@ -302,8 +376,9 @@
     "Split GEPs to a variadic base and a constant offset for better CSE",
     false, false)
 
-FunctionPass *llvm::createSeparateConstOffsetFromGEPPass() {
-  return new SeparateConstOffsetFromGEP();
+FunctionPass *
+llvm::createSeparateConstOffsetFromGEPPass(bool TransformComplexGEPsToAdds) {
+  return new SeparateConstOffsetFromGEP(TransformComplexGEPsToAdds);
 }
 
 bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
@@ -650,8 +725,8 @@
 }
 
 int64_t
-SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP,
-                                                 bool &NeedsExtraction) {
+SeparateConstOffsetFromGEP::accumulateByteOffsetForSeq(GetElementPtrInst *GEP,
+                                                       bool &NeedsExtraction) {
   NeedsExtraction = false;
   int64_t AccumulativeByteOffset = 0;
   gep_type_iterator GTI = gep_type_begin(*GEP);
@@ -690,7 +765,8 @@
   bool Changed = canonicalizeArrayIndicesToPointerSize(GEP);
 
   bool NeedsExtraction;
-  int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction);
+  int64_t AccumulativeByteOffset =
+      accumulateByteOffsetForSeq(GEP, NeedsExtraction);
 
   if (!NeedsExtraction)
     return Changed;
@@ -814,11 +890,121 @@
   return true;
 }
 
+int64_t SeparateConstOffsetFromGEP::accumulateByteOffsetForSeqAndStruct(
+    GetElementPtrInst *GEP, bool &NeedsExtraction) {
+  NeedsExtraction = false;
+  int64_t AccumulativeByteOffset = 0;
+  gep_type_iterator GTI = gep_type_begin(GEP);
+  for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
+    if (isa<SequentialType>(*GTI)) {
+      // Try to extract a constant offset from this GEP index.
+      bool FoundConst = false;
+      int64_t ConstantOffset = ConstantOffsetExtractor::Find(
+          GEP->getOperand(I), DL, GEP, FoundConst);
+      if (FoundConst) {
+        NeedsExtraction = true;
+        // A GEP may have multiple indices. We accumulate the extracted
+        // constant offsets into a byte offset, and later offset the remainder
+        // of the original GEP with this byte offset.
+        AccumulativeByteOffset +=
+            ConstantOffset * DL->getTypeAllocSize(GTI.getIndexedType());
+      }
+    } else {
+      StructType *StTy = cast<StructType>(*GTI);
+      unsigned Field = cast<ConstantInt>(GEP->getOperand(I))->getZExtValue();
+      // Skip field 0, as its offset is always 0.
+      if (Field != 0) {
+        NeedsExtraction = true;
+        AccumulativeByteOffset +=
+            DL->getStructLayout(StTy)->getElementOffset(Field);
+      }
+    }
+  }
+  return AccumulativeByteOffset;
+}
+
+bool SeparateConstOffsetFromGEP::transformGEPToAdd(GetElementPtrInst *GEP) {
+  // Skip vector GEPs.
+  if (GEP->getType()->isVectorTy())
+    return false;
+
+  // The backend can already nicely handle the case where all indices are
+  // constant.
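+  // For example, a load of "getelementptr inbounds %struct* %p, i64 0, i32 3"
+  // can usually be folded into a single immediate offset of the addressing
+  // mode, so there is nothing to gain from rewriting it.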
+  if (GEP->hasAllConstantIndices())
+    return false;
+
+  bool NeedsExtraction;
+  int64_t AccumulativeByteOffset =
+      accumulateByteOffsetForSeqAndStruct(GEP, NeedsExtraction);
+
+  if (!NeedsExtraction)
+    return false;
+
+  canonicalizeArrayIndicesToPointerSize(GEP);
+  // Remove the constant offset in each GEP index. The resultant GEP computes
+  // the variadic base.
+  Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
+  gep_type_iterator GTI = gep_type_begin(*GEP);
+  for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
+    // Skip indices for struct types, as they are always constant and have
+    // already been accumulated.
+    if (isa<SequentialType>(*GTI)) {
+      // Try to extract the constant offset and build a new index.
+      Value *NewIdx =
+          ConstantOffsetExtractor::Extract(GEP->getOperand(I), DL, GEP);
+      if (NewIdx != nullptr)
+        GEP->setOperand(I, NewIdx);
+    }
+  }
+
+  IRBuilder<> Builder(GEP);
+  Value *Result = Builder.CreatePtrToInt(GEP->getOperand(0), IntPtrTy);
+  GTI = gep_type_begin(*GEP);
+  for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
+    // Now create ADD/SHL/MUL arithmetic operations for the variable indices.
+    if (!isa<ConstantInt>(GEP->getOperand(I))) {
+      Value *Idx = GEP->getOperand(I);
+      APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(),
+                                DL->getTypeAllocSize(GTI.getIndexedType()));
+      if (ElementSize != 1) {
+        if (ElementSize.isPowerOf2())
+          Idx = Builder.CreateShl(
+              Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2()));
+        else
+          Idx = Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize));
+      }
+      Result = Builder.CreateAdd(Result, Idx);
+    }
+  }
+
+  // Create an ADD for the extracted constant offset.
+  if (AccumulativeByteOffset != 0)
+    Result = Builder.CreateAdd(
+        Result, ConstantInt::get(IntPtrTy, AccumulativeByteOffset));
+
+  Result = Builder.CreateIntToPtr(Result, GEP->getType());
+
+  GEP->replaceAllUsesWith(Result);
+  GEP->eraseFromParent();
+  return true;
+}
+
 bool SeparateConstOffsetFromGEP::runOnFunction(Function &F) {
   if (DisableSeparateConstOffsetFromGEP)
     return false;
 
   bool Changed = false;
+  // Transforming GEPs into an arithmetic form can extract more constants and
+  // enables better CSE of address calculations.
+  if (TransformComplexGEPsToAdds) {
+    for (auto &B : F) {
+      for (BasicBlock::iterator I = B.begin(), IE = B.end(); I != IE;)
+        if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I++))
+          Changed |= transformGEPToAdd(GEP);
+    }
+    return Changed;
+  }
+
   for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) {
     for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE; ) {
       if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I++)) {
Index: test/CodeGen/AArch64/aarch64-gep-opt.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/aarch64-gep-opt.ll
@@ -0,0 +1,121 @@
+; RUN: llc -O3 -verify-machineinstrs %s -o - | FileCheck %s
+; RUN: llc -O3 -print-after=codegenprepare < %s >%t 2>&1 && FileCheck --check-prefix=CHECK-IR <%t %s
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnueabi"
+
+%struct = type { i32, i32, i32, i32, [20 x i32] }
+
+; Check that when two complex GEPs are used in two basic blocks, LLVM can
+; eliminate the common subexpression for the second use.
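+; (%struct is 96 bytes: four i32 fields followed by [20 x i32]. With the first
+; index i64 1, the constant parts of the two addresses work out to
+; 240 * 96 + 3 * 4 = 23052 and 240 * 96 + 2 * 4 = 23048, which is what the IR
+; checks below expect.)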
+define void @test_GEP_CSE([240 x %struct]* %string, i32* %adj, i32 %lib, i64 %idxprom) {
+  %liberties = getelementptr inbounds [240 x %struct]* %string, i64 1, i64 %idxprom, i32 3
+  %1 = load i32* %liberties, align 4
+  %cmp = icmp eq i32 %1, %lib
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  %origin = getelementptr inbounds [240 x %struct]* %string, i64 1, i64 %idxprom, i32 2
+  %2 = load i32* %origin, align 4
+  store i32 %2, i32* %adj, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+; CHECK-LABEL: test_GEP_CSE:
+; CHECK: madd
+; CHECK: ldr
+; CHECK-NOT: madd
+; CHECK: ldr
+
+; CHECK-IR-LABEL: @test_GEP_CSE(
+; CHECK-IR: [[PTR0:%[a-zA-Z0-9]+]] = ptrtoint [240 x %struct]* %string to i64
+; CHECK-IR: [[PTR1:%[a-zA-Z0-9]+]] = mul i64 %idxprom, 96
+; CHECK-IR: [[PTR2:%[a-zA-Z0-9]+]] = add i64 [[PTR0]], [[PTR1]]
+; CHECK-IR: [[PTR3:%[a-zA-Z0-9]+]] = add i64 [[PTR2]], 23052
+; CHECK-IR: inttoptr
+; CHECK-IR: if.then:
+; CHECK-IR-NOT: ptrtoint
+; CHECK-IR-NOT: mul
+; CHECK-IR: [[PTR3:%[a-zA-Z0-9]+]] = add i64 [[PTR2]], 23048
+; CHECK-IR: inttoptr
+
+%class.my = type { i32, [128 x i32], i32, [256 x %struct.pt]}
+%struct.pt = type { %struct.point*, i32, i32 }
+%struct.point = type { i32, i32 }
+
+; Check that when a GEP is used across two basic blocks, LLVM can sink the
+; address calculation and CodeGen can generate a better addressing mode for the
+; second use.
+define void @test_GEP_across_BB(%class.my* %this, i64 %idx) {
+  %1 = getelementptr inbounds %class.my* %this, i64 0, i32 3, i64 %idx, i32 1
+  %2 = load i32* %1, align 4
+  %3 = getelementptr inbounds %class.my* %this, i64 0, i32 3, i64 %idx, i32 2
+  %4 = load i32* %3, align 4
+  %5 = icmp eq i32 %2, %4
+  br i1 %5, label %if.true, label %exit
+
+if.true:
+  %6 = shl i32 %4, 1
+  store i32 %6, i32* %3, align 4
+  br label %exit
+
+exit:
+  %7 = add nsw i32 %4, 1
+  store i32 %7, i32* %1, align 4
+  ret void
+}
+; CHECK-LABEL: test_GEP_across_BB:
+; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}, #528]
+; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}, #532]
+; CHECK-NOT: add
+; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, #532]
+; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, #528]
+
+; CHECK-IR-LABEL: test_GEP_across_BB(
+; CHECK-IR: add i64 [[TMP:%[a-zA-Z0-9]+]], 528
+; CHECK-IR: add i64 [[TMP]], 532
+; CHECK-IR: if.true:
+; CHECK-IR: [[PTR0:%sunk[a-zA-Z0-9]+]] = add i64 [[TMP]], 532
+; CHECK-IR: [[PTR1:%sunk[a-zA-Z0-9]+]] = inttoptr i64 [[PTR0]] to i32*
+; CHECK-IR: store i32 {{%[a-zA-Z0-9]+}}, i32* [[PTR1]]
+; CHECK-IR: exit:
+; CHECK-IR: [[PTR2:%sunk[a-zA-Z0-9]+]] = add i64 [[TMP]], 528
+; CHECK-IR: [[PTR3:%sunk[a-zA-Z0-9]+]] = inttoptr i64 [[PTR2]] to i32*
+; CHECK-IR: store i32 {{%[a-zA-Z0-9]+}}, i32* [[PTR3]]
+
+%struct.S = type { float, double }
+@struct_array = global [1024 x %struct.S] zeroinitializer, align 16
+
+; The following two test cases check that we can extract constants from struct
+; type indices.
+; The constant offsets come from the index "i64 %idxprom" (which hides the
+; constant 5) and the field index "i32 1". As the alloc size of %struct.S is 16
+; and "i32 1" is the 2nd field, whose offset is 8, the total constant offset is
+; 5 * 16 + 8 = 88.
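+; (%struct.S is { float, double }; because the double is 8-byte aligned, the
+; alloc size of the struct is 4 + 4 (padding) + 8 = 16 bytes.)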
+define double* @test-struct_1(i32 %i) {
+entry:
+  %add = add nsw i32 %i, 5
+  %idxprom = sext i32 %add to i64
+  %p = getelementptr inbounds [1024 x %struct.S]* @struct_array, i64 0, i64 %idxprom, i32 1
+  ret double* %p
+}
+; CHECK-IR-LABEL: @test-struct_1(
+; CHECK-IR-NOT: getelementptr
+; CHECK-IR: add i64 %{{[a-zA-Z0-9]+}}, 88
+
+%struct3 = type { i64, i32 }
+%struct2 = type { %struct3, i32 }
+%struct1 = type { i64, %struct2 }
+%struct0 = type { i32, i32, i64*, [100 x %struct1] }
+
+; The constant offsets come from the indices "i32 3", "i64 %arrayidx" (which
+; hides the constant -2), and "i32 1". "i32 3" is the 4th field, whose offset
+; is 16. The alloc size of %struct1 is 32. "i32 1" is the 2nd field, whose
+; offset is 8. So the total constant offset is 16 + (-2 * 32) + 8 = -40.
+define %struct2* @test-struct_2(%struct0* %ptr, i64 %idx) {
+entry:
+  %arrayidx = add nsw i64 %idx, -2
+  %ptr2 = getelementptr inbounds %struct0* %ptr, i64 0, i32 3, i64 %arrayidx, i32 1
+  ret %struct2* %ptr2
+}
+; CHECK-IR-LABEL: @test-struct_2(
+; CHECK-IR-NOT: = getelementptr
+; CHECK-IR: add i64 %{{[a-zA-Z0-9]+}}, -40
Index: test/CodeGen/AArch64/arm64-addr-mode-folding.ll
===================================================================
--- test/CodeGen/AArch64/arm64-addr-mode-folding.ll
+++ test/CodeGen/AArch64/arm64-addr-mode-folding.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O3 -mtriple arm64-apple-ios3 %s -o - | FileCheck %s
+; RUN: llc -O3 -mtriple arm64-apple-ios3 -aarch64-gep-opt=false %s -o - | FileCheck %s
 ;
 @block = common global i8* null, align 8
 
Index: test/CodeGen/AArch64/arm64-cse.ll
===================================================================
--- test/CodeGen/AArch64/arm64-cse.ll
+++ test/CodeGen/AArch64/arm64-cse.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O3 < %s -aarch64-atomic-cfg-tidy=0 | FileCheck %s
+; RUN: llc -O3 < %s -aarch64-atomic-cfg-tidy=0 -aarch64-gep-opt=false | FileCheck %s
 target triple = "arm64-apple-ios"
 ; rdar://12462006