Index: include/llvm/Transforms/Utils/BypassSlowDivision.h =================================================================== --- include/llvm/Transforms/Utils/BypassSlowDivision.h +++ include/llvm/Transforms/Utils/BypassSlowDivision.h @@ -22,6 +22,33 @@ #include "llvm/IR/Function.h" namespace llvm { +struct DivOpInfo { + bool SignedOp; + Value *Dividend; + Value *Divisor; + + DivOpInfo(bool InSignedOp, Value *InDividend, Value *InDivisor) + : SignedOp(InSignedOp), Dividend(InDividend), Divisor(InDivisor) {} +}; + +template <> struct DenseMapInfo { + static bool isEqual(const DivOpInfo &Val1, const DivOpInfo &Val2) { + return Val1.SignedOp == Val2.SignedOp && Val1.Dividend == Val2.Dividend && + Val1.Divisor == Val2.Divisor; + } + + static DivOpInfo getEmptyKey() { return DivOpInfo(false, nullptr, nullptr); } + + static DivOpInfo getTombstoneKey() { + return DivOpInfo(true, nullptr, nullptr); + } + + static unsigned getHashValue(const DivOpInfo &Val) { + return (unsigned)(reinterpret_cast(Val.Dividend) ^ + reinterpret_cast(Val.Divisor)) ^ + (unsigned)Val.SignedOp; + } +}; /// This optimization identifies DIV instructions in a BB that can be /// profitably bypassed and carried out with a shorter, faster divide. Index: lib/Transforms/Scalar/EarlyCSE.cpp =================================================================== --- lib/Transforms/Scalar/EarlyCSE.cpp +++ lib/Transforms/Scalar/EarlyCSE.cpp @@ -31,6 +31,7 @@ #include "llvm/Support/RecyclingAllocator.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BypassSlowDivision.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/MemorySSA.h" #include "llvm/Transforms/Utils/MemorySSAUpdater.h" @@ -311,6 +312,12 @@ CallHTType; CallHTType AvailableCalls; + /// A scoped hash table of division and remainder instructions. We use this to + /// hoist matching div/rem instructions into the same basic block because + /// remainder contains the implicitly shared division. + typedef ScopedHashTable DivRemHTType; + DivRemHTType AvailableDivRems; + /// \brief This is the current generation of the memory value. unsigned CurrentGeneration; @@ -330,9 +337,9 @@ class NodeScope { public: NodeScope(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads, - CallHTType &AvailableCalls) + CallHTType &AvailableCalls, DivRemHTType &AvailableDivRems) : Scope(AvailableValues), LoadScope(AvailableLoads), - CallScope(AvailableCalls) {} + CallScope(AvailableCalls), DivRemScope(AvailableDivRems) {} private: NodeScope(const NodeScope &) = delete; @@ -341,6 +348,7 @@ ScopedHTType::ScopeTy Scope; LoadHTType::ScopeTy LoadScope; CallHTType::ScopeTy CallScope; + DivRemHTType::ScopeTy DivRemScope; }; // Contains all the needed information to create a stack for doing a depth @@ -350,10 +358,12 @@ class StackNode { public: StackNode(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads, - CallHTType &AvailableCalls, unsigned cg, DomTreeNode *n, - DomTreeNode::iterator child, DomTreeNode::iterator end) + CallHTType &AvailableCalls, DivRemHTType &AvailableDivRems, + unsigned cg, DomTreeNode *n, DomTreeNode::iterator child, + DomTreeNode::iterator end) : CurrentGeneration(cg), ChildGeneration(cg), Node(n), ChildIter(child), - EndIter(end), Scopes(AvailableValues, AvailableLoads, AvailableCalls), + EndIter(end), Scopes(AvailableValues, AvailableLoads, AvailableCalls, + AvailableDivRems), Processed(false) {} // Accessors. @@ -497,6 +507,8 @@ bool isSameMemGeneration(unsigned EarlierGeneration, unsigned LaterGeneration, Instruction *EarlierInst, Instruction *LaterInst); + bool hoistDivRem(Instruction &I); + void removeMSSA(Instruction *Inst) { if (!MSSA) return; @@ -572,6 +584,47 @@ return MSSA->dominates(LaterDef, MSSA->getMemoryAccess(EarlierInst)); } +/// Division is a hidden subexpression of remainder: +/// X % Y <--> X - ((X / Y) * Y) +/// If both values are used in this function, we want to calculate them in the +/// same basic block for performance because the expensive division component +/// will be shared (either as a unified divrem instruction on the target or by +/// using the intermediate division result while calcuating remainder). Return +/// true if a division or remainder is hoisted. +bool EarlyCSE::hoistDivRem(Instruction &I) { + bool IsSigned, IsRem; + switch (I.getOpcode()) { + case Instruction::SDiv: IsSigned = true; IsRem = false; break; + case Instruction::UDiv: IsSigned = false; IsRem = false; break; + case Instruction::SRem: IsSigned = true; IsRem = true; break; + case Instruction::URem: IsSigned = false; IsRem = true; break; + default: return false; + } + + // We are looking for pairs of instructions such as sdiv X, Y and srem X, Y. + // Ie, the sign of the operation, the dividend, and the divisor must match. + DivOpInfo DivRemKey(IsSigned, I.getOperand(0), I.getOperand(1)); + if (Instruction *DivRem = AvailableDivRems.lookup(DivRemKey)) { + // If the pair is already in the same block, there's nothing to change. + if (DivRem->getParent() == I.getParent()) + return false; + + // Only hoist a remainder instruction if the implicit multiply is cheap. We + // may be speculatively executing that extra operation (and the subtract + // that we assume is cheap) when the original code did not. + if (IsRem && TTI.getArithmeticInstrCost(Instruction::Mul, I.getType()) > 1) + return false; + + // Otherwise, hoist this instruction up to its sibling. + I.moveBefore(DivRem); + return true; + } + + // This operation isn't in the hash table yet. Save it. + AvailableDivRems.insert(DivRemKey, &I); + return false; +} + bool EarlyCSE::processNode(DomTreeNode *Node) { bool Changed = false; BasicBlock *BB = Node->getBlock(); @@ -713,6 +766,9 @@ // Otherwise, just remember that this value is available. AvailableValues.insert(Inst, Inst); + + // And look for an additional CSE opportunity for division and remainder. + Changed |= hoistDivRem(*Inst); continue; } @@ -916,9 +972,10 @@ bool Changed = false; // Process the root node. - nodesToProcess.push_back(new StackNode( - AvailableValues, AvailableLoads, AvailableCalls, CurrentGeneration, - DT.getRootNode(), DT.getRootNode()->begin(), DT.getRootNode()->end())); + nodesToProcess.push_back( + new StackNode(AvailableValues, AvailableLoads, AvailableCalls, + AvailableDivRems, CurrentGeneration, DT.getRootNode(), + DT.getRootNode()->begin(), DT.getRootNode()->end())); // Save the current generation. unsigned LiveOutGeneration = CurrentGeneration; @@ -943,8 +1000,8 @@ DomTreeNode *child = NodeToProcess->nextChild(); nodesToProcess.push_back( new StackNode(AvailableValues, AvailableLoads, AvailableCalls, - NodeToProcess->childGeneration(), child, child->begin(), - child->end())); + AvailableDivRems, NodeToProcess->childGeneration(), + child, child->begin(), child->end())); } else { // It has been processed, and there are no more children to process, // so delete it and pop it off the stack. Index: lib/Transforms/Utils/BypassSlowDivision.cpp =================================================================== --- lib/Transforms/Utils/BypassSlowDivision.cpp +++ lib/Transforms/Utils/BypassSlowDivision.cpp @@ -28,15 +28,6 @@ #define DEBUG_TYPE "bypass-slow-division" namespace { - struct DivOpInfo { - bool SignedOp; - Value *Dividend; - Value *Divisor; - - DivOpInfo(bool InSignedOp, Value *InDividend, Value *InDivisor) - : SignedOp(InSignedOp), Dividend(InDividend), Divisor(InDivisor) {} - }; - struct QuotRemPair { Value *Quotient; Value *Remainder; @@ -56,29 +47,6 @@ } namespace llvm { - template<> - struct DenseMapInfo { - static bool isEqual(const DivOpInfo &Val1, const DivOpInfo &Val2) { - return Val1.SignedOp == Val2.SignedOp && - Val1.Dividend == Val2.Dividend && - Val1.Divisor == Val2.Divisor; - } - - static DivOpInfo getEmptyKey() { - return DivOpInfo(false, nullptr, nullptr); - } - - static DivOpInfo getTombstoneKey() { - return DivOpInfo(true, nullptr, nullptr); - } - - static unsigned getHashValue(const DivOpInfo &Val) { - return (unsigned)(reinterpret_cast(Val.Dividend) ^ - reinterpret_cast(Val.Divisor)) ^ - (unsigned)Val.SignedOp; - } - }; - typedef DenseMap DivCacheTy; typedef DenseMap BypassWidthsTy; } Index: test/Transforms/EarlyCSE/div-rem-pairs.ll =================================================================== --- test/Transforms/EarlyCSE/div-rem-pairs.ll +++ test/Transforms/EarlyCSE/div-rem-pairs.ll @@ -0,0 +1,238 @@ +; RUN: opt -early-cse -S < %s | FileCheck %s + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-unknown" + +; Hoist the sdiv because it's safe and free. +; PR31028 - https://bugs.llvm.org/show_bug.cgi?id=31028 + +define i32 @hoist_sdiv(i32 %a, i32 %b) { +; CHECK-LABEL: @hoist_sdiv( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DIV:%.*]] = sdiv i32 %a, %b +; CHECK-NEXT: [[REM:%.*]] = srem i32 %a, %b +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[REM]], 42 +; CHECK-NEXT: br i1 [[CMP]], label %if, label %end +; CHECK: if: +; CHECK-NEXT: br label %end +; CHECK: end: +; CHECK-NEXT: [[RET:%.*]] = phi i32 [ [[DIV]], %if ], [ 3, %entry ] +; CHECK-NEXT: ret i32 [[RET]] +; +entry: + %rem = srem i32 %a, %b + %cmp = icmp eq i32 %rem, 42 + br i1 %cmp, label %if, label %end + +if: + %div = sdiv i32 %a, %b + br label %end + +end: + %ret = phi i32 [ %div, %if ], [ 3, %entry ] + ret i32 %ret +} + +; Hoist the udiv because it's safe and free. + +define i64 @hoist_udiv(i64 %a, i64 %b) { +; CHECK-LABEL: @hoist_udiv( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DIV:%.*]] = udiv i64 %a, %b +; CHECK-NEXT: [[REM:%.*]] = urem i64 %a, %b +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[REM]], 42 +; CHECK-NEXT: br i1 [[CMP]], label %if, label %end +; CHECK: if: +; CHECK-NEXT: br label %end +; CHECK: end: +; CHECK-NEXT: [[RET:%.*]] = phi i64 [ [[DIV]], %if ], [ 3, %entry ] +; CHECK-NEXT: ret i64 [[RET]] +; +entry: + %rem = urem i64 %a, %b + %cmp = icmp eq i64 %rem, 42 + br i1 %cmp, label %if, label %end + +if: + %div = udiv i64 %a, %b + br label %end + +end: + %ret = phi i64 [ %div, %if ], [ 3, %entry ] + ret i64 %ret +} + +; Hoist the srem because it's safe and the additional cost over div is not great. + +define i16 @hoist_srem(i16 %a, i16 %b) { +; CHECK-LABEL: @hoist_srem( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[REM:%.*]] = srem i16 %a, %b +; CHECK-NEXT: [[DIV:%.*]] = sdiv i16 %a, %b +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[DIV]], 42 +; CHECK-NEXT: br i1 [[CMP]], label %if, label %end +; CHECK: if: +; CHECK-NEXT: br label %end +; CHECK: end: +; CHECK-NEXT: [[RET:%.*]] = phi i16 [ [[REM]], %if ], [ 3, %entry ] +; CHECK-NEXT: ret i16 [[RET]] +; +entry: + %div = sdiv i16 %a, %b + %cmp = icmp eq i16 %div, 42 + br i1 %cmp, label %if, label %end + +if: + %rem = srem i16 %a, %b + br label %end + +end: + %ret = phi i16 [ %rem, %if ], [ 3, %entry ] + ret i16 %ret +} + +; Hoist the urem because it's safe and the additional cost over div is not great. + +define i8 @hoist_urem(i8 %a, i8 %b) { +; CHECK-LABEL: @hoist_urem( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[REM:%.*]] = urem i8 %a, %b +; CHECK-NEXT: [[DIV:%.*]] = udiv i8 %a, %b +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[DIV]], 42 +; CHECK-NEXT: br i1 [[CMP]], label %if, label %end +; CHECK: if: +; CHECK-NEXT: br label %end +; CHECK: end: +; CHECK-NEXT: [[RET:%.*]] = phi i8 [ [[REM]], %if ], [ 3, %entry ] +; CHECK-NEXT: ret i8 [[RET]] +; +entry: + %div = udiv i8 %a, %b + %cmp = icmp eq i8 %div, 42 + br i1 %cmp, label %if, label %end + +if: + %rem = urem i8 %a, %b + br label %end + +end: + %ret = phi i8 [ %rem, %if ], [ 3, %entry ] + ret i8 %ret +} + +; If the ops don't match, don't do anything: signedness. + +define i32 @dont_hoist_udiv(i32 %a, i32 %b) { +; CHECK-LABEL: @dont_hoist_udiv( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[REM:%.*]] = srem i32 %a, %b +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[REM]], 42 +; CHECK-NEXT: br i1 [[CMP]], label %if, label %end +; CHECK: if: +; CHECK-NEXT: [[DIV:%.*]] = udiv i32 %a, %b +; CHECK-NEXT: br label %end +; CHECK: end: +; CHECK-NEXT: [[RET:%.*]] = phi i32 [ [[DIV]], %if ], [ 3, %entry ] +; CHECK-NEXT: ret i32 [[RET]] +; +entry: + %rem = srem i32 %a, %b + %cmp = icmp eq i32 %rem, 42 + br i1 %cmp, label %if, label %end + +if: + %div = udiv i32 %a, %b + br label %end + +end: + %ret = phi i32 [ %div, %if ], [ 3, %entry ] + ret i32 %ret +} + +; If the ops don't match, don't do anything: operation. + +define i32 @dont_hoist_srem(i32 %a, i32 %b) { +; CHECK-LABEL: @dont_hoist_srem( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[REM:%.*]] = urem i32 %a, %b +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[REM]], 42 +; CHECK-NEXT: br i1 [[CMP]], label %if, label %end +; CHECK: if: +; CHECK-NEXT: [[REM2:%.*]] = srem i32 %a, %b +; CHECK-NEXT: br label %end +; CHECK: end: +; CHECK-NEXT: [[RET:%.*]] = phi i32 [ [[REM2]], %if ], [ 3, %entry ] +; CHECK-NEXT: ret i32 [[RET]] +; +entry: + %rem = urem i32 %a, %b + %cmp = icmp eq i32 %rem, 42 + br i1 %cmp, label %if, label %end + +if: + %rem2 = srem i32 %a, %b + br label %end + +end: + %ret = phi i32 [ %rem2, %if ], [ 3, %entry ] + ret i32 %ret +} + +; If the ops don't match, don't do anything: operands. + +define i32 @dont_hoist_sdiv(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: @dont_hoist_sdiv( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[REM:%.*]] = srem i32 %a, %b +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[REM]], 42 +; CHECK-NEXT: br i1 [[CMP]], label %if, label %end +; CHECK: if: +; CHECK-NEXT: [[DIV:%.*]] = sdiv i32 %a, %c +; CHECK-NEXT: br label %end +; CHECK: end: +; CHECK-NEXT: [[RET:%.*]] = phi i32 [ [[DIV]], %if ], [ 3, %entry ] +; CHECK-NEXT: ret i32 [[RET]] +; +entry: + %rem = srem i32 %a, %b + %cmp = icmp eq i32 %rem, 42 + br i1 %cmp, label %if, label %end + +if: + %div = sdiv i32 %a, %c + br label %end + +end: + %ret = phi i32 [ %div, %if ], [ 3, %entry ] + ret i32 %ret +} + +; If the rem is much more expensive than div, don't do anything. + +define i128 @dont_hoist_urem(i128 %a, i128 %b) { +; CHECK-LABEL: @dont_hoist_urem( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DIV:%.*]] = udiv i128 %a, %b +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i128 [[DIV]], 42 +; CHECK-NEXT: br i1 [[CMP]], label %if, label %end +; CHECK: if: +; CHECK-NEXT: [[REM:%.*]] = urem i128 %a, %b +; CHECK-NEXT: br label %end +; CHECK: end: +; CHECK-NEXT: [[RET:%.*]] = phi i128 [ [[REM]], %if ], [ 3, %entry ] +; CHECK-NEXT: ret i128 [[RET]] +; +entry: + %div = udiv i128 %a, %b + %cmp = icmp eq i128 %div, 42 + br i1 %cmp, label %if, label %end + +if: + %rem = urem i128 %a, %b + br label %end + +end: + %ret = phi i128 [ %rem, %if ], [ 3, %entry ] + ret i128 %ret +} +