Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -262,7 +262,11 @@
   bool isMaskAndBranchFoldingLegal() const {
     return MaskAndBranchFoldingIsLegal;
   }
-
+
+  /// Return true if the target can perform a store plus a vector extract
+  /// in one instruction.
+  bool canCombineStoreAndExtract() const { return CanCombineStoreAndExtract; }
+
   /// Return true if target supports floating point exceptions.
   bool hasFloatingPointExceptions() const {
     return HasFloatingPointExceptions;
@@ -1936,6 +1940,10 @@
   /// a mask of a single bit, a compare, and a branch into a single instruction.
   bool MaskAndBranchFoldingIsLegal;
 
+  /// Indicates whether the target supports a store plus a vector extract in
+  /// one instruction.
+  bool CanCombineStoreAndExtract;
+
 protected:
   /// Return true if the value types that can be represented by the specified
   /// register class are all legal.
Index: lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- lib/CodeGen/CodeGenPrepare.cpp
+++ lib/CodeGen/CodeGenPrepare.cpp
@@ -18,6 +18,7 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
@@ -63,6 +64,7 @@
 STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");
 STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");
 STATISTIC(NumAndCmpsMoved, "Number of and/cmp's pushed into branches");
+STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed");
 
 static cl::opt<bool> DisableBranchOpts(
   "disable-cgp-branch-opts", cl::Hidden, cl::init(false),
@@ -80,6 +82,10 @@
   "enable-andcmp-sinking", cl::Hidden, cl::init(true),
   cl::desc("Enable sinking and/cmp into branches."));
 
+static cl::opt<bool> DisableStoreExtract(
+    "disable-cgp-store-extract", cl::Hidden, cl::init(false),
+    cl::desc("Disable store(extract) optimizations in CodeGenPrepare"));
+
 namespace {
 typedef SmallPtrSet<Instruction *, 16> SetOfInstrs;
 typedef DenseMap<Instruction *, Type *> InstrToOrigTy;
@@ -89,6 +95,7 @@
     /// transformation profitability.
     const TargetMachine *TM;
     const TargetLowering *TLI;
+    const TargetTransformInfo *TTI;
     const TargetLibraryInfo *TLInfo;
     DominatorTree *DT;
@@ -118,7 +125,7 @@
   public:
     static char ID; // Pass identification, replacement for typeid
     explicit CodeGenPrepare(const TargetMachine *TM = nullptr)
-        : FunctionPass(ID), TM(TM), TLI(nullptr) {
+        : FunctionPass(ID), TM(TM), TLI(nullptr), TTI(nullptr) {
       initializeCodeGenPreparePass(*PassRegistry::getPassRegistry());
     }
     bool runOnFunction(Function &F) override;
@@ -128,6 +135,7 @@
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addPreserved<DominatorTreeWrapperPass>();
       AU.addRequired<TargetLibraryInfo>();
+      AU.addRequired<TargetTransformInfo>();
     }
 
   private:
@@ -144,6 +152,7 @@
     bool OptimizeExtUses(Instruction *I);
     bool OptimizeSelectInst(SelectInst *SI);
     bool OptimizeShuffleVectorInst(ShuffleVectorInst *SI);
+    bool OptimizeExtractElementInst(Instruction *Inst);
     bool DupRetToEnableTailCallOpts(BasicBlock *BB);
     bool PlaceDbgValues(Function &F);
     bool sinkAndCmp(Function &F);
@@ -171,6 +180,7 @@
   if (TM)
     TLI = TM->getSubtargetImpl()->getTargetLowering();
   TLInfo = &getAnalysis<TargetLibraryInfo>();
+  TTI = &getAnalysis<TargetTransformInfo>();
   DominatorTreeWrapperPass *DTWP =
       getAnalysisIfAvailable<DominatorTreeWrapperPass>();
   DT = DTWP ? &DTWP->getDomTree() : nullptr;
@@ -3168,6 +3178,272 @@
   return MadeChange;
 }
 
+namespace {
+/// \brief Helper class to promote a scalar operation to a vector one.
+/// This class is used to move an extractelement transition downward.
+/// E.g.,
+/// a = vector_op <2 x i32>
+/// b = extractelement <2 x i32> a, i32 0
+/// c = scalar_op b
+/// store c
+///
+/// =>
+/// a = vector_op <2 x i32>
+/// c = vector_op a (equivalent to scalar_op on the related lane)
+/// * d = extractelement <2 x i32> c, i32 0
+/// * store d
+/// Assuming both extractelement and store can be combined, we get rid of the
+/// transition.
+class VectorPromoteHelper {
+  /// Used to perform some checks on the legality of vector operations.
+  const TargetLowering &TLI;
+
+  /// Used to estimate the cost of the promoted chain.
+  const TargetTransformInfo &TTI;
+
+  /// The transition being moved downwards.
+  Instruction *Transition;
+  /// The sequence of instructions to be promoted.
+  SmallVector<Instruction *, 4> InstsToBePromoted;
+
+  /// \brief The instruction that represents the current end of the transition.
+  /// Since we are faking the promotion until we reach the end of the chain
+  /// of computation, we need a way to get the current end of the transition.
+  Instruction *getEndOfTransition() const {
+    if (InstsToBePromoted.empty())
+      return Transition;
+    return InstsToBePromoted.back();
+  }
+
+  /// \brief Return the index of the original value in the transition.
+  /// E.g., for "extractelement <2 x i32> c, i32 1" the original value,
+  /// c, is at index 0.
+  unsigned getTransitionOriginalValueIdx() const {
+    assert(isa<ExtractElementInst>(Transition) &&
+           "Other kind of transitions are not supported yet");
+    return 0;
+  }
+
+  /// \brief Return the index of the index in the transition.
+  /// E.g., for "extractelement <2 x i32> c, i32 0" the index
+  /// is at index 1.
+  unsigned getTransitionIdx() const {
+    assert(isa<ExtractElementInst>(Transition) &&
+           "Other kind of transitions are not supported yet");
+    return 1;
+  }
+
+  /// \brief Get the type of the transition.
+  /// This is the type of the original value.
+  /// E.g., for "extractelement <2 x i32> c, i32 1" the type of the
+  /// transition is <2 x i32>.
+  Type *getTransitionType() const {
+    return Transition->getOperand(getTransitionOriginalValueIdx())->getType();
+  }
+
+  /// \brief Promote \p ToBePromoted by moving \p Def downward through it.
+  /// I.e., we have the following sequence:
+  /// Def = Transition <ty1> a to <ty2>
+  /// b = ToBePromoted <ty2> Def, ...
+  /// =>
+  /// b = ToBePromoted <ty1> a, ...
+  /// Def = Transition <ty1> ToBePromoted to <ty2>
+  void promoteImpl(Instruction *ToBePromoted);
+
+  /// \brief Check whether or not it is profitable to promote all the
+  /// instructions enqueued to be promoted.
+  bool isProfitableToPromote() {
+    Value *ValIdx = Transition->getOperand(getTransitionOriginalValueIdx());
+    unsigned Index = isa<ConstantInt>(ValIdx)
+                         ? cast<ConstantInt>(ValIdx)->getZExtValue()
+                         : -1;
+    Type *PromotedType = getTransitionType();
+
+    // The scalar chain of computation has to pay for the transition
+    // scalar to vector.
+    // The vector chain does not.
+    unsigned ScalarCost =
+        TTI.getVectorInstrCost(Transition->getOpcode(), PromotedType, Index);
+    unsigned VectorCost = 0;
+    for (const auto &Inst : InstsToBePromoted) {
+      // Compute the cost.
+      // By construction, all instructions being promoted are arithmetic ones.
+      // Moreover, one argument is a constant that will become a splat constant.
+      Value *Arg0 = Inst->getOperand(0);
+      bool IsArg0Constant = isa<UndefValue>(Arg0) || isa<ConstantInt>(Arg0) ||
+                            isa<ConstantFP>(Arg0);
+      TargetTransformInfo::OperandValueKind Arg0OVK =
+          IsArg0Constant ? TargetTransformInfo::OK_UniformConstantValue
+                         : TargetTransformInfo::OK_AnyValue;
+      TargetTransformInfo::OperandValueKind Arg1OVK =
+          !IsArg0Constant ? TargetTransformInfo::OK_UniformConstantValue
+                          : TargetTransformInfo::OK_AnyValue;
+      ScalarCost += TTI.getArithmeticInstrCost(
+          Inst->getOpcode(), Inst->getType(), Arg0OVK, Arg1OVK);
+      VectorCost += TTI.getArithmeticInstrCost(Inst->getOpcode(), PromotedType,
+                                               Arg0OVK, Arg1OVK);
+    }
+    DEBUG(dbgs() << "Estimated cost of computation to be promoted:\nScalar: "
+                 << ScalarCost << "\nVector: " << VectorCost << '\n');
+    return ScalarCost > VectorCost;
+  }
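(Note: on ARM with NEON, the combined operation this hook advertises is a
lane store such as "vst1.32 {d16[1]}, [r1:32]", which writes one vector lane
straight to memory without an intermediate vmov.32 to a core register; see
the ASM checks in the new test at the end of this patch.)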
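+
+  // For instance, with purely hypothetical costs where the target reports
+  // 3 for the extractelement transition and 1 for an 'or' in both scalar
+  // and vector form, promoting a single 'or' yields
+  // ScalarCost = 3 + 1 = 4 and VectorCost = 1, so the promotion is
+  // reported as profitable. The vector chain pays nothing for its final
+  // extract because, on targets with CanCombineStoreAndExtract, it is
+  // folded into the store.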
+
+public:
+  VectorPromoteHelper(const TargetLowering &TLI,
+                      const TargetTransformInfo &TTI, Instruction *Transition)
+      : TLI(TLI), TTI(TTI), Transition(Transition) {
+    assert(Transition && "Do not know how to promote null");
+  }
+
+  /// \brief Check if we can promote \p ToBePromoted to the transition type.
+  bool canPromote(const Instruction *ToBePromoted) const {
+    // We could support CastInst too.
+    return isa<BinaryOperator>(ToBePromoted);
+  }
+
+  /// \brief Check if it is profitable to promote \p ToBePromoted
+  /// by moving the transition downward through it.
+  bool shouldPromote(const Instruction *ToBePromoted) const {
+    // Promote only if all the operands can be statically expanded.
+    // Indeed, we do not want to introduce any new kind of transitions.
+    for (const Use &U : ToBePromoted->operands()) {
+      const Value *Val = U.get();
+      if (Val == getEndOfTransition())
+        continue;
+      if (!isa<ConstantInt>(Val) && !isa<UndefValue>(Val)) {
+        if (isa<ConstantFP>(Val))
+          DEBUG(dbgs() << "Constant FP: could have handled that.\n");
+        return false;
+      }
+    }
+    // Check that the resulting operation is legal.
+    int ISDOpcode = TLI.InstructionOpcodeToISD(ToBePromoted->getOpcode());
+    if (!ISDOpcode)
+      return false;
+    return TLI.isOperationLegalOrCustom(
+        ISDOpcode, EVT::getEVT(getTransitionType(), true));
+  }
+
+  /// \brief Check whether or not \p Use can be combined
+  /// with the transition.
+  /// I.e., is it possible to do Use(Transition) => AnotherUse?
+  bool canCombine(const Instruction *Use) { return isa<StoreInst>(Use); }
+
+  /// \brief Record \p ToBePromoted as part of the chain to be promoted.
+  void enqueueForPromotion(Instruction *ToBePromoted) {
+    InstsToBePromoted.push_back(ToBePromoted);
+  }
+
+  /// \brief Promote all the instructions enqueued for promotion if it is
+  /// profitable.
+  /// \return True if the promotion happened, false otherwise.
+  bool promote() {
+    // Check if there is something to promote.
+    if (InstsToBePromoted.empty())
+      return false;
+
+    // Check cost.
+    if (!isProfitableToPromote())
+      return false;
+
+    // Promote.
+    for (auto &ToBePromoted : InstsToBePromoted)
+      promoteImpl(ToBePromoted);
+    InstsToBePromoted.clear();
+    return true;
+  }
+};
+} // End of anonymous namespace.
+
+void VectorPromoteHelper::promoteImpl(Instruction *ToBePromoted) {
+  // At this point, we know that all the operands of ToBePromoted but Def
+  // can be statically promoted.
+  // For Def, we need to use its parameter in ToBePromoted:
+  // b = ToBePromoted ty1 a
+  // Def = Transition ty1 b to ty2
+  // Move the transition down.
+  // 1. Replace all uses of the promoted operation by the transition.
+  // = ... b => = ... Def.
+  assert(ToBePromoted->getType() == Transition->getType() &&
+         "The type of the result of the transition does not match "
+         "the final type");
+  ToBePromoted->replaceAllUsesWith(Transition);
+  // 2. Update the type of the uses.
+  // b = ToBePromoted ty2 Def => b = ToBePromoted ty1 Def.
+  Type *TransitionTy = getTransitionType();
+  ToBePromoted->mutateType(TransitionTy);
+  // 3. Update all the operands of the promoted operation with promoted
+  // operands.
+  // b = ToBePromoted ty1 Def => b = ToBePromoted ty1 a.
+  for (const Use &U : ToBePromoted->operands()) {
+    const Value *Val = U.get();
+    Value *NewVal = nullptr;
+    if (Val == Transition)
+      NewVal = Transition->getOperand(getTransitionOriginalValueIdx());
+    else if (isa<UndefValue>(Val))
+      NewVal = UndefValue::get(TransitionTy);
+    else if (isa<ConstantInt>(Val))
+      NewVal =
+          ConstantInt::get(TransitionTy, cast<ConstantInt>(Val)->getValue());
+    else
+      assert(0 && "Did you modify shouldPromote and forget to update this?");
+    ToBePromoted->setOperand(U.getOperandNo(), NewVal);
+  }
+  Transition->removeFromParent();
+  Transition->insertAfter(ToBePromoted);
+  Transition->setOperand(getTransitionOriginalValueIdx(), ToBePromoted);
+}
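+
+// For illustration, on the IR of the first case in the new test below
+// (value names are only illustrative), the promotion rewrites:
+//   %extract = extractelement <2 x i32> %in1, i32 1
+//   %out = or i32 %extract, 1
+//   store i32 %out, i32* %dest
+// into:
+//   %out = or <2 x i32> %in1, <i32 1, i32 1>
+//   %extract = extractelement <2 x i32> %out, i32 1
+//   store i32 %extract, i32* %dest
+// so that the final extractelement/store pair can be selected as a single
+// lane store on targets that support it.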
+
+/// Some targets can do store(extractelement) with one instruction.
+/// Try to push the extractelement towards the stores when the target
+/// has this feature and this is profitable.
+bool CodeGenPrepare::OptimizeExtractElementInst(Instruction *Inst) {
+  if (DisableStoreExtract || !TLI || !TLI->canCombineStoreAndExtract())
+    return false;
+
+  // At this point we know that Inst is a vector to scalar transition.
+  // Try to move it down the def-use chain, until:
+  // - We can combine the transition with its single use
+  //   => we got rid of the transition.
+  // - We escape the current basic block
+  //   => we would need to check that we are moving it to a cheaper place,
+  //      which we do not do for now.
+  BasicBlock *Parent = Inst->getParent();
+  DEBUG(dbgs() << "Found an interesting transition: " << *Inst << '\n');
+  VectorPromoteHelper VPH(*TLI, *TTI, Inst);
+  // If the transition has more than one use, assume this is not going to be
+  // beneficial.
+  while (Inst->hasOneUse()) {
+    Instruction *ToBePromoted = cast<Instruction>(*Inst->user_begin());
+    DEBUG(dbgs() << "Use: " << *ToBePromoted << '\n');
+
+    if (ToBePromoted->getParent() != Parent) {
+      DEBUG(dbgs() << "Instruction to promote is in a different block ("
+                   << ToBePromoted->getParent()->getName()
+                   << ") than the transition (" << Parent->getName()
+                   << ").\n");
+      return false;
+    }
+
+    if (VPH.canCombine(ToBePromoted)) {
+      DEBUG(dbgs() << "Assume " << *Inst << '\n'
+                   << "will be combined with: " << *ToBePromoted << '\n');
+      bool Changed = VPH.promote();
+      NumStoreExtractExposed += Changed;
+      return Changed;
+    }
+
+    DEBUG(dbgs() << "Try promoting.\n");
+    if (!VPH.canPromote(ToBePromoted) || !VPH.shouldPromote(ToBePromoted))
+      return false;
+
+    DEBUG(dbgs() << "Promoting is possible... Enqueue for promotion!\n");
+
+    VPH.enqueueForPromotion(ToBePromoted);
+    Inst = ToBePromoted;
+  }
+  return false;
+}
+
 bool CodeGenPrepare::OptimizeInst(Instruction *I) {
   if (PHINode *P = dyn_cast<PHINode>(I)) {
     // It is possible for very late stage optimizations (such as SimplifyCFG)
@@ -3262,6 +3538,9 @@
   if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(I))
     return OptimizeShuffleVectorInst(SVI);
 
+  if (isa<ExtractElementInst>(I))
+    return OptimizeExtractElementInst(I);
+
   return false;
 }
Index: lib/CodeGen/TargetLoweringBase.cpp
===================================================================
--- lib/CodeGen/TargetLoweringBase.cpp
+++ lib/CodeGen/TargetLoweringBase.cpp
@@ -705,6 +705,7 @@
   JumpIsExpensive = false;
   PredictableSelectIsExpensive = false;
   MaskAndBranchFoldingIsLegal = false;
+  CanCombineStoreAndExtract = false;
   HasFloatingPointExceptions = true;
   StackPointerRegisterToSaveRestore = 0;
   ExceptionPointerRegister = 0;
Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -434,6 +434,7 @@
   setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
 
   if (Subtarget->hasNEON()) {
+    CanCombineStoreAndExtract = true;
     addDRTypeForNEON(MVT::v2f32);
    addDRTypeForNEON(MVT::v8i8);
     addDRTypeForNEON(MVT::v4i16);
Index: test/CodeGen/ARM/vector-promotion.ll
===================================================================
--- test/CodeGen/ARM/vector-promotion.ll
+++ test/CodeGen/ARM/vector-promotion.ll
@@ -0,0 +1,119 @@
+; RUN: opt -codegenprepare -mtriple=thumbv7-apple-ios %s -o - -mattr=+neon -S | FileCheck --check-prefix=IR %s
+; RUN: llc -mtriple=thumbv7-apple-ios %s -o - -mattr=+neon | FileCheck --check-prefix=ASM %s
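+;
+; The IR prefix checks the CodeGenPrepare output; the ASM prefix checks the
+; final assembly, where a successful promotion leaves no vmov.32 lane move
+; between the vector load and the lane store.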
+
+; IR-LABEL: @simpleOneInstructionPromotion
+; IR: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1
+; IR-NEXT: [[VECTOR_OR:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[LOAD]], <i32 1, i32 1>
+; IR-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[VECTOR_OR]], i32 1
+; IR-NEXT: store i32 [[EXTRACT]], i32* %dest
+; IR-NEXT: ret
+;
+; Make sure we got rid of any expensive vmov.32 instructions.
+; ASM-LABEL: simpleOneInstructionPromotion:
+; ASM: vldr [[LOAD:d[0-9]+]], [r0]
+; ASM-NEXT: vorr.i32 [[LOAD]], #0x1
+; ASM-NEXT: vst1.32 {[[LOAD]][1]}, [r1:32]
+; ASM-NEXT: bx
+define void @simpleOneInstructionPromotion(<2 x i32>* %addr1, i32* %dest) {
+  %in1 = load <2 x i32>* %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 1
+  %out = or i32 %extract, 1
+  store i32 %out, i32* %dest, align 4
+  ret void
+}
+
+; IR-LABEL: @unsupportedInstructionForPromotion
+; IR: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1
+; IR-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 0
+; IR-NEXT: [[CMP:%[a-zA-Z_0-9-]+]] = icmp eq i32 [[EXTRACT]], %in2
+; IR-NEXT: store i1 [[CMP]], i1* %dest
+; IR-NEXT: ret
+;
+; ASM-LABEL: unsupportedInstructionForPromotion:
+; ASM: vldr [[LOAD:d[0-9]+]], [r0]
+; ASM: vmov.32 {{r[0-9]+}}, [[LOAD]]
+; ASM: bx
+define void @unsupportedInstructionForPromotion(<2 x i32>* %addr1, i32 %in2, i1* %dest) {
+  %in1 = load <2 x i32>* %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 0
+  %out = icmp eq i32 %extract, %in2
+  store i1 %out, i1* %dest, align 4
+  ret void
+}
+
+; IR-LABEL: @unsupportedChainInDifferentBBs
+; IR: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1
+; IR-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 0
+; IR-NEXT: br i1 %bool, label %bb2, label %end
+; BB2
+; IR: [[OR:%[a-zA-Z_0-9-]+]] = or i32 [[EXTRACT]], 1
+; IR-NEXT: store i32 [[OR]], i32* %dest, align 4
+; IR: ret
+;
+; ASM-LABEL: unsupportedChainInDifferentBBs:
+; ASM: vldrne [[LOAD:d[0-9]+]], [r0]
+; ASM: vmovne.32 {{r[0-9]+}}, [[LOAD]]
+; ASM: bx
+define void @unsupportedChainInDifferentBBs(<2 x i32>* %addr1, i32* %dest, i1 %bool) {
+bb1:
+  %in1 = load <2 x i32>* %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 0
+  br i1 %bool, label %bb2, label %end
+bb2:
+  %out = or i32 %extract, 1
+  store i32 %out, i32* %dest, align 4
+  br label %end
+end:
+  ret void
+}
+
+; IR-LABEL: @chainOfInstructionsToPromote
+; IR: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1
+; IR-NEXT: [[VECTOR_OR1:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[LOAD]], <i32 1, i32 1>
+; IR-NEXT: [[VECTOR_OR2:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR1]], <i32 1, i32 1>
+; IR-NEXT: [[VECTOR_OR3:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR2]], <i32 1, i32 1>
+; IR-NEXT: [[VECTOR_OR4:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR3]], <i32 1, i32 1>
+; IR-NEXT: [[VECTOR_OR5:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR4]], <i32 1, i32 1>
+; IR-NEXT: [[VECTOR_OR6:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR5]], <i32 1, i32 1>
+; IR-NEXT: [[VECTOR_OR7:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR6]], <i32 1, i32 1>
+; IR-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[VECTOR_OR7]], i32 0
+; IR-NEXT: store i32 [[EXTRACT]], i32* %dest
+; IR-NEXT: ret
+;
+; ASM-LABEL: chainOfInstructionsToPromote:
+; ASM: vldr [[LOAD:d[0-9]+]], [r0]
+; ASM-NOT: vmov.32 {{r[0-9]+}}, [[LOAD]]
+; ASM: bx
+define void @chainOfInstructionsToPromote(<2 x i32>* %addr1, i32* %dest) {
+  %in1 = load <2 x i32>* %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 0
+  %out1 = or i32 %extract, 1
+  %out2 = or i32 %out1, 1
+  %out3 = or i32 %out2, 1
+  %out4 = or i32 %out3, 1
+  %out5 = or i32 %out4, 1
+  %out6 = or i32 %out5, 1
+  %out7 = or i32 %out6, 1
+  store i32 %out7, i32* %dest, align 4
+  ret void
+}
+
+; IR-LABEL: @unsupportedMultiUses
+; IR: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1
+; IR-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1
+; IR-NEXT: [[OR:%[a-zA-Z_0-9-]+]] = or i32 [[EXTRACT]], 1
+; IR-NEXT: store i32 [[OR]], i32* %dest
+; IR-NEXT: ret i32 [[OR]]
+;
+; ASM-LABEL: unsupportedMultiUses:
+; ASM: vldr [[LOAD:d[0-9]+]], [r0]
+; ASM: vmov.32 {{r[0-9]+}}, [[LOAD]]
+; ASM: bx
+define i32 @unsupportedMultiUses(<2 x i32>* %addr1, i32* %dest) {
+  %in1 = load <2 x i32>* %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 1
+  %out = or i32 %extract, 1
+  store i32 %out, i32* %dest, align 4
+  ret i32 %out
+}
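(For debugging, the whole transformation can be switched off with the new
flag; a hypothetical invocation mirroring the first RUN line:

  opt -codegenprepare -disable-cgp-store-extract -mtriple=thumbv7-apple-ios -mattr=+neon -S vector-promotion.ll

With the flag set, the extractelement stays ahead of the scalar chain and
the IR checks for the promoted cases above would not match.)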