Index: include/llvm/Target/TargetLowering.h =================================================================== --- include/llvm/Target/TargetLowering.h +++ include/llvm/Target/TargetLowering.h @@ -262,7 +262,15 @@ bool isMaskAndBranchFoldingLegal() const { return MaskAndBranchFoldingIsLegal; } - + + /// Return true if the target can combine store(extractelement VectorTy, + /// Idx). + /// \p Cost[out] gives the cost of that transformation when this is true. + virtual bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx, + unsigned &Cost) const { + return false; + } + /// Return true if target supports floating point exceptions. bool hasFloatingPointExceptions() const { return HasFloatingPointExceptions; Index: lib/CodeGen/CodeGenPrepare.cpp =================================================================== --- lib/CodeGen/CodeGenPrepare.cpp +++ lib/CodeGen/CodeGenPrepare.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -63,6 +64,7 @@ STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved"); STATISTIC(NumSelectsExpanded, "Number of selects turned into branches"); STATISTIC(NumAndCmpsMoved, "Number of and/cmp's pushed into branches"); +STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed"); static cl::opt DisableBranchOpts( "disable-cgp-branch-opts", cl::Hidden, cl::init(false), @@ -80,6 +82,14 @@ "enable-andcmp-sinking", cl::Hidden, cl::init(true), cl::desc("Enable sinkinig and/cmp into branches.")); +static cl::opt DisableStoreExtract( + "disable-cgp-store-extract", cl::Hidden, cl::init(false), + cl::desc("Disable store(extract) optimizations in CodeGenPrepare")); + +static cl::opt StressStoreExtract( + "stress-cgp-store-extract", cl::Hidden, cl::init(false), + cl::desc("Stress test store(extract) 
optimizations in CodeGenPrepare")); + namespace { typedef SmallPtrSet SetOfInstrs; typedef DenseMap InstrToOrigTy; @@ -89,6 +99,7 @@ /// transformation profitability. const TargetMachine *TM; const TargetLowering *TLI; + const TargetTransformInfo *TTI; const TargetLibraryInfo *TLInfo; DominatorTree *DT; @@ -118,7 +129,7 @@ public: static char ID; // Pass identification, replacement for typeid explicit CodeGenPrepare(const TargetMachine *TM = nullptr) - : FunctionPass(ID), TM(TM), TLI(nullptr) { + : FunctionPass(ID), TM(TM), TLI(nullptr), TTI(nullptr) { initializeCodeGenPreparePass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F) override; @@ -128,6 +139,7 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addPreserved(); AU.addRequired(); + AU.addRequired(); } private: @@ -144,6 +156,7 @@ bool OptimizeExtUses(Instruction *I); bool OptimizeSelectInst(SelectInst *SI); bool OptimizeShuffleVectorInst(ShuffleVectorInst *SI); + bool OptimizeExtractElementInst(Instruction *Inst); bool DupRetToEnableTailCallOpts(BasicBlock *BB); bool PlaceDbgValues(Function &F); bool sinkAndCmp(Function &F); @@ -171,6 +184,7 @@ if (TM) TLI = TM->getSubtargetImpl()->getTargetLowering(); TLInfo = &getAnalysis(); + TTI = &getAnalysis(); DominatorTreeWrapperPass *DTWP = getAnalysisIfAvailable(); DT = DTWP ? &DTWP->getDomTree() : nullptr; @@ -3168,6 +3182,345 @@ return MadeChange; } +namespace { +/// \brief Helper class to promote a scalar operation to a vector one. +/// This class is used to move downward extractelement transition. +/// E.g., +/// a = vector_op <2 x i32> +/// b = extractelement <2 x i32> a, i32 0 +/// c = scalar_op b +/// store c +/// +/// => +/// a = vector_op <2 x i32> +/// c = vector_op a (equivalent to scalar_op on the related lane) +/// * d = extractelement <2 x i32> c, i32 0 +/// * store d +/// Assuming both extractelement and store can be combine, we get rid of the +/// transition. 
+class VectorPromoteHelper {
+  /// Used to perform some checks on the legality of vector operations.
+  const TargetLowering &TLI;
+
+  /// Used to estimate the cost of the promoted chain.
+  const TargetTransformInfo &TTI;
+
+  /// The transition being moved downwards.
+  Instruction *Transition;
+  /// The sequence of instructions to be promoted.
+  SmallVector<Instruction *, 4> InstsToBePromoted;
+  /// Cost of combining a store and an extract.
+  unsigned StoreExtractCombineCost;
+
+  /// \brief The instruction that represents the current end of the transition.
+  /// Since we are faking the promotion until we reach the end of the chain
+  /// of computation, we need a way to get the current end of the transition.
+  /// \return The last enqueued instruction, or the transition itself when
+  /// nothing has been enqueued yet.
+  Instruction *getEndOfTransition() const {
+    if (InstsToBePromoted.empty())
+      return Transition;
+    return InstsToBePromoted.back();
+  }
+
+  /// \brief Return the index of the original value in the transition.
+  /// E.g., for "extractelement <2 x i32> c, i32 1" the original value,
+  /// c, is at index 0.
+  unsigned getTransitionOriginalValueIdx() const {
+    assert(isa<ExtractElementInst>(Transition) &&
+           "Other kind of transitions are not supported yet");
+    return 0;
+  }
+
+  /// \brief Return the index of the index in the transition.
+  /// E.g., for "extractelement <2 x i32> c, i32 0" the index
+  /// is at index 1.
+  unsigned getTransitionIdx() const {
+    assert(isa<ExtractElementInst>(Transition) &&
+           "Other kind of transitions are not supported yet");
+    return 1;
+  }
+
+  /// \brief Get the type of the transition.
+  /// This is the type of the original value.
+  /// E.g., for "extractelement <2 x i32> c, i32 1" the type of the
+  /// transition is <2 x i32>.
+  Type *getTransitionType() const {
+    return Transition->getOperand(getTransitionOriginalValueIdx())->getType();
+  }
+
+  /// \brief Promote \p ToBePromoted by moving \p Def downward through.
+  /// I.e., we have the following sequence:
+  /// Def = Transition <ty1> a to <ty2>
+  /// b = ToBePromoted <ty2> Def, ...
+  /// =>
+  /// b = ToBePromoted <ty1> a, ...
+  /// Def = Transition <ty1> ToBePromoted to <ty2>
+  void promoteImpl(Instruction *ToBePromoted);
+
+  /// \brief Check whether or not it is profitable to promote all the
+  /// instructions enqueued to be promoted.
+  /// \return True when the estimated cost of the scalar chain (extract plus
+  /// scalar arithmetic) is strictly greater than the cost of the vector chain
+  /// (vector arithmetic plus the store/extract combine cost).
+  bool isProfitableToPromote() {
+    // The lane being extracted is the *index* operand of the transition, not
+    // the vector operand. When the index is not a compile-time constant, pass
+    // -1 so the cost model treats the lane as unknown.
+    Value *ValIdx = Transition->getOperand(getTransitionIdx());
+    unsigned Index = isa<ConstantInt>(ValIdx)
+                         ? cast<ConstantInt>(ValIdx)->getZExtValue()
+                         : -1;
+    Type *PromotedType = getTransitionType();
+
+    // The scalar chain of computation has to pay for the transition
+    // scalar to vector.
+    // The vector chain has to account for the combining cost.
+    uint64_t ScalarCost =
+        TTI.getVectorInstrCost(Transition->getOpcode(), PromotedType, Index);
+    uint64_t VectorCost = StoreExtractCombineCost;
+    for (const auto &Inst : InstsToBePromoted) {
+      // Compute the cost.
+      // By construction, all instructions being promoted are arithmetic ones.
+      // Moreover, one argument is a constant that can be viewed as a splat
+      // constant.
+      Value *Arg0 = Inst->getOperand(0);
+      bool IsArg0Constant = isa<UndefValue>(Arg0) || isa<ConstantInt>(Arg0) ||
+                            isa<ConstantFP>(Arg0);
+      // Exactly one of the two operands is the constant; the other one is the
+      // value flowing down the promoted chain.
+      TargetTransformInfo::OperandValueKind Arg0OVK =
+          IsArg0Constant ? TargetTransformInfo::OK_UniformConstantValue
+                         : TargetTransformInfo::OK_AnyValue;
+      TargetTransformInfo::OperandValueKind Arg1OVK =
+          !IsArg0Constant ? TargetTransformInfo::OK_UniformConstantValue
+                          : TargetTransformInfo::OK_AnyValue;
+      ScalarCost += TTI.getArithmeticInstrCost(
+          Inst->getOpcode(), Inst->getType(), Arg0OVK, Arg1OVK);
+      VectorCost += TTI.getArithmeticInstrCost(Inst->getOpcode(), PromotedType,
+                                               Arg0OVK, Arg1OVK);
+    }
+    DEBUG(dbgs() << "Estimated cost of computation to be promoted:\nScalar: "
+                 << ScalarCost << "\nVector: " << VectorCost << '\n');
+    return ScalarCost > VectorCost;
+  }
+
+  /// \brief Generate a constant vector with \p Val with the same
+  /// number of elements as the transition.
+  /// \p UseSplat defines whether or not \p Val should be replicated
+  /// across the whole vector.
+  /// In other words, if UseSplat == true, we generate <Val, Val, ..., Val>,
+  /// otherwise we generate a vector with as many undef as possible:
+  /// <undef, ..., undef, Val, undef, ..., undef> where \p Val is only
+  /// used at the index of the extract.
+  /// If the extract index is not a constant, a splat is generated regardless
+  /// of \p UseSplat.
+  Value *getConstantVector(Constant *Val, bool UseSplat) const {
+    unsigned ExtractIdx = UINT_MAX;
+    if (!UseSplat) {
+      // If we cannot determine where the constant must be, we have to
+      // use a splat constant.
+      Value *ValExtractIdx = Transition->getOperand(getTransitionIdx());
+      if (ConstantInt *CstVal = dyn_cast<ConstantInt>(ValExtractIdx))
+        ExtractIdx = CstVal->getSExtValue();
+      else
+        UseSplat = true;
+    }
+
+    unsigned End = getTransitionType()->getVectorNumElements();
+    if (UseSplat)
+      return ConstantVector::getSplat(End, Val);
+
+    // Only the extracted lane carries Val; every other lane is undef.
+    SmallVector<Constant *, 4> ConstVec;
+    UndefValue *UndefVal = UndefValue::get(Val->getType());
+    for (unsigned Idx = 0; Idx != End; ++Idx) {
+      if (Idx == ExtractIdx)
+        ConstVec.push_back(Val);
+      else
+        ConstVec.push_back(UndefVal);
+    }
+    return ConstantVector::get(ConstVec);
+  }
+
+  /// \brief Check if promoting to a vector type an operand at \p OperandIdx
+  /// in \p Use can trigger undefined behavior.
+  static bool canCauseUndefinedBehavior(const Instruction *Use,
+                                        unsigned OperandIdx) {
+    // This is not safe to introduce undef when the operand is on
+    // the right hand side of a division-like instruction.
+ if (OperandIdx != 1) + return false; + switch (Use->getOpcode()) { + default: + return false; + case Instruction::SDiv: + case Instruction::UDiv: + case Instruction::SRem: + case Instruction::URem: + return true; + case Instruction::FDiv: + case Instruction::FRem: + return !Use->hasUnsafeAlgebra(); + } + llvm_unreachable(nullptr); + } + +public: + VectorPromoteHelper(const TargetLowering &TLI, const TargetTransformInfo &TTI, + Instruction *Transition, unsigned CombineCost) + : TLI(TLI), TTI(TTI), Transition(Transition), + StoreExtractCombineCost(CombineCost), CombineInst(nullptr) { + assert(Transition && "Do not know how to promote null"); + } + + /// \brief Check if we can promote \p ToBePromoted to \p Type. + bool canPromote(const Instruction *ToBePromoted) const { + // We could support CastInst too. + return isa(ToBePromoted); + } + + /// \brief Check if it is profitable to promote \p ToBePromoted + /// by moving downward the transition through. + bool shouldPromote(const Instruction *ToBePromoted) const { + // Promote only if all the operands can be statically expanded. + // Indeed, we do not want to introduce any new kind of transitions. + for (const Use &U : ToBePromoted->operands()) { + const Value *Val = U.get(); + if (Val == getEndOfTransition()) { + // If the use is a division and the transition is on the rhs, + // we cannot promote the operation, otherwise we may create a + // division by zero. + if (canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo())) + return false; + continue; + } + if (!isa(Val) && !isa(Val) && + !isa(Val)) + return false; + } + // Check that the resulting operation is legal. + int ISDOpcode = TLI.InstructionOpcodeToISD(ToBePromoted->getOpcode()); + if (!ISDOpcode) + return false; + return StressStoreExtract || + TLI.isOperationLegalOrCustom(ISDOpcode, + EVT::getEVT(getTransitionType(), true)); + } + + /// \brief Check whether or not \p Use can be combined + /// with the transition. 
+ /// I.e., is it possible to do Use(Transition) => AnotherUse? + bool canCombine(const Instruction *Use) { return isa(Use); } + + /// \brief Record \p ToBePromoted as part of the chain to be promoted. + void enqueueForPromotion(Instruction *ToBePromoted) { + InstsToBePromoted.push_back(ToBePromoted); + } + + /// \brief Promote all the instructions enqueued for promotion if it is + /// is profitable. + /// \return True if the promotion happened, false otherwise. + bool promote() { + // Check if there is something to promote. + if (InstsToBePromoted.empty()) + return false; + + // Check cost. + if (!StressStoreExtract && !isProfitableToPromote()) + return false; + + // Promote. + for (auto &ToBePromoted : InstsToBePromoted) + promoteImpl(ToBePromoted); + InstsToBePromoted.clear(); + return true; + } +}; +} // End of anonymous namespace. + +void VectorPromoteHelper::promoteImpl(Instruction *ToBePromoted) { + // At this point, we know that all the operands of ToBePromoted but Def + // can be statically promoted. + // For Def, we need to use its parameter in ToBePromoted: + // b = ToBePromoted ty1 a + // Def = Transition ty1 b to ty2 + // Move the transition down. + // 1. Replace all uses of the promoted operation by the transition. + // = ... b => = ... Def. + assert(ToBePromoted->getType() == Transition->getType() && + "The type of the result of the transition does not match " + "the final type"); + ToBePromoted->replaceAllUsesWith(Transition); + // 2. Update the type of the uses. + // b = ToBePromoted ty2 Def => b = ToBePromoted ty1 Def. + Type *TransitionTy = getTransitionType(); + ToBePromoted->mutateType(TransitionTy); + // 3. Update all the operands of the promoted operation with promoted + // operands. + // b = ToBePromoted ty1 Def => b = ToBePromoted ty1 a. 
+ for (Use &U : ToBePromoted->operands()) { + Value *Val = U.get(); + Value *NewVal = nullptr; + if (Val == Transition) + NewVal = Transition->getOperand(getTransitionOriginalValueIdx()); + else if (isa(Val) || isa(Val) || + isa(Val)) { + // Use a splat constant if it is not safe to use undef. + NewVal = getConstantVector( + cast(Val), + isa(Val) || + canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo())); + } else + assert(0 && "Did you modified shouldPromote and forgot to update this?"); + ToBePromoted->setOperand(U.getOperandNo(), NewVal); + } + Transition->removeFromParent(); + Transition->insertAfter(ToBePromoted); + Transition->setOperand(getTransitionOriginalValueIdx(), ToBePromoted); +} + +/// Some targets can do store(extractelement) with one instruction. +/// Try to push the extractelement towards the stores when the target +/// has this feature and this is profitable. +bool CodeGenPrepare::OptimizeExtractElementInst(Instruction *Inst) { + unsigned CombineCost = UINT_MAX; + if (DisableStoreExtract || !TLI || + (!StressStoreExtract && + !TLI->canCombineStoreAndExtract(Inst->getOperand(0)->getType(), + Inst->getOperand(1), CombineCost))) + return false; + + // At this point we know that Inst is a vector to scalar transition. + // Try to move it down the def-use chain, until: + // - We can combine the transition with its single use + // => we got rid of the transition. + // - We escape the current basic block + // => we would need to check that we are moving it at a cheaper place and + // we do not do that for now. + BasicBlock *Parent = Inst->getParent(); + DEBUG(dbgs() << "Found an interesting transition: " << *Inst << '\n'); + VectorPromoteHelper VPH(*TLI, *TTI, Inst, CombineCost); + // If the transition has more than one use, assume this is not going to be + // beneficial. 
+ while (Inst->hasOneUse()) { + Instruction *ToBePromoted = cast(*Inst->user_begin()); + DEBUG(dbgs() << "Use: " << *ToBePromoted << '\n'); + + if (ToBePromoted->getParent() != Parent) { + DEBUG(dbgs() << "Instruction to promote is in a different block (" + << ToBePromoted->getParent()->getName() + << ") than the transition (" << Parent->getName() << ").\n"); + return false; + } + + if (VPH.canCombine(ToBePromoted)) { + DEBUG(dbgs() << "Assume " << *Inst << '\n' + << "will be combined with: " << *ToBePromoted << '\n'); + bool Changed = VPH.promote(); + NumStoreExtractExposed += Changed; + return Changed; + } + + DEBUG(dbgs() << "Try promoting.\n"); + if (!VPH.canPromote(ToBePromoted) || !VPH.shouldPromote(ToBePromoted)) + return false; + + DEBUG(dbgs() << "Promoting is possible... Enqueue for promotion!\n"); + + VPH.enqueueForPromotion(ToBePromoted); + Inst = ToBePromoted; + } + return false; +} + bool CodeGenPrepare::OptimizeInst(Instruction *I) { if (PHINode *P = dyn_cast(I)) { // It is possible for very late stage optimizations (such as SimplifyCFG) @@ -3262,6 +3615,9 @@ if (ShuffleVectorInst *SVI = dyn_cast(I)) return OptimizeShuffleVectorInst(SVI); + if (isa(I)) + return OptimizeExtractElementInst(I); + return false; } Index: lib/Target/ARM/ARMISelLowering.h =================================================================== --- lib/Target/ARM/ARMISelLowering.h +++ lib/Target/ARM/ARMISelLowering.h @@ -410,6 +410,9 @@ bool useLoadStackGuardNode() const override; + bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx, + unsigned &Cost) const override; + protected: std::pair findRepresentativeClass(MVT VT) const override; Index: lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- lib/Target/ARM/ARMISelLowering.cpp +++ lib/Target/ARM/ARMISelLowering.cpp @@ -11105,6 +11105,35 @@ return Subtarget->getTargetTriple().getObjectFormat() == Triple::MachO; } +bool ARMTargetLowering::canCombineStoreAndExtract(Type 
*VectorTy, Value *Idx, + unsigned &Cost) const { + // If we do not have NEON, vector types are not natively supported. + if (!Subtarget->hasNEON()) + return false; + + // Floating point values and vector values map to the same register file. + // Therefore, althought we could do a store extract of a vector type, this is + // better to leave at float as we have more freedom in the addressing mode for + // those. + if (VectorTy->isFPOrFPVectorTy()) + return false; + + // If the index is unknown at compile time, this is very expensive to lower + // and it is not possible to combine the store with the extract. + if (!isa(Idx)) + return false; + + assert(VectorTy->isVectorTy() && "VectorTy is not a vector type"); + unsigned BitWidth = cast(VectorTy)->getBitWidth(); + // We can do a store + vector extract on any vector that fits perfectly in a D + // or Q register. + if (BitWidth == 64 || BitWidth == 128) { + Cost = 0; + return true; + } + return false; +} + Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Index: test/CodeGen/ARM/vector-promotion.ll =================================================================== --- test/CodeGen/ARM/vector-promotion.ll +++ test/CodeGen/ARM/vector-promotion.ll @@ -0,0 +1,403 @@ +; RUN: opt -codegenprepare -mtriple=thumbv7-apple-ios %s -o - -mattr=+neon -S | FileCheck --check-prefix=IR-BOTH --check-prefix=IR-NORMAL %s +; RUN: opt -codegenprepare -mtriple=thumbv7-apple-ios %s -o - -mattr=+neon -S -stress-cgp-store-extract | FileCheck --check-prefix=IR-BOTH --check-prefix=IR-STRESS %s +; RUN: llc -mtriple=thumbv7-apple-ios %s -o - -mattr=+neon | FileCheck --check-prefix=ASM %s + +; IR-BOTH-LABEL: @simpleOneInstructionPromotion +; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1 +; IR-BOTH-NEXT: [[VECTOR_OR:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[LOAD]], +; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement 
<2 x i32> [[VECTOR_OR]], i32 1 +; IR-BOTH-NEXT: store i32 [[EXTRACT]], i32* %dest +; IR-BOTH-NEXT: ret +; +; Make sure we got rid of any expensive vmov.32 instructions. +; ASM-LABEL: simpleOneInstructionPromotion: +; ASM: vldr [[LOAD:d[0-9]+]], [r0] +; ASM-NEXT: vorr.i32 [[LOAD]], #0x1 +; ASM-NEXT: vst1.32 {[[LOAD]][1]}, [r1:32] +; ASM-NEXT: bx +define void @simpleOneInstructionPromotion(<2 x i32>* %addr1, i32* %dest) { + %in1 = load <2 x i32>* %addr1, align 8 + %extract = extractelement <2 x i32> %in1, i32 1 + %out = or i32 %extract, 1 + store i32 %out, i32* %dest, align 4 + ret void +} + +; IR-BOTH-LABEL: @unsupportedInstructionForPromotion +; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1 +; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 0 +; IR-BOTH-NEXT: [[CMP:%[a-zA-Z_0-9-]+]] = icmp eq i32 [[EXTRACT]], %in2 +; IR-BOTH-NEXT: store i1 [[CMP]], i1* %dest +; IR-BOTH-NEXT: ret +; +; ASM-LABEL: unsupportedInstructionForPromotion: +; ASM: vldr [[LOAD:d[0-9]+]], [r0] +; ASM: vmov.32 {{r[0-9]+}}, [[LOAD]] +; ASM: bx +define void @unsupportedInstructionForPromotion(<2 x i32>* %addr1, i32 %in2, i1* %dest) { + %in1 = load <2 x i32>* %addr1, align 8 + %extract = extractelement <2 x i32> %in1, i32 0 + %out = icmp eq i32 %extract, %in2 + store i1 %out, i1* %dest, align 4 + ret void +} + + +; IR-BOTH-LABEL: @unsupportedChainInDifferentBBs +; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1 +; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 0 +; IR-BOTH-NEXT: br i1 %bool, label %bb2, label %end +; BB2 +; IR-BOTH: [[OR:%[a-zA-Z_0-9-]+]] = or i32 [[EXTRACT]], 1 +; IR-BOTH-NEXT: store i32 [[OR]], i32* %dest, align 4 +; IR-BOTH: ret +; +; ASM-LABEL: unsupportedChainInDifferentBBs: +; ASM: vldrne [[LOAD:d[0-9]+]], [r0] +; ASM: vmovne.32 {{r[0-9]+}}, [[LOAD]] +; ASM: bx +define void @unsupportedChainInDifferentBBs(<2 x i32>* %addr1, i32* %dest, i1 %bool) { +bb1: + %in1 = load <2 x i32>* 
%addr1, align 8 + %extract = extractelement <2 x i32> %in1, i32 0 + br i1 %bool, label %bb2, label %end +bb2: + %out = or i32 %extract, 1 + store i32 %out, i32* %dest, align 4 + br label %end +end: + ret void +} + +; IR-BOTH-LABEL: @chainOfInstructionsToPromote +; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1 +; IR-BOTH-NEXT: [[VECTOR_OR1:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[LOAD]], <i32 1, i32 undef> +; IR-BOTH-NEXT: [[VECTOR_OR2:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR1]], <i32 1, i32 undef> +; IR-BOTH-NEXT: [[VECTOR_OR3:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR2]], <i32 1, i32 undef> +; IR-BOTH-NEXT: [[VECTOR_OR4:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR3]], <i32 1, i32 undef> +; IR-BOTH-NEXT: [[VECTOR_OR5:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR4]], <i32 1, i32 undef> +; IR-BOTH-NEXT: [[VECTOR_OR6:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR5]], <i32 1, i32 undef> +; IR-BOTH-NEXT: [[VECTOR_OR7:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR6]], <i32 1, i32 undef> +; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[VECTOR_OR7]], i32 0 +; IR-BOTH-NEXT: store i32 [[EXTRACT]], i32* %dest +; IR-BOTH-NEXT: ret +; +; ASM-LABEL: chainOfInstructionsToPromote: +; ASM: vldr [[LOAD:d[0-9]+]], [r0] +; ASM-NOT: vmov.32 {{r[0-9]+}}, [[LOAD]] +; ASM: bx +define void @chainOfInstructionsToPromote(<2 x i32>* %addr1, i32* %dest) { + %in1 = load <2 x i32>* %addr1, align 8 + %extract = extractelement <2 x i32> %in1, i32 0 + %out1 = or i32 %extract, 1 + %out2 = or i32 %out1, 1 + %out3 = or i32 %out2, 1 + %out4 = or i32 %out3, 1 + %out5 = or i32 %out4, 1 + %out6 = or i32 %out5, 1 + %out7 = or i32 %out6, 1 + store i32 %out7, i32* %dest, align 4 + ret void +} + +; IR-BOTH-LABEL: @unsupportedMultiUses +; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1 +; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1 +; IR-BOTH-NEXT: [[OR:%[a-zA-Z_0-9-]+]] = or i32 [[EXTRACT]], 1 +; IR-BOTH-NEXT: store i32 [[OR]], i32* %dest +; IR-BOTH-NEXT: ret i32 [[OR]] +; +; ASM-LABEL: unsupportedMultiUses: +; ASM: vldr [[LOAD:d[0-9]+]], [r0] +; ASM: vmov.32 
{{r[0-9]+}}, [[LOAD]] +; ASM: bx +define i32 @unsupportedMultiUses(<2 x i32>* %addr1, i32* %dest) { + %in1 = load <2 x i32>* %addr1, align 8 + %extract = extractelement <2 x i32> %in1, i32 1 + %out = or i32 %extract, 1 + store i32 %out, i32* %dest, align 4 + ret i32 %out +} + +; Check that we promote with a splat constant when this is a division. +; The NORMAL mode does not promote anything as divisions are not legal. +; IR-BOTH-LABEL: @udivCase +; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1 +; Scalar version: +; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1 +; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = udiv i32 [[EXTRACT]], 7 +; Vector version: +; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = udiv <2 x i32> [[LOAD]], <i32 7, i32 7> +; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[DIV]], i32 1 +; +; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest +; IR-BOTH-NEXT: ret +define void @udivCase(<2 x i32>* %addr1, i32* %dest) { + %in1 = load <2 x i32>* %addr1, align 8 + %extract = extractelement <2 x i32> %in1, i32 1 + %out = udiv i32 %extract, 7 + store i32 %out, i32* %dest, align 4 + ret void +} + +; IR-BOTH-LABEL: @uremCase +; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1 +; Scalar version: +; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1 +; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = urem i32 [[EXTRACT]], 7 +; Vector version: +; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = urem <2 x i32> [[LOAD]], <i32 7, i32 7> +; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[DIV]], i32 1 +; +; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest +; IR-BOTH-NEXT: ret +define void @uremCase(<2 x i32>* %addr1, i32* %dest) { + %in1 = load <2 x i32>* %addr1, align 8 + %extract = extractelement <2 x i32> %in1, i32 1 + %out = urem i32 %extract, 7 + store i32 %out, i32* %dest, align 4 + ret void +} + +; IR-BOTH-LABEL: @sdivCase +; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1 
+; Scalar version: +; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1 +; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = sdiv i32 [[EXTRACT]], 7 +; Vector version: +; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = sdiv <2 x i32> [[LOAD]], +; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[DIV]], i32 1 +; +; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest +; IR-BOTH-NEXT: ret +define void @sdivCase(<2 x i32>* %addr1, i32* %dest) { + %in1 = load <2 x i32>* %addr1, align 8 + %extract = extractelement <2 x i32> %in1, i32 1 + %out = sdiv i32 %extract, 7 + store i32 %out, i32* %dest, align 4 + ret void +} + +; IR-BOTH-LABEL: @sremCase +; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1 +; Scalar version: +; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1 +; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = srem i32 [[EXTRACT]], 7 +; Vector version: +; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = srem <2 x i32> [[LOAD]], +; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[DIV]], i32 1 +; +; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest +; IR-BOTH-NEXT: ret +define void @sremCase(<2 x i32>* %addr1, i32* %dest) { + %in1 = load <2 x i32>* %addr1, align 8 + %extract = extractelement <2 x i32> %in1, i32 1 + %out = srem i32 %extract, 7 + store i32 %out, i32* %dest, align 4 + ret void +} + +; IR-BOTH-LABEL: @fdivCase +; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>* %addr1 +; Scalar version: +; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1 +; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = fdiv float [[EXTRACT]], 7.0 +; Vector version: +; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = fdiv <2 x float> [[LOAD]], +; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[DIV]], i32 1 +; +; IR-BOTH-NEXT: store float [[RES]], float* %dest +; IR-BOTH-NEXT: ret +define void @fdivCase(<2 x float>* %addr1, float* %dest) { + %in1 = 
load <2 x float>* %addr1, align 8 + %extract = extractelement <2 x float> %in1, i32 1 + %out = fdiv float %extract, 7.0 + store float %out, float* %dest, align 4 + ret void +} + +; IR-BOTH-LABEL: @fremCase +; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>* %addr1 +; Scalar version: +; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1 +; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = frem float [[EXTRACT]], 7.0 +; Vector version: +; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = frem <2 x float> [[LOAD]], +; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[DIV]], i32 1 +; +; IR-BOTH-NEXT: store float [[RES]], float* %dest +; IR-BOTH-NEXT: ret +define void @fremCase(<2 x float>* %addr1, float* %dest) { + %in1 = load <2 x float>* %addr1, align 8 + %extract = extractelement <2 x float> %in1, i32 1 + %out = frem float %extract, 7.0 + store float %out, float* %dest, align 4 + ret void +} + +; Check that we do not promote when we may introduce undefined behavior +; like division by zero. +; IR-BOTH-LABEL: @undefDivCase +; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1 +; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1 +; IR-BOTH-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = udiv i32 7, [[EXTRACT]] +; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest +; IR-BOTH-NEXT: ret +define void @undefDivCase(<2 x i32>* %addr1, i32* %dest) { + %in1 = load <2 x i32>* %addr1, align 8 + %extract = extractelement <2 x i32> %in1, i32 1 + %out = udiv i32 7, %extract + store i32 %out, i32* %dest, align 4 + ret void +} + + +; Check that we do not promote when we may introduce undefined behavior +; like division by zero. 
+; IR-BOTH-LABEL: @undefRemCase +; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1 +; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1 +; IR-BOTH-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = srem i32 7, [[EXTRACT]] +; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest +; IR-BOTH-NEXT: ret +define void @undefRemCase(<2 x i32>* %addr1, i32* %dest) { + %in1 = load <2 x i32>* %addr1, align 8 + %extract = extractelement <2 x i32> %in1, i32 1 + %out = srem i32 7, %extract + store i32 %out, i32* %dest, align 4 + ret void +} + +; Check that we use an undef mask for undefined behavior if the fast-math +; flag is set. +; IR-BOTH-LABEL: @undefConstantFRemCaseWithFastMath +; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>* %addr1 +; Scalar version: +; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1 +; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = frem fast float [[EXTRACT]], 7.0 +; Vector version: +; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = frem fast <2 x float> [[LOAD]], +; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[DIV]], i32 1 +; +; IR-BOTH-NEXT: store float [[RES]], float* %dest +; IR-BOTH-NEXT: ret +define void @undefConstantFRemCaseWithFastMath(<2 x float>* %addr1, float* %dest) { + %in1 = load <2 x float>* %addr1, align 8 + %extract = extractelement <2 x float> %in1, i32 1 + %out = frem fast float %extract, 7.0 + store float %out, float* %dest, align 4 + ret void +} + +; Check that we use an undef mask for undefined behavior if the fast-math +; flag is set. 
+; IR-BOTH-LABEL: @undefVectorFRemCaseWithFastMath +; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>* %addr1 +; Scalar version: +; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1 +; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = frem fast float 7.000000e+00, [[EXTRACT]] +; Vector version: +; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = frem fast <2 x float> <float undef, float 7.000000e+00>, [[LOAD]] +; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[DIV]], i32 1 +; +; IR-BOTH-NEXT: store float [[RES]], float* %dest +; IR-BOTH-NEXT: ret +define void @undefVectorFRemCaseWithFastMath(<2 x float>* %addr1, float* %dest) { + %in1 = load <2 x float>* %addr1, align 8 + %extract = extractelement <2 x float> %in1, i32 1 + %out = frem fast float 7.0, %extract + store float %out, float* %dest, align 4 + ret void +} + +; Check that we are able to promote floating point values. +; This requires the STRESS mode, as floating point values are +; not promoted on armv7. +; IR-BOTH-LABEL: @simpleOneInstructionPromotionFloat +; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>* %addr1 +; Scalar version: +; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1 +; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = fadd float [[EXTRACT]], 1.0 +; Vector version: +; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = fadd <2 x float> [[LOAD]], <float undef, float 1.000000e+00> +; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[DIV]], i32 1 +; +; IR-BOTH-NEXT: store float [[RES]], float* %dest +; IR-BOTH-NEXT: ret +define void @simpleOneInstructionPromotionFloat(<2 x float>* %addr1, float* %dest) { + %in1 = load <2 x float>* %addr1, align 8 + %extract = extractelement <2 x float> %in1, i32 1 + %out = fadd float %extract, 1.0 + store float %out, float* %dest, align 4 + ret void +} + +; Check that we correctly use a splat constant when we cannot +; determine at compile time the index of the extract. 
+; This requires the STRESS mode, as variable indices are expensive +; to lower. +; IR-BOTH-LABEL: @simpleOneInstructionPromotionVariableIdx +; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1 +; Scalar version: +; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 %idx +; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = or i32 [[EXTRACT]], 1 +; Vector version: +; IR-STRESS-NEXT: [[OR:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[LOAD]], <i32 1, i32 1> +; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[OR]], i32 %idx +; +; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest +; IR-BOTH-NEXT: ret +define void @simpleOneInstructionPromotionVariableIdx(<2 x i32>* %addr1, i32* %dest, i32 %idx) { + %in1 = load <2 x i32>* %addr1, align 8 + %extract = extractelement <2 x i32> %in1, i32 %idx + %out = or i32 %extract, 1 + store i32 %out, i32* %dest, align 4 + ret void +} + +; Check a vector with more than 2 elements. +; This requires the STRESS mode because currently 'or v8i8' is not marked +; as legal or custom, although the actual assembly is better if we were +; promoting it. +; IR-BOTH-LABEL: @simpleOneInstructionPromotion8x8 +; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <8 x i8>* %addr1 +; Scalar version: +; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <8 x i8> [[LOAD]], i32 1 +; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = or i8 [[EXTRACT]], 1 +; Vector version: +; IR-STRESS-NEXT: [[OR:%[a-zA-Z_0-9-]+]] = or <8 x i8> [[LOAD]], <i8 undef, i8 1, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef> +; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <8 x i8> [[OR]], i32 1 +; +; IR-BOTH-NEXT: store i8 [[RES]], i8* %dest +; IR-BOTH-NEXT: ret +define void @simpleOneInstructionPromotion8x8(<8 x i8>* %addr1, i8* %dest) { + %in1 = load <8 x i8>* %addr1, align 8 + %extract = extractelement <8 x i8> %in1, i32 1 + %out = or i8 %extract, 1 + store i8 %out, i8* %dest, align 4 + ret void +} + +; Check that we optimized the sequence correctly when it can be +; lowered on a Q register. 
+; IR-BOTH-LABEL: @simpleOneInstructionPromotion4x32 +; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <4 x i32>* %addr1 +; IR-BOTH-NEXT: [[VECTOR_OR:%[a-zA-Z_0-9-]+]] = or <4 x i32> [[LOAD]], <i32 undef, i32 1, i32 undef, i32 undef> +; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <4 x i32> [[VECTOR_OR]], i32 1 +; IR-BOTH-NEXT: store i32 [[EXTRACT]], i32* %dest +; IR-BOTH-NEXT: ret +; +; Make sure we got rid of any expensive vmov.32 instructions. +; ASM-LABEL: simpleOneInstructionPromotion4x32: +; ASM: vld1.64 {[[LOAD:d[0-9]+]], d{{[0-9]+}}}, [r0] +; The Q register used here must be [[LOAD]] / 2, but we cannot express that. +; ASM-NEXT: vorr.i32 q{{[0-9]+}}, #0x1 +; ASM-NEXT: vst1.32 {[[LOAD]][1]}, [r1] +; ASM-NEXT: bx +define void @simpleOneInstructionPromotion4x32(<4 x i32>* %addr1, i32* %dest) { + %in1 = load <4 x i32>* %addr1, align 8 + %extract = extractelement <4 x i32> %in1, i32 1 + %out = or i32 %extract, 1 + store i32 %out, i32* %dest, align 1 + ret void +}