Index: llvm/trunk/lib/Target/ARM/ARM.h =================================================================== --- llvm/trunk/lib/Target/ARM/ARM.h +++ llvm/trunk/lib/Target/ARM/ARM.h @@ -43,6 +43,7 @@ FunctionPass *createA15SDOptimizerPass(); FunctionPass *createARMLoadStoreOptimizationPass(bool PreAlloc = false); FunctionPass *createARMExpandPseudoPass(); +FunctionPass *createARMCodeGenPreparePass(); FunctionPass *createARMConstantIslandPass(); FunctionPass *createMLxExpansionPass(); FunctionPass *createThumb2ITBlockPass(); @@ -64,6 +65,7 @@ void initializeARMParallelDSPPass(PassRegistry &); void initializeARMLoadStoreOptPass(PassRegistry &); void initializeARMPreAllocLoadStoreOptPass(PassRegistry &); +void initializeARMCodeGenPreparePass(PassRegistry &); void initializeARMConstantIslandsPass(PassRegistry &); void initializeARMExpandPseudoPass(PassRegistry &); void initializeThumb2SizeReducePass(PassRegistry &); Index: llvm/trunk/lib/Target/ARM/ARMCodeGenPrepare.cpp =================================================================== --- llvm/trunk/lib/Target/ARM/ARMCodeGenPrepare.cpp +++ llvm/trunk/lib/Target/ARM/ARMCodeGenPrepare.cpp @@ -0,0 +1,746 @@ +//===----- ARMCodeGenPrepare.cpp ------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass inserts intrinsics to handle small types that would otherwise be +/// promoted during legalization. Here we can manually promote types or insert +/// intrinsics which can handle narrow types that aren't supported by the +/// register classes. +// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMSubtarget.h" +#include "ARMTargetMachine.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" + +#define DEBUG_TYPE "arm-codegenprepare" + +using namespace llvm; + +static cl::opt +DisableCGP("arm-disable-cgp", cl::Hidden, cl::init(false), + cl::desc("Disable ARM specific CodeGenPrepare pass")); + +static cl::opt +EnableDSP("arm-enable-scalar-dsp", cl::Hidden, cl::init(false), + cl::desc("Use DSP instructions for scalar operations")); + +static cl::opt +EnableDSPWithImms("arm-enable-scalar-dsp-imms", cl::Hidden, cl::init(false), + cl::desc("Use DSP instructions for scalar operations\ + with immediate operands")); + +namespace { + +class IRPromoter { + SmallPtrSet NewInsts; + SmallVector InstsToRemove; + Module *M = nullptr; + LLVMContext &Ctx; + +public: + IRPromoter(Module *M) : M(M), Ctx(M->getContext()) { } + + void Cleanup() { + for (auto *I : InstsToRemove) { + LLVM_DEBUG(dbgs() << "ARM CGP: Removing " << *I << "\n"); + I->dropAllReferences(); + I->eraseFromParent(); + } + InstsToRemove.clear(); + NewInsts.clear(); + } + + void Mutate(Type *OrigTy, + SmallPtrSetImpl &Visited, + SmallPtrSetImpl 
&Leaves, + SmallPtrSetImpl &Roots); +}; + +class ARMCodeGenPrepare : public FunctionPass { + const ARMSubtarget *ST = nullptr; + IRPromoter *Promoter = nullptr; + std::set AllVisited; + Type *OrigTy = nullptr; + unsigned TypeSize = 0; + + bool isNarrowInstSupported(Instruction *I); + bool isSupportedValue(Value *V); + bool isLegalToPromote(Value *V); + bool TryToPromote(Value *V); + +public: + static char ID; + + ARMCodeGenPrepare() : FunctionPass(ID) {} + + ~ARMCodeGenPrepare() { delete Promoter; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + } + + StringRef getPassName() const override { return "ARM IR optimizations"; } + + bool doInitialization(Module &M) override; + bool runOnFunction(Function &F) override; +}; + +} + +/// Can the given value generate sign bits. +static bool isSigned(Value *V) { + if (!isa(V)) + return false; + + unsigned Opc = cast(V)->getOpcode(); + return Opc == Instruction::AShr || Opc == Instruction::SDiv || + Opc == Instruction::SRem; +} + +/// Some instructions can use 8- and 16-bit operands, and we don't need to +/// promote anything larger. We disallow booleans to make life easier when +/// dealing with icmps but allow any other integer that is <= 16 bits. Void +/// types are accepted so we can handle switches. +static bool isSupportedType(Value *V) { + if (V->getType()->isVoidTy()) + return true; + + const IntegerType *IntTy = dyn_cast(V->getType()); + if (!IntTy) + return false; + + // Don't try to promote boolean values. + if (IntTy->getBitWidth() == 1) + return false; + + if (auto *ZExt = dyn_cast(V)) + return isSupportedType(ZExt->getOperand(0)); + + return IntTy->getBitWidth() <= 16; +} + +/// Return true if V will require any promoted values to be truncated for the +/// use to be valid. +static bool isSink(Value *V) { + auto UsesNarrowValue = [](Value *V) { + return V->getType()->getScalarSizeInBits() <= 32; + }; + + if (auto *Store = dyn_cast(V)) + return UsesNarrowValue(Store->getValueOperand()); + if (auto *Return = dyn_cast(V)) + return UsesNarrowValue(Return->getReturnValue()); + + return isa(V); +} + +/// Return true if the given value is a leaf that will need to be zext'd. +static bool isSource(Value *V) { + if (isa(V) && isSupportedType(V)) + return true; + else if (isa(V)) + return true; + else if (auto *ZExt = dyn_cast(V)) + // ZExt can be a leaf if its the only user of a load. + return isa(ZExt->getOperand(0)) && + ZExt->getOperand(0)->hasOneUse(); + else if (auto *Call = dyn_cast(V)) + return Call->hasRetAttr(Attribute::AttrKind::ZExt); + else if (auto *Load = dyn_cast(V)) { + if (!isa(Load->getType())) + return false; + // A load is a leaf, unless its already just being zext'd. + if (Load->hasOneUse() && isa(*Load->use_begin())) + return false; + + return true; + } + return false; +} + +/// Return whether the instruction can be promoted within any modifications to +/// it's operands or result. +static bool isSafeOverflow(Instruction *I) { + if (isa(I) && I->hasNoUnsignedWrap()) + return true; + + unsigned Opc = I->getOpcode(); + if (Opc == Instruction::Add || Opc == Instruction::Sub) { + // We don't care if the add or sub could wrap if the value is decreasing + // and is only being used by an unsigned compare. 
+ if (!I->hasOneUse() || + !isa(*I->user_begin()) || + !isa(I->getOperand(1))) + return false; + + auto *CI = cast(*I->user_begin()); + if (CI->isSigned()) + return false; + + bool NegImm = cast(I->getOperand(1))->isNegative(); + bool IsDecreasing = ((Opc == Instruction::Sub) && !NegImm) || + ((Opc == Instruction::Add) && NegImm); + if (!IsDecreasing) + return false; + + LLVM_DEBUG(dbgs() << "ARM CGP: Allowing safe overflow for " << *I << "\n"); + return true; + } + + // Otherwise, if an instruction is using a negative immediate we will need + // to fix it up during the promotion. + for (auto &Op : I->operands()) { + if (auto *Const = dyn_cast(Op)) + if (Const->isNegative()) + return false; + } + return false; +} + +static bool shouldPromote(Value *V) { + auto *I = dyn_cast(V); + if (!I) + return false; + + if (!isa(V->getType())) + return false; + + if (isa(I) || isa(I) || isa(I) || + isa(I)) + return false; + + if (auto *ZExt = dyn_cast(I)) + return !ZExt->getDestTy()->isIntegerTy(32); + + return true; +} + +/// Return whether we can safely mutate V's type to ExtTy without having to be +/// concerned with zero extending or truncation. +static bool isPromotedResultSafe(Value *V) { + if (!isa(V)) + return true; + + if (isSigned(V)) + return false; + + // If I is only being used by something that will require its value to be + // truncated, then we don't care about the promoted result. + auto *I = cast(V); + if (I->hasOneUse() && isSink(*I->use_begin())) + return true; + + if (isa(I)) + return isSafeOverflow(I); + return true; +} + +/// Return the intrinsic for the instruction that can perform the same +/// operation but on a narrow type. This is using the parallel dsp intrinsics +/// on scalar values. +static Intrinsic::ID getNarrowIntrinsic(Instruction *I, unsigned TypeSize) { + // Whether we use the signed or unsigned versions of these intrinsics + // doesn't matter because we're not using the GE bits that they set in + // the APSR. + switch(I->getOpcode()) { + default: + break; + case Instruction::Add: + return TypeSize == 16 ? Intrinsic::arm_uadd16 : + Intrinsic::arm_uadd8; + case Instruction::Sub: + return TypeSize == 16 ? Intrinsic::arm_usub16 : + Intrinsic::arm_usub8; + } + llvm_unreachable("unhandled opcode for narrow intrinsic"); +} + +void IRPromoter::Mutate(Type *OrigTy, + SmallPtrSetImpl &Visited, + SmallPtrSetImpl &Leaves, + SmallPtrSetImpl &Roots) { + IRBuilder<> Builder{Ctx}; + Type *ExtTy = Type::getInt32Ty(M->getContext()); + unsigned TypeSize = OrigTy->getPrimitiveSizeInBits(); + SmallPtrSet Promoted; + LLVM_DEBUG(dbgs() << "ARM CGP: Promoting use-def chains to from " << TypeSize + << " to 32-bits\n"); + + auto ReplaceAllUsersOfWith = [&](Value *From, Value *To) { + SmallVector Users; + Instruction *InstTo = dyn_cast(To); + for (Use &U : From->uses()) { + auto *User = cast(U.getUser()); + if (InstTo && User->isIdenticalTo(InstTo)) + continue; + Users.push_back(User); + } + + for (auto &U : Users) + U->replaceUsesOfWith(From, To); + }; + + auto FixConst = [&](ConstantInt *Const, Instruction *I) { + Constant *NewConst = nullptr; + if (isSafeOverflow(I)) { + NewConst = (Const->isNegative()) ? 
+ ConstantExpr::getSExt(Const, ExtTy) : + ConstantExpr::getZExt(Const, ExtTy); + } else { + uint64_t NewVal = *Const->getValue().getRawData(); + if (Const->getType() == Type::getInt16Ty(Ctx)) + NewVal &= 0xFFFF; + else + NewVal &= 0xFF; + NewConst = ConstantInt::get(ExtTy, NewVal); + } + I->replaceUsesOfWith(Const, NewConst); + }; + + auto InsertDSPIntrinsic = [&](Instruction *I) { + LLVM_DEBUG(dbgs() << "ARM CGP: Inserting DSP intrinsic for " + << *I << "\n"); + Function *DSPInst = + Intrinsic::getDeclaration(M, getNarrowIntrinsic(I, TypeSize)); + Builder.SetInsertPoint(I); + Builder.SetCurrentDebugLocation(I->getDebugLoc()); + Value *Args[] = { I->getOperand(0), I->getOperand(1) }; + CallInst *Call = Builder.CreateCall(DSPInst, Args); + ReplaceAllUsersOfWith(I, Call); + InstsToRemove.push_back(I); + NewInsts.insert(Call); + }; + + auto InsertZExt = [&](Value *V, Instruction *InsertPt) { + LLVM_DEBUG(dbgs() << "ARM CGP: Inserting ZExt for " << *V << "\n"); + Builder.SetInsertPoint(InsertPt); + if (auto *I = dyn_cast(V)) + Builder.SetCurrentDebugLocation(I->getDebugLoc()); + auto *ZExt = cast(Builder.CreateZExt(V, ExtTy)); + if (isa(V)) + ZExt->moveBefore(InsertPt); + else + ZExt->moveAfter(InsertPt); + ReplaceAllUsersOfWith(V, ZExt); + NewInsts.insert(ZExt); + }; + + // First, insert extending instructions between the leaves and their users. + LLVM_DEBUG(dbgs() << "ARM CGP: Promoting leaves:\n"); + for (auto V : Leaves) { + LLVM_DEBUG(dbgs() << " - " << *V << "\n"); + if (auto *ZExt = dyn_cast(V)) + ZExt->mutateType(ExtTy); + else if (auto *I = dyn_cast(V)) + InsertZExt(I, I); + else if (auto *Arg = dyn_cast(V)) { + BasicBlock &BB = Arg->getParent()->front(); + InsertZExt(Arg, &*BB.getFirstInsertionPt()); + } else { + llvm_unreachable("unhandled leaf that needs extending"); + } + Promoted.insert(V); + } + + LLVM_DEBUG(dbgs() << "ARM CGP: Mutating the tree..\n"); + // Then mutate the types of the instructions within the tree. Here we handle + // constant operands. + for (auto *V : Visited) { + if (Leaves.count(V)) + continue; + + if (!isa(V)) + continue; + + auto *I = cast(V); + if (Roots.count(I)) + continue; + + for (auto &U : I->operands()) { + if ((U->getType() == ExtTy) || !isSupportedType(&*U)) + continue; + + if (auto *Const = dyn_cast(&*U)) + FixConst(Const, I); + else if (isa(&*U)) + U->mutateType(ExtTy); + } + + if (shouldPromote(I)) { + I->mutateType(ExtTy); + Promoted.insert(I); + } + } + + // Now we need to remove any zexts that have become unnecessary, as well + // as insert any intrinsics. + for (auto *V : Visited) { + if (Leaves.count(V)) + continue; + if (auto *ZExt = dyn_cast(V)) { + if (ZExt->getDestTy() != ExtTy) { + ZExt->mutateType(ExtTy); + Promoted.insert(ZExt); + } + else if (ZExt->getSrcTy() == ExtTy) { + ReplaceAllUsersOfWith(V, ZExt->getOperand(0)); + InstsToRemove.push_back(ZExt); + } + continue; + } + + if (!shouldPromote(V) || isPromotedResultSafe(V)) + continue; + + // Replace unsafe instructions with appropriate intrinsic calls. + InsertDSPIntrinsic(cast(V)); + } + + LLVM_DEBUG(dbgs() << "ARM CGP: Fixing up the roots:\n"); + // Fix up any stores or returns that use the results of the promoted + // chain. 
+ for (auto I : Roots) { + LLVM_DEBUG(dbgs() << " - " << *I << "\n"); + Type *TruncTy = OrigTy; + if (auto *Store = dyn_cast(I)) { + auto *PtrTy = cast(Store->getPointerOperandType()); + TruncTy = PtrTy->getElementType(); + } else if (isa(I)) { + Function *F = I->getParent()->getParent(); + TruncTy = F->getFunctionType()->getReturnType(); + } + + for (unsigned i = 0; i < I->getNumOperands(); ++i) { + Value *V = I->getOperand(i); + if (Promoted.count(V) || NewInsts.count(V)) { + if (auto *Op = dyn_cast(V)) { + + if (auto *Call = dyn_cast(I)) + TruncTy = Call->getFunctionType()->getParamType(i); + + if (TruncTy == ExtTy) + continue; + + LLVM_DEBUG(dbgs() << "ARM CGP: Creating " << *TruncTy + << " Trunc for " << *Op << "\n"); + Builder.SetInsertPoint(Op); + auto *Trunc = cast(Builder.CreateTrunc(Op, TruncTy)); + Trunc->moveBefore(I); + I->setOperand(i, Trunc); + NewInsts.insert(Trunc); + } + } + } + } + LLVM_DEBUG(dbgs() << "ARM CGP: Mutation complete.\n"); +} + +bool ARMCodeGenPrepare::isNarrowInstSupported(Instruction *I) { + if (!ST->hasDSP() || !EnableDSP || !isSupportedType(I)) + return false; + + if (ST->isThumb() && !ST->hasThumb2()) + return false; + + if (I->getOpcode() != Instruction::Add && I->getOpcode() != Instruction::Sub) + return false; + + // TODO + // Would it be profitable? For Thumb code, these parallel DSP instructions + // are only Thumb-2, so we wouldn't be able to dual issue on Cortex-M33. For + // Cortex-A, specifically Cortex-A72, the latency is double and throughput is + // halved. They also do not take immediates as operands. + for (auto &Op : I->operands()) { + if (isa(Op)) { + if (!EnableDSPWithImms) + return false; + } + } + return true; +} + +/// We accept most instructions, as well as Arguments and ConstantInsts. We +/// Disallow casts other than zext and truncs and only allow calls if their +/// return value is zeroext. We don't allow opcodes that can introduce sign +/// bits. +bool ARMCodeGenPrepare::isSupportedValue(Value *V) { + LLVM_DEBUG(dbgs() << "ARM CGP: Is " << *V << " supported?\n"); + + // Non-instruction values that we can handle. + if (isa(V) || isa(V)) + return true; + + // Memory instructions + if (isa(V) || isa(V) || isa(V)) + return true; + + // Branches and targets. + if (auto *ICmp = dyn_cast(V)) + return ICmp->isEquality() || !ICmp->isSigned(); + + if( isa(V) || isa(V) || isa(V)) + return true; + + if (isa(V) || isa(V) || isa(V)) + return true; + + // Special cases for calls as we need to check for zeroext + // TODO We should accept calls even if they don't have zeroext, as they can + // still be roots. + if (auto *Call = dyn_cast(V)) + return Call->hasRetAttr(Attribute::AttrKind::ZExt); + else if (auto *Cast = dyn_cast(V)) { + if (isa(Cast)) + return Cast->getDestTy()->getScalarSizeInBits() <= 32; + else if (auto *Trunc = dyn_cast(V)) + return Trunc->getDestTy()->getScalarSizeInBits() <= TypeSize; + else { + LLVM_DEBUG(dbgs() << "ARM CGP: No, unsupported cast.\n"); + return false; + } + } else if (!isa(V)) { + LLVM_DEBUG(dbgs() << "ARM CGP: No, not a binary operator.\n"); + return false; + } + + bool res = !isSigned(V); + if (!res) + LLVM_DEBUG(dbgs() << "ARM CGP: No, it's a signed instruction.\n"); + return res; +} + +/// Check that the type of V would be promoted and that the original type is +/// smaller than the targeted promoted type. Check that we're not trying to +/// promote something larger than our base 'TypeSize' type. 
+bool ARMCodeGenPrepare::isLegalToPromote(Value *V) { + if (!isSupportedType(V)) + return false; + + unsigned VSize = 0; + if (auto *Ld = dyn_cast(V)) { + auto *PtrTy = cast(Ld->getPointerOperandType()); + VSize = PtrTy->getElementType()->getPrimitiveSizeInBits(); + } else if (auto *ZExt = dyn_cast(V)) { + VSize = ZExt->getOperand(0)->getType()->getPrimitiveSizeInBits(); + } else { + VSize = V->getType()->getPrimitiveSizeInBits(); + } + + if (VSize > TypeSize) + return false; + + if (isPromotedResultSafe(V)) + return true; + + if (auto *I = dyn_cast(V)) + return isNarrowInstSupported(I); + + return false; +} + +bool ARMCodeGenPrepare::TryToPromote(Value *V) { + OrigTy = V->getType(); + TypeSize = OrigTy->getPrimitiveSizeInBits(); + + if (!isSupportedValue(V) || !shouldPromote(V) || !isLegalToPromote(V)) + return false; + + LLVM_DEBUG(dbgs() << "ARM CGP: TryToPromote: " << *V << "\n"); + + SetVector WorkList; + SmallPtrSet Leaves; + SmallPtrSet Roots; + WorkList.insert(V); + SmallPtrSet CurrentVisited; + CurrentVisited.clear(); + + // Return true if the given value can, or has been, visited. Add V to the + // worklist if needed. + auto AddLegalInst = [&](Value *V) { + if (CurrentVisited.count(V)) + return true; + + if (!isSupportedValue(V) || (shouldPromote(V) && !isLegalToPromote(V))) { + LLVM_DEBUG(dbgs() << "ARM CGP: Can't handle: " << *V << "\n"); + return false; + } + + WorkList.insert(V); + return true; + }; + + // Iterate through, and add to, a tree of operands and users in the use-def. + while (!WorkList.empty()) { + Value *V = WorkList.back(); + WorkList.pop_back(); + if (CurrentVisited.count(V)) + continue; + + if (!isa(V) && !isSource(V)) + continue; + + // If we've already visited this value from somewhere, bail now because + // the tree has already been explored. + // TODO: This could limit the transform, ie if we try to promote something + // from an i8 and fail first, before trying an i16. + if (AllVisited.count(V)) { + LLVM_DEBUG(dbgs() << "ARM CGP: Already visited this: " << *V << "\n"); + return false; + } + + CurrentVisited.insert(V); + AllVisited.insert(V); + + // Calls can be both sources and sinks. + if (isSink(V)) + Roots.insert(cast(V)); + if (isSource(V)) + Leaves.insert(V); + else if (auto *I = dyn_cast(V)) { + // Visit operands of any instruction visited. + for (auto &U : I->operands()) { + if (!AddLegalInst(U)) + return false; + } + } + + // Don't visit users of a node which isn't going to be mutated unless its a + // source. + if (isSource(V) || shouldPromote(V)) { + for (Use &U : V->uses()) { + if (!AddLegalInst(U.getUser())) + return false; + } + } + } + + unsigned NumToPromote = 0; + unsigned Cost = 0; + for (auto *V : CurrentVisited) { + // Truncs will cause a uxt and no zeroext arguments will often require + // a uxt somewhere. + if (isa(V)) + ++Cost; + else if (auto *Arg = dyn_cast(V)) { + if (!Arg->hasZExtAttr()) + ++Cost; + } + + // Mem ops can automatically be extended/truncated and non-instructions + // don't need anything done. + if (Leaves.count(V) || isa(V) || !isa(V)) + continue; + + // Will need to truncate calls args and returns. 
+ if (Roots.count(cast(V))) { + ++Cost; + continue; + } + + if (shouldPromote(V)) + ++NumToPromote; + } + + LLVM_DEBUG(dbgs() << "ARM CGP: Visited nodes:\n"; + for (auto *I : CurrentVisited) + I->dump(); + ); + LLVM_DEBUG(dbgs() << "ARM CGP: Cost of promoting " << NumToPromote + << " instructions = " << Cost << "\n"); + if (Cost > NumToPromote || (NumToPromote == 0)) + return false; + + Promoter->Mutate(OrigTy, CurrentVisited, Leaves, Roots); + return true; +} + +bool ARMCodeGenPrepare::doInitialization(Module &M) { + Promoter = new IRPromoter(&M); + return false; +} + +bool ARMCodeGenPrepare::runOnFunction(Function &F) { + if (skipFunction(F) || DisableCGP) + return false; + + auto *TPC = &getAnalysis(); + if (!TPC) + return false; + + const TargetMachine &TM = TPC->getTM(); + ST = &TM.getSubtarget(F); + bool MadeChange = false; + LLVM_DEBUG(dbgs() << "ARM CGP: Running on " << F.getName() << "\n"); + + // Search up from icmps to try to promote their operands. + for (BasicBlock &BB : F) { + auto &Insts = BB.getInstList(); + for (auto &I : Insts) { + if (AllVisited.count(&I)) + continue; + + if (isa(I)) { + auto &CI = cast(I); + + // Skip signed or pointer compares + if (CI.isSigned() || !isa(CI.getOperand(0)->getType())) + continue; + + LLVM_DEBUG(dbgs() << "ARM CGP: Searching from: " << CI << "\n"); + for (auto &Op : CI.operands()) { + if (auto *I = dyn_cast(Op)) { + if (isa(I)) + MadeChange |= TryToPromote(I->getOperand(0)); + else + MadeChange |= TryToPromote(I); + } + } + } + } + Promoter->Cleanup(); + LLVM_DEBUG(if (verifyFunction(F, &dbgs())) { + dbgs(); + report_fatal_error("Broken function after type promotion"); + }); + } + if (MadeChange) + LLVM_DEBUG(dbgs() << "After ARMCodeGenPrepare: " << F << "\n"); + + return MadeChange; +} + +INITIALIZE_PASS_BEGIN(ARMCodeGenPrepare, DEBUG_TYPE, + "ARM IR optimizations", false, false) +INITIALIZE_PASS_END(ARMCodeGenPrepare, DEBUG_TYPE, "ARM IR optimizations", + false, false) + +char ARMCodeGenPrepare::ID = 0; + +FunctionPass *llvm::createARMCodeGenPreparePass() { + return new ARMCodeGenPrepare(); +} Index: llvm/trunk/lib/Target/ARM/ARMTargetMachine.cpp =================================================================== --- llvm/trunk/lib/Target/ARM/ARMTargetMachine.cpp +++ llvm/trunk/lib/Target/ARM/ARMTargetMachine.cpp @@ -90,6 +90,7 @@ initializeARMLoadStoreOptPass(Registry); initializeARMPreAllocLoadStoreOptPass(Registry); initializeARMParallelDSPPass(Registry); + initializeARMCodeGenPreparePass(Registry); initializeARMConstantIslandsPass(Registry); initializeARMExecutionDomainFixPass(Registry); initializeARMExpandPseudoPass(Registry); @@ -346,6 +347,7 @@ } void addIRPasses() override; + void addCodeGenPrepare() override; bool addPreISel() override; bool addInstSelector() override; bool addIRTranslator() override; @@ -402,6 +404,12 @@ addPass(createInterleavedAccessPass()); } +void ARMPassConfig::addCodeGenPrepare() { + if (getOptLevel() != CodeGenOpt::None) + addPass(createARMCodeGenPreparePass()); + TargetPassConfig::addCodeGenPrepare(); +} + bool ARMPassConfig::addPreISel() { if (getOptLevel() != CodeGenOpt::None) addPass(createARMParallelDSPPass()); Index: llvm/trunk/lib/Target/ARM/CMakeLists.txt =================================================================== --- llvm/trunk/lib/Target/ARM/CMakeLists.txt +++ llvm/trunk/lib/Target/ARM/CMakeLists.txt @@ -23,6 +23,7 @@ ARMBaseInstrInfo.cpp ARMBaseRegisterInfo.cpp ARMCallLowering.cpp + ARMCodeGenPrepare.cpp ARMConstantIslandPass.cpp ARMConstantPoolValue.cpp ARMExpandPseudoInsts.cpp 
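As a rough illustration of what the pass aims to do (an editor's sketch based on the tests below, not exact pass output): for an unsigned i8 chain such as

  %add = add i8 %x, -1
  %cmp = icmp ugt i8 %add, 1

the leaves are zero-extended and the whole chain is carried in i32, so no uxtb is needed before the compare:

  %x32 = zext i8 %x to i32          ; names %x32/%add32 are illustrative only
  %add32 = add i32 %x32, -1
  %cmp = icmp ugt i32 %add32, 1

The plain widened add is only used when the result cannot change the comparison (here the value is decreasing and only feeds an unsigned compare, the isSafeOverflow case). When wrapping does matter and the target has DSP, the add/sub is instead replaced by one of the unsigned parallel DSP intrinsics (uadd8/uadd16, usub8/usub16) so that only the low bits reach the icmp, as the CHECK-DSP lines in the tests below show.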
Index: llvm/trunk/test/CodeGen/ARM/arm-cgp-icmps.ll =================================================================== --- llvm/trunk/test/CodeGen/ARM/arm-cgp-icmps.ll +++ llvm/trunk/test/CodeGen/ARM/arm-cgp-icmps.ll @@ -0,0 +1,468 @@ +; RUN: llc -mtriple=thumbv8.main -mcpu=cortex-m33 %s -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-NODSP +; RUN: llc -mtriple=thumbv7em %s -arm-enable-scalar-dsp=true -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-DSP +; RUN: llc -mtriple=thumbv8 %s -arm-enable-scalar-dsp=true -arm-enable-scalar-dsp-imms=true -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-DSP-IMM + +; CHECK-COMMON-LABEL: test_ult_254_inc_imm: +; CHECK-DSP: adds r0, #1 +; CHECK-DSP-NEXT: uxtb r1, r0 +; CHECK-DSP-NEXT: movs r0, #47 +; CHECK-DSP-NEXT: cmp r1, #254 +; CHECK-DSP-NEXT: it lo +; CHECK-DSP-NEXT: movlo r0, #35 + +; CHECK-DSP-IMM: movs r1, #1 +; CHECK-DSP-IMM-NEXT: uadd8 r1, r0, r1 +; CHECK-DSP-IMM-NEXT: movs r0, #47 +; CHECK-DSP-IMM-NEXT: cmp r1, #254 +; CHECK-DSP-IMM-NEXT: it lo +; CHECK-DSP-IMM-NEXT: movlo r0, #35 +define i32 @test_ult_254_inc_imm(i8 zeroext %x) { +entry: + %add = add i8 %x, 1 + %cmp = icmp ult i8 %add, 254 + %res = select i1 %cmp, i32 35, i32 47 + ret i32 %res +} + +; CHECK-COMMON-LABEL: test_slt_254_inc_imm +; CHECK-COMMON: adds +; CHECK-COMMON: sxtb +define i32 @test_slt_254_inc_imm(i8 signext %x) { +entry: + %add = add i8 %x, 1 + %cmp = icmp slt i8 %add, 254 + %res = select i1 %cmp, i32 35, i32 47 + ret i32 %res +} + +; CHECK-COMMON-LABEL: test_ult_254_inc_var: +; CHECK-NODSP: add r0, r1 +; CHECK-NODSP-NEXT: uxtb r1, r0 +; CHECK-NODSP-NEXT: movs r0, #47 +; CHECK-NODSP-NEXT: cmp r1, #254 +; CHECK-NODSP-NEXT: it lo +; CHECK-NODSP-NEXT: movlo r0, #35 + +; CHECK-DSP: uadd8 r1, r0, r1 +; CHECK-DSP-NEXT: movs r0, #47 +; CHECK-DSP-NEXT: cmp r1, #254 +; CHECK-DSP-NEXT: it lo +; CHECK-DSP-NEXT: movlo r0, #35 +define i32 @test_ult_254_inc_var(i8 zeroext %x, i8 zeroext %y) { +entry: + %add = add i8 %x, %y + %cmp = icmp ult i8 %add, 254 + %res = select i1 %cmp, i32 35, i32 47 + ret i32 %res +} + +; CHECK-COMMON-LABEL: test_sle_254_inc_var +; CHECK-COMMON: add +; CHECK-COMMON: sxtb +; CHECK-COMMON: cmp +define i32 @test_sle_254_inc_var(i8 %x, i8 %y) { +entry: + %add = add i8 %x, %y + %cmp = icmp sle i8 %add, 254 + %res = select i1 %cmp, i32 35, i32 47 + ret i32 %res +} + +; CHECK-COMMON-LABEL: test_ugt_1_dec_imm: +; CHECK-COMMON: subs r1, r0, #1 +; CHECK-COMMON-NEXT: movs r0, #47 +; CHECK-COMMON-NEXT: cmp r1, #1 +; CHECK-COMMON-NEXT: it hi +; CHECK-COMMON-NEXT: movhi r0, #35 +define i32 @test_ugt_1_dec_imm(i8 zeroext %x) { +entry: + %add = add i8 %x, -1 + %cmp = icmp ugt i8 %add, 1 + %res = select i1 %cmp, i32 35, i32 47 + ret i32 %res +} + +; CHECK-COMMON-LABEL: test_sgt_1_dec_imm +; CHECK-COMMON: subs +; CHECK-COMMON: sxtb +; CHECK-COMMON: cmp +define i32 @test_sgt_1_dec_imm(i8 %x) { +entry: + %add = add i8 %x, -1 + %cmp = icmp sgt i8 %add, 1 + %res = select i1 %cmp, i32 35, i32 47 + ret i32 %res +} + +; CHECK-COMMON-LABEL: test_ugt_1_dec_var: +; CHECK-NODSP: subs r0, r0, r1 +; CHECK-NODSP-NEXT: uxtb r1, r0 +; CHECK-NODSP-NEXT: movs r0, #47 +; CHECK-NODSP-NEXT: cmp r1, #1 +; CHECK-NODSP-NEXT: it hi +; CHECK-NODSP-NEXT: movhi r0, #35 + +; CHECK-DSP: usub8 r1, r0, r1 +; CHECK-DSP-NEXT: movs r0, #47 +; CHECK-DSP-NEXT: cmp r1, #1 +; CHECK-DSP-NEXT: it hi +; CHECK-DSP-NEXT: movhi r0, #35 +define i32 @test_ugt_1_dec_var(i8 zeroext %x, i8 zeroext %y) { +entry: + %sub = sub i8 %x, %y + %cmp = icmp ugt i8 %sub, 1 
+ %res = select i1 %cmp, i32 35, i32 47 + ret i32 %res +} + +; CHECK-COMMON-LABEL: test_sge_1_dec_var +; CHECK-COMMON: sub +; CHECK-COMMON: sxtb +; CHECK-COMMON: cmp +define i32 @test_sge_1_dec_var(i8 %x, i8 %y) { +entry: + %sub = sub i8 %x, %y + %cmp = icmp sge i8 %sub, 1 + %res = select i1 %cmp, i32 35, i32 47 + ret i32 %res +} + +; CHECK-COMMON-LABEL: dsp_imm1: +; CHECK-DSP: eors r1, r0 +; CHECK-DSP-NEXT: and r0, r0, #7 +; CHECK-DSP-NEXT: subs r0, r0, r1 +; CHECK-DSP-NEXT: adds r0, #1 +; CHECK-DSP-NEXT: uxtb r1, r0 +; CHECK-DSP-NEXT: movs r0, #47 +; CHECK-DSP-NEXT: cmp r1, #254 +; CHECK-DSP-NEXT: it lo +; CHECK-DSP-NEXT: movlo r0, #35 + +; CHECK-DSP-IMM: eors r1, r0 +; CHECK-DSP-IMM-NEXT: and r0, r0, #7 +; CHECK-DSP-IMM-NEXT: usub8 r0, r0, r1 +; CHECK-DSP-IMM-NEXT: movs r1, #1 +; CHECK-DSP-IMM-NEXT: uadd8 r1, r0, r1 +; CHECK-DSP-IMM-NEXT: movs r0, #47 +; CHECK-DSP-IMM-NEXT: cmp r1, #254 +; CHECK-DSP-IMM-NEXT: it lo +; CHECK-DSP-IMM-NEXT: movlo r0, #35 +define i32 @dsp_imm1(i8 zeroext %x, i8 zeroext %y) { +entry: + %xor = xor i8 %x, %y + %and = and i8 %x, 7 + %sub = sub i8 %and, %xor + %add = add i8 %sub, 1 + %cmp = icmp ult i8 %add, 254 + %res = select i1 %cmp, i32 35, i32 47 + ret i32 %res +} + +; CHECK-COMMON-LABEL: dsp_imm2 +; CHECK-COMMON: add r0, r1 +; CHECK-DSP-NEXT: ldrh r1, [r3] +; CHECK-DSP-NEXT: ldrh r2, [r2] +; CHECK-DSP-NEXT: subs r1, r1, r0 +; CHECK-DSP-NEXT: add r0, r2 +; CHECK-DSP-NEXT: uxth r3, r1 +; CHECK-DSP-NEXT: uxth r2, r0 +; CHECK-DSP-NEXT: cmp r2, r3 + +; CHECK-DSP-IMM: movs r1, #0 +; CHECK-DSP-IMM-NEXT: uxth r0, r0 +; CHECK-DSP-IMM-NEXT: usub16 r1, r1, r0 +; CHECK-DSP-IMM-NEXT: ldrh r0, [r2] +; CHECK-DSP-IMM-NEXT: ldrh r3, [r3] +; CHECK-DSP-IMM-NEXT: usub16 r0, r0, r1 +; CHECK-DSP-IMM-NEXT: uadd16 r1, r3, r1 +; CHECK-DSP-IMM-NEXT: cmp r0, r1 + +define i16 @dsp_imm2(i32 %arg0, i32 %arg1, i16* %gep0, i16* %gep1) { +entry: + %add0 = add i32 %arg0, %arg1 + %conv0 = trunc i32 %add0 to i16 + %sub0 = sub i16 0, %conv0 + %load0 = load i16, i16* %gep0, align 2 + %load1 = load i16, i16* %gep1, align 2 + %sub1 = sub i16 %load0, %sub0 + %add1 = add i16 %load1, %sub0 + %cmp = icmp ult i16 %sub1, %add1 + %res = select i1 %cmp, i16 %add1, i16 %sub1 + ret i16 %res +} + +; CHECK-COMMON-LABEL: dsp_var: +; CHECK-COMMON: eors r1, r0 +; CHECK-COMMON: and r2, r0, #7 +; CHECK-NODSP: subs r1, r2, r1 +; CHECK-NODSP: add.w r0, r1, r0, lsl #1 +; CHECK-NODSP: uxtb r1, r0 +; CHECK-DSP: usub8 r1, r2, r1 +; CHECK-DSP: lsls r0, r0, #1 +; CHECK-DSP: uadd8 r1, r1, r0 +; CHECK-DSP-NOT: uxt +; CHECK-COMMON: movs r0, #47 +; CHECK-COMMON: cmp r1, #254 +; CHECK-COMMON: it lo +; CHECK-COMMON: movlo r0, #35 +define i32 @dsp_var(i8 zeroext %x, i8 zeroext %y) { + %xor = xor i8 %x, %y + %and = and i8 %x, 7 + %sub = sub i8 %and, %xor + %mul = shl nuw i8 %x, 1 + %add = add i8 %sub, %mul + %cmp = icmp ult i8 %add, 254 + %res = select i1 %cmp, i32 35, i32 47 + ret i32 %res +} + +; CHECK-COMMON-LABEL: store_dsp_res +; CHECK-DSP: usub8 +; CHECK-DSP: strb +define void @store_dsp_res(i8* %in, i8* %out, i8 %compare) { + %first = getelementptr inbounds i8, i8* %in, i32 0 + %second = getelementptr inbounds i8, i8* %in, i32 1 + %ld0 = load i8, i8* %first + %ld1 = load i8, i8* %second + %xor = xor i8 %ld0, -1 + %cmp = icmp ult i8 %compare, %ld1 + %select = select i1 %cmp, i8 %compare, i8 %xor + %sub = sub i8 %ld0, %select + store i8 %sub, i8* %out, align 1 + ret void +} + +; CHECK-COMMON-LABEL: ugt_1_dec_imm: +; CHECK-COMMON: subs r1, r0, #1 +; CHECK-COMMON-NEXT: movs r0, #47 +; CHECK-COMMON-NEXT: cmp r1, #1 +; 
CHECK-COMMON-NEXT: it hi +; CHECK-COMMON-NEXT: movhi r0, #35 +define i32 @ugt_1_dec_imm(i8 zeroext %x) { +entry: + %add = add i8 %x, -1 + %cmp = icmp ugt i8 %add, 1 + %res = select i1 %cmp, i32 35, i32 47 + ret i32 %res +} + +; CHECK-COMMON-LABEL: ugt_1_dec_var: +; CHECK-NODSP: subs r0, r0, r1 +; CHECK-NODSP-NEXT: uxtb r1, r0 +; CHECK-NODSP-NEXT: movs r0, #47 +; CHECK-NODSP-NEXT: cmp r1, #1 +; CHECK-NODSP-NEXT: it hi +; CHECK-NODSP-NEXT: movhi r0, #35 + +; CHECK-DSP: usub8 r1, r0, r1 +; CHECK-DSP-NEXT: movs r0, #47 +; CHECK-DSP-NEXT: cmp r1, #1 +; CHECK-DSP-NEXT: it hi +; CHECK-DSP-NEXT: movhi r0, #35 +define i32 @ugt_1_dec_var(i8 zeroext %x, i8 zeroext %y) { +entry: + %sub = sub i8 %x, %y + %cmp = icmp ugt i8 %sub, 1 + %res = select i1 %cmp, i32 35, i32 47 + ret i32 %res +} + +; CHECK-COMMON-LABEL: icmp_i32_zext: +; CHECK-COMMON: ldrb [[LD:r[^ ]+]], [r0] +; CHECK-COMMON: subs [[SUB:r[^ ]+]], [[LD]], #1 +; CHECK-COMMON-NOT: uxt +; CHECK-COMMON: cmp [[LD]], [[SUB]] +; CHECK-COMMON-NOT: uxt +define i8 @icmp_i32_zext(i8* %ptr) { +entry: + %gep = getelementptr inbounds i8, i8* %ptr, i32 0 + %0 = load i8, i8* %gep, align 1 + %1 = sub nuw nsw i8 %0, 1 + %conv44 = zext i8 %0 to i32 + br label %preheader + +preheader: + br label %body + +body: + %2 = phi i8 [ %1, %preheader ], [ %3, %if.end ] + %si.0274 = phi i32 [ %conv44, %preheader ], [ %inc, %if.end ] + %conv51266 = zext i8 %2 to i32 + %cmp52267 = icmp eq i32 %si.0274, %conv51266 + br i1 %cmp52267, label %if.end, label %exit + +if.end: + %inc = add i32 %si.0274, 1 + %gep1 = getelementptr inbounds i8, i8* %ptr, i32 %inc + %3 = load i8, i8* %gep1, align 1 + br label %body + +exit: + ret i8 %2 +} + +@d_uch = hidden local_unnamed_addr global [16 x i8] zeroinitializer, align 1 +@sh1 = hidden local_unnamed_addr global i16 0, align 2 +@d_sh = hidden local_unnamed_addr global [16 x i16] zeroinitializer, align 2 + +; CHECK-COMMON-LABEL: icmp_sext_zext_store_i8_i16 +; CHECK-NODSP: ldrb [[BYTE:r[^ ]+]], +; CHECK-NODSP: strh [[BYTE]], +; CHECK-NODSP: ldrsh.w +define i32 @icmp_sext_zext_store_i8_i16() { +entry: + %0 = load i8, i8* getelementptr inbounds ([16 x i8], [16 x i8]* @d_uch, i32 0, i32 2), align 1 + %conv = zext i8 %0 to i16 + store i16 %conv, i16* @sh1, align 2 + %conv1 = zext i8 %0 to i32 + %1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @d_sh, i32 0, i32 2), align 2 + %conv2 = sext i16 %1 to i32 + %cmp = icmp eq i32 %conv1, %conv2 + %conv3 = zext i1 %cmp to i32 + ret i32 %conv3 +} + +; CHECK-COMMON-LABEL: or_icmp_ugt: +; CHECK-COMMON: ldrb [[LD:r[^ ]+]], [r1] +; CHECK-COMMON: subs [[SUB:r[^ ]+]], #1 +; CHECK-COMMON-NOT: uxtb +; CHECK-COMMON: cmp [[SUB]], #3 +define i1 @or_icmp_ugt(i32 %arg, i8* %ptr) { +entry: + %0 = load i8, i8* %ptr + %1 = zext i8 %0 to i32 + %mul = shl nuw nsw i32 %1, 1 + %add0 = add nuw nsw i32 %mul, 6 + %cmp0 = icmp ne i32 %arg, %add0 + %add1 = add i8 %0, -1 + %cmp1 = icmp ugt i8 %add1, 3 + %or = or i1 %cmp0, %cmp1 + ret i1 %or +} + +; CHECK-COMMON-LABEL: icmp_switch_trunc: +; CHECK-COMMON-NOT: uxt +define i16 @icmp_switch_trunc(i16 zeroext %arg) { +entry: + %conv = add nuw i16 %arg, 15 + %mul = mul nuw nsw i16 %conv, 3 + %trunc = trunc i16 %arg to i3 + switch i3 %trunc, label %default [ + i3 0, label %sw.bb + i3 1, label %sw.bb.i + ] + +sw.bb: + %cmp0 = icmp ult i16 %mul, 127 + %select = select i1 %cmp0, i16 %mul, i16 127 + br label %exit + +sw.bb.i: + %cmp1 = icmp ugt i16 %mul, 34 + %select.i = select i1 %cmp1, i16 %mul, i16 34 + br label %exit + +default: + br label %exit + +exit: + %res = phi i16 [ 
%select, %sw.bb ], [ %select.i, %sw.bb.i ], [ %mul, %default ] + ret i16 %res +} + +; CHECK-COMMON-LABEL: icmp_eq_minus_one +; CHECK-COMMON: cmp r0, #255 +define i32 @icmp_eq_minus_one(i8* %ptr) { + %load = load i8, i8* %ptr, align 1 + %conv = zext i8 %load to i32 + %cmp = icmp eq i8 %load, -1 + %ret = select i1 %cmp, i32 %conv, i32 -1 + ret i32 %ret +} + +; CHECK-COMMON-LABEL: icmp_not +; CHECK-COMMON: movw r2, #65535 +; CHECK-COMMON: eors r2, r0 +; CHECK-COMMON: movs r0, #32 +; CHECK-COMMON: cmp r2, r1 +define i32 @icmp_not(i16 zeroext %arg0, i16 zeroext %arg1) { + %not = xor i16 %arg0, -1 + %cmp = icmp eq i16 %not, %arg1 + %res = select i1 %cmp, i32 16, i32 32 + ret i32 %res +} + +; CHECK-COMMON-LABEL: mul_wrap +; CHECK-COMMON: mul +; CHECK-COMMON: uxth +; CHECK-COMMON: cmp +define i16 @mul_wrap(i16 %arg0, i16 %arg1) { + %mul = mul i16 %arg0, %arg1 + %cmp = icmp eq i16 %mul, 1 + %res = select i1 %cmp, i16 %arg0, i16 47 + ret i16 %res +} + +; CHECK-COMMON-LABEL: shl_wrap +; CHECK-COMMON: lsl +; CHECK-COMMON: uxth +; CHECK-COMMON: cmp +define i16 @shl_wrap(i16 %arg0) { + %mul = shl i16 %arg0, 4 + %cmp = icmp eq i16 %mul, 1 + %res = select i1 %cmp, i16 %arg0, i16 47 + ret i16 %res +} + +; CHECK-COMMON-LABEL: add_wrap +; CHECK-COMMON: add +; CHECK-COMMON: uxth +; CHECK-COMMON: cmp +define i16 @add_wrap(i16 %arg0, i16 %arg1) { + %add = add i16 %arg0, 128 + %cmp = icmp eq i16 %add, %arg1 + %res = select i1 %cmp, i16 %arg0, i16 1 + ret i16 %res +} + +; CHECK-COMMON-LABEL: sub_wrap +; CHECK-COMMON: sub +; CHECK-COMMON: uxth +; CHECK-COMMON: cmp +define i16 @sub_wrap(i16 %arg0, i16 %arg1, i16 %arg2) { + %sub = sub i16 %arg0, %arg2 + %cmp = icmp eq i16 %sub, %arg1 + %res = select i1 %cmp, i16 %arg0, i16 1 + ret i16 %res +} + +; CHECK-COMMON-LABEL: urem_trunc_icmps +; CHECK-COMMON-NOT: uxt +define void @urem_trunc_icmps(i16** %in, i32* %g, i32* %k) { +entry: + %ptr = load i16*, i16** %in, align 4 + %ld = load i16, i16* %ptr, align 2 + %cmp.i = icmp eq i16 %ld, 0 + br i1 %cmp.i, label %exit, label %cond.false.i + +cond.false.i: + %rem = urem i16 5, %ld + %extract.t = trunc i16 %rem to i8 + br label %body + +body: + %cond.in.i.off0 = phi i8 [ %extract.t, %cond.false.i ], [ %add, %for.inc ] + %cmp = icmp ugt i8 %cond.in.i.off0, 7 + %conv5 = zext i1 %cmp to i32 + store i32 %conv5, i32* %g, align 4 + %.pr = load i32, i32* %k, align 4 + %tobool13150 = icmp eq i32 %.pr, 0 + br i1 %tobool13150, label %for.inc, label %exit + +for.inc: + %add = add nuw i8 %cond.in.i.off0, 1 + br label %body + +exit: + ret void +} Index: llvm/trunk/test/CodeGen/ARM/arm-cgp-phis-calls-ret.ll =================================================================== --- llvm/trunk/test/CodeGen/ARM/arm-cgp-phis-calls-ret.ll +++ llvm/trunk/test/CodeGen/ARM/arm-cgp-phis-calls-ret.ll @@ -0,0 +1,392 @@ +; RUN: llc -mtriple=thumbv7m %s -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-NODSP +; RUN: llc -mtriple=thumbv8m.main %s -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-NODSP +; RUN: llc -mtriple=thumbv8m.main -arm-enable-scalar-dsp=true -mcpu=cortex-m33 %s -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-DSP +; RUN: llc -mtriple=thumbv7em %s -arm-enable-scalar-dsp=true -arm-enable-scalar-dsp-imms=true -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-DSP-IMM + +; Test that ARMCodeGenPrepare can handle: +; - loops +; - call operands +; - call return values +; - ret instructions +; We use nuw on the arithmetic instructions to avoid complications. 
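An informal note on the nuw flags mentioned above (a sketch of the intended behaviour, not part of the test file): a no-unsigned-wrap add is treated as safe to evaluate in the wider type (see isSafeOverflow/isPromotedResultSafe), so a loop such as phi_i16 below can be carried entirely in i32, roughly:

  loop:
    %val = phi i32 [ 0, %entry ], [ %inc2, %if.end ]
    %cmp = icmp ult i32 %val, 128
    ...
  if.end:
    %inc2 = phi i32 [ %inc, %if.then ], [ %inc1, %if.else ]
    %cmp1 = icmp ult i32 %inc2, 253

Because nothing wraps, the i32 compares see the same values as the original i16 ones and no uxth is required, which is what the CHECK-COMMON-NOT lines verify.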
+ +; Check that the arguments are extended but then nothing else is. +; This also ensures that the pass can handle loops. +; CHECK-COMMON-LABEL: phi_feeding_phi_args +; CHECK-COMMON: uxtb +; CHECK-COMMON: uxtb +; CHECK-NOT: uxtb +define void @phi_feeding_phi_args(i8 %a, i8 %b) { +entry: + %0 = icmp ugt i8 %a, %b + br i1 %0, label %preheader, label %empty + +empty: + br label %preheader + +preheader: + %1 = phi i8 [ %a, %entry ], [ %b, %empty ] + br label %loop + +loop: + %val = phi i8 [ %1, %preheader ], [ %inc2, %if.end ] + %cmp = icmp ult i8 %val, 254 + br i1 %cmp, label %if.then, label %if.else + +if.then: + %inc = sub nuw i8 %val, 2 + br label %if.end + +if.else: + %inc1 = shl nuw i8 %val, 1 + br label %if.end + +if.end: + %inc2 = phi i8 [ %inc, %if.then], [ %inc1, %if.else ] + %cmp1 = icmp eq i8 %inc2, 255 + br i1 %cmp1, label %exit, label %loop + +exit: + ret void +} + +; Same as above, but as the args are zeroext, we shouldn't see any uxts. +; CHECK-COMMON-LABEL: phi_feeding_phi_zeroext_args +; CHECK-COMMON-NOT: uxt +define void @phi_feeding_phi_zeroext_args(i8 zeroext %a, i8 zeroext %b) { +entry: + %0 = icmp ugt i8 %a, %b + br i1 %0, label %preheader, label %empty + +empty: + br label %preheader + +preheader: + %1 = phi i8 [ %a, %entry ], [ %b, %empty ] + br label %loop + +loop: + %val = phi i8 [ %1, %preheader ], [ %inc2, %if.end ] + %cmp = icmp ult i8 %val, 254 + br i1 %cmp, label %if.then, label %if.else + +if.then: + %inc = sub nuw i8 %val, 2 + br label %if.end + +if.else: + %inc1 = shl nuw i8 %val, 1 + br label %if.end + +if.end: + %inc2 = phi i8 [ %inc, %if.then], [ %inc1, %if.else ] + %cmp1 = icmp eq i8 %inc2, 255 + br i1 %cmp1, label %exit, label %loop + +exit: + ret void +} + +; Just check that phis also work with i16s. +; CHECK-COMMON-LABEL: phi_i16: +; CHECK-COMMON-NOT: uxt +define void @phi_i16() { +entry: + br label %loop + +loop: + %val = phi i16 [ 0, %entry ], [ %inc2, %if.end ] + %cmp = icmp ult i16 %val, 128 + br i1 %cmp, label %if.then, label %if.else + +if.then: + %inc = add nuw i16 %val, 2 + br label %if.end + +if.else: + %inc1 = add nuw i16 %val, 1 + br label %if.end + +if.end: + %inc2 = phi i16 [ %inc, %if.then], [ %inc1, %if.else ] + %cmp1 = icmp ult i16 %inc2, 253 + br i1 %cmp1, label %loop, label %exit + +exit: + ret void +} + +; CHECK-COMMON-LABEL: phi_feeding_switch +; CHECK-COMMON: ldrb +; CHECK-COMMON: uxtb +; CHECK-COMMON-NOT: uxt +define void @phi_feeding_switch(i8* %memblock, i8* %store, i16 %arg) { +entry: + %pre = load i8, i8* %memblock, align 1 + %conv = trunc i16 %arg to i8 + br label %header + +header: + %phi.0 = phi i8 [ %pre, %entry ], [ %count, %latch ] + %phi.1 = phi i8 [ %conv, %entry ], [ %phi.3, %latch ] + %phi.2 = phi i8 [ 0, %entry], [ %count, %latch ] + switch i8 %phi.0, label %default [ + i8 43, label %for.inc.i + i8 45, label %for.inc.i.i + ] + +for.inc.i: + %xor = xor i8 %phi.1, 1 + br label %latch + +for.inc.i.i: + %and = and i8 %phi.1, 3 + br label %latch + +default: + %sub = sub i8 %phi.0, 1 + %cmp2 = icmp ugt i8 %sub, 4 + br i1 %cmp2, label %latch, label %exit + +latch: + %phi.3 = phi i8 [ %xor, %for.inc.i ], [ %and, %for.inc.i.i ], [ %phi.2, %default ] + %count = add nuw i8 %phi.2, 1 + store i8 %count, i8* %store, align 1 + br label %header + +exit: + ret void +} + +; CHECK-COMMON-LABEL: ret_i8 +; CHECK-COMMON-NOT: uxt +define i8 @ret_i8() { +entry: + br label %loop + +loop: + %val = phi i8 [ 0, %entry ], [ %inc2, %if.end ] + %cmp = icmp ult i8 %val, 128 + br i1 %cmp, label %if.then, label %if.else + +if.then: + %inc = add nuw 
i8 %val, 2 + br label %if.end + +if.else: + %inc1 = add nuw i8 %val, 1 + br label %if.end + +if.end: + %inc2 = phi i8 [ %inc, %if.then], [ %inc1, %if.else ] + %cmp1 = icmp ult i8 %inc2, 253 + br i1 %cmp1, label %exit, label %loop + +exit: + ret i8 %inc2 +} + +; Check that %exp requires uxth in all cases, and that a uxth will also be required to +; promote %1 for the call - unless we can generate a uadd16. +; CHECK-COMMON-LABEL: zext_load_sink_call: +; CHECK-COMMON: uxt +; CHECK-DSP-IMM: uadd16 +; CHECK-COMMON: cmp +; CHECK-DSP: uxt +; CHECK-DSP-IMM-NOT: uxt +define i32 @zext_load_sink_call(i16* %ptr, i16 %exp) { +entry: + %0 = load i16, i16* %ptr, align 4 + %1 = add i16 %exp, 3 + %cmp = icmp eq i16 %0, %exp + br i1 %cmp, label %exit, label %if.then + +if.then: + %conv0 = zext i16 %0 to i32 + %conv1 = zext i16 %1 to i32 + %call = tail call arm_aapcs_vfpcc i32 @dummy(i32 %conv0, i32 %conv1) + br label %exit + +exit: + %exitval = phi i32 [ %call, %if.then ], [ 0, %entry ] + ret i32 %exitval +} + + +; Check that the pass doesn't try to promote the immediate parameters. +; CHECK-COMMON-LABEL: call_with_imms +; CHECK-COMMON-NOT: uxt +define i8 @call_with_imms(i8* %arg) { + %call = tail call arm_aapcs_vfpcc zeroext i8 @dummy2(i8* nonnull %arg, i8 zeroext 0, i8 zeroext 0) + %cmp = icmp eq i8 %call, 0 + %res = select i1 %cmp, i8 %call, i8 1 + ret i8 %res +} + +; Test that the call result is still extended. +; CHECK-COMMON-LABEL: test_call: +; CHECK-COMMON: bl +; CHECK-COMMON-NEXT: sxtb r1, r0 +define i16 @test_call(i8 zeroext %arg) { + %call = call i8 @dummy_i8(i8 %arg) + %cmp = icmp ult i8 %call, 128 + %conv = zext i1 %cmp to i16 + ret i16 %conv +} + +; Test that the transformation bails when it finds that i16 is larger than i8. +; TODO: We should be able to remove the uxtb in these cases.
+; CHECK-COMMON-LABEL: promote_i8_sink_i16_1 +; CHECK-COMMON: bl dummy_i8 +; CHECK-COMMON: adds r0, #1 +; CHECK-COMMON: uxtb r0, r0 +; CHECK-COMMON: cmp r0 +define i16 @promote_i8_sink_i16_1(i8 zeroext %arg0, i16 zeroext %arg1, i16 zeroext %arg2) { + %call = tail call zeroext i8 @dummy_i8(i8 %arg0) + %add = add nuw i8 %call, 1 + %conv = zext i8 %add to i16 + %cmp = icmp ne i16 %conv, %arg1 + %sel = select i1 %cmp, i16 %arg1, i16 %arg2 + %res = tail call zeroext i16 @dummy3(i16 %sel) + ret i16 %res +} + +; CHECK-COMMON-LABEL: promote_i8_sink_i16_2 +; CHECK-COMMON: bl dummy_i8 +; CHECK-COMMON: adds r0, #1 +; CHECK-COMMON: uxtb r0, r0 +; CHECK-COMMON: cmp r0 +define i16 @promote_i8_sink_i16_2(i8 zeroext %arg0, i8 zeroext %arg1, i16 zeroext %arg2) { + %call = tail call zeroext i8 @dummy_i8(i8 %arg0) + %add = add nuw i8 %call, 1 + %cmp = icmp ne i8 %add, %arg1 + %conv = zext i8 %arg1 to i16 + %sel = select i1 %cmp, i16 %conv, i16 %arg2 + %res = tail call zeroext i16 @dummy3(i16 %sel) + ret i16 %res +} + +@uc = global i8 42, align 1 +@LL = global i64 0, align 8 + +; CHECK-COMMON-LABEL: zext_i64 +; CHECK-COMMON: ldrb +; CHECK-COMMON: strd +define void @zext_i64() { +entry: + %0 = load i8, i8* @uc, align 1 + %conv = zext i8 %0 to i64 + store i64 %conv, i64* @LL, align 8 + %cmp = icmp eq i8 %0, 42 + %conv1 = zext i1 %cmp to i32 + %call = tail call i32 bitcast (i32 (...)* @assert to i32 (i32)*)(i32 %conv1) + ret void +} + +@a = global i16* null, align 4 +@b = global i32 0, align 4 + +; CHECK-COMMON-LABEL: constexpr +; CHECK-COMMON: uxth +define i32 @constexpr() { +entry: + store i32 ptrtoint (i32* @b to i32), i32* @b, align 4 + %0 = load i16*, i16** @a, align 4 + %1 = load i16, i16* %0, align 2 + %or = or i16 %1, ptrtoint (i32* @b to i16) + store i16 %or, i16* %0, align 2 + %cmp = icmp ne i16 %or, 4 + %conv3 = zext i1 %cmp to i32 + %call = tail call i32 bitcast (i32 (...)* @e to i32 (i32)*)(i32 %conv3) #2 + ret i32 undef +} + +; Check that d.sroa.0.0.be is promoted and passed directly into the tail call. +; CHECK-COMMON-LABEL: check_zext_phi_call_arg +; CHECK-COMMON-NOT: uxt +define i32 @check_zext_phi_call_arg() { +entry: + br label %for.cond + +for.cond: ; preds = %for.cond.backedge, %entry + %d.sroa.0.0 = phi i16 [ 30, %entry ], [ %d.sroa.0.0.be, %for.cond.backedge ] + %tobool = icmp eq i16 %d.sroa.0.0, 0 + br i1 %tobool, label %for.cond.backedge, label %if.then + +for.cond.backedge: ; preds = %for.cond, %if.then + %d.sroa.0.0.be = phi i16 [ %call, %if.then ], [ 0, %for.cond ] + br label %for.cond + +if.then: ; preds = %for.cond + %d.sroa.0.0.insert.ext = zext i16 %d.sroa.0.0 to i32 + %call = tail call zeroext i16 bitcast (i16 (...)* @f to i16 (i32)*)(i32 %d.sroa.0.0.insert.ext) #2 + br label %for.cond.backedge +} + + +; The call to safe_lshift_func takes two parameters, but they're the same value, just one is zext.
+; CHECK-COMMON-LABEL: call_zext_i8_i32 +define fastcc i32 @call_zext_i8_i32(i32 %p_45, i8 zeroext %p_46) { +for.cond8.preheader: + %call217 = call fastcc zeroext i8 @safe_mul_func_uint8_t_u_u(i8 zeroext undef) + %tobool219 = icmp eq i8 %call217, 0 + br i1 %tobool219, label %for.end411, label %for.cond273.preheader + +for.cond273.preheader: ; preds = %for.cond8.preheader + %call217.lcssa = phi i8 [ %call217, %for.cond8.preheader ] + %conv218.le = zext i8 %call217.lcssa to i32 + %call346 = call fastcc zeroext i8 @safe_lshift_func(i8 zeroext %call217.lcssa, i32 %conv218.le) + unreachable + +for.end411: ; preds = %for.cond8.preheader + %call452 = call fastcc i64 @safe_sub_func_int64_t_s_s(i64 undef, i64 4) + unreachable +} + +%struct.anon = type { i32 } + +@g_57 = hidden local_unnamed_addr global %struct.anon zeroinitializer, align 4 +@g_893 = hidden local_unnamed_addr global %struct.anon zeroinitializer, align 4 +@g_82 = hidden local_unnamed_addr global i32 0, align 4 + +; Test that the transform bails on finding a call which returns an i16** +; CHECK-COMMON-LABEL: call_return_pointer +; CHECK-COMMON: sxth +; CHECK-COMMON-NOT: uxt +define hidden i32 @call_return_pointer(i8 zeroext %p_13) local_unnamed_addr #0 { +entry: + %conv1 = zext i8 %p_13 to i16 + %call = tail call i16** @func_62(i8 zeroext undef, i32 undef, i16 signext %conv1, i32* undef) + %0 = load i32, i32* getelementptr inbounds (%struct.anon, %struct.anon* @g_893, i32 0, i32 0), align 4 + %conv2 = trunc i32 %0 to i16 + br label %for.cond + +for.cond: ; preds = %for.cond.backedge, %entry + %p_13.addr.0 = phi i8 [ %p_13, %entry ], [ %p_13.addr.0.be, %for.cond.backedge ] + %tobool = icmp eq i8 %p_13.addr.0, 0 + br i1 %tobool, label %for.cond.backedge, label %if.then + +for.cond.backedge: ; preds = %for.cond, %if.then + %p_13.addr.0.be = phi i8 [ %conv4, %if.then ], [ 0, %for.cond ] + br label %for.cond + +if.then: ; preds = %for.cond + %call3 = tail call fastcc signext i16 @safe_sub_func_int16_t_s_s(i16 signext %conv2) + %conv4 = trunc i16 %call3 to i8 + br label %for.cond.backedge +} + +declare noalias i16** @func_62(i8 zeroext %p_63, i32 %p_64, i16 signext %p_65, i32* nocapture readnone %p_66) +declare fastcc signext i16 @safe_sub_func_int16_t_s_s(i16 signext %si2) +declare dso_local fastcc i64 @safe_sub_func_int64_t_s_s(i64, i64) +declare dso_local fastcc zeroext i8 @safe_lshift_func(i8 zeroext, i32) +declare dso_local fastcc zeroext i8 @safe_mul_func_uint8_t_u_u(i8 returned zeroext) + +declare dso_local i32 @e(...) local_unnamed_addr #1 +declare dso_local zeroext i16 @f(...) local_unnamed_addr #1 + +declare i32 @dummy(i32, i32) +declare i8 @dummy_i8(i8) +declare i8 @dummy2(i8*, i8, i8) +declare i16 @dummy3(i16) +declare i32 @assert(...) Index: llvm/trunk/test/CodeGen/ARM/arm-cgp-signed.ll =================================================================== --- llvm/trunk/test/CodeGen/ARM/arm-cgp-signed.ll +++ llvm/trunk/test/CodeGen/ARM/arm-cgp-signed.ll @@ -0,0 +1,45 @@ +; RUN: llc -mtriple=thumbv7m %s -o - | FileCheck %s +; RUN: llc -mtriple=thumbv8m.main %s -o - | FileCheck %s +; RUN: llc -mtriple=thumbv7 %s -o - | FileCheck %s +; RUN: llc -mtriple=armv8 %s -o - | FileCheck %s + +; Test to check that ARMCodeGenPrepare doesn't optimise away sign extends.
+; CHECK-LABEL: test_signed_load: +; CHECK: uxth +define i16 @test_signed_load(i16* %ptr) { + %load = load i16, i16* %ptr + %conv0 = zext i16 %load to i32 + %conv1 = sext i16 %load to i32 + %cmp = icmp eq i32 %conv0, %conv1 + %conv2 = zext i1 %cmp to i16 + ret i16 %conv2 +} + +; Don't allow sign bit generating opcodes. +; CHECK-LABEL: test_ashr: +; CHECK: sxth +define i16 @test_ashr(i16 zeroext %arg) { + %ashr = ashr i16 %arg, 1 + %cmp = icmp eq i16 %ashr, 0 + %conv = zext i1 %cmp to i16 + ret i16 %conv +} + +; CHECK-LABEL: test_sdiv: +; CHECK: sxth +define i16 @test_sdiv(i16 zeroext %arg) { + %sdiv = sdiv i16 %arg, 2 + %cmp = icmp ne i16 %sdiv, 0 + %conv = zext i1 %cmp to i16 + ret i16 %conv +} + +; CHECK-LABEL: test_srem +; CHECK: sxth +define i16 @test_srem(i16 zeroext %arg) { + %srem = srem i16 %arg, 4 + %cmp = icmp ne i16 %srem, 0 + %conv = zext i1 %cmp to i16 + ret i16 %conv +} +
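For local experimentation, the behaviour added in this patch can be toggled from the llc command line using the options defined in ARMCodeGenPrepare.cpp; for example (input.ll is just a placeholder name):

  llc -mtriple=thumbv7em -arm-enable-scalar-dsp=true -arm-enable-scalar-dsp-imms=true input.ll -o -
  llc -mtriple=thumbv7em -arm-disable-cgp=true input.ll -o -

The first form mirrors the CHECK-DSP-IMM RUN lines above; the second disables the new pass entirely.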