Index: lib/Target/ARM/ARM.h
===================================================================
--- lib/Target/ARM/ARM.h
+++ lib/Target/ARM/ARM.h
@@ -43,6 +43,7 @@
 FunctionPass *createA15SDOptimizerPass();
 FunctionPass *createARMLoadStoreOptimizationPass(bool PreAlloc = false);
 FunctionPass *createARMExpandPseudoPass();
+FunctionPass *createARMCodeGenPreparePass();
 FunctionPass *createARMConstantIslandPass();
 FunctionPass *createMLxExpansionPass();
 FunctionPass *createThumb2ITBlockPass();
@@ -64,6 +65,7 @@
 void initializeARMParallelDSPPass(PassRegistry &);
 void initializeARMLoadStoreOptPass(PassRegistry &);
 void initializeARMPreAllocLoadStoreOptPass(PassRegistry &);
+void initializeARMCodeGenPreparePass(PassRegistry &);
 void initializeARMConstantIslandsPass(PassRegistry &);
 void initializeARMExpandPseudoPass(PassRegistry &);
 void initializeThumb2SizeReducePass(PassRegistry &);
Index: lib/Target/ARM/ARMCodeGenPrepare.cpp
===================================================================
--- /dev/null
+++ lib/Target/ARM/ARMCodeGenPrepare.cpp
@@ -0,0 +1,670 @@
+//===----- ARMCodeGenPrepare.cpp ------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass inserts intrinsics to handle small types that would otherwise be
+/// promoted during legalization. Here we can manually promote types or insert
+/// intrinsics which can handle narrow types that aren't supported by the
+/// register classes.
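+///
+/// For example, given a chain such as:
+///   %add = add i8 %x, %y
+///   %cmp = icmp ult i8 %add, 254
+/// the i8 add would normally be legalized with an explicit zero extension
+/// (uxtb) before the compare. This pass instead promotes the whole chain to
+/// operate on i32 or, when the narrow wrapping behaviour must be preserved,
+/// replaces the add with a scalar use of a parallel DSP intrinsic such as
+/// @llvm.arm.uadd8 (see test_lt_254_inc_var in
+/// test/CodeGen/ARM/arm-cgp-icmps.ll).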
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMSubtarget.h"
+#include "ARMTargetMachine.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "arm-codegenprepare"
+
+using namespace llvm;
+
+static cl::opt<bool>
+DisableCGP("arm-disable-cgp", cl::Hidden, cl::init(false),
+           cl::desc("Disable ARM specific CodeGenPrepare pass"));
+
+static cl::opt<bool>
+DisableDSP("arm-disable-scalar-dsp", cl::Hidden, cl::init(false),
+           cl::desc("Don't use DSP instructions for scalar operations"));
+
+static cl::opt<bool>
+DisableDSPWithImms("arm-disable-scalar-dsp-imms", cl::Hidden, cl::init(true),
+                   cl::desc("Don't use DSP instructions for scalar operations\
+                             with immediate operands"));
+
+namespace {
+
+class IRPromoter {
+  SmallPtrSet<Value*, 8> NewInsts;
+  SmallVector<Instruction*, 4> InstsToRemove;
+  Module *M = nullptr;
+  LLVMContext &Ctx;
+
+public:
+  IRPromoter(Module *M) : M(M), Ctx(M->getContext()) { }
+
+  void Cleanup() {
+    for (auto *I : InstsToRemove) {
+      LLVM_DEBUG(dbgs() << "ARM CGP: Removing " << *I << "\n");
+      I->dropAllReferences();
+      I->eraseFromParent();
+    }
+    InstsToRemove.clear();
+    NewInsts.clear();
+  }
+
+  void Mutate(Type *OrigTy, Type *ExtTy,
+              SmallPtrSetImpl<Value*> &Visited,
+              SmallPtrSetImpl<Value*> &Leaves,
+              SmallPtrSetImpl<Instruction*> &Roots);
+};
+
+class ARMCodeGenPrepare : public FunctionPass {
+  const ARMSubtarget *ST = nullptr;
+  IRPromoter *Promoter = nullptr;
+  std::set<Value*> AllVisited;
+  SmallPtrSet<Value*, 16> CurrentVisited;
+  SmallVector<Instruction*, 4> InstsToRemove;
+  Module *M = nullptr;
+  Function *F = nullptr;
+  Type *ExtTy = nullptr;
+  Type *OrigTy = nullptr;
+  unsigned TypeSize = 0;
+
+  bool isNarrowInstSupported(Instruction *I);
+  bool isSupportedValue(Value *V);
+  bool isLegalToPromote(Value *V);
+  bool TryToPromote(Value *V);
+
+public:
+  static char ID;
+
+  ARMCodeGenPrepare() : FunctionPass(ID) {}
+  ~ARMCodeGenPrepare() { delete Promoter; }
+  bool doInitialization(Module &M) override;
+  bool runOnFunction(Function &F) override;
+  StringRef getPassName() const override { return "ARM IR optimizations"; }
+};
+
+}
+
+static bool isSigned(Value *V) {
+  if (auto *I = dyn_cast<Instruction>(V)) {
+    if (auto *Cmp = dyn_cast<ICmpInst>(I)) {
+      if (Cmp->isSigned())
+        return true;
+      // EQ or NE may be comparing against a negative immediate, which will
+      // need to be fixed up later.
+      for (auto &Op : Cmp->operands()) {
+        if (auto *Const = dyn_cast<ConstantInt>(Op))
+          if (Const->isNegative())
+            return true;
+      }
+    }
+
+    unsigned Opc = I->getOpcode();
+    return Opc == Instruction::AShr || Opc == Instruction::SDiv ||
+           Opc == Instruction::SRem;
+  }
+  return false;
+}
+
+/// Some instructions can use 8- and 16-bit operands, and we don't need to
+/// promote anything larger. We disallow booleans to make life easier when
+/// dealing with icmps but allow any other integer that is <= 16 bits. Void
+/// types are accepted so we can handle switches.
+static bool isSupportedType(Value *V) {
+  if (V->getType()->isVoidTy())
+    return true;
+
+  const IntegerType *IntTy = dyn_cast<IntegerType>(V->getType());
+  if (!IntTy)
+    return false;
+
+  // Don't try to promote boolean values.
+  if (IntTy->getBitWidth() == 1)
+    return false;
+
+  if (auto *ZExt = dyn_cast<ZExtInst>(V))
+    return isSupportedType(ZExt->getOperand(0));
+
+  return IntTy->getBitWidth() <= 16;
+}
+
+/// Ignore any pointers and any non-instruction values, other than function
+/// arguments.
+static bool shouldIgnore(Value *V) {
+  return V->getType()->isPointerTy() ||
+         (!isa<Instruction>(V) && !isa<Argument>(V));
+}
+
+/// Return true if V will require any promoted values to be truncated for the
+/// use to be valid.
+static bool isSink(Value *V) {
+  auto UsesNarrowValue = [](Value *V) {
+    return !V->getType()->isIntegerTy(32);
+  };
+
+  if (auto *Store = dyn_cast<StoreInst>(V))
+    return UsesNarrowValue(Store->getValueOperand());
+  if (auto *Return = dyn_cast<ReturnInst>(V))
+    return UsesNarrowValue(Return->getReturnValue());
+
+  return isa<CallInst>(V);
+}
+
+/// Decide whether V will be mutated. If so, we will need to visit the users.
+static bool shouldPromote(Value *V) {
+  auto *I = dyn_cast<Instruction>(V);
+  if (!I)
+    return false;
+
+  if (isa<StoreInst>(V) || isa<TerminatorInst>(V))
+    return false;
+
+  switch(I->getOpcode()) {
+  default:
+    break;
+  case Instruction::ICmp:
+  case Instruction::ZExt:
+  case Instruction::Trunc:
+    return false;
+  }
+  return true;
+}
+
+/// Return true if the given value is a leaf that will need to be zext'd.
+static bool isSource(Value *V) {
+  if (isa<Argument>(V) && isSupportedType(V))
+    return true;
+  else if (isa<TruncInst>(V))
+    return true;
+  else if (auto *ZExt = dyn_cast<ZExtInst>(V))
+    // ZExt can be a leaf if it's the only user of a load.
+    return isa<LoadInst>(ZExt->getOperand(0)) &&
+           ZExt->getOperand(0)->hasOneUse();
+  else if (auto *Call = dyn_cast<CallInst>(V))
+    return Call->hasRetAttr(Attribute::AttrKind::ZExt);
+  else if (auto *Load = dyn_cast<LoadInst>(V)) {
+    // A load is a leaf, unless it's already just being zext'd.
+    if (Load->hasOneUse() && isa<ZExtInst>(*Load->use_begin()))
+      return false;
+    else
+      return true;
+  }
+  return false;
+}
+
+/// Return whether we can safely mutate V's type to ExtTy without having to be
+/// concerned with zero extension or truncation.
+static bool isPromotedResultSafe(Value *V) {
+  if (!isa<Instruction>(V))
+    return true;
+
+  auto *I = cast<Instruction>(V);
+
+  // If I is only being used by something that will require its value to be
+  // truncated, then we don't care about the promoted result.
+  if (I->hasOneUse() && isSink(*I->use_begin()))
+    return true;
+
+  unsigned Opc = I->getOpcode();
+  switch (Opc) {
+  default:
+    break;
+  case Instruction::Mul:
+  case Instruction::Shl:
+    return I->hasNoUnsignedWrap();
+  case Instruction::Add:
+  case Instruction::Sub: {
+    if (I->hasNoUnsignedWrap())
+      return true;
+
+    // We don't care if the add or sub could wrap if the value is decreasing
+    // and is only being used by an unsigned compare.
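+    // For example, (sub i8 %x, 1) feeding (icmp ugt i8 %sub, 1): if %x is 0,
+    // the i8 result wraps to 255 and the promoted i32 result is 0xFFFFFFFF;
+    // both compare ugt 1 the same way, so the promoted compare still takes
+    // the wrapped path.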
+    if (!V->hasOneUse() ||
+        !isa<ICmpInst>(*V->user_begin()) ||
+        !isa<ConstantInt>(I->getOperand(1)))
+      return false;
+
+    auto *CI = cast<ICmpInst>(*V->user_begin());
+    ICmpInst::Predicate Pred = CI->getPredicate();
+    if (ICmpInst::isSigned(Pred))
+      return false;
+
+    bool NegImm = cast<ConstantInt>(I->getOperand(1))->isNegative();
+    bool IsDecreasing = ((Opc == Instruction::Sub) && !NegImm) ||
+                        ((Opc == Instruction::Add) && NegImm);
+    if (!IsDecreasing)
+      return false;
+
+    return true;
+  }
+  }
+  return !isSigned(I);
+}
+
+/// Return the intrinsic for the instruction that can perform the same
+/// operation but on a narrow type. This is using the parallel dsp intrinsics
+/// on scalar values.
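+/// The byte and halfword lanes of these SIMD operations do not carry into
+/// one another, so the bottom lane of, e.g., uadd8 produces exactly the
+/// wrapping 8-bit result that the original narrow add would have produced.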
+static Intrinsic::ID getNarrowIntrinsic(Instruction *I, unsigned TypeSize) {
+  // Whether we use the signed or unsigned versions of these intrinsics
+  // doesn't matter because we're not using the GE bits that they set in
+  // the APSR.
+  switch(I->getOpcode()) {
+  default:
+    break;
+  case Instruction::Add:
+    return TypeSize == 16 ? Intrinsic::arm_uadd16 :
+      Intrinsic::arm_uadd8;
+  case Instruction::Sub:
+    return TypeSize == 16 ? Intrinsic::arm_usub16 :
+      Intrinsic::arm_usub8;
+  }
+  llvm_unreachable("unhandled opcode for narrow intrinsic");
+}
+
+void IRPromoter::Mutate(Type *OrigTy, Type *ExtTy,
+                        SmallPtrSetImpl<Value*> &Visited,
+                        SmallPtrSetImpl<Value*> &Leaves,
+                        SmallPtrSetImpl<Instruction*> &Roots) {
+  IRBuilder<> Builder{Ctx};
+  unsigned TypeSize = OrigTy->getPrimitiveSizeInBits();
+  LLVM_DEBUG(dbgs() << "ARM CGP: Promoting use-def chains from " << TypeSize
+                    << " to 32-bits\n");
+
+  auto ReplaceAllUsersOfWith = [&](Value *From, Value *To) {
+    SmallVector<Instruction*, 4> Users;
+    Instruction *InstTo = dyn_cast<Instruction>(To);
+    for (Use &U : From->uses()) {
+      auto *User = cast<Instruction>(U.getUser());
+      if (InstTo && User->isIdenticalTo(InstTo))
+        continue;
+      Users.push_back(User);
+    }
+
+    for (auto &U : Users)
+      U->replaceUsesOfWith(From, To);
+  };
+
+  auto FixConst = [&](ConstantInt *Const, Instruction *I) {
+    Constant *NewConst = nullptr;
+    if (isPromotedResultSafe(I)) {
+      NewConst = (Const->isNegative()) ?
+        ConstantExpr::getSExt(Const, ExtTy) :
+        ConstantExpr::getZExt(Const, ExtTy);
+    } else {
+      uint64_t NewVal = *Const->getValue().getRawData();
+      if (Const->getType() == Type::getInt16Ty(Ctx))
+        NewVal &= 0xFFFF;
+      else
+        NewVal &= 0xFF;
+      NewConst = ConstantInt::get(ExtTy, NewVal);
+    }
+    I->replaceUsesOfWith(Const, NewConst);
+  };
+
+  auto InsertDSPIntrinsic = [&](Instruction *I) {
+    LLVM_DEBUG(dbgs() << "ARM CGP: Inserting DSP intrinsic for "
+                      << *I << "\n");
+    Function *DSPInst =
+      Intrinsic::getDeclaration(M, getNarrowIntrinsic(I, TypeSize));
+    Builder.SetInsertPoint(I);
+    Builder.SetCurrentDebugLocation(I->getDebugLoc());
+    Value *Args[] = { I->getOperand(0), I->getOperand(1) };
+    CallInst *Call = Builder.CreateCall(DSPInst, Args);
+    ReplaceAllUsersOfWith(I, Call);
+    InstsToRemove.push_back(I);
+    NewInsts.insert(Call);
+  };
+
+  auto InsertTrunc = [&](Instruction *I, Instruction *User) {
+    LLVM_DEBUG(dbgs() << "ARM CGP: Creating Trunc for " << *I << "\n");
+    Builder.SetInsertPoint(I);
+    auto *Trunc = cast<Instruction>(Builder.CreateTrunc(I, OrigTy));
+    Trunc->moveBefore(User);
+    User->replaceUsesOfWith(I, Trunc);
+    NewInsts.insert(Trunc);
+  };
+
+  auto InsertZExt = [&](Value *V, Instruction *InsertPt) {
+    LLVM_DEBUG(dbgs() << "ARM CGP: Inserting ZExt for " << *V << "\n");
+    Builder.SetInsertPoint(InsertPt);
+    if (auto *I = dyn_cast<Instruction>(V))
+      Builder.SetCurrentDebugLocation(I->getDebugLoc());
+    auto *ZExt = cast<Instruction>(Builder.CreateZExt(V, ExtTy));
+    if (isa<Argument>(V))
+      ZExt->moveBefore(InsertPt);
+    else
+      ZExt->moveAfter(InsertPt);
+    ReplaceAllUsersOfWith(V, ZExt);
+    NewInsts.insert(ZExt);
+  };
+
+  // First, insert extending instructions between the leaves and their users.
+  for (auto V : Leaves) {
+    if (auto *ZExt = dyn_cast<ZExtInst>(V))
+      ZExt->mutateType(ExtTy);
+    else if (auto *I = dyn_cast<Instruction>(V))
+      InsertZExt(I, I);
+    else if (auto *Arg = dyn_cast<Argument>(V)) {
+      BasicBlock &BB = Arg->getParent()->front();
+      InsertZExt(Arg, &*BB.getFirstInsertionPt());
+    } else {
+      llvm_unreachable("unhandled leaf that needs extending");
+    }
+  }
+
+  // Then mutate the types of the instructions within the tree. Here we handle
+  // constant operands.
+  for (auto *V : Visited) {
+    if (Leaves.count(V))
+      continue;
+
+    assert(isa<Instruction>(V) && "Expected instruction!");
+    auto *I = cast<Instruction>(V);
+    if (Roots.count(I))
+      continue;
+
+    for (auto &U : I->operands()) {
+      if ((U->getType() == ExtTy) || !isSupportedType(&*U))
+        continue;
+
+      if (auto *Const = dyn_cast<ConstantInt>(&*U))
+        FixConst(Const, I);
+      else if (isa<UndefValue>(&*U))
+        U->mutateType(ExtTy);
+    }
+
+    if (shouldPromote(I))
+      I->mutateType(ExtTy);
+  }
+
+  // Now we need to remove any zexts that have become unnecessary, as well
+  // as insert any intrinsics.
+  for (auto *V : Visited) {
+    if (Leaves.count(V))
+      continue;
+    if (auto *ZExt = dyn_cast<ZExtInst>(V)) {
+      if (ZExt->getDestTy() != ExtTy)
+        ZExt->mutateType(ExtTy);
+      else if (ZExt->getSrcTy() == ExtTy) {
+        ReplaceAllUsersOfWith(V, ZExt->getOperand(0));
+        InstsToRemove.push_back(ZExt);
+      }
+      continue;
+    }
+
+    if (!shouldPromote(V) || isPromotedResultSafe(V))
+      continue;
+
+    // Replace unsafe instructions with appropriate intrinsic calls.
+    InsertDSPIntrinsic(cast<Instruction>(V));
+  }
+
+  // Fix up any stores or returns that use the results of the promoted
+  // chain.
+  for (auto I : Roots) {
+    for (unsigned i = 0; i < I->getNumOperands(); ++i) {
+      Value *Op = I->getOperand(i);
+      if (Visited.count(Op) || NewInsts.count(Op)) {
+        if (auto *V = dyn_cast<Instruction>(Op))
+          InsertTrunc(V, I);
+      }
+    }
+  }
+}
+
+bool ARMCodeGenPrepare::isNarrowInstSupported(Instruction *I) {
+  if (!ST->hasDSP() || DisableDSP || !isSupportedType(I))
+    return false;
+
+  if (I->getOpcode() != Instruction::Add && I->getOpcode() != Instruction::Sub)
+    return false;
+
+  // TODO
+  // Would it be profitable? For Thumb code, these parallel DSP instructions
+  // are only Thumb-2, so we wouldn't be able to dual issue on Cortex-M33. For
+  // Cortex-A, specifically Cortex-A72, the latency is double and throughput is
+  // halved. They also do not take immediates as operands.
+  for (auto &Op : I->operands()) {
+    if (isa<ConstantInt>(Op)) {
+      if (DisableDSPWithImms)
+        return false;
+    }
+  }
+  return true;
+}
+
+/// Disallow casts other than zext and truncs. Allow calls if their return
+/// value is zeroext.
+bool ARMCodeGenPrepare::isSupportedValue(Value *V) {
+  if (auto *Call = dyn_cast<CallInst>(V))
+    return Call->hasRetAttr(Attribute::AttrKind::ZExt);
+  else if (auto *Cmp = dyn_cast<ICmpInst>(V)) {
+    if (!Cmp->isSigned())
+      return true;
+    return !isSource(Cmp->getOperand(0)) && !isSource(Cmp->getOperand(1));
+  } else if (isa<CastInst>(V)) {
+    if (isa<ZExtInst>(V))
+      return true;
+    else if (auto *Trunc = dyn_cast<TruncInst>(V))
+      return Trunc->getDestTy()->getScalarSizeInBits() <= TypeSize;
+    else
+      return false;
+  }
+  return !isSigned(V);
+}
+
+/// Check that the type of V would be promoted and that the original type is
+/// smaller than the targeted promoted type. Check that the promoted type of
+/// V matches the promoted type of the PHI.
+bool ARMCodeGenPrepare::isLegalToPromote(Value *V) {
+  if (!isSupportedValue(V) || !isSupportedType(V))
+    return false;
+
+  unsigned VSize = 0;
+  if (auto *Ld = dyn_cast<LoadInst>(V)) {
+    auto *PtrTy = cast<PointerType>(Ld->getPointerOperandType());
+    VSize = PtrTy->getElementType()->getPrimitiveSizeInBits();
+  } else if (auto *ZExt = dyn_cast<ZExtInst>(V)) {
+    VSize = ZExt->getOperand(0)->getType()->getPrimitiveSizeInBits();
+  } else {
+    VSize = V->getType()->getPrimitiveSizeInBits();
+  }
+
+  if (VSize > TypeSize)
+    return false;
+
+  if (isPromotedResultSafe(V))
+    return true;
+
+  if (auto *I = dyn_cast<Instruction>(V))
+    return isNarrowInstSupported(I);
+
+  return false;
+}
+
+bool ARMCodeGenPrepare::TryToPromote(Value *V) {
+  if (shouldIgnore(V) || !shouldPromote(V))
+    return false;
+
+  OrigTy = V->getType();
+  TypeSize = OrigTy->getPrimitiveSizeInBits();
+
+  if (!isLegalToPromote(V))
+    return false;
+
+  LLVM_DEBUG(dbgs() << "ARM CGP: TryToPromote: " << *V << "\n");
+
+  SetVector<Value*> WorkList;
+  SmallPtrSet<Value*, 8> Leaves;
+  SmallPtrSet<Instruction*, 4> Roots;
+  WorkList.insert(V);
+  CurrentVisited.clear();
+
+  // Return true if the given value can, or has been, visited. Add V to the
+  // worklist if needed.
+  auto AddLegalInst = [&](Value *V) {
+    if ((isa<Instruction>(V) && CurrentVisited.count(cast<Instruction>(V))) ||
+        shouldIgnore(V))
+      return true;
+
+    if (!isSupportedValue(V) || (shouldPromote(V) && !isLegalToPromote(V)))
+      return false;
+
+    WorkList.insert(V);
+    return true;
+  };
+
+  // Iterate through, and add to, a tree of operands and users in the use-def.
+  while (!WorkList.empty()) {
+    Value *V = WorkList.back();
+    WorkList.pop_back();
+    if (CurrentVisited.count(V))
+      continue;
+
+    // If we've already visited this value from somewhere, bail now because
+    // the tree has already been explored.
+    // TODO: This could limit the transform: if we try to promote something
+    // from an i8 and fail, the values visited can't be reconsidered by a
+    // later attempt.
+    if (AllVisited.count(V))
+      return false;
+
+    CurrentVisited.insert(V);
+    AllVisited.insert(V);
+
+    // Calls can be both sources and sinks.
+    if (isSink(V))
+      Roots.insert(cast<Instruction>(V));
+    if (isSource(V))
+      Leaves.insert(V);
+    else if (auto *I = dyn_cast<Instruction>(V)) {
+      // Visit operands of any instruction visited.
+      for (auto &U : I->operands()) {
+        if (!AddLegalInst(U))
+          return false;
+      }
+    }
+
+    // Don't visit users of a node which isn't going to be mutated unless it's
+    // a source.
+    if (isSource(V) || shouldPromote(V)) {
+      for (Use &U : V->uses()) {
+        if (!AddLegalInst(U.getUser()))
+          return false;
+      }
+    }
+  }
+
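+  // Rough cost heuristic: count the values that will be mutated in place for
+  // free against the leaves (loads, truncs and arguments that aren't already
+  // zero-extended) that will each need an extension inserted, and only
+  // promote if the extensions don't outnumber the mutations.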
+  unsigned NumToPromote = 0;
+  unsigned Cost = 0;
+  for (auto *V : CurrentVisited) {
+    if (isa<Constant>(V))
+      continue;
+    if (shouldPromote(V) || isa<ZExtInst>(V))
+      ++NumToPromote;
+
+    if (isa<TruncInst>(V))
+      ++Cost;
+    else if (auto *Arg = dyn_cast<Argument>(V)) {
+      if (!Arg->hasZExtAttr())
+        ++Cost;
+    } else if (isa<LoadInst>(V))
+      ++Cost;
+  }
+
+  LLVM_DEBUG(dbgs() << "ARM CGP: Visited nodes:\n";
+             for (auto *I : CurrentVisited)
+               I->dump();
+             );
+  LLVM_DEBUG(dbgs() << "ARM CGP: Cost of promoting " << NumToPromote
+                    << " instructions = " << Cost << "\n");
+  if (Cost > NumToPromote)
+    return false;
+
+  Promoter->Mutate(OrigTy, ExtTy, CurrentVisited, Leaves, Roots);
+  return true;
+}
+
+bool ARMCodeGenPrepare::doInitialization(Module &M) {
+  this->M = &M;
+  ExtTy = Type::getInt32Ty(M.getContext());
+  Promoter = new IRPromoter(&M);
+  return false;
+}
+
+bool ARMCodeGenPrepare::runOnFunction(Function &f) {
+  if (skipFunction(f) || DisableCGP)
+    return false;
+
+  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+  if (!TPC)
+    return false;
+
+  F = &f;
+  const TargetMachine &TM = TPC->getTM<TargetMachine>();
+  ST = &TM.getSubtarget<ARMSubtarget>(f);
+  bool MadeChange = false;
+  LLVM_DEBUG(dbgs() << "ARM CGP: Running on " << F->getName() << "\n");
+
+  // Search up from icmps to try to promote their operands.
+  for (BasicBlock &BB : *F) {
+    auto &Insts = BB.getInstList();
+    for (auto &I : Insts) {
+      if (AllVisited.count(&I))
+        continue;
+
+      if (isa<ICmpInst>(I)) {
+        auto &CI = cast<ICmpInst>(I);
+        for (auto &Op : CI.operands()) {
+          if (auto *I = dyn_cast<Instruction>(Op)) {
+            if (isa<ZExtInst>(I))
+              MadeChange |= TryToPromote(I->getOperand(0));
+            else
+              MadeChange |= TryToPromote(I);
+          }
+        }
+      }
+    }
+    Promoter->Cleanup();
+    LLVM_DEBUG(if (verifyFunction(*F, &dbgs())) {
+                 report_fatal_error("Broken function after type promotion");
+               });
+  }
+
+  return MadeChange;
+}
+
+INITIALIZE_PASS_BEGIN(ARMCodeGenPrepare, DEBUG_TYPE,
+                      "ARM IR optimizations", false, false)
+INITIALIZE_PASS_END(ARMCodeGenPrepare, DEBUG_TYPE, "ARM IR optimizations",
+                    false, false)
+
+char ARMCodeGenPrepare::ID = 0;
+
+FunctionPass *llvm::createARMCodeGenPreparePass() {
+  return new ARMCodeGenPrepare();
+}
Index: lib/Target/ARM/ARMTargetMachine.cpp
===================================================================
--- lib/Target/ARM/ARMTargetMachine.cpp
+++ lib/Target/ARM/ARMTargetMachine.cpp
@@ -90,6 +90,7 @@
   initializeARMLoadStoreOptPass(Registry);
   initializeARMPreAllocLoadStoreOptPass(Registry);
   initializeARMParallelDSPPass(Registry);
+  initializeARMCodeGenPreparePass(Registry);
   initializeARMConstantIslandsPass(Registry);
   initializeARMExecutionDomainFixPass(Registry);
   initializeARMExpandPseudoPass(Registry);
@@ -350,6 +351,7 @@
   }
 
   void addIRPasses() override;
+  void addCodeGenPrepare() override;
   bool addPreISel() override;
   bool addInstSelector() override;
   bool addIRTranslator() override;
@@ -406,6 +408,11 @@
   addPass(createInterleavedAccessPass());
 }
 
+void ARMPassConfig::addCodeGenPrepare() {
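+  // Run the ARM-specific promotion pass first, so the generic IR-level
+  // CodeGenPrepare sees the already-widened chains.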
+  addPass(createARMCodeGenPreparePass());
+  TargetPassConfig::addCodeGenPrepare();
+}
+
 bool ARMPassConfig::addPreISel() {
   if (getOptLevel() != CodeGenOpt::None)
     addPass(createARMParallelDSPPass());
Index: lib/Target/ARM/CMakeLists.txt
===================================================================
--- lib/Target/ARM/CMakeLists.txt
+++ lib/Target/ARM/CMakeLists.txt
@@ -23,6 +23,7 @@
   ARMBaseInstrInfo.cpp
   ARMBaseRegisterInfo.cpp
   ARMCallLowering.cpp
+  ARMCodeGenPrepare.cpp
   ARMConstantIslandPass.cpp
   ARMConstantPoolValue.cpp
   ARMExpandPseudoInsts.cpp
Index: test/CodeGen/ARM/arm-cgp-icmps.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/arm-cgp-icmps.ll
@@ -0,0 +1,311 @@
+; RUN: llc -mtriple=thumbv7m %s -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-NODSP
+; RUN: llc -mtriple=thumbv7em %s -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-DSP
+; RUN: llc -mtriple=thumbv8 %s -arm-disable-scalar-dsp-imms=false -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-DSP-IMM
+
+; CHECK-COMMON-LABEL: test_lt_254_inc_imm:
+; CHECK-DSP: adds r0, #1
+; CHECK-DSP-NEXT: uxtb r1, r0
+; CHECK-DSP-NEXT: movs r0, #47
+; CHECK-DSP-NEXT: cmp r1, #254
+; CHECK-DSP-NEXT: it lo
+; CHECK-DSP-NEXT: movlo r0, #35
+
+; CHECK-DSP-IMM: movs r1, #1
+; CHECK-DSP-IMM-NEXT: uadd8 r1, r0, r1
+; CHECK-DSP-IMM-NEXT: movs r0, #47
+; CHECK-DSP-IMM-NEXT: cmp r1, #254
+; CHECK-DSP-IMM-NEXT: it lo
+; CHECK-DSP-IMM-NEXT: movlo r0, #35
+define i32 @test_lt_254_inc_imm(i8 zeroext %x) {
+entry:
+  %add = add i8 %x, 1
+  %cmp = icmp ult i8 %add, 254
+  %res = select i1 %cmp, i32 35, i32 47
+  ret i32 %res
+}
+
+; CHECK-COMMON-LABEL: test_lt_254_inc_var:
+; CHECK-NODSP: add r0, r1
+; CHECK-NODSP-NEXT: uxtb r1, r0
+; CHECK-NODSP-NEXT: movs r0, #47
+; CHECK-NODSP-NEXT: cmp r1, #254
+; CHECK-NODSP-NEXT: it lo
+; CHECK-NODSP-NEXT: movlo r0, #35
+
+; CHECK-DSP: uadd8 r1, r0, r1
+; CHECK-DSP-NEXT: movs r0, #47
+; CHECK-DSP-NEXT: cmp r1, #254
+; CHECK-DSP-NEXT: it lo
+; CHECK-DSP-NEXT: movlo r0, #35
+define i32 @test_lt_254_inc_var(i8 zeroext %x, i8 zeroext %y) {
+entry:
+  %add = add i8 %x, %y
+  %cmp = icmp ult i8 %add, 254
+  %res = select i1 %cmp, i32 35, i32 47
+  ret i32 %res
+}
+
+; CHECK-COMMON-LABEL: test_gt_1_dec_imm:
+; CHECK-COMMON: subs r1, r0, #1
+; CHECK-COMMON-NEXT: movs r0, #47
+; CHECK-COMMON-NEXT: cmp r1, #1
+; CHECK-COMMON-NEXT: it hi
+; CHECK-COMMON-NEXT: movhi r0, #35
+define i32 @test_gt_1_dec_imm(i8 zeroext %x) {
+entry:
+  %add = add i8 %x, -1
+  %cmp = icmp ugt i8 %add, 1
+  %res = select i1 %cmp, i32 35, i32 47
+  ret i32 %res
+}
+
+; CHECK-COMMON-LABEL: test_gt_1_dec_var:
+; CHECK-NODSP: subs r0, r0, r1
+; CHECK-NODSP-NEXT: uxtb r1, r0
+; CHECK-NODSP-NEXT: movs r0, #47
+; CHECK-NODSP-NEXT: cmp r1, #1
+; CHECK-NODSP-NEXT: it hi
+; CHECK-NODSP-NEXT: movhi r0, #35
+
+; CHECK-DSP: usub8 r1, r0, r1
+; CHECK-DSP-NEXT: movs r0, #47
+; CHECK-DSP-NEXT: cmp r1, #1
+; CHECK-DSP-NEXT: it hi
+; CHECK-DSP-NEXT: movhi r0, #35
+define i32 @test_gt_1_dec_var(i8 zeroext %x, i8 zeroext %y) {
+entry:
+  %sub = sub i8 %x, %y
+  %cmp = icmp ugt i8 %sub, 1
+  %res = select i1 %cmp, i32 35, i32 47
+  ret i32 %res
+}
+
+; CHECK-COMMON-LABEL: dsp_imm1:
+; CHECK-DSP: eors r1, r0
+; CHECK-DSP-NEXT: and r0, r0, #7
+; CHECK-DSP-NEXT: subs r0, r0, r1
+; CHECK-DSP-NEXT: adds r0, #1
+; CHECK-DSP-NEXT: uxtb r1, r0
+; CHECK-DSP-NEXT: movs r0, #47
+; CHECK-DSP-NEXT: cmp r1, #254
+; CHECK-DSP-NEXT: it lo
+; CHECK-DSP-NEXT: movlo r0, #35
+
+; CHECK-DSP-IMM: eors r1, r0
+; CHECK-DSP-IMM-NEXT: and r0, r0, #7
+; CHECK-DSP-IMM-NEXT: usub8 r0, r0, r1
+; CHECK-DSP-IMM-NEXT: movs r1, #1
+; CHECK-DSP-IMM-NEXT: uadd8 r1, r0, r1
+; CHECK-DSP-IMM-NEXT: movs r0, #47
+; CHECK-DSP-IMM-NEXT: cmp r1, #254
+; CHECK-DSP-IMM-NEXT: it lo
+; CHECK-DSP-IMM-NEXT: movlo r0, #35
+define i32 @dsp_imm1(i8 zeroext %x, i8 zeroext %y) {
+entry:
+  %xor = xor i8 %x, %y
+  %and = and i8 %x, 7
+  %sub = sub i8 %and, %xor
+  %add = add i8 %sub, 1
+  %cmp = icmp ult i8 %add, 254
+  %res = select i1 %cmp, i32 35, i32 47
+  ret i32 %res
+}
+
+; CHECK-COMMON-LABEL: dsp_imm2:
+; CHECK-COMMON: add r0, r1
+; CHECK-DSP-NEXT: ldrh r1, [r3]
+; CHECK-DSP-NEXT: ldrh r2, [r2]
+; CHECK-DSP-NEXT: subs r1, r1, r0
+; CHECK-DSP-NEXT: rsbs r0, r0, #0
+; CHECK-DSP-NEXT: subs r0, r2, r0
+; CHECK-DSP-NEXT: sxth r3, r1
+; CHECK-DSP-NEXT: sxth r2, r0
+; CHECK-DSP-NEXT: cmp r2, r3
+
+; CHECK-DSP-IMM: movs r1, #0
+; CHECK-DSP-IMM-NEXT: uxth r0, r0
+; CHECK-DSP-IMM-NEXT: usub16 r1, r1, r0
+; CHECK-DSP-IMM-NEXT: ldrh r0, [r2]
+; CHECK-DSP-IMM-NEXT: ldrh r3, [r3]
+; CHECK-DSP-IMM-NEXT: usub16 r0, r0, r1
+; CHECK-DSP-IMM-NEXT: uadd16 r1, r3, r1
+; CHECK-DSP-IMM-NEXT: cmp r0, r1
+
+define i16 @dsp_imm2(i32 %arg0, i32 %arg1, i16* %gep0, i16* %gep1) {
+entry:
+  %add0 = add i32 %arg0, %arg1
+  %conv0 = trunc i32 %add0 to i16
+  %sub0 = sub i16 0, %conv0
+  %load0 = load i16, i16* %gep0, align 2
+  %load1 = load i16, i16* %gep1, align 2
+  %sub1 = sub i16 %load0, %sub0
+  %add1 = add i16 %load1, %sub0
+  %cmp = icmp slt i16 %sub1, %add1
+  %res = select i1 %cmp, i16 %add1, i16 %sub1
+  ret i16 %res
+}
+
+; CHECK-COMMON-LABEL: dsp_var:
+; CHECK-COMMON: eors r1, r0
+; CHECK-COMMON: and r2, r0, #7
+; CHECK-NODSP: subs r1, r2, r1
+; CHECK-NODSP: add.w r0, r1, r0, lsl #1
+; CHECK-NODSP: uxtb r1, r0
+; CHECK-DSP: usub8 r1, r2, r1
+; CHECK-DSP: lsls r0, r0, #1
+; CHECK-DSP: uadd8 r1, r1, r0
+; CHECK-DSP-NOT: uxt
+; CHECK-COMMON: movs r0, #47
+; CHECK-COMMON: cmp r1, #254
+; CHECK-COMMON: it lo
+; CHECK-COMMON: movlo r0, #35
+define i32 @dsp_var(i8 zeroext %x, i8 zeroext %y) {
+  %xor = xor i8 %x, %y
+  %and = and i8 %x, 7
+  %sub = sub i8 %and, %xor
+  %mul = shl nuw i8 %x, 1
+  %add = add i8 %sub, %mul
+  %cmp = icmp ult i8 %add, 254
+  %res = select i1 %cmp, i32 35, i32 47
+  ret i32 %res
+}
+
+; CHECK-COMMON-LABEL: store_dsp_res:
+; CHECK-DSP: usub8
+; CHECK-DSP: strb
+define void @store_dsp_res(i8* %in, i8* %out, i8 %compare) {
+  %first = getelementptr inbounds i8, i8* %in, i32 0
+  %second = getelementptr inbounds i8, i8* %in, i32 1
+  %ld0 = load i8, i8* %first
+  %ld1 = load i8, i8* %second
+  %xor = xor i8 %ld0, -1
+  %cmp = icmp ult i8 %compare, %ld1
+  %select = select i1 %cmp, i8 %compare, i8 %xor
+  %sub = sub i8 %ld0, %select
+  store i8 %sub, i8* %out, align 1
+  ret void
+}
+
+; CHECK-COMMON-LABEL: ugt_1_dec_imm:
+; CHECK-COMMON: subs r1, r0, #1
+; CHECK-COMMON-NEXT: movs r0, #47
+; CHECK-COMMON-NEXT: cmp r1, #1
+; CHECK-COMMON-NEXT: it hi
+; CHECK-COMMON-NEXT: movhi r0, #35
+define i32 @ugt_1_dec_imm(i8 zeroext %x) {
+entry:
+  %add = add i8 %x, -1
+  %cmp = icmp ugt i8 %add, 1
+  %res = select i1 %cmp, i32 35, i32 47
+  ret i32 %res
+}
+
+; CHECK-COMMON-LABEL: ugt_1_dec_var:
+; CHECK-NODSP: subs r0, r0, r1
+; CHECK-NODSP-NEXT: uxtb r1, r0
+; CHECK-NODSP-NEXT: movs r0, #47
+; CHECK-NODSP-NEXT: cmp r1, #1
+; CHECK-NODSP-NEXT: it hi
+; CHECK-NODSP-NEXT: movhi r0, #35
+
+; CHECK-DSP: usub8 r1, r0, r1
+; CHECK-DSP-NEXT: movs r0, #47
+; CHECK-DSP-NEXT: cmp r1, #1
+; CHECK-DSP-NEXT: it hi
+; CHECK-DSP-NEXT: movhi r0, #35
+define i32 @ugt_1_dec_var(i8 zeroext %x, i8 zeroext %y) {
+entry:
+  %sub = sub i8 %x, %y
+  %cmp = icmp ugt i8 %sub, 1
+  %res = select i1 %cmp, i32 35, i32 47
+  ret i32 %res
+}
+
+; CHECK-COMMON-LABEL: icmp_i32_zext:
+; CHECK-COMMON: ldrb [[LD:r[^ ]+]], [r0]
+; CHECK-COMMON: subs [[SUB:r[^ ]+]], [[LD]], #1
+; CHECK-COMMON-NOT: uxt
+; CHECK-COMMON: cmp [[LD]], [[SUB]]
+; CHECK-COMMON-NOT: uxt
+define i8 @icmp_i32_zext(i8* %ptr) {
+entry:
+  %gep = getelementptr inbounds i8, i8* %ptr, i32 0
+  %0 = load i8, i8* %gep, align 1
+  %1 = sub nuw nsw i8 %0, 1
+  %conv44 = zext i8 %0 to i32
+  br label %preheader
+
+preheader:
+  br label %body
+
+body:
+  %2 = phi i8 [ %1, %preheader ], [ %3, %if.end ]
+  %si.0274 = phi i32 [ %conv44, %preheader ], [ %inc, %if.end ]
+  %conv51266 = zext i8 %2 to i32
+  %cmp52267 = icmp eq i32 %si.0274, %conv51266
+  br i1 %cmp52267, label %if.end, label %exit
+
+if.end:
+  %inc = add i32 %si.0274, 1
+  %gep1 = getelementptr inbounds i8, i8* %ptr, i32 %inc
+  %3 = load i8, i8* %gep1, align 1
+  br label %body
+
+exit:
+  ret i8 %2
+}
+
+; CHECK-COMMON-LABEL: or_icmp_ugt:
+; CHECK-COMMON: ldrb [[LD:r[^ ]+]], [r1]
+; CHECK-COMMON: subs [[SUB:r[^ ]+]], #1
+; CHECK-COMMON-NOT: uxtb
+; CHECK-COMMON: cmp [[SUB]], #3
+define i1 @or_icmp_ugt(i32 %arg, i8* %ptr) {
+entry:
+  %0 = load i8, i8* %ptr
+  %1 = zext i8 %0 to i32
+  %mul = shl nuw nsw i32 %1, 1
+  %add0 = add nuw nsw i32 %mul, 6
+  %cmp0 = icmp ne i32 %arg, %add0
+  %add1 = add i8 %0, -1
+  %cmp1 = icmp ugt i8 %add1, 3
+  %or = or i1 %cmp0, %cmp1
+  ret i1 %or
+}
+
+; CHECK-COMMON-LABEL: icmp_switch_trunc:
+; CHECK-COMMON-NOT: uxt
+define i16 @icmp_switch_trunc(i16 %arg) {
+entry:
+  %conv = and i16 %arg, 15
+  %mul = mul nuw nsw i16 %conv, 3
+  %trunc = trunc i16 %arg to i3
+  switch i3 %trunc, label %default [
+    i3 0, label %sw.bb
+    i3 1, label %sw.bb.i
+  ]
+
+sw.bb:
+  %cmp0 = icmp ult i16 %mul, 127
+  %select = select i1 %cmp0, i16 %mul, i16 127
+  ret i16 %select
+
+sw.bb.i:
+  %cmp1 = icmp ugt i16 %mul, 34
+  %select.i = select i1 %cmp1, i16 %mul, i16 34
+  ret i16 %select.i
+
+default:
+  ret i16 %mul
+}
+
+; CHECK-COMMON-LABEL: icmp_eq_minus_one:
+; CHECK-COMMON: cmp r0, #255
+define i32 @icmp_eq_minus_one(i8* %ptr) {
+  %load = load i8, i8* %ptr, align 1
+  %conv = zext i8 %load to i32
+  %cmp = icmp eq i8 %load, -1
+  %ret = select i1 %cmp, i32 %conv, i32 -1
+  ret i32 %ret
+}
+
Index: test/CodeGen/ARM/arm-cgp-phis-calls-ret.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/arm-cgp-phis-calls-ret.ll
@@ -0,0 +1,201 @@
+; RUN: llc -mtriple=thumbv7m %s -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-NODSP
+; RUN: llc -mtriple=thumbv8m.main -mcpu=cortex-m33 %s -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-DSP
+; RUN: llc -mtriple=thumbv7em %s -arm-disable-scalar-dsp-imms=false -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-DSP-IMM
+
+; CHECK-COMMON-LABEL: phi_feeding_phi_args:
+; CHECK-COMMON: uxtb
+; CHECK-COMMON: uxtb
+; CHECK-COMMON-NOT: uxtb
+define void @phi_feeding_phi_args(i8 %a, i8 %b) {
+entry:
+  %0 = icmp ugt i8 %a, %b
+  br i1 %0, label %preheader, label %empty
+
+empty:
+  br label %preheader
+
+preheader:
+  %1 = phi i8 [ %a, %entry ], [ %b, %empty ]
+  br label %loop
+
+loop:
+  %val = phi i8 [ %1, %preheader ], [ %inc2, %if.end ]
+  %cmp = icmp ult i8 %val, 254
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+  %inc = sub nuw i8 %val, 2
+  br label %if.end
+
+if.else:
+  %inc1 = shl nuw i8 %val, 1
+  br label %if.end
+
+if.end:
+  %inc2 = phi i8 [ %inc, %if.then], [ %inc1, %if.else ]
+  %cmp1 = icmp eq i8 %inc2, 255
+  br i1 %cmp1, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; CHECK-COMMON-LABEL: phi_feeding_phi_zeroext_args:
+; CHECK-COMMON-NOT: uxt
+define void @phi_feeding_phi_zeroext_args(i8 zeroext %a, i8 zeroext %b) {
+entry:
+  %0 = icmp ugt i8 %a, %b
+  br i1 %0, label %preheader, label %empty
+
+empty:
+  br label %preheader
+
+preheader:
+  %1 = phi i8 [ %a, %entry ], [ %b, %empty ]
+  br label %loop
+
+loop:
+  %val = phi i8 [ %1, %preheader ], [ %inc2, %if.end ]
+  %cmp = icmp ult i8 %val, 254
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+  %inc = sub nuw i8 %val, 2
+  br label %if.end
+
+if.else:
+  %inc1 = shl nuw i8 %val, 1
+  br label %if.end
+
+if.end:
+  %inc2 = phi i8 [ %inc, %if.then], [ %inc1, %if.else ]
+  %cmp1 = icmp eq i8 %inc2, 255
+  br i1 %cmp1, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; CHECK-COMMON-LABEL: phi_i8:
+; CHECK-COMMON-NOT: uxt
+define void @phi_i8() {
+entry:
+  br label %loop
+
+loop:
+  %val = phi i8 [ 0, %entry ], [ %inc2, %if.end ]
+  %cmp = icmp ult i8 %val, 254
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+  %inc = add nuw i8 %val, 2
+  br label %if.end
+
+if.else:
+  %inc1 = add nuw i8 %val, 1
+  br label %if.end
+
+if.end:
+  %inc2 = phi i8 [ %inc, %if.then], [ %inc1, %if.else ]
+  %cmp1 = icmp eq i8 %inc2, 255
+  br i1 %cmp1, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; CHECK-COMMON-LABEL: phi_i16:
+; CHECK-COMMON-NOT: uxt
+define void @phi_i16() {
+entry:
+  br label %loop
+
+loop:
+  %val = phi i16 [ 0, %entry ], [ %inc2, %if.end ]
+  %cmp = icmp ult i16 %val, 128
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+  %inc = add nuw i16 %val, 2
+  br label %if.end
+
+if.else:
+  %inc1 = add nuw i16 %val, 1
+  br label %if.end
+
+if.end:
+  %inc2 = phi i16 [ %inc, %if.then], [ %inc1, %if.else ]
+  %cmp1 = icmp ult i16 %inc2, 253
+  br i1 %cmp1, label %loop, label %exit
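+
+; The pass should bail on these chains: each one contains a sign extension,
+; a signed operation, or a call that isn't known to return a zero-extended
+; value, so the narrow extensions below must still be emitted.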
+
+exit:
+  ret void
+}
+
+; CHECK-COMMON-LABEL: ret_i8:
+; CHECK-COMMON-NOT: uxt
+define i8 @ret_i8() {
+entry:
+  br label %loop
+
+loop:
+  %val = phi i8 [ 0, %entry ], [ %inc2, %if.end ]
+  %cmp = icmp ult i8 %val, 128
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+  %inc = add nuw i8 %val, 2
+  br label %if.end
+
+if.else:
+  %inc1 = add nuw i8 %val, 1
+  br label %if.end
+
+if.end:
+  %inc2 = phi i8 [ %inc, %if.then], [ %inc1, %if.else ]
+  %cmp1 = icmp ult i8 %inc2, 253
+  br i1 %cmp1, label %exit, label %loop
+
+exit:
+  ret i8 %inc2
+}
+
+
+declare i32 @dummy(i32, i32)
+
+; CHECK-COMMON-LABEL: zext_load_sink_call:
+; CHECK-DSP: uxt
+; CHECK-DSP-NEXT: cmp
+; CHECK-DSP: uxt
+; CHECK-DSP-IMM: uxt
+; CHECK-DSP-IMM: uadd16
+; CHECK-DSP-IMM-NOT: uxt
+define i32 @zext_load_sink_call(i16* %ptr, i16 %exp) {
+entry:
+  %0 = load i16, i16* %ptr, align 4
+  %1 = add i16 %exp, 3
+  %cmp = icmp eq i16 %0, %exp
+  br i1 %cmp, label %exit, label %if.then
+
+if.then:
+  %conv0 = zext i16 %0 to i32
+  %conv1 = zext i16 %1 to i32
+  %call = tail call arm_aapcs_vfpcc i32 @dummy(i32 %conv0, i32 %conv1)
+  br label %exit
+
+exit:
+  %exitval = phi i32 [ %call, %if.then ], [ 0, %entry ]
+  ret i32 %exitval
+}
+
+declare i8 @dummy2(i8*, i8, i8)
+
+; CHECK-COMMON-LABEL: call_with_imms:
+; CHECK-COMMON-NOT: uxt
+define i8 @call_with_imms(i8* %arg) {
+  %call = tail call arm_aapcs_vfpcc zeroext i8 @dummy2(i8* nonnull %arg, i8 zeroext 0, i8 zeroext 0)
+  %cmp = icmp eq i8 %call, 0
+  %res = select i1 %cmp, i8 %call, i8 1
+  ret i8 %res
+}
+
Index: test/CodeGen/ARM/arm-cgp-signed.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/arm-cgp-signed.ll
@@ -0,0 +1,53 @@
+; RUN: llc -mtriple=thumbv7m %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv8m.main %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv7 %s -o - | FileCheck %s
+; RUN: llc -mtriple=armv8 %s -o - | FileCheck %s
+
+; CHECK-LABEL: test_signed_load:
+; CHECK: uxth
+define i16 @test_signed_load(i16* %ptr) {
+  %load = load i16, i16* %ptr
+  %conv0 = zext i16 %load to i32
+  %conv1 = sext i16 %load to i32
+  %cmp = icmp eq i32 %conv0, %conv1
+  %conv2 = zext i1 %cmp to i16
+  ret i16 %conv2
+}
+
+; CHECK-LABEL: test_ashr:
+; CHECK: sxth
+define i16 @test_ashr(i16 zeroext %arg) {
+  %ashr = ashr i16 %arg, 1
+  %cmp = icmp eq i16 %ashr, 0
+  %conv = zext i1 %cmp to i16
+  ret i16 %conv
+}
+
+; CHECK-LABEL: test_sdiv:
+; CHECK: sxth
+define i16 @test_sdiv(i16 zeroext %arg) {
+  %sdiv = sdiv i16 %arg, 2
+  %cmp = icmp ne i16 %sdiv, 0
+  %conv = zext i1 %cmp to i16
+  ret i16 %conv
+}
+
+; CHECK-LABEL: test_srem:
+; CHECK: sxth
+define i16 @test_srem(i16 zeroext %arg) {
+  %srem = srem i16 %arg, 4
+  %cmp = icmp ne i16 %srem, 0
+  %conv = zext i1 %cmp to i16
+  ret i16 %conv
+}
+
+declare i8 @dummy(i8)
+
+; CHECK-LABEL: test_call:
+; CHECK: sxtb
+define i16 @test_call(i8 zeroext %arg) {
+  %call = call i8 @dummy(i8 %arg)
+  %cmp = icmp ult i8 %call, 128
+  %conv = zext i1 %cmp to i16
+  ret i16 %conv
+}