Index: lib/Target/ARM/ARMCodeGenPrepare.cpp
===================================================================
--- lib/Target/ARM/ARMCodeGenPrepare.cpp
+++ lib/Target/ARM/ARMCodeGenPrepare.cpp
@@ -145,6 +145,11 @@
   IRPromoter *Promoter = nullptr;
   std::set<Value*> AllVisited;
   SmallPtrSet<Instruction*, 8> SafeToPromote;
+  // Functions in the current module that contain a call to llvm.arm.sel.
+  SmallPtrSet<Function*, 8> SelUsers;
+  // True while running on a function that is not in SelUsers, i.e. when it
+  // is safe to emit instructions that overwrite the GE flags.
+  bool SafeToWriteGE = false;
 
   bool isSafeOverflow(Instruction *I);
   bool isSupportedValue(Value *V);
@@ -730,16 +735,7 @@
   // clear the data structures.
   Cleanup(Sinks);
 
-  LLVM_DEBUG(dbgs() << "ARM CGP: Mutation complete:\n");
-  LLVM_DEBUG(dbgs();
-             for (auto *V : Sources)
-               V->dump();
-             for (auto *I : NewInsts)
-               I->dump();
-             for (auto *V : Visited) {
-               if (!Sources.count(V))
-                 V->dump();
-             });
+  LLVM_DEBUG(dbgs() << "ARM CGP: Mutation complete.\n");
 }
 
 /// We accept most instructions, as well as Arguments and ConstantInsts. We
@@ -813,6 +809,12 @@
   if (I->getOpcode() != Instruction::Add && I->getOpcode() != Instruction::Sub)
     return false;
 
+  // Predicated on whether the sel intrinsic has been used within the function.
+  // If it has, conservatively choose to not generate any instructions that
+  // would overwrite the GE flags.
+  if (!SafeToWriteGE)
+    return false;
+
   // If promotion is not safe, can we use a DSP instruction to natively
   // handle the narrow type?
   if (!ST->hasDSP() || !EnableDSP || !isSupportedType(I))
@@ -943,6 +945,14 @@
 
 bool ARMCodeGenPrepare::doInitialization(Module &M) {
   Promoter = new IRPromoter(&M);
+  // Record every function that calls llvm.arm.sel. Look the intrinsic up by
+  // name: Intrinsic::getDeclaration would insert a declaration into modules
+  // that never use it, modifying the module while we report no change.
+  SelUsers.clear();
+  if (Function *Sel = M.getFunction(Intrinsic::getName(Intrinsic::arm_sel)))
+    for (User *U : Sel->users())
+      if (auto *I = dyn_cast<Instruction>(U))
+        SelUsers.insert(I->getFunction());
   return false;
 }
 
@@ -959,6 +969,10 @@
   bool MadeChange = false;
   LLVM_DEBUG(dbgs() << "ARM CGP: Running on " << F.getName() << "\n");
 
+  SafeToWriteGE = SelUsers.count(&F) == 0;
+  LLVM_DEBUG(if (SafeToWriteGE)
+               dbgs() << "ARM CGP: Can use GE writing intrinsics.\n");
+
   // Search up from icmps to try to promote their operands.
   for (BasicBlock &BB : F) {
     auto &Insts = BB.getInstList();
Index: test/CodeGen/ARM/CGP/arm-cgp-sel.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/CGP/arm-cgp-sel.ll
@@ -0,0 +1,43 @@
+; Check that ARM CodeGenPrepare only selects the GE-flag-writing usub8
+; instruction in functions that do not call llvm.arm.sel.
+; RUN: llc -mtriple=thumbv7em %s -arm-disable-cgp=false -o - | FileCheck %s --check-prefixes=CHECK,NODSP
+; RUN: llc -mtriple=thumbv8m.main -mcpu=cortex-m33 -arm-disable-cgp=false -arm-enable-scalar-dsp=true %s -o - | FileCheck %s --check-prefixes=CHECK,DSP
+
+; CHECK-LABEL: dsp_enable:
+; NODSP-NOT: usub
+; NODSP: uxtb
+; DSP: usub8
+define void @dsp_enable(i8* %in, i8* %out, i8 zeroext %compare) {
+  %first = getelementptr inbounds i8, i8* %in, i32 0
+  %second = getelementptr inbounds i8, i8* %in, i32 1
+  %ld0 = load i8, i8* %first
+  %ld1 = load i8, i8* %second
+  %sub = sub i8 %ld0, %ld1
+  %xor = xor i8 %ld0, -1
+  %cmp = icmp ult i8 %compare, %sub
+  %select = select i1 %cmp, i8 %sub, i8 %xor
+  store i8 %select, i8* %out, align 1
+  ret void
+}
+
+; CHECK-LABEL: dsp_disable:
+; NODSP-NOT: usub
+; DSP-NOT: usub
+; CHECK: uxtb
+define i32 @dsp_disable(i8* %in, i8* %out, i8 zeroext %compare) {
+  %first = getelementptr inbounds i8, i8* %in, i32 0
+  %second = getelementptr inbounds i8, i8* %in, i32 1
+  %ld0 = load i8, i8* %first
+  %ld1 = load i8, i8* %second
+  %sub = sub i8 %ld0, %ld1
+  %xor = xor i8 %ld0, -1
+  %cmp = icmp ult i8 %compare, %sub
+  %select = select i1 %cmp, i8 %sub, i8 %xor
+  store i8 %select, i8* %out, align 1
+  %a = zext i8 %ld0 to i32
+  %b = zext i8 %ld1 to i32
+  %sel = call i32 @llvm.arm.sel(i32 %a, i32 %b)
+  ret i32 %sel
+}
+
+declare i32 @llvm.arm.sel(i32, i32)