diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -334,8 +334,9 @@ } // Checks whether Inst is part of a min(max()) or max(min()) pattern -// that will match to an SSAT instruction -static bool isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) { +// that will match to an SSAT instruction. Returns the instruction being +// saturated, or null if no saturation pattern was found. +static Value *isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) { Value *LHS, *RHS; ConstantInt *C; SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor; @@ -358,12 +359,26 @@ return false; }; - if (isSSatMin(Inst->getOperand(1)) || - (Inst->hasNUses(2) && (isSSatMin(*Inst->user_begin()) || - isSSatMin(*(++Inst->user_begin()))))) - return true; + if (isSSatMin(Inst->getOperand(1))) + return cast(Inst->getOperand(1))->getOperand(1); + if (Inst->hasNUses(2) && + (isSSatMin(*Inst->user_begin()) || isSSatMin(*(++Inst->user_begin())))) + return Inst->getOperand(1); } - return false; + return nullptr; +} + +// Look for a FP Saturation pattern, where the instruction can be simplified to +// a fptosi.sat. max(min(fptosi)). The constant in this case is always free. +static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm) { + if (Imm != -2147483648) + return false; + Value *FP = isSSATMinMaxPattern(Inst, Imm); + if (!FP && isa(Inst) && Inst->hasOneUse()) + FP = isSSATMinMaxPattern(cast(*Inst->user_begin()), Imm); + if (!FP) + return false; + return isa(FP); } InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, @@ -423,6 +438,9 @@ return 0; } + if (Inst && ST->hasVFP2Base() && isFPSatMinMaxPattern(Inst, Imm)) + return 0; + // We can convert <= -1 to < 0, which is generally quite cheap. if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnesValue()) { ICmpInst::Predicate Pred = cast(Inst)->getPredicate(); diff --git a/llvm/test/CodeGen/ARM/fpclamptosat.ll b/llvm/test/CodeGen/ARM/fpclamptosat.ll --- a/llvm/test/CodeGen/ARM/fpclamptosat.ll +++ b/llvm/test/CodeGen/ARM/fpclamptosat.ll @@ -5229,63 +5229,25 @@ ; ; VFP2-LABEL: unroll_maxmin: ; VFP2: @ %bb.0: -; VFP2-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} -; VFP2-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} -; VFP2-NEXT: .pad #4 -; VFP2-NEXT: sub sp, #4 -; VFP2-NEXT: .vsave {d8} -; VFP2-NEXT: vpush {d8} -; VFP2-NEXT: sub.w r4, r1, #8 -; VFP2-NEXT: sub.w r5, r0, #8 -; VFP2-NEXT: vldr s16, .LCPI54_0 -; VFP2-NEXT: mov.w r8, #-1 -; VFP2-NEXT: mov.w r9, #-2147483648 -; VFP2-NEXT: mov.w r6, #1024 -; VFP2-NEXT: mvn r7, #-2147483648 +; VFP2-NEXT: subs r1, #8 +; VFP2-NEXT: subs r0, #8 +; VFP2-NEXT: vldr s0, .LCPI54_0 +; VFP2-NEXT: mov.w r2, #1024 ; VFP2-NEXT: .LBB54_1: @ =>This Inner Loop Header: Depth=1 -; VFP2-NEXT: vldr s0, [r4, #8] -; VFP2-NEXT: vmul.f32 s0, s0, s16 -; VFP2-NEXT: vmov r0, s0 -; VFP2-NEXT: bl __aeabi_f2lz -; VFP2-NEXT: subs r2, r0, r7 -; VFP2-NEXT: sbcs r2, r1, #0 -; VFP2-NEXT: mov.w r2, #0 -; VFP2-NEXT: it lt -; VFP2-NEXT: movlt r2, #1 -; VFP2-NEXT: cmp r2, #0 -; VFP2-NEXT: ite ne -; VFP2-NEXT: movne r2, r1 -; VFP2-NEXT: moveq r0, r7 -; VFP2-NEXT: subs.w r1, r9, r0 -; VFP2-NEXT: sbcs.w r1, r8, r2 -; VFP2-NEXT: it ge -; VFP2-NEXT: movge r0, r9 -; VFP2-NEXT: str r0, [r5, #8]! -; VFP2-NEXT: vldr s0, [r4, #12] -; VFP2-NEXT: vmul.f32 s0, s0, s16 -; VFP2-NEXT: vmov r0, s0 -; VFP2-NEXT: bl __aeabi_f2lz -; VFP2-NEXT: subs r2, r0, r7 -; VFP2-NEXT: add.w r4, r4, #8 -; VFP2-NEXT: sbcs r2, r1, #0 -; VFP2-NEXT: mov.w r2, #0 -; VFP2-NEXT: it lt -; VFP2-NEXT: movlt r2, #1 -; VFP2-NEXT: cmp r2, #0 -; VFP2-NEXT: ite ne -; VFP2-NEXT: movne r2, r1 -; VFP2-NEXT: moveq r0, r7 -; VFP2-NEXT: subs.w r1, r9, r0 -; VFP2-NEXT: sbcs.w r1, r8, r2 -; VFP2-NEXT: it ge -; VFP2-NEXT: movge r0, r9 -; VFP2-NEXT: subs r6, #2 -; VFP2-NEXT: str r0, [r5, #4] +; VFP2-NEXT: vldr s2, [r1, #8] +; VFP2-NEXT: subs r2, #2 +; VFP2-NEXT: vmul.f32 s2, s2, s0 +; VFP2-NEXT: vcvt.s32.f32 s2, s2 +; VFP2-NEXT: vmov r3, s2 +; VFP2-NEXT: str r3, [r0, #8]! +; VFP2-NEXT: vldr s2, [r1, #12] +; VFP2-NEXT: add.w r1, r1, #8 +; VFP2-NEXT: vmul.f32 s2, s2, s0 +; VFP2-NEXT: vcvt.s32.f32 s2, s2 +; VFP2-NEXT: vstr s2, [r0, #4] ; VFP2-NEXT: bne .LBB54_1 ; VFP2-NEXT: @ %bb.2: -; VFP2-NEXT: vpop {d8} -; VFP2-NEXT: add sp, #4 -; VFP2-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} +; VFP2-NEXT: bx lr ; VFP2-NEXT: .p2align 2 ; VFP2-NEXT: @ %bb.3: ; VFP2-NEXT: .LCPI54_0: @@ -5293,57 +5255,26 @@ ; ; FULL-LABEL: unroll_maxmin: ; FULL: @ %bb.0: -; FULL-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} -; FULL-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} -; FULL-NEXT: .pad #4 -; FULL-NEXT: sub sp, #4 -; FULL-NEXT: .vsave {d8} -; FULL-NEXT: vpush {d8} -; FULL-NEXT: mov.w r2, #512 -; FULL-NEXT: sub.w r5, r1, #8 -; FULL-NEXT: sub.w r6, r0, #8 -; FULL-NEXT: vldr s16, .LCPI54_0 -; FULL-NEXT: mov r4, r2 -; FULL-NEXT: mov.w r8, #-1 -; FULL-NEXT: mov.w r9, #-2147483648 -; FULL-NEXT: mvn r7, #-2147483648 +; FULL-NEXT: .save {r7, lr} +; FULL-NEXT: push {r7, lr} +; FULL-NEXT: mov.w lr, #512 +; FULL-NEXT: subs r1, #8 +; FULL-NEXT: subs r0, #8 +; FULL-NEXT: vldr s0, .LCPI54_0 ; FULL-NEXT: .LBB54_1: @ =>This Inner Loop Header: Depth=1 -; FULL-NEXT: vldr s0, [r5, #8] -; FULL-NEXT: vmul.f32 s0, s0, s16 -; FULL-NEXT: vmov r0, s0 -; FULL-NEXT: bl __aeabi_f2lz -; FULL-NEXT: subs r2, r0, r7 -; FULL-NEXT: sbcs r2, r1, #0 -; FULL-NEXT: cset r2, lt -; FULL-NEXT: cmp r2, #0 -; FULL-NEXT: csel r0, r0, r7, ne -; FULL-NEXT: csel r1, r1, r2, ne -; FULL-NEXT: subs.w r2, r9, r0 -; FULL-NEXT: sbcs.w r1, r8, r1 -; FULL-NEXT: csel r0, r0, r9, lt -; FULL-NEXT: str r0, [r6, #8]! -; FULL-NEXT: vldr s0, [r5, #12] -; FULL-NEXT: vmul.f32 s0, s0, s16 -; FULL-NEXT: vmov r0, s0 -; FULL-NEXT: bl __aeabi_f2lz -; FULL-NEXT: subs r2, r0, r7 -; FULL-NEXT: add.w r5, r5, #8 -; FULL-NEXT: sbcs r2, r1, #0 -; FULL-NEXT: sub.w r4, r4, #1 -; FULL-NEXT: cset r2, lt -; FULL-NEXT: cmp r2, #0 -; FULL-NEXT: csel r0, r0, r7, ne -; FULL-NEXT: csel r1, r1, r2, ne -; FULL-NEXT: subs.w r2, r9, r0 -; FULL-NEXT: sbcs.w r1, r8, r1 -; FULL-NEXT: csel r0, r0, r9, lt -; FULL-NEXT: str r0, [r6, #4] -; FULL-NEXT: cbz r4, .LBB54_2 -; FULL-NEXT: le .LBB54_1 -; FULL-NEXT: .LBB54_2: -; FULL-NEXT: vpop {d8} -; FULL-NEXT: add sp, #4 -; FULL-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} +; FULL-NEXT: vldr s2, [r1, #8] +; FULL-NEXT: vmul.f32 s2, s2, s0 +; FULL-NEXT: vcvt.s32.f32 s2, s2 +; FULL-NEXT: vmov r2, s2 +; FULL-NEXT: str r2, [r0, #8]! +; FULL-NEXT: vldr s2, [r1, #12] +; FULL-NEXT: adds r1, #8 +; FULL-NEXT: vmul.f32 s2, s2, s0 +; FULL-NEXT: vcvt.s32.f32 s2, s2 +; FULL-NEXT: vstr s2, [r0, #4] +; FULL-NEXT: le lr, .LBB54_1 +; FULL-NEXT: @ %bb.2: +; FULL-NEXT: pop {r7, pc} ; FULL-NEXT: .p2align 2 ; FULL-NEXT: @ %bb.3: ; FULL-NEXT: .LCPI54_0: @@ -5494,63 +5425,25 @@ ; ; VFP2-LABEL: unroll_minmax: ; VFP2: @ %bb.0: -; VFP2-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} -; VFP2-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} -; VFP2-NEXT: .pad #4 -; VFP2-NEXT: sub sp, #4 -; VFP2-NEXT: .vsave {d8} -; VFP2-NEXT: vpush {d8} -; VFP2-NEXT: sub.w r4, r1, #8 -; VFP2-NEXT: sub.w r5, r0, #8 -; VFP2-NEXT: vldr s16, .LCPI55_0 -; VFP2-NEXT: mov.w r8, #-1 -; VFP2-NEXT: mov.w r9, #-2147483648 -; VFP2-NEXT: mov.w r6, #1024 -; VFP2-NEXT: mvn r7, #-2147483648 +; VFP2-NEXT: subs r1, #8 +; VFP2-NEXT: subs r0, #8 +; VFP2-NEXT: vldr s0, .LCPI55_0 +; VFP2-NEXT: mov.w r2, #1024 ; VFP2-NEXT: .LBB55_1: @ =>This Inner Loop Header: Depth=1 -; VFP2-NEXT: vldr s0, [r4, #8] -; VFP2-NEXT: vmul.f32 s0, s0, s16 -; VFP2-NEXT: vmov r0, s0 -; VFP2-NEXT: bl __aeabi_f2lz -; VFP2-NEXT: subs.w r2, r9, r0 -; VFP2-NEXT: sbcs.w r2, r8, r1 -; VFP2-NEXT: mov.w r2, #0 -; VFP2-NEXT: it lt -; VFP2-NEXT: movlt r2, #1 -; VFP2-NEXT: cmp r2, #0 -; VFP2-NEXT: itt eq -; VFP2-NEXT: moveq r1, r8 -; VFP2-NEXT: moveq r0, r9 -; VFP2-NEXT: subs r2, r0, r7 -; VFP2-NEXT: sbcs r1, r1, #0 -; VFP2-NEXT: it ge -; VFP2-NEXT: movge r0, r7 -; VFP2-NEXT: str r0, [r5, #8]! -; VFP2-NEXT: vldr s0, [r4, #12] -; VFP2-NEXT: vmul.f32 s0, s0, s16 -; VFP2-NEXT: vmov r0, s0 -; VFP2-NEXT: bl __aeabi_f2lz -; VFP2-NEXT: subs.w r2, r9, r0 -; VFP2-NEXT: add.w r4, r4, #8 -; VFP2-NEXT: sbcs.w r2, r8, r1 -; VFP2-NEXT: mov.w r2, #0 -; VFP2-NEXT: it lt -; VFP2-NEXT: movlt r2, #1 -; VFP2-NEXT: cmp r2, #0 -; VFP2-NEXT: itt eq -; VFP2-NEXT: moveq r1, r8 -; VFP2-NEXT: moveq r0, r9 -; VFP2-NEXT: subs r2, r0, r7 -; VFP2-NEXT: sbcs r1, r1, #0 -; VFP2-NEXT: it ge -; VFP2-NEXT: movge r0, r7 -; VFP2-NEXT: subs r6, #2 -; VFP2-NEXT: str r0, [r5, #4] +; VFP2-NEXT: vldr s2, [r1, #8] +; VFP2-NEXT: subs r2, #2 +; VFP2-NEXT: vmul.f32 s2, s2, s0 +; VFP2-NEXT: vcvt.s32.f32 s2, s2 +; VFP2-NEXT: vmov r3, s2 +; VFP2-NEXT: str r3, [r0, #8]! +; VFP2-NEXT: vldr s2, [r1, #12] +; VFP2-NEXT: add.w r1, r1, #8 +; VFP2-NEXT: vmul.f32 s2, s2, s0 +; VFP2-NEXT: vcvt.s32.f32 s2, s2 +; VFP2-NEXT: vstr s2, [r0, #4] ; VFP2-NEXT: bne .LBB55_1 ; VFP2-NEXT: @ %bb.2: -; VFP2-NEXT: vpop {d8} -; VFP2-NEXT: add sp, #4 -; VFP2-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} +; VFP2-NEXT: bx lr ; VFP2-NEXT: .p2align 2 ; VFP2-NEXT: @ %bb.3: ; VFP2-NEXT: .LCPI55_0: @@ -5558,57 +5451,26 @@ ; ; FULL-LABEL: unroll_minmax: ; FULL: @ %bb.0: -; FULL-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} -; FULL-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} -; FULL-NEXT: .pad #4 -; FULL-NEXT: sub sp, #4 -; FULL-NEXT: .vsave {d8} -; FULL-NEXT: vpush {d8} -; FULL-NEXT: mov.w r2, #512 -; FULL-NEXT: sub.w r5, r1, #8 -; FULL-NEXT: sub.w r6, r0, #8 -; FULL-NEXT: vldr s16, .LCPI55_0 -; FULL-NEXT: mov r4, r2 -; FULL-NEXT: mov.w r8, #-1 -; FULL-NEXT: mov.w r9, #-2147483648 -; FULL-NEXT: mvn r7, #-2147483648 +; FULL-NEXT: .save {r7, lr} +; FULL-NEXT: push {r7, lr} +; FULL-NEXT: mov.w lr, #512 +; FULL-NEXT: subs r1, #8 +; FULL-NEXT: subs r0, #8 +; FULL-NEXT: vldr s0, .LCPI55_0 ; FULL-NEXT: .LBB55_1: @ =>This Inner Loop Header: Depth=1 -; FULL-NEXT: vldr s0, [r5, #8] -; FULL-NEXT: vmul.f32 s0, s0, s16 -; FULL-NEXT: vmov r0, s0 -; FULL-NEXT: bl __aeabi_f2lz -; FULL-NEXT: subs.w r2, r9, r0 -; FULL-NEXT: sbcs.w r2, r8, r1 -; FULL-NEXT: cset r2, lt -; FULL-NEXT: cmp r2, #0 -; FULL-NEXT: csel r0, r0, r9, ne -; FULL-NEXT: csel r1, r1, r8, ne -; FULL-NEXT: subs r2, r0, r7 -; FULL-NEXT: sbcs r1, r1, #0 -; FULL-NEXT: csel r0, r0, r7, lt -; FULL-NEXT: str r0, [r6, #8]! -; FULL-NEXT: vldr s0, [r5, #12] -; FULL-NEXT: vmul.f32 s0, s0, s16 -; FULL-NEXT: vmov r0, s0 -; FULL-NEXT: bl __aeabi_f2lz -; FULL-NEXT: subs.w r2, r9, r0 -; FULL-NEXT: add.w r5, r5, #8 -; FULL-NEXT: sbcs.w r2, r8, r1 -; FULL-NEXT: sub.w r4, r4, #1 -; FULL-NEXT: cset r2, lt -; FULL-NEXT: cmp r2, #0 -; FULL-NEXT: csel r0, r0, r9, ne -; FULL-NEXT: csel r1, r1, r8, ne -; FULL-NEXT: subs r2, r0, r7 -; FULL-NEXT: sbcs r1, r1, #0 -; FULL-NEXT: csel r0, r0, r7, lt -; FULL-NEXT: str r0, [r6, #4] -; FULL-NEXT: cbz r4, .LBB55_2 -; FULL-NEXT: le .LBB55_1 -; FULL-NEXT: .LBB55_2: -; FULL-NEXT: vpop {d8} -; FULL-NEXT: add sp, #4 -; FULL-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} +; FULL-NEXT: vldr s2, [r1, #8] +; FULL-NEXT: vmul.f32 s2, s2, s0 +; FULL-NEXT: vcvt.s32.f32 s2, s2 +; FULL-NEXT: vmov r2, s2 +; FULL-NEXT: str r2, [r0, #8]! +; FULL-NEXT: vldr s2, [r1, #12] +; FULL-NEXT: adds r1, #8 +; FULL-NEXT: vmul.f32 s2, s2, s0 +; FULL-NEXT: vcvt.s32.f32 s2, s2 +; FULL-NEXT: vstr s2, [r0, #4] +; FULL-NEXT: le lr, .LBB55_1 +; FULL-NEXT: @ %bb.2: +; FULL-NEXT: pop {r7, pc} ; FULL-NEXT: .p2align 2 ; FULL-NEXT: @ %bb.3: ; FULL-NEXT: .LCPI55_0: