Index: llvm/trunk/lib/Target/ARM/ARMISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMISelLowering.h
+++ llvm/trunk/lib/Target/ARM/ARMISelLowering.h
@@ -694,6 +694,9 @@
   unsigned getRegisterByName(const char* RegName, EVT VT,
                              SelectionDAG &DAG) const override;
 
+    SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
+                          SmallVectorImpl<SDNode *> &Created) const override;
+
   /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
   /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
   /// expanded to FMAs when this method returns true, otherwise fmuladd is
Index: llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
@@ -7794,6 +7794,50 @@
   return LowerCallTo(CLI).first;
 }
 
+// This is a code size optimisation: return the original SDIV node to
+// DAGCombiner when we don't want to expand SDIV into a sequence of
+// instructions, and an empty node otherwise which will cause the
+// SDIV to be expanded in DAGCombine.
+SDValue
+ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
+                                 SelectionDAG &DAG,
+                                 SmallVectorImpl<SDNode *> &Created) const {
+  // TODO: Support SREM
+  if (N->getOpcode() != ISD::SDIV)
+    return SDValue();
+
+  const auto &ST = static_cast<const ARMSubtarget &>(DAG.getSubtarget());
+  const auto &MF = DAG.getMachineFunction();
+  const bool MinSize = MF.getFunction().optForMinSize();
+  const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode()
+                                      : ST.hasDivideInARMMode();
+
+  // Don't touch vector types; rewriting this may lead to scalarizing
+  // the int divs.
+  if (N->getOperand(0).getValueType().isVector())
+    return SDValue();
+
+  // Bail if MinSize is not set, and also for both ARM and Thumb mode we need
+  // hwdiv support for this to be really profitable.
+  if (!(MinSize && HasDivide))
+    return SDValue();
+
+  // ARM mode is a bit simpler than Thumb: we can handle large power
+  // of 2 immediates with 1 mov instruction; no further checks required,
+  // just return the sdiv node.
+  if (!ST.isThumb())
+    return SDValue(N, 0);
+
+  // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV,
+  // and thus lose the code size benefits of a MOVS that requires only 2.
+  // TargetTransformInfo and 'getIntImmCodeSizeCost' could be helpful here,
+  // but as it's doing exactly this, it's not worth the trouble to get TTI.
+  if (Divisor.sgt(128))
+    return SDValue();
+
+  return SDValue(N, 0);
+}
+
 SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
                                             bool Signed) const {
   assert(Op.getValueType() == MVT::i32 &&
Index: llvm/trunk/test/CodeGen/ARM/sdiv-pow2-arm-size.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/sdiv-pow2-arm-size.ll
+++ llvm/trunk/test/CodeGen/ARM/sdiv-pow2-arm-size.ll
@@ -0,0 +1,79 @@
+; RUN: llc -mtriple=armv7a -mattr=+hwdiv-arm %s -o - | FileCheck %s --check-prefixes=CHECK,DIV
+; RUN: llc -mtriple=armv7a -mattr=-hwdiv-arm %s -o - | FileCheck %s --check-prefixes=CHECK,NODIV
+
+; Check SREM
+define dso_local i32 @test_rem(i32 %F) local_unnamed_addr #0 {
+; CHECK-LABEL: test_rem
+; CHECK: asr r1, r0, #31
+; CHECK-NEXT: add r1, r0, r1, lsr #30
+; CHECK-NEXT: bic r1, r1, #3
+; CHECK-NEXT: sub r0, r0, r1
+
+entry:
+  %div = srem i32 %F, 4
+  ret i32 %div
+}
+
+; Try an i16 sdiv, with a small immediate.
+define dso_local signext i16 @f0(i16 signext %F) local_unnamed_addr #0 {
+; CHECK-LABEL: f0
+
+; DIV: mov r1, #2
+; DIV-NEXT: sdiv r0, r0, r1
+; DIV-NEXT: sxth r0, r0
+; DIV-NEXT: bx lr
+
+; NODIV: uxth r1, r0
+; NODIV-NEXT: add r0, r0, r1, lsr #15
+; NODIV-NEXT: sxth r0, r0
+; NODIV-NEXT: asr r0, r0, #1
+; NODIV-NEXT: bx lr
+
+entry:
+  %0 = sdiv i16 %F, 2
+  ret i16 %0
+}
+
+; Try an i32 sdiv, with a small immediate.
+define dso_local i32 @f1(i32 %F) local_unnamed_addr #0 {
+; CHECK-LABEL: f1
+
+; DIV: mov r1, #4
+; DIV-NEXT: sdiv r0, r0, r1
+; DIV-NEXT: bx lr
+
+; NODIV: asr r1, r0, #31
+; NODIV-NEXT: add r0, r0, r1, lsr #30
+; NODIV-NEXT: asr r0, r0, #2
+; NODIV-NEXT: bx lr
+
+entry:
+  %div = sdiv i32 %F, 4
+  ret i32 %div
+}
+
+; Try a large power of 2 immediate, which should also be materialised with 1
+; move immediate instruction.
+define dso_local i32 @f2(i32 %F) local_unnamed_addr #0 {
+; CHECK-LABEL: f2
+; DIV: mov r1, #131072
+; DIV-NEXT: sdiv r0, r0, r1
+; DIV-NEXT: bx lr
+entry:
+  %div = sdiv i32 %F, 131072
+  ret i32 %div
+}
+
+; MinSize not set, so should expand to the faster but longer sequence.
+define dso_local i32 @f3(i32 %F) {
+; CHECK-LABEL: f3
+; CHECK: asr r1, r0, #31
+; CHECK-NEXT: add r0, r0, r1, lsr #30
+; CHECK-NEXT: asr r0, r0, #2
+; CHECK-NEXT: bx lr
+entry:
+  %div = sdiv i32 %F, 4
+  ret i32 %div
+}
+
+attributes #0 = { minsize norecurse nounwind optsize readnone }
Index: llvm/trunk/test/CodeGen/ARM/sdiv-pow2-thumb-size.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/sdiv-pow2-thumb-size.ll
+++ llvm/trunk/test/CodeGen/ARM/sdiv-pow2-thumb-size.ll
@@ -0,0 +1,105 @@
+; RUN: llc -mtriple=thumbv8 %s -o - | FileCheck %s --check-prefixes=CHECK,T2
+; RUN: llc -mtriple=thumbv8m.main %s -o - | FileCheck %s --check-prefixes=CHECK,T2
+; RUN: llc -mtriple=thumbv8m.base %s -o - | FileCheck %s --check-prefixes=CHECK,T1
+; RUN: llc -mtriple=thumbv7em %s -o - | FileCheck %s --check-prefixes=CHECK,T2
+; RUN: llc -mtriple=thumbv6m %s -o - | FileCheck %s --check-prefixes=V6M
+
+; Armv6m targets don't have a sdiv instruction, so sdiv should not appear at
+; all in the output:
+
+; V6M: .file {{.*}}
+; V6M-NOT: sdiv
+; V6M-NOT: idiv
+
+; Test sdiv i16
+define dso_local signext i16 @f0(i16 signext %F) local_unnamed_addr #0 {
+; CHECK-LABEL: f0
+; CHECK: movs r1, #2
+; CHECK-NEXT: sdiv r0, r0, r1
+; CHECK-NEXT: sxth r0, r0
+; CHECK-NEXT: bx lr
+
+entry:
+  %0 = sdiv i16 %F, 2
+  ret i16 %0
+}
+
+; Same as above, but now with i32
+define dso_local i32 @f1(i32 %F) local_unnamed_addr #0 {
+; CHECK-LABEL: f1
+; CHECK: movs r1, #4
+; CHECK-NEXT: sdiv r0, r0, r1
+; CHECK-NEXT: bx lr
+
+entry:
+  %div = sdiv i32 %F, 4
+  ret i32 %div
+}
+
+; The immediate is not a power of 2, so we expect a sdiv.
+define dso_local i32 @f2(i32 %F) local_unnamed_addr #0 {
+; CHECK-LABEL: f2
+; CHECK: movs r1, #5
+; CHECK-NEXT: sdiv r0, r0, r1
+; CHECK-NEXT: bx lr
+
+entry:
+  %div = sdiv i32 %F, 5
+  ret i32 %div
+}
+
+; Try a larger power of 2 immediate: immediates larger than
+; 128 don't give any code size savings.
+define dso_local i32 @f3(i32 %F) local_unnamed_addr #0 {
+; CHECK-LABEL: f3
+; CHECK-NOT: sdiv
+entry:
+  %div = sdiv i32 %F, 256
+  ret i32 %div
+}
+
+attributes #0 = { minsize norecurse nounwind optsize readnone }
+
+
+; These functions don't have the minsize attribute set, so should not lower
+; the sdiv to sdiv, but to the faster instruction sequence.
+
+define dso_local signext i16 @f4(i16 signext %F) {
+; T2-LABEL: f4
+; T2: uxth r1, r0
+; T2-NEXT: add.w r0, r0, r1, lsr #15
+; T2-NEXT: sxth r0, r0
+; T2-NEXT: asrs r0, r0, #1
+; T2-NEXT: bx lr
+
+; T1-LABEL: f4
+; T1: uxth r1, r0
+; T1-NEXT: lsrs r1, r1, #15
+; T1-NEXT: adds r0, r0, r1
+; T1-NEXT: sxth r0, r0
+; T1-NEXT: asrs r0, r0, #1
+; T1-NEXT: bx lr
+
+entry:
+  %0 = sdiv i16 %F, 2
+  ret i16 %0
+}
+
+define dso_local i32 @f5(i32 %F) {
+; T2-LABEL: f5
+; T2: asrs r1, r0, #31
+; T2-NEXT: add.w r0, r0, r1, lsr #30
+; T2-NEXT: asrs r0, r0, #2
+; T2-NEXT: bx lr
+
+; T1-LABEL: f5
+; T1: asrs r1, r0, #31
+; T1-NEXT: lsrs r1, r1, #30
+; T1-NEXT: adds r0, r0, r1
+; T1-NEXT: asrs r0, r0, #2
+; T1-NEXT: bx lr
+
+entry:
+  %div = sdiv i32 %F, 4
+  ret i32 %div
+}