Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -231,7 +231,7 @@
   /// several shifts, adds, and multiplies for this target.
   /// The definition of "cheaper" may depend on whether we're optimizing
   /// for speed or for size.
-  virtual bool isIntDivCheap(EVT VT, AttributeSet Attr) const {
+  virtual bool isIntDivCheap(EVT VT, AttributeSet Attr, bool Signed) const {
     return false;
   }
 
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2369,14 +2369,14 @@
   // alternate sequence.  Targets may check function attributes for size/speed
   // trade-offs.
   AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes();
-  if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr))
+  if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr, true))
     if (SDValue Op = BuildSDIV(N))
       return Op;
 
   // sdiv, srem -> sdivrem
   // If the divisor is constant, then return DIVREM only if isIntDivCheap() is true.
   // Otherwise, we break the simplification logic in visitREM().
-  if (!N1C || TLI.isIntDivCheap(N->getValueType(0), Attr))
+  if (!N1C || TLI.isIntDivCheap(N->getValueType(0), Attr, true))
     if (SDValue DivRem = useDivRem(N))
       return DivRem;
 
@@ -2434,14 +2434,14 @@
   // fold (udiv x, c) -> alternate
   AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes();
-  if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr))
+  if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr, false))
     if (SDValue Op = BuildUDIV(N))
       return Op;
 
   // sdiv, srem -> sdivrem
   // If the divisor is constant, then return DIVREM only if isIntDivCheap() is true.
   // Otherwise, we break the simplification logic in visitREM().
-  if (!N1C || TLI.isIntDivCheap(N->getValueType(0), Attr))
+  if (!N1C || TLI.isIntDivCheap(N->getValueType(0), Attr, false))
     if (SDValue DivRem = useDivRem(N))
       return DivRem;
 
@@ -2506,7 +2506,7 @@
   // div is not cheap, combine will not return a DIVREM.  Regardless,
   // checking cheapness here makes sense since the simplification results in
   // fatter code.
-  if (N1C && !N1C->isNullValue() && !TLI.isIntDivCheap(VT, Attr)) {
+  if (N1C && !N1C->isNullValue() && !TLI.isIntDivCheap(VT, Attr, isSigned)) {
     unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
     SDValue Div = DAG.getNode(DivOpcode, DL, VT, N0, N1);
     AddToWorklist(Div.getNode());
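Note: the extra Signed argument lets a target answer "is integer division cheap?"
separately for sdiv/srem and udiv/urem; the DAGCombiner changes above pass true on
the signed paths and false on the unsigned ones, and the BPF change further down
relies on exactly this distinction. As a rough illustration only (MyTargetLowering
and its policy are hypothetical, not part of this patch), a target with a hardware
unsigned divider but no signed one could now write:

  // Hypothetical override, not part of this patch: keep udiv/urem as real
  // divide instructions, but let the combiner expand sdiv/srem via BuildSDIV.
  bool MyTargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr,
                                       bool Signed) const {
    return !Signed && VT.isInteger();
  }
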
Index: lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -2921,7 +2921,7 @@
                                     std::vector<SDNode *> *Created) const {
   AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes();
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  if (TLI.isIntDivCheap(N->getValueType(0), Attr))
+  if (TLI.isIntDivCheap(N->getValueType(0), Attr, true))
     return SDValue(N,0); // Lower SDIV as SDIV
   return SDValue();
 }
@@ -2950,19 +2950,29 @@
   APInt::ms magics = Divisor.magic();
 
   // Multiply the numerator (operand 0) by the magic value
-  // FIXME: We should support doing a MUL in a wider type
+  // FIXME: expand using MULHS for vector types after addressing possible
+  // regressions in the X86 backend.
+  unsigned Opcode;
+  if (IsAfterLegalization ? isOperationLegal(ISD::MULHS, VT)
+                          : isOperationLegalOrCustom(ISD::MULHS, VT))
+    Opcode = ISD::MULHS;
+  else if (IsAfterLegalization ? isOperationLegal(ISD::SMUL_LOHI, VT)
+                               : isOperationLegalOrCustom(ISD::SMUL_LOHI, VT))
+    Opcode = ISD::SMUL_LOHI;
+  else if (!IsAfterLegalization && !VT.isVector())
+    Opcode = ISD::MULHS;
+  else
+    return SDValue();
+
   SDValue Q;
-  if (IsAfterLegalization ? isOperationLegal(ISD::MULHS, VT) :
-                            isOperationLegalOrCustom(ISD::MULHS, VT))
+  if (Opcode == ISD::MULHS)
     Q = DAG.getNode(ISD::MULHS, dl, VT, N->getOperand(0),
                     DAG.getConstant(magics.m, dl, VT));
-  else if (IsAfterLegalization ? isOperationLegal(ISD::SMUL_LOHI, VT) :
-                                 isOperationLegalOrCustom(ISD::SMUL_LOHI, VT))
+  else
     Q = SDValue(DAG.getNode(ISD::SMUL_LOHI, dl, DAG.getVTList(VT, VT),
                             N->getOperand(0),
                             DAG.getConstant(magics.m, dl, VT)).getNode(), 1);
-  else
-    return SDValue();       // No mulhs or equvialent
+
   // If d > 0 and m < 0, add the numerator
   if (Divisor.isStrictlyPositive() && magics.m.isNegative()) {
     Q = DAG.getNode(ISD::ADD, dl, VT, Q, N->getOperand(0));
@@ -3029,16 +3039,25 @@
   }
 
   // Multiply the numerator (operand 0) by the magic value
-  // FIXME: We should support doing a MUL in a wider type
-  if (IsAfterLegalization ? isOperationLegal(ISD::MULHU, VT) :
-                            isOperationLegalOrCustom(ISD::MULHU, VT))
+  // FIXME: expand using MULHU for vector types after addressing possible
+  // regressions in the X86 backend.
+  unsigned Opcode;
+  if (IsAfterLegalization ? isOperationLegal(ISD::MULHU, VT)
+                          : isOperationLegalOrCustom(ISD::MULHU, VT))
+    Opcode = ISD::MULHU;
+  else if (IsAfterLegalization ? isOperationLegal(ISD::UMUL_LOHI, VT)
+                               : isOperationLegalOrCustom(ISD::UMUL_LOHI, VT))
+    Opcode = ISD::UMUL_LOHI;
+  else if (!IsAfterLegalization && !VT.isVector())
+    Opcode = ISD::MULHU;
+  else
+    return SDValue();
+
+  if (Opcode == ISD::MULHU)
     Q = DAG.getNode(ISD::MULHU, dl, VT, Q, DAG.getConstant(magics.m, dl, VT));
-  else if (IsAfterLegalization ? isOperationLegal(ISD::UMUL_LOHI, VT) :
-                                 isOperationLegalOrCustom(ISD::UMUL_LOHI, VT))
+  else
     Q = SDValue(DAG.getNode(ISD::UMUL_LOHI, dl, DAG.getVTList(VT, VT), Q,
                             DAG.getConstant(magics.m, dl, VT)).getNode(), 1);
-  else
-    return SDValue();       // No mulhu or equivalent
 
   Created->push_back(Q.getNode());
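Note: for reference, the scalar code that BuildSDIV/BuildUDIV emit for a constant
divisor of 7 corresponds roughly to the C++ below. The 32-bit magic constants match
the AMDGPU tests added later in this patch (0x92492493 for sdiv, 0x24924925 for
udiv); the shift amounts and fix-up order follow the usual magic-number
construction and are a sketch, not code copied from the compiler:

  #include <cstdint>

  // Signed n / 7: take the high half of the widened multiply (MULHS), add the
  // numerator back because the magic constant is negative, then shift and add
  // the sign bit so the result rounds toward zero.
  int32_t sdiv7(int32_t n) {
    const int32_t Magic = (int32_t)0x92492493;          // magics.m
    int32_t q = (int32_t)(((int64_t)n * Magic) >> 32);  // MULHS
    q += n;                                             // d > 0 and m < 0
    q >>= 2;                                            // magics.s
    q += (uint32_t)n >> 31;                             // fix up negative n
    return q;
  }

  // Unsigned n / 7: this magic constant needs the "add" fix-up, so the
  // expansion is q = MULHU(n, magic); t = ((n - q) >> 1) + q; result = t >> 2.
  uint32_t udiv7(uint32_t n) {
    uint32_t q = (uint32_t)(((uint64_t)n * 0x24924925u) >> 32);  // MULHU
    uint32_t t = ((n - q) >> 1) + q;
    return t >> 2;
  }
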
Index: lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.h
+++ lib/Target/AArch64/AArch64ISelLowering.h
@@ -402,7 +402,7 @@
     return AArch64::X1;
   }
 
-  bool isIntDivCheap(EVT VT, AttributeSet Attr) const override;
+  bool isIntDivCheap(EVT VT, AttributeSet Attr, bool Signed) const override;
 
   bool isCheapToSpeculateCttz() const override {
     return true;
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7611,7 +7611,7 @@
                                      SelectionDAG &DAG,
                                      std::vector<SDNode *> *Created) const {
   AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes();
-  if (isIntDivCheap(N->getValueType(0), Attr))
+  if (isIntDivCheap(N->getValueType(0), Attr, true))
     return SDValue(N,0); // Lower SDIV as SDIV
 
   // fold (sdiv X, pow2)
@@ -10621,7 +10621,8 @@
   }
 }
 
-bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
+bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr,
+                                          bool Signed) const {
  // Integer division on AArch64 is expensive. However, when aggressively
  // optimizing for code size, we prefer to use a div instruction, as it is
  // usually smaller than the alternative sequence.
Index: lib/Target/AMDGPU/SOPInstructions.td
===================================================================
--- lib/Target/AMDGPU/SOPInstructions.td
+++ lib/Target/AMDGPU/SOPInstructions.td
@@ -910,6 +910,12 @@
   (S_ADD_U32 $src0, $src1)
 >;
 
+// Similarly for V_SUB_I32/S_SUB_U32.
+def : Pat <
+  (i32 (subc i32:$src0, i32:$src1)),
+  (S_SUB_U32 $src0, $src1)
+>;
+
 // FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that
 // REG_SEQUENCE patterns don't support instructions with multiple
 // outputs.
@@ -932,7 +938,6 @@
 >;
 
 
-
 //===----------------------------------------------------------------------===//
 // SOPP Patterns
 //===----------------------------------------------------------------------===//
Index: lib/Target/BPF/BPFISelLowering.h
===================================================================
--- lib/Target/BPF/BPFISelLowering.h
+++ lib/Target/BPF/BPFISelLowering.h
@@ -46,6 +46,8 @@
       EmitInstrWithCustomInserter(MachineInstr &MI,
                                   MachineBasicBlock *BB) const override;
 
+  bool isIntDivCheap(EVT VT, AttributeSet Attr, bool Signed) const override;
+
 private:
   SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
Index: lib/Target/BPF/BPFISelLowering.cpp
===================================================================
--- lib/Target/BPF/BPFISelLowering.cpp
+++ lib/Target/BPF/BPFISelLowering.cpp
@@ -132,6 +132,14 @@
   MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 128;
 }
 
+bool BPFTargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr,
+                                      bool Signed) const {
+  // We don't want to apply optimizations to SDIV, so that the resulting
+  // error messages about not having signed division do not depend on
+  // optimizations.
+  return Signed;
+}
+
 SDValue BPFTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
   case ISD::BR_CC:
Index: lib/Target/WebAssembly/WebAssemblyISelLowering.h
===================================================================
--- lib/Target/WebAssembly/WebAssemblyISelLowering.h
+++ lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -58,7 +58,7 @@
                      unsigned AS) const override;
   bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace, unsigned Align,
                                       bool *Fast) const override;
-  bool isIntDivCheap(EVT VT, AttributeSet Attr) const override;
+  bool isIntDivCheap(EVT VT, AttributeSet Attr, bool Signed) const override;
   SDValue LowerCall(CallLoweringInfo &CLI,
                     SmallVectorImpl<SDValue> &InVals) const override;
Index: lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
===================================================================
--- lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -253,7 +253,8 @@
   return true;
 }
 
-bool WebAssemblyTargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
+bool WebAssemblyTargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr,
+                                              bool Signed) const {
   // The current thinking is that wasm engines will perform this optimization,
   // so we can save on code size.
   return true;
Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -1030,7 +1030,7 @@
     /// \brief Customize the preferred legalization strategy for certain types.
     LegalizeTypeAction getPreferredVectorAction(EVT VT) const override;
 
-    bool isIntDivCheap(EVT VT, AttributeSet Attr) const override;
+    bool isIntDivCheap(EVT VT, AttributeSet Attr, bool Signed) const override;
 
     bool supportSwiftError() const override;
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -33383,7 +33383,8 @@
   return -1;
 }
 
-bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
+bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr,
+                                      bool Signed) const {
  // Integer division on x86 is expensive. However, when aggressively optimizing
  // for code size, we prefer to use a div instruction, as it is usually smaller
  // than the alternative sequence.
Index: test/CodeGen/AMDGPU/sdiv.ll
===================================================================
--- test/CodeGen/AMDGPU/sdiv.ll
+++ test/CodeGen/AMDGPU/sdiv.ll
@@ -136,6 +136,27 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}sdiv_i32_const:
+; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x92492493
+; SI-NOT: v_rcp
+define void @sdiv_i32_const(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %num = load i32, i32 addrspace(1)* %in
+  %result = sdiv i32 %num, 7
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sdiv_i64_const:
+; SI-DAG: s_mov_b32 [[MAGIC_LO:s[0-9]+]], 0x24924925
+; SI-DAG: s_mov_b32 [[MAGIC_HI:s[0-9]+]], 0x49249249
+; SI-NOT: v_rcp
+define void @sdiv_i64_const(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+  %num = load i64, i64 addrspace(1)* %in
+  %result = sdiv i64 %num, 7
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
 ; Tests for 64-bit divide bypass.
 ; define void @test_get_quotient(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
 ;   %result = sdiv i64 %a, %b
Index: test/CodeGen/AMDGPU/udiv.ll
===================================================================
--- test/CodeGen/AMDGPU/udiv.ll
+++ test/CodeGen/AMDGPU/udiv.ll
@@ -158,3 +158,24 @@
   store <4 x i32> %2, <4 x i32> addrspace(1)* %out, align 16
   ret void
 }
+
+; FUNC-LABEL: {{^}}udiv_i32_const:
+; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x24924925
+; SI-NOT: v_rcp
+define void @udiv_i32_const(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %num = load i32, i32 addrspace(1)* %in
+  %result = udiv i32 %num, 7
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}udiv_i64_const:
+; SI-DAG: s_mov_b32 [[MAGIC_HI:s[0-9]+]], 0x24924924
+; SI-DAG: s_mov_b32 [[MAGIC_LO:s[0-9]+]], 0x92492493
+; SI-NOT: v_rcp
+define void @udiv_i64_const(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+  %num = load i64, i64 addrspace(1)* %in
+  %result = udiv i64 %num, 7
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
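Note: the SPARC test update below is a consequence of the new
!IsAfterLegalization fallback in BuildUDIV. Neither MULHU nor UMUL_LOHI appears
to be legal for i64 on that target, so the multiply-by-magic for the remainder
by 1021 is now emitted anyway and later expanded by legalization into 32-bit
pieces, which is what the long mulx/srlx/addcc sequence in the new CHECK lines
computes. A rough C++ model of that high-half multiply (the helper name and
structure are illustrative, not taken from the backend):

  #include <cstdint>

  // High 64 bits of a 64x64 unsigned multiply, assembled from four
  // 32x32->64 partial products, mirroring the expanded MULHU.
  uint64_t mulhu64(uint64_t a, uint64_t b) {
    uint64_t aLo = (uint32_t)a, aHi = a >> 32;
    uint64_t bLo = (uint32_t)b, bHi = b >> 32;
    uint64_t loLo = aLo * bLo;
    uint64_t loHi = aLo * bHi;
    uint64_t hiLo = aHi * bLo;
    uint64_t hiHi = aHi * bHi;
    // Carries out of the middle 32-bit column.
    uint64_t mid = (loLo >> 32) + (uint32_t)loHi + (uint32_t)hiLo;
    return hiHi + (loHi >> 32) + (hiLo >> 32) + (mid >> 32);
  }
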
Index: test/CodeGen/SPARC/rem.ll
===================================================================
--- test/CodeGen/SPARC/rem.ll
+++ test/CodeGen/SPARC/rem.ll
@@ -24,12 +24,39 @@
 
 ; PR18150
 ; CHECK-LABEL: test3
-; CHECK: sethi 2545, [[R0:%[gilo][0-7]]]
-; CHECK: or [[R0]], 379, [[R1:%[gilo][0-7]]]
-; CHECK: mulx %o0, [[R1]], [[R2:%[gilo][0-7]]]
-; CHECK: udivx [[R2]], 1021, [[R3:%[gilo][0-7]]]
-; CHECK: mulx [[R3]], 1021, [[R4:%[gilo][0-7]]]
-; CHECK: sub [[R2]], [[R4]], %o0
+; CHECK: sethi 2545, %o1
+; CHECK-NEXT: or %o1, 379, %o1
+; CHECK-NEXT: mulx %o0, %o1, %o0
+; CHECK-NEXT: srl %o0, 0, %o1
+; CHECK-NEXT: sethi 12324, %o2
+; CHECK-NEXT: or %o2, 108, %o2
+; CHECK-NEXT: mulx %o1, %o2, %o3
+; CHECK-NEXT: sethi 1331003, %o4
+; CHECK-NEXT: or %o4, 435, %o4
+; CHECK-NEXT: mulx %o1, %o4, %o1
+; CHECK-NEXT: srlx %o1, 32, %o1
+; CHECK-NEXT: add %o1, %o3, %o1
+; CHECK-NEXT: srlx %o1, 32, %o3
+; CHECK-NEXT: srlx %o0, 32, %o5
+; CHECK-NEXT: mulx %o5, %o4, %o4
+; CHECK-NEXT: srlx %o4, 32, %g2
+; CHECK-NEXT: mulx %o5, %o2, %o2
+; CHECK-NEXT: srlx %o2, 32, %o5
+; CHECK-NEXT: addcc %o1, %o4, %o1
+; CHECK-NEXT: addxcc %o3, %g2, %o1
+; CHECK-NEXT: addxcc %o5, 0, %o3
+; CHECK-NEXT: sllx %o3, 32, %o3
+; CHECK-NEXT: srl %o2, 0, %o2
+; CHECK-NEXT: or %o2, %o3, %o2
+; CHECK-NEXT: srl %o1, 0, %o1
+; CHECK-NEXT: add %o1, %o2, %o1
+; CHECK-NEXT: sub %o0, %o1, %o2
+; CHECK-NEXT: srlx %o2, 1, %o2
+; CHECK-NEXT: add %o2, %o1, %o1
+; CHECK-NEXT: srlx %o1, 9, %o1
+; CHECK-NEXT: mulx %o1, 1021, %o1
+; CHECK-NEXT: retl
+; CHECK-NEXT: sub %o0, %o1, %o0
 define i64 @test3(i64 %b) {
 entry: