diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -647,6 +647,13 @@
   bool matchUDivByConst(MachineInstr &MI);
   void applyUDivByConst(MachineInstr &MI);
 
+  /// Given a G_SDIV \p MI expressing a signed divide by constant, return an
+  /// expression that implements it by multiplying by a magic number.
+  /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
+  MachineInstr *buildSDivUsingMul(MachineInstr &MI);
+  bool matchSDivByConst(MachineInstr &MI);
+  void applySDivByConst(MachineInstr &MI);
+
   // G_UMULH x, (1 << c)) -> x >> (bitwidth - c)
   bool matchUMulHToLShr(MachineInstr &MI);
   void applyUMulHToLShr(MachineInstr &MI);
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -764,7 +764,13 @@
    [{ return Helper.matchUDivByConst(*${root}); }]),
   (apply [{ Helper.applyUDivByConst(*${root}); }])>;
 
-def intdiv_combines : GICombineGroup<[udiv_by_const]>;
+def sdiv_by_const : GICombineRule<
+  (defs root:$root),
+  (match (wip_match_opcode G_SDIV):$root,
+   [{ return Helper.matchSDivByConst(*${root}); }]),
+  (apply [{ Helper.applySDivByConst(*${root}); }])>;
+
+def intdiv_combines : GICombineGroup<[udiv_by_const, sdiv_by_const]>;
 
 def reassoc_ptradd : GICombineRule<
   (defs root:$root, build_fn_matchinfo:$matchinfo),
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -4935,6 +4935,108 @@
   replaceSingleDefInstWithReg(MI, NewMI->getOperand(0).getReg());
 }
 
+bool CombinerHelper::matchSDivByConst(MachineInstr &MI) {
+  assert(MI.getOpcode() == TargetOpcode::G_SDIV && "Expected SDIV");
+  Register Dst = MI.getOperand(0).getReg();
+  Register RHS = MI.getOperand(2).getReg();
+  LLT DstTy = MRI.getType(Dst);
+
+  auto &MF = *MI.getMF();
+  AttributeList Attr = MF.getFunction().getAttributes();
+  const auto &TLI = getTargetLowering();
+  LLVMContext &Ctx = MF.getFunction().getContext();
+  auto &DL = MF.getDataLayout();
+  if (TLI.isIntDivCheap(getApproximateEVTForLLT(DstTy, DL, Ctx), Attr))
+    return false;
+
+  // Don't do this for minsize because the instruction sequence is usually
+  // larger.
+  if (MF.getFunction().hasMinSize())
+    return false;
+
+  // If the sdiv has an 'exact' flag we can use a simpler lowering.
+  if (MI.getFlag(MachineInstr::MIFlag::IsExact)) {
+    return matchUnaryPredicate(
+        MRI, RHS, [](const Constant *C) { return C && !C->isZeroValue(); });
+  }
+
+  // Don't support the general case for now.
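+  // (Handling it would need the full signed magic-number algorithm from
+  // Hacker's Delight -- a multiply-high followed by shift and add-sign
+  // fixups -- as SelectionDAG's TargetLowering::BuildSDIV does.)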
+  return false;
+}
+
+void CombinerHelper::applySDivByConst(MachineInstr &MI) {
+  auto *NewMI = buildSDivUsingMul(MI);
+  replaceSingleDefInstWithReg(MI, NewMI->getOperand(0).getReg());
+}
+
+MachineInstr *CombinerHelper::buildSDivUsingMul(MachineInstr &MI) {
+  assert(MI.getOpcode() == TargetOpcode::G_SDIV && "Expected SDIV");
+  auto &SDiv = cast<GenericMachineInstr>(MI);
+  Register Dst = SDiv.getReg(0);
+  Register LHS = SDiv.getReg(1);
+  Register RHS = SDiv.getReg(2);
+  LLT Ty = MRI.getType(Dst);
+  LLT ScalarTy = Ty.getScalarType();
+  LLT ShiftAmtTy = getTargetLowering().getPreferredShiftAmountTy(Ty);
+  LLT ScalarShiftAmtTy = ShiftAmtTy.getScalarType();
+  auto &MIB = Builder;
+  MIB.setInstrAndDebugLoc(MI);
+
+  bool UseSRA = false;
+  SmallVector<Register, 16> Shifts, Factors;
+
+  auto *RHSDef = cast<GenericMachineInstr>(getDefIgnoringCopies(RHS, MRI));
+  bool IsSplat = getIConstantSplatVal(*RHSDef, MRI).hasValue();
+
+  auto BuildSDIVPattern = [&](const Constant *C) {
+    // Don't recompute inverses for each splat element.
+    if (IsSplat && !Factors.empty()) {
+      Shifts.push_back(Shifts[0]);
+      Factors.push_back(Factors[0]);
+      return true;
+    }
+
+    auto *CI = cast<ConstantInt>(C);
+    APInt Divisor = CI->getValue();
+    unsigned Shift = Divisor.countTrailingZeros();
+    if (Shift) {
+      Divisor.ashrInPlace(Shift);
+      UseSRA = true;
+    }
+
+    // Calculate the multiplicative inverse modulo BW.
+    // 2^W requires W + 1 bits, so we have to extend and then truncate.
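+    // For example, for the exact divide by 104 (13 << 3) in combine-sdiv.mir,
+    // Shift is 3 and Factor is 13^-1 mod 2^32 == 0xC4EC4EC5 (-991146299),
+    // since 13 * 0xC4EC4EC5 == 1 (mod 2^32).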
+    unsigned W = Divisor.getBitWidth();
+    APInt Factor = Divisor.zext(W + 1)
+                       .multiplicativeInverse(APInt::getSignedMinValue(W + 1))
+                       .trunc(W);
+    Shifts.push_back(MIB.buildConstant(ScalarShiftAmtTy, Shift).getReg(0));
+    Factors.push_back(MIB.buildConstant(ScalarTy, Factor).getReg(0));
+    return true;
+  };
+
+  // Collect all magic values from the build vector.
+  bool Matched = matchUnaryPredicate(MRI, RHS, BuildSDIVPattern);
+  (void)Matched;
+  assert(Matched && "Expected unary predicate match to succeed");
+
+  Register Shift, Factor;
+  if (Ty.isVector()) {
+    Shift = MIB.buildBuildVector(ShiftAmtTy, Shifts).getReg(0);
+    Factor = MIB.buildBuildVector(Ty, Factors).getReg(0);
+  } else {
+    Shift = Shifts[0];
+    Factor = Factors[0];
+  }
+
+  Register Res = LHS;
+
+  if (UseSRA)
+    Res = MIB.buildAShr(Ty, Res, Shift, MachineInstr::IsExact).getReg(0);
+
+  return MIB.buildMul(Ty, Res, Factor);
+}
+
 bool CombinerHelper::matchUMulHToLShr(MachineInstr &MI) {
   assert(MI.getOpcode() == TargetOpcode::G_UMULH);
   Register RHS = MI.getOperand(2).getReg();
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -1077,8 +1077,8 @@
   return None;
 }
 
-Optional<APInt> getIConstantSplatVal(const MachineInstr &MI,
-                                     const MachineRegisterInfo &MRI) {
+Optional<APInt> llvm::getIConstantSplatVal(const MachineInstr &MI,
+                                           const MachineRegisterInfo &MRI) {
   return getIConstantSplatVal(MI.getOperand(0).getReg(), MRI);
 }
 
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-sdiv.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-sdiv.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-sdiv.mir
@@ -0,0 +1,133 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -o - -mtriple=aarch64-unknown-unknown -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s | FileCheck %s
+--- |
+  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+  define void @sdiv_exact() { ret void }
+  define void @sdiv_noexact() { ret void }
+  define void @sdiv_exact_minsize() #0 { ret void }
+  define void @div_v4s32() { ret void }
+  define void @div_v4s32_splat() { ret void }
+
+  attributes #0 = { minsize }
+
+...
+---
+name: sdiv_exact
+body: |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: sdiv_exact
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -991146299
+    ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s32) = exact G_ASHR [[COPY]], [[C]](s32)
+    ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[ASHR]], [[C1]]
+    ; CHECK-NEXT: $w0 = COPY [[MUL]](s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = G_CONSTANT i32 104
+    %2:_(s32) = exact G_SDIV %0, %1
+    $w0 = COPY %2(s32)
+    RET_ReallyLR implicit $w0
+
+...
+---
+name: sdiv_noexact
+body: |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: sdiv_noexact
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 104
+    ; CHECK-NEXT: [[SDIV:%[0-9]+]]:_(s32) = G_SDIV [[COPY]], [[C]]
+    ; CHECK-NEXT: $w0 = COPY [[SDIV]](s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = G_CONSTANT i32 104
+    %2:_(s32) = G_SDIV %0, %1
+    $w0 = COPY %2(s32)
+    RET_ReallyLR implicit $w0
+
+...
+---
+name: sdiv_exact_minsize
+body: |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: sdiv_exact_minsize
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 104
+    ; CHECK-NEXT: [[SDIV:%[0-9]+]]:_(s32) = exact G_SDIV [[COPY]], [[C]]
+    ; CHECK-NEXT: $w0 = COPY [[SDIV]](s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = G_CONSTANT i32 104
+    %2:_(s32) = exact G_SDIV %0, %1
+    $w0 = COPY %2(s32)
+    RET_ReallyLR implicit $w0
+
+...
+---
+name: div_v4s32
+body: |
+  bb.1:
+    liveins: $q0
+
+    ; CHECK-LABEL: name: div_v4s32
+    ; CHECK: liveins: $q0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -991146299
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 954437177
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32)
+    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C2]](s32), [[C1]](s32), [[C2]](s32)
+    ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(<4 x s32>) = exact G_ASHR [[COPY]], [[BUILD_VECTOR]](<4 x s32>)
+    ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(<4 x s32>) = G_MUL [[ASHR]], [[BUILD_VECTOR1]]
+    ; CHECK-NEXT: $q0 = COPY [[MUL]](<4 x s32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %0:_(<4 x s32>) = COPY $q0
+    %c1:_(s32) = G_CONSTANT i32 104
+    %c2:_(s32) = G_CONSTANT i32 72
+    %1:_(<4 x s32>) = G_BUILD_VECTOR %c1(s32), %c2(s32), %c1(s32), %c2(s32)
+    %3:_(<4 x s32>) = exact G_SDIV %0, %1
+    $q0 = COPY %3(<4 x s32>)
+    RET_ReallyLR implicit $q0
+
+...
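+# The splat case below should reuse a single shift/factor pair for every lane
+# rather than recomputing the inverse per element (see IsSplat in
+# BuildSDIVPattern).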
+---
+name: div_v4s32_splat
+body: |
+  bb.1:
+    liveins: $q0
+
+    ; CHECK-LABEL: name: div_v4s32_splat
+    ; CHECK: liveins: $q0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -991146299
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32)
+    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C1]](s32), [[C1]](s32), [[C1]](s32)
+    ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(<4 x s32>) = exact G_ASHR [[COPY]], [[BUILD_VECTOR]](<4 x s32>)
+    ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(<4 x s32>) = G_MUL [[ASHR]], [[BUILD_VECTOR1]]
+    ; CHECK-NEXT: $q0 = COPY [[MUL]](<4 x s32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %0:_(<4 x s32>) = COPY $q0
+    %c1:_(s32) = G_CONSTANT i32 104
+    %1:_(<4 x s32>) = G_BUILD_VECTOR %c1(s32), %c1(s32), %c1(s32), %c1(s32)
+    %3:_(<4 x s32>) = exact G_SDIV %0, %1
+    $q0 = COPY %3(<4 x s32>)
+    RET_ReallyLR implicit $q0
+
+...