diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h --- a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h @@ -810,9 +810,11 @@ /// /// \return a MachineInstrBuilder for the newly created instruction. MachineInstrBuilder buildIntrinsic(Intrinsic::ID ID, ArrayRef Res, - bool HasSideEffects); + bool HasSideEffects, + Optional Flags = None); MachineInstrBuilder buildIntrinsic(Intrinsic::ID ID, ArrayRef Res, - bool HasSideEffects); + bool HasSideEffects, + Optional Flags = None); /// Build and insert \p Res = G_FPTRUNC \p Op /// @@ -867,7 +869,8 @@ /// /// \return a MachineInstrBuilder for the newly created instruction. MachineInstrBuilder buildFCmp(CmpInst::Predicate Pred, const DstOp &Res, - const SrcOp &Op0, const SrcOp &Op1); + const SrcOp &Op0, const SrcOp &Op1, + Optional Flags = None); /// Build and insert a \p Res = G_SELECT \p Tst, \p Op0, \p Op1 /// @@ -880,7 +883,8 @@ /// /// \return a MachineInstrBuilder for the newly created instruction. MachineInstrBuilder buildSelect(const DstOp &Res, const SrcOp &Tst, - const SrcOp &Op0, const SrcOp &Op1); + const SrcOp &Op0, const SrcOp &Op1, + Optional Flags = None); /// Build and insert \p Res = G_INSERT_VECTOR_ELT \p Val, /// \p Elt, \p Idx @@ -1210,6 +1214,12 @@ return buildInstr(TargetOpcode::G_SMULH, {Dst}, {Src0, Src1}, Flags); } + MachineInstrBuilder buildFMul(const DstOp &Dst, const SrcOp &Src0, + const SrcOp &Src1, + Optional Flags = None) { + return buildInstr(TargetOpcode::G_FMUL, {Dst}, {Src0, Src1}, Flags); + } + MachineInstrBuilder buildShl(const DstOp &Dst, const SrcOp &Src0, const SrcOp &Src1, Optional Flags = None) { diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp --- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -655,25 +655,35 @@ MachineInstrBuilder MachineIRBuilder::buildIntrinsic(Intrinsic::ID ID, ArrayRef ResultRegs, - bool HasSideEffects) { + bool HasSideEffects, + Optional Flags) { auto MIB = buildInstr(HasSideEffects ? TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS : TargetOpcode::G_INTRINSIC); for (unsigned ResultReg : ResultRegs) MIB.addDef(ResultReg); MIB.addIntrinsicID(ID); + + if (Flags) + MIB->setFlags(*Flags); + return MIB; } MachineInstrBuilder MachineIRBuilder::buildIntrinsic(Intrinsic::ID ID, ArrayRef Results, - bool HasSideEffects) { + bool HasSideEffects, + Optional Flags) { auto MIB = buildInstr(HasSideEffects ? TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS : TargetOpcode::G_INTRINSIC); for (DstOp Result : Results) Result.addDefToMIB(*getMRI(), MIB); MIB.addIntrinsicID(ID); + + if (Flags) + MIB->setFlags(*Flags); + return MIB; } @@ -697,17 +707,19 @@ MachineInstrBuilder MachineIRBuilder::buildFCmp(CmpInst::Predicate Pred, const DstOp &Res, const SrcOp &Op0, - const SrcOp &Op1) { + const SrcOp &Op1, + Optional Flags) { - return buildInstr(TargetOpcode::G_FCMP, Res, {Pred, Op0, Op1}); + return buildInstr(TargetOpcode::G_FCMP, Res, {Pred, Op0, Op1}, Flags); } MachineInstrBuilder MachineIRBuilder::buildSelect(const DstOp &Res, const SrcOp &Tst, const SrcOp &Op0, - const SrcOp &Op1) { + const SrcOp &Op1, + Optional Flags) { - return buildInstr(TargetOpcode::G_SELECT, {Res}, {Tst, Op0, Op1}); + return buildInstr(TargetOpcode::G_SELECT, {Res}, {Tst, Op0, Op1}, Flags); } MachineInstrBuilder diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -65,6 +65,9 @@ MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const; + bool legalizeFDIVFast(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1274,6 +1274,41 @@ return false; } +bool AMDGPULegalizerInfo::legalizeFDIVFast(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + B.setInstr(MI); + Register Res = MI.getOperand(0).getReg(); + Register LHS = MI.getOperand(2).getReg(); + Register RHS = MI.getOperand(3).getReg(); + uint16_t Flags = MI.getFlags(); + + LLT S32 = LLT::scalar(32); + LLT S1 = LLT::scalar(1); + + auto Abs = B.buildFAbs(S32, RHS); + const APFloat C0Val(1.0f); + + auto C0 = B.buildConstant(S32, 0x6f800000); + auto C1 = B.buildConstant(S32, 0x2f800000); + auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); + + auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); + auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); + + auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); + + auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false, Flags) + .addUse(Mul0.getReg(0)); + + auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); + + B.buildFMul(Res, Sel, Mul1, Flags); + + MI.eraseFromParent(); + return true; +} + bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { @@ -1388,6 +1423,8 @@ case Intrinsic::amdgcn_dispatch_id: return legalizePreloadedArgIntrin(MI, MRI, B, AMDGPUFunctionArgInfo::DISPATCH_ID); + case Intrinsic::amdgcn_fdiv_fast: + return legalizeFDIVFast(MI, MRI, B); default: return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -1958,7 +1958,6 @@ case Intrinsic::amdgcn_udot4: case Intrinsic::amdgcn_sdot8: case Intrinsic::amdgcn_udot8: - case Intrinsic::amdgcn_fdiv_fast: case Intrinsic::amdgcn_wwm: case Intrinsic::amdgcn_wqm: return getDefaultMappingVOP(MI); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-intrinsic-amdgcn-fdiv-fast.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-intrinsic-amdgcn-fdiv-fast.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-intrinsic-amdgcn-fdiv-fast.mir @@ -0,0 +1,54 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -run-pass=legalizer %s -o - | FileCheck %s + +--- +name: test_amdgcn_fdiv_fast +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: test_amdgcn_fdiv_fast + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[FABS:%[0-9]+]]:_(s32) = G_FABS [[COPY1]] + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1870659584 + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 796917760 + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1065353216 + ; CHECK: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[FABS]](s32), [[C]] + ; CHECK: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C1]], [[C2]] + ; CHECK: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY1]], [[SELECT]] + ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FMUL]](s32) + ; CHECK: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[INT]] + ; CHECK: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[SELECT]], [[FMUL1]] + ; CHECK: $vgpr0 = COPY [[FMUL2]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fdiv.fast), %0, %1 + $vgpr0 = COPY %2 +... + +--- +name: test_amdgcn_fdiv_fast_propagate_flags +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: test_amdgcn_fdiv_fast_propagate_flags + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[FABS:%[0-9]+]]:_(s32) = G_FABS [[COPY1]] + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1870659584 + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 796917760 + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1065353216 + ; CHECK: %7:_(s1) = nsz G_FCMP floatpred(ogt), [[FABS]](s32), [[C]] + ; CHECK: %8:_(s32) = nsz G_SELECT %7(s1), [[C1]], [[C2]] + ; CHECK: %9:_(s32) = nsz G_FMUL [[COPY1]], %8 + ; CHECK: %10:_(s32) = nsz G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %9(s32) + ; CHECK: %11:_(s32) = nsz G_FMUL [[COPY]], %10 + ; CHECK: %2:_(s32) = nsz G_FMUL %8, %11 + ; CHECK: $vgpr0 = COPY %2(s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = nsz G_INTRINSIC intrinsic(@llvm.amdgcn.fdiv.fast), %0, %1 + $vgpr0 = COPY %2 +...