Index: llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h =================================================================== --- llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h +++ llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h @@ -302,6 +302,7 @@ LegalizeResult lowerFPTRUNC_F64_TO_F16(MachineInstr &MI); LegalizeResult lowerFPTRUNC(MachineInstr &MI, unsigned TypeIdx, LLT Ty); + LegalizeResult lowerFPOWI(MachineInstr &MI); LegalizeResult lowerMinMax(MachineInstr &MI, unsigned TypeIdx, LLT Ty); LegalizeResult lowerFCopySign(MachineInstr &MI, unsigned TypeIdx, LLT Ty); Index: llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h =================================================================== --- llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h +++ llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h @@ -1583,6 +1583,13 @@ return buildInstr(TargetOpcode::G_FEXP2, {Dst}, {Src}, Flags); } + /// Build and insert \p Dst = G_FPOW \p Src0, \p Src1 + MachineInstrBuilder buildFPow(const DstOp &Dst, const SrcOp &Src0, + const SrcOp &Src1, + Optional<unsigned> Flags = None) { + return buildInstr(TargetOpcode::G_FPOW, {Dst}, {Src0, Src1}, Flags); + } + /// Build and insert \p Res = G_FCOPYSIGN \p Op0, \p Op1 MachineInstrBuilder buildFCopysign(const DstOp &Dst, const SrcOp &Src0, const SrcOp &Src1) { Index: llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp =================================================================== --- llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -2147,6 +2147,15 @@ widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC); Observer.changedInstr(MI); return Legalized; + case TargetOpcode::G_FPOWI: { + if (TypeIdx != 0) + return UnableToLegalize; + Observer.changingInstr(MI); + widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT); + widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC); + Observer.changedInstr(MI); + return Legalized; + } case TargetOpcode::G_INTTOPTR: if 
(TypeIdx != 1) return UnableToLegalize; @@ -2651,6 +2660,8 @@ return lowerFPTOSI(MI); case G_FPTRUNC: return lowerFPTRUNC(MI, TypeIdx, Ty); + case G_FPOWI: + return lowerFPOWI(MI); case G_SMIN: case G_SMAX: case G_UMIN: @@ -4799,6 +4810,20 @@ return UnableToLegalize; } +// TODO: If RHS is a constant SelectionDAGBuilder expands this into a +// multiplication tree. +LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPOWI(MachineInstr &MI) { + Register Dst = MI.getOperand(0).getReg(); + Register Src0 = MI.getOperand(1).getReg(); + Register Src1 = MI.getOperand(2).getReg(); + LLT Ty = MRI.getType(Dst); + + auto CvtSrc1 = MIRBuilder.buildSITOFP(Ty, Src1); + MIRBuilder.buildFPow(Dst, Src0, CvtSrc1, MI.getFlags()); + MI.eraseFromParent(); + return Legalized; +} + static CmpInst::Predicate minMaxToCompare(unsigned Opc) { switch (Opc) { case TargetOpcode::G_SMIN: Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -747,6 +747,10 @@ ExpOps.clampScalar(0, MinScalarFPTy, S32) .scalarize(0); + getActionDefinitionsBuilder(G_FPOWI) + .clampScalar(0, MinScalarFPTy, S32) + .lower(); + // The 64-bit versions produce 32-bit results, but only on the SALU. 
getActionDefinitionsBuilder(G_CTPOP) .legalFor({{S32, S32}, {S32, S64}}) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fpowi.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fpowi.mir @@ -0,0 +1,70 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX6 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s + +--- +name: test_fpowi_s16_s32_flags +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; GFX6-LABEL: name: test_fpowi_s16_s32_flags + ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX6: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX6: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) + ; GFX6: [[SITOFP:%[0-9]+]]:_(s32) = G_SITOFP [[COPY1]](s32) + ; GFX6: [[FLOG2_:%[0-9]+]]:_(s32) = nnan G_FLOG2 [[FPEXT]] + ; GFX6: [[INT:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FLOG2_]](s32), [[SITOFP]](s32) + ; GFX6: [[FEXP2_:%[0-9]+]]:_(s32) = nnan G_FEXP2 [[INT]] + ; GFX6: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FEXP2_]](s32) + ; GFX6: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16) + ; GFX6: $vgpr0 = COPY [[ANYEXT]](s32) + ; GFX9-LABEL: name: test_fpowi_s16_s32_flags + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX9: [[SITOFP:%[0-9]+]]:_(s16) = G_SITOFP [[COPY1]](s32) + ; GFX9: [[FLOG2_:%[0-9]+]]:_(s16) = nnan G_FLOG2 [[TRUNC]] + ; GFX9: [[FPEXT:%[0-9]+]]:_(s32) = nnan G_FPEXT [[FLOG2_]](s16) + ; GFX9: [[FPEXT1:%[0-9]+]]:_(s32) = nnan G_FPEXT [[SITOFP]](s16) + ; GFX9: [[INT:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FPEXT]](s32), 
[[FPEXT1]](s32) + ; GFX9: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT]](s32) + ; GFX9: [[FEXP2_:%[0-9]+]]:_(s16) = nnan G_FEXP2 [[FPTRUNC]] + ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FEXP2_]](s16) + ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s16) = G_TRUNC %0 + %3:_(s16) = nnan G_FPOWI %2, %1 + %4:_(s32) = G_ANYEXT %3 + $vgpr0 = COPY %4 +... + +--- +name: test_fpowi_s32_s32_flags +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; GFX6-LABEL: name: test_fpowi_s32_s32_flags + ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX6: [[SITOFP:%[0-9]+]]:_(s32) = G_SITOFP [[COPY1]](s32) + ; GFX6: [[FLOG2_:%[0-9]+]]:_(s32) = nnan G_FLOG2 [[COPY]] + ; GFX6: [[INT:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FLOG2_]](s32), [[SITOFP]](s32) + ; GFX6: [[FEXP2_:%[0-9]+]]:_(s32) = nnan G_FEXP2 [[INT]] + ; GFX6: $vgpr0 = COPY [[FEXP2_]](s32) + ; GFX9-LABEL: name: test_fpowi_s32_s32_flags + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9: [[SITOFP:%[0-9]+]]:_(s32) = G_SITOFP [[COPY1]](s32) + ; GFX9: [[FLOG2_:%[0-9]+]]:_(s32) = nnan G_FLOG2 [[COPY]] + ; GFX9: [[INT:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FLOG2_]](s32), [[SITOFP]](s32) + ; GFX9: [[FEXP2_:%[0-9]+]]:_(s32) = nnan G_FEXP2 [[INT]] + ; GFX9: $vgpr0 = COPY [[FEXP2_]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = nnan G_FPOWI %0, %1 + $vgpr0 = COPY %2 +... 
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll @@ -0,0 +1,181 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX7 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s + +define i16 @v_powi_f16(i16 %l, i32 %r) { +; GFX7-LABEL: v_powi_f16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX7-NEXT: v_log_f32_e32 v0, v0 +; GFX7-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_exp_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_powi_f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX8-NEXT: v_log_f16_e32 v0, v0 +; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX8-NEXT: v_exp_f16_e32 v0, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] + %l.cast = bitcast i16 %l to half + %res = call half @llvm.powi.f16(half %l.cast, i32 %r) + %res.cast = bitcast half %res to i16 + ret i16 %res.cast +} + +define float @v_powi_f32(float %l, i32 %r) { +; GCN-LABEL: v_powi_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GCN-NEXT: v_log_f32_e32 v0, v0 +; GCN-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GCN-NEXT: v_exp_f32_e32 v0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %res = call float @llvm.powi.f32(float %l, i32 %r) + ret float %res +} + +define float @v_powi_0_f32(float %l) { +; GCN-LABEL: v_powi_0_f32: +; GCN: 
; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 1.0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %res = call float @llvm.powi.f32(float %l, i32 0) + ret float %res +} + +define float @v_powi_1_f32(float %l) { +; GCN-LABEL: v_powi_1_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] + %res = call float @llvm.powi.f32(float %l, i32 1) + ret float %res +} + +define float @v_powi_neg1_f32(float %l) { +; GCN-LABEL: v_powi_neg1_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_f32_i32_e32 v1, -1 +; GCN-NEXT: v_log_f32_e32 v0, v0 +; GCN-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GCN-NEXT: v_exp_f32_e32 v0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %res = call float @llvm.powi.f32(float %l, i32 -1) + ret float %res +} + +define float @v_powi_2_f32(float %l) { +; GCN-LABEL: v_powi_2_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_log_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, 2 +; GCN-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GCN-NEXT: v_exp_f32_e32 v0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %res = call float @llvm.powi.f32(float %l, i32 2) + ret float %res +} + +define float @v_powi_neg2_f32(float %l) { +; GCN-LABEL: v_powi_neg2_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_f32_i32_e32 v1, -2 +; GCN-NEXT: v_log_f32_e32 v0, v0 +; GCN-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GCN-NEXT: v_exp_f32_e32 v0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %res = call float @llvm.powi.f32(float %l, i32 -2) + ret float %res +} + +define float @v_powi_4_f32(float %l) { +; GCN-LABEL: v_powi_4_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_log_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, 4 +; GCN-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GCN-NEXT: v_exp_f32_e32 v0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %res = call 
float @llvm.powi.f32(float %l, i32 4) + ret float %res +} + +define float @v_powi_8_f32(float %l) { +; GCN-LABEL: v_powi_8_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_log_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, 8 +; GCN-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GCN-NEXT: v_exp_f32_e32 v0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %res = call float @llvm.powi.f32(float %l, i32 8) + ret float %res +} + +define float @v_powi_16_f32(float %l) { +; GCN-LABEL: v_powi_16_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_log_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, 16 +; GCN-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GCN-NEXT: v_exp_f32_e32 v0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %res = call float @llvm.powi.f32(float %l, i32 16) + ret float %res +} + +define float @v_powi_128_f32(float %l) { +; GCN-LABEL: v_powi_128_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_log_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, 0x80 +; GCN-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GCN-NEXT: v_exp_f32_e32 v0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %res = call float @llvm.powi.f32(float %l, i32 128) + ret float %res +} + +define float @v_powi_neg128_f32(float %l) { +; GCN-LABEL: v_powi_neg128_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_f32_i32_e32 v1, 0xffffff80 +; GCN-NEXT: v_log_f32_e32 v0, v0 +; GCN-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GCN-NEXT: v_exp_f32_e32 v0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %res = call float @llvm.powi.f32(float %l, i32 -128) + ret float %res +} + +; FIXME: f64 broken +; define double @v_powi_f64(double %l, i32 %r) { +; %res = call double @llvm.powi.f64(double %l, i32 %r) +; ret double %res +; } + +declare half @llvm.powi.f16(half, i32) #0 +declare float @llvm.powi.f32(float, i32) #0 +declare double @llvm.powi.f64(double, i32) #0 + +attributes #0 = { 
nounwind readnone speculatable willreturn }