diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1210,7 +1210,14 @@ Ops[8] = N->getOperand(0); Ops[9] = N->getOperand(4); - CurDAG->SelectNodeTo(N, AMDGPU::V_FMA_F32_e64, N->getVTList(), Ops); + // If there are no source modifiers, prefer fmac over fma because it can use + // the smaller VOP2 encoding. + bool UseFMAC = Subtarget->hasDLInsts() && + cast<ConstantSDNode>(Ops[0])->isZero() && + cast<ConstantSDNode>(Ops[2])->isZero() && + cast<ConstantSDNode>(Ops[4])->isZero(); + unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64; + CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), Ops); } void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) { diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll --- a/llvm/test/CodeGen/AMDGPU/fdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll @@ -21,10 +21,12 @@ ; PREGFX10: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX10: s_denorm_mode 15 ; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0 -; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]] +; PREGFX10: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]] +; GFX10: v_fmac_f32_e32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]] ; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]] ; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]] -; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]] +; PREGFX10: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]] +; GFX10: v_fmac_f32_e32 [[E:v[0-9]+]], [[D]], [[B]] ; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]] ; PREGFX10: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX10: s_denorm_mode 12 @@ -293,10 +295,12 @@ ; PREGFX10: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX10: s_denorm_mode 15 ; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0 -; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]] +; 
PREGFX10: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]] +; GFX10: v_fmac_f32_e32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]] ; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]] ; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]] -; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]] +; PREGFX10: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]] +; GFX10: v_fmac_f32_e32 [[E:v[0-9]+]], [[D]], [[B]] ; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]] ; PREGFX10: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX10: s_denorm_mode 12 diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -566,10 +566,10 @@ ; GFX10-NEXT: v_rcp_f32_e32 v5, v4 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v6, -v4, v5, 1.0 -; GFX10-NEXT: v_fma_f32 v5, v6, v5, v5 +; GFX10-NEXT: v_fmac_f32_e32 v5, v6, v5 ; GFX10-NEXT: v_mul_f32_e32 v6, v3, v5 ; GFX10-NEXT: v_fma_f32 v7, -v4, v6, v3 -; GFX10-NEXT: v_fma_f32 v6, v7, v5, v6 +; GFX10-NEXT: v_fmac_f32_e32 v6, v7, v5 ; GFX10-NEXT: v_fma_f32 v3, -v4, v6, v3 ; GFX10-NEXT: s_denorm_mode 12 ; GFX10-NEXT: v_div_fmas_f32 v3, v3, v5, v6 @@ -2164,10 +2164,10 @@ ; GFX10-NEXT: v_rcp_f32_e32 v7, v6 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v8, -v6, v7, 1.0 -; GFX10-NEXT: v_fma_f32 v7, v8, v7, v7 +; GFX10-NEXT: v_fmac_f32_e32 v7, v8, v7 ; GFX10-NEXT: v_mul_f32_e32 v8, v5, v7 ; GFX10-NEXT: v_fma_f32 v9, -v6, v8, v5 -; GFX10-NEXT: v_fma_f32 v8, v9, v7, v8 +; GFX10-NEXT: v_fmac_f32_e32 v8, v9, v7 ; GFX10-NEXT: v_fma_f32 v5, -v6, v8, v5 ; GFX10-NEXT: s_denorm_mode 12 ; GFX10-NEXT: v_div_fmas_f32 v5, v5, v7, v8 @@ -2179,10 +2179,10 @@ ; GFX10-NEXT: v_rcp_f32_e32 v6, v5 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v7, -v5, v6, 1.0 -; GFX10-NEXT: v_fma_f32 v6, v7, v6, v6 +; GFX10-NEXT: v_fmac_f32_e32 v6, v7, v6 ; GFX10-NEXT: v_mul_f32_e32 v7, v3, v6 ; GFX10-NEXT: v_fma_f32 v8, -v5, v7, v3 -; 
GFX10-NEXT: v_fma_f32 v7, v8, v6, v7 +; GFX10-NEXT: v_fmac_f32_e32 v7, v8, v6 ; GFX10-NEXT: v_fma_f32 v3, -v5, v7, v3 ; GFX10-NEXT: s_denorm_mode 12 ; GFX10-NEXT: v_div_fmas_f32 v3, v3, v6, v7 @@ -2538,10 +2538,10 @@ ; GFX10-NEXT: v_rcp_f32_e32 v11, v10 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v12, -v10, v11, 1.0 -; GFX10-NEXT: v_fma_f32 v11, v12, v11, v11 +; GFX10-NEXT: v_fmac_f32_e32 v11, v12, v11 ; GFX10-NEXT: v_mul_f32_e32 v12, v9, v11 ; GFX10-NEXT: v_fma_f32 v13, -v10, v12, v9 -; GFX10-NEXT: v_fma_f32 v12, v13, v11, v12 +; GFX10-NEXT: v_fmac_f32_e32 v12, v13, v11 ; GFX10-NEXT: v_fma_f32 v9, -v10, v12, v9 ; GFX10-NEXT: s_denorm_mode 12 ; GFX10-NEXT: v_div_fmas_f32 v9, v9, v11, v12 @@ -2553,10 +2553,10 @@ ; GFX10-NEXT: v_rcp_f32_e32 v10, v9 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v11, -v9, v10, 1.0 -; GFX10-NEXT: v_fma_f32 v10, v11, v10, v10 +; GFX10-NEXT: v_fmac_f32_e32 v10, v11, v10 ; GFX10-NEXT: v_mul_f32_e32 v11, v7, v10 ; GFX10-NEXT: v_fma_f32 v12, -v9, v11, v7 -; GFX10-NEXT: v_fma_f32 v11, v12, v10, v11 +; GFX10-NEXT: v_fmac_f32_e32 v11, v12, v10 ; GFX10-NEXT: v_fma_f32 v7, -v9, v11, v7 ; GFX10-NEXT: s_denorm_mode 12 ; GFX10-NEXT: v_div_fmas_f32 v7, v7, v10, v11 @@ -2568,10 +2568,10 @@ ; GFX10-NEXT: v_rcp_f32_e32 v9, v7 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v10, -v7, v9, 1.0 -; GFX10-NEXT: v_fma_f32 v9, v10, v9, v9 +; GFX10-NEXT: v_fmac_f32_e32 v9, v10, v9 ; GFX10-NEXT: v_mul_f32_e32 v10, v6, v9 ; GFX10-NEXT: v_fma_f32 v11, -v7, v10, v6 -; GFX10-NEXT: v_fma_f32 v10, v11, v9, v10 +; GFX10-NEXT: v_fmac_f32_e32 v10, v11, v9 ; GFX10-NEXT: v_fma_f32 v6, -v7, v10, v6 ; GFX10-NEXT: s_denorm_mode 12 ; GFX10-NEXT: v_div_fmas_f32 v6, v6, v9, v10 @@ -2583,10 +2583,10 @@ ; GFX10-NEXT: v_rcp_f32_e32 v7, v6 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v9, -v6, v7, 1.0 -; GFX10-NEXT: v_fma_f32 v7, v9, v7, v7 +; GFX10-NEXT: v_fmac_f32_e32 v7, v9, v7 ; GFX10-NEXT: v_mul_f32_e32 v9, v5, v7 ; GFX10-NEXT: v_fma_f32 v10, 
-v6, v9, v5 -; GFX10-NEXT: v_fma_f32 v9, v10, v7, v9 +; GFX10-NEXT: v_fmac_f32_e32 v9, v10, v7 ; GFX10-NEXT: v_fma_f32 v5, -v6, v9, v5 ; GFX10-NEXT: s_denorm_mode 12 ; GFX10-NEXT: v_div_fmas_f32 v5, v5, v7, v9