Index: llvm/trunk/lib/Target/AMDGPU/SIFoldOperands.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -142,16 +142,20 @@
   switch (Opc) {
   case AMDGPU::V_MAC_F32_e64:
   case AMDGPU::V_MAC_F16_e64:
-  case AMDGPU::V_FMAC_F32_e64: {
+  case AMDGPU::V_FMAC_F32_e64:
+  case AMDGPU::V_FMAC_F16_e64: {
     // Special case for mac. Since this is replaced with mad when folded into
     // src2, we need to check the legality for the final instruction.
     int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
     if (static_cast<int>(OpNo) == Src2Idx) {
-      bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
-      bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
+      bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64 ||
+                   Opc == AMDGPU::V_FMAC_F16_e64;
+      bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64 ||
+                   Opc == AMDGPU::V_FMAC_F32_e64;

       unsigned Opc = IsFMA ?
-        AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
+        (IsF32 ? AMDGPU::V_FMA_F32 : AMDGPU::V_FMA_F16_gfx9) :
+        (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
       const MCInstrDesc &MadDesc = TII->get(Opc);
       return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
     }
@@ -314,12 +318,15 @@
     // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
     unsigned Opc = MI->getOpcode();
     if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
-         Opc == AMDGPU::V_FMAC_F32_e64) &&
+         Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
        (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
-      bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
-      bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
+      bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64 ||
+                   Opc == AMDGPU::V_FMAC_F16_e64;
+      bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64 ||
+                   Opc == AMDGPU::V_FMAC_F32_e64;
       unsigned NewOpc = IsFMA ?
-        AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
+        (IsF32 ? AMDGPU::V_FMA_F32 : AMDGPU::V_FMA_F16_gfx9) :
+        (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);

       // Check if changing this to a v_mad_{f16, f32} instruction will allow us
       // to fold the operand.
Index: llvm/trunk/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
@@ -126,7 +126,7 @@
 ; GFX8_10: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI-FLUSH: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
 ; VI-DENORM: v_fma_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
-; GFX10-DENORM: v_fmac_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}
+; GFX10-DENORM: v_fma_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
 ; GFX10-FLUSH: v_sub_f16_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
 define amdgpu_kernel void @multiple_fadd_use_test_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 {
   %x = bitcast i16 %x.arg to half
@@ -152,7 +152,7 @@
 ; VI-FLUSH-DAG: v_mac_f16_e64 [[MAD:v[0-9]+]], [[X]], 2.0
 ; VI-DENORM-DAG: v_fma_f16 [[MAD:v[0-9]+]], [[X]], 2.0, v{{[0-9]+}}
 ; GFX10-FLUSH-DAG: v_add_f16_e32 [[MAD:v[0-9]+]], s{{[0-9]+}}, [[MUL2]]
-; GFX10-DENORM-DAG: v_fmac_f16_e64 [[MAD:v[0-9]+]], [[X]], 2.0
+; GFX10-DENORM-DAG: v_fma_f16 [[MAD:v[0-9]+]], [[X]], 2.0, s{{[0-9]+}}
 ; GCN-DAG: buffer_store_short [[MUL2]]
 ; GCN-DAG: buffer_store_short [[MAD]]

@@ -174,7 +174,7 @@
 ; VI-FLUSH-DAG: v_mad_f16 [[MAD:v[0-9]+]], |[[X]]|, 2.0, v{{[0-9]+}}
 ; VI-DENORM-DAG: v_fma_f16 [[MAD:v[0-9]+]], |[[X]]|, 2.0, v{{[0-9]+}}
 ; GFX10-FLUSH-DAG: v_add_f16_e32 [[MAD:v[0-9]+]], s{{[0-9]+}}, [[MUL2]]
-; GFX10-DENORM-DAG: v_fmac_f16_e64 [[MAD:v[0-9]+]], |[[X]]|, 2.0
+; GFX10-DENORM-DAG: v_fma_f16 [[MAD:v[0-9]+]], |[[X]]|, 2.0, s{{[0-9]+}}
 ; GCN-DAG: buffer_store_short [[MUL2]]
 ; GCN-DAG: buffer_store_short [[MAD]]

@@ -201,8 +201,8 @@
 ; GFX10-FLUSH: v_add_f16_e64 [[MUL2:v[0-9]+]], |[[X:s[0-9]+]]|, |{{s[0-9]+}}|
 ; GFX10-FLUSH: v_add_f16_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[MUL2]]
 ; GFX10-FLUSH: v_add_f16_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[MUL2]]
-; GFX10-DENORM: v_fmac_f16_e64 {{v[0-9]+}}, |[[X:s[0-9]+]]|, 2.0
-; GFX10-DENORM: v_fmac_f16_e64 {{v[0-9]+}}, |[[X]]|, 2.0
+; GFX10-DENORM: v_fma_f16 {{v[0-9]+}}, |[[X:s[0-9]+]]|, 2.0, s{{[0-9]+}}
+; GFX10-DENORM: v_fma_f16 {{v[0-9]+}}, |[[X]]|, 2.0, s{{[0-9]+}}
 define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 {
   %x = bitcast i16 %x.arg to half