Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -85,6 +85,7 @@
                              SDValue RHS, DAGCombinerInfo &DCI) const;
   SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
   static EVT getEquivalentMemType(LLVMContext &Context, EVT VT);
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -478,6 +478,7 @@
   setTargetDAGCombine(ISD::FADD);
   setTargetDAGCombine(ISD::FSUB);
   setTargetDAGCombine(ISD::FNEG);
+  setTargetDAGCombine(ISD::FABS);
 }
 
 //===----------------------------------------------------------------------===//
@@ -2953,6 +2954,45 @@
     SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
     return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
   }
+  case ISD::FP16_TO_FP: {
+    // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
+    // f16, but legalization of f16 fneg ends up pulling it out of the source.
+    // Put the fneg back as a legal source operation that can be matched later.
+    SDLoc SL(N);
+
+    SDValue Src = N0.getOperand(0);
+    EVT SrcVT = Src.getValueType();
+
+    // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
+    SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
+                                  DAG.getConstant(0x8000, SL, SrcVT));
+    return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
+  }
+  default:
+    return SDValue();
+  }
+}
+
+SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
+                                                 DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue N0 = N->getOperand(0);
+
+  if (!N0.hasOneUse())
+    return SDValue();
+
+  switch (N0.getOpcode()) {
+  case ISD::FP16_TO_FP: {
+    assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
+    SDLoc SL(N);
+    SDValue Src = N0.getOperand(0);
+    EVT SrcVT = Src.getValueType();
+
+    // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
+    SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
+                                  DAG.getConstant(0x7fff, SL, SrcVT));
+    return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
+  }
   default:
     return SDValue();
   }
@@ -3065,6 +3105,8 @@
     return performSelectCombine(N, DCI);
   case ISD::FNEG:
     return performFNegCombine(N, DCI);
+  case ISD::FABS:
+    return performFAbsCombine(N, DCI);
   case AMDGPUISD::BFE_I32:
   case AMDGPUISD::BFE_U32: {
     assert(!N->getValueType(0).isVector() &&
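
To make concrete what the two combines above buy, here is a minimal IR sketch of the shape they target. The function name is invented for illustration; the IR and the expected instruction mirror the fneg_fpext_f16_to_f32 test added to fpext.f16.ll below, assuming a pre-VI target without legal f16 (e.g. llc -march=amdgcn -mcpu=bonaire):

    ; With f16 illegal, the fneg reaches the DAG as (fneg (fp16_to_fp x)).
    ; performFNegCombine rewrites that to (fp16_to_fp (xor x, 0x8000)), which
    ; the new SIInstructions.td patterns select as a single
    ;   v_cvt_f32_f16_e64 vN, -vM
    define void @fneg_fpext_sketch(float addrspace(1)* %r, half addrspace(1)* %a) {
      %val = load half, half addrspace(1)* %a
      %neg = fsub half -0.0, %val
      %ext = fpext half %neg to float
      store float %ext, float addrspace(1)* %r
      ret void
    }
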
Index: lib/Target/AMDGPU/AMDGPUInstructions.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstructions.td
+++ lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -629,6 +629,9 @@
 def smin_oneuse : HasOneUseBinOp<smin>;
 def umax_oneuse : HasOneUseBinOp<umax>;
 def umin_oneuse : HasOneUseBinOp<umin>;
+def and_oneuse : HasOneUseBinOp<and>;
+def or_oneuse : HasOneUseBinOp<or>;
+def xor_oneuse : HasOneUseBinOp<xor>;
 } // Properties = [SDNPCommutative, SDNPAssociative]
 
 def sub_oneuse : HasOneUseBinOp<sub>;
Index: lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
===================================================================
--- lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
+++ lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -226,7 +226,10 @@
   case 8:
     return getLit64Encoding(static_cast<uint64_t>(Imm), STI);
   case 2:
-    return getLit16Encoding(static_cast<uint16_t>(Imm), STI);
+    // FIXME: Is this correct? What do inline immediates do on SI for f16 src?
+    if (STI.getFeatureBits()[AMDGPU::Feature16BitInsts])
+      return getLit16Encoding(static_cast<uint16_t>(Imm), STI);
+    return getLit32Encoding(static_cast<uint32_t>(Imm), STI);
   default:
     llvm_unreachable("invalid operand size");
   }
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1688,7 +1688,8 @@
     return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
                                         ST.hasInv2PiInlineImm());
   case 16:
-    return AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
+    return ST.has16BitInsts() &&
+           AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
                                         ST.hasInv2PiInlineImm());
   default:
     llvm_unreachable("invalid bitwidth");
@@ -1718,8 +1719,13 @@
   }
   case 16: {
     if (isInt<16>(Imm) || isUInt<16>(Imm)) {
+      // A few special-case instructions have 16-bit operands on subtargets
+      // where 16-bit instructions are not legal.
+      // TODO: Do the 32-bit immediates work? We shouldn't really need to
+      // handle constants in these cases.
      int16_t Trunc = static_cast<int16_t>(Imm);
-      return AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
+      return ST.has16BitInsts() &&
+             AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
     }
 
     return false;
Index: lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.td
+++ lib/Target/AMDGPU/SIInstrInfo.td
@@ -607,6 +607,8 @@
 def SRCMODS {
   int NONE = 0;
   int NEG = 1;
+  int ABS = 2;
+  int NEG_ABS = 3;
 }
 
 def DSTCLAMP {
@@ -1117,6 +1119,8 @@
 def VOP_I32_F32 : VOPProfile <[i32, f32, untyped, untyped]>;
 def VOP_I32_F64 : VOPProfile <[i32, f64, untyped, untyped]>;
 def VOP_I32_I32 : VOPProfile <[i32, i32, untyped, untyped]>;
+def VOP_F16_F32 : VOPProfile <[f16, f32, untyped, untyped]>;
+def VOP_F32_F16 : VOPProfile <[f32, f16, untyped, untyped]>;
 
 def VOP_F32_F32_F16 : VOPProfile <[f32, f32, f16, untyped]>;
 def VOP_F32_F32_F32 : VOPProfile <[f32, f32, f32, untyped]>;
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -423,9 +423,26 @@
 
 } // End Predicates = [UnsafeFPMath]
 
+
+// f16_to_fp patterns
+def : Pat <
+  (f32 (f16_to_fp i32:$src0)),
+  (V_CVT_F32_F16_e64 SRCMODS.NONE, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+def : Pat <
+  (f32 (f16_to_fp (and_oneuse i32:$src0, 0x7fff))),
+  (V_CVT_F32_F16_e64 SRCMODS.ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+def : Pat <
+  (f32 (f16_to_fp (or_oneuse i32:$src0, 0x8000))),
+  (V_CVT_F32_F16_e64 SRCMODS.NEG_ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
 def : Pat <
-  (f32 (fpextend f16:$src)),
-  (V_CVT_F32_F16_e32 $src)
+  (f32 (f16_to_fp (xor_oneuse i32:$src0, 0x8000))),
+  (V_CVT_F32_F16_e64 SRCMODS.NEG, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
 >;
 
 def : Pat <
@@ -433,9 +450,10 @@
   (V_CVT_F64_F32_e32 (V_CVT_F32_F16_e32 $src))
 >;
 
+// fp_to_fp16 patterns
 def : Pat <
-  (f16 (fpround f32:$src)),
-  (V_CVT_F16_F32_e32 $src)
+  (i32 (fp_to_f16 (f32 (VOP3Mods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)))),
+  (V_CVT_F16_F32_e64 $src0_modifiers, f32:$src0, $clamp, $omod)
 >;
 
 def : Pat <
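
For readers of the patterns above: the new SRCMODS values follow the VOP3 source-modifier semantics, where abs is applied before neg, so the integer ops on the f16 bit pattern map as and x, 0x7fff -> |x| (ABS), xor x, 0x8000 -> -x (NEG), and or x, 0x8000 -> -|x| (NEG_ABS). The _oneuse guards keep a fold from firing when the bit-manipulated value has other users, since the and/or/xor then has to be emitted anyway and a modifier would just duplicate it. A sketch of that multi-use shape (illustrative; condensed from the fneg_multi_use_fpext_f16_to_f32 test added below):

    ; %neg is used both as a stored f16 value and as the fpext source, so the
    ; xor must be kept; folding a neg modifier into the conversion as well
    ; would only repeat the negation (a FIXME in the test notes this).
    define void @fneg_multi_use_sketch(float addrspace(1)* %r, half %x) {
      %neg = fsub half -0.0, %x
      %ext = fpext half %neg to float
      store volatile float %ext, float addrspace(1)* %r
      store volatile half %neg, half addrspace(1)* undef
      ret void
    }
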
VOP1Inst <"v_cvt_f32_u32", VOP_F32_I32, uint_to_fp>; defm V_CVT_U32_F32 : VOP1Inst <"v_cvt_u32_f32", VOP_I32_F32, fp_to_uint>; defm V_CVT_I32_F32 : VOP1Inst <"v_cvt_i32_f32", VOP_I32_F32, fp_to_sint>; -defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_I32_F32, fp_to_f16>; -defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_I32, f16_to_fp>; +defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_F16_F32, fpround>; +defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_F16, fpextend>; defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>; defm V_CVT_FLR_I32_F32 : VOP1Inst <"v_cvt_flr_i32_f32", VOP_I32_F32, cvt_flr_i32_f32>; defm V_CVT_OFF_F32_I4 : VOP1Inst <"v_cvt_off_f32_i4", VOP_F32_I32>; Index: test/CodeGen/AMDGPU/fabs.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fabs.f16.ll +++ test/CodeGen/AMDGPU/fabs.f16.ll @@ -19,8 +19,7 @@ ; GCN-LABEL: {{^}}fabs_f16: ; CI: flat_load_ushort [[VAL:v[0-9]+]], -; CI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[VAL]] -; CI: v_cvt_f16_f32_e64 [[RESULT:v[0-9]+]], |[[CVT0]]| +; CI: v_and_b32_e32 [[CVT0:v[0-9]+]], 0x7fff, [[VAL]] ; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define void @fabs_f16(half addrspace(1)* %out, half %in) { %fabs = call half @llvm.fabs.f16(half %in) @@ -30,10 +29,10 @@ ; FIXME: Should be able to use single and ; GCN-LABEL: {{^}}fabs_v2f16: -; CI: v_cvt_f32_f16_e32 v{{[0-9]+}}, -; CI: v_cvt_f32_f16_e32 v{{[0-9]+}}, -; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}| -; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}| + +; CI: s_movk_i32 [[MASK:s[0-9]+]], 0x7fff +; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} +; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} ; VI: flat_load_ushort [[LO:v[0-9]+]] ; VI: flat_load_ushort [[HI:v[0-9]+]] @@ -51,10 +50,11 @@ } ; GCN-LABEL: {{^}}fabs_v4f16: -; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}| -; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}| -; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}| -; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}| +; CI: s_movk_i32 [[MASK:s[0-9]+]], 0x7fff +; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} +; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} +; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} +; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} ; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}} ; VI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} @@ -72,9 +72,10 @@ ; GCN-LABEL: {{^}}fabs_fold_f16: ; GCN: flat_load_ushort [[IN0:v[0-9]+]] ; GCN: flat_load_ushort [[IN1:v[0-9]+]] + ; CI-DAG: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[IN0]] -; CI-DAG: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], [[IN1]] -; CI: v_mul_f32_e64 [[RESULT:v[0-9]+]], |[[CVT1]]|, [[CVT0]] +; CI-DAG: v_cvt_f32_f16_e64 [[ABS_CVT1:v[0-9]+]], |[[IN1]]| +; CI: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[CVT0]], [[ABS_CVT1]] ; CI: v_cvt_f16_f32_e32 [[CVTRESULT:v[0-9]+]], [[RESULT]] ; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVTRESULT]] Index: test/CodeGen/AMDGPU/fcmp.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fcmp.f16.ll +++ test/CodeGen/AMDGPU/fcmp.f16.ll @@ -28,10 +28,10 @@ ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] +; SI: v_cvt_f32_f16_e64 v[[A_F32:[0-9]+]], |v[[A_F16]]| +; SI: v_cvt_f32_f16_e64 v[[B_F32:[0-9]+]], |v[[B_F16]]| -; SI: v_cmp_lt_f32_e64 
Index: test/CodeGen/AMDGPU/fcmp.f16.ll
===================================================================
--- test/CodeGen/AMDGPU/fcmp.f16.ll
+++ test/CodeGen/AMDGPU/fcmp.f16.ll
@@ -28,10 +28,10 @@
 
 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
 
-; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
+; SI: v_cvt_f32_f16_e64 v[[A_F32:[0-9]+]], |v[[A_F16]]|
+; SI: v_cvt_f32_f16_e64 v[[B_F32:[0-9]+]], |v[[B_F16]]|
 
-; SI: v_cmp_lt_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, |v[[A_F32]]|, |v[[B_F32]]|
+; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
 ; VI: v_cmp_lt_f16_e64 s{{\[[0-9]+:[0-9]+\]}}, |v[[A_F16]]|, |v[[B_F16]]|
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
Index: test/CodeGen/AMDGPU/fneg-fabs.f16.ll
===================================================================
--- test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -3,8 +3,8 @@
 
 ; GCN-LABEL: {{^}}fneg_fabs_fadd_f16:
 ; CI: v_cvt_f32_f16_e32
-; CI: v_cvt_f32_f16_e32
-; CI: v_sub_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, |v{{[0-9]+}}|
+; CI: v_cvt_f32_f16_e64 [[CVT_ABS_X:v[0-9]+]], |v{{[0-9]+}}|
+; CI: v_subrev_f32_e32 v{{[0-9]+}}, [[CVT_ABS_X]], v{{[0-9]+}}
 
 ; VI-NOT: and
 ; VI: v_sub_f16_e64 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|
@@ -17,14 +17,15 @@
 }
 
 ; GCN-LABEL: {{^}}fneg_fabs_fmul_f16:
-; CI: v_cvt_f32_f16_e32
-; CI: v_cvt_f32_f16_e32
-; CI: v_mul_f32_e64 {{v[0-9]+}}, {{v[0-9]+}}, -|{{v[0-9]+}}|
+; CI-DAG: v_cvt_f32_f16_e32
+; CI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG_ABS_X:v[0-9]+]], -|{{v[0-9]+}}|
+; CI: v_mul_f32_e32 {{v[0-9]+}}, [[CVT_NEG_ABS_X]], {{v[0-9]+}}
 ; CI: v_cvt_f16_f32_e32
 
 ; VI-NOT: and
-; VI: v_mul_f16_e64 {{v[0-9]+}}, {{v[0-9]+}}, -|{{v[0-9]+}}|
-; VI-NOT: and
+; VI: v_mul_f16_e64 [[MUL:v[0-9]+]], {{v[0-9]+}}, -|{{v[0-9]+}}|
+; VI-NOT: [[MUL]]
+; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
 define void @fneg_fabs_fmul_f16(half addrspace(1)* %out, half %x, half %y) {
   %fabs = call half @llvm.fabs.f16(half %x)
   %fsub = fsub half -0.000000e+00, %fabs
@@ -49,10 +50,7 @@
 
 ; FIXME: Should use or
 ; GCN-LABEL: {{^}}fneg_fabs_f16:
-; CI: v_cvt_f32_f16_e32 v{{[0-9]+}},
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
-
-; VI: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}}
+; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}}
 define void @fneg_fabs_f16(half addrspace(1)* %out, half %in) {
   %fabs = call half @llvm.fabs.f16(half %in)
   %fsub = fsub half -0.000000e+00, %fabs
@@ -61,10 +59,7 @@
 }
 
 ; GCN-LABEL: {{^}}v_fneg_fabs_f16:
-; CI: v_cvt_f32_f16_e32 v{{[0-9]+}},
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
-
-; VI: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}}
+; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}}
 define void @v_fneg_fabs_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
   %val = load half, half addrspace(1)* %in, align 2
   %fabs = call half @llvm.fabs.f16(half %val)
@@ -75,13 +70,10 @@
 
 ; FIXME: single bit op
 ; GCN-LABEL: {{^}}fneg_fabs_v2f16:
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
-
-; VI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
-; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
-; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
-; VI: flat_store_dword
+; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
+; GCN: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
+; GCN: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
+; GCN: store_dword
 define void @fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) {
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
   %fsub = fsub <2 x half> <half -0.000000e+00, half -0.000000e+00>, %fabs
@@ -90,17 +82,12 @@
 }
 
 ; GCN-LABEL: {{^}}fneg_fabs_v4f16:
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
-
-; VI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
-; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
-; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
-; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
-; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
-; VI: flat_store_dwordx2
+; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
+; GCN: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
+; GCN: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
+; GCN: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
+; GCN: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
+; GCN: store_dwordx2
 define void @fneg_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
   %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in)
   %fsub = fsub <4 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %fabs
Index: test/CodeGen/AMDGPU/fneg.f16.ll
===================================================================
--- test/CodeGen/AMDGPU/fneg.f16.ll
+++ test/CodeGen/AMDGPU/fneg.f16.ll
@@ -15,13 +15,9 @@
 
 ; FUNC-LABEL: {{^}}v_fneg_f16:
 ; GCN: flat_load_ushort [[VAL:v[0-9]+]],
-
-; CI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[VAL]]
-; CI: v_cvt_f16_f32_e64 [[CVT1:v[0-9]+]], -[[CVT0]]
-; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVT1]]
-
-; VI: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[VAL]]
+; GCN: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[VAL]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[XOR]]
+; SI: buffer_store_short [[XOR]]
 define void @v_fneg_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
   %val = load half, half addrspace(1)* %in, align 2
   %fneg = fsub half -0.000000e+00, %val
@@ -45,8 +41,9 @@
 
 ; FUNC-LABEL: {{^}}v_fneg_fold_f16:
 ; GCN: flat_load_ushort [[NEG_VALUE:v[0-9]+]]
-; CI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[CVT0]]
-; CI: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[CVT0]], [[CVT0]]
+; CI-DAG: v_cvt_f32_f16_e32 [[CVT_VAL:v[0-9]+]], [[NEG_VALUE]]
+; CI-DAG: v_cvt_f32_f16_e64 [[NEG_CVT0:v[0-9]+]], -[[NEG_VALUE]]
+; CI: v_mul_f32_e32 [[MUL:v[0-9]+]], [[CVT_VAL]], [[NEG_CVT0]]
 ; CI: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], [[MUL]]
 ; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVT1]]
Index: test/CodeGen/AMDGPU/fpext.f16.ll
===================================================================
--- test/CodeGen/AMDGPU/fpext.f16.ll
+++ test/CodeGen/AMDGPU/fpext.f16.ll
@@ -68,3 +68,202 @@
   store <2 x double> %r.val, <2 x double> addrspace(1)* %r
   ret void
 }
+
+; GCN-LABEL: {{^}}s_fneg_fpext_f16_to_f32:
+; GCN: v_cvt_f32_f16_e32 v{{[0-9]+}}, s{{[0-9]+}}
+define void @s_fneg_fpext_f16_to_f32(float addrspace(1)* %r, i32 %a) {
+entry:
+  %a.trunc = trunc i32 %a to i16
+  %a.val = bitcast i16 %a.trunc to half
+  %r.val = fpext half %a.val to float
+  store float %r.val, float addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}fneg_fpext_f16_to_f32:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; GCN: v_cvt_f32_f16_e64 v{{[0-9]+}}, -[[A]]
+define void @fneg_fpext_f16_to_f32(
+    float addrspace(1)* %r,
+    half addrspace(1)* %a) {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %a.neg = fsub half -0.0, %a.val
+  %r.val = fpext half %a.neg to float
+  store float %r.val, float addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}fabs_fpext_f16_to_f32:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; GCN: v_cvt_f32_f16_e64 v{{[0-9]+}}, |[[A]]|
+define void @fabs_fpext_f16_to_f32(
+    float addrspace(1)* %r,
+    half addrspace(1)* %a) {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %a.fabs = call half @llvm.fabs.f16(half %a.val)
+  %r.val = fpext half %a.fabs to float
+  store float %r.val, float addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}fneg_fabs_fpext_f16_to_f32:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; GCN: v_cvt_f32_f16_e64 v{{[0-9]+}}, -|[[A]]|
+define void @fneg_fabs_fpext_f16_to_f32(
+    float addrspace(1)* %r,
+    half addrspace(1)* %a) {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %a.fabs = call half @llvm.fabs.f16(half %a.val)
+  %a.fneg.fabs = fsub half -0.0, %a.fabs
+  %r.val = fpext half %a.fneg.fabs to float
+  store float %r.val, float addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}fneg_multi_use_fpext_f16_to_f32:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; GCN-DAG: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[A]]
+
+; FIXME: Using the source modifier here only wastes code size
+; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[A]]
+; VI-DAG: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]]
+
+; GCN: store_dword [[CVT]]
+; GCN: store_short [[XOR]]
+define void @fneg_multi_use_fpext_f16_to_f32(
+    float addrspace(1)* %r,
+    half addrspace(1)* %a) {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %a.neg = fsub half -0.0, %a.val
+  %r.val = fpext half %a.neg to float
+  store volatile float %r.val, float addrspace(1)* %r
+  store volatile half %a.neg, half addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}fneg_multi_foldable_use_fpext_f16_to_f32:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; GCN-DAG: v_cvt_f32_f16_e64 [[CVTA_NEG:v[0-9]+]], -[[A]]
+; SI-DAG: v_cvt_f32_f16_e32 [[CVTA:v[0-9]+]], [[A]]
+; SI: v_mul_f32_e32 [[MUL_F32:v[0-9]+]], [[CVTA]], [[CVTA_NEG]]
+; SI: v_cvt_f16_f32_e32 [[MUL:v[0-9]+]], [[MUL_F32]]
+
+; VI-DAG: v_cvt_f32_f16_e64 [[CVT_NEGA:v[0-9]+]], -[[A]]
+; VI: v_mul_f16_e64 [[MUL:v[0-9]+]], -[[A]], [[A]]
+
+; GCN: buffer_store_dword [[CVTA_NEG]]
+; GCN: buffer_store_short [[MUL]]
+define void @fneg_multi_foldable_use_fpext_f16_to_f32(
+    float addrspace(1)* %r,
+    half addrspace(1)* %a) {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %a.neg = fsub half -0.0, %a.val
+  %r.val = fpext half %a.neg to float
+  %mul = fmul half %a.neg, %a.val
+  store volatile float %r.val, float addrspace(1)* %r
+  store volatile half %mul, half addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}fabs_multi_use_fpext_f16_to_f32:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; GCN-DAG: v_and_b32_e32 [[XOR:v[0-9]+]], 0x7fff, [[A]]
+
+; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[A]]
+; VI-DAG: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], |[[A]]|
+
+; GCN: store_dword [[CVT]]
+; GCN: store_short [[XOR]]
+define void @fabs_multi_use_fpext_f16_to_f32(
+    float addrspace(1)* %r,
+    half addrspace(1)* %a) {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %a.fabs = call half @llvm.fabs.f16(half %a.val)
+  %r.val = fpext half %a.fabs to float
+  store volatile float %r.val, float addrspace(1)* %r
+  store volatile half %a.fabs, half addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}fabs_multi_foldable_use_fpext_f16_to_f32:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; SI: v_cvt_f32_f16_e32 [[CVTA:v[0-9]+]], [[A]]
+; SI: v_mul_f32_e64 [[MUL_F32:v[0-9]+]], |[[CVTA]]|, [[CVTA]]
+; SI: v_cvt_f16_f32_e32 [[MUL:v[0-9]+]], [[MUL_F32]]
+; SI: v_and_b32_e32 [[ABS_A:v[0-9]+]], 0x7fffffff, [[CVTA]]
+
+; VI-DAG: v_cvt_f32_f16_e64 [[ABS_A:v[0-9]+]], |[[A]]|
+; VI: v_mul_f16_e64 [[MUL:v[0-9]+]], |[[A]]|, [[A]]
+
+; GCN: buffer_store_dword [[ABS_A]]
+; GCN: buffer_store_short [[MUL]]
+define void @fabs_multi_foldable_use_fpext_f16_to_f32(
+    float addrspace(1)* %r,
+    half addrspace(1)* %a) {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %a.fabs = call half @llvm.fabs.f16(half %a.val)
+  %r.val = fpext half %a.fabs to float
+  %mul = fmul half %a.fabs, %a.val
+  store volatile float %r.val, float addrspace(1)* %r
+  store volatile half %mul, half addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}fabs_fneg_multi_use_fpext_f16_to_f32:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; GCN-DAG: v_or_b32_e32 [[OR:v[0-9]+]], 0x8000, [[A]]
+
+; SI: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[OR]]
+; VI-DAG: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -|[[OR]]|
+
+; GCN: buffer_store_dword [[CVT]]
+; GCN: buffer_store_short [[OR]]
+define void @fabs_fneg_multi_use_fpext_f16_to_f32(
+    float addrspace(1)* %r,
+    half addrspace(1)* %a) {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %a.fabs = call half @llvm.fabs.f16(half %a.val)
+  %a.fneg.fabs = fsub half -0.0, %a.fabs
+  %r.val = fpext half %a.fneg.fabs to float
+  store volatile float %r.val, float addrspace(1)* %r
+  store volatile half %a.fneg.fabs, half addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}fabs_fneg_multi_foldable_use_fpext_f16_to_f32:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; SI: v_cvt_f32_f16_e32 [[CVTA:v[0-9]+]], [[A]]
+; SI: v_mul_f32_e64 [[MUL_F32:v[0-9]+]], -|[[CVTA]]|, [[CVTA]]
+; SI: v_cvt_f16_f32_e32 [[MUL:v[0-9]+]], [[MUL_F32]]
+; SI: v_or_b32_e32 [[FABS_FNEG:v[0-9]+]], 0x80000000, [[CVTA]]
+
+; VI-DAG: v_cvt_f32_f16_e64 [[FABS_FNEG:v[0-9]+]], -|[[A]]|
+; VI-DAG: v_mul_f16_e64 [[MUL:v[0-9]+]], -|[[A]]|, [[A]]
+
+; GCN: buffer_store_dword [[FABS_FNEG]]
+; GCN: buffer_store_short [[MUL]]
+define void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32(
+    float addrspace(1)* %r,
+    half addrspace(1)* %a) {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %a.fabs = call half @llvm.fabs.f16(half %a.val)
+  %a.fneg.fabs = fsub half -0.0, %a.fabs
+  %r.val = fpext half %a.fneg.fabs to float
+  %mul = fmul half %a.fneg.fabs, %a.val
+  store volatile float %r.val, float addrspace(1)* %r
+  store volatile half %mul, half addrspace(1)* undef
+  ret void
+}
+
+declare half @llvm.fabs.f16(half) #1
+
+attributes #1 = { nounwind readnone }
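
A note on the fp_to_f16 pattern in SIInstructions.td that the fptrunc tests below exercise: VOP3Mods0 is the existing complex pattern that matches an f32 operand together with any fneg/fabs wrapped around it (plus the default clamp/omod operands), binding the modifiers as $src0_modifiers. A truncation of a negated absolute value therefore selects directly to the _e64 form, e.g. (sketch only; condensed from the fneg_fabs_fptrunc_f32_to_f16 test below):

    ; fptrunc (fneg (fabs x)) selects as v_cvt_f16_f32_e64 vN, -|vM|
    define void @fneg_fabs_fptrunc_sketch(half addrspace(1)* %r, float %x) {
      %fabs = call float @llvm.fabs.f32(float %x)
      %neg = fsub float -0.0, %fabs
      %trunc = fptrunc float %neg to half
      store half %trunc, half addrspace(1)* %r
      ret void
    }
    declare float @llvm.fabs.f32(float)
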
Index: test/CodeGen/AMDGPU/fptrunc.f16.ll
===================================================================
--- test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
-; GCN-LABEL: {{^}}fptrunc_f32_to_f16
+; GCN-LABEL: {{^}}fptrunc_f32_to_f16:
 ; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
 ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
 ; GCN: buffer_store_short v[[R_F16]]
@@ -16,7 +16,7 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}fptrunc_f64_to_f16
+; GCN-LABEL: {{^}}fptrunc_f64_to_f16:
 ; GCN: buffer_load_dwordx2 v{{\[}}[[A_F64_0:[0-9]+]]:[[A_F64_1:[0-9]+]]{{\]}}
 ; GCN: v_cvt_f32_f64_e32 v[[A_F32:[0-9]+]], v{{\[}}[[A_F64_0]]:[[A_F64_1]]{{\]}}
 ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
@@ -32,7 +32,7 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}fptrunc_v2f32_to_v2f16
+; GCN-LABEL: {{^}}fptrunc_v2f32_to_v2f16:
 ; GCN: buffer_load_dwordx2 v{{\[}}[[A_F32_0:[0-9]+]]:[[A_F32_1:[0-9]+]]{{\]}}
 ; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]]
 ; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
@@ -51,7 +51,7 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}fptrunc_v2f64_to_v2f16
+; GCN-LABEL: {{^}}fptrunc_v2f64_to_v2f16:
 ; GCN: buffer_load_dwordx4 v{{\[}}[[A_F64_0:[0-9]+]]:[[A_F64_3:[0-9]+]]{{\]}}
 ; GCN: v_cvt_f32_f64_e32 v[[A_F32_0:[0-9]+]], v{{\[}}[[A_F64_0]]:{{[0-9]+}}{{\]}}
 ; GCN: v_cvt_f32_f64_e32 v[[A_F32_1:[0-9]+]], v{{\[}}{{[0-9]+}}:[[A_F64_3]]{{\]}}
@@ -70,3 +70,56 @@
   store <2 x half> %r.val, <2 x half> addrspace(1)* %r
   ret void
 }
+
+; GCN-LABEL: {{^}}fneg_fptrunc_f32_to_f16:
+; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
+; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], -v[[A_F32]]
+; GCN: buffer_store_short v[[R_F16]]
+; GCN: s_endpgm
+define void @fneg_fptrunc_f32_to_f16(
+    half addrspace(1)* %r,
+    float addrspace(1)* %a) {
+entry:
+  %a.val = load float, float addrspace(1)* %a
+  %a.fneg = fsub float -0.0, %a.val
+  %r.val = fptrunc float %a.fneg to half
+  store half %r.val, half addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}fabs_fptrunc_f32_to_f16:
+; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
+; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], |v[[A_F32]]|
+; GCN: buffer_store_short v[[R_F16]]
+; GCN: s_endpgm
+define void @fabs_fptrunc_f32_to_f16(
+    half addrspace(1)* %r,
+    float addrspace(1)* %a) {
+entry:
+  %a.val = load float, float addrspace(1)* %a
+  %a.fabs = call float @llvm.fabs.f32(float %a.val)
+  %r.val = fptrunc float %a.fabs to half
+  store half %r.val, half addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}fneg_fabs_fptrunc_f32_to_f16:
+; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
+; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], -|v[[A_F32]]|
+; GCN: buffer_store_short v[[R_F16]]
+; GCN: s_endpgm
+define void @fneg_fabs_fptrunc_f32_to_f16(
+    half addrspace(1)* %r,
+    float addrspace(1)* %a) {
+entry:
+  %a.val = load float, float addrspace(1)* %a
+  %a.fabs = call float @llvm.fabs.f32(float %a.val)
+  %a.fneg.fabs = fsub float -0.0, %a.fabs
+  %r.val = fptrunc float %a.fneg.fabs to half
+  store half %r.val, half addrspace(1)* %r
+  ret void
+}
+
+declare float @llvm.fabs.f32(float) #1
+
+attributes #1 = { nounwind readnone }
Index: test/CodeGen/AMDGPU/v_mac_f16.ll
===================================================================
--- test/CodeGen/AMDGPU/v_mac_f16.ll
+++ test/CodeGen/AMDGPU/v_mac_f16.ll
@@ -31,9 +31,10 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_f16_same_add
+; GCN-LABEL: {{^}}mac_f16_same_add:
 ; SI: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]]
 ; SI: v_mac_f32_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}}
+
 ; VI: v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]]
 ; VI: v_mac_f16_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: s_endpgm
@@ -63,9 +64,11 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_f16_neg_a
-; SI-NOT: v_mac_f32
-; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN-LABEL: {{^}}mac_f16_neg_a:
+; SI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG:v[0-9]+]], -v{{[0-9]+}}
+; SI-DAG: v_cvt_f32_f16_e32 [[CVT_OTHER:v[0-9]+]], v{{[0-9]+}}
+; SI: v_mac_f32_e32 v{{[0-9]+}}, [[CVT_OTHER]], [[CVT_NEG]]
+
 ; VI-NOT: v_mac_f16
 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: s_endpgm
@@ -87,9 +90,10 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_f16_neg_b
-; SI-NOT: v_mac_f32
-; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN-LABEL: {{^}}mac_f16_neg_b:
+; SI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG:v[0-9]+]], -v{{[0-9]+}}
+; SI-DAG: v_cvt_f32_f16_e32 [[CVT_OTHER:v[0-9]+]], v{{[0-9]+}}
+; SI: v_mac_f32_e32 v{{[0-9]+}}, [[CVT_OTHER]], [[CVT_NEG]]
 ; VI-NOT: v_mac_f16
 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: s_endpgm
@@ -111,9 +115,12 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_f16_neg_c
-; SI-NOT: v_mac_f32
-; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
+; GCN-LABEL: {{^}}mac_f16_neg_c:
+; SI: v_cvt_f32_f16_e32
+; SI-DAG: v_cvt_f32_f16_e32
+; SI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG:v[0-9]+]], -v{{[0-9]+}}
+; SI: v_mac_f32_e32 [[CVT_NEG]], v{{[0-9]+}}, v{{[0-9]+}}
+
 ; VI-NOT: v_mac_f16
 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
 ; GCN: s_endpgm
@@ -207,9 +214,11 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_f16_neg_a_unsafe_fp_math
-; SI-NOT: v_mac_f32
-; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}}
+; GCN-LABEL: {{^}}mac_f16_neg_a_unsafe_fp_math:
+; SI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG:v[0-9]+]], -v{{[0-9]+}}
+; SI-DAG: v_cvt_f32_f16_e32 [[CVT_OTHER:v[0-9]+]], v{{[0-9]+}}
+; SI: v_mac_f32_e32 v{{[0-9]+}}, [[CVT_OTHER]], [[CVT_NEG]]
+
 ; VI-NOT: v_mac_f16
 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}}
 ; GCN: s_endpgm
@@ -231,9 +240,11 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_f16_neg_b_unsafe_fp_math
-; SI-NOT: v_mac_f32
-; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}}
+; GCN-LABEL: {{^}}mac_f16_neg_b_unsafe_fp_math:
+; SI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG:v[0-9]+]], -v{{[0-9]+}}
+; SI-DAG: v_cvt_f32_f16_e32 [[CVT_OTHER:v[0-9]+]], v{{[0-9]+}}
+; SI: v_mac_f32_e32 v{{[0-9]+}}, [[CVT_OTHER]], [[CVT_NEG]]
+
 ; VI-NOT: v_mac_f16
 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}}
 ; GCN: s_endpgm
@@ -255,9 +266,12 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_f16_neg_c_unsafe_fp_math
-; SI-NOT: v_mac_f32
-; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]}}
+; GCN-LABEL: {{^}}mac_f16_neg_c_unsafe_fp_math:
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG:v[0-9]+]], -v{{[0-9]+}}
+; SI: v_mac_f32_e32 [[CVT_NEG]], v{{[0-9]+}}, v{{[0-9]+}}
+
 ; VI-NOT: v_mac_f16
 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]}}
 ; GCN: s_endpgm
@@ -279,7 +293,7 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_v2f16
+; GCN-LABEL: {{^}}mac_v2f16:
 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
 ; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]]
@@ -322,7 +336,7 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_v2f16_same_add
+; GCN-LABEL: {{^}}mac_v2f16_same_add:
 ; SI: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD0:v[0-9]+]]
 ; SI: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD1:v[0-9]+]]
 ; SI: v_mac_f32_e32 [[ADD0]], v{{[0-9]+}}, v{{[0-9]+}}
@@ -358,10 +372,13 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_v2f16_neg_a
-; SI-NOT: v_mac_f32
-; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN-LABEL: {{^}}mac_v2f16_neg_a:
+; SI: v_cvt_f32_f16_e64 [[CVT_NEG0:v[0-9]+]], -{{v[0-9]+}}
+; SI: v_cvt_f32_f16_e64 [[CVT_NEG1:v[0-9]+]], -{{v[0-9]+}}
+
+; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG0]]
+; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG1]]
+
 ; VI-NOT: v_mac_f16
 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
@@ -385,9 +402,12 @@
 }
 
 ; GCN-LABEL: {{^}}mac_v2f16_neg_b
-; SI-NOT: v_mac_f32
-; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SI: v_cvt_f32_f16_e64 [[CVT_NEG0:v[0-9]+]], -{{v[0-9]+}}
+; SI: v_cvt_f32_f16_e64 [[CVT_NEG1:v[0-9]+]], -{{v[0-9]+}}
+; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG0]]
+; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG1]]
+
+
 ; VI-NOT: v_mac_f16
 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
@@ -410,10 +430,13 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_v2f16_neg_c
-; SI-NOT: v_mac_f32
-; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
-; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
+; GCN-LABEL: {{^}}mac_v2f16_neg_c:
+; SI: v_cvt_f32_f16_e64 [[CVT_NEG0:v[0-9]+]], -{{v[0-9]+}}
+; SI: v_cvt_f32_f16_e64 [[CVT_NEG1:v[0-9]+]], -{{v[0-9]+}}
+
+; SI-DAG: v_mac_f32_e32 [[CVT_NEG0]], v{{[0-9]+}}, v{{[0-9]+}}
+; SI-DAG: v_mac_f32_e32 [[CVT_NEG1]], v{{[0-9]+}}, v{{[0-9]+}}
+
 ; VI-NOT: v_mac_f16
 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
@@ -464,7 +487,7 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_v2f16_neg_b_safe_fp_math
+; GCN-LABEL: {{^}}mac_v2f16_neg_b_safe_fp_math:
 ; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
 ; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
 ; SI: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}}
@@ -492,7 +515,7 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_v2f16_neg_c_safe_fp_math
+; GCN-LABEL: {{^}}mac_v2f16_neg_c_safe_fp_math:
 ; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
 ; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
 ; SI: v_mac_f32_e32 v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}}
@@ -520,10 +543,13 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_v2f16_neg_a_unsafe_fp_math
-; SI-NOT: v_mac_f32
-; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
-; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
+; GCN-LABEL: {{^}}mac_v2f16_neg_a_unsafe_fp_math:
+; SI: v_cvt_f32_f16_e64 [[CVT_NEG0:v[0-9]+]], -{{v[0-9]+}}
+; SI: v_cvt_f32_f16_e64 [[CVT_NEG1:v[0-9]+]], -{{v[0-9]+}}
+
+; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG0]]
+; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG1]]
+
 ; VI-NOT: v_mac_f16
 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
@@ -546,10 +572,13 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_v2f16_neg_b_unsafe_fp_math
-; SI-NOT: v_mac_f32
-; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
-; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
+; GCN-LABEL: {{^}}mac_v2f16_neg_b_unsafe_fp_math:
+; SI: v_cvt_f32_f16_e64 [[CVT_NEG0:v[0-9]+]], -{{v[0-9]+}}
+; SI: v_cvt_f32_f16_e64 [[CVT_NEG1:v[0-9]+]], -{{v[0-9]+}}
+
+; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG0]]
+; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG1]]
+
 ; VI-NOT: v_mac_f16
 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
@@ -572,10 +601,13 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_v2f16_neg_c_unsafe_fp_math
-; SI-NOT: v_mac_f32
-; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}}
-; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}}
+; GCN-LABEL: {{^}}mac_v2f16_neg_c_unsafe_fp_math:
+; SI: v_cvt_f32_f16_e64 [[CVT_NEG0:v[0-9]+]], -{{v[0-9]+}}
+; SI: v_cvt_f32_f16_e64 [[CVT_NEG1:v[0-9]+]], -{{v[0-9]+}}
+
+; SI-DAG: v_mac_f32_e32 [[CVT_NEG0]], v{{[0-9]+}}, v{{[0-9]+}}
+; SI-DAG: v_mac_f32_e32 [[CVT_NEG1]], v{{[0-9]+}}, v{{[0-9]+}}
+
 ; VI-NOT: v_mac_f16
 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}}
 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}}