Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -485,6 +485,7 @@ // Target Information //===----------------------------------------------------------------------===// +LLVM_READNONE static bool fnegFoldsIntoOp(unsigned Opc) { switch (Opc) { case ISD::FADD: @@ -506,6 +507,59 @@ } } +/// \returns true if the operation will definitely need to use a 64-bit +/// encoding, and thus will use a VOP3 encoding regardless of the source +/// modifiers. +LLVM_READONLY +static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) { + return N->getNumOperands() > 2 || VT == MVT::f64; +} + +// Most FP instructions support source modifiers, but this could be refined +// slightly. +LLVM_READONLY +static bool hasSourceMods(const SDNode *N) { + if (isa<MemSDNode>(N)) + return false; + + switch (N->getOpcode()) { + case ISD::CopyToReg: + case ISD::SELECT: + case ISD::FDIV: + case ISD::FREM: + case ISD::INLINEASM: + case AMDGPUISD::INTERP_P1: + case AMDGPUISD::INTERP_P2: + case AMDGPUISD::DIV_SCALE: + return false; + default: + return true; + } +} + +static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold = 10) { + // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus + // it is truly free to use a source modifier in all cases. If there are + // multiple users and each one would be forced to use VOP3, there will be + // a code size increase. Try to avoid increasing code size unless we know it + // will save on the instruction count. + unsigned NumMayIncreaseSize = 0; + MVT VT = N->getValueType(0).getScalarType().getSimpleVT(); + + // XXX - Should this limit number of uses to check? + for (const SDNode *U : N->uses()) { + if (!hasSourceMods(U)) + return false; + + if (!opMustUseVOP3Encoding(U, VT)) { + if (++NumMayIncreaseSize > CostThreshold) + return false; + } + } + + return true; +} + MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const { return MVT::i32; } @@ -2851,10 +2905,16 @@ // the other uses cannot, give up. This both prevents unprofitable // transformations and infinite loops: we won't repeatedly try to fold around // a negate that has no 'good' form. - // - // TODO: Check users can fold - if (fnegFoldsIntoOp(Opc) && !N0.hasOneUse()) - return SDValue(); + if (N0.hasOneUse()) { + // This may be able to fold into the source, but at a code size cost. Don't + // fold if the fold into the user is free.
+ if (allUsesHaveSourceMods(N, 0)) + return SDValue(); + } else { + if (fnegFoldsIntoOp(Opc) && + (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode()))) + return SDValue(); + } SDLoc SL(N); switch (Opc) { Index: test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll =================================================================== --- test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll +++ test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll @@ -19,8 +19,8 @@ ; VI: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, |v{{[0-9]+}}| ; VI: v_cndmask_b32_e32 ; VI: v_add_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}| -; VI: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} -; VI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 1.0 +; VI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0 define void @multiple_fadd_use_test_f32(float addrspace(1)* %out, float %x, float %y, float %z) #0 { %a11 = fadd fast float %y, -1.0 %a12 = call float @llvm.fabs.f32(float %a11) @@ -114,8 +114,8 @@ ; VI: v_cmp_gt_f16_e64 vcc, |v{{[0-9]+}}|, |v{{[0-9]+}}| ; VI: v_cndmask_b32_e32 ; VI: v_add_f16_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}| -; VI: v_mul_f16_e64 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} -; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 1.0 +; VI: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0 define void @multiple_fadd_use_test_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 { %x = bitcast i16 %x.arg to half %y = bitcast i16 %y.arg to half Index: test/CodeGen/AMDGPU/fneg-combines.ll =================================================================== --- test/CodeGen/AMDGPU/fneg-combines.ll +++ test/CodeGen/AMDGPU/fneg-combines.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tahiti -start-after=sink -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s ; -------------------------------------------------------------------------------- ; fadd tests ; -------------------------------------------------------------------------------- @@ -48,9 +48,8 @@ ; GCN-LABEL: {{^}}v_fneg_add_multi_use_add_f32: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] -; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]] -; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]] +; GCN: v_sub_f32_e64 [[NEG_ADD:v[0-9]+]], -[[A]], [[B]] +; GCN-NEXT: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[NEG_ADD]], 4.0 ; GCN-NEXT: buffer_store_dword [[NEG_ADD]] ; GCN-NEXT: buffer_store_dword [[MUL]] define void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { @@ -225,11 +224,10 @@ ; GCN-LABEL: {{^}}v_fneg_mul_multi_use_mul_f32: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN-DAG: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] -; GCN-DAG: v_xor_b32_e32 [[NEG_MUL:v[0-9]+]], 0x80000000, [[ADD]] -; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]] -; GCN-NEXT: buffer_store_dword [[NEG_MUL]] -; GCN: buffer_store_dword [[MUL]] +; GCN: v_mul_f32_e64 [[MUL0:v[0-9]+]], [[A]], -[[B]] +; GCN-NEXT: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[MUL0]], 4.0 +; GCN-NEXT: buffer_store_dword [[MUL0]] +; GCN-NEXT: buffer_store_dword [[MUL1]] define void @v_fneg_mul_multi_use_mul_f32(float
addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -409,9 +407,8 @@ ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] -; GCN-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]] -; GCN-DAG: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]] -; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[FMA]] +; GCN: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], -[[B]], -[[C]] +; GCN-NEXT: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[NEG_FMA]], 4.0 ; GCN-NEXT: buffer_store_dword [[NEG_FMA]] ; GCN-NEXT: buffer_store_dword [[MUL]] define void @v_fneg_fma_multi_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { @@ -632,10 +629,9 @@ ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] -; GCN-DAG: v_mac_f32_e32 [[C]], [[B]], [[A]] -; GCN-DAG: v_xor_b32_e32 [[NEG_C:v[0-9]+]], 0x80000000, [[C]] -; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[C]] -; GCN-NEXT: buffer_store_dword [[NEG_C]] +; GCN: v_mad_f32 [[NEG_MAD:v[0-9]+]], -[[A]], [[B]], -[[C]] +; GCN-NEXT: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[NEG_MAD]], 4.0 +; GCN-NEXT: buffer_store_dword [[NEG_MAD]] ; GCN-NEXT: buffer_store_dword [[MUL]] define void @v_fneg_fmad_multi_use_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1351,6 +1347,387 @@ ret void } +; -------------------------------------------------------------------------------- +; vintrp tests +; -------------------------------------------------------------------------------- + +; GCN-LABEL: {{^}}v_fneg_interp_p1_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]] +; GCN: v_interp_p1_f32 v{{[0-9]+}}, [[MUL]] +; GCN: v_interp_p1_f32 v{{[0-9]+}}, [[MUL]] +define void @v_fneg_interp_p1_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %mul = fmul float %a, %b + %fneg = fsub float -0.0, %mul + %intrp0 = call float @llvm.amdgcn.interp.p1(float %fneg, i32 0, i32 0, i32 0) + %intrp1 = call float @llvm.amdgcn.interp.p1(float %fneg, i32 1, i32 0, i32 0) + store volatile float %intrp0, float addrspace(1)* %out.gep + store volatile float %intrp1, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_fneg_interp_p2_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]] +; GCN: v_interp_p2_f32 v{{[0-9]+}}, [[MUL]] +; GCN: v_interp_p2_f32 v{{[0-9]+}}, [[MUL]] +define void @v_fneg_interp_p2_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = 
getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %mul = fmul float %a, %b + %fneg = fsub float -0.0, %mul + %intrp0 = call float @llvm.amdgcn.interp.p2(float 4.0, float %fneg, i32 0, i32 0, i32 0) + %intrp1 = call float @llvm.amdgcn.interp.p2(float 4.0, float %fneg, i32 1, i32 0, i32 0) + store volatile float %intrp0, float addrspace(1)* %out.gep + store volatile float %intrp1, float addrspace(1)* %out.gep + ret void +} + +; -------------------------------------------------------------------------------- +; CopyToReg tests +; -------------------------------------------------------------------------------- + +; GCN-LABEL: {{^}}v_fneg_copytoreg_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] +; GCN: v_mul_f32_e32 [[MUL0:v[0-9]+]], [[B]], [[A]] +; GCN: s_cbranch_scc1 + +; GCN: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x80000000, [[MUL0]] +; GCN: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[C]], [[XOR]] +; GCN: buffer_store_dword [[MUL1]] + +; GCN: buffer_store_dword [[MUL0]] +define void @v_fneg_copytoreg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %c = load volatile float, float addrspace(1)* %c.gep + %mul = fmul float %a, %b + %fneg = fsub float -0.0, %mul + %cmp0 = icmp eq i32 %d, 0 + br i1 %cmp0, label %if, label %endif + +if: + %mul1 = fmul float %fneg, %c + store volatile float %mul1, float addrspace(1)* %out.gep + br label %endif + +endif: + store volatile float %mul, float addrspace(1)* %out.gep + ret void +} + +; -------------------------------------------------------------------------------- +; inlineasm tests +; -------------------------------------------------------------------------------- + +; Can't fold into use, so should fold into source +; GCN-LABEL: {{^}}v_fneg_inlineasm_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]] +; GCN: ; use [[MUL]] +; GCN: buffer_store_dword [[MUL]] +define void @v_fneg_inlineasm_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load 
volatile float, float addrspace(1)* %b.gep + %c = load volatile float, float addrspace(1)* %c.gep + %mul = fmul float %a, %b + %fneg = fsub float -0.0, %mul + call void asm sideeffect "; use $0", "v"(float %fneg) #0 + store volatile float %fneg, float addrspace(1)* %out.gep + ret void +} + +; -------------------------------------------------------------------------------- +; inlineasm tests +; -------------------------------------------------------------------------------- + +; Can't fold into use, so should fold into source +; GCN-LABEL: {{^}}v_fneg_inlineasm_multi_use_src_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]] +; GCN: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[MUL]] +; GCN: ; use [[NEG]] +; GCN: buffer_store_dword [[MUL]] +define void @v_fneg_inlineasm_multi_use_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %c = load volatile float, float addrspace(1)* %c.gep + %mul = fmul float %a, %b + %fneg = fsub float -0.0, %mul + call void asm sideeffect "; use $0", "v"(float %fneg) #0 + store volatile float %mul, float addrspace(1)* %out.gep + ret void +} + +; -------------------------------------------------------------------------------- +; code size regression tests +; -------------------------------------------------------------------------------- + +; There are multiple users of the fneg that must use a VOP3 +; instruction, so there is no penalty +; GCN-LABEL: {{^}}multiuse_fneg_2_vop3_users_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] + +; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[A]], [[B]], [[C]] +; GCN-NEXT: v_fma_f32 [[FMA1:v[0-9]+]], -[[A]], [[C]], 2.0 +; GCN-NEXT: buffer_store_dword [[FMA0]] +; GCN-NEXT: buffer_store_dword [[FMA1]] +define void @multiuse_fneg_2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %c = load volatile float, float addrspace(1)* %c.gep + + %fneg.a = fsub float -0.0, %a + %fma0 = call float @llvm.fma.f32(float %fneg.a, float %b, float %c) + %fma1 = call float @llvm.fma.f32(float %fneg.a, float %c, float 2.0) + + store volatile float %fma0, float addrspace(1)* %out + store volatile float %fma1, float addrspace(1)* %out + ret void +} + +; There are multiple users, 
but both require using a larger encoding +; for the modifier. + +; GCN-LABEL: {{^}}multiuse_fneg_2_vop2_users_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] + +; GCN: v_mul_f32_e64 [[MUL0:v[0-9]+]], -[[A]], [[B]] +; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]] +; GCN-NEXT: buffer_store_dword [[MUL0]] +; GCN-NEXT: buffer_store_dword [[MUL1]] +define void @multiuse_fneg_2_vop2_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %c = load volatile float, float addrspace(1)* %c.gep + + %fneg.a = fsub float -0.0, %a + %mul0 = fmul float %fneg.a, %b + %mul1 = fmul float %fneg.a, %c + + store volatile float %mul0, float addrspace(1)* %out + store volatile float %mul1, float addrspace(1)* %out + ret void +} + +; One user is VOP3 so has no cost to folding the modifier, the other does. +; GCN-LABEL: {{^}}multiuse_fneg_vop2_vop3_users_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] + +; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[A]], [[B]], 2.0 +; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]] + +; GCN: buffer_store_dword [[FMA0]] +; GCN-NEXT: buffer_store_dword [[MUL1]] +define void @multiuse_fneg_vop2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %c = load volatile float, float addrspace(1)* %c.gep + + %fneg.a = fsub float -0.0, %a + %fma0 = call float @llvm.fma.f32(float %fneg.a, float %b, float 2.0) + %mul1 = fmul float %fneg.a, %c + + store volatile float %fma0, float addrspace(1)* %out + store volatile float %mul1, float addrspace(1)* %out + ret void +} + +; The use of the fneg requires a code size increase, but folding into +; the source does not + +; GCN-LABEL: {{^}}free_fold_src_code_size_cost_use_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[D:v[0-9]+]] + +; GCN: v_fma_f32 [[FMA0:v[0-9]+]], [[A]], -[[B]], -2.0 +; GCN-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[C]], [[FMA0]] +; GCN-DAG: v_mul_f32_e32 [[MUL2:v[0-9]+]], [[D]], [[FMA0]] + +; GCN: buffer_store_dword [[MUL1]] +; GCN-NEXT: buffer_store_dword [[MUL2]] +define void @free_fold_src_code_size_cost_use_f32(float 
addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext + %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %c = load volatile float, float addrspace(1)* %c.gep + %d = load volatile float, float addrspace(1)* %d.gep + + %fma0 = call float @llvm.fma.f32(float %a, float %b, float 2.0) + %fneg.fma0 = fsub float -0.0, %fma0 + %mul1 = fmul float %fneg.fma0, %c + %mul2 = fmul float %fneg.fma0, %d + + store volatile float %mul1, float addrspace(1)* %out + store volatile float %mul2, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}free_fold_src_code_size_cost_use_f64: +; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] +; GCN: {{buffer|flat}}_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]] +; GCN: {{buffer|flat}}_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]] +; GCN: {{buffer|flat}}_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]] + +; GCN: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], 2.0 +; GCN-DAG: v_mul_f64 [[MUL0:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[C]] +; GCN-DAG: v_mul_f64 [[MUL1:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[D]] + +; GCN: buffer_store_dwordx2 [[MUL0]] +; GCN: buffer_store_dwordx2 [[MUL1]] +define void @free_fold_src_code_size_cost_use_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr, double addrspace(1)* %b.ptr, double addrspace(1)* %c.ptr, double addrspace(1)* %d.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds double, double addrspace(1)* %b.ptr, i64 %tid.ext + %c.gep = getelementptr inbounds double, double addrspace(1)* %c.ptr, i64 %tid.ext + %d.gep = getelementptr inbounds double, double addrspace(1)* %d.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext + %a = load volatile double, double addrspace(1)* %a.gep + %b = load volatile double, double addrspace(1)* %b.gep + %c = load volatile double, double addrspace(1)* %c.gep + %d = load volatile double, double addrspace(1)* %d.gep + + %fma0 = call double @llvm.fma.f64(double %a, double %b, double 2.0) + %fneg.fma0 = fsub double -0.0, %fma0 + %mul1 = fmul double %fneg.fma0, %c + %mul2 = fmul double %fneg.fma0, %d + + store volatile double %mul1, double addrspace(1)* %out + store volatile double %mul2, double addrspace(1)* %out + ret void +} + +; %trunc.a has one fneg use, but it requires a code size increase and +; the fneg can instead be folded for free into the fma.
+ +; GCN-LABEL: {{^}}one_use_cost_to_fold_into_src_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] +; GCN: v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]] +; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]] +; GCN: buffer_store_dword [[FMA0]] +define void @one_use_cost_to_fold_into_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext + %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %c = load volatile float, float addrspace(1)* %c.gep + %d = load volatile float, float addrspace(1)* %d.gep + + %trunc.a = call float @llvm.trunc.f32(float %a) + %trunc.fneg.a = fsub float -0.0, %trunc.a + %fma0 = call float @llvm.fma.f32(float %trunc.fneg.a, float %b, float %c) + store volatile float %fma0, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}multi_use_cost_to_fold_into_src: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[D:v[0-9]+]] +; GCN: v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]] +; GCN-DAG: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]] +; GCN-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[D]], [[TRUNC_A]] +; GCN: buffer_store_dword [[FMA0]] +; GCN: buffer_store_dword [[MUL1]] +define void @multi_use_cost_to_fold_into_src(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext + %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %c = load volatile float, float addrspace(1)* %c.gep + %d = load volatile float, float addrspace(1)* %d.gep + + %trunc.a = call float @llvm.trunc.f32(float %a) + %trunc.fneg.a = fsub float -0.0, %trunc.a + %fma0 = call float @llvm.fma.f32(float %trunc.fneg.a, float %b, float %c) + %mul1 = fmul float %trunc.a, %d + store volatile float %fma0, float addrspace(1)* %out + store volatile float %mul1, float addrspace(1)* %out + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() #1 declare float @llvm.fma.f32(float, float, float) #1 declare float @llvm.fmuladd.f32(float, float, float) #1 @@ -1360,10 +1737,14 @@ declare float @llvm.rint.f32(float) #1 declare float @llvm.nearbyint.f32(float) #1 +declare double @llvm.fma.f64(double, double, double) #1 + 
declare float @llvm.amdgcn.sin.f32(float) #1 declare float @llvm.amdgcn.rcp.f32(float) #1 declare float @llvm.amdgcn.rcp.legacy(float) #1 declare float @llvm.amdgcn.fmul.legacy(float, float) #1 +declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #0 +declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #0 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/frem.ll =================================================================== --- test/CodeGen/AMDGPU/frem.ll +++ test/CodeGen/AMDGPU/frem.ll @@ -12,8 +12,8 @@ ; GCN: v_mul_f32_e32 ; GCN: v_div_fmas_f32 ; GCN: v_div_fixup_f32 -; GCN: v_trunc_f32_e64 v{{[0-9]+}}, -v{{[0-9]+}} -; GCN: v_mac_f32_e32 +; GCN: v_trunc_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: s_endpgm define void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) #0 { @@ -28,12 +28,11 @@ ; FUNC-LABEL: {{^}}unsafe_frem_f32: ; GCN: buffer_load_dword [[Y:v[0-9]+]], {{.*}} offset:16 ; GCN: buffer_load_dword [[X:v[0-9]+]], {{.*}} -; GCN: v_rcp_f32_e64 [[INVY:v[0-9]+]], -[[Y]] +; GCN: v_rcp_f32_e32 [[INVY:v[0-9]+]], [[Y]] ; GCN: v_mul_f32_e32 [[DIV:v[0-9]+]], [[INVY]], [[X]] ; GCN: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[DIV]] -; GCN: v_mac_f32_e32 [[X]], [[Y]], [[TRUNC]] -; GCN: buffer_store_dword [[X]] -; GCN: s_endpgm +; GCN: v_mad_f32 [[RESULT:v[0-9]+]], -[[TRUNC]], [[Y]], [[X]] +; GCN: buffer_store_dword [[RESULT]] define void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) #1 { %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4 Index: test/CodeGen/AMDGPU/v_mac_f16.ll =================================================================== --- test/CodeGen/AMDGPU/v_mac_f16.ll +++ test/CodeGen/AMDGPU/v_mac_f16.ll @@ -65,9 +65,10 @@ } ; GCN-LABEL: {{^}}mac_f16_neg_a: -; SI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG:v[0-9]+]], -v{{[0-9]+}} -; SI-DAG: v_cvt_f32_f16_e32 [[CVT_OTHER:v[0-9]+]], v{{[0-9]+}} -; SI: v_mac_f32_e32 v{{[0-9]+}}, [[CVT_OTHER]], [[CVT_NEG]] +; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}} +; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]] ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} @@ -91,9 +92,11 @@ } ; GCN-LABEL: {{^}}mac_f16_neg_b: -; SI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG:v[0-9]+]], -v{{[0-9]+}} -; SI-DAG: v_cvt_f32_f16_e32 [[CVT_OTHER:v[0-9]+]], v{{[0-9]+}} -; SI: v_mac_f32_e32 v{{[0-9]+}}, [[CVT_OTHER]], [[CVT_NEG]] +; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}} +; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]] + ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: s_endpgm @@ -117,9 +120,9 @@ ; GCN-LABEL: {{^}}mac_f16_neg_c: ; SI: v_cvt_f32_f16_e32 -; SI-DAG: v_cvt_f32_f16_e32 -; SI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG:v[0-9]+]], -v{{[0-9]+}} -; SI: v_mac_f32_e32 [[CVT_NEG]], v{{[0-9]+}}, v{{[0-9]+}} +; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 +; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} @@ -215,9 +218,10 @@ } ; GCN-LABEL: {{^}}mac_f16_neg_a_unsafe_fp_math: -; SI-DAG: v_cvt_f32_f16_e64 
[[CVT_NEG:v[0-9]+]], -v{{[0-9]+}} -; SI-DAG: v_cvt_f32_f16_e32 [[CVT_OTHER:v[0-9]+]], v{{[0-9]+}} -; SI: v_mac_f32_e32 v{{[0-9]+}}, [[CVT_OTHER]], [[CVT_NEG]] +; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}} +; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]] ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}} @@ -241,9 +245,10 @@ } ; GCN-LABEL: {{^}}mac_f16_neg_b_unsafe_fp_math: -; SI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG:v[0-9]+]], -v{{[0-9]+}} -; SI-DAG: v_cvt_f32_f16_e32 [[CVT_OTHER:v[0-9]+]], v{{[0-9]+}} -; SI: v_mac_f32_e32 v{{[0-9]+}}, [[CVT_OTHER]], [[CVT_NEG]] +; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}} +; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]] ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}} @@ -267,10 +272,10 @@ } ; GCN-LABEL: {{^}}mac_f16_neg_c_unsafe_fp_math: -; SI: v_cvt_f32_f16_e32 -; SI: v_cvt_f32_f16_e32 -; SI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG:v[0-9]+]], -v{{[0-9]+}} -; SI: v_mac_f32_e32 [[CVT_NEG]], v{{[0-9]+}}, v{{[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}} +; SI: v_mad_f32 v{{[0-9]+}}, [[CVT_A]], [[CVT_B]], -[[CVT_C]] ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]}} @@ -373,11 +378,11 @@ } ; GCN-LABEL: {{^}}mac_v2f16_neg_a: -; SI: v_cvt_f32_f16_e64 [[CVT_NEG0:v[0-9]+]], -{{v[0-9]+}} -; SI: v_cvt_f32_f16_e64 [[CVT_NEG1:v[0-9]+]], -{{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}} -; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG0]] -; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG1]] +; SI-DAG: v_mad_f32 v{{[0-9]+}}, -[[CVT0]], v{{[0-9]+}}, v{{[0-9]+}} +; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, [[CVT1]], v{{[0-9]+}} ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} @@ -402,10 +407,10 @@ } ; GCN-LABEL: {{^}}mac_v2f16_neg_b -; SI: v_cvt_f32_f16_e64 [[CVT_NEG0:v[0-9]+]], -{{v[0-9]+}} -; SI: v_cvt_f32_f16_e64 [[CVT_NEG1:v[0-9]+]], -{{v[0-9]+}} -; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG0]] -; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG1]] +; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}} +; SI-DAG: v_mad_f32 v{{[0-9]+}}, -[[CVT0]], v{{[0-9]+}}, v{{[0-9]+}} +; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, [[CVT1]], v{{[0-9]+}} ; VI-NOT: v_mac_f16 @@ -431,11 +436,15 @@ } ; GCN-LABEL: {{^}}mac_v2f16_neg_c: -; SI: v_cvt_f32_f16_e64 [[CVT_NEG0:v[0-9]+]], -{{v[0-9]+}} -; SI: v_cvt_f32_f16_e64 [[CVT_NEG1:v[0-9]+]], -{{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}} -; SI-DAG: v_mac_f32_e32 [[CVT_NEG0]], v{{[0-9]+}}, v{{[0-9]+}} -; SI-DAG: v_mac_f32_e32 [[CVT_NEG1]], v{{[0-9]+}}, v{{[0-9]+}} +; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -[[CVT2]] +; SI-DAG: 
v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -[[CVT5]] ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} @@ -544,11 +553,15 @@ } ; GCN-LABEL: {{^}}mac_v2f16_neg_a_unsafe_fp_math: -; SI: v_cvt_f32_f16_e64 [[CVT_NEG0:v[0-9]+]], -{{v[0-9]+}} -; SI: v_cvt_f32_f16_e64 [[CVT_NEG1:v[0-9]+]], -{{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}} -; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG0]] -; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG1]] +; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} @@ -573,11 +586,15 @@ } ; GCN-LABEL: {{^}}mac_v2f16_neg_b_unsafe_fp_math: -; SI: v_cvt_f32_f16_e64 [[CVT_NEG0:v[0-9]+]], -{{v[0-9]+}} -; SI: v_cvt_f32_f16_e64 [[CVT_NEG1:v[0-9]+]], -{{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}} -; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG0]] -; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_NEG1]] +; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} @@ -602,11 +619,15 @@ } ; GCN-LABEL: {{^}}mac_v2f16_neg_c_unsafe_fp_math: -; SI: v_cvt_f32_f16_e64 [[CVT_NEG0:v[0-9]+]], -{{v[0-9]+}} -; SI: v_cvt_f32_f16_e64 [[CVT_NEG1:v[0-9]+]], -{{v[0-9]+}} - -; SI-DAG: v_mac_f32_e32 [[CVT_NEG0]], v{{[0-9]+}}, v{{[0-9]+}} -; SI-DAG: v_mac_f32_e32 [[CVT_NEG1]], v{{[0-9]+}}, v{{[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}} +; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}} + +; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} +; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}}