Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -85,6 +85,7 @@
   SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  unsigned getFusedOpcode(const SelectionDAG &DAG, EVT VT) const;
   SDValue performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performFSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3906,24 +3906,32 @@
   return SDValue();
 }
 
+unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
+                                          EVT VT) const {
+  // Only do this if we are not trying to support denormals. v_mad_f32 does not
+  // support denormals ever.
+  if ((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
+      (VT == MVT::f16 && !Subtarget->hasFP16Denormals()))
+    return ISD::FMAD;
+
+  const TargetOptions &Options = DAG.getTarget().Options;
+  if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath) &&
+      isFMAFasterThanFMulAndFAdd(VT)) {
+    return ISD::FMA;
+  }
+
+  return 0;
+}
+
 SDValue SITargetLowering::performFAddCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
     return SDValue();
 
+  SelectionDAG &DAG = DCI.DAG;
   EVT VT = N->getValueType(0);
-  if (VT == MVT::f64)
-    return SDValue();
-
-  assert(!VT.isVector());
-
-  // Only do this if we are not trying to support denormals. v_mad_f32 does
-  // not support denormals ever.
-  if ((VT == MVT::f32 && Subtarget->hasFP32Denormals()) ||
-      (VT == MVT::f16 && Subtarget->hasFP16Denormals()))
-    return SDValue();
+  assert(!VT.isVector());
 
-  SelectionDAG &DAG = DCI.DAG;
   SDLoc SL(N);
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
@@ -3935,8 +3943,11 @@
   if (LHS.getOpcode() == ISD::FADD) {
     SDValue A = LHS.getOperand(0);
     if (A == LHS.getOperand(1)) {
-      const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
-      return DAG.getNode(ISD::FMAD, SL, VT, Two, A, RHS);
+      unsigned FusedOp = getFusedOpcode(DAG, VT);
+      if (FusedOp != 0) {
+        const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
+        return DAG.getNode(FusedOp, SL, VT, Two, A, RHS);
+      }
     }
   }
 
@@ -3944,8 +3955,11 @@
   if (RHS.getOpcode() == ISD::FADD) {
     SDValue A = RHS.getOperand(0);
     if (A == RHS.getOperand(1)) {
-      const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
-      return DAG.getNode(ISD::FMAD, SL, VT, Two, A, LHS);
+      unsigned FusedOp = getFusedOpcode(DAG, VT);
+      if (FusedOp != 0) {
+        const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
+        return DAG.getNode(FusedOp, SL, VT, Two, A, LHS);
+      }
     }
   }
 
@@ -3967,29 +3981,31 @@
   //
   // Only do this if we are not trying to support denormals. v_mad_f32 does
   // not support denormals ever.
-  if ((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
-      (VT == MVT::f16 && !Subtarget->hasFP16Denormals())) {
-    SDValue LHS = N->getOperand(0);
-    SDValue RHS = N->getOperand(1);
-    if (LHS.getOpcode() == ISD::FADD) {
-      // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
-
-      SDValue A = LHS.getOperand(0);
-      if (A == LHS.getOperand(1)) {
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  if (LHS.getOpcode() == ISD::FADD) {
+    // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
+    SDValue A = LHS.getOperand(0);
+    if (A == LHS.getOperand(1)) {
+      unsigned FusedOp = getFusedOpcode(DAG, VT);
+      if (FusedOp != 0) {
         const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
         SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
-        return DAG.getNode(ISD::FMAD, SL, VT, Two, A, NegRHS);
+        return DAG.getNode(FusedOp, SL, VT, Two, A, NegRHS);
       }
     }
+  }
 
-    if (RHS.getOpcode() == ISD::FADD) {
-      // (fsub c, (fadd a, a)) -> mad -2.0, a, c
+  if (RHS.getOpcode() == ISD::FADD) {
+    // (fsub c, (fadd a, a)) -> mad -2.0, a, c
 
-      SDValue A = RHS.getOperand(0);
-      if (A == RHS.getOperand(1)) {
+    SDValue A = RHS.getOperand(0);
+    if (A == RHS.getOperand(1)) {
+      unsigned FusedOp = getFusedOpcode(DAG, VT);
+      if (FusedOp != 0) {
        const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
-        return DAG.getNode(ISD::FMAD, SL, VT, NegTwo, A, LHS);
+        return DAG.getNode(FusedOp, SL, VT, NegTwo, A, LHS);
       }
     }
   }
Index: test/CodeGen/AMDGPU/fmuladd.f16.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/fmuladd.f16.ll
@@ -0,0 +1,463 @@
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-STRICT -check-prefix=VI-FLUSH -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-CONTRACT -check-prefix=VI-FLUSH -check-prefix=VI %s
+
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-STRICT -check-prefix=VI-DENORM-STRICT -check-prefix=VI-DENORM -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-CONTRACT -check-prefix=VI-DENORM-CONTRACT -check-prefix=VI-DENORM -check-prefix=VI %s
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare half @llvm.fmuladd.f16(half, half, half) #1
+declare half @llvm.fabs.f16(half) #1
+
+; GCN-LABEL: {{^}}fmuladd_f16:
+; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, 
v[0-9]+}} + +; VI-DENORM: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}} +define void @fmuladd_f16(half addrspace(1)* %out, half addrspace(1)* %in1, + half addrspace(1)* %in2, half addrspace(1)* %in3) #0 { + %r0 = load half, half addrspace(1)* %in1 + %r1 = load half, half addrspace(1)* %in2 + %r2 = load half, half addrspace(1)* %in3 + %r3 = tail call half @llvm.fmuladd.f16(half %r0, half %r1, half %r2) + store half %r3, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f16 +; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]], +; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]], +; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]] +; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] + +; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] +; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid + + %r1 = load volatile half, half addrspace(1)* %gep.0 + %r2 = load volatile half, half addrspace(1)* %gep.1 + + %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1, half %r2) + store half %r3, half addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f16 +; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]], +; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]], +; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]] +; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] + +; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] +; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid + + %r1 = load volatile half, half addrspace(1)* %gep.0 + %r2 = load volatile half, half addrspace(1)* %gep.1 + + %r3 = tail call half @llvm.fmuladd.f16(half %r1, half 2.0, half %r2) + store half %r3, half addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}fadd_a_a_b_f16: +; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]], +; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]], +; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]] +; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] + +; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] + +; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] +; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] + +; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @fadd_a_a_b_f16(half addrspace(1)* %out, + half addrspace(1)* %in1, + half addrspace(1)* %in2) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid + + %r0 = load volatile half, half addrspace(1)* %gep.0 + %r1 = load volatile half, half addrspace(1)* %gep.1 + + %add.0 = fadd half %r0, %r0 + %add.1 = fadd half %add.0, %r1 + store half %add.1, half addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}fadd_b_a_a_f16: +; GCN: 
{{buffer|flat}}_load_ushort [[R1:v[0-9]+]], +; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]], +; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]] +; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] + +; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] + +; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] +; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] + +; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @fadd_b_a_a_f16(half addrspace(1)* %out, + half addrspace(1)* %in1, + half addrspace(1)* %in2) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid + + %r0 = load volatile half, half addrspace(1)* %gep.0 + %r1 = load volatile half, half addrspace(1)* %gep.1 + + %add.0 = fadd half %r0, %r0 + %add.1 = fadd half %r1, %add.0 + store half %add.1, half addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f16 +; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]], +; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]], +; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]] +; VI-DENORM: v_fma_f16 [[R2:v[0-9]+]], [[R1]], -2.0, [[R2]] +; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] +define void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid + + %r1 = load volatile half, half addrspace(1)* %gep.0 + %r2 = load volatile half, half addrspace(1)* %gep.1 + + %r3 = tail call half @llvm.fmuladd.f16(half -2.0, half %r1, half %r2) + store half %r3, half addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f16 +; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]], +; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]], +; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]] +; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] + +; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], -[[R1]], -2.0, [[R2]] +; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid + + %r1 = load volatile half, half addrspace(1)* %gep.0 + %r2 = load volatile half, half addrspace(1)* %gep.1 + + %r1.fneg = fsub half -0.000000e+00, %r1 + + %r3 = tail call half @llvm.fmuladd.f16(half -2.0, half %r1.fneg, half %r2) + store half %r3, half addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f16 +; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]], +; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]], +; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]] +; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] + +; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], -[[R1]], 2.0, [[R2]] +; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.0 = getelementptr 
half, half addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid + + %r1 = load volatile half, half addrspace(1)* %gep.0 + %r2 = load volatile half, half addrspace(1)* %gep.1 + + %r1.fneg = fsub half -0.000000e+00, %r1 + + %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1.fneg, half %r2) + store half %r3, half addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}fmuladd_2.0_a_neg_b_f16 +; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]], +; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]], +; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], 2.0, [[R1]], -[[R2]] +; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] +; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid + + %r1 = load volatile half, half addrspace(1)* %gep.0 + %r2 = load volatile half, half addrspace(1)* %gep.1 + + %r2.fneg = fsub half -0.000000e+00, %r2 + + %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1, half %r2.fneg) + store half %r3, half addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}mad_sub_f16: +; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]] + +; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]] + +; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]] + +; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] +; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] + +; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { + %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext + %a = load volatile half, half addrspace(1)* %gep0, align 2 + %b = load volatile half, half addrspace(1)* %gep1, align 2 + %c = load volatile half, half addrspace(1)* %gep2, align 2 + %mul = fmul half %a, %b + %sub = fsub half %mul, %c + store half %sub, half addrspace(1)* %outgep, align 2 + ret void +} + +; GCN-LABEL: {{^}}mad_sub_inv_f16: +; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]] +; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]] + +; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]] + +; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] +; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] + +; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { + %tid = tail call 
i32 @llvm.amdgcn.workitem.id.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext + %a = load volatile half, half addrspace(1)* %gep0, align 2 + %b = load volatile half, half addrspace(1)* %gep1, align 2 + %c = load volatile half, half addrspace(1)* %gep2, align 2 + %mul = fmul half %a, %b + %sub = fsub half %c, %mul + store half %sub, half addrspace(1)* %outgep, align 2 + ret void +} + +; GCN-LABEL: {{^}}mad_sub_fabs_f16: +; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]] +; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]| + +; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]| + +; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] +; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]| + +; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { + %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext + %a = load volatile half, half addrspace(1)* %gep0, align 2 + %b = load volatile half, half addrspace(1)* %gep1, align 2 + %c = load volatile half, half addrspace(1)* %gep2, align 2 + %c.abs = call half @llvm.fabs.f16(half %c) #0 + %mul = fmul half %a, %b + %sub = fsub half %mul, %c.abs + store half %sub, half addrspace(1)* %outgep, align 2 + ret void +} + +; GCN-LABEL: {{^}}mad_sub_fabs_inv_f16: +; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]] + +; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]| + +; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]| + +; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] +; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]] + +; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { + %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext + %a = load volatile half, half addrspace(1)* %gep0, align 2 + %b = load volatile half, half addrspace(1)* %gep1, align 2 + %c = load volatile half, half addrspace(1)* %gep2, align 2 + %c.abs = call half @llvm.fabs.f16(half %c) #0 + %mul = fmul 
half %a, %b + %sub = fsub half %c.abs, %mul + store half %sub, half addrspace(1)* %outgep, align 2 + ret void +} + +; GCN-LABEL: {{^}}neg_neg_mad_f16: +; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]] + +; VI-FLUSH: v_mac_f16_e32 [[REGC]], [[REGB]], [[REGA]] +; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]] + +; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]] + +; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] +; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] +; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { + %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext + %a = load volatile half, half addrspace(1)* %gep0, align 2 + %b = load volatile half, half addrspace(1)* %gep1, align 2 + %c = load volatile half, half addrspace(1)* %gep2, align 2 + %nega = fsub half -0.000000e+00, %a + %negb = fsub half -0.000000e+00, %b + %mul = fmul half %nega, %negb + %sub = fadd half %mul, %c + store half %sub, half addrspace(1)* %outgep, align 2 + ret void +} + +; GCN-LABEL: {{^}}mad_fabs_sub_f16: +; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]] + +; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]] + +; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]] + +; VI-DENORM-STRICT: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]| +; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] + +; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { + %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext + %a = load volatile half, half addrspace(1)* %gep0, align 2 + %b = load volatile half, half addrspace(1)* %gep1, align 2 + %c = load volatile half, half addrspace(1)* %gep2, align 2 + %b.abs = call half @llvm.fabs.f16(half %b) #0 + %mul = fmul half %a, %b.abs + %sub = fsub half %mul, %c + store half %sub, half addrspace(1)* %outgep, align 2 + ret void +} + +; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f16: +; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]], +; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]], +; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]] +; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] + +; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]] + +; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] +; 
VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
+
+; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+define void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid
+
+  %r1 = load volatile half, half addrspace(1)* %gep.0
+  %r2 = load volatile half, half addrspace(1)* %gep.1
+
+  %add = fadd half %r1, %r1
+  %r3 = fsub half %r2, %add
+
+  store half %r3, half addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f16:
+; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
+
+; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], 2.0, [[R1]], -[[R2]]
+
+; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
+
+; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
+; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
+
+; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+define void @fsub_fadd_a_a_c_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid
+
+  %r1 = load volatile half, half addrspace(1)* %gep.0
+  %r2 = load volatile half, half addrspace(1)* %gep.1
+
+  %add = fadd half %r1, %r1
+  %r3 = fsub half %add, %r2
+
+  store half %r3, half addrspace(1)* %gep.out
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
Index: test/CodeGen/AMDGPU/fmuladd.f32.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/fmuladd.f32.ll
@@ -0,0 +1,562 @@
+; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH-STRICT -check-prefix=GCN-FLUSH -check-prefix=SI-FLUSH -check-prefix=GCN-FLUSH-FASTFMA -check-prefix=GCN-FLUSH-FASTFMA-STRICT -check-prefix=SI %s
+; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-DENORM-STRICT -check-prefix=GCN-DENORM -check-prefix=SI-DENORM -check-prefix=GCN-DENORM-FASTFMA -check-prefix=GCN-DENORM-FASTFMA-STRICT -check-prefix=SI %s
+; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH-STRICT -check-prefix=GCN-FLUSH -check-prefix=SI-FLUSH -check-prefix=GCN-FLUSH-SLOWFMA -check-prefix=GCN-FLUSH-SLOWFMA-STRICT -check-prefix=SI %s
+; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-DENORM-STRICT -check-prefix=GCN-DENORM -check-prefix=SI-DENORM -check-prefix=GCN-DENORM-SLOWFMA -check-prefix=GCN-DENORM-SLOWFMA-STRICT -check-prefix=SI %s
+
+; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH-CONTRACT -check-prefix=GCN-FLUSH -check-prefix=SI-FLUSH -check-prefix=GCN-FLUSH-FASTFMA -check-prefix=GCN-FLUSH-FASTFMA-CONTRACT -check-prefix=SI %s
+; RUN: llc -verify-machineinstrs -mcpu=tahiti 
-mattr=+fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-DENORM-CONTRACT -check-prefix=GCN-DENORM -check-prefix=SI-DENORM -check-prefix=GCN-DENORM-FASTFMA -check-prefix=GCN-DENORM-FASTFMA-CONTRACT -check-prefix=SI %s
+; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH-CONTRACT -check-prefix=GCN-FLUSH -check-prefix=SI-FLUSH -check-prefix=GCN-FLUSH-SLOWFMA -check-prefix=GCN-FLUSH-SLOWFMA-CONTRACT -check-prefix=SI %s
+; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-DENORM-CONTRACT -check-prefix=GCN-DENORM -check-prefix=SI-DENORM -check-prefix=GCN-DENORM-SLOWFMA -check-prefix=GCN-DENORM-SLOWFMA-CONTRACT -check-prefix=SI %s
+
+; Test all permutations of: fp32 denormals, fast fp contract, fp contract enabled for fmuladd, fmaf fast/slow.
+
+target triple = "amdgcn--"
+
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare float @llvm.fmuladd.f32(float, float, float) #1
+declare half @llvm.fmuladd.f16(half, half, half) #1
+declare float @llvm.fabs.f32(float) #1
+
+; GCN-LABEL: {{^}}fmuladd_f32:
+; GCN-FLUSH: v_mac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+
+; GCN-DENORM-FASTFMA: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+
+; GCN-DENORM-SLOWFMA: v_mul_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+; GCN-DENORM-SLOWFMA: v_add_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+define void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
+                         float addrspace(1)* %in2, float addrspace(1)* %in3) #0 {
+  %r0 = load float, float addrspace(1)* %in1
+  %r1 = load float, float addrspace(1)* %in2
+  %r2 = load float, float addrspace(1)* %in3
+  %r3 = tail call float @llvm.fmuladd.f32(float %r0, float %r1, float %r2)
+  store float %r3, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f32
+; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
+
+; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
+; SI-FLUSH: buffer_store_dword [[R2]]
+; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
+
+; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
+
+; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
+; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
+
+; SI-DENORM: buffer_store_dword [[RESULT]]
+; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+  %r1 = load volatile float, float addrspace(1)* %gep.0
+  %r2 = load volatile float, float addrspace(1)* %gep.1
+
+  %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2)
+  store float %r3, float addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f32
+; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
+
+; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
+; SI-FLUSH: buffer_store_dword [[R2]]
+; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
+
+; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
+
+; GCN-DENORM-SLOWFMA: v_add_f32_e32 
[[TMP:v[0-9]+]], [[R1]], [[R1]] +; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] + +; SI-DENORM: buffer_store_dword [[RESULT]] +; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r1 = load volatile float, float addrspace(1)* %gep.0 + %r2 = load volatile float, float addrspace(1)* %gep.1 + + %r3 = tail call float @llvm.fmuladd.f32(float %r1, float 2.0, float %r2) + store float %r3, float addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}fadd_a_a_b_f32: +; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]], + +; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]] +; SI-FLUSH: buffer_store_dword [[R2]] +; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] + +; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] + +; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] +; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] + +; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] +; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] + +; SI-DENORM: buffer_store_dword [[RESULT]] +; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @fadd_a_a_b_f32(float addrspace(1)* %out, + float addrspace(1)* %in1, + float addrspace(1)* %in2) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r0 = load volatile float, float addrspace(1)* %gep.0 + %r1 = load volatile float, float addrspace(1)* %gep.1 + + %add.0 = fadd float %r0, %r0 + %add.1 = fadd float %add.0, %r1 + store float %add.1, float addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}fadd_b_a_a_f32: +; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]], + +; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]] +; SI-FLUSH: buffer_store_dword [[R2]] +; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] + +; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] + +; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] +; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] + +; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] +; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] + +; SI-DENORM: buffer_store_dword [[RESULT]] +; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @fadd_b_a_a_f32(float addrspace(1)* %out, + float addrspace(1)* %in1, + float addrspace(1)* %in2) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r0 = load volatile float, float addrspace(1)* %gep.0 + %r1 = load volatile float, float addrspace(1)* %gep.1 + + %add.0 = fadd float %r0, %r0 + %add.1 = fadd float %r1, %add.0 + store 
float %add.1, float addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f32 +; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]], +; GCN-FLUSH: v_mac_f32_e32 [[R2]], -2.0, [[R1]] + +; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]] + +; GCN-DENORM-SLOWFMA: v_mul_f32_e32 [[TMP:v[0-9]+]], -2.0, [[R1]] +; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] + +; SI-DENORM: buffer_store_dword [[RESULT]] +; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r1 = load volatile float, float addrspace(1)* %gep.0 + %r2 = load volatile float, float addrspace(1)* %gep.1 + + %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1, float %r2) + store float %r3, float addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f32 +; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]], + +; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]] +; SI-FLUSH: buffer_store_dword [[R2]] +; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] + +; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], -[[R1]], -2.0, [[R2]] + +; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] +; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] + +; SI-DENORM: buffer_store_dword [[RESULT]] +; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r1 = load volatile float, float addrspace(1)* %gep.0 + %r2 = load volatile float, float addrspace(1)* %gep.1 + + %r1.fneg = fsub float -0.000000e+00, %r1 + + %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1.fneg, float %r2) + store float %r3, float addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f32: +; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]], + +; GCN-FLUSH: v_mac_f32_e32 [[R2]], -2.0, [[R1]] +; SI-FLUSH: buffer_store_dword [[R2]] +; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] + +; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], -[[R1]], 2.0, [[R2]] + +; GCN-DENORM-SLOWFMA: v_mul_f32_e32 [[TMP:v[0-9]+]], -2.0, [[R1]] +; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] + +; SI-DENORM: buffer_store_dword [[RESULT]] +; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r1 = load volatile float, float addrspace(1)* %gep.0 + %r2 = load volatile float, float addrspace(1)* %gep.1 + + 
%r1.fneg = fsub float -0.000000e+00, %r1 + + %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1.fneg, float %r2) + store float %r3, float addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}fmuladd_2.0_a_neg_b_f32: +; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]], +; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], -[[R2]] +; SI-FLUSH: buffer_store_dword [[RESULT]] +; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] + +; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] + +; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] +; GCN-DENORM-SLOWFMA: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] + +; SI-DENORM: buffer_store_dword [[RESULT]] +; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r1 = load volatile float, float addrspace(1)* %gep.0 + %r2 = load volatile float, float addrspace(1)* %gep.1 + + %r2.fneg = fsub float -0.000000e+00, %r2 + + %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2.fneg) + store float %r3, float addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}mad_sub_f32: +; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]] +; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]] + +; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]] + +; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] +; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] + +; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] +; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] + +; SI: buffer_store_dword [[RESULT]] +; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { + %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %gep0, align 4 + %b = load volatile float, float addrspace(1)* %gep1, align 4 + %c = load volatile float, float addrspace(1)* %gep2, align 4 + %mul = fmul float %a, %b + %sub = fsub float %mul, %c + store float %sub, float addrspace(1)* %outgep, align 4 + ret void +} + +; GCN-LABEL: {{^}}mad_sub_inv_f32: +; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]] + +; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]] + +; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]] + +; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 
[[TMP:v[0-9]+]], [[REGB]], [[REGA]] +; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] + +; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] +; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] + +; SI: buffer_store_dword [[RESULT]] +; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { + %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %gep0, align 4 + %b = load volatile float, float addrspace(1)* %gep1, align 4 + %c = load volatile float, float addrspace(1)* %gep2, align 4 + %mul = fmul float %a, %b + %sub = fsub float %c, %mul + store float %sub, float addrspace(1)* %outgep, align 4 + ret void +} + +; GCN-LABEL: {{^}}mad_sub_fabs_f32: +; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]] +; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]| + +; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]| + +; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] +; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]| + +; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] +; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]| + +; SI: buffer_store_dword [[RESULT]] +; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { + %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %gep0, align 4 + %b = load volatile float, float addrspace(1)* %gep1, align 4 + %c = load volatile float, float addrspace(1)* %gep2, align 4 + %c.abs = call float @llvm.fabs.f32(float %c) #0 + %mul = fmul float %a, %b + %sub = fsub float %mul, %c.abs + store float %sub, float addrspace(1)* %outgep, align 4 + ret void +} + +; GCN-LABEL: {{^}}mad_sub_fabs_inv_f32: +; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]] +; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]| + +; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]| + +; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] +; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]] + +; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], 
[[REGB]], [[REGA]] +; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]] + +; SI: buffer_store_dword [[RESULT]] +; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { + %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %gep0, align 4 + %b = load volatile float, float addrspace(1)* %gep1, align 4 + %c = load volatile float, float addrspace(1)* %gep2, align 4 + %c.abs = call float @llvm.fabs.f32(float %c) #0 + %mul = fmul float %a, %b + %sub = fsub float %c.abs, %mul + store float %sub, float addrspace(1)* %outgep, align 4 + ret void +} + +; GCN-LABEL: {{^}}neg_neg_mad_f32: +; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]] + +; GCN-FLUSH: v_mac_f32_e32 [[REGC]], [[REGB]], [[REGA]] +; SI-FLUSH: buffer_store_dword [[REGC]] +; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REGC]] + +; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]] + +; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] +; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] + +; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] +; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] + +; SI-DENORM: buffer_store_dword [[RESULT]] +; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { + %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %gep0, align 4 + %b = load volatile float, float addrspace(1)* %gep1, align 4 + %c = load volatile float, float addrspace(1)* %gep2, align 4 + %nega = fsub float -0.000000e+00, %a + %negb = fsub float -0.000000e+00, %b + %mul = fmul float %nega, %negb + %sub = fadd float %mul, %c + store float %sub, float addrspace(1)* %outgep, align 4 + ret void +} + +; GCN-LABEL: {{^}}mad_fabs_sub_f32: +; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]] +; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]] + +; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]] + +; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]| +; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] + +; GCN-DENORM-STRICT: v_mul_f32_e64 [[TMP:v[0-9]+]], 
[[REGA]], |[[REGB]]| +; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] + +; SI: buffer_store_dword [[RESULT]] +; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { + %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %gep0, align 4 + %b = load volatile float, float addrspace(1)* %gep1, align 4 + %c = load volatile float, float addrspace(1)* %gep2, align 4 + %b.abs = call float @llvm.fabs.f32(float %b) #0 + %mul = fmul float %a, %b.abs + %sub = fsub float %mul, %c + store float %sub, float addrspace(1)* %outgep, align 4 + ret void +} + +; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f32: +; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]], +; GCN-FLUSH: v_mac_f32_e32 [[R2]], -2.0, [[R1]] +; SI-FLUSH: buffer_store_dword [[R2]] +; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] + +; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]] + +; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] +; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] + +; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] +; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] + +; SI-DENORM: buffer_store_dword [[RESULT]] +; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @fsub_c_fadd_a_a_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r1 = load volatile float, float addrspace(1)* %gep.0 + %r2 = load volatile float, float addrspace(1)* %gep.1 + + %add = fadd float %r1, %r1 + %r3 = fsub float %r2, %add + + store float %r3, float addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f32: +; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]], +; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], -[[R2]] + +; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] + +; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] +; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] + +; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] +; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] + +; SI: buffer_store_dword [[RESULT]] +; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @fsub_fadd_a_a_c_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float 
addrspace(1)* %out, i32 %tid + + %r1 = load volatile float, float addrspace(1)* %gep.0 + %r2 = load volatile float, float addrspace(1)* %gep.1 + + %add = fadd float %r1, %r1 + %r3 = fsub float %add, %r2 + + store float %r3, float addrspace(1)* %gep.out + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/fmuladd.f64.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/fmuladd.f64.ll @@ -0,0 +1,119 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-STRICT -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=verde -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-STRICT -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tahiti -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-CONTRACT -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=verde -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-CONTRACT -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-STRICT -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=tonga -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-CONTRACT -check-prefix=VI %s + +; GCN-LABEL: {{^}}fmuladd_f64: +; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +define void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2, double addrspace(1)* %in3) #0 { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r2 = load double, double addrspace(1)* %in3 + %r3 = tail call double @llvm.fmuladd.f64(double %r0, double %r1, double %r2) + store double %r3, double addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}fmul_fadd_f64: +; GCN-CONTRACT: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} + +; GCN-STRICT: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; GCN-STRICT: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +define void @fmul_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2, double addrspace(1)* %in3) #0 { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r2 = load double, double addrspace(1)* %in3 + %tmp = fmul double %r0, %r1 + %r3 = fadd double %tmp, %r2 + store double %r3, double addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}fadd_a_a_b_f64: +; GCN: {{buffer|flat}}_load_dwordx2 [[R1:v\[[0-9]+:[0-9]+\]]], +; GCN: {{buffer|flat}}_load_dwordx2 [[R2:v\[[0-9]+:[0-9]+\]]], + +; GCN-STRICT: v_add_f64 [[TMP:v\[[0-9]+:[0-9]+\]]], [[R1]], [[R1]] +; GCN-STRICT: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[TMP]], [[R2]] + +; GCN-CONTRACT: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[R1]], 2.0, [[R2]] + +; SI: buffer_store_dwordx2 [[RESULT]] +; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @fadd_a_a_b_f64(double addrspace(1)* %out, + double addrspace(1)* %in1, + double addrspace(1)* %in2) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone + %gep.0 = getelementptr double, double addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* 
%gep.0, i32 1 + %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid + + %r0 = load volatile double, double addrspace(1)* %gep.0 + %r1 = load volatile double, double addrspace(1)* %gep.1 + + %add.0 = fadd double %r0, %r0 + %add.1 = fadd double %add.0, %r1 + store double %add.1, double addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}fadd_b_a_a_f64: +; GCN: {{buffer|flat}}_load_dwordx2 [[R1:v\[[0-9]+:[0-9]+\]]], +; GCN: {{buffer|flat}}_load_dwordx2 [[R2:v\[[0-9]+:[0-9]+\]]], + +; GCN-STRICT: v_add_f64 [[TMP:v\[[0-9]+:[0-9]+\]]], [[R1]], [[R1]] +; GCN-STRICT: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[R2]], [[TMP]] + +; GCN-CONTRACT: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[R1]], 2.0, [[R2]] + +; SI: buffer_store_dwordx2 [[RESULT]] +; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @fadd_b_a_a_f64(double addrspace(1)* %out, + double addrspace(1)* %in1, + double addrspace(1)* %in2) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone + %gep.0 = getelementptr double, double addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid + + %r0 = load volatile double, double addrspace(1)* %gep.0 + %r1 = load volatile double, double addrspace(1)* %gep.1 + + %add.0 = fadd double %r0, %r0 + %add.1 = fadd double %r1, %add.0 + store double %add.1, double addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}mad_sub_f64: +; GCN-STRICT: v_mul_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} +; GCN-STRICT: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+:[0-9]+\]}} + +; GCN-CONTRACT: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+:[0-9]+\]}} +define void @mad_sub_f64(double addrspace(1)* noalias nocapture %out, double addrspace(1)* noalias nocapture readonly %ptr) #1 { + %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr double, double addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr double, double addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr double, double addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr double, double addrspace(1)* %out, i64 %tid.ext + %a = load volatile double, double addrspace(1)* %gep0, align 8 + %b = load volatile double, double addrspace(1)* %gep1, align 8 + %c = load volatile double, double addrspace(1)* %gep2, align 8 + %mul = fmul double %a, %b + %sub = fsub double %mul, %c + store double %sub, double addrspace(1)* %outgep, align 8 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 +declare double @llvm.fmuladd.f64(double, double, double) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/fmuladd.ll =================================================================== --- test/CodeGen/AMDGPU/fmuladd.ll +++ /dev/null @@ -1,400 +0,0 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s - -declare double @llvm.fmuladd.f64(double, double, double) #1 -declare i32 @llvm.amdgcn.workitem.id.x() #1 -declare float @llvm.fabs.f32(float) #1 -declare float @llvm.fmuladd.f32(float, float, float) #1 -declare half @llvm.fabs.f16(half) #1 
-declare half @llvm.fmuladd.f16(half, half, half) #1
-
-; GCN-LABEL: {{^}}fmuladd_f64:
-; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-define void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
-                         double addrspace(1)* %in2, double addrspace(1)* %in3) #0 {
-  %r0 = load double, double addrspace(1)* %in1
-  %r1 = load double, double addrspace(1)* %in2
-  %r2 = load double, double addrspace(1)* %in3
-  %r3 = tail call double @llvm.fmuladd.f64(double %r0, double %r1, double %r2)
-  store double %r3, double addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: {{^}}fmuladd_f32:
-; GCN: v_mac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-define void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
-                         float addrspace(1)* %in2, float addrspace(1)* %in3) #0 {
-  %r0 = load float, float addrspace(1)* %in1
-  %r1 = load float, float addrspace(1)* %in2
-  %r2 = load float, float addrspace(1)* %in3
-  %r3 = tail call float @llvm.fmuladd.f32(float %r0, float %r1, float %r2)
-  store float %r3, float addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f32
-; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
-; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
-; GCN: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
-
-; SI: buffer_store_dword [[R2]]
-; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
-define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
-  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
-  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
-
-  %r1 = load volatile float, float addrspace(1)* %gep.0
-  %r2 = load volatile float, float addrspace(1)* %gep.1
-
-  %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2)
-  store float %r3, float addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f32
-; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
-; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
-; GCN: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
-
-; SI: buffer_store_dword [[R2]]
-; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
-define void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
-  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
-  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
-
-  %r1 = load volatile float, float addrspace(1)* %gep.0
-  %r2 = load volatile float, float addrspace(1)* %gep.1
-
-  %r3 = tail call float @llvm.fmuladd.f32(float %r1, float 2.0, float %r2)
-  store float %r3, float addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}fadd_a_a_b_f32:
-; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
-; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
-; GCN: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
-
-; SI: buffer_store_dword [[R2]]
-; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
-define void @fadd_a_a_b_f32(float addrspace(1)* %out,
-                            float addrspace(1)* %in1,
-                            float addrspace(1)* %in2) #0 {
-  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
-  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
-
-  %r0 = load volatile float, float addrspace(1)* %gep.0
-  %r1 = load volatile float, float addrspace(1)* %gep.1
-
-  %add.0 = fadd float %r0, %r0
-  %add.1 = fadd float %add.0, %r1
-  store float %add.1, float addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}fadd_b_a_a_f32:
-; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
-; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
-; GCN: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
-
-; SI: buffer_store_dword [[R2]]
-; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
-define void @fadd_b_a_a_f32(float addrspace(1)* %out,
-                            float addrspace(1)* %in1,
-                            float addrspace(1)* %in2) #0 {
-  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
-  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
-
-  %r0 = load volatile float, float addrspace(1)* %gep.0
-  %r1 = load volatile float, float addrspace(1)* %gep.1
-
-  %add.0 = fadd float %r0, %r0
-  %add.1 = fadd float %r1, %add.0
-  store float %add.1, float addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f32
-; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
-; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
-; GCN: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
-
-; SI: buffer_store_dword [[R2]]
-; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
-define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
-  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
-  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
-
-  %r1 = load volatile float, float addrspace(1)* %gep.0
-  %r2 = load volatile float, float addrspace(1)* %gep.1
-
-  %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1, float %r2)
-  store float %r3, float addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f32
-; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
-; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
-; GCN: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
-
-; SI: buffer_store_dword [[R2]]
-; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
-define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
-  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
-  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
-
-  %r1 = load volatile float, float addrspace(1)* %gep.0
-  %r2 = load volatile float, float addrspace(1)* %gep.1
-
-  %r1.fneg = fsub float -0.000000e+00, %r1
-
-  %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1.fneg, float %r2)
-  store float %r3, float addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f32
-; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
-; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
-; GCN: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
-
-; SI: buffer_store_dword [[R2]]
-; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
-define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
-  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
-  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
-
-  %r1 = load volatile float, float addrspace(1)* %gep.0
-  %r2 = load volatile float, float addrspace(1)* %gep.1
-
-  %r1.fneg = fsub float -0.000000e+00, %r1
-
-  %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1.fneg, float %r2)
-  store float %r3, float addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}fmuladd_2.0_a_neg_b_f32
-; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
-; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
-; GCN: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], -[[R2]]
-
-; SI: buffer_store_dword [[RESULT]]
-; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
-  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
-  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
-
-  %r1 = load volatile float, float addrspace(1)* %gep.0
-  %r2 = load volatile float, float addrspace(1)* %gep.1
-
-  %r2.fneg = fsub float -0.000000e+00, %r2
-
-  %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2.fneg)
-  store float %r3, float addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}fmuladd_f16:
-; VI: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-define void @fmuladd_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
-                         half addrspace(1)* %in2, half addrspace(1)* %in3) #0 {
-  %r0 = load half, half addrspace(1)* %in1
-  %r1 = load half, half addrspace(1)* %in2
-  %r2 = load half, half addrspace(1)* %in3
-  %r3 = tail call half @llvm.fmuladd.f16(half %r0, half %r1, half %r2)
-  store half %r3, half addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f16
-; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
-; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
-; VI: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
-
-; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
-define void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
-  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
-  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
-  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid
-
-  %r1 = load volatile half, half addrspace(1)* %gep.0
-  %r2 = load volatile half, half addrspace(1)* %gep.1
-
-  %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1, half %r2)
-  store half %r3, half addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f16
-; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
-; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
-; VI: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
-
-; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
-define void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
-  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
-  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
-  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid
-
-  %r1 = load volatile half, half addrspace(1)* %gep.0
-  %r2 = load volatile half, half addrspace(1)* %gep.1
-
-  %r3 = tail call half @llvm.fmuladd.f16(half %r1, half 2.0, half %r2)
-  store half %r3, half addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}fadd_a_a_b_f16:
-; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
-; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
-; VI: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
-
-; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
-define void @fadd_a_a_b_f16(half addrspace(1)* %out,
-                            half addrspace(1)* %in1,
-                            half addrspace(1)* %in2) #0 {
-  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
-  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
-  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid
-
-  %r0 = load volatile half, half addrspace(1)* %gep.0
-  %r1 = load volatile half, half addrspace(1)* %gep.1
-
-  %add.0 = fadd half %r0, %r0
-  %add.1 = fadd half %add.0, %r1
-  store half %add.1, half addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}fadd_b_a_a_f16:
-; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
-; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
-; VI: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
-
-; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
-define void @fadd_b_a_a_f16(half addrspace(1)* %out,
-                            half addrspace(1)* %in1,
-                            half addrspace(1)* %in2) #0 {
-  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
-  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
-  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid
-
-  %r0 = load volatile half, half addrspace(1)* %gep.0
-  %r1 = load volatile half, half addrspace(1)* %gep.1
-
-  %add.0 = fadd half %r0, %r0
-  %add.1 = fadd half %r1, %add.0
-  store half %add.1, half addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f16
-; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
-; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
-; VI: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
-
-; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
-define void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
-  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
-  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
-  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid
-
-  %r1 = load volatile half, half addrspace(1)* %gep.0
-  %r2 = load volatile half, half addrspace(1)* %gep.1
-
-  %r3 = tail call half @llvm.fmuladd.f16(half -2.0, half %r1, half %r2)
-  store half %r3, half addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f16
-; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
-; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
-; VI: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
-
-; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
-define void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
-  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
-  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
-  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid
-
-  %r1 = load volatile half, half addrspace(1)* %gep.0
-  %r2 = load volatile half, half addrspace(1)* %gep.1
-
-  %r1.fneg = fsub half -0.000000e+00, %r1
-
-  %r3 = tail call half @llvm.fmuladd.f16(half -2.0, half %r1.fneg, half %r2)
-  store half %r3, half addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f16
-; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
-; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
-; VI: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
-
-; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
-define void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
-  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
-  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
-  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid
-
-  %r1 = load volatile half, half addrspace(1)* %gep.0
-  %r2 = load volatile half, half addrspace(1)* %gep.1
-
-  %r1.fneg = fsub half -0.000000e+00, %r1
-
-  %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1.fneg, half %r2)
-  store half %r3, half addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}fmuladd_2.0_a_neg_b_f16
-; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
-; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
-; VI: v_mad_f16 [[RESULT:v[0-9]+]], 2.0, [[R1]], -[[R2]]
-
-; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
-  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
-  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
-  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid
-
-  %r1 = load volatile half, half addrspace(1)* %gep.0
-  %r2 = load volatile half, half addrspace(1)* %gep.1
-
-  %r2.fneg = fsub half -0.000000e+00, %r2
-
-  %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1, half %r2.fneg)
-  store half %r3, half addrspace(1)* %gep.out
-  ret void
-}
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
Index: test/CodeGen/AMDGPU/mad-sub.ll
===================================================================
--- test/CodeGen/AMDGPU/mad-sub.ll
+++ /dev/null
@@ -1,420 +0,0 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-
-declare i32 @llvm.amdgcn.workitem.id.x() #0
-declare float @llvm.fabs.f32(float) #0
-declare half @llvm.fabs.f16(half) #0
-
-; GCN-LABEL: {{^}}mad_sub_f32:
-; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]]
-; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]]
-; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]]
-; GCN: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
-
-; SI: buffer_store_dword [[RESULT]]
-; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
-  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
-  %tid.ext = sext i32 %tid to i64
-  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
-  %add1 = add i64 %tid.ext, 1
-  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
-  %add2 = add i64 %tid.ext, 2
-  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
-  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
-  %a = load volatile float, float addrspace(1)* %gep0, align 4
-  %b = load volatile float, float addrspace(1)* %gep1, align 4
-  %c = load volatile float, float addrspace(1)* %gep2, align 4
-  %mul = fmul float %a, %b
-  %sub = fsub float %mul, %c
-  store float %sub, float addrspace(1)* %outgep, align 4
-  ret void
-}
-
-; GCN-LABEL: {{^}}mad_sub_inv_f32:
-; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]]
-; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]]
-; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]]
-; GCN: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
-
-; SI: buffer_store_dword [[RESULT]]
-; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
-  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
-  %tid.ext = sext i32 %tid to i64
-  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
-  %add1 = add i64 %tid.ext, 1
-  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
-  %add2 = add i64 %tid.ext, 2
-  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
-  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
-  %a = load volatile float, float addrspace(1)* %gep0, align 4
-  %b = load volatile float, float addrspace(1)* %gep1, align 4
-  %c = load volatile float, float addrspace(1)* %gep2, align 4
-  %mul = fmul float %a, %b
-  %sub = fsub float %c, %mul
-  store float %sub, float addrspace(1)* %outgep, align 4
-  ret void
-}
-
-; GCN-LABEL: {{^}}mad_sub_f64:
-; GCN: v_mul_f64
-; GCN: v_add_f64
-define void @mad_sub_f64(double addrspace(1)* noalias nocapture %out, double addrspace(1)* noalias nocapture readonly %ptr) #1 {
-  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
-  %tid.ext = sext i32 %tid to i64
-  %gep0 = getelementptr double, double addrspace(1)* %ptr, i64 %tid.ext
-  %add1 = add i64 %tid.ext, 1
-  %gep1 = getelementptr double, double addrspace(1)* %ptr, i64 %add1
-  %add2 = add i64 %tid.ext, 2
-  %gep2 = getelementptr double, double addrspace(1)* %ptr, i64 %add2
-  %outgep = getelementptr double, double addrspace(1)* %out, i64 %tid.ext
-  %a = load volatile double, double addrspace(1)* %gep0, align 8
-  %b = load volatile double, double addrspace(1)* %gep1, align 8
-  %c = load volatile double, double addrspace(1)* %gep2, align 8
-  %mul = fmul double %a, %b
-  %sub = fsub double %mul, %c
-  store double %sub, double addrspace(1)* %outgep, align 8
-  ret void
-}
-
-; GCN-LABEL: {{^}}mad_sub_fabs_f32:
-; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]]
-; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]]
-; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]]
-; GCN: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
-; SI: buffer_store_dword [[RESULT]]
-; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
-  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
-  %tid.ext = sext i32 %tid to i64
-  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
-  %add1 = add i64 %tid.ext, 1
-  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
-  %add2 = add i64 %tid.ext, 2
-  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
-  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
-  %a = load volatile float, float addrspace(1)* %gep0, align 4
-  %b = load volatile float, float addrspace(1)* %gep1, align 4
-  %c = load volatile float, float addrspace(1)* %gep2, align 4
-  %c.abs = call float @llvm.fabs.f32(float %c) #0
-  %mul = fmul float %a, %b
-  %sub = fsub float %mul, %c.abs
-  store float %sub, float addrspace(1)* %outgep, align 4
-  ret void
-}
-
-; GCN-LABEL: {{^}}mad_sub_fabs_inv_f32:
-; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]]
-; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]]
-; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]]
-; GCN: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
-; SI: buffer_store_dword [[RESULT]]
-; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
-  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
-  %tid.ext = sext i32 %tid to i64
-  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
-  %add1 = add i64 %tid.ext, 1
-  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
-  %add2 = add i64 %tid.ext, 2
-  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
-  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
-  %a = load volatile float, float addrspace(1)* %gep0, align 4
-  %b = load volatile float, float addrspace(1)* %gep1, align 4
-  %c = load volatile float, float addrspace(1)* %gep2, align 4
-  %c.abs = call float @llvm.fabs.f32(float %c) #0
-  %mul = fmul float %a, %b
-  %sub = fsub float %c.abs, %mul
-  store float %sub, float addrspace(1)* %outgep, align 4
-  ret void
-}
-
-; GCN-LABEL: {{^}}neg_neg_mad_f32:
-; GCN: v_mac_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-define void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
-  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
-  %tid.ext = sext i32 %tid to i64
-  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
-  %add1 = add i64 %tid.ext, 1
-  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
-  %add2 = add i64 %tid.ext, 2
-  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
-  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
-  %a = load volatile float, float addrspace(1)* %gep0, align 4
-  %b = load volatile float, float addrspace(1)* %gep1, align 4
-  %c = load volatile float, float addrspace(1)* %gep2, align 4
-  %nega = fsub float -0.000000e+00, %a
-  %negb = fsub float -0.000000e+00, %b
-  %mul = fmul float %nega, %negb
-  %sub = fadd float %mul, %c
-  store float %sub, float addrspace(1)* %outgep, align 4
-  ret void
-}
-
-; GCN-LABEL: {{^}}mad_fabs_sub_f32:
-; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]]
-; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]]
-; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]]
-; GCN: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
-; SI: buffer_store_dword [[RESULT]]
-; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
-  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
-  %tid.ext = sext i32 %tid to i64
-  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
-  %add1 = add i64 %tid.ext, 1
-  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
-  %add2 = add i64 %tid.ext, 2
-  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
-  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
-  %a = load volatile float, float addrspace(1)* %gep0, align 4
-  %b = load volatile float, float addrspace(1)* %gep1, align 4
-  %c = load volatile float, float addrspace(1)* %gep2, align 4
-  %b.abs = call float @llvm.fabs.f32(float %b) #0
-  %mul = fmul float %a, %b.abs
-  %sub = fsub float %mul, %c
-  store float %sub, float addrspace(1)* %outgep, align 4
-  ret void
-}
-
-; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f32:
-; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
-; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
-; GCN: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
-
-; SI: buffer_store_dword [[R2]]
-; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
-define void @fsub_c_fadd_a_a_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
-  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
-  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
-
-  %r1 = load volatile float, float addrspace(1)* %gep.0
-  %r2 = load volatile float, float addrspace(1)* %gep.1
-
-  %add = fadd float %r1, %r1
-  %r3 = fsub float %r2, %add
-
-  store float %r3, float addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f32:
-; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
-; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
-; GCN: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], -[[R2]]
-
-; SI: buffer_store_dword [[RESULT]]
-; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fsub_fadd_a_a_c_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
-  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
-  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
-
-  %r1 = load volatile float, float addrspace(1)* %gep.0
-  %r2 = load volatile float, float addrspace(1)* %gep.1
-
-  %add = fadd float %r1, %r1
-  %r3 = fsub float %add, %r2
-
-  store float %r3, float addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}mad_sub_f16:
-; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
-; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
-; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]
-
-; VI: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
-; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
-  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
-  %tid.ext = sext i32 %tid to i64
-  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
-  %add1 = add i64 %tid.ext, 1
-  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
-  %add2 = add i64 %tid.ext, 2
-  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
-  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
-  %a = load volatile half, half addrspace(1)* %gep0, align 2
-  %b = load volatile half, half addrspace(1)* %gep1, align 2
-  %c = load volatile half, half addrspace(1)* %gep2, align 2
-  %mul = fmul half %a, %b
-  %sub = fsub half %mul, %c
-  store half %sub, half addrspace(1)* %outgep, align 2
-  ret void
-}
-
-; GCN-LABEL: {{^}}mad_sub_inv_f16:
-; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
-; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
-; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]
-; VI: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
-; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
-  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
-  %tid.ext = sext i32 %tid to i64
-  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
-  %add1 = add i64 %tid.ext, 1
-  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
-  %add2 = add i64 %tid.ext, 2
-  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
-  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
-  %a = load volatile half, half addrspace(1)* %gep0, align 2
-  %b = load volatile half, half addrspace(1)* %gep1, align 2
-  %c = load volatile half, half addrspace(1)* %gep2, align 2
-  %mul = fmul half %a, %b
-  %sub = fsub half %c, %mul
-  store half %sub, half addrspace(1)* %outgep, align 2
-  ret void
-}
-
-; GCN-LABEL: {{^}}mad_sub_fabs_f16:
-; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
-; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
-; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]
-; VI: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
-; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
-  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
-  %tid.ext = sext i32 %tid to i64
-  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
-  %add1 = add i64 %tid.ext, 1
-  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
-  %add2 = add i64 %tid.ext, 2
-  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
-  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
-  %a = load volatile half, half addrspace(1)* %gep0, align 2
-  %b = load volatile half, half addrspace(1)* %gep1, align 2
-  %c = load volatile half, half addrspace(1)* %gep2, align 2
-  %c.abs = call half @llvm.fabs.f16(half %c) #0
-  %mul = fmul half %a, %b
-  %sub = fsub half %mul, %c.abs
-  store half %sub, half addrspace(1)* %outgep, align 2
-  ret void
-}
-
-; GCN-LABEL: {{^}}mad_sub_fabs_inv_f16:
-; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
-; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
-; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]
-
-; VI: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
-; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
-  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
-  %tid.ext = sext i32 %tid to i64
-  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
-  %add1 = add i64 %tid.ext, 1
-  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
-  %add2 = add i64 %tid.ext, 2
-  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
-  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
-  %a = load volatile half, half addrspace(1)* %gep0, align 2
-  %b = load volatile half, half addrspace(1)* %gep1, align 2
-  %c = load volatile half, half addrspace(1)* %gep2, align 2
-  %c.abs = call half @llvm.fabs.f16(half %c) #0
-  %mul = fmul half %a, %b
-  %sub = fsub half %c.abs, %mul
-  store half %sub, half addrspace(1)* %outgep, align 2
-  ret void
-}
-
-; GCN-LABEL: {{^}}neg_neg_mad_f16:
-; VI: v_mac_f16_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-define void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
-  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
-  %tid.ext = sext i32 %tid to i64
-  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
-  %add1 = add i64 %tid.ext, 1
-  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
-  %add2 = add i64 %tid.ext, 2
-  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
-  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
-  %a = load volatile half, half addrspace(1)* %gep0, align 2
-  %b = load volatile half, half addrspace(1)* %gep1, align 2
-  %c = load volatile half, half addrspace(1)* %gep2, align 2
-  %nega = fsub half -0.000000e+00, %a
-  %negb = fsub half -0.000000e+00, %b
-  %mul = fmul half %nega, %negb
-  %sub = fadd half %mul, %c
-  store half %sub, half addrspace(1)* %outgep, align 2
-  ret void
-}
-
-; GCN-LABEL: {{^}}mad_fabs_sub_f16:
-; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
-; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
-; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]
-
-; VI: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
-; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
-  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
-  %tid.ext = sext i32 %tid to i64
-  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
-  %add1 = add i64 %tid.ext, 1
-  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
-  %add2 = add i64 %tid.ext, 2
-  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
-  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
-  %a = load volatile half, half addrspace(1)* %gep0, align 2
-  %b = load volatile half, half addrspace(1)* %gep1, align 2
-  %c = load volatile half, half addrspace(1)* %gep2, align 2
-  %b.abs = call half @llvm.fabs.f16(half %b) #0
-  %mul = fmul half %a, %b.abs
-  %sub = fsub half %mul, %c
-  store half %sub, half addrspace(1)* %outgep, align 2
-  ret void
-}
-
-; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f16:
-; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
-; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
-; VI: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
-
-; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
-define void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
-  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
-  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
-  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid
-
-  %r1 = load volatile half, half addrspace(1)* %gep.0
-  %r2 = load volatile half, half addrspace(1)* %gep.1
-
-  %add = fadd half %r1, %r1
-  %r3 = fsub half %r2, %add
-
-  store half %r3, half addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f16:
-; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
-; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
-
-; VI: v_mad_f16 [[RESULT:v[0-9]+]], 2.0, [[R1]], -[[R2]]
-; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fsub_fadd_a_a_c_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
-  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
-  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
-  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid
-
-  %r1 = load volatile half, half addrspace(1)* %gep.0
-  %r2 = load volatile half, half addrspace(1)* %gep.1
-
-  %add = fadd half %r1, %r1
-  %r3 = fsub half %add, %r2
-
-  store half %r3, half addrspace(1)* %gep.out
-  ret void
-}
-
-attributes #0 = { nounwind readnone }
-attributes #1 = { nounwind }