Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -516,7 +516,7 @@ // F16 - VOP3 Actions. setOperationAction(ISD::FMA, MVT::f16, Legal); - if (!Subtarget->hasFP16Denormals()) + if (!Subtarget->hasFP16Denormals() && STI.hasMadF16()) setOperationAction(ISD::FMAD, MVT::f16, Legal); for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) { @@ -8718,8 +8718,10 @@ // Only do this if we are not trying to support denormals. v_mad_f32 does not // support denormals ever. - if ((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) || - (VT == MVT::f16 && !Subtarget->hasFP16Denormals())) + if (((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) || + (VT == MVT::f16 && !Subtarget->hasFP16Denormals() && + getSubtarget()->hasMadF16())) && + isOperationLegal(ISD::FMAD, VT)) return ISD::FMAD; const TargetOptions &Options = DAG.getTarget().Options; Index: lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.cpp +++ lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2071,7 +2071,9 @@ } if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 || - Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) { + Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64 || + Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64 || + Opc == AMDGPU::V_FMA_F16 || Opc == AMDGPU::V_FMAC_F16_e64) { // Don't fold if we are using source or output modifiers. The new VOP2 // instructions don't have them. 
if (hasAnyModifiersSet(UseMI)) @@ -2086,7 +2088,10 @@ if (isInlineConstant(UseMI, *Src0, *ImmOp)) return false; - bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64; + bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 || + Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64; + bool IsFMA = Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64 || + Opc == AMDGPU::V_FMA_F16 || Opc == AMDGPU::V_FMAC_F16_e64; MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1); MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); @@ -2099,6 +2104,12 @@ if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg()))) return false; + unsigned NewOpc = + IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32 : AMDGPU::V_FMAMK_F16) + : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16); + if (pseudoToMCOpcode(NewOpc) == -1) + return false; + // We need to swap operands 0 and 1 since madmk constant is at operand 1. const int64_t Imm = ImmOp->getImm(); @@ -2119,14 +2130,16 @@ Src0->setIsKill(Src1->isKill()); if (Opc == AMDGPU::V_MAC_F32_e64 || - Opc == AMDGPU::V_MAC_F16_e64) + Opc == AMDGPU::V_MAC_F16_e64 || + Opc == AMDGPU::V_FMAC_F32_e64 || + Opc == AMDGPU::V_FMAC_F16_e64) UseMI.untieRegOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); Src1->ChangeToImmediate(Imm); removeModOperands(UseMI); - UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16)); + UseMI.setDesc(get(NewOpc)); bool DeleteDef = MRI->hasOneNonDBGUse(Reg); if (DeleteDef) @@ -2176,6 +2189,12 @@ // VGPR is okay as Src1 - fallthrough } + unsigned NewOpc = + IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32 : AMDGPU::V_FMAAK_F16) + : (IsF32 ? 
AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16); + if (pseudoToMCOpcode(NewOpc) == -1) + return false; + const int64_t Imm = ImmOp->getImm(); // FIXME: This would be a lot easier if we could return a new instruction @@ -2188,7 +2207,9 @@ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); if (Opc == AMDGPU::V_MAC_F32_e64 || - Opc == AMDGPU::V_MAC_F16_e64) + Opc == AMDGPU::V_MAC_F16_e64 || + Opc == AMDGPU::V_FMAC_F32_e64 || + Opc == AMDGPU::V_FMAC_F16_e64) UseMI.untieRegOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); @@ -2197,7 +2218,7 @@ // These come before src2. removeModOperands(UseMI); - UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16)); + UseMI.setDesc(get(NewOpc)); bool DeleteDef = MRI->hasOneNonDBGUse(Reg); if (DeleteDef) @@ -2310,18 +2331,21 @@ LiveVariables *LV) const { unsigned Opc = MI.getOpcode(); bool IsF16 = false; - bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64; + bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 || + Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64; switch (Opc) { default: return nullptr; case AMDGPU::V_MAC_F16_e64: + case AMDGPU::V_FMAC_F16_e64: IsF16 = true; LLVM_FALLTHROUGH; case AMDGPU::V_MAC_F32_e64: case AMDGPU::V_FMAC_F32_e64: break; case AMDGPU::V_MAC_F16_e32: + case AMDGPU::V_FMAC_F16_e32: IsF16 = true; LLVM_FALLTHROUGH; case AMDGPU::V_MAC_F32_e32: @@ -2350,32 +2374,38 @@ const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod); - if (!IsFMA && !Src0Mods && !Src1Mods && !Clamp && !Omod && + if (!Src0Mods && !Src1Mods && !Clamp && !Omod && // If we have an SGPR input, we will violate the constant bus restriction. (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() || !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) { if (auto Imm = getFoldableImm(Src2)) { - return BuildMI(*MBB, MI, MI.getDebugLoc(), - get(IsF16 ? 
AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32)) - .add(*Dst) - .add(*Src0) - .add(*Src1) - .addImm(Imm); + unsigned NewOpc = + IsFMA ? (IsF16 ? AMDGPU::V_FMAAK_F16 : AMDGPU::V_FMAAK_F32) + : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32); + if (pseudoToMCOpcode(NewOpc) != -1) + return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) + .add(*Dst) + .add(*Src0) + .add(*Src1) + .addImm(Imm); } + unsigned NewOpc = + IsFMA ? (IsF16 ? AMDGPU::V_FMAMK_F16 : AMDGPU::V_FMAMK_F32) + : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32); if (auto Imm = getFoldableImm(Src1)) { - return BuildMI(*MBB, MI, MI.getDebugLoc(), - get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32)) - .add(*Dst) - .add(*Src0) - .addImm(Imm) - .add(*Src2); + if (pseudoToMCOpcode(NewOpc) != -1) + return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) + .add(*Dst) + .add(*Src0) + .addImm(Imm) + .add(*Src2); } if (auto Imm = getFoldableImm(Src0)) { - if (isOperandLegal(MI, AMDGPU::getNamedOperandIdx(AMDGPU::V_MADMK_F32, + if (pseudoToMCOpcode(NewOpc) != -1 && + isOperandLegal(MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0), Src1)) - return BuildMI(*MBB, MI, MI.getDebugLoc(), - get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32)) + return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) .add(*Dst) .add(*Src1) .addImm(Imm) @@ -2383,9 +2413,11 @@ } } - assert((!IsFMA || !IsF16) && "fmac only expected with f32"); - unsigned NewOpc = IsFMA ? AMDGPU::V_FMA_F32 : - (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32); + unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16 : AMDGPU::V_FMA_F32) + : (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32); + if (pseudoToMCOpcode(NewOpc) == -1) + return nullptr; + return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) .add(*Dst) .addImm(Src0Mods ? 
Src0Mods->getImm() : 0) @@ -2678,6 +2710,7 @@ case AMDGPU::V_MAC_F32_e64: case AMDGPU::V_MAC_F16_e64: case AMDGPU::V_FMAC_F32_e64: + case AMDGPU::V_FMAC_F16_e64: if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) || hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers)) return false; @@ -3410,13 +3443,16 @@ MachineBasicBlock *MBB = MI.getParent(); MachineOperand &MO = MI.getOperand(OpIdx); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass; const TargetRegisterClass *RC = RI.getRegClass(RCID); - unsigned Opcode = AMDGPU::V_MOV_B32_e32; + unsigned Size = TRI->getRegSizeInBits(*RC); + unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32; if (MO.isReg()) Opcode = AMDGPU::COPY; else if (RI.isSGPRClass(RC)) - Opcode = AMDGPU::S_MOV_B32; + Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC)) @@ -5332,6 +5368,12 @@ } uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { + if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) { + return (16ULL << 44) | // IMG_FORMAT_32_FLOAT + (1ULL << 56) | // RESOURCE_LEVEL = 1 + (3ULL << 60); // OOB_SELECT = 3 + } + uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT; if (ST.isAmdHsaOS()) { // Set ATC = 1. GFX9 doesn't have this bit. @@ -5358,12 +5400,14 @@ Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT; } - // IndexStride = 64. - Rsrc23 |= UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT; + // IndexStride = 64 / 32. + uint64_t IndexStride = ST.getGeneration() <= AMDGPUSubtarget::GFX9 ? 3 : 2; + Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT; // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17]. // Clear them unless we want a huge stride. 
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && + ST.getGeneration() <= AMDGPUSubtarget::GFX9) Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT; return Rsrc23; Index: lib/Target/AMDGPU/SIInstructions.td =================================================================== --- lib/Target/AMDGPU/SIInstructions.td +++ lib/Target/AMDGPU/SIInstructions.td @@ -1462,7 +1462,7 @@ def : GCNPat< (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))), - (V_PK_MUL_F16 0, (i32 CONST.V2FP16_ONE), $src_mods, $src, DSTCLAMP.NONE) + (V_PK_MUL_F16 0, (i32 CONST.FP16_ONE), $src_mods, $src, DSTCLAMP.NONE) >; } @@ -1523,6 +1523,14 @@ >; } // End OtherPredicates = [HasDLInsts] +let SubtargetPredicate = isGFX10Plus in +def : GCNPat < + (fma (f16 (VOP3Mods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), + (f16 (VOP3Mods f32:$src1, i32:$src1_modifiers)), + (f16 (VOP3NoMods f32:$src2))), + (V_FMAC_F16_e64 $src0_modifiers, $src0, $src1_modifiers, $src1, + SRCMODS.NONE, $src2, $clamp, $omod) +>; // Allow integer inputs class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : GCNPat< Index: lib/Target/AMDGPU/SIPeepholeSDWA.cpp =================================================================== --- lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -418,7 +418,9 @@ } assert(Src && Src->isReg()); - if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa || + if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa || + MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa || + MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa || MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) && !isSameReg(*Src, *getReplacedOperand())) { // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to @@ -460,7 +462,9 @@ bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { // Replace vdst operand in MI with target operand. 
Set dst_sel and dst_unused - if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa || + if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa || + MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa || + MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa || MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) && getDstSel() != AMDGPU::SDWA::DWORD) { // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD @@ -964,10 +968,16 @@ return false; } - if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_MAC_F16_e32 || + if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 || + Opc == AMDGPU::V_FMAC_F32_e32 || + Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F32_e32)) return false; + // Check if target supports this SDWA opcode + if (TII->pseudoToMCOpcode(Opc) == -1) + return false; + // FIXME: has SDWA but require handling of implicit VCC use if (Opc == AMDGPU::V_CNDMASK_B32_e32) return false; @@ -1038,7 +1048,9 @@ SDWAInst.add(*Src1); } - if (SDWAOpcode == AMDGPU::V_MAC_F16_sdwa || + if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa || + SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa || + SDWAOpcode == AMDGPU::V_MAC_F16_sdwa || SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) { // v_mac_f16/32 has additional src2 operand tied to vdst MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); Index: test/CodeGen/AMDGPU/fdiv.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fdiv.f16.ll +++ test/CodeGen/AMDGPU/fdiv.f16.ll @@ -1,7 +1,8 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -mattr=+fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9 %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -mattr=-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN 
-check-prefix=GFX8_9 %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -mattr=+fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9_10 %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -mattr=-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9_10 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9_10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9_10 %s ; Make sure fdiv is promoted to f32. @@ -21,17 +22,17 @@ ; SI: v_div_fixup_f32 ; SI: v_cvt_f16_f32 -; GFX8_9: {{flat|global}}_load_ushort [[LHS:v[0-9]+]] -; GFX8_9: {{flat|global}}_load_ushort [[RHS:v[0-9]+]] +; GFX8_9_10: {{flat|global}}_load_ushort [[LHS:v[0-9]+]] +; GFX8_9_10: {{flat|global}}_load_ushort [[RHS:v[0-9]+]] -; GFX8_9-DAG: v_cvt_f32_f16_e32 [[CVT_LHS:v[0-9]+]], [[LHS]] -; GFX8_9-DAG: v_cvt_f32_f16_e32 [[CVT_RHS:v[0-9]+]], [[RHS]] +; GFX8_9_10-DAG: v_cvt_f32_f16_e32 [[CVT_LHS:v[0-9]+]], [[LHS]] +; GFX8_9_10-DAG: v_cvt_f32_f16_e32 [[CVT_RHS:v[0-9]+]], [[RHS]] -; GFX8_9-DAG: v_rcp_f32_e32 [[RCP_RHS:v[0-9]+]], [[CVT_RHS]] -; GFX8_9: v_mul_f32_e32 [[MUL:v[0-9]+]], [[CVT_LHS]], [[RCP_RHS]] -; GFX8_9: v_cvt_f16_f32_e32 [[CVT_BACK:v[0-9]+]], [[MUL]] -; GFX8_9: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK]], [[RHS]], [[LHS]] -; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX8_9_10-DAG: v_rcp_f32_e32 [[RCP_RHS:v[0-9]+]], [[CVT_RHS]] +; GFX8_9_10: v_mul_f32_e32 [[MUL:v[0-9]+]], [[CVT_LHS]], [[RCP_RHS]] +; GFX8_9_10: v_cvt_f16_f32_e32 [[CVT_BACK:v[0-9]+]], [[MUL]] +; GFX8_9_10: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK]], [[RHS]], [[LHS]] +; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @v_fdiv_f16( half addrspace(1)* %r, half 
addrspace(1)* %a, @@ -50,11 +51,11 @@ } ; GCN-LABEL: {{^}}v_rcp_f16: -; GFX8_9: {{flat|global}}_load_ushort [[VAL:v[0-9]+]] -; GFX8_9-NOT: [[VAL]] -; GFX8_9: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]] -; GFX8_9-NOT: [[RESULT]] -; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX8_9_10: {{flat|global}}_load_ushort [[VAL:v[0-9]+]] +; GFX8_9_10-NOT: [[VAL]] +; GFX8_9_10: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]] +; GFX8_9_10-NOT: [[RESULT]] +; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @v_rcp_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -68,11 +69,11 @@ } ; GCN-LABEL: {{^}}v_rcp_f16_abs: -; GFX8_9: {{flat|global}}_load_ushort [[VAL:v[0-9]+]] -; GFX8_9-NOT: [[VAL]] -; GFX8_9: v_rcp_f16_e64 [[RESULT:v[0-9]+]], |[[VAL]]| -; GFX8_9-NOT: [RESULT]] -; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX8_9_10: {{flat|global}}_load_ushort [[VAL:v[0-9]+]] +; GFX8_9_10-NOT: [[VAL]] +; GFX8_9_10: v_rcp_f16_e64 [[RESULT:v[0-9]+]], |[[VAL]]| +; GFX8_9_10-NOT: [RESULT]] +; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @v_rcp_f16_abs(half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -87,11 +88,11 @@ } ; GCN-LABEL: {{^}}v_rcp_f16_arcp: -; GFX8_9: {{flat|global}}_load_ushort [[VAL:v[0-9]+]] -; GFX8_9-NOT: [[VAL]] -; GFX8_9: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]] -; GFX8_9-NOT: [[RESULT]] -; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX8_9_10: {{flat|global}}_load_ushort [[VAL:v[0-9]+]] +; GFX8_9_10-NOT: [[VAL]] +; GFX8_9_10: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]] +; GFX8_9_10-NOT: [[RESULT]] +; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @v_rcp_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: 
%tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -105,11 +106,11 @@ } ; GCN-LABEL: {{^}}v_rcp_f16_neg: -; GFX8_9: {{flat|global}}_load_ushort [[VAL:v[0-9]+]] -; GFX8_9-NOT: [[VAL]] -; GFX8_9: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[VAL]] -; GFX8_9-NOT: [RESULT]] -; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX8_9_10: {{flat|global}}_load_ushort [[VAL:v[0-9]+]] +; GFX8_9_10-NOT: [[VAL]] +; GFX8_9_10: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[VAL]] +; GFX8_9_10-NOT: [RESULT]] +; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @v_rcp_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -123,11 +124,11 @@ } ; GCN-LABEL: {{^}}v_rsq_f16: -; GFX8_9: {{flat|global}}_load_ushort [[VAL:v[0-9]+]] -; GFX8_9-NOT: [[VAL]] -; GFX8_9: v_rsq_f16_e32 [[RESULT:v[0-9]+]], [[VAL]] -; GFX8_9-NOT: [RESULT]] -; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX8_9_10: {{flat|global}}_load_ushort [[VAL:v[0-9]+]] +; GFX8_9_10-NOT: [[VAL]] +; GFX8_9_10: v_rsq_f16_e32 [[RESULT:v[0-9]+]], [[VAL]] +; GFX8_9_10-NOT: [RESULT]] +; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @v_rsq_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -142,12 +143,12 @@ } ; GCN-LABEL: {{^}}v_rsq_f16_neg: -; GFX8_9: {{flat|global}}_load_ushort [[VAL:v[0-9]+]] -; GFX8_9-NOT: [[VAL]] -; GFX8_9: v_sqrt_f16_e32 [[SQRT:v[0-9]+]], [[VAL]] -; GFX8_9-NEXT: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[SQRT]] -; GFX8_9-NOT: [RESULT]] -; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX8_9_10: {{flat|global}}_load_ushort [[VAL:v[0-9]+]] +; GFX8_9_10-NOT: [[VAL]] +; GFX8_9_10: v_sqrt_f16_e32 [[SQRT:v[0-9]+]], [[VAL]] +; GFX8_9_10-NEXT: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[SQRT]] +; GFX8_9_10-NOT: [RESULT]] +; GFX8_9_10: 
{{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @v_rsq_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -162,13 +163,13 @@ } ; GCN-LABEL: {{^}}v_fdiv_f16_arcp: -; GFX8_9: {{flat|global}}_load_ushort [[LHS:v[0-9]+]] -; GFX8_9: {{flat|global}}_load_ushort [[RHS:v[0-9]+]] +; GFX8_9_10: {{flat|global}}_load_ushort [[LHS:v[0-9]+]] +; GFX8_9_10: {{flat|global}}_load_ushort [[RHS:v[0-9]+]] -; GFX8_9: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[RHS]] -; GFX8_9: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]] +; GFX8_9_10: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[RHS]] +; GFX8_9_10: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]] -; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @v_fdiv_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -184,13 +185,13 @@ } ; GCN-LABEL: {{^}}v_fdiv_f16_unsafe: -; GFX8_9: {{flat|global}}_load_ushort [[LHS:v[0-9]+]] -; GFX8_9: {{flat|global}}_load_ushort [[RHS:v[0-9]+]] +; GFX8_9_10: {{flat|global}}_load_ushort [[LHS:v[0-9]+]] +; GFX8_9_10: {{flat|global}}_load_ushort [[RHS:v[0-9]+]] -; GFX8_9: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[RHS]] -; GFX8_9: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]] +; GFX8_9_10: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[RHS]] +; GFX8_9_10: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]] -; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @v_fdiv_f16_unsafe(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #2 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -208,8 +209,8 @@ ; FUNC-LABEL: {{^}}div_arcp_2_x_pat_f16: ; SI: v_mul_f32_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}} -; 
GFX8_9: v_mul_f16_e32 [[MUL:v[0-9]+]], 0.5, v{{[0-9]+}} -; GFX8_9: buffer_store_short [[MUL]] +; GFX8_9_10: v_mul_f16_e32 [[MUL:v[0-9]+]], 0.5, v{{[0-9]+}} +; GFX8_9_10: buffer_store_short [[MUL]] define amdgpu_kernel void @div_arcp_2_x_pat_f16(half addrspace(1)* %out) #0 { %x = load half, half addrspace(1)* undef %rcp = fdiv arcp half %x, 2.0 @@ -220,8 +221,8 @@ ; FUNC-LABEL: {{^}}div_arcp_k_x_pat_f16: ; SI: v_mul_f32_e32 v{{[0-9]+}}, 0x3dccc000, v{{[0-9]+}} -; GFX8_9: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x2e66, v{{[0-9]+}} -; GFX8_9: buffer_store_short [[MUL]] +; GFX8_9_10: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x2e66, v{{[0-9]+}} +; GFX8_9_10: buffer_store_short [[MUL]] define amdgpu_kernel void @div_arcp_k_x_pat_f16(half addrspace(1)* %out) #0 { %x = load half, half addrspace(1)* undef %rcp = fdiv arcp half %x, 10.0 @@ -232,8 +233,8 @@ ; FUNC-LABEL: {{^}}div_arcp_neg_k_x_pat_f16: ; SI: v_mul_f32_e32 v{{[0-9]+}}, 0xbdccc000, v{{[0-9]+}} -; GFX8_9: v_mul_f16_e32 [[MUL:v[0-9]+]], 0xae66, v{{[0-9]+}} -; GFX8_9: buffer_store_short [[MUL]] +; GFX8_9_10: v_mul_f16_e32 [[MUL:v[0-9]+]], 0xae66, v{{[0-9]+}} +; GFX8_9_10: buffer_store_short [[MUL]] define amdgpu_kernel void @div_arcp_neg_k_x_pat_f16(half addrspace(1)* %out) #0 { %x = load half, half addrspace(1)* undef %rcp = fdiv arcp half %x, -10.0 Index: test/CodeGen/AMDGPU/fmac.sdwa.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/fmac.sdwa.ll @@ -0,0 +1,76 @@ +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1010 %s + +; GCN-LABEL: {{^}}addMul2D: +; GFX1010: v_fmac_f16 +; GFX1010: v_fmac_f16 +define hidden <4 x half> @addMul2D(<4 x i8>* nocapture readonly, float addrspace(4)* nocapture readonly, <2 x i32>, i32) local_unnamed_addr #0 { + %5 = extractelement <2 x i32> %2, i64 1 + %6 = icmp sgt i32 %5, 0 + br i1 %6, label %7, label %38 + +7: ; preds = %4 + %8 = extractelement <2 x i32> %2, i64 0 + %9 = icmp sgt i32 %8, 0 
+ br label %10 + +10: ; preds = %34, %7 + %11 = phi <4 x half> [ zeroinitializer, %7 ], [ %35, %34 ] + %12 = phi i32 [ 0, %7 ], [ %36, %34 ] + br i1 %9, label %13, label %34 + +13: ; preds = %10 + %14 = mul nsw i32 %12, %3 + %15 = mul nsw i32 %12, %8 + br label %16 + +16: ; preds = %16, %13 + %17 = phi <4 x half> [ %11, %13 ], [ %31, %16 ] + %18 = phi i32 [ 0, %13 ], [ %32, %16 ] + %19 = add nsw i32 %18, %14 + %20 = sext i32 %19 to i64 + %21 = getelementptr inbounds <4 x i8>, <4 x i8>* %0, i64 %20 + %22 = load <4 x i8>, <4 x i8>* %21, align 4 + %23 = tail call <4 x half> @_Z13convert_half4Dv4_h(<4 x i8> %22) #8 + %24 = add nsw i32 %18, %15 + %25 = sext i32 %24 to i64 + %26 = getelementptr inbounds float, float addrspace(4)* %1, i64 %25 + %27 = load float, float addrspace(4)* %26, align 4 + %28 = fptrunc float %27 to half + %29 = insertelement <4 x half> undef, half %28, i32 0 + %30 = shufflevector <4 x half> %29, <4 x half> undef, <4 x i32> zeroinitializer + %31 = tail call <4 x half> @llvm.fmuladd.v4f16(<4 x half> %23, <4 x half> %30, <4 x half> %17) + %32 = add nuw nsw i32 %18, 1 + %33 = icmp eq i32 %32, %8 + br i1 %33, label %34, label %16 + +34: ; preds = %16, %10 + %35 = phi <4 x half> [ %11, %10 ], [ %31, %16 ] + %36 = add nuw nsw i32 %12, 1 + %37 = icmp eq i32 %36, %5 + br i1 %37, label %38, label %10 + +38: ; preds = %34, %4 + %39 = phi <4 x half> [ zeroinitializer, %4 ], [ %35, %34 ] + ret <4 x half> %39 +} + +define linkonce_odr hidden <4 x half> @_Z13convert_half4Dv4_h(<4 x i8>) local_unnamed_addr #1 { + %2 = extractelement <4 x i8> %0, i64 0 + %3 = uitofp i8 %2 to half + %4 = insertelement <4 x half> undef, half %3, i32 0 + %5 = extractelement <4 x i8> %0, i64 1 + %6 = uitofp i8 %5 to half + %7 = insertelement <4 x half> %4, half %6, i32 1 + %8 = extractelement <4 x i8> %0, i64 2 + %9 = uitofp i8 %8 to half + %10 = insertelement <4 x half> %7, half %9, i32 2 + %11 = extractelement <4 x i8> %0, i64 3 + %12 = uitofp i8 %11 to half + %13 = insertelement <4 
x half> %10, half %12, i32 3 + ret <4 x half> %13 +} + +declare <4 x half> @llvm.fmuladd.v4f16(<4 x half>, <4 x half>, <4 x half>) + +attributes #0 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="64" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" "target-features"="+16-bit-insts,+dl-insts,+dpp,+fp32-denormals,+fp64-fp16-denormals,+gfx10-insts,+gfx9-insts,+s-memrealtime,-code-object-v3,-sram-ecc,-xnack" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="64" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+fp64-fp16-denormals,-fp32-denormals" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/CodeGen/AMDGPU/fmuladd.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fmuladd.f16.ll +++ test/CodeGen/AMDGPU/fmuladd.f16.ll @@ -1,8 +1,13 @@ ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,VI-FLUSH,VI %s ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,VI-FLUSH,VI %s -; RUN: llc -march=amdgcn -mcpu=fiji 
-mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,VI-DENORM-STRICT,VI-DENORM,VI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,VI-DENORM-CONTRACT,VI-DENORM,VI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,GCN-DENORM,GCN-DENORM-STRICT,VI-DENORM-STRICT,VI-DENORM,VI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,GCN-DENORM,GCN-DENORM-CONTRACT,VI-DENORM-CONTRACT,VI-DENORM,VI %s + +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,GFX10-FLUSH,GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,GFX10-FLUSH,GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,GCN-DENORM,GCN-DENORM-STRICT,GFX10-DENORM-STRICT,GFX10-DENORM,GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,GCN-DENORM,GCN-DENORM-CONTRACT,GFX10-DENORM-CONTRACT,GFX10-DENORM,GFX10 %s declare i32 @llvm.amdgcn.workitem.id.x() #1 declare half @llvm.fmuladd.f16(half, half, half) #1 @@ -12,6 +17,11 @@ ; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} ; VI-DENORM: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}} + +; GFX10-FLUSH: v_mul_f16_e32 +; 
GFX10-FLUSH: v_add_f16_e32 +; GFX10-DENORM: v_fmac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} + define amdgpu_kernel void @fmuladd_f16(half addrspace(1)* %out, half addrspace(1)* %in1, half addrspace(1)* %in2, half addrspace(1)* %in3) #0 { %r0 = load half, half addrspace(1)* %in1 @@ -23,13 +33,21 @@ } ; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f16 -; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]], -; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]], ; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]] ; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] -; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] -; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] +; GFX10-DENORM: v_fmac_f16_e32 [[R2:v[0-9]+]], 2.0, [[R1]] + +; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]] +; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]] + +; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] +; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] + define amdgpu_kernel void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid @@ -45,13 +63,21 @@ } ; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f16 -; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]], -; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]], ; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]] ; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] -; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] +; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] +; GFX10-DENORM: 
v_fmac_f16_e32 [[R2]], 2.0, [[R1]] + +; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]] +; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]] + ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] +; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] + define amdgpu_kernel void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid @@ -67,17 +93,25 @@ } ; GCN-LABEL: {{^}}fadd_a_a_b_f16: -; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]], -; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]], ; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]] ; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] -; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] +; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] +; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[R2]], 2.0, [[R1]] -; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] +; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] +; GCN-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] + +; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]] +; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]] +; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX10-DENORM-STRICT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX10-DENORM-CONTRACT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] + define amdgpu_kernel void @fadd_a_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in1, half addrspace(1)* %in2) #0 { @@ 
-96,17 +130,25 @@ } ; GCN-LABEL: {{^}}fadd_b_a_a_f16: -; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]], -; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]], ; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]] ; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] -; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] +; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] +; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[R2]], 2.0, [[R1]] -; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] +; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] +; GCN-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] + +; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]] +; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]] +; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX10-DENORM-STRICT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX10-DENORM-CONTRACT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] + define amdgpu_kernel void @fadd_b_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in1, half addrspace(1)* %in2) #0 { @@ -125,11 +167,17 @@ } ; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f16 -; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]], -; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]], -; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]] -; VI-DENORM: v_fma_f16 [[R2:v[0-9]+]], [[R1]], -2.0, [[R2]] -; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] +; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]], +; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]] +; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]] +; GFX10-DENORM: 
v_fmac_f16_e32 [[R2]], -2.0, [[R1]] +; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] +; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]] +; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]] +; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid @@ -145,13 +193,20 @@ } ; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f16 -; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]], -; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]], ; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]] ; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] -; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] -; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] +; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] + +; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]] +; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]] +; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] + +; GFX10-DENORM: v_fmac_f16_e32 [[R2]], 2.0, [[R1]] +; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid @@ -169,13 +224,20 @@ } ; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f16 -; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]], -; GCN: {{buffer|flat}}_load_ushort 
[[R2:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]], ; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]] ; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]] -; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] + +; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]] +; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]] +; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] + +; GFX10-DENORM: v_fmac_f16_e32 [[R2]], -2.0, [[R1]] +; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid @@ -193,11 +255,14 @@ } ; GCN-LABEL: {{^}}fmuladd_2.0_a_neg_b_f16 -; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]], -; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]], -; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] -; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] +; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]], +; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] +; GCN-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]] +; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]] +; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid @@ -215,18 
+280,22 @@ } ; GCN-LABEL: {{^}}mad_sub_f16: -; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]] -; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]] -; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]] ; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]] -; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]] +; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]] -; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] -; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] +; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] +; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] + +; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] +; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] +; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 @@ -246,17 +315,23 @@ } ; GCN-LABEL: {{^}}mad_sub_inv_f16: -; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]] -; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]] -; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]] ; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]] -; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]] +; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], 
-[[REGA]], [[REGB]], [[REGC]] +; GFX10-DENORM-CONTRACT: v_fmac_f16_e64 [[REGC]], -[[REGA]], [[REGB]] -; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] -; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] +; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] +; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] + +; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] +; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] +; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]] define amdgpu_kernel void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 @@ -276,17 +351,21 @@ } ; GCN-LABEL: {{^}}mad_sub_fabs_f16: -; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]] -; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]] -; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]] ; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]| -; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]| +; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]| -; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] -; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]| +; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] +; GCN-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]| ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] + +; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], 
[[REGA]], [[REGB]] +; GFX10-FLUSH: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]| +; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 @@ -307,18 +386,22 @@ } ; GCN-LABEL: {{^}}mad_sub_fabs_inv_f16: -; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]] -; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]] -; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]] ; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]| -; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]| +; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]| -; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] -; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]] +; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] +; GCN-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] + +; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] +; GFX10-FLUSH: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]] +; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 @@ -339,18 +422,24 @@ } ; GCN-LABEL: {{^}}neg_neg_mad_f16: -; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]] -; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]] -; GCN: 
{{buffer|flat}}_load_ushort [[REGC:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]] ; VI-FLUSH: v_mac_f16_e32 [[REGC]], [[REGA]], [[REGB]] ; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]] -; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]] +; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]] +; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[REGC]], [[REGA]], [[REGB]] -; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] -; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] +; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] +; GCN-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] + +; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] +; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] +; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]] define amdgpu_kernel void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 @@ -372,18 +461,22 @@ } ; GCN-LABEL: {{^}}mad_fabs_sub_f16: -; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]] -; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]] -; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]] ; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]] -; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]] +; 
GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]] -; VI-DENORM-STRICT: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]| -; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] +; GCN-DENORM-STRICT: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]| +; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] + +; GFX10-FLUSH: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]| +; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] +; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 @@ -404,17 +497,24 @@ } ; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f16: -; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]], -; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]], ; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]] ; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] -; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]] +; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]] +; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[R2]], -2.0, [[R1]] -; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] +; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] +; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] + +; GFX10-FLUSH: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] +; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] +; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, 
[[RESULT]] +; GFX10-DENORM-STRICT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX10-DENORM-CONTRACT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] define amdgpu_kernel void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in) { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid @@ -432,17 +532,21 @@ } ; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f16: -; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]], -; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]], ; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] -; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] +; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] -; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] +; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] +; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] + +; GFX10-FLUSH: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] +; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] +; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @fsub_fadd_a_a_c_f16(half addrspace(1)* %out, half addrspace(1)* %in) { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid Index: test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll +++ test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll @@ -2,6 +2,8 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope 
-check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -mattr=+fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI -check-prefix=SI-DENORM %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX10 -check-prefix=GFX10-FLUSH %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX10 -check-prefix=GFX10-DENORM %s declare half @llvm.fmuladd.f16(half %a, half %b, half %c) declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) @@ -23,6 +25,13 @@ ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]] ; VI-DENORM: buffer_store_short [[RESULT]] +; GFX10-FLUSH: v_mul_f16_e32 [[MUL:v[0-9]+]], v[[A_F16]], v[[B_F16]] +; GFX10-FLUSH: v_add_f16_e32 [[ADD:v[0-9]+]], [[MUL]], v[[C_F16]] +; GFX10-FLUSH: buffer_store_short [[ADD]] + +; GFX10-DENORM: v_fmac_f16_e32 v[[C_F16]], v[[A_F16]], v[[B_F16]] +; GFX10-DENORM: buffer_store_short v[[C_F16]], + ; GCN: s_endpgm define amdgpu_kernel void @fmuladd_f16( half addrspace(1)* %r, @@ -53,6 +62,13 @@ ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[B_F16]], [[KA]], v[[C_F16]] ; VI-DENORM: buffer_store_short [[RESULT]] +; GFX10-FLUSH: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x4200, v[[B_F16]] +; GFX10-FLUSH: v_add_f16_e32 [[ADD:v[0-9]+]], [[MUL]], v[[C_F16]] +; GFX10-FLUSH: buffer_store_short [[ADD]] + +; 
GFX10-DENORM: v_fmac_f16_e32 v[[C_F16]], 0x4200, v[[B_F16]] +; GFX10-DENORM: buffer_store_short v[[C_F16]], + ; GCN: s_endpgm define amdgpu_kernel void @fmuladd_f16_imm_a( half addrspace(1)* %r, @@ -81,6 +97,12 @@ ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[A_F16]], [[KA]], v[[C_F16]] ; VI-DENORM buffer_store_short [[RESULT]] +; GFX10-FLUSH: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x4200, v[[A_F16]] +; GFX10-FLUSH: v_add_f16_e32 [[ADD:v[0-9]+]], [[MUL]], v[[C_F16]] +; GFX10-FLUSH: buffer_store_short [[ADD]] + +; GFX10-DENORM: v_fmac_f16_e32 v[[C_F16]], 0x4200, v[[A_F16]] +; GFX10-DENORM buffer_store_short v[[C_F16]], ; GCN: s_endpgm define amdgpu_kernel void @fmuladd_f16_imm_b( @@ -107,6 +129,9 @@ ; VI-DENORM: buffer_load_dword v[[B_V2_F16:[0-9]+]] ; VI-DENORM: buffer_load_dword v[[C_V2_F16:[0-9]+]] +; GFX10: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GFX10: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; GFX10: buffer_load_dword v[[C_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] @@ -116,7 +141,6 @@ ; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] ; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]] - ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] ; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]] @@ -127,7 +151,6 @@ ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]] - ; VI-FLUSH: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] ; VI-FLUSH-DAG: v_mac_f16_sdwa v[[C_F16_1]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-FLUSH-DAG: v_mac_f16_e32 v[[C_V2_F16]], v[[A_V2_F16]], v[[B_V2_F16]] @@ -144,6 +167,11 @@ ; VI-DENORM-NOT: v_and_b32 ; VI-DENORM: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[RES0]], v[[R_F16_HI]] +; GFX10-FLUSH: 
v_pk_mul_f16 [[MUL:v[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] +; GFX10-FLUSH: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], [[MUL]], v[[C_V2_F16]] + +; GFX10-DENORM: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]] + ; GCN: buffer_store_dword v[[R_V2_F16]] define amdgpu_kernel void @fmuladd_v2f16( <2 x half> addrspace(1)* %r, Index: test/CodeGen/AMDGPU/madak.ll =================================================================== --- test/CodeGen/AMDGPU/madak.ll +++ test/CodeGen/AMDGPU/madak.ll @@ -1,6 +1,8 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6 %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8_9 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefixes=GCN,GFX9,GFX8_9 %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6_8_9,MAD %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX6_8_9,GFX8_9,GFX8_9_10,MAD %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefixes=GCN,GFX9,GFX6_8_9,GFX8_9,GFX8_9_10,MAD %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_9_10,MAD,GFX10-MAD %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -fp-contract=fast -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_9_10,FMA %s declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone declare float @llvm.fabs.f32(float) nounwind readnone @@ -12,7 +14,10 @@ ; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]] ; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]] ; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]] -; GCN: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 +; GFX10: 
{{flat|global}}_load_dword [[VA:v[0-9]+]] +; GFX10: {{flat|global}}_load_dword [[VB:v[0-9]+]] +; MAD: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 +; FMA: v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid @@ -33,17 +38,20 @@ ; it. ; GCN-LABEL: {{^}}madak_2_use_f32: -; GFX8_9: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 -; GFX6-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GFX6-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GFX6-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; GFX8_9: {{flat|global}}_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}} -; GFX8_9: {{flat|global}}_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}} -; GFX8_9: {{flat|global}}_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}} -; GFX6-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 -; GCN-DAG: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 -; GCN-DAG: v_mac_f32_e32 [[VK]], [[VA]], [[VC]] -; GCN: s_endpgm +; GFX8_9_10: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 +; GFX6-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GFX6-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; GFX6-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 +; GFX8_9_10: {{flat|global}}_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}} +; GFX8_9_10: {{flat|global}}_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}} +; GFX8_9_10: {{flat|global}}_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}} +; 
GFX6-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 +; GFX6_8_9-DAG: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 +; GFX10-MAD-DAG:v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 +; FMA-DAG: v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 +; MAD-DAG: v_mac_f32_e32 [[VK]], [[VA]], [[VC]] +; FMA-DAG: v_fmac_f32_e32 [[VK]], [[VA]], [[VC]] +; GCN: s_endpgm define amdgpu_kernel void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -70,7 +78,8 @@ ; GCN-LABEL: {{^}}madak_m_inline_imm_f32: ; GCN: {{buffer|flat|global}}_load_dword [[VA:v[0-9]+]] -; GCN: v_madak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000 +; MAD: v_madak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000 +; FMA: v_fmaak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000 define amdgpu_kernel void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid @@ -94,7 +103,10 @@ ; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]] ; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]] ; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]] -; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0 +; GFX10: {{flat|global}}_load_dword [[VA:v[0-9]+]] +; GFX10: {{flat|global}}_load_dword [[VB:v[0-9]+]] +; MAD: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0 +; FMA: v_fma_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0 define amdgpu_kernel void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid @@ -112,11 +124,13 @@ ; We can't use an SGPR when forming madak ; GCN-LABEL: {{^}}s_v_madak_f32: -; GCN-DAG: s_load_dword [[SB:s[0-9]+]] -; GCN-DAG: 
v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 -; GCN-DAG: {{buffer|flat|global}}_load_dword [[VA:v[0-9]+]] -; GCN-NOT: v_madak_f32 -; GCN: v_mac_f32_e32 [[VK]], [[SB]], [[VA]] +; GCN-DAG: s_load_dword [[SB:s[0-9]+]] +; GFX6_8_9-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 +; GCN-DAG: {{buffer|flat|global}}_load_dword{{(_addtid)?}} [[VA:v[0-9]+]] +; GCN-NOT: v_madak_f32 +; GFX6_8_9: v_mac_f32_e32 [[VK]], [[SB]], [[VA]] +; GFX10-MAD: v_mad_f32 v{{[0-9]+}}, [[VA]], [[SB]], 0x41200000 +; FMA: v_fma_f32 v{{[0-9]+}}, [[VA]], [[SB]], 0x41200000 define amdgpu_kernel void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid @@ -131,11 +145,13 @@ } ; GCN-LABEL: @v_s_madak_f32 -; GCN-DAG: s_load_dword [[SB:s[0-9]+]] -; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 -; GCN-DAG: {{buffer|flat|global}}_load_dword [[VA:v[0-9]+]] -; GCN-NOT: v_madak_f32 -; GCN: v_mac_f32_e32 [[VK]], [[SB]], [[VA]] +; GCN-DAG: s_load_dword [[SB:s[0-9]+]] +; GFX6_8_9-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 +; GCN-DAG: {{buffer|flat|global}}_load_dword{{(_addtid)?}} [[VA:v[0-9]+]] +; GFX6_8_9-NOT: v_madak_f32 +; GFX6_8_9: v_mac_f32_e32 [[VK]], [[SB]], [[VA]] +; GFX10-MAD: v_madak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000 +; FMA: v_fmaak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000 define amdgpu_kernel void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid @@ -151,7 +167,9 @@ ; GCN-LABEL: {{^}}s_s_madak_f32: ; GCN-NOT: v_madak_f32 -; GCN: v_mac_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GFX8_9: v_mac_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; GFX10-MAD: v_mac_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; 
FMA: v_fmac_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} define amdgpu_kernel void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) nounwind { %mul = fmul float %a, %b %madak = fadd float %mul, 10.0 @@ -160,12 +178,14 @@ } ; GCN-LABEL: {{^}}no_madak_src0_modifier_f32: -; GFX6: buffer_load_dword [[VA:v[0-9]+]] -; GFX6: buffer_load_dword [[VB:v[0-9]+]] -; GFX8_9: {{flat|global}}_load_dword [[VB:v[0-9]+]] -; GFX8_9: {{flat|global}}_load_dword [[VA:v[0-9]+]] -; GCN: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}} -; GCN: s_endpgm +; GFX6: buffer_load_dword [[VA:v[0-9]+]] +; GFX6: buffer_load_dword [[VB:v[0-9]+]] +; GFX8_9_10: {{flat|global}}_load_dword [[VB:v[0-9]+]] +; GFX8_9_10: {{flat|global}}_load_dword [[VA:v[0-9]+]] +; GFX6_8_9: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}} +; GFX10-MAD: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, 0x41200000 +; FMA: v_fma_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, 0x41200000 +; GCN: s_endpgm define amdgpu_kernel void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid @@ -184,12 +204,14 @@ } ; GCN-LABEL: {{^}}no_madak_src1_modifier_f32: -; GFX6: buffer_load_dword [[VA:v[0-9]+]] -; GFX6: buffer_load_dword [[VB:v[0-9]+]] -; GFX8_9: {{flat|global}}_load_dword [[VB:v[0-9]+]] -; GFX8_9: {{flat|global}}_load_dword [[VA:v[0-9]+]] -; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}} -; GCN: s_endpgm +; GFX6: buffer_load_dword [[VA:v[0-9]+]] +; GFX6: buffer_load_dword [[VB:v[0-9]+]] +; GFX8_9_10: {{flat|global}}_load_dword [[VB:v[0-9]+]] +; GFX8_9_10: {{flat|global}}_load_dword [[VA:v[0-9]+]] +; GFX6_8_9: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}} +; GFX10-MAD: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, 
0x41200000 +; FMA: v_fma_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, 0x41200000 +; GCN: s_endpgm define amdgpu_kernel void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid @@ -207,16 +229,18 @@ ret void } -; SIFoldOperands should not fold the SGPR copy into the instruction +; SIFoldOperands should not fold the SGPR copy into the instruction before GFX10 ; because the implicit immediate already uses the constant bus. +; On GFX10+ we can use two scalar operands. ; GCN-LABEL: {{^}}madak_constant_bus_violation: -; GCN: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x12|0x48}} -; GCN: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]] -; GCN: {{buffer|flat|global}}_load_dword [[VGPR:v[0-9]+]] -; GCN: v_madak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000 -; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[MADAK]], [[VGPR]] -; GFX6: buffer_store_dword [[MUL]] -; GFX8_9: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[MUL]] +; GCN: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x12|0x48}} +; GCN: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]] +; GCN: {{buffer|flat|global}}_load_dword [[VGPR:v[0-9]+]] +; MAD: v_madak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000 +; FMA: v_fmaak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000 +; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[MADAK]], [[VGPR]] +; GFX6: buffer_store_dword [[MUL]] +; GFX8_9_10: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[MUL]] define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], float %sgpr0, float %sgpr1) #0 { bb: %tmp = icmp eq i32 %arg1, 0 Index: test/CodeGen/AMDGPU/sdwa-peephole-instr-gfx10.mir =================================================================== --- /dev/null +++ 
test/CodeGen/AMDGPU/sdwa-peephole-instr-gfx10.mir @@ -0,0 +1,293 @@ +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=si-peephole-sdwa -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX1010 -check-prefix=GCN %s + +# GCN-LABEL: {{^}}name: vop1_instructions + +# GFX1010: %{{[0-9]+}}:vgpr_32 = V_MOV_B32_sdwa 0, %{{[0-9]+}}, 0, 5, 0, 5, implicit $exec +# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FRACT_F32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, implicit $exec +# GFX1010: %{{[0-9]+}}:vgpr_32 = V_SIN_F32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, implicit $exec +# GFX1010: %{{[0-9]+}}:vgpr_32 = V_CVT_U32_F32_sdwa 0, %{{[0-9]+}}, 0, 5, 0, 5, implicit $exec +# GFX1010: %{{[0-9]+}}:vgpr_32 = V_CVT_F32_I32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, implicit $exec + +# GFX1010: %{{[0-9]+}}:vgpr_32 = V_MOV_B32_sdwa 0, %{{[0-9]+}}, 0, 6, 0, 5, implicit $exec +# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FRACT_F32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, implicit $exec +# GFX1010: %{{[0-9]+}}:vgpr_32 = V_SIN_F32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, implicit $exec +# GFX1010: %{{[0-9]+}}:vgpr_32 = V_CVT_U32_F32_sdwa 0, %{{[0-9]+}}, 0, 5, 0, 5, implicit $exec +# GFX1010: %{{[0-9]+}}:vgpr_32 = V_CVT_F32_I32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, implicit $exec + +# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FRACT_F32_sdwa 1, %{{[0-9]+}}, 0, 0, 5, 0, 5, implicit $exec +# GFX1010: %{{[0-9]+}}:vgpr_32 = V_SIN_F32_sdwa 0, %{{[0-9]+}}, 1, 0, 5, 0, 5, implicit $exec +# GFX1010: %{{[0-9]+}}:vgpr_32 = V_CVT_U32_F32_sdwa 1, %{{[0-9]+}}, 0, 5, 0, 5, implicit $exec +# GFX1010: %{{[0-9]+}}:vgpr_32 = V_CVT_F32_I32_sdwa 0, %{{[0-9]+}}, 0, 1, 5, 0, 5, implicit $exec + +--- +name: vop1_instructions +tracksRegLiveness: true +registers: + - { id: 0, class: vreg_64 } + - { id: 1, class: vreg_64 } + - { id: 2, class: sreg_64 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_32_xm0 } + - { id: 5, class: sreg_32_xm0 } + - { id: 6, class: sreg_32_xm0 } + - { id: 7, class: sreg_32_xm0 } + - { id: 8, class: sreg_32 } + - { id: 9, class: vgpr_32 
} + - { id: 10, class: vgpr_32 } + - { id: 11, class: vgpr_32 } + - { id: 12, class: vgpr_32 } + - { id: 13, class: vgpr_32 } + - { id: 14, class: vgpr_32 } + - { id: 15, class: vgpr_32 } + - { id: 16, class: vgpr_32 } + - { id: 17, class: vgpr_32 } + - { id: 18, class: vgpr_32 } + - { id: 19, class: vgpr_32 } + - { id: 20, class: vgpr_32 } + - { id: 21, class: vgpr_32 } + - { id: 22, class: vgpr_32 } + - { id: 23, class: vgpr_32 } + - { id: 24, class: vgpr_32 } + - { id: 25, class: vgpr_32 } + - { id: 26, class: vgpr_32 } + - { id: 27, class: vgpr_32 } + - { id: 28, class: vgpr_32 } + - { id: 29, class: vgpr_32 } + - { id: 30, class: vgpr_32 } + - { id: 31, class: vgpr_32 } + - { id: 32, class: vgpr_32 } + - { id: 33, class: vgpr_32 } + - { id: 34, class: vgpr_32 } + - { id: 35, class: vgpr_32 } + - { id: 36, class: vgpr_32 } + - { id: 37, class: vgpr_32 } + - { id: 38, class: vgpr_32 } + - { id: 39, class: vgpr_32 } + - { id: 40, class: vgpr_32 } + - { id: 41, class: vgpr_32 } + - { id: 42, class: vgpr_32 } + - { id: 43, class: vgpr_32 } + - { id: 44, class: vgpr_32 } + - { id: 45, class: vgpr_32 } + - { id: 46, class: vgpr_32 } + - { id: 47, class: vgpr_32 } + - { id: 48, class: vgpr_32 } + - { id: 100, class: vgpr_32 } +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr30_sgpr31 + + %2 = COPY $sgpr30_sgpr31 + %1 = COPY $vgpr2_vgpr3 + %0 = COPY $vgpr0_vgpr1 + %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) + + %5 = S_MOV_B32 65535 + %6 = S_MOV_B32 65535 + + %10 = V_LSHRREV_B32_e64 16, %3, implicit $exec + %11 = V_MOV_B32_e32 %10, implicit $exec + %12 = V_LSHLREV_B32_e64 16, %11, implicit $exec + %14 = V_FRACT_F32_e32 123, implicit $exec + %15 = V_LSHLREV_B32_e64 16, %14, implicit $exec + %16 = V_LSHRREV_B32_e64 16, %15, implicit $exec + %17 = V_SIN_F32_e32 %16, implicit $exec + %18 = V_LSHLREV_B32_e64 16, %17, implicit $exec + %19 = V_LSHRREV_B32_e64 16, %18, implicit $exec + %20 = V_CVT_U32_F32_e32 %19, implicit 
$exec + %21 = V_LSHLREV_B32_e64 16, %20, implicit $exec + %23 = V_CVT_F32_I32_e32 123, implicit $exec + %24 = V_LSHLREV_B32_e64 16, %23, implicit $exec + + %25 = V_LSHRREV_B32_e64 16, %3, implicit $exec + %26 = V_MOV_B32_e64 %25, implicit $exec + %26 = V_LSHLREV_B32_e64 16, %26, implicit $exec + %27 = V_FRACT_F32_e64 0, %6, 0, 0, implicit $exec + %28 = V_LSHLREV_B32_e64 16, %27, implicit $exec + %29 = V_LSHRREV_B32_e64 16, %28, implicit $exec + %30 = V_SIN_F32_e64 0, %29, 0, 0, implicit $exec + %31 = V_LSHLREV_B32_e64 16, %30, implicit $exec + %32 = V_LSHRREV_B32_e64 16, %31, implicit $exec + %33 = V_CVT_U32_F32_e64 0, %32, 0, 0, implicit $exec + %34 = V_LSHLREV_B32_e64 16, %33, implicit $exec + %35 = V_CVT_F32_I32_e64 %6, 0, 0, implicit $exec + %36 = V_LSHLREV_B32_e64 16, %35, implicit $exec + + + %37 = V_LSHRREV_B32_e64 16, %36, implicit $exec + %38 = V_FRACT_F32_e64 1, %37, 0, 0, implicit $exec + %39 = V_LSHLREV_B32_e64 16, %38, implicit $exec + %40 = V_LSHRREV_B32_e64 16, %39, implicit $exec + %41 = V_SIN_F32_e64 0, %40, 1, 0, implicit $exec + %42 = V_LSHLREV_B32_e64 16, %41, implicit $exec + %43 = V_LSHRREV_B32_e64 16, %42, implicit $exec + %44 = V_CVT_U32_F32_e64 1, %43, 0, 0, implicit $exec + %45 = V_LSHLREV_B32_e64 16, %44, implicit $exec + %46 = V_LSHRREV_B32_e64 16, %45, implicit $exec + %47 = V_CVT_F32_I32_e64 %46, 0, 1, implicit $exec + %48 = V_LSHLREV_B32_e64 16, %47, implicit $exec + + + %100 = V_MOV_B32_e32 %48, implicit $exec + + FLAT_STORE_DWORD %0, %100, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) + $sgpr30_sgpr31 = COPY %2 + S_SETPC_B64_return $sgpr30_sgpr31 + +... 
+--- +# GCN-LABEL: {{^}}name: vop2_instructions + +# GFX1010: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 5, 0, 6, 5, implicit $exec +# GFX1010: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $exec +# GFX1010: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit $exec +# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FMAC_F32_e32 %{{[0-9]+}}, %{{[0-9]+}}, %{{[0-9]+}}, implicit $exec +# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FMAC_F16_e32 %{{[0-9]+}}, %{{[0-9]+}}, %{{[0-9]+}}, implicit $exec + +# GFX1010: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 5, 0, 6, 5, implicit $exec +# GFX1010: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $exec +# GFX1010: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, 1, implicit $exec +# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FMAC_F32_e64 0, 23, 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, implicit $exec +# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FMAC_F16_e64 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, implicit $exec + +# GFX1010: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $exec +# GFX1010: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 0, 5, 0, 6, 1, implicit $exec +# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FMAC_F32_e64 1, 23, 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 1, 0, implicit $exec +# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FMAC_F16_e64 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 2, implicit $exec + +name: vop2_instructions +tracksRegLiveness: true +registers: + - { id: 0, class: vreg_64 } + - { id: 1, class: vreg_64 } + - { id: 2, class: sreg_64 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_32_xm0 } + - { id: 5, class: sreg_32_xm0 } + - { id: 6, class: sreg_32_xm0 } + - { id: 7, class: sreg_32_xm0 } + - { id: 8, class: sreg_32 } + - { id: 9, class: vgpr_32 } 
+ - { id: 10, class: vgpr_32 } + - { id: 11, class: vgpr_32 } + - { id: 12, class: vgpr_32 } + - { id: 13, class: vgpr_32 } + - { id: 14, class: vgpr_32 } + - { id: 15, class: vgpr_32 } + - { id: 16, class: vgpr_32 } + - { id: 17, class: vgpr_32 } + - { id: 18, class: vgpr_32 } + - { id: 19, class: vgpr_32 } + - { id: 20, class: vgpr_32 } + - { id: 21, class: vgpr_32 } + - { id: 22, class: vgpr_32 } + - { id: 23, class: vgpr_32 } + - { id: 24, class: vgpr_32 } + - { id: 25, class: vgpr_32 } + - { id: 26, class: vgpr_32 } + - { id: 27, class: vgpr_32 } + - { id: 28, class: vgpr_32 } + - { id: 29, class: vgpr_32 } + - { id: 30, class: vgpr_32 } + - { id: 31, class: vgpr_32 } + - { id: 32, class: vgpr_32 } + - { id: 33, class: vgpr_32 } + - { id: 34, class: vgpr_32 } + - { id: 35, class: vgpr_32 } + - { id: 36, class: vgpr_32 } + - { id: 37, class: vgpr_32 } + - { id: 38, class: vgpr_32 } + - { id: 39, class: vgpr_32 } + - { id: 40, class: vgpr_32 } + - { id: 41, class: vgpr_32 } + - { id: 42, class: vgpr_32 } + - { id: 43, class: vgpr_32 } + - { id: 44, class: vgpr_32 } + - { id: 45, class: vgpr_32 } + - { id: 46, class: vgpr_32 } + - { id: 47, class: vgpr_32 } + - { id: 48, class: vgpr_32 } + - { id: 49, class: vgpr_32 } + - { id: 50, class: vgpr_32 } + - { id: 51, class: vgpr_32 } + - { id: 52, class: vgpr_32 } + - { id: 53, class: vgpr_32 } + - { id: 54, class: vgpr_32 } + - { id: 55, class: vgpr_32 } + - { id: 56, class: vgpr_32 } + - { id: 57, class: vgpr_32 } + - { id: 58, class: vgpr_32 } + - { id: 59, class: vgpr_32 } + - { id: 60, class: vgpr_32 } + - { id: 100, class: vgpr_32 } +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr30_sgpr31 + + %2 = COPY $sgpr30_sgpr31 + %1 = COPY $vgpr2_vgpr3 + %0 = COPY $vgpr0_vgpr1 + %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) + + %5 = S_MOV_B32 65535 + %6 = S_MOV_B32 65535 + + %11 = V_LSHRREV_B32_e64 16, %3, implicit $exec + %12 = V_AND_B32_e32 %6, %11, implicit $exec + %13 
= V_LSHLREV_B32_e64 16, %12, implicit $exec + %14 = V_LSHRREV_B32_e64 16, %13, implicit $exec + %15 = V_BFE_U32 %13, 8, 8, implicit $exec + %16 = V_ADD_F32_e32 %14, %15, implicit $exec + %17 = V_LSHLREV_B32_e64 16, %16, implicit $exec + %18 = V_LSHRREV_B32_e64 16, %17, implicit $exec + %19 = V_BFE_U32 %17, 8, 8, implicit $exec + %20 = V_SUB_F16_e32 %18, %19, implicit $exec + %21 = V_LSHLREV_B32_e64 16, %20, implicit $exec + %22 = V_BFE_U32 %20, 8, 8, implicit $exec + %23 = V_FMAC_F32_e32 %21, %22, %22, implicit $exec + %24 = V_LSHLREV_B32_e64 16, %23, implicit $exec + %25 = V_LSHRREV_B32_e64 16, %24, implicit $exec + %26 = V_BFE_U32 %24, 8, 8, implicit $exec + %27 = V_FMAC_F16_e32 %25, %26, %26, implicit $exec + %28 = V_LSHLREV_B32_e64 16, %27, implicit $exec + + %29 = V_LSHRREV_B32_e64 16, %28, implicit $exec + %30 = V_AND_B32_e64 23, %29, implicit $exec + %31 = V_LSHLREV_B32_e64 16, %30, implicit $exec + %32 = V_LSHRREV_B32_e64 16, %31, implicit $exec + %33 = V_BFE_U32 %31, 8, 8, implicit $exec + %34 = V_ADD_F32_e64 0, %32, 0, %33, 0, 0, implicit $exec + %35 = V_LSHLREV_B32_e64 16, %34, implicit $exec + %37 = V_BFE_U32 %35, 8, 8, implicit $exec + %38 = V_SUB_F16_e64 0, 23, 0, %37, 0, 0, implicit $exec + %39 = V_LSHLREV_B32_e64 16, %38, implicit $exec + %40 = V_BFE_U32 %39, 8, 8, implicit $exec + %41 = V_FMAC_F32_e64 0, 23, 0, %40, 0, %40, 0, 0, implicit $exec + %42 = V_LSHLREV_B32_e64 16, %41, implicit $exec + %43 = V_LSHRREV_B32_e64 16, %42, implicit $exec + %44 = V_BFE_U32 %42, 8, 8, implicit $exec + %45 = V_FMAC_F16_e64 0, %43, 0, %44, 0, %44, 0, 0, implicit $exec + %46 = V_LSHLREV_B32_e64 16, %45, implicit $exec + + %47 = V_LSHRREV_B32_e64 16, %46, implicit $exec + %48 = V_BFE_U32 %46, 8, 8, implicit $exec + %49 = V_ADD_F32_e64 0, %47, 1, %48, 0, 0, implicit $exec + %50 = V_LSHLREV_B32_e64 16, %49, implicit $exec + %51 = V_BFE_U32 %50, 8, 8, implicit $exec + %52 = V_SUB_F16_e64 1, 23, 1, %51, 0, 0, implicit $exec + %53 = V_LSHLREV_B32_e64 16, %52, implicit 
$exec + %54 = V_BFE_U32 %53, 8, 8, implicit $exec + %55 = V_FMAC_F32_e64 1, 23, 1, %54, 1, %54, 1, 0, implicit $exec + %56 = V_LSHLREV_B32_e64 16, %55, implicit $exec + %57 = V_LSHRREV_B32_e64 16, %56, implicit $exec + %58 = V_BFE_U32 %56, 8, 8, implicit $exec + %59 = V_FMAC_F16_e64 1, %57, 1, %58, 1, %58, 0, 2, implicit $exec + %60 = V_LSHLREV_B32_e64 16, %59, implicit $exec + + %100 = V_MOV_B32_e32 %60, implicit $exec + + FLAT_STORE_DWORD %0, %100, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) + $sgpr30_sgpr31 = COPY %2 + S_SETPC_B64_return $sgpr30_sgpr31 + +... Index: test/CodeGen/AMDGPU/twoaddr-fma.mir =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/twoaddr-fma.mir @@ -0,0 +1,183 @@ +# RUN: llc -march=amdgcn -mcpu=gfx1010 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck -check-prefix=GCN %s + +# GCN-LABEL: name: test_fmamk_reg_imm_f32 +# GCN: V_FMAMK_F32 killed %0.sub0, 1078523331, killed %1, implicit $exec +--- +name: test_fmamk_reg_imm_f32 +registers: + - { id: 0, class: vreg_64 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: vgpr_32 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = COPY %0.sub1 + %2 = V_MOV_B32_e32 1078523331, implicit $exec + %3 = V_FMAC_F32_e32 killed %0.sub0, %2, killed %1, implicit $exec + +... + +# GCN-LABEL: name: test_fmamk_imm_reg_f32 +# GCN: V_FMAMK_F32 killed %0.sub0, 1078523331, killed %1, implicit $exec +--- +name: test_fmamk_imm_reg_f32 +registers: + - { id: 0, class: vreg_64 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: vgpr_32 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = COPY %0.sub1 + %2 = V_MOV_B32_e32 1078523331, implicit $exec + %3 = V_FMAC_F32_e32 %2, killed %0.sub0, killed %1, implicit $exec + +... 
+ +# GCN-LABEL: name: test_fmaak_f32 +# GCN: V_FMAAK_F32 killed %0.sub0, %0.sub1, 1078523331, implicit $exec +--- +name: test_fmaak_f32 +registers: + - { id: 0, class: vreg_64 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = V_MOV_B32_e32 1078523331, implicit $exec + %2 = V_FMAC_F32_e32 killed %0.sub0, %0.sub1, %1, implicit $exec + +... + +# GCN-LABEL: name: test_fmamk_reg_imm_f16 +# GCN: V_FMAMK_F16 killed %0.sub0, 1078523331, killed %1, implicit $exec +--- +name: test_fmamk_reg_imm_f16 +registers: + - { id: 0, class: vreg_64 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: vgpr_32 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = COPY %0.sub1 + %2 = V_MOV_B32_e32 1078523331, implicit $exec + %3 = V_FMAC_F16_e32 killed %0.sub0, %2, killed %1, implicit $exec + +... + +# GCN-LABEL: name: test_fmamk_imm_reg_f16 +# GCN: V_FMAMK_F16 killed %0.sub0, 1078523331, killed %1, implicit $exec +--- +name: test_fmamk_imm_reg_f16 +registers: + - { id: 0, class: vreg_64 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: vgpr_32 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = COPY %0.sub1 + %2 = V_MOV_B32_e32 1078523331, implicit $exec + %3 = V_FMAC_F16_e32 %2, killed %0.sub0, killed %1, implicit $exec + +... + +# GCN-LABEL: name: test_fmaak_f16 +# GCN: V_FMAAK_F16 killed %0.sub0, %0.sub1, 1078523331, implicit $exec +--- +name: test_fmaak_f16 +registers: + - { id: 0, class: vreg_64 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = V_MOV_B32_e32 1078523331, implicit $exec + %2 = V_FMAC_F16_e32 killed %0.sub0, %0.sub1, %1, implicit $exec +... 
+ +# GCN-LABEL: name: test_fmaak_sgpr_src0_f32 +# GCN: %2:vgpr_32 = V_FMAMK_F32 killed %0, 1078523331, %3:vgpr_32, implicit $exec + +--- +name: test_fmaak_sgpr_src0_f32 +registers: + - { id: 0, class: sreg_32_xm0 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: vgpr_32 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = V_MOV_B32_e32 1078523331, implicit $exec + %2 = V_FMAC_F32_e32 killed %0, %1, %3, implicit $exec + +... + +# GCN-LABEL: name: test_fmaak_inlineimm_src0_f32 +# GCN: %1:vgpr_32 = V_FMAMK_F32 1073741824, 1078523331, %2:vgpr_32, implicit $exec + +--- +name: test_fmaak_inlineimm_src0_f32 +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } +body: | + bb.0: + + %0 = V_MOV_B32_e32 1078523331, implicit $exec + %1 = V_FMAC_F32_e32 1073741824, %0, %2, implicit $exec + +... + +# GCN-LABEL: name: test_fmaak_otherimm_src0_f32 +# GCN: %1:vgpr_32 = V_FMAC_F32_e32 1120403456, %0, %1, implicit $exec + +--- +name: test_fmaak_otherimm_src0_f32 +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } +body: | + bb.0: + + %0 = V_MOV_B32_e32 1078523331, implicit $exec + %1 = V_FMAC_F32_e32 1120403456, %0, %2, implicit $exec + +... + +# GCN-LABEL: name: test_fmaak_other_constantlike_src0_f32 +# GCN: %1:vgpr_32 = V_FMAC_F32_e32 %stack.0, %0, %1, implicit $exec +--- +name: test_fmaak_other_constantlike_src0_f32 +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } +stack: + - { id: 0, name: "", type: default, offset: 0, size: 128, alignment: 8, + callee-saved-register: '', local-offset: 0, debug-info-variable: '', + debug-info-expression: '', debug-info-location: '' } +body: | + bb.0: + + %0 = V_MOV_B32_e32 1078523331, implicit $exec + %1 = V_FMAC_F32_e32 %stack.0, %0, %2, implicit $exec + +...