diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1012,7 +1012,7 @@ AMDGPURsrcIntrinsic<2, 0>; // gfx908 intrinsic -def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic; +def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic<llvm_anyfloat_ty>; class AMDGPUStructBufferAtomic<LLVMType data_ty = llvm_any_ty, bit NoRtn = 0> : Intrinsic < !if(NoRtn, [], [data_ty]), @@ -1049,7 +1049,7 @@ AMDGPURsrcIntrinsic<2, 0>; // gfx908 intrinsic -def int_amdgcn_struct_buffer_atomic_fadd : AMDGPUStructBufferAtomic; +def int_amdgcn_struct_buffer_atomic_fadd : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>; // Obsolescent tbuffer intrinsics. @@ -1181,6 +1181,19 @@ AMDGPURsrcIntrinsic<2, 0>; def int_amdgcn_buffer_atomic_csub : AMDGPUBufferAtomic; + +class AMDGPUBufferAtomicFP : Intrinsic < + [llvm_anyfloat_ty], + [LLVMMatchType<0>, // vdata(VGPR) + llvm_v4i32_ty, // rsrc(SGPR) + llvm_i32_ty, // vindex(VGPR) + llvm_i32_ty, // offset(SGPR/VGPR/imm) + llvm_i1_ty], // slc(imm) + [ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>, + AMDGPURsrcIntrinsic<1, 0>; + +// Legacy form of the intrinsic. raw and struct forms should be preferred. +def int_amdgcn_buffer_atomic_fadd : AMDGPUBufferAtomicFP; } // defset AMDGPUBufferIntrinsics // Uses that do not set the done bit should set IntrWriteMem on the @@ -1800,27 +1813,7 @@ // gfx908 intrinsics // ===----------------------------------------------------------------------===// -class AMDGPUBufferAtomicNoRtn : Intrinsic < - [], - [llvm_anyfloat_ty, // vdata(VGPR) - llvm_v4i32_ty, // rsrc(SGPR) - llvm_i32_ty, // vindex(VGPR) - llvm_i32_ty, // offset(SGPR/VGPR/imm) - llvm_i1_ty], // slc(imm) - [ImmArg<ArgIndex<4>>, IntrWillReturn], "", [SDNPMemOperand]>, - AMDGPURsrcIntrinsic<1, 0>; - -class AMDGPUGlobalAtomicNoRtn : Intrinsic < - [], - [llvm_anyptr_ty, // vaddr - llvm_anyfloat_ty], // vdata(VGPR) - [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>], "", - [SDNPMemOperand]>; - -def int_amdgcn_global_atomic_fadd : AMDGPUGlobalAtomicNoRtn; - -// Legacy form of the intrinsic. raw and struct forms should be preferred. 
-def int_amdgcn_buffer_atomic_fadd : AMDGPUBufferAtomicNoRtn; +def int_amdgcn_global_atomic_fadd : AMDGPUGlobalAtomicRtn; // llvm.amdgcn.mfma.f32.* vdst, srcA, srcB, srcC, cbsz, abid, blgp def int_amdgcn_mfma_f32_32x32x1f32 : GCCBuiltin<"__builtin_amdgcn_mfma_f32_32x32x1f32">, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -141,6 +141,8 @@ bool selectG_EXTRACT_VECTOR_ELT(MachineInstr &I) const; bool selectG_INSERT_VECTOR_ELT(MachineInstr &I) const; bool selectG_SHUFFLE_VECTOR(MachineInstr &I) const; + bool selectAMDGPU_BUFFER_ATOMIC_FADD(MachineInstr &I) const; + bool selectGlobalAtomicFaddIntrinsic(MachineInstr &I) const; std::pair selectVOP3ModsImpl(MachineOperand &Root) const; @@ -180,11 +182,11 @@ selectSmrdSgpr(MachineOperand &Root) const; template - InstructionSelector::ComplexRendererFns + std::pair selectFlatOffsetImpl(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns selectFlatOffset(MachineOperand &Root) const; - InstructionSelector::ComplexRendererFns selectFlatOffsetSigned(MachineOperand &Root) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -30,6 +30,7 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Type.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -1743,6 +1744,8 @@ return selectDSAppendConsume(I, false); case Intrinsic::amdgcn_s_barrier: return selectSBarrier(I); + case Intrinsic::amdgcn_global_atomic_fadd: + return selectGlobalAtomicFaddIntrinsic(I); default: { return selectImpl(I, *CoverageInfo); } @@ -2899,6 +2902,123 @@ return true; } +bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD( + MachineInstr &MI) const { + + MachineBasicBlock *MBB = MI.getParent(); + const DebugLoc &DL = MI.getDebugLoc(); + + if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) { + Function &F = MBB->getParent()->getFunction(); + DiagnosticInfoUnsupported + NoFpRet(F, "return versions of fp atomics not supported", + MI.getDebugLoc(), DS_Error); + F.getContext().diagnose(NoFpRet); + return false; + } + + // FIXME: This is only needed because tablegen requires number of dst operands + // in match and replace pattern to be the same. Otherwise patterns can be + // exported from SDag path. + MachineOperand &VDataIn = MI.getOperand(1); + MachineOperand &VIndex = MI.getOperand(3); + MachineOperand &VOffset = MI.getOperand(4); + MachineOperand &SOffset = MI.getOperand(5); + int16_t Offset = MI.getOperand(6).getImm(); + + bool HasVOffset = !isOperandImmEqual(VOffset, 0, *MRI); + bool HasVIndex = !isOperandImmEqual(VIndex, 0, *MRI); + + unsigned Opcode; + if (HasVOffset) { + Opcode = HasVIndex ? AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN + : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN; + } else { + Opcode = HasVIndex ? 
AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN + : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET; + } + + if (MRI->getType(VDataIn.getReg()).isVector()) { + switch (Opcode) { + case AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN: + Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN; + break; + case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN: + Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFEN; + break; + case AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN: + Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_IDXEN; + break; + case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET: + Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFSET; + break; + } + } + + auto I = BuildMI(*MBB, MI, DL, TII.get(Opcode)); + I.add(VDataIn); + + if (Opcode == AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN || + Opcode == AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN) { + Register IdxReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass); + BuildMI(*MBB, &*I, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg) + .addReg(VIndex.getReg()) + .addImm(AMDGPU::sub0) + .addReg(VOffset.getReg()) + .addImm(AMDGPU::sub1); + + I.addReg(IdxReg); + } else if (HasVIndex) { + I.add(VIndex); + } else if (HasVOffset) { + I.add(VOffset); + } + + I.add(MI.getOperand(2)); // rsrc + I.add(SOffset); + I.addImm(Offset); + renderExtractSLC(I, MI, 7); + I.cloneMemRefs(MI); + + MI.eraseFromParent(); + + return true; +} + +bool AMDGPUInstructionSelector::selectGlobalAtomicFaddIntrinsic( + MachineInstr &MI) const{ + + MachineBasicBlock *MBB = MI.getParent(); + const DebugLoc &DL = MI.getDebugLoc(); + + if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) { + Function &F = MBB->getParent()->getFunction(); + DiagnosticInfoUnsupported + NoFpRet(F, "return versions of fp atomics not supported", + MI.getDebugLoc(), DS_Error); + F.getContext().diagnose(NoFpRet); + return false; + } + + // FIXME: This is only needed because tablegen requires number of dst operands + // in match and replace pattern to be the same. Otherwise patterns can be + // exported from SDag path. + auto Addr = selectFlatOffsetImpl(MI.getOperand(2)); + + Register Data = MI.getOperand(3).getReg(); + const unsigned Opc = MRI->getType(Data).isVector() ? 
+ AMDGPU::GLOBAL_ATOMIC_PK_ADD_F16 : AMDGPU::GLOBAL_ATOMIC_ADD_F32; + auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc)) + .addReg(Addr.first) + .addReg(Data) + .addImm(Addr.second) + .addImm(0) // SLC + .cloneMemRefs(MI); + + MI.eraseFromParent(); + return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); +} + bool AMDGPUInstructionSelector::select(MachineInstr &I) { if (I.isPHI()) return selectPHI(I); @@ -3018,6 +3138,8 @@ assert(Intr && "not an image intrinsic with image pseudo"); return selectImageIntrinsic(I, Intr); } + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: + return selectAMDGPU_BUFFER_ATOMIC_FADD(I); default: return selectImpl(I, *CoverageInfo); } @@ -3260,14 +3382,11 @@ } template -InstructionSelector::ComplexRendererFns +std::pair AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const { MachineInstr *MI = Root.getParent(); - InstructionSelector::ComplexRendererFns Default = {{ - [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // offset - }}; + auto Default = std::make_pair(Root.getReg(), 0); if (!STI.hasFlatInstOffsets()) return Default; @@ -3287,20 +3406,27 @@ Register BasePtr = OpDef->getOperand(1).getReg(); - return {{ - [=](MachineInstrBuilder &MIB) { MIB.addReg(BasePtr); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset.getValue()); }, - }}; + return std::make_pair(BasePtr, Offset.getValue()); } InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const { - return selectFlatOffsetImpl(Root); + auto PtrWithOffset = selectFlatOffsetImpl(Root); + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); }, + }}; } InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const { - return selectFlatOffsetImpl(Root); + auto PtrWithOffset = selectFlatOffsetImpl(Root); + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); }, + }}; } /// Match a zero extend from a 32-bit value to 64-bits. 
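Illustrative note (not part of the patch): with this change the fp atomic intrinsics are declared with a result, but both the GlobalISel selector above and the SelectionDAG path still only handle calls whose result is unused; a used result hits the "return versions of fp atomics not supported" diagnostic. A minimal LLVM IR sketch of the supported form, mirroring the declarations in the updated tests below (the function name no_rtn_use is made up for illustration):
declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* nocapture, float)
define void @no_rtn_use(float addrspace(1)* %ptr, float %data) {
  ; the call now returns a float, but the value is left unused, so the
  ; no-return GLOBAL_ATOMIC_ADD_F32 form can still be selected on gfx908
  %unused = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
  ret void
}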
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -483,6 +483,8 @@ defm atomic_load_umin : ret_noret_binary_atomic_op; defm atomic_load_xor : ret_noret_binary_atomic_op; defm atomic_load_fadd : ret_noret_binary_atomic_op; +let MemoryVT = v2f16 in +defm atomic_load_fadd_v2f16 : ret_noret_binary_atomic_op; defm AMDGPUatomic_cmp_swap : ret_noret_binary_atomic_op; def load_align8_local : PatFrag<(ops node:$ptr), (load_local node:$ptr)>, diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -750,6 +750,9 @@ for (MachineInstr &MI : Range) { for (MachineOperand &Def : MI.defs()) { + if (MRI.use_nodbg_empty(Def.getReg())) + continue; + LLT ResTy = MRI.getType(Def.getReg()); const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI); ResultRegs.push_back(Def.getReg()); @@ -2971,7 +2974,7 @@ } case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: { applyDefaultMapping(OpdMapper); - executeInWaterfallLoop(MI, MRI, {1, 4}); + executeInWaterfallLoop(MI, MRI, {2, 5}); return; } case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: { @@ -3929,7 +3932,8 @@ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: - case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: { + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: { // vdata_out OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); @@ -3952,23 +3956,6 @@ // initialized. break; } - case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: { - // vdata_in - OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); - - // rsrc - OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); - - // vindex - OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); - - // voffset - OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); - - // soffset - OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); - break; - } case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: { // vdata_out OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -1094,14 +1094,12 @@ int_amdgcn_buffer_wbinvl1>; let SubtargetPredicate = HasAtomicFaddInsts in { - defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_NO_RTN < - "buffer_atomic_add_f32", VGPR_32, f32, atomic_fadd_global_noret + "buffer_atomic_add_f32", VGPR_32, f32, atomic_load_fadd_global_noret_32 >; defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN < - "buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_fadd_global_noret + "buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_load_fadd_v2f16_global_noret_32 >; - } // End SubtargetPredicate = HasAtomicFaddInsts //===----------------------------------------------------------------------===// @@ -1394,36 +1392,46 @@ defm : BufferAtomicPatterns; defm : BufferAtomicPatterns; +class NoUseBufferAtomic : PatFrag < + (ops node:$src0, node:$src1, node:$src2, node:$src3, node:$src4, node:$src5, node:$src6, node:$src7), + (vt (Op $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7)), + [{ 
return SDValue(N, 0).use_empty(); }]> { + + let GISelPredicateCode = [{ + return MRI.use_nodbg_empty(MI.getOperand(0).getReg()); + }]; +} + multiclass BufferAtomicPatterns_NO_RTN { def : GCNPat< - (name vt:$vdata_in, v4i32:$rsrc, 0, - 0, i32:$soffset, timm:$offset, - timm:$cachepolicy, 0), + (NoUseBufferAtomic vt:$vdata_in, v4i32:$rsrc, 0, + 0, i32:$soffset, timm:$offset, + timm:$cachepolicy, 0), (!cast(opcode # _OFFSET) getVregSrcForVT.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset, - (as_i16timm $offset), (extract_slc $cachepolicy)) + (as_i16timm $offset), (extract_slc $cachepolicy)) >; def : GCNPat< - (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex, - 0, i32:$soffset, timm:$offset, - timm:$cachepolicy, timm), + (NoUseBufferAtomic vt:$vdata_in, v4i32:$rsrc, i32:$vindex, + 0, i32:$soffset, timm:$offset, + timm:$cachepolicy, timm), (!cast(opcode # _IDXEN) getVregSrcForVT.ret:$vdata_in, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, - (as_i16timm $offset), (extract_slc $cachepolicy)) + (as_i16timm $offset), (extract_slc $cachepolicy)) >; def : GCNPat< - (name vt:$vdata_in, v4i32:$rsrc, 0, - i32:$voffset, i32:$soffset, timm:$offset, - timm:$cachepolicy, 0), + (NoUseBufferAtomic vt:$vdata_in, v4i32:$rsrc, 0, + i32:$voffset, i32:$soffset, timm:$offset, + timm:$cachepolicy, 0), (!cast(opcode # _OFFEN) getVregSrcForVT.ret:$vdata_in, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, - (as_i16timm $offset), (extract_slc $cachepolicy)) + (as_i16timm $offset), (extract_slc $cachepolicy)) >; def : GCNPat< - (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex, - i32:$voffset, i32:$soffset, timm:$offset, - timm:$cachepolicy, timm), + (NoUseBufferAtomic vt:$vdata_in, v4i32:$rsrc, i32:$vindex, + i32:$voffset, i32:$soffset, timm:$offset, + timm:$cachepolicy, timm), (!cast(opcode # _BOTHEN) getVregSrcForVT.ret:$vdata_in, (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -78,6 +78,7 @@ // copy relevant pseudo op flags let SubtargetPredicate = ps.SubtargetPredicate; let AsmMatchConverter = ps.AsmMatchConverter; + let OtherPredicates = ps.OtherPredicates; let TSFlags = ps.TSFlags; let UseNamedOperandTable = ps.UseNamedOperandTable; @@ -714,16 +715,16 @@ FLAT_Global_Atomic_Pseudo<"global_atomic_fmax_x2", VReg_64, f64>; } // End SubtargetPredicate = isGFX10Plus, is_flat_global = 1 -let SubtargetPredicate = HasAtomicFaddInsts, is_flat_global = 1 in { - -defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_NO_RTN < - "global_atomic_add_f32", VGPR_32, f32, atomic_fadd_global_noret ->; -defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_NO_RTN < - "global_atomic_pk_add_f16", VGPR_32, v2f16, atomic_fadd_global_noret ->; - -} // End SubtargetPredicate = HasAtomicFaddInsts +let is_flat_global = 1 in { +let OtherPredicates = [HasAtomicFaddInsts] in { + defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_NO_RTN < + "global_atomic_add_f32", VGPR_32, f32 + >; + defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_NO_RTN < + "global_atomic_pk_add_f16", VGPR_32, v2f16 + >; +} // End OtherPredicates = [HasAtomicFaddInsts] +} // End is_flat_global = 1 //===----------------------------------------------------------------------===// // Flat Patterns @@ -1081,8 +1082,10 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP_X2", AMDGPUatomic_cmp_swap_global_64, i64, v2i64>; defm : 
GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", atomic_load_xor_global_64, i64>; -defm : GlobalFLATNoRtnAtomicPats ; -defm : GlobalFLATNoRtnAtomicPats ; +let OtherPredicates = [HasAtomicFaddInsts] in { +defm : GlobalFLATNoRtnAtomicPats ; +defm : GlobalFLATNoRtnAtomicPats ; +} } // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10 diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1121,7 +1121,7 @@ case Intrinsic::amdgcn_buffer_atomic_fadd: { SIMachineFunctionInfo *MFI = MF.getInfo(); - Info.opc = ISD::INTRINSIC_VOID; + Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getOperand(0)->getType()); Info.ptrVal = MFI->getBufferPSV( *MF.getSubtarget().getInstrInfo(), @@ -1135,18 +1135,6 @@ return true; } - case Intrinsic::amdgcn_global_atomic_fadd: { - Info.opc = ISD::INTRINSIC_VOID; - Info.memVT = MVT::getVT(CI.getOperand(0)->getType() - ->getPointerElementType()); - Info.ptrVal = CI.getOperand(0); - Info.align.reset(); - - // FIXME: Should report an atomic ordering here. - Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; - - return true; - } case Intrinsic::amdgcn_ds_append: case Intrinsic::amdgcn_ds_consume: { Info.opc = ISD::INTRINSIC_W_CHAIN; @@ -1171,6 +1159,17 @@ MachineMemOperand::MOVolatile; return true; } + case Intrinsic::amdgcn_global_atomic_fadd: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(CI.getType()); + Info.ptrVal = CI.getOperand(0); + Info.align.reset(); + Info.flags = MachineMemOperand::MOLoad | + MachineMemOperand::MOStore | + MachineMemOperand::MODereferenceable | + MachineMemOperand::MOVolatile; + return true; + } case Intrinsic::amdgcn_ds_gws_init: case Intrinsic::amdgcn_ds_gws_barrier: case Intrinsic::amdgcn_ds_gws_sema_v: @@ -7034,7 +7033,8 @@ case Intrinsic::amdgcn_buffer_atomic_umax: case Intrinsic::amdgcn_buffer_atomic_and: case Intrinsic::amdgcn_buffer_atomic_or: - case Intrinsic::amdgcn_buffer_atomic_xor: { + case Intrinsic::amdgcn_buffer_atomic_xor: + case Intrinsic::amdgcn_buffer_atomic_fadd: { unsigned Slc = cast(Op.getOperand(6))->getZExtValue(); unsigned IdxEn = 1; if (auto Idx = dyn_cast(Op.getOperand(4))) @@ -7094,6 +7094,17 @@ case Intrinsic::amdgcn_buffer_atomic_xor: Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR; break; + case Intrinsic::amdgcn_buffer_atomic_fadd: + if (!Op.getValue(0).use_empty()) { + DiagnosticInfoUnsupported + NoFpRet(DAG.getMachineFunction().getFunction(), + "return versions of fp atomics not supported", + DL.getDebugLoc(), DS_Error); + DAG.getContext()->diagnose(NoFpRet); + return SDValue(); + } + Opcode = AMDGPUISD::BUFFER_ATOMIC_FADD; + break; default: llvm_unreachable("unhandled atomic opcode"); } @@ -7101,6 +7112,10 @@ return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, M->getMemOperand()); } + case Intrinsic::amdgcn_raw_buffer_atomic_fadd: + return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD); + case Intrinsic::amdgcn_struct_buffer_atomic_fadd: + return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD); case Intrinsic::amdgcn_raw_buffer_atomic_swap: return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP); case Intrinsic::amdgcn_raw_buffer_atomic_add: @@ -7226,6 +7241,27 @@ return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, Op->getVTList(), Ops, VT, M->getMemOperand()); } + case Intrinsic::amdgcn_global_atomic_fadd: { + if 
(!Op.getValue(0).use_empty()) { + DiagnosticInfoUnsupported + NoFpRet(DAG.getMachineFunction().getFunction(), + "return versions of fp atomics not supported", + DL.getDebugLoc(), DS_Error); + DAG.getContext()->diagnose(NoFpRet); + return SDValue(); + } + MemSDNode *M = cast(Op); + SDValue Ops[] = { + M->getOperand(0), // Chain + M->getOperand(2), // Ptr + M->getOperand(3) // Value + }; + + EVT VT = Op.getOperand(3).getValueType(); + return DAG.getAtomic(ISD::ATOMIC_LOAD_FADD, DL, VT, + DAG.getVTList(VT, MVT::Other), Ops, + M->getMemOperand()); + } default: if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrID)) @@ -7547,39 +7583,6 @@ return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, M->getMemoryVT(), M->getMemOperand()); } - case Intrinsic::amdgcn_raw_buffer_atomic_fadd: - return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD); - case Intrinsic::amdgcn_struct_buffer_atomic_fadd: - return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD); - case Intrinsic::amdgcn_buffer_atomic_fadd: { - unsigned Slc = cast(Op.getOperand(6))->getZExtValue(); - unsigned IdxEn = 1; - if (auto Idx = dyn_cast(Op.getOperand(4))) - IdxEn = Idx->getZExtValue() != 0; - SDValue Ops[] = { - Chain, - Op.getOperand(2), // vdata - Op.getOperand(3), // rsrc - Op.getOperand(4), // vindex - SDValue(), // voffset -- will be set by setBufferOffsets - SDValue(), // soffset -- will be set by setBufferOffsets - SDValue(), // offset -- will be set by setBufferOffsets - DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy - DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen - }; - unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); - // We don't know the offset if vindex is non-zero, so clear it. 
- if (IdxEn) - Offset = 0; - EVT VT = Op.getOperand(2).getValueType(); - - auto *M = cast(Op); - M->getMemOperand()->setOffset(Offset); - - return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_FADD, DL, - Op->getVTList(), Ops, VT, - M->getMemOperand()); - } case Intrinsic::amdgcn_end_cf: return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other, Op->getOperand(2), Chain), 0); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -173,18 +173,6 @@ [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore] >; -class SDBufferAtomicNoRtn : SDNode , // rsrc - SDTCisVT<2, i32>, // vindex(VGPR) - SDTCisVT<3, i32>, // voffset(VGPR) - SDTCisVT<4, i32>, // soffset(SGPR) - SDTCisVT<5, i32>, // offset(imm) - SDTCisVT<6, i32>, // cachepolicy(imm) - SDTCisVT<7, i1>]>, // idxen(imm) - [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore] ->; - def SIbuffer_atomic_swap : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SWAP">; def SIbuffer_atomic_add : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_ADD">; def SIbuffer_atomic_sub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SUB">; @@ -198,7 +186,7 @@ def SIbuffer_atomic_inc : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_INC">; def SIbuffer_atomic_dec : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_DEC">; def SIbuffer_atomic_csub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_CSUB">; -def SIbuffer_atomic_fadd : SDBufferAtomicNoRtn <"AMDGPUISD::BUFFER_ATOMIC_FADD">; +def SIbuffer_atomic_fadd : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD">; def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP", SDTypeProfile<1, 9, @@ -316,18 +304,6 @@ } // End let AddressSpaces = ... } // End foreach AddrSpace -def atomic_fadd_global_noret_impl : PatFrag< - (ops node:$ptr, node:$value), - (atomic_load_fadd node:$ptr, node:$value)> { - // FIXME: Move this - let MemoryVT = f32; - let IsAtomic = 1; - let AddressSpaces = StoreAddress_global.AddrSpaces; -} - -def atomic_fadd_global_noret : PatFrags<(ops node:$src0, node:$src1), - [(int_amdgcn_global_atomic_fadd node:$src0, node:$src1), - (atomic_fadd_global_noret_impl node:$src0, node:$src1)]>; //===----------------------------------------------------------------------===// // SDNodes PatFrags for loads/stores with a glue input. 
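Illustrative note (not part of the patch): the raw/struct buffer variants follow the same rule — they now return the data type, and the new NoUseBufferAtomic PatFrag plus the GlobalISel selector only match when that result is dead. A minimal LLVM IR sketch based on the raw-buffer test declarations in this patch (the function name buffer_fadd_no_use and the exact parameter layout are chosen here for illustration):
declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg)
define amdgpu_ps void @buffer_fadd_no_use(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
  ; cachepolicy = 0 (slc off); the returned float is dropped, which is the
  ; only case the no-return BUFFER_ATOMIC_ADD_F32 patterns can select
  %unused = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
  ret void
}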
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2435,7 +2435,7 @@ def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_DEC : BufferAtomicGenericInstruction; -def G_AMDGPU_BUFFER_ATOMIC_FADD : BufferAtomicGenericInstruction<1/*NoRtn*/>; +def G_AMDGPU_BUFFER_ATOMIC_FADD : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_CMPSWAP : AMDGPUGenericInstruction { let OutOperandList = (outs type0:$dst); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd-with-ret.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd-with-ret.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd-with-ret.ll @@ -0,0 +1,10 @@ +; RUN: not --crash llc -global-isel < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908 + +declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* nocapture, float) #0 + +; GFX908: error: {{.*}} return versions of fp atomics not supported + +define float @global_atomic_fadd_f32_rtn(float addrspace(1)* %ptr, float %data) { + %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data) + ret float %ret +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll @@ -8,7 +8,7 @@ ; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] - call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %ptr, float %data) + %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data) ret void } @@ -26,7 +26,7 @@ ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, float addrspace(1)* %ptr, i64 512 - call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %gep, float %data) + %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %gep, float %data) ret void } @@ -44,7 +44,7 @@ ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, float addrspace(1)* %ptr, i64 -511 - call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %gep, float %data) + %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %gep, float %data) ret void } @@ -62,7 +62,7 @@ ; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX908-NEXT: s_endpgm %gep = getelementptr float, float addrspace(1)* %ptr, i64 512 - call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %gep, float %data) + %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %gep, float %data) ret void } @@ -73,7 +73,7 @@ ; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] - call void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data) + %ret = call <2 x half> 
@llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data) ret void } @@ -91,11 +91,11 @@ ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %ptr, i64 -511 - call void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)* %gep, <2 x half> %data) + %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %gep, <2 x half> %data) ret void } -declare void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* nocapture, float) #0 -declare void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)* nocapture, <2 x half>) #0 +declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* nocapture, float) #0 +declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* nocapture, <2 x half>) #0 attributes #0 = { argmemonly nounwind willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll @@ -16,7 +16,7 @@ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } @@ -35,7 +35,7 @@ ; CHECK: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 - call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) ret void } @@ -52,7 +52,7 @@ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) ret void } @@ -70,7 +70,7 @@ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 - call void 
@llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) ret void } @@ -117,7 +117,7 @@ ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] ; CHECK: bb.4: ; CHECK: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } @@ -162,7 +162,7 @@ ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] ; CHECK: bb.4: ; CHECK: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) ret void } @@ -181,7 +181,7 @@ ; CHECK: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset = add i32 %voffset.base, 4095 - call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } @@ -200,7 +200,7 @@ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2) ret void } @@ -218,7 +218,7 @@ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK: BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } @@ -235,11 +235,11 @@ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK: BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 - call void @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) + %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) ret void } -declare void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg) #0 -declare void 
@llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32 immarg) #0 +declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg) #0 +declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32 immarg) #0 attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd-with-ret.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd-with-ret.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd-with-ret.ll @@ -0,0 +1,11 @@ +; RUN: not --crash llc -global-isel < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908 + +declare float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #0 + +; GFX908: error: {{.*}} return versions of fp atomics not supported + +define amdgpu_ps float @buffer_atomic_add_f32_rtn(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +main_body: + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + ret float %ret +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll @@ -18,7 +18,7 @@ ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 ; CHECK: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 - call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } @@ -39,7 +39,7 @@ ; CHECK: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 - call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0) ret void } @@ -57,7 +57,7 @@ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 - call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 4095, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 4095, i32 %soffset, i32 0) ret void } @@ -76,7 +76,7 @@ ; CHECK: 
[[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 - call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void } @@ -126,7 +126,7 @@ ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] ; CHECK: bb.4: ; CHECK: S_ENDPGM 0 - call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } @@ -173,7 +173,7 @@ ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] ; CHECK: bb.4: ; CHECK: S_ENDPGM 0 - call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void } @@ -194,7 +194,7 @@ ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 ; CHECK: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 - call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret void } @@ -212,7 +212,7 @@ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 - call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 2) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 2) ret void } @@ -232,7 +232,7 @@ ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 ; CHECK: BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 - call void @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } @@ -250,11 +250,11 @@ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK: 
BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 - call void @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) + %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void } -declare void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #0 -declare void @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg) #0 +declare float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #0 +declare <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg) #0 attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir @@ -58,14 +58,12 @@ ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: .1: ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %11, %bb.1 - ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %2(<4 x s32>), %bb.1 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.0, %9, %bb.1 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -105,14 +103,12 @@ ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: .1: ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %10, %bb.1 - ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %2(<4 x s32>), %bb.1 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.0, %8, %bb.1 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 
[[UV]].sub1(s64), implicit $exec ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll @@ -1961,16 +1961,12 @@ ; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]] ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3, %bb.2 - ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2 - ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2 - ; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -2013,16 +2009,12 @@ ; GREEDY: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]] ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; GREEDY: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; GREEDY: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GREEDY: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; GREEDY: bb.2: ; GREEDY: successors: %bb.3, %bb.2 - ; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2 - ; GREEDY: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2 - ; GREEDY: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2 + ; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2 ; GREEDY: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; GREEDY: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; GREEDY: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -2074,16 +2066,12 @@ ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: 
[[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3, %bb.2 - ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %31, %bb.2 - ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2 - ; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %23(<4 x s32>), %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -2127,16 +2115,12 @@ ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) ; GREEDY: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; GREEDY: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; GREEDY: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GREEDY: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; GREEDY: bb.2: ; GREEDY: successors: %bb.3, %bb.2 - ; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %31, %bb.2 - ; GREEDY: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2 - ; GREEDY: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %23(<4 x s32>), %bb.2 + ; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.2 ; GREEDY: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; GREEDY: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; GREEDY: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -2186,16 +2170,12 @@ ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3, %bb.2 - ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %31, %bb.2 - ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2 - ; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %23(<4 x s32>), %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK: 
[[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -2239,16 +2219,12 @@ ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) ; GREEDY: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; GREEDY: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; GREEDY: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GREEDY: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; GREEDY: bb.2: ; GREEDY: successors: %bb.3, %bb.2 - ; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %31, %bb.2 - ; GREEDY: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2 - ; GREEDY: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %23(<4 x s32>), %bb.2 + ; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.2 ; GREEDY: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; GREEDY: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; GREEDY: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -2297,16 +2273,12 @@ ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3, %bb.2 - ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2 - ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2 - ; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -2349,16 +2321,12 @@ ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; GREEDY: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; GREEDY: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GREEDY: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GREEDY: 
[[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; GREEDY: bb.2: ; GREEDY: successors: %bb.3, %bb.2 - ; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2 - ; GREEDY: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2 - ; GREEDY: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2 + ; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2 ; GREEDY: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; GREEDY: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; GREEDY: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -2407,16 +2375,12 @@ ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3, %bb.2 - ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2 - ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2 - ; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -2459,16 +2423,12 @@ ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; GREEDY: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; GREEDY: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GREEDY: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; GREEDY: bb.2: ; GREEDY: successors: %bb.3, %bb.2 - ; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2 - ; GREEDY: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2 - ; GREEDY: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2 + ; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2 ; GREEDY: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; GREEDY: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; GREEDY: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), 
[[V_READFIRSTLANE_B32_1]](s32) @@ -2517,16 +2477,12 @@ ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3, %bb.2 - ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2 - ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2 - ; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -2569,16 +2525,12 @@ ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; GREEDY: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; GREEDY: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GREEDY: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; GREEDY: bb.2: ; GREEDY: successors: %bb.3, %bb.2 - ; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2 - ; GREEDY: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2 - ; GREEDY: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2 + ; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2 ; GREEDY: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; GREEDY: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; GREEDY: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -2626,16 +2578,12 @@ ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; CHECK: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; CHECK: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3, %bb.2 - ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2 - ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x 
s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2 - ; CHECK: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -2677,16 +2625,12 @@ ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; GREEDY: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 ; GREEDY: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; GREEDY: [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; GREEDY: [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GREEDY: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; GREEDY: bb.2: ; GREEDY: successors: %bb.3, %bb.2 - ; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %30, %bb.2 - ; GREEDY: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.1, %21(<4 x s32>), %bb.2 - ; GREEDY: [[PHI2:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %22(<4 x s32>), %bb.2 + ; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2 ; GREEDY: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; GREEDY: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; GREEDY: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) diff --git a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll --- a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll @@ -15,27 +15,27 @@ ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 32, align 1, addrspace 4) ; GCN: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: 
BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 48, align 1, addrspace 4) ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 64, align 1, addrspace 4) ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 80, align 1, addrspace 4) ; GCN: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", 
align 1, addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY]], %subreg.sub1 ; GCN: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 96, align 1, addrspace 4) @@ -49,13 +49,13 @@ ; GCN: [[DEF3:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF3]].sub0 - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec - ; GCN: BUFFER_ATOMIC_ADD_F32_OFFSET [[V_MOV_B32_e32_1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom "TargetCustom7" + 112, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_F32_OFFEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (load store 4 on custom "TargetCustom7", addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom "TargetCustom7", addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom "TargetCustom7", addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: BUFFER_ATOMIC_ADD_F32_OFFSET [[V_MOV_B32_e32_1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 112, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_F32_OFFEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 128, align 1, addrspace 4) ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 64 ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_1]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 128, align 1, addrspace 4) @@ -64,7 +64,7 @@ ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY6:%[0-9]+]]:sreg_32 = COPY 
[[COPY]] ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY6]], 128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 144, align 1, addrspace 4) ; GCN: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 72 ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_3]], 72, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 144, align 1, addrspace 4) @@ -73,7 +73,7 @@ ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY7]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 160, align 1, addrspace 4) ; GCN: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 80 ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_5]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 160, align 1, addrspace 4) @@ -82,7 +82,7 @@ ; GCN: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[COPY8]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 176, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 176, align 1, addrspace 4) ; GCN: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[DEF4]].sub0 @@ -101,7 +101,7 @@ ; GCN: [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[COPY13]], 176, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[DEF8]].sub0 - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 192, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 192, align 1, addrspace 4) ; GCN: [[S_MOV_B32_9:%[0-9]+]]:sreg_32 = 
S_MOV_B32 96 ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_9]], 96, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 192, align 1, addrspace 4) @@ -110,7 +110,7 @@ ; GCN: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY15]], 192, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 208, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 208, align 1, addrspace 4) ; GCN: [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 104 ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_11]], 104, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 208, align 1, addrspace 4) @@ -119,7 +119,7 @@ ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY16:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY16]], 208, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY17]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 224, align 1, addrspace 4) ; GCN: [[S_MOV_B32_13:%[0-9]+]]:sreg_32 = S_MOV_B32 112 @@ -135,7 +135,7 @@ ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY20]], [[S_LOAD_DWORDX4_IMM]], [[COPY21]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY22]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: 
(dereferenceable load 16 from custom "TargetCustom7" + 240, align 1, addrspace 4) ; GCN: [[S_MOV_B32_15:%[0-9]+]]:sreg_32 = S_MOV_B32 120 @@ -150,7 +150,7 @@ ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY25]], [[S_LOAD_DWORDX4_IMM]], [[COPY26]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY27]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 256, align 1, addrspace 4) ; GCN: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] @@ -164,7 +164,7 @@ ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY30]], [[S_LOAD_DWORDX4_IMM]], [[COPY31]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: [[COPY32:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[DEF9:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY32]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 272, align 1, addrspace 4) @@ -193,7 +193,7 @@ ; GCN: [[DEF15:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY43:%[0-9]+]]:vgpr_32 = COPY [[DEF15]].sub0 - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: [[COPY44:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN2]], [[COPY44]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 288, align 1, addrspace 4) ; GCN: [[COPY45:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] @@ -207,7 +207,7 @@ ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN5]], [[COPY47]], [[S_LOAD_DWORDX4_IMM]], [[COPY48]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN6]], [[V_MOV_B32_e32_]], 
[[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) - ; GCN: INLINEASM &"", 1 + ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: [[COPY49:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]], [[COPY49]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 304, align 1, addrspace 4) ; GCN: [[S_MOV_B32_21:%[0-9]+]]:sreg_32 = S_MOV_B32 152 @@ -268,10 +268,10 @@ call void asm sideeffect "", "" () - call void @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 0, i32 112, i1 false) #2 - call void @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 0, i32 %arg1, i1 false) #2 - call void @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 1, i32 112, i1 false) #2 - call void @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 %arg1, i32 112, i1 false) #2 + %fadd1 = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 0, i32 112, i1 false) #2 + %fadd2 = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 0, i32 %arg1, i1 false) #2 + %fadd3 = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 1, i32 112, i1 false) #2 + %fadd4 = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 %arg1, i32 112, i1 false) #2 call void asm sideeffect "", "" () @@ -392,7 +392,7 @@ declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1 declare i32 @llvm.amdgcn.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i1) #2 declare i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i1) #2 -declare void @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1) #2 +declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1) #2 declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #0 declare <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32) #0 declare i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i32) #2 diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll @@ -68,7 +68,6 @@ declare i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* nocapture, i32) #0 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1 -declare void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* nocapture, float) #2 attributes #0 = { argmemonly nounwind } attributes #1 = { nounwind readnone willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll @@ -1,5 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py ; RUN: opt -S -codegenprepare -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=OPT %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=GCN %s @@ -9,14 +8,14 @@ ; OPT-LABEL: @test_sink_small_offset_global_atomic_fadd_f32( ; OPT-NEXT: entry: ; OPT-NEXT: [[OUT_GEP:%.*]] = getelementptr float, float addrspace(1)* [[OUT:%.*]], i32 999999 -; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #3 +; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) [[ATTR3:#.*]] ; OPT-NEXT: [[CMP:%.*]] = icmp eq i32 [[TID]], 0 ; OPT-NEXT: br i1 [[CMP]], label [[ENDIF:%.*]], label [[IF:%.*]] ; OPT: if: ; OPT-NEXT: [[TMP0:%.*]] = bitcast float addrspace(1)* [[IN:%.*]] to i8 addrspace(1)* ; OPT-NEXT: [[SUNKADDR:%.*]] = getelementptr i8, i8 addrspace(1)* [[TMP0]], i64 28 ; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SUNKADDR]] to float addrspace(1)* -; OPT-NEXT: call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* [[TMP1]], float 2.000000e+00) +; OPT-NEXT: [[FADD2:%.*]] = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* [[TMP1]], float 2.000000e+00) ; OPT-NEXT: [[VAL:%.*]] = load volatile float, float addrspace(1)* undef, align 4 ; OPT-NEXT: br label [[ENDIF]] ; OPT: endif: @@ -57,7 +56,7 @@ br i1 %cmp, label %endif, label %if if: - call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %in.gep, float 2.0) + %fadd2 = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %in.gep, float 2.0) %val = load volatile float, float addrspace(1)* undef br label %endif @@ -71,7 +70,7 @@ } declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1 -declare void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* nocapture, float) #2 +declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* nocapture, float) #2 attributes #0 = { argmemonly nounwind } attributes #1 = { nounwind readnone willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/fail-select-buffer-atomic-fadd.ll b/llvm/test/CodeGen/AMDGPU/fail-select-buffer-atomic-fadd.ll --- a/llvm/test/CodeGen/AMDGPU/fail-select-buffer-atomic-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/fail-select-buffer-atomic-fadd.ll @@ -8,12 +8,12 @@ ; have the instruction available. ; FIXME: Should also really make sure the v2f16 version fails. 
-; FAIL: LLVM ERROR: Cannot select: {{.+}}: ch = BUFFER_ATOMIC_FADD +; FAIL: LLVM ERROR: Cannot select: {{.+}}: f32,ch = BUFFER_ATOMIC_FADD define amdgpu_cs void @atomic_fadd(<4 x i32> inreg %arg0) { - call void @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %arg0, i32 0, i32 112, i1 false) + %ret = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %arg0, i32 0, i32 112, i1 false) ret void } -declare void @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1 immarg) #0 +declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1 immarg) #0 attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll --- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll @@ -1,12 +1,12 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX900 %s -; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX900,CAS %s +; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,CAS %s ; GCN-LABEL: {{^}}global_atomic_fadd_ret_f32: -; GCN: [[LOOP:BB[0-9]+_[0-9]+]] -; GCN: v_add_f32_e32 -; GCN: global_atomic_cmpswap -; GCN: s_andn2_b64 exec, exec, -; GCN-NEXT: s_cbranch_execnz [[LOOP]] +; CAS: [[LOOP:BB[0-9]+_[0-9]+]] +; CAS: v_add_f32_e32 +; CAS: global_atomic_cmpswap +; CAS: s_andn2_b64 exec, exec, +; CAS-NEXT: s_cbranch_execnz [[LOOP]] define amdgpu_kernel void @global_atomic_fadd_ret_f32(float addrspace(1)* %ptr) { %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst store float %result, float addrspace(1)* undef diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx908.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx908.ll --- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx908.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx908.ll @@ -15,7 +15,7 @@ %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to float addrspace(1)* - call void @llvm.amdgcn.global.atomic.fadd.f32.p1f32(float addrspace(1)* %cast.gep0, float %data) + %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32(float addrspace(1)* %cast.gep0, float %data) ret void } @@ -28,7 +28,7 @@ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to float addrspace(1)* - call void @llvm.amdgcn.global.atomic.fadd.f32.p1f32(float addrspace(1)* %cast.gep1, float %data) + %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32(float addrspace(1)* %cast.gep1, float %data) ret void } @@ -40,7 +40,7 @@ %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to <2 x half> addrspace(1)* - call void @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16(<2 x half> addrspace(1)* %cast.gep0, <2 x half> %data) + %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16(<2 x half> addrspace(1)* %cast.gep0, <2 x half> %data) ret void } @@ -53,11 +53,11 @@ %gep0 = getelementptr 
inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to <2 x half> addrspace(1)* - call void @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16(<2 x half> addrspace(1)* %cast.gep1, <2 x half> %data) + %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16(<2 x half> addrspace(1)* %cast.gep1, <2 x half> %data) ret void } -declare void @llvm.amdgcn.global.atomic.fadd.f32.p1f32(float addrspace(1)* nocapture, float) #0 -declare void @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16(<2 x half> addrspace(1)* nocapture, <2 x half>) #0 +declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32(float addrspace(1)* nocapture, float) #0 +declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16(<2 x half> addrspace(1)* nocapture, <2 x half>) #0 attributes #0 = { argmemonly nounwind willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll @@ -1,15 +1,15 @@ ; RUN: llc < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs | FileCheck %s -check-prefix=GCN -declare void @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1) -declare void @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i1) -declare void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)*, float) -declare void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)*, <2 x half>) +declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1) +declare <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i1) +declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)*, float) +declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)*, <2 x half>) ; GCN-LABEL: {{^}}buffer_atomic_add_f32: ; GCN: buffer_atomic_add_f32 v0, v1, s[0:3], 0 idxen define amdgpu_ps void @buffer_atomic_add_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { main_body: - call void @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) + %ret = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) ret void } @@ -17,7 +17,7 @@ ; GCN: buffer_atomic_add_f32 v0, v1, s[0:3], 0 idxen offset:4 slc define amdgpu_ps void @buffer_atomic_add_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { main_body: - call void @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1) + %ret = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1) ret void } @@ -25,7 +25,7 @@ ; GCN: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 idxen define amdgpu_ps void @buffer_atomic_pk_add_v2f16(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %vindex) { main_body: - call void @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) + %ret = call <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) ret void } @@ -33,7 +33,7 @@ ; GCN: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 idxen offset:4 slc define amdgpu_ps void @buffer_atomic_pk_add_v2f16_off4_slc(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %vindex) { main_body: - call void 
@llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1) + %ret = call <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1) ret void } @@ -41,7 +41,7 @@ ; GCN: global_atomic_add_f32 v[{{[0-9:]+}}], v{{[0-9]+}}, off define amdgpu_kernel void @global_atomic_add_f32(float addrspace(1)* %ptr, float %data) { main_body: - call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %ptr, float %data) + %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data) ret void } @@ -50,7 +50,7 @@ define amdgpu_kernel void @global_atomic_add_f32_off4(float addrspace(1)* %ptr, float %data) { main_body: %p = getelementptr float, float addrspace(1)* %ptr, i64 1 - call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %p, float %data) + %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %p, float %data) ret void } @@ -59,7 +59,7 @@ define amdgpu_kernel void @global_atomic_add_f32_offneg4(float addrspace(1)* %ptr, float %data) { main_body: %p = getelementptr float, float addrspace(1)* %ptr, i64 -1 - call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %p, float %data) + %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %p, float %data) ret void } @@ -67,7 +67,7 @@ ; GCN: global_atomic_pk_add_f16 v[{{[0-9:]+}}], v{{[0-9]+}}, off define amdgpu_kernel void @global_atomic_pk_add_v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data) { main_body: - call void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data) + %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data) ret void } @@ -76,7 +76,7 @@ define amdgpu_kernel void @global_atomic_pk_add_v2f16_off4(<2 x half> addrspace(1)* %ptr, <2 x half> %data) { main_body: %p = getelementptr <2 x half>, <2 x half> addrspace(1)* %ptr, i64 1 - call void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)* %p, <2 x half> %data) + %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %p, <2 x half> %data) ret void } @@ -85,7 +85,7 @@ define amdgpu_kernel void @global_atomic_pk_add_v2f16_offneg4(<2 x half> addrspace(1)* %ptr, <2 x half> %data) { main_body: %p = getelementptr <2 x half>, <2 x half> addrspace(1)* %ptr, i64 -1 - call void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)* %p, <2 x half> %data) + %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %p, <2 x half> %data) ret void } @@ -94,7 +94,7 @@ ; GCN-LABEL: {{^}}global_atomic_fadd_f32_wrong_subtarget: ; GCN: global_atomic_add_f32 v[{{[0-9:]+}}], v{{[0-9]+}}, off define amdgpu_kernel void @global_atomic_fadd_f32_wrong_subtarget(float addrspace(1)* %ptr, float %data) #0 { - call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %ptr, float %data) + %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll @@ -10,7 +10,7 @@ ; CHECK-NEXT: s_mov_b32 s8, s2 ; CHECK-NEXT: 
buffer_atomic_add_f32 v0, v1, s[8:11], s6 offen ; CHECK-NEXT: s_endpgm - call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 24) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 24) ret void } @@ -23,7 +23,7 @@ ; CHECK-NEXT: s_mov_b32 s8, s2 ; CHECK-NEXT: buffer_atomic_add_f32 v0, off, s[8:11], s6 ; CHECK-NEXT: s_endpgm - call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) ret void } @@ -36,7 +36,7 @@ ; CHECK-NEXT: s_mov_b32 s8, s2 ; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[8:11], s6 offen ; CHECK-NEXT: s_endpgm - call void @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } @@ -49,7 +49,7 @@ ; CHECK-NEXT: s_mov_b32 s8, s2 ; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, off, s[8:11], s6 offset:92 ; CHECK-NEXT: s_endpgm - call void @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 92, i32 %soffset, i32 0) + %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 92, i32 %soffset, i32 0) ret void } @@ -62,11 +62,11 @@ ; CHECK-NEXT: s_mov_b32 s8, s2 ; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s6 offen slc ; CHECK-NEXT: s_endpgm - call void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2) + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2) ret void } -declare void @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg) #0 -declare void @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32 immarg) #0 +declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg) #0 +declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32 immarg) #0 attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.fadd.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.fadd.ll @@ -11,7 +11,7 @@ ; CHECK-NEXT: s_mov_b32 s8, s2 ; CHECK-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[8:11], s6 idxen offen ; CHECK-NEXT: s_endpgm - call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } @@ -25,7 +25,7 @@ ; CHECK-NEXT: s_mov_b32 s8, s2 ; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[8:11], s6 idxen ; CHECK-NEXT: s_endpgm - call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void } @@ -38,7 +38,7 @@ ; CHECK-NEXT: s_mov_b32 s8, s2 ; CHECK-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[8:11], 
s6 idxen offen slc ; CHECK-NEXT: s_endpgm - call void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) + %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret void } @@ -51,11 +51,11 @@ ; CHECK-NEXT: s_mov_b32 s8, s2 ; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[8:11], s6 idxen offen ; CHECK-NEXT: s_endpgm - call void @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } -declare void @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #0 -declare void @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg) #0 +declare float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #0 +declare <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg) #0 attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll b/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll --- a/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll +++ b/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll @@ -29,12 +29,12 @@ %cast = ptrtoint i32 addrspace(1)* %arrayidx0 to i64 %shl = shl i64 %cast, 2 %castback = inttoptr i64 %shl to float addrspace(1)* - call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %castback, float 100.0) + call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %castback, float 100.0) store volatile i64 %cast, i64 addrspace(1)* %extra.use, align 4 ret void } -declare void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* nocapture, float) #1 +declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* nocapture, float) #1 attributes #0 = { nounwind } attributes #1 = { argmemonly nounwind willreturn }
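; ---------------------------------------------------------------------------
; Illustrative sketch only, not part of the patch: a minimal standalone module
; showing the value-returning fadd intrinsic signatures that the tests above
; exercise. The function names here are hypothetical; the declarations mirror
; the ones in llvm.amdgcn.atomic.fadd.ll, assuming a gfx908-style subtarget
; where only the no-return encodings are selectable.
declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1)
declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)*, float)

define amdgpu_kernel void @fadd_global_example(float addrspace(1)* %ptr, float %data) {
  ; Result is bound but ignored; consuming it would trigger the
  ; "return versions of fp atomics not supported" diagnostic on subtargets
  ; without the return variants.
  %unused0 = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
  ret void
}

define amdgpu_ps void @fadd_buffer_example(<4 x i32> inreg %rsrc, float %data, i32 %vindex) {
  ; Legacy buffer form: vdata, rsrc, vindex, offset, then the i1 slc flag.
  %unused1 = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 false)
  ret void
}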