Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -815,6 +815,123 @@
 def int_amdgcn_buffer_store_format : AMDGPUBufferStore;
 def int_amdgcn_buffer_store : AMDGPUBufferStore;
 
+// New buffer intrinsics with separate raw and struct variants. The raw
+// variant never has an index. The struct variant always has an index, even if
+// it is const 0. A struct intrinsic with constant 0 index is different from
+// the corresponding raw intrinsic on gfx9+ because the behavior of bounds
+// checking and swizzling changes depending on whether idxen is set in the
+// instruction. These new intrinsics also keep the offset and soffset
+// arguments separate as they behave differently in bounds checking and
+// swizzling.
+class AMDGPURawBufferLoad : Intrinsic <
+  [llvm_anyfloat_ty],
+  [llvm_v4i32_ty, // rsrc(SGPR)
+   llvm_i32_ty,   // offset(VGPR/imm, included in bounds checking and swizzling)
+   llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds checking and swizzling)
+   llvm_i32_ty],  // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
+  [IntrReadMem], "", [SDNPMemOperand]>,
+  AMDGPURsrcIntrinsic<0>;
+def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad;
+def int_amdgcn_raw_buffer_load : AMDGPURawBufferLoad;
+
+class AMDGPUStructBufferLoad : Intrinsic <
+  [llvm_anyfloat_ty],
+  [llvm_v4i32_ty, // rsrc(SGPR)
+   llvm_i32_ty,   // vindex(VGPR)
+   llvm_i32_ty,   // offset(VGPR/imm, included in bounds checking and swizzling)
+   llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds checking and swizzling)
+   llvm_i32_ty],  // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
+  [IntrReadMem], "", [SDNPMemOperand]>,
+  AMDGPURsrcIntrinsic<0>;
+def int_amdgcn_struct_buffer_load_format : AMDGPUStructBufferLoad;
+def int_amdgcn_struct_buffer_load : AMDGPUStructBufferLoad;
+
+class AMDGPURawBufferStore : Intrinsic <
+  [],
+  [llvm_anyfloat_ty, // vdata(VGPR) -- can currently only select f32, v2f32, v4f32
+   llvm_v4i32_ty,    // rsrc(SGPR)
+   llvm_i32_ty,      // offset(VGPR/imm, included in bounds checking and swizzling)
+   llvm_i32_ty,      // soffset(SGPR/imm, excluded from bounds checking and swizzling)
+   llvm_i32_ty],     // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
+  [IntrWriteMem], "", [SDNPMemOperand]>,
+  AMDGPURsrcIntrinsic<1>;
+def int_amdgcn_raw_buffer_store_format : AMDGPURawBufferStore;
+def int_amdgcn_raw_buffer_store : AMDGPURawBufferStore;
+
+class AMDGPUStructBufferStore : Intrinsic <
+  [],
+  [llvm_anyfloat_ty, // vdata(VGPR) -- can currently only select f32, v2f32, v4f32
+   llvm_v4i32_ty,    // rsrc(SGPR)
+   llvm_i32_ty,      // vindex(VGPR)
+   llvm_i32_ty,      // offset(VGPR/imm, included in bounds checking and swizzling)
+   llvm_i32_ty,      // soffset(SGPR/imm, excluded from bounds checking and swizzling)
+   llvm_i32_ty],     // cachepolicy(imm; bit 0 = glc, bit 1 = slc)
+  [IntrWriteMem], "", [SDNPMemOperand]>,
+  AMDGPURsrcIntrinsic<1>;
+def int_amdgcn_struct_buffer_store_format : AMDGPUStructBufferStore;
+def int_amdgcn_struct_buffer_store : AMDGPUStructBufferStore;
+
+class AMDGPURawBufferAtomic : Intrinsic <
+  [llvm_i32_ty],
+  [llvm_i32_ty,   // vdata(VGPR)
+   llvm_v4i32_ty, // rsrc(SGPR)
+   llvm_i32_ty,   // offset(VGPR/imm, included in bounds checking and swizzling)
+   llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds checking and swizzling)
+   llvm_i32_ty],  // cachepolicy(imm; bit 1 = slc)
+  [], "", [SDNPMemOperand]>,
+  AMDGPURsrcIntrinsic<1, 0>;
+def int_amdgcn_raw_buffer_atomic_swap : AMDGPURawBufferAtomic;
+def int_amdgcn_raw_buffer_atomic_add : AMDGPURawBufferAtomic;
+def int_amdgcn_raw_buffer_atomic_sub : AMDGPURawBufferAtomic;
+def int_amdgcn_raw_buffer_atomic_smin : AMDGPURawBufferAtomic;
+def int_amdgcn_raw_buffer_atomic_umin : AMDGPURawBufferAtomic;
+def int_amdgcn_raw_buffer_atomic_smax : AMDGPURawBufferAtomic;
+def int_amdgcn_raw_buffer_atomic_umax : AMDGPURawBufferAtomic;
+def int_amdgcn_raw_buffer_atomic_and : AMDGPURawBufferAtomic;
+def int_amdgcn_raw_buffer_atomic_or : AMDGPURawBufferAtomic;
+def int_amdgcn_raw_buffer_atomic_xor : AMDGPURawBufferAtomic;
+def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic<
+  [llvm_i32_ty],
+  [llvm_i32_ty,   // src(VGPR)
+   llvm_i32_ty,   // cmp(VGPR)
+   llvm_v4i32_ty, // rsrc(SGPR)
+   llvm_i32_ty,   // offset(VGPR/imm, included in bounds checking and swizzling)
+   llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds checking and swizzling)
+   llvm_i32_ty],  // cachepolicy(imm; bit 1 = slc)
+  [], "", [SDNPMemOperand]>,
+  AMDGPURsrcIntrinsic<2, 0>;
+
+class AMDGPUStructBufferAtomic : Intrinsic <
+  [llvm_i32_ty],
+  [llvm_i32_ty,   // vdata(VGPR)
+   llvm_v4i32_ty, // rsrc(SGPR)
+   llvm_i32_ty,   // vindex(VGPR)
+   llvm_i32_ty,   // offset(VGPR/imm, included in bounds checking and swizzling)
+   llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds checking and swizzling)
+   llvm_i32_ty],  // cachepolicy(imm; bit 1 = slc)
+  [], "", [SDNPMemOperand]>,
+  AMDGPURsrcIntrinsic<1, 0>;
+def int_amdgcn_struct_buffer_atomic_swap : AMDGPUStructBufferAtomic;
+def int_amdgcn_struct_buffer_atomic_add : AMDGPUStructBufferAtomic;
+def int_amdgcn_struct_buffer_atomic_sub : AMDGPUStructBufferAtomic;
+def int_amdgcn_struct_buffer_atomic_smin : AMDGPUStructBufferAtomic;
+def int_amdgcn_struct_buffer_atomic_umin : AMDGPUStructBufferAtomic;
+def int_amdgcn_struct_buffer_atomic_smax : AMDGPUStructBufferAtomic;
+def int_amdgcn_struct_buffer_atomic_umax : AMDGPUStructBufferAtomic;
+def int_amdgcn_struct_buffer_atomic_and : AMDGPUStructBufferAtomic;
+def int_amdgcn_struct_buffer_atomic_or : AMDGPUStructBufferAtomic;
+def int_amdgcn_struct_buffer_atomic_xor : AMDGPUStructBufferAtomic;
+def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic<
+  [llvm_i32_ty],
+  [llvm_i32_ty,   // src(VGPR)
+   llvm_i32_ty,   // cmp(VGPR)
+   llvm_v4i32_ty, // rsrc(SGPR)
+   llvm_i32_ty,   // vindex(VGPR)
+   llvm_i32_ty,   // offset(VGPR/imm, included in bounds checking and swizzling)
+   llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds checking and swizzling)
+   llvm_i32_ty],  // cachepolicy(imm; bit 1 = slc)
+  [], "", [SDNPMemOperand]>,
+  AMDGPURsrcIntrinsic<2, 0>;
+
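[Editorial note, not part of the patch: a minimal IR-level sketch of how the raw and struct variants are meant to be called, with hypothetical values; the authoritative usage is in the new test files at the end of this patch. The struct variant passes a vindex even when it is constant 0, which keeps idxen set, and a cachepolicy of 3 would request both glc (bit 0) and slc (bit 1):

  %raw = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
  %elt = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %voffset, i32 %soffset, i32 0)
  call void @llvm.amdgcn.raw.buffer.store.f32(float %elt, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 3)
]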
 // Obsolescent tbuffer intrinsics.
 def int_amdgcn_tbuffer_load : Intrinsic <
     [llvm_any_ty],    // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -145,10 +145,6 @@
   bool SelectMUBUFConstant(SDValue Constant,
                            SDValue &SOffset,
                            SDValue &ImmOffset) const;
-  bool SelectMUBUFIntrinsicOffset(SDValue Offset, SDValue &SOffset,
-                                  SDValue &ImmOffset) const;
-  bool SelectMUBUFIntrinsicVOffset(SDValue Offset, SDValue &SOffset,
-                                   SDValue &ImmOffset, SDValue &VOffset) const;
 
   bool SelectFlatAtomic(SDValue Addr, SDValue &VAddr,
                         SDValue &Offset, SDValue &SLC) const;
@@ -1293,42 +1289,11 @@
                                             SDValue &SOffset,
                                             SDValue &ImmOffset) const {
   SDLoc DL(Constant);
-  const uint32_t Align = 4;
-  const uint32_t MaxImm = alignDown(4095, Align);
   uint32_t Imm = cast<ConstantSDNode>(Constant)->getZExtValue();
-  uint32_t Overflow = 0;
-
-  if (Imm > MaxImm) {
-    if (Imm <= MaxImm + 64) {
-      // Use an SOffset inline constant for 4..64
-      Overflow = Imm - MaxImm;
-      Imm = MaxImm;
-    } else {
-      // Try to keep the same value in SOffset for adjacent loads, so that
-      // the corresponding register contents can be re-used.
-      //
-      // Load values with all low-bits (except for alignment bits) set into
-      // SOffset, so that a larger range of values can be covered using
-      // s_movk_i32.
-      //
-      // Atomic operations fail to work correctly when individual address
-      // components are unaligned, even if their sum is aligned.
-      uint32_t High = (Imm + Align) & ~4095;
-      uint32_t Low = (Imm + Align) & 4095;
-      Imm = Low;
-      Overflow = High - Align;
-    }
-  }
-
-  // There is a hardware bug in SI and CI which prevents address clamping in
-  // MUBUF instructions from working correctly with SOffsets. The immediate
-  // offset is unaffected.
-  if (Overflow > 0 &&
-      Subtarget->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
+  uint32_t Overflow;
+  if (!AMDGPU::splitMUBUFOffset(Imm, Overflow, Imm, Subtarget))
     return false;
-
   ImmOffset = CurDAG->getTargetConstant(Imm, DL, MVT::i16);
-
   if (Overflow <= 64)
     SOffset = CurDAG->getTargetConstant(Overflow, DL, MVT::i32);
   else
@@ -1339,51 +1304,6 @@
   return true;
 }
 
-bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicOffset(SDValue Offset,
-                                                    SDValue &SOffset,
-                                                    SDValue &ImmOffset) const {
-  SDLoc DL(Offset);
-
-  if (!isa<ConstantSDNode>(Offset))
-    return false;
-
-  return SelectMUBUFConstant(Offset, SOffset, ImmOffset);
-}
-
-bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicVOffset(SDValue Offset,
-                                                     SDValue &SOffset,
-                                                     SDValue &ImmOffset,
-                                                     SDValue &VOffset) const {
-  SDLoc DL(Offset);
-
-  // Don't generate an unnecessary voffset for constant offsets.
-  if (isa<ConstantSDNode>(Offset)) {
-    SDValue Tmp1, Tmp2;
-
-    // When necessary, use a voffset in <= CI anyway to work around a hardware
-    // bug.
-    if (Subtarget->getGeneration() > AMDGPUSubtarget::SEA_ISLANDS ||
-        SelectMUBUFConstant(Offset, Tmp1, Tmp2))
-      return false;
-  }
-
-  if (CurDAG->isBaseWithConstantOffset(Offset)) {
-    SDValue N0 = Offset.getOperand(0);
-    SDValue N1 = Offset.getOperand(1);
-    if (cast<ConstantSDNode>(N1)->getSExtValue() >= 0 &&
-        SelectMUBUFConstant(N1, SOffset, ImmOffset)) {
-      VOffset = N0;
-      return true;
-    }
-  }
-
-  SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
-  ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16);
-  VOffset = Offset;
-
-  return true;
-}
-
 template <bool IsSigned>
 bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDValue Addr,
                                           SDValue &VAddr,
Index: lib/Target/AMDGPU/BUFInstructions.td
===================================================================
--- lib/Target/AMDGPU/BUFInstructions.td
+++ lib/Target/AMDGPU/BUFInstructions.td
@@ -1043,37 +1043,33 @@
 multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
                                   string opcode> {
   def : GCNPat<
-    (vt (name v4i32:$rsrc, 0,
-              (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
-              imm:$glc, imm:$slc)),
+    (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset,
+              imm:$cachepolicy, 0)),
     (!cast<MUBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset),
-                                           (as_i1imm $glc), (as_i1imm $slc), 0)
+                                           (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
   >;
 
   def : GCNPat<
-    (vt (name v4i32:$rsrc, i32:$vindex,
-              (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
-              imm:$glc, imm:$slc)),
-    (!cast<MUBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset),
-                                          (as_i1imm $glc), (as_i1imm $slc), 0)
+    (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset,
+              imm:$cachepolicy, 0)),
+    (!cast<MUBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset),
+                                          (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
   >;
 
   def : GCNPat<
-    (vt (name v4i32:$rsrc, 0,
-              (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
-              imm:$glc, imm:$slc)),
-    (!cast<MUBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset),
-                                          (as_i1imm $glc), (as_i1imm $slc), 0)
+    (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset,
+              imm:$cachepolicy, imm)),
+    (!cast<MUBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset),
+                                          (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
   >;
 
   def : GCNPat<
-    (vt (name v4i32:$rsrc, i32:$vindex,
-              (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
-              imm:$glc, imm:$slc)),
+    (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset,
+              imm:$cachepolicy, imm)),
     (!cast<MUBUF_Pseudo>(opcode # _BOTHEN)
       (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
       $rsrc, $soffset, (as_i16imm $offset),
-      (as_i1imm $glc), (as_i1imm $slc), 0)
+      (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
   >;
 }
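[Editorial aside, to make these pattern multiclasses easier to read: the constant zeros in the source patterns select the MUBUF addressing mode, as the new tests at the end of the patch check. Roughly:

  vindex 0,       voffset 0        -> _OFFSET variant (neither idxen nor offen)
  vindex 0,       voffset in VGPR  -> _OFFEN variant
  vindex in VGPR, voffset 0        -> _IDXEN variant
  vindex in VGPR, voffset in VGPR  -> _BOTHEN variant
]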
@@ -1100,40 +1096,34 @@
 multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
                                    string opcode> {
   def : GCNPat<
-    (name vt:$vdata, v4i32:$rsrc, 0,
-          (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
-          imm:$glc, imm:$slc),
+    (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset,
+          imm:$cachepolicy, 0),
     (!cast<MUBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset),
-                                                 (as_i1imm $glc), (as_i1imm $slc), 0)
+                                                 (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
   >;
 
   def : GCNPat<
-    (name vt:$vdata, v4i32:$rsrc, i32:$vindex,
-          (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
-          imm:$glc, imm:$slc),
-    (!cast<MUBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset,
-                                                (as_i16imm $offset), (as_i1imm $glc),
-                                                (as_i1imm $slc), 0)
+    (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset,
+          imm:$cachepolicy, 0),
+    (!cast<MUBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset,
+      (as_i16imm $offset), (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
   >;
 
   def : GCNPat<
-    (name vt:$vdata, v4i32:$rsrc, 0,
-          (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
-          imm:$glc, imm:$slc),
-    (!cast<MUBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset,
-                                                (as_i16imm $offset), (as_i1imm $glc),
-                                                (as_i1imm $slc), 0)
+    (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset,
+          imm:$cachepolicy, imm),
+    (!cast<MUBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset,
+      (as_i16imm $offset), (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
   >;
 
   def : GCNPat<
-    (name vt:$vdata, v4i32:$rsrc, i32:$vindex,
-          (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
-          imm:$glc, imm:$slc),
+    (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset,
+          imm:$cachepolicy, imm),
     (!cast<MUBUF_Pseudo>(opcode # _BOTHEN_exact)
       $vdata,
       (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
       $rsrc, $soffset, (as_i16imm $offset),
-      (as_i1imm $glc), (as_i1imm $slc), 0)
+      (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0)
   >;
 }
@@ -1164,36 +1154,36 @@
 multiclass BufferAtomicPatterns<SDPatternOperator name, string opcode> {
   def : GCNPat<
     (name i32:$vdata_in, v4i32:$rsrc, 0,
-          (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
-          imm:$slc),
+          0, i32:$soffset, imm:$offset,
+          imm:$cachepolicy, 0),
     (!cast<MUBUF_Pseudo>(opcode # _OFFSET_RTN) $vdata_in, $rsrc, $soffset,
-                                               (as_i16imm $offset), (as_i1imm $slc))
+                                               (as_i16imm $offset), (extract_slc $cachepolicy))
   >;
 
   def : GCNPat<
     (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex,
-          (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
-          imm:$slc),
+          0, i32:$soffset, imm:$offset,
+          imm:$cachepolicy, imm),
     (!cast<MUBUF_Pseudo>(opcode # _IDXEN_RTN) $vdata_in, $vindex, $rsrc, $soffset,
-                                              (as_i16imm $offset), (as_i1imm $slc))
+                                              (as_i16imm $offset), (extract_slc $cachepolicy))
   >;
 
   def : GCNPat<
     (name i32:$vdata_in, v4i32:$rsrc, 0,
-          (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
-          imm:$slc),
+          i32:$voffset, i32:$soffset, imm:$offset,
+          imm:$cachepolicy, 0),
     (!cast<MUBUF_Pseudo>(opcode # _OFFEN_RTN) $vdata_in, $voffset, $rsrc, $soffset,
-                                              (as_i16imm $offset), (as_i1imm $slc))
+                                              (as_i16imm $offset), (extract_slc $cachepolicy))
   >;
 
   def : GCNPat<
     (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex,
-          (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
-          imm:$slc),
+          i32:$voffset, i32:$soffset, imm:$offset,
+          imm:$cachepolicy, imm),
     (!cast<MUBUF_Pseudo>(opcode # _BOTHEN_RTN)
       $vdata_in,
       (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
-      $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc))
+      $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy))
   >;
 }
@@ -1211,49 +1201,49 @@
 def : GCNPat<
   (SIbuffer_atomic_cmpswap i32:$data, i32:$cmp, v4i32:$rsrc, 0,
-                           (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
-                           imm:$slc),
+                           0, i32:$soffset, imm:$offset,
+                           imm:$cachepolicy, 0),
   (EXTRACT_SUBREG
     (BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN
       (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
-      $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
+      $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)),
    sub0)
 >;
 
 def : GCNPat<
   (SIbuffer_atomic_cmpswap i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex,
-                           (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
-                           imm:$slc),
+                           0, i32:$soffset, imm:$offset,
+                           imm:$cachepolicy, imm),
   (EXTRACT_SUBREG
     (BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN
       (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
-      $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
+      $vindex, $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)),
    sub0)
 >;
 
 def : GCNPat<
   (SIbuffer_atomic_cmpswap i32:$data, i32:$cmp, v4i32:$rsrc, 0,
-                           (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
-                           imm:$slc),
+                           i32:$voffset, i32:$soffset, imm:$offset,
+                           imm:$cachepolicy, 0),
   (EXTRACT_SUBREG
     (BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN
      (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
-      $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
+      $voffset, $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)),
    sub0)
 >;
 
 def : GCNPat<
   (SIbuffer_atomic_cmpswap i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex,
-                           (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
-                           imm:$slc),
+                           i32:$voffset, i32:$soffset, imm:$offset,
+                           imm:$cachepolicy, imm),
   (EXTRACT_SUBREG
     (BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN
       (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
       (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
-      $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)),
+      $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)),
    sub0)
 >;
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -184,6 +184,12 @@
   /// global value \p GV, false otherwise.
   bool shouldEmitPCReloc(const GlobalValue *GV) const;
 
+  // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
+  // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
+  // pointed to by Offsets.
+  void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG,
+                        SDValue *Offsets) const;
+
 public:
   SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI);
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5075,17 +5075,82 @@
   }
   case Intrinsic::amdgcn_buffer_load:
   case Intrinsic::amdgcn_buffer_load_format: {
+    unsigned Glc = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
+    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
+    unsigned IdxEn = 1;
+    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
+      IdxEn = Idx->getZExtValue() != 0;
     SDValue Ops[] = {
       Op.getOperand(0), // Chain
       Op.getOperand(2), // rsrc
       Op.getOperand(3), // vindex
-      Op.getOperand(4), // offset
-      Op.getOperand(5), // glc
-      Op.getOperand(6)  // slc
+      SDValue(),        // voffset -- will be set by setBufferOffsets
+      SDValue(),        // soffset -- will be set by setBufferOffsets
+      SDValue(),        // offset -- will be set by setBufferOffsets
+      DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
+      DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
     };
+    setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
 
     unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
         AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
+
+    EVT VT = Op.getValueType();
+    EVT IntVT = VT.changeTypeToInteger();
+    auto *M = cast<MemSDNode>(Op);
+    EVT LoadVT = Op.getValueType();
+
+    if (LoadVT.getScalarType() == MVT::f16)
+      return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
+                                 M, DAG, Ops);
+    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
+                                   M->getMemOperand());
+  }
+  case Intrinsic::amdgcn_raw_buffer_load:
+  case Intrinsic::amdgcn_raw_buffer_load_format: {
+    auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
+    SDValue Ops[] = {
+      Op.getOperand(0), // Chain
+      Op.getOperand(2), // rsrc
+      DAG.getConstant(0, DL, MVT::i32), // vindex
+      Offsets.first,    // voffset
+      Op.getOperand(4), // soffset
+      Offsets.second,   // offset
+      Op.getOperand(5), // cachepolicy
+      DAG.getConstant(0, DL, MVT::i1), // idxen
+    };
+
+    unsigned Opc = (IntrID == Intrinsic::amdgcn_raw_buffer_load) ?
+        AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
+
+    EVT VT = Op.getValueType();
+    EVT IntVT = VT.changeTypeToInteger();
+    auto *M = cast<MemSDNode>(Op);
+    EVT LoadVT = Op.getValueType();
+
+    if (LoadVT.getScalarType() == MVT::f16)
+      return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
+                                 M, DAG, Ops);
+    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
+                                   M->getMemOperand());
+  }
+  case Intrinsic::amdgcn_struct_buffer_load:
+  case Intrinsic::amdgcn_struct_buffer_load_format: {
+    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
+    SDValue Ops[] = {
+      Op.getOperand(0), // Chain
+      Op.getOperand(2), // rsrc
+      Op.getOperand(3), // vindex
+      Offsets.first,    // voffset
+      Op.getOperand(5), // soffset
+      Offsets.second,   // offset
+      Op.getOperand(6), // cachepolicy
+      DAG.getConstant(1, DL, MVT::i1), // idxen
+    };
+
+    unsigned Opc = (IntrID == Intrinsic::amdgcn_struct_buffer_load) ?
+        AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
+
     EVT VT = Op.getValueType();
     EVT IntVT = VT.changeTypeToInteger();
     auto *M = cast<MemSDNode>(Op);
@@ -5185,14 +5250,22 @@
   case Intrinsic::amdgcn_buffer_atomic_and:
   case Intrinsic::amdgcn_buffer_atomic_or:
   case Intrinsic::amdgcn_buffer_atomic_xor: {
+    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
+    unsigned IdxEn = 1;
+    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
+      IdxEn = Idx->getZExtValue() != 0;
     SDValue Ops[] = {
       Op.getOperand(0), // Chain
       Op.getOperand(2), // vdata
      Op.getOperand(3), // rsrc
       Op.getOperand(4), // vindex
-      Op.getOperand(5), // offset
-      Op.getOperand(6)  // slc
+      SDValue(),        // voffset -- will be set by setBufferOffsets
+      SDValue(),        // soffset -- will be set by setBufferOffsets
+      SDValue(),        // offset -- will be set by setBufferOffsets
+      DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
+      DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
     };
+    setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
     EVT VT = Op.getValueType();
 
     auto *M = cast<MemSDNode>(Op);
@@ -5236,16 +5309,193 @@
     return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
                                    M->getMemOperand());
   }
+  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
+  case Intrinsic::amdgcn_raw_buffer_atomic_add:
+  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
+  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
+  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
+  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
+  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
+  case Intrinsic::amdgcn_raw_buffer_atomic_and:
+  case Intrinsic::amdgcn_raw_buffer_atomic_or:
+  case Intrinsic::amdgcn_raw_buffer_atomic_xor: {
+    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
+    SDValue Ops[] = {
+      Op.getOperand(0), // Chain
+      Op.getOperand(2), // vdata
+      Op.getOperand(3), // rsrc
+      DAG.getConstant(0, DL, MVT::i32), // vindex
+      Offsets.first,    // voffset
+      Op.getOperand(5), // soffset
+      Offsets.second,   // offset
+      Op.getOperand(6), // cachepolicy
+      DAG.getConstant(0, DL, MVT::i1), // idxen
+    };
+    EVT VT = Op.getValueType();
+
+    auto *M = cast<MemSDNode>(Op);
+    unsigned Opcode = 0;
+
+    switch (IntrID) {
+    case Intrinsic::amdgcn_raw_buffer_atomic_swap:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
+      break;
+    case Intrinsic::amdgcn_raw_buffer_atomic_add:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
+      break;
+    case Intrinsic::amdgcn_raw_buffer_atomic_sub:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
+      break;
+    case Intrinsic::amdgcn_raw_buffer_atomic_smin:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
+      break;
+    case Intrinsic::amdgcn_raw_buffer_atomic_umin:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
+      break;
+    case Intrinsic::amdgcn_raw_buffer_atomic_smax:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
+      break;
+    case Intrinsic::amdgcn_raw_buffer_atomic_umax:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
+      break;
+    case Intrinsic::amdgcn_raw_buffer_atomic_and:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
+      break;
+    case Intrinsic::amdgcn_raw_buffer_atomic_or:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
+      break;
+    case Intrinsic::amdgcn_raw_buffer_atomic_xor:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
+      break;
+    default:
+      llvm_unreachable("unhandled atomic opcode");
+    }
+
+    return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
+                                   M->getMemOperand());
+  }
+  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
+  case Intrinsic::amdgcn_struct_buffer_atomic_add:
+  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
+  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
+  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
+  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
+  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
+  case Intrinsic::amdgcn_struct_buffer_atomic_and:
+  case Intrinsic::amdgcn_struct_buffer_atomic_or:
+  case Intrinsic::amdgcn_struct_buffer_atomic_xor: {
+    auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
+    SDValue Ops[] = {
+      Op.getOperand(0), // Chain
+      Op.getOperand(2), // vdata
+      Op.getOperand(3), // rsrc
+      Op.getOperand(4), // vindex
+      Offsets.first,    // voffset
+      Op.getOperand(6), // soffset
+      Offsets.second,   // offset
+      Op.getOperand(7), // cachepolicy
+      DAG.getConstant(1, DL, MVT::i1), // idxen
+    };
+    EVT VT = Op.getValueType();
+    auto *M = cast<MemSDNode>(Op);
+    unsigned Opcode = 0;
+
+    switch (IntrID) {
+    case Intrinsic::amdgcn_struct_buffer_atomic_swap:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
+      break;
+    case Intrinsic::amdgcn_struct_buffer_atomic_add:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
+      break;
+    case Intrinsic::amdgcn_struct_buffer_atomic_sub:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
+      break;
+    case Intrinsic::amdgcn_struct_buffer_atomic_smin:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
+      break;
+    case Intrinsic::amdgcn_struct_buffer_atomic_umin:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
+      break;
+    case Intrinsic::amdgcn_struct_buffer_atomic_smax:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
+      break;
+    case Intrinsic::amdgcn_struct_buffer_atomic_umax:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
+      break;
+    case Intrinsic::amdgcn_struct_buffer_atomic_and:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
+      break;
+    case Intrinsic::amdgcn_struct_buffer_atomic_or:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
+      break;
+    case Intrinsic::amdgcn_struct_buffer_atomic_xor:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
+      break;
+    default:
+      llvm_unreachable("unhandled atomic opcode");
+    }
+
+    return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
+                                   M->getMemOperand());
+  }
   case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
+    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
+    unsigned IdxEn = 1;
+    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(5)))
+      IdxEn = Idx->getZExtValue() != 0;
     SDValue Ops[] = {
       Op.getOperand(0), // Chain
       Op.getOperand(2), // src
       Op.getOperand(3), // cmp
       Op.getOperand(4), // rsrc
       Op.getOperand(5), // vindex
-      Op.getOperand(6), // offset
-      Op.getOperand(7)  // slc
+      SDValue(),        // voffset -- will be set by setBufferOffsets
+      SDValue(),        // soffset -- will be set by setBufferOffsets
+      SDValue(),        // offset -- will be set by setBufferOffsets
+      DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
+      DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
+    };
+    setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
+    EVT VT = Op.getValueType();
+    auto *M = cast<MemSDNode>(Op);
+
+    return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
+                                   Op->getVTList(), Ops, VT, M->getMemOperand());
+  }
+  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: {
+    auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
+    SDValue Ops[] = {
+      Op.getOperand(0), // Chain
+      Op.getOperand(2), // src
+      Op.getOperand(3), // cmp
+      Op.getOperand(4), // rsrc
+      DAG.getConstant(0, DL, MVT::i32), // vindex
+      Offsets.first,    // voffset
+      Op.getOperand(6), // soffset
+      Offsets.second,   // offset
+      Op.getOperand(7), // cachepolicy
+      DAG.getConstant(0, DL, MVT::i1), // idxen
+    };
+    EVT VT = Op.getValueType();
+    auto *M = cast<MemSDNode>(Op);
+
+    return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
+                                   Op->getVTList(), Ops, VT, M->getMemOperand());
+  }
+  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: {
+    auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
+    SDValue Ops[] = {
+      Op.getOperand(0), // Chain
+      Op.getOperand(2), // src
+      Op.getOperand(3), // cmp
+      Op.getOperand(4), // rsrc
+      Op.getOperand(5), // vindex
+      Offsets.first,    // voffset
+      Op.getOperand(7), // soffset
+      Offsets.second,   // offset
+      Op.getOperand(8), // cachepolicy
+      DAG.getConstant(1, DL, MVT::i1), // idxen
     };
     EVT VT = Op.getValueType();
     auto *M = cast<MemSDNode>(Op);
@@ -5520,15 +5770,23 @@
     bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
     if (IsD16)
       VData = handleD16VData(VData, DAG);
+    unsigned Glc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
+    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
+    unsigned IdxEn = 1;
+    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
+      IdxEn = Idx->getZExtValue() != 0;
     SDValue Ops[] = {
       Chain,
-      VData,             // vdata
+      VData,
       Op.getOperand(3),  // rsrc
       Op.getOperand(4),  // vindex
-      Op.getOperand(5),  // offset
-      Op.getOperand(6),  // glc
-      Op.getOperand(7)   // slc
+      SDValue(),         // voffset -- will be set by setBufferOffsets
+      SDValue(),         // soffset -- will be set by setBufferOffsets
+      SDValue(),         // offset -- will be set by setBufferOffsets
+      DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
+      DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
     };
+    setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
     unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
                    AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
     Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
@@ -5536,6 +5794,59 @@
     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                    M->getMemoryVT(), M->getMemOperand());
   }
+
+  case Intrinsic::amdgcn_raw_buffer_store:
+  case Intrinsic::amdgcn_raw_buffer_store_format: {
+    SDValue VData = Op.getOperand(2);
+    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
+    if (IsD16)
+      VData = handleD16VData(VData, DAG);
+    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
+    SDValue Ops[] = {
+      Chain,
+      VData,
+      Op.getOperand(3), // rsrc
+      DAG.getConstant(0, DL, MVT::i32), // vindex
+      Offsets.first,    // voffset
+      Op.getOperand(5), // soffset
+      Offsets.second,   // offset
+      Op.getOperand(6), // cachepolicy
+      DAG.getConstant(0, DL, MVT::i1), // idxen
+    };
+    unsigned Opc = IntrinsicID == Intrinsic::amdgcn_raw_buffer_store ?
+                   AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
+    Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
+    MemSDNode *M = cast<MemSDNode>(Op);
+    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
+                                   M->getMemoryVT(), M->getMemOperand());
+  }
+
+  case Intrinsic::amdgcn_struct_buffer_store:
+  case Intrinsic::amdgcn_struct_buffer_store_format: {
+    SDValue VData = Op.getOperand(2);
+    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
+    if (IsD16)
+      VData = handleD16VData(VData, DAG);
+    auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
+    SDValue Ops[] = {
+      Chain,
+      VData,
+      Op.getOperand(3), // rsrc
+      Op.getOperand(4), // vindex
+      Offsets.first,    // voffset
+      Op.getOperand(6), // soffset
+      Offsets.second,   // offset
+      Op.getOperand(7), // cachepolicy
+      DAG.getConstant(1, DL, MVT::i1), // idxen
+    };
+    unsigned Opc = IntrinsicID == Intrinsic::amdgcn_struct_buffer_store ?
+                   AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
+    Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
+    MemSDNode *M = cast<MemSDNode>(Op);
+    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
+                                   M->getMemoryVT(), M->getMemOperand());
+  }
+
   default: {
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
             AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
@@ -5546,12 +5857,12 @@
   }
 }
 
-// The raw.tbuffer and struct.tbuffer intrinsics have two offset args: offset
-// (the offset that is included in bounds checking and swizzling, to be split
-// between the instruction's voffset and immoffset fields) and soffset (the
-// offset that is excluded from bounds checking and swizzling, to go in the
-// instruction's soffset field). This function takes the first kind of offset
-// and figures out how to split it between voffset and immoffset.
+// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
+// offset (the offset that is included in bounds checking and swizzling, to be
+// split between the instruction's voffset and immoffset fields) and soffset
+// (the offset that is excluded from bounds checking and swizzling, to go in
+// the instruction's soffset field). This function takes the first kind of
+// offset and figures out how to split it between voffset and immoffset.
 std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
     SDValue Offset, SelectionDAG &DAG) const {
   SDLoc DL(Offset);
@@ -5590,6 +5901,41 @@
   return std::pair<SDValue, SDValue>(N0, SDValue(C1, 0));
 }
 
+// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
+// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
+// pointed to by Offsets.
+void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
+                                        SelectionDAG &DAG,
+                                        SDValue *Offsets) const {
+  SDLoc DL(CombinedOffset);
+  if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
+    uint32_t Imm = C->getZExtValue();
+    uint32_t SOffset, ImmOffset;
+    if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget)) {
+      Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
+      Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
+      Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
+      return;
+    }
+  }
+  if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
+    SDValue N0 = CombinedOffset.getOperand(0);
+    SDValue N1 = CombinedOffset.getOperand(1);
+    uint32_t SOffset, ImmOffset;
+    int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
+    if (Offset >= 0
+        && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset, Subtarget)) {
+      Offsets[0] = N0;
+      Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
+      Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
+      return;
+    }
+  }
+  Offsets[0] = CombinedOffset;
+  Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
+  Offsets[2] = DAG.getConstant(0, DL, MVT::i32);
+}
+
 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType,
                                  SDValue Op, const SDLoc &SL, EVT VT) {
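[Editorial worked example, for illustration only, with numbers chosen to match the new tests: given a combined constant offset of 8192 from a legacy intrinsic, setBufferOffsets/splitMUBUFOffset produce voffset 0, soffset 8188 (0x1ffc) and instruction offset 4. 8192 exceeds the aligned immediate limit of 4092 and is out of reach of a 4..64 inline constant, so the low bits (kept 4-aligned, since unaligned address components break atomics) go into the immediate field and the remainder, with all non-alignment low bits set, goes into an SOffset register loadable with s_movk_i32:

  s_movk_i32 s4, 0x1ffc                          ; register choice hypothetical
  buffer_atomic_swap v0, off, s[0:3], s4 offset:4 glc

This is the shape the new raw.buffer.atomic test below checks for.]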
<"AMDGPUISD::BUFFER_LOAD", SDTBufferLoad, [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; @@ -124,13 +126,15 @@ SDTBufferLoad, [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; -def SDTBufferStore : SDTypeProfile<0, 6, +def SDTBufferStore : SDTypeProfile<0, 8, [ // vdata SDTCisVT<1, v4i32>, // rsrc - SDTCisVT<2, i32>, // vindex - SDTCisVT<3, i32>, // offset - SDTCisVT<4, i1>, // glc - SDTCisVT<5, i1>]>; // slc + SDTCisVT<2, i32>, // vindex(VGPR) + SDTCisVT<3, i32>, // voffset(VGPR) + SDTCisVT<4, i32>, // soffset(SGPR) + SDTCisVT<5, i32>, // offset(imm) + SDTCisVT<6, i32>, // cachepolicy(imm) + SDTCisVT<7, i1>]>; // idxen(imm) def SIbuffer_store : SDNode <"AMDGPUISD::BUFFER_STORE", SDTBufferStore, [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; @@ -142,13 +146,16 @@ [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; class SDBufferAtomic : SDNode , // dst SDTCisVT<1, i32>, // vdata SDTCisVT<2, v4i32>, // rsrc - SDTCisVT<3, i32>, // vindex - SDTCisVT<4, i32>, // offset - SDTCisVT<5, i1>]>, // slc + SDTCisVT<3, i32>, // vindex(VGPR) + SDTCisVT<4, i32>, // voffset(VGPR) + SDTCisVT<5, i32>, // soffset(SGPR) + SDTCisVT<6, i32>, // offset(imm) + SDTCisVT<7, i32>, // cachepolicy(imm) + SDTCisVT<8, i1>]>, // idxen(imm) [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore] >; @@ -164,14 +171,17 @@ def SIbuffer_atomic_xor : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_XOR">; def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP", - SDTypeProfile<1, 6, + SDTypeProfile<1, 9, [SDTCisVT<0, i32>, // dst SDTCisVT<1, i32>, // src SDTCisVT<2, i32>, // cmp SDTCisVT<3, v4i32>, // rsrc - SDTCisVT<4, i32>, // vindex - SDTCisVT<5, i32>, // offset - SDTCisVT<6, i1>]>, // slc + SDTCisVT<4, i32>, // vindex(VGPR) + SDTCisVT<5, i32>, // voffset(VGPR) + SDTCisVT<6, i32>, // soffset(SGPR) + SDTCisVT<7, i32>, // offset(imm) + SDTCisVT<8, i32>, // cachepolicy(imm) + SDTCisVT<9, i1>]>, // idxen(imm) [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore] >; Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h =================================================================== --- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -26,8 +26,10 @@ namespace llvm { class Argument; +class AMDGPUSubtarget; class FeatureBitset; class Function; +class GCNSubtarget; class GlobalValue; class MCContext; class MCRegisterClass; @@ -447,6 +449,12 @@ /// not the encoded offset. bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset); +// Given Imm, split it into the values to put into the SOffset and ImmOffset +// fields in an MUBUF instruction. Return false if it is not possible (due to a +// hardware bug needing a workaround). +bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, + const GCNSubtarget *Subtarget); + /// \returns true if the intrinsic is divergent bool isIntrinsicSourceOfDivergence(unsigned IntrID); Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp =================================================================== --- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -935,6 +935,49 @@ isUInt<20>(EncodedOffset) : isUInt<8>(EncodedOffset); } +// Given Imm, split it into the values to put into the SOffset and ImmOffset +// fields in an MUBUF instruction. Return false if it is not possible (due to a +// hardware bug needing a workaround). 
+bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
+                      const GCNSubtarget *Subtarget) {
+  const uint32_t Align = 4;
+  const uint32_t MaxImm = alignDown(4095, Align);
+  uint32_t Overflow = 0;
+
+  if (Imm > MaxImm) {
+    if (Imm <= MaxImm + 64) {
+      // Use an SOffset inline constant for 4..64
+      Overflow = Imm - MaxImm;
+      Imm = MaxImm;
+    } else {
+      // Try to keep the same value in SOffset for adjacent loads, so that
+      // the corresponding register contents can be re-used.
+      //
+      // Load values with all low-bits (except for alignment bits) set into
+      // SOffset, so that a larger range of values can be covered using
+      // s_movk_i32.
+      //
+      // Atomic operations fail to work correctly when individual address
+      // components are unaligned, even if their sum is aligned.
+      uint32_t High = (Imm + Align) & ~4095;
+      uint32_t Low = (Imm + Align) & 4095;
+      Imm = Low;
+      Overflow = High - Align;
+    }
+  }
+
+  // There is a hardware bug in SI and CI which prevents address clamping in
+  // MUBUF instructions from working correctly with SOffsets. The immediate
+  // offset is unaffected.
+  if (Overflow > 0 &&
+      Subtarget->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
+    return false;
+
+  ImmOffset = Imm;
+  SOffset = Overflow;
+  return true;
+}
+
 } // end namespace AMDGPU
 } // end namespace llvm
Index: test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.ll
@@ -0,0 +1,115 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI
+
+;CHECK-LABEL: {{^}}test1:
+;CHECK-NOT: s_waitcnt
+;CHECK: buffer_atomic_swap v0, off, s[0:3], 0 glc
+;CHECK: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_swap v0, v1, s[0:3], 0 offen glc
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_swap v0, v1, s[0:3], 0 offen offset:42 glc
+;CHECK-DAG: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_swap v0, off, s[0:3], [[SOFS]] offset:4 glc
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_swap v0, off, s[0:3], 0{{$}}
+define amdgpu_ps float @test1(<4 x i32> inreg %rsrc, i32 %data, i32 %voffset) {
+main_body:
+  %o1 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
+  %o3 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap(i32 %o1, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0)
+  %off5 = add i32 %voffset, 42
+  %o5 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap(i32 %o3, <4 x i32> %rsrc, i32 %off5, i32 0, i32 0)
+  %o6 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap(i32 %o5, <4 x i32> %rsrc, i32 4, i32 8188, i32 0)
+  %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.swap(i32 %o6, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
+  %out = bitcast i32 %o6 to float
+  ret float %out
+}
+
+;CHECK-LABEL: {{^}}test2:
+;CHECK-NOT: s_waitcnt
+;CHECK: buffer_atomic_add v0, v1, s[0:3], 0 offen glc{{$}}
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_sub v0, v1, s[0:3], 0 offen glc slc
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_smin v0, v1, s[0:3], 0 offen glc{{$}}
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_umin v0, v1, s[0:3], 0 offen glc slc
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_smax v0, v1, s[0:3], 0 offen glc{{$}}
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_umax v0, v1, s[0:3], 0 offen glc slc
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_and v0, v1, s[0:3], 0 offen glc{{$}}
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_or v0, v1, s[0:3], 0 offen glc slc
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_xor v0, v1, s[0:3], 0 offen glc
define amdgpu_ps float @test2(<4 x i32> inreg %rsrc, i32 %data, i32 %voffset) {
+main_body:
+  %t1 = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %data, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0)
+  %t2 = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %t1, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 2)
+  %t3 = call i32 @llvm.amdgcn.raw.buffer.atomic.smin(i32 %t2, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0)
+  %t4 = call i32 @llvm.amdgcn.raw.buffer.atomic.umin(i32 %t3, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 2)
+  %t5 = call i32 @llvm.amdgcn.raw.buffer.atomic.smax(i32 %t4, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0)
+  %t6 = call i32 @llvm.amdgcn.raw.buffer.atomic.umax(i32 %t5, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 2)
+  %t7 = call i32 @llvm.amdgcn.raw.buffer.atomic.and(i32 %t6, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0)
+  %t8 = call i32 @llvm.amdgcn.raw.buffer.atomic.or(i32 %t7, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 2)
+  %t9 = call i32 @llvm.amdgcn.raw.buffer.atomic.xor(i32 %t8, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0)
+  %out = bitcast i32 %t9 to float
+  ret float %out
+}
+
+; Ideally, we would teach tablegen & friends that cmpswap only modifies the
+; first vgpr. Since we don't do that yet, the register allocator will have to
+; create copies which we don't bother to track here.
+;
+;CHECK-LABEL: {{^}}test3:
+;CHECK-NOT: s_waitcnt
+;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 glc
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc
+;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v2, s[0:3], 0 offen glc
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v2, s[0:3], 0 offen offset:44 glc
+;CHECK-DAG: s_waitcnt vmcnt(0)
+;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[SOFS]] offset:4 glc
+define amdgpu_ps float @test3(<4 x i32> inreg %rsrc, i32 %data, i32 %cmp, i32 %vindex, i32 %voffset) {
+main_body:
+  %o1 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
+  %o3 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap(i32 %o1, i32 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0)
+  %ofs.5 = add i32 %voffset, 44
+  %o5 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap(i32 %o3, i32 %cmp, <4 x i32> %rsrc, i32 %ofs.5, i32 0, i32 0)
+  %o6 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap(i32 %o5, i32 %cmp, <4 x i32> %rsrc, i32 4, i32 8188, i32 0)
+
+; Detecting the no-return variant doesn't work right now because of how the
+; intrinsic is replaced by an instruction that feeds into an EXTRACT_SUBREG.
+; Since there probably isn't a reasonable use-case of cmpswap that discards
+; the return value, that seems okay.
+;
+; %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap(i32 %o6, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
+  %out = bitcast i32 %o6 to float
+  ret float %out
+}
+
+;CHECK-LABEL: {{^}}test4:
+;CHECK: buffer_atomic_add v0,
+define amdgpu_ps float @test4() {
+main_body:
+  %v = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 1, <4 x i32> undef, i32 4, i32 0, i32 0)
+  %v.float = bitcast i32 %v to float
+  ret float %v.float
+}
+
+declare i32 @llvm.amdgcn.raw.buffer.atomic.swap(i32, <4 x i32>, i32, i32, i32) #0
+declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32) #0
+declare i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32) #0
+declare i32 @llvm.amdgcn.raw.buffer.atomic.smin(i32, <4 x i32>, i32, i32, i32) #0
+declare i32 @llvm.amdgcn.raw.buffer.atomic.umin(i32, <4 x i32>, i32, i32, i32) #0
+declare i32 @llvm.amdgcn.raw.buffer.atomic.smax(i32, <4 x i32>, i32, i32, i32) #0
+declare i32 @llvm.amdgcn.raw.buffer.atomic.umax(i32, <4 x i32>, i32, i32, i32) #0
+declare i32 @llvm.amdgcn.raw.buffer.atomic.and(i32, <4 x i32>, i32, i32, i32) #0
+declare i32 @llvm.amdgcn.raw.buffer.atomic.or(i32, <4 x i32>, i32, i32, i32) #0
+declare i32 @llvm.amdgcn.raw.buffer.atomic.xor(i32, <4 x i32>, i32, i32, i32) #0
+declare i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i32) #0
+
+attributes #0 = { nounwind }
Index: test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.d16.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.d16.ll
@@ -0,0 +1,41 @@
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s
+; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s
+; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s
+
+; GCN-LABEL: {{^}}buffer_load_format_d16_x:
+; GCN: buffer_load_format_d16_x v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0
+define amdgpu_ps half @buffer_load_format_d16_x(<4 x i32> inreg %rsrc) {
+main_body:
+  %data = call half @llvm.amdgcn.raw.buffer.load.format.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
+  ret half %data
+}
+
+; GCN-LABEL: {{^}}buffer_load_format_d16_xy:
+; UNPACKED: buffer_load_format_d16_xy v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0
+; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
+
+; PACKED: buffer_load_format_d16_xy v[[FULL:[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0
+; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[FULL]]
+define amdgpu_ps half @buffer_load_format_d16_xy(<4 x i32> inreg %rsrc) {
+main_body:
+  %data = call <2 x half> @llvm.amdgcn.raw.buffer.load.format.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
+  %elt = extractelement <2 x half> %data, i32 1
+  ret half %elt
+}
+
+; GCN-LABEL: {{^}}buffer_load_format_d16_xyzw:
+; UNPACKED: buffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0
+; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
+
+; PACKED: buffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0
+; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]]
define amdgpu_ps half @buffer_load_format_d16_xyzw(<4 x i32> inreg %rsrc) {
+main_body:
+  %data = call <4 x half> @llvm.amdgcn.raw.buffer.load.format.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
+  %elt = extractelement <4 x half> %data, i32 3
+  ret half %elt
+}
+
+declare half @llvm.amdgcn.raw.buffer.load.format.f16(<4 x i32>, i32, i32, i32)
+declare <2 x half> @llvm.amdgcn.raw.buffer.load.format.v2f16(<4 x i32>, i32, i32, i32)
+declare <4 x half> @llvm.amdgcn.raw.buffer.load.format.v4f16(<4 x i32>, i32, i32, i32)
Index: test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.ll
@@ -0,0 +1,87 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI
+
+;CHECK-LABEL: {{^}}buffer_load:
+;CHECK: buffer_load_format_xyzw v[0:3], off, s[0:3], 0
+;CHECK: buffer_load_format_xyzw v[4:7], off, s[0:3], 0 glc
+;CHECK: buffer_load_format_xyzw v[8:11], off, s[0:3], 0 slc
+;CHECK: s_waitcnt
+define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) {
+main_body:
+  %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0)
+  %data_glc = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i32 1)
+  %data_slc = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i32 2)
+  %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0
+  %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1
+  %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2
+  ret {<4 x float>, <4 x float>, <4 x float>} %r2
+}
+
+;CHECK-LABEL: {{^}}buffer_load_immoffs:
+;CHECK: buffer_load_format_xyzw v[0:3], off, s[0:3], 0 offset:42
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) {
+main_body:
+  %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 42, i32 0, i32 0)
+  ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_immoffs_large:
+;CHECK-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 60 offset:4092
+;CHECK-DAG: s_movk_i32 [[OFS1:s[0-9]+]], 0x7ffc
+;CHECK-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS1]] offset:4092
+;CHECK-DAG: s_mov_b32 [[OFS2:s[0-9]+]], 0x8ffc
+;CHECK-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS2]] offset:4
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) {
+main_body:
+  %d.0 = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 4092, i32 60, i32 0)
+  %d.1 = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 4092, i32 32764, i32 0)
+  %d.2 = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 4, i32 36860, i32 0)
+  %d.3 = fadd <4 x float> %d.0, %d.1
+  %data = fadd <4 x float> %d.2, %d.3
+  ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_ofs:
+;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) {
+main_body:
+  %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 %1, i32 0, i32 0)
+  ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_ofs_imm:
+;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen offset:60
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) {
+main_body:
+  %ofs = add i32 %1, 60
+  %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 %ofs, i32 0, i32 0)
+  ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_x:
+;CHECK: buffer_load_format_x v0, off, s[0:3], 0
+;CHECK: s_waitcnt
+define amdgpu_ps float @buffer_load_x(<4 x i32> inreg %rsrc) {
+main_body:
+  %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
+  ret float %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_xy:
+;CHECK: buffer_load_format_xy v[0:1], off, s[0:3], 0
+;CHECK: s_waitcnt
+define amdgpu_ps <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) {
+main_body:
+  %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
+  ret <2 x float> %data
+}
+
+declare float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32>, i32, i32, i32) #0
+declare <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32>, i32, i32, i32) #0
+declare <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32) #0
+
+attributes #0 = { nounwind readonly }
Index: test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
@@ -0,0 +1,206 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI
+
+;CHECK-LABEL: {{^}}buffer_load:
+;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+;CHECK: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc
+;CHECK: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 slc
+;CHECK: s_waitcnt
+define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) {
+main_body:
+  %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0)
+  %data_glc = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 1)
+  %data_slc = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 2)
+  %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0
+  %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1
+  %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2
+  ret {<4 x float>, <4 x float>, <4 x float>} %r2
+}
+
+;CHECK-LABEL: {{^}}buffer_load_immoffs:
+;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:40
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) {
+main_body:
+  %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 40, i32 0, i32 0)
+  ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_immoffs_large:
+;CHECK: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1ffc
+;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], [[OFFSET]] offset:4
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) {
+main_body:
+  %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 4, i32 8188, i32 0)
+  ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_ofs:
+;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) {
+main_body:
+  %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 0, i32 0)
+  ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_ofs_imm:
+;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:60
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) {
+main_body:
+  %ofs = add i32 %1, 60
+  %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 %ofs, i32 0, i32 0)
+  ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_x1:
+;CHECK: buffer_load_dword v0, v0, s[0:3], 0 offen
+;CHECK: s_waitcnt
+define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %ofs) {
+main_body:
+  %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0, i32 0)
+  ret float %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_x2:
+;CHECK: buffer_load_dwordx2 v[0:1], v0, s[0:3], 0 offen
+;CHECK: s_waitcnt
+define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %ofs) {
+main_body:
+  %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0, i32 0)
+  ret <2 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_negative_offset:
+;CHECK: v_add_{{[iu]}}32_e32 [[VOFS:v[0-9]+]], vcc, 0xfffff000, v0
+;CHECK: buffer_load_dwordx4 v[0:3], [[VOFS]], s[0:3], 0 offen offset:4080
+define amdgpu_ps <4 x float> @buffer_load_negative_offset(<4 x i32> inreg, i32 %ofs) {
+main_body:
+  %ofs.1 = add i32 %ofs, -16
+  %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 %ofs.1, i32 0, i32 0)
+  ret <4 x float> %data
+}
+
+; SI won't merge ds memory operations, because of the signed offset bug, so
+; we only have check lines for VI.
+; CHECK-LABEL: buffer_load_mmo:
+; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
+; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4
+define amdgpu_ps float @buffer_load_mmo(<4 x i32> inreg %rsrc, float addrspace(3)* %lds) {
+entry:
+  store float 0.0, float addrspace(3)* %lds
+  %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
+  %tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4
+  store float 0.0, float addrspace(3)* %tmp2
+  ret float %val
+}
+
+;CHECK-LABEL: {{^}}buffer_load_x1_offen_merged:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
+;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28
+;CHECK: s_waitcnt
+define amdgpu_ps void @buffer_load_x1_offen_merged(<4 x i32> inreg %rsrc, i32 %a) {
+main_body:
+  %a1 = add i32 %a, 4
+  %a2 = add i32 %a, 8
+  %a3 = add i32 %a, 12
+  %a4 = add i32 %a, 16
+  %a5 = add i32 %a, 28
+  %a6 = add i32 %a, 32
+  %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a1, i32 0, i32 0)
+  %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a2, i32 0, i32 0)
+  %r3 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a3, i32 0, i32 0)
+  %r4 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a4, i32 0, i32 0)
+  %r5 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a5, i32 0, i32 0)
+  %r6 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a6, i32 0, i32 0)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_load_x1_offen_merged_glc_slc:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4{{$}} +;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}} +;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}} +;CHECK: s_waitcnt +define amdgpu_ps void @buffer_load_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a) { +main_body: + %a1 = add i32 %a, 4 + %a2 = add i32 %a, 8 + %a3 = add i32 %a, 12 + %a4 = add i32 %a, 16 + %a5 = add i32 %a, 28 + %a6 = add i32 %a, 32 + %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a1, i32 0, i32 0) + %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a2, i32 0, i32 0) + %r3 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a3, i32 0, i32 1) + %r4 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a4, i32 0, i32 1) + %r5 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a5, i32 0, i32 3) + %r6 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a6, i32 0, i32 3) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}buffer_load_x2_offen_merged: +;CHECK-NEXT: %bb. +;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 +;CHECK: s_waitcnt +define amdgpu_ps void @buffer_load_x2_offen_merged(<4 x i32> inreg %rsrc, i32 %a) { +main_body: + %a1 = add i32 %a, 4 + %a2 = add i32 %a, 12 + %vr1 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %a1, i32 0, i32 0) + %vr2 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %a2, i32 0, i32 0) + %r1 = extractelement <2 x float> %vr1, i32 0 + %r2 = extractelement <2 x float> %vr1, i32 1 + %r3 = extractelement <2 x float> %vr2, i32 0 + %r4 = extractelement <2 x float> %vr2, i32 1 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}buffer_load_x1_offset_merged: +;CHECK-NEXT: %bb. +;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 +;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28 +;CHECK: s_waitcnt +define amdgpu_ps void @buffer_load_x1_offset_merged(<4 x i32> inreg %rsrc) { +main_body: + %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4, i32 0, i32 0) + %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 8, i32 0, i32 0) + %r3 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 12, i32 0, i32 0) + %r4 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 16, i32 0, i32 0) + %r5 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 28, i32 0, i32 0) + %r6 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 32, i32 0, i32 0) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}buffer_load_x2_offset_merged: +;CHECK-NEXT: %bb. 
+;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 +;CHECK: s_waitcnt +define amdgpu_ps void @buffer_load_x2_offset_merged(<4 x i32> inreg %rsrc) { +main_body: + %vr1 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 4, i32 0, i32 0) + %vr2 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 12, i32 0, i32 0) + %r1 = extractelement <2 x float> %vr1, i32 0 + %r2 = extractelement <2 x float> %vr1, i32 1 + %r3 = extractelement <2 x float> %vr2, i32 0 + %r4 = extractelement <2 x float> %vr2, i32 1 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) + ret void +} + +declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #0 +declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32>, i32, i32, i32) #0 +declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #0 +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 + +attributes #0 = { nounwind readonly } Index: test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll @@ -0,0 +1,57 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,UNPACKED %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,GFX81 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,GFX9 %s + +; GCN-LABEL: {{^}}buffer_store_format_d16_x: +; GCN: s_load_dword s[[LO:[0-9]+]] +; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[LO]] +; GCN: buffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen +define amdgpu_kernel void @buffer_store_format_d16_x(<4 x i32> %rsrc, [8 x i32], half %data, [8 x i32], i32 %voffset) { +main_body: + call void @llvm.amdgcn.raw.buffer.store.format.f16(half %data, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0) + ret void +} + +; GCN-LABEL: {{^}}buffer_store_format_d16_xy: + +; UNPACKED: s_load_dword [[S_DATA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 +; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[S_DATA]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], [[S_DATA]], 0xffff{{$}} +; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]] +; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]] +; UNPACKED: buffer_store_format_d16_xy v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen + +; PACKED: buffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen +define amdgpu_kernel void @buffer_store_format_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %voffset) { +main_body: + call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0) + ret void +} + +; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw: +; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10 + +; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} +; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] +; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] + +; UNPACKED-DAG: 
v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] +; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] + +; UNPACKED: buffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen + +; PACKED: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] +; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]] + +; PACKED: buffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen +define amdgpu_kernel void @buffer_store_format_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %voffset) { +main_body: + call void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.buffer.store.format.f16(half, <4 x i32>, i32, i32, i32) +declare void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half>, <4 x i32>, i32, i32, i32) +declare void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half>, <4 x i32>, i32, i32, i32) Index: test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.ll @@ -0,0 +1,76 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=VERDE %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK-LABEL: {{^}}buffer_store: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_format_xyzw v[0:3], off, s[0:3], 0 +;CHECK: buffer_store_format_xyzw v[4:7], off, s[0:3], 0 glc +;CHECK: buffer_store_format_xyzw v[8:11], off, s[0:3], 0 slc +define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { +main_body: + call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 1) + call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i32 2) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_immoffs: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_format_xyzw v[0:3], off, s[0:3], 0 offset:42 +define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) { +main_body: + call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 42, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_ofs: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 offen +define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) { +main_body: + call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0) + ret void +} + +; Ideally, the register allocator would avoid the wait here +; +;CHECK-LABEL: {{^}}buffer_store_wait: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 offen +;VERDE: s_waitcnt expcnt(0) +;CHECK: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 offen +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_store_format_xyzw v[0:3], v6, s[0:3], 0 offen +define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) { +main_body: + call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 %3, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %data, <4 x i32> %0, i32 %4, i32 0, i32 0) + ret void +} + 
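+; (A note on buffer_store_wait: the VERDE-only expcnt wait is presumably
+; needed because the load reuses v[0:3] while the first store may still be
+; reading them, which a different register assignment could avoid; the vmcnt
+; wait before the final store is a genuine dependence on the loaded data.)
+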
+;CHECK-LABEL: {{^}}buffer_store_x1: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_format_x v0, v1, s[0:3], 0 offen +define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %offset) { +main_body: + call void @llvm.amdgcn.raw.buffer.store.format.f32(float %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x2: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_format_xy v[0:1], v2, s[0:3], 0 offen +define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %offset) { +main_body: + call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.buffer.store.format.f32(float, <4 x i32>, i32, i32, i32) #0 +declare void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float>, <4 x i32>, i32, i32, i32) #0 +declare void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #0 +declare <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } Index: test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll @@ -0,0 +1,151 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=VERDE %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK-LABEL: {{^}}buffer_store: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +;CHECK: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 glc +;CHECK: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 slc +define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { +main_body: + call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 1) + call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i32 2) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_immoffs: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:42 +define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) { +main_body: + call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 42, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_ofs: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen +define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) { +main_body: + call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0) + ret void +} + +; Ideally, the register allocator would avoid the wait here +; +;CHECK-LABEL: {{^}}buffer_store_wait: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen +;VERDE: s_waitcnt expcnt(0) +;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 offen +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 offen +define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) { +main_body: + call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 %3, i32 0, i32 0) + call void 
@llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %data, <4 x i32> %0, i32 %4, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x1: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen +define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %offset) { +main_body: + call void @llvm.amdgcn.raw.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x2: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen +define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %offset) #0 { +main_body: + call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x1_offen_merged: +;CHECK-NOT: s_waitcnt +;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 +;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 +define amdgpu_ps void @buffer_store_x1_offen_merged(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { + %a1 = add i32 %a, 4 + %a2 = add i32 %a, 8 + %a3 = add i32 %a, 12 + %a4 = add i32 %a, 16 + %a5 = add i32 %a, 28 + %a6 = add i32 %a, 32 + call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 %a1, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 %a2, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 %a3, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 %a4, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 %a5, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 %a6, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x1_offen_merged_glc_slc: +;CHECK-NOT: s_waitcnt +;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4{{$}} +;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}} +;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}} +define amdgpu_ps void @buffer_store_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { + %a1 = add i32 %a, 4 + %a2 = add i32 %a, 8 + %a3 = add i32 %a, 12 + %a4 = add i32 %a, 16 + %a5 = add i32 %a, 28 + %a6 = add i32 %a, 32 + call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 %a1, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 %a2, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 %a3, i32 0, i32 1) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 %a4, i32 0, i32 1) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 %a5, i32 0, i32 3) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 %a6, i32 0, i32 3) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x2_offen_merged: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 +define amdgpu_ps void @buffer_store_x2_offen_merged(<4 x i32> inreg %rsrc, i32 %a, <2 x float> %v1, <2 x float> %v2) { + %a1 = add i32 %a, 4 + %a2 = add i32 %a, 12 + call void 
@llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 %a1, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 %a2, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x1_offset_merged: +;CHECK-NOT: s_waitcnt +;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 +;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28 +define amdgpu_ps void @buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { + call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 16, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 28, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 32, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x2_offset_merged: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 +define amdgpu_ps void @buffer_store_x2_offset_merged(<4 x i32> inreg %rsrc, <2 x float> %v1,<2 x float> %v2) { + call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 12, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32) #0 +declare void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32) #0 +declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #0 +declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } Index: test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.ll @@ -0,0 +1,127 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI + +;CHECK-LABEL: {{^}}test1: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_atomic_swap v0, {{v[0-9]+}}, s[0:3], 0 idxen glc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_swap v0, {{v[0-9]+}}, s[0:3], 0 idxen glc +;CHECK: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_swap v0, {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 idxen offen glc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_swap v0, {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 idxen offen glc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_swap v0, v[1:2], s[0:3], 0 idxen offen offset:42 glc +;CHECK-DAG: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_swap v0, {{v[0-9]+}}, s[0:3], [[SOFS]] idxen offset:4 glc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_swap v0, {{v[0-9]+}}, s[0:3], 0 idxen{{$}} +define amdgpu_ps float @test1(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex, i32 %voffset) { +main_body: + %o1 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, 
i32 0) + %o2 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap(i32 %o1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + %o3 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap(i32 %o2, <4 x i32> %rsrc, i32 0, i32 %voffset, i32 0, i32 0) + %o4 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap(i32 %o3, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 0, i32 0) + %ofs.5 = add i32 %voffset, 42 + %o5 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap(i32 %o4, <4 x i32> %rsrc, i32 0, i32 %ofs.5, i32 0, i32 0) + %o6 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap(i32 %o5, <4 x i32> %rsrc, i32 0, i32 4, i32 8188, i32 0) + %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.swap(i32 %o6, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %out = bitcast i32 %o6 to float + ret float %out +} + +;CHECK-LABEL: {{^}}test2: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_atomic_add v0, v1, s[0:3], 0 idxen glc{{$}} +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_sub v0, v1, s[0:3], 0 idxen glc slc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_smin v0, v1, s[0:3], 0 idxen glc{{$}} +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_umin v0, v1, s[0:3], 0 idxen glc slc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_smax v0, v1, s[0:3], 0 idxen glc{{$}} +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_umax v0, v1, s[0:3], 0 idxen glc slc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_and v0, v1, s[0:3], 0 idxen glc{{$}} +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_or v0, v1, s[0:3], 0 idxen glc slc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_xor v0, v1, s[0:3], 0 idxen glc +define amdgpu_ps float @test2(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) { +main_body: + %t1 = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + %t2 = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 %t1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 2) + %t3 = call i32 @llvm.amdgcn.struct.buffer.atomic.smin(i32 %t2, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + %t4 = call i32 @llvm.amdgcn.struct.buffer.atomic.umin(i32 %t3, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 2) + %t5 = call i32 @llvm.amdgcn.struct.buffer.atomic.smax(i32 %t4, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + %t6 = call i32 @llvm.amdgcn.struct.buffer.atomic.umax(i32 %t5, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 2) + %t7 = call i32 @llvm.amdgcn.struct.buffer.atomic.and(i32 %t6, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + %t8 = call i32 @llvm.amdgcn.struct.buffer.atomic.or(i32 %t7, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 2) + %t9 = call i32 @llvm.amdgcn.struct.buffer.atomic.xor(i32 %t8, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + %out = bitcast i32 %t9 to float + ret float %out +} + +; Ideally, we would teach tablegen & friends that cmpswap only modifies the +; first vgpr. Since we don't do that yet, the register allocator will have to +; create copies which we don't bother to track here. 
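+; (For reference: with glc set, buffer_atomic_cmpswap returns the pre-op
+; memory value in the first register of the src/cmp pair only; the second
+; register is not part of the result.)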
+; +;CHECK-LABEL: {{^}}test3: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 idxen glc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v2, s[0:3], 0 idxen glc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc +;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 idxen offen glc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 idxen offen glc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 idxen offen offset:44 glc +;CHECK-DAG: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], [[SOFS]] idxen offset:4 glc +define amdgpu_ps float @test3(<4 x i32> inreg %rsrc, i32 %data, i32 %cmp, i32 %vindex, i32 %voffset) { +main_body: + %o1 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %o2 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap(i32 %o1, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + %o3 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap(i32 %o2, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 %voffset, i32 0, i32 0) + %o4 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap(i32 %o3, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 0, i32 0) + %offs.5 = add i32 %voffset, 44 + %o5 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap(i32 %o4, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 %offs.5, i32 0, i32 0) + %o6 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap(i32 %o5, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 4, i32 8188, i32 0) + +; Detecting the no-return variant doesn't work right now because of how the +; intrinsic is replaced by an instruction that feeds into an EXTRACT_SUBREG. +; Since there probably isn't a reasonable use-case of cmpswap that discards +; the return value, that seems okay. 
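+; The no-return pattern for plain swap is still covered by the final %unused
+; call in test1 above.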
+; +; %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap(i32 %o6, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %out = bitcast i32 %o6 to float + ret float %out +} + +;CHECK-LABEL: {{^}}test4: +;CHECK: buffer_atomic_add v0, +define amdgpu_ps float @test4() { +main_body: + %v = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 1, <4 x i32> undef, i32 0, i32 4, i32 0, i32 0) + %v.float = bitcast i32 %v to float + ret float %v.float +} + +declare i32 @llvm.amdgcn.struct.buffer.atomic.swap(i32, <4 x i32>, i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.smin(i32, <4 x i32>, i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.umin(i32, <4 x i32>, i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.smax(i32, <4 x i32>, i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.umax(i32, <4 x i32>, i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.and(i32, <4 x i32>, i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.or(i32, <4 x i32>, i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.xor(i32, <4 x i32>, i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i32, i32) #0 + +attributes #0 = { nounwind } Index: test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.d16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.d16.ll @@ -0,0 +1,41 @@ +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s + +; GCN-LABEL: {{^}}buffer_load_format_d16_x: +; GCN: buffer_load_format_d16_x v{{[0-9]+}}, {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +define amdgpu_ps half @buffer_load_format_d16_x(<4 x i32> inreg %rsrc) { +main_body: + %data = call half @llvm.amdgcn.struct.buffer.load.format.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret half %data +} + +; GCN-LABEL: {{^}}buffer_load_format_d16_xy: +; UNPACKED: buffer_load_format_d16_xy v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] + +; PACKED: buffer_load_format_d16_xy v[[FULL:[0-9]+]], {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[FULL]] +define amdgpu_ps half @buffer_load_format_d16_xy(<4 x i32> inreg %rsrc) { +main_body: + %data = call <2 x half> @llvm.amdgcn.struct.buffer.load.format.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %elt = extractelement <2 x half> %data, i32 1 + ret half %elt +} + +; GCN-LABEL: {{^}}buffer_load_format_d16_xyzw: +; UNPACKED: buffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] + +; PACKED: buffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, 
v[[HI]] +define amdgpu_ps half @buffer_load_format_d16_xyzw(<4 x i32> inreg %rsrc) { +main_body: + %data = call <4 x half> @llvm.amdgcn.struct.buffer.load.format.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %elt = extractelement <4 x half> %data, i32 3 + ret half %elt +} + +declare half @llvm.amdgcn.struct.buffer.load.format.f16(<4 x i32>, i32, i32, i32, i32) +declare <2 x half> @llvm.amdgcn.struct.buffer.load.format.v2f16(<4 x i32>, i32, i32, i32, i32) +declare <4 x half> @llvm.amdgcn.struct.buffer.load.format.v4f16(<4 x i32>, i32, i32, i32, i32) Index: test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll @@ -0,0 +1,115 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI + +;CHECK-LABEL: {{^}}buffer_load: +;CHECK: buffer_load_format_xyzw v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen +;CHECK: buffer_load_format_xyzw v[4:7], {{v[0-9]+}}, s[0:3], 0 idxen glc +;CHECK: buffer_load_format_xyzw v[8:11], {{v[0-9]+}}, s[0:3], 0 idxen slc +;CHECK: s_waitcnt +define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) { +main_body: + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0) + %data_glc = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 1) + %data_slc = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 2) + %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0 + %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1 + %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2 + ret {<4 x float>, <4 x float>, <4 x float>} %r2 +} + +;CHECK-LABEL: {{^}}buffer_load_immoffs: +;CHECK: buffer_load_format_xyzw v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:42 +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) { +main_body: + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 42, i32 0, i32 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_immoffs_large: +;CHECK-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 60 idxen offset:4092 +;CHECK-DAG: s_movk_i32 [[OFS1:s[0-9]+]], 0x7ffc +;CHECK-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], [[OFS1]] idxen offset:4092 +;CHECK-DAG: s_mov_b32 [[OFS2:s[0-9]+]], 0x8ffc +;CHECK-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], [[OFS2]] idxen offset:4 +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) { +main_body: + %d.0 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4092, i32 60, i32 0) + %d.1 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4092, i32 32764, i32 0) + %d.2 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4, i32 36860, i32 0) + %d.3 = fadd <4 x float> %d.0, %d.1 + %data = fadd <4 x float> %d.2, %d.3 + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_idx: +;CHECK: buffer_load_format_xyzw v[0:3], v0, 
s[0:3], 0 idxen +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) { +main_body: + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 %1, i32 0, i32 0, i32 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_ofs: +;CHECK: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) { +main_body: + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 %1, i32 0, i32 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_ofs_imm: +;CHECK: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60 +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) { +main_body: + %ofs = add i32 %1, 60 + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i32 0, i32 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_both: +;CHECK: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) { +main_body: + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 %1, i32 %2, i32 0, i32 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_both_reversed: +;CHECK: v_mov_b32_e32 v2, v0 +;CHECK: buffer_load_format_xyzw v[0:3], v[1:2], s[0:3], 0 idxen offen +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) { +main_body: + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 %2, i32 %1, i32 0, i32 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_x: +;CHECK: buffer_load_format_x v0, {{v[0-9]+}}, s[0:3], 0 idxen +;CHECK: s_waitcnt +define amdgpu_ps float @buffer_load_x(<4 x i32> inreg %rsrc) { +main_body: + %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret float %data +} + +;CHECK-LABEL: {{^}}buffer_load_xy: +;CHECK: buffer_load_format_xy v[0:1], {{v[0-9]+}}, s[0:3], 0 idxen +;CHECK: s_waitcnt +define amdgpu_ps <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) { +main_body: + %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret <2 x float> %data +} + +declare float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32>, i32, i32, i32, i32) #0 +declare <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32>, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32, i32) #0 + +attributes #0 = { nounwind readonly } Index: test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll @@ -0,0 +1,133 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI + +;CHECK-LABEL: {{^}}buffer_load: +;CHECK: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen +;CHECK: buffer_load_dwordx4 v[4:7], {{v[0-9]+}}, s[0:3], 0 idxen glc +;CHECK: buffer_load_dwordx4 v[8:11], {{v[0-9]+}}, s[0:3], 0 idxen slc +;CHECK: s_waitcnt +define amdgpu_ps {<4 x float>, <4 x float>, 
<4 x float>} @buffer_load(<4 x i32> inreg) { +main_body: + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0) + %data_glc = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 1) + %data_slc = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 2) + %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0 + %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1 + %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2 + ret {<4 x float>, <4 x float>, <4 x float>} %r2 +} + +;CHECK-LABEL: {{^}}buffer_load_immoffs: +;CHECK: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:40 +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) { +main_body: + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 40, i32 0, i32 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_immoffs_large: +;CHECK: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1ffc +;CHECK: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], [[OFFSET]] idxen offset:4 +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) { +main_body: + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 4, i32 8188, i32 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_idx: +;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) { +main_body: + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 0, i32 0, i32 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_ofs: +;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) { +main_body: + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %1, i32 0, i32 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_ofs_imm: +;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60 +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) { +main_body: + %ofs = add i32 %1, 60 + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i32 0, i32 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_both: +;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) { +main_body: + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 %2, i32 0, i32 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_both_reversed: +;CHECK: v_mov_b32_e32 v2, v0 +;CHECK: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 idxen offen +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) { +main_body: + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 %2, i32 %1, i32 0, i32 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_x1: +;CHECK: buffer_load_dword v0, v[0:1], s[0:3], 0 idxen offen +;CHECK: s_waitcnt +define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) { +main_body: + %data = call float 
@llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) + ret float %data +} + +;CHECK-LABEL: {{^}}buffer_load_x2: +;CHECK: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 idxen offen +;CHECK: s_waitcnt +define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) { +main_body: + %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) + ret <2 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_negative_offset: +;CHECK: v_add_{{[iu]}}32_e32 {{v[0-9]+}}, vcc, 0xfffff000, v0 +;CHECK: buffer_load_dwordx4 v[0:3], {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 idxen offen offset:4080 +define amdgpu_ps <4 x float> @buffer_load_negative_offset(<4 x i32> inreg, i32 %ofs) { +main_body: + %ofs.1 = add i32 %ofs, -16 + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs.1, i32 0, i32 0) + ret <4 x float> %data +} + +; SI won't merge ds memory operations, because of the signed offset bug, so +; we only have check lines for VI. +; CHECK-LABEL: buffer_load_mmo: +; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 +; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4 +define amdgpu_ps float @buffer_load_mmo(<4 x i32> inreg %rsrc, float addrspace(3)* %lds) { +entry: + store float 0.0, float addrspace(3)* %lds + %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4 + store float 0.0, float addrspace(3)* %tmp2 + ret float %val +} + +declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32) #0 +declare <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32>, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 + +attributes #0 = { nounwind readonly } Index: test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll @@ -0,0 +1,57 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,UNPACKED %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,GFX81 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,GFX9 %s + +; GCN-LABEL: {{^}}buffer_store_format_d16_x: +; GCN: s_load_dword s[[LO:[0-9]+]] +; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[LO]] +; GCN: buffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +define amdgpu_kernel void @buffer_store_format_d16_x(<4 x i32> %rsrc, [8 x i32], half %data, [8 x i32], i32 %index) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.format.f16(half %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) + ret void +} + +; GCN-LABEL: {{^}}buffer_store_format_d16_xy: + +; UNPACKED: s_load_dword [[S_DATA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 +; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[S_DATA]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], [[S_DATA]], 0xffff{{$}} +; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]] +; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]] +; UNPACKED: buffer_store_format_d16_xy 
v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen + +; PACKED: buffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +define amdgpu_kernel void @buffer_store_format_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %index) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.format.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) + ret void +} + +; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw: +; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10 + +; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} +; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] +; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] + +; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] +; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] + +; UNPACKED: buffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen + +; PACKED: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] +; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]] + +; PACKED: buffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +define amdgpu_kernel void @buffer_store_format_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %index) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.format.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.buffer.store.format.f16(half, <4 x i32>, i32, i32, i32, i32) +declare void @llvm.amdgcn.struct.buffer.store.format.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32) +declare void @llvm.amdgcn.struct.buffer.store.format.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32) Index: test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.ll @@ -0,0 +1,104 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=VERDE %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK-LABEL: {{^}}buffer_store: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_format_xyzw v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen +;CHECK: buffer_store_format_xyzw v[4:7], {{v[0-9]+}}, s[0:3], 0 idxen glc +;CHECK: buffer_store_format_xyzw v[8:11], {{v[0-9]+}}, s[0:3], 0 idxen slc +define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 1) + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i32 0, i32 2) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_immoffs: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_format_xyzw v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:42 +define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_idx: +;CHECK-NOT: s_waitcnt +;CHECK: 
buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen +define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_ofs: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 idxen offen +define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_both: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 idxen offen +define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_both_reversed: +;CHECK: v_mov_b32_e32 v6, v4 +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_format_xyzw v[0:3], v[5:6], s[0:3], 0 idxen offen +define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i32 0, i32 0) + ret void +} + +; Ideally, the register allocator would avoid the wait here +; +;CHECK-LABEL: {{^}}buffer_store_wait: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen +;VERDE: s_waitcnt expcnt(0) +;CHECK: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_store_format_xyzw v[0:3], v6, s[0:3], 0 idxen +define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 %3, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %data, <4 x i32> %0, i32 %4, i32 0, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x1: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_format_x v0, v1, s[0:3], 0 idxen +define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.format.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x2: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_format_xy v[0:1], v2, s[0:3], 0 idxen +define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.format.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.buffer.store.format.f32(float, <4 x i32>, i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.struct.buffer.store.format.v2f32(<2 x float>, <4 x i32>, i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32, i32) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } Index: test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll 
=================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll @@ -0,0 +1,104 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=VERDE %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK-LABEL: {{^}}buffer_store: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen +;CHECK: buffer_store_dwordx4 v[4:7], {{v[0-9]+}}, s[0:3], 0 idxen glc +;CHECK: buffer_store_dwordx4 v[8:11], {{v[0-9]+}}, s[0:3], 0 idxen slc +define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 1) + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i32 0, i32 2) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_immoffs: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:42 +define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_idx: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen +define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_ofs: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen +define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_both: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen +define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_both_reversed: +;CHECK: v_mov_b32_e32 v6, v4 +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 idxen offen +define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i32 0, i32 0) + ret void +} + +; Ideally, the register allocator would avoid the wait here +; +;CHECK-LABEL: {{^}}buffer_store_wait: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen +;VERDE: s_waitcnt expcnt(0) +;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen +define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 %3, i32 0, i32 0, i32 0) + call void 
@llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %data, <4 x i32> %0, i32 %4, i32 0, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x1: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dword v0, v1, s[0:3], 0 idxen +define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x2: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen +define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) #0 { +main_body: + call void @llvm.amdgcn.struct.buffer.store.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.struct.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly }
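+
+; For reference: as exercised throughout these tests, the trailing cachepolicy
+; operand is an immediate bitfield, with 1 selecting glc, 2 selecting slc, and
+; 3 selecting both.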