Index: include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- include/llvm/IR/IntrinsicsAMDGPU.td +++ include/llvm/IR/IntrinsicsAMDGPU.td @@ -208,10 +208,8 @@ def int_amdgcn_buffer_load_format : Intrinsic < [llvm_v4f32_ty], [llvm_v4i32_ty, // rsrc(SGPR) - llvm_i32_ty, // soffset(SGPR) - llvm_i32_ty, // offset(imm) llvm_i32_ty, // vindex(VGPR) - llvm_i32_ty, // voffset(VGPR) + llvm_i32_ty, // offset(SGPR/VGPR/imm) llvm_i1_ty, // glc(imm) llvm_i1_ty], // slc(imm) [IntrReadMem]>; @@ -220,10 +218,8 @@ [], [llvm_anyfloat_ty, // vdata(VGPR) -- can currently only select v4f32 llvm_v4i32_ty, // rsrc(SGPR) - llvm_i32_ty, // soffset(SGPR) - llvm_i32_ty, // offset(imm) llvm_i32_ty, // vindex(VGPR) - llvm_i32_ty, // voffset(VGPR) + llvm_i32_ty, // offset(SGPR/VGPR/imm) llvm_i1_ty, // glc(imm) llvm_i1_ty], // slc(imm) []>; Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -127,6 +127,13 @@ SDValue &TFE) const; bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset, SDValue &GLC) const; + void SelectMUBUFConstant(SDValue Constant, + SDValue &SOffset, + SDValue &ImmOffset) const; + bool SelectMUBUFIntrinsicOffset(SDValue Offset, SDValue &SOffset, + SDValue &ImmOffset) const; + bool SelectMUBUFIntrinsicVOffset(SDValue Offset, SDValue &SOffset, + SDValue &ImmOffset, SDValue &VOffset) const; bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset, bool &Imm) const; bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset, @@ -1112,6 +1119,78 @@ return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE); } +void AMDGPUDAGToDAGISel::SelectMUBUFConstant(SDValue Constant, + SDValue &SOffset, + SDValue &ImmOffset) const { + SDLoc DL(Constant); + uint32_t Imm = cast(Constant)->getZExtValue(); + uint32_t Overflow = 0; + + if (Imm >= 4096) { + if (Imm <= 4095 + 64) { + // Use an SOffset inline constant for 1..64 + Overflow = Imm - 4095; + Imm = 4095; + } else { + // Try to keep the same value in SOffset for adjacent loads, so that + // the corresponding register contents can be re-used. + // + // Load values with all low-bits set into SOffset, so that a larger + // range of values can be covered using s_movk_i32 + uint32_t High = (Imm + 1) & ~4095; + uint32_t Low = (Imm + 1) & 4095; + Imm = Low; + Overflow = High - 1; + } + } + + ImmOffset = CurDAG->getTargetConstant(Imm, DL, MVT::i16); + + if (Overflow <= 64) + SOffset = CurDAG->getTargetConstant(Overflow, DL, MVT::i32); + else + SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, + CurDAG->getTargetConstant(Overflow, DL, MVT::i32)), + 0); +} + +bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicOffset(SDValue Offset, + SDValue &SOffset, + SDValue &ImmOffset) const { + SDLoc DL(Offset); + + if (!isa(Offset)) + return false; + + SelectMUBUFConstant(Offset, SOffset, ImmOffset); + + return true; +} + +bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicVOffset(SDValue Offset, + SDValue &SOffset, + SDValue &ImmOffset, + SDValue &VOffset) const { + SDLoc DL(Offset); + + // Don't generate an unnecessary voffset for constant offsets. + if (isa(Offset)) + return false; + + if (CurDAG->isBaseWithConstantOffset(Offset)) { + SDValue N0 = Offset.getOperand(0); + SDValue N1 = Offset.getOperand(1); + SelectMUBUFConstant(N1, SOffset, ImmOffset); + VOffset = N0; + } else { + SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32); + ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16); + VOffset = Offset; + } + + return true; +} + /// /// \param EncodedOffset This is the immediate value that will be encoded /// directly into the instruction. On SI/CI the \p EncodedOffset Index: lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.td +++ lib/Target/AMDGPU/SIInstrInfo.td @@ -722,6 +722,8 @@ def MUBUFScratch : ComplexPattern; def MUBUFOffset : ComplexPattern; def MUBUFOffsetAtomic : ComplexPattern; +def MUBUFIntrinsicOffset : ComplexPattern; +def MUBUFIntrinsicVOffset : ComplexPattern; def SMRDImm : ComplexPattern; def SMRDImm32 : ComplexPattern; Index: lib/Target/AMDGPU/SIInstructions.td =================================================================== --- lib/Target/AMDGPU/SIInstructions.td +++ lib/Target/AMDGPU/SIInstructions.td @@ -2107,28 +2107,38 @@ // buffer_load/store_format patterns //===----------------------------------------------------------------------===// def : Pat< - (int_amdgcn_buffer_load_format v4i32:$rsrc, i32:$soffset, imm:$offset, 0, 0, + (int_amdgcn_buffer_load_format v4i32:$rsrc, 0, + (MUBUFIntrinsicOffset i32:$soffset, + i16:$offset), imm:$glc, imm:$slc), (BUFFER_LOAD_FORMAT_XYZW_OFFSET $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), 0) >; def : Pat< - (int_amdgcn_buffer_load_format v4i32:$rsrc, i32:$soffset, imm:$offset, i32:$vindex, 0, + (int_amdgcn_buffer_load_format v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicOffset i32:$soffset, + i16:$offset), imm:$glc, imm:$slc), (BUFFER_LOAD_FORMAT_XYZW_IDXEN $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), 0) >; def : Pat< - (int_amdgcn_buffer_load_format v4i32:$rsrc, i32:$soffset, imm:$offset, 0, i32:$voffset, + (int_amdgcn_buffer_load_format v4i32:$rsrc, 0, + (MUBUFIntrinsicVOffset i32:$soffset, + i16:$offset, + i32:$voffset), imm:$glc, imm:$slc), (BUFFER_LOAD_FORMAT_XYZW_OFFEN $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), 0) >; def : Pat< - (int_amdgcn_buffer_load_format v4i32:$rsrc, i32:$soffset, imm:$offset, i32:$vindex, i32:$voffset, + (int_amdgcn_buffer_load_format v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicVOffset i32:$soffset, + i16:$offset, + i32:$voffset), imm:$glc, imm:$slc), (BUFFER_LOAD_FORMAT_XYZW_BOTHEN (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), @@ -2137,32 +2147,38 @@ >; def : Pat< - (int_amdgcn_buffer_store_format v4f32:$vdata, v4i32:$rsrc, - i32:$soffset, imm:$offset, 0, 0, + (int_amdgcn_buffer_store_format v4f32:$vdata, v4i32:$rsrc, 0, + (MUBUFIntrinsicOffset i32:$soffset, + i16:$offset), imm:$glc, imm:$slc), (BUFFER_STORE_FORMAT_XYZW_OFFSET $vdata, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), 0) >; def : Pat< - (int_amdgcn_buffer_store_format v4f32:$vdata, v4i32:$rsrc, - i32:$soffset, imm:$offset, i32:$vindex, 0, - imm:$glc, imm:$slc), + (int_amdgcn_buffer_store_format v4f32:$vdata, v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicOffset i32:$soffset, + i16:$offset), + imm:$glc, imm:$slc), (BUFFER_STORE_FORMAT_XYZW_IDXEN $vdata, $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), 0) >; def : Pat< - (int_amdgcn_buffer_store_format v4f32:$vdata, v4i32:$rsrc, - i32:$soffset, imm:$offset, 0, i32:$voffset, + (int_amdgcn_buffer_store_format v4f32:$vdata, v4i32:$rsrc, 0, + (MUBUFIntrinsicVOffset i32:$soffset, + i16:$offset, + i32:$voffset), imm:$glc, imm:$slc), (BUFFER_STORE_FORMAT_XYZW_OFFEN $vdata, $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), 0) >; def : Pat< - (int_amdgcn_buffer_store_format v4f32:$vdata, v4i32:$rsrc, i32:$soffset, - imm:$offset, i32:$vindex, i32:$voffset, + (int_amdgcn_buffer_store_format v4f32:$vdata, v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicVOffset i32:$soffset, + i16:$offset, + i32:$voffset), imm:$glc, imm:$slc), (BUFFER_STORE_FORMAT_XYZW_BOTHEN $vdata, Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll @@ -2,15 +2,15 @@ ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s ;CHECK-LABEL: {{^}}buffer_load: -;CHECK: buffer_load_format_xyzw v[0:3], s[0:3], s4 -;CHECK: buffer_load_format_xyzw v[4:7], s[0:3], s4 glc -;CHECK: buffer_load_format_xyzw v[8:11], s[0:3], s4 slc +;CHECK: buffer_load_format_xyzw v[0:3], s[0:3], 0 +;CHECK: buffer_load_format_xyzw v[4:7], s[0:3], 0 glc +;CHECK: buffer_load_format_xyzw v[8:11], s[0:3], 0 slc ;CHECK: s_waitcnt -define {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg, i32 inreg) #0 { +define {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) #0 { main_body: - %data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 %1, i32 0, i32 0, i32 0, i1 0, i1 0) - %data_glc = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 %1, i32 0, i32 0, i32 0, i1 1, i1 0) - %data_slc = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 %1, i32 0, i32 0, i32 0, i1 0, i1 1) + %data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 0, i1 0, i1 0) + %data_glc = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 0, i1 1, i1 0) + %data_slc = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 0, i1 0, i1 1) %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0 %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1 %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2 @@ -18,11 +18,42 @@ } ;CHECK-LABEL: {{^}}buffer_load_immoffs: -;CHECK: buffer_load_format_xyzw v[0:3], s[0:3], s4 offset:42 +;CHECK: buffer_load_format_xyzw v[0:3], s[0:3], 0 offset:42 ;CHECK: s_waitcnt -define <4 x float> @buffer_load_immoffs(<4 x i32> inreg, i32 inreg) #0 { +define <4 x float> @buffer_load_immoffs(<4 x i32> inreg) #0 { main_body: - %data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 %1, i32 42, i32 0, i32 0, i1 0, i1 0) + %data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 42, i1 0, i1 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_immoffs_large: +;CHECK-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 61 offset:4095 +;CHECK-DAG: s_movk_i32 [[OFS1:s[0-9]+]], 0x7fff +;CHECK: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, s[0:3], [[OFS1]] offset:4093 +;CHECK: s_mov_b32 [[OFS2:s[0-9]+]], 0x8fff +;CHECK: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, s[0:3], [[OFS2]] offset:1 +;CHECK: s_waitcnt +define <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) #0 { +main_body: + %d.0 = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 4156, i1 0, i1 0) + %d.1 = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 36860, i1 0, i1 0) + %d.2 = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 36864, i1 0, i1 0) + %d.3 = fadd <4 x float> %d.0, %d.1 + %data = fadd <4 x float> %d.2, %d.3 + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_immoffs_reuse: +;CHECK: s_movk_i32 [[OFS:s[0-9]+]], 0xfff +;CHECK: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, s[0:3], [[OFS]] offset:65 +;CHECK-NOT: s_mov +;CHECK: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, s[0:3], [[OFS]] offset:81 +;CHECK: s_waitcnt +define <4 x float> @buffer_load_immoffs_reuse(<4 x i32> inreg) #0 { +main_body: + %d.0 = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 4160, i1 0, i1 0) + %d.1 = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 4176, i1 0, i1 0) + %data = fadd <4 x float> %d.0, %d.1 ret <4 x float> %data } @@ -31,7 +62,7 @@ ;CHECK: s_waitcnt define <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) #0 { main_body: - %data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 0, i32 %1, i32 0, i1 0, i1 0) + %data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 %1, i32 0, i1 0, i1 0) ret <4 x float> %data } @@ -40,7 +71,17 @@ ;CHECK: s_waitcnt define <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) #0 { main_body: - %data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 0, i32 0, i32 %1, i1 0, i1 0) + %data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 %1, i1 0, i1 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_ofs_imm: +;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen offset:58 +;CHECK: s_waitcnt +define <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) #0 { +main_body: + %ofs = add i32 %1, 58 + %data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 %ofs, i1 0, i1 0) ret <4 x float> %data } @@ -49,7 +90,7 @@ ;CHECK: s_waitcnt define <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) #0 { main_body: - %data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 0, i32 %1, i32 %2, i1 0, i1 0) + %data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 %1, i32 %2, i1 0, i1 0) ret <4 x float> %data } @@ -59,11 +100,11 @@ ;CHECK: s_waitcnt define <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) #0 { main_body: - %data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 0, i32 %2, i32 %1, i1 0, i1 0) + %data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 %2, i32 %1, i1 0, i1 0) ret <4 x float> %data } -declare <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32>, i32, i32, i32, i32, i1, i1) #1 +declare <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32>, i32, i32, i1, i1) #1 attributes #0 = { "ShaderType"="0" } attributes #1 = { nounwind readonly } Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll @@ -2,55 +2,55 @@ ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s ;CHECK-LABEL: {{^}}buffer_store: -;CHECK: buffer_store_format_xyzw v[0:3], s[0:3], s4 -;CHECK: buffer_store_format_xyzw v[4:7], s[0:3], s4 glc -;CHECK: buffer_store_format_xyzw v[8:11], s[0:3], s4 slc -define void @buffer_store(<4 x i32> inreg, i32 inreg, <4 x float>, <4 x float>, <4 x float>) #0 { +;CHECK: buffer_store_format_xyzw v[0:3], s[0:3], 0 +;CHECK: buffer_store_format_xyzw v[4:7], s[0:3], 0 glc +;CHECK: buffer_store_format_xyzw v[8:11], s[0:3], 0 slc +define void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) #0 { main_body: - call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %2, <4 x i32> %0, i32 %1, i32 0, i32 0, i32 0, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %3, <4 x i32> %0, i32 %1, i32 0, i32 0, i32 0, i1 1, i1 0) - call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %4, <4 x i32> %0, i32 %1, i32 0, i32 0, i32 0, i1 0, i1 1) + call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i1 1, i1 0) + call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i1 0, i1 1) ret void } ;CHECK-LABEL: {{^}}buffer_store_immoffs: -;CHECK: buffer_store_format_xyzw v[0:3], s[0:3], s4 offset:42 -define void @buffer_store_immoffs(<4 x i32> inreg, i32 inreg, <4 x float>) #0 { +;CHECK: buffer_store_format_xyzw v[0:3], s[0:3], 0 offset:42 +define void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) #0 { main_body: - call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %2, <4 x i32> %0, i32 %1, i32 42, i32 0, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i1 0, i1 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_idx: ;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen -define void @buffer_store_idx(<4 x i32> inreg, i32 inreg, <4 x float>, i32) #0 { +define void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) #0 { main_body: - call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 %3, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_ofs: ;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 offen -define void @buffer_store_ofs(<4 x i32> inreg, i32 inreg, <4 x float>, i32) #0 { +define void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) #0 { main_body: - call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 %3, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i1 0, i1 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_both: ;CHECK: buffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 idxen offen -define void @buffer_store_both(<4 x i32> inreg, i32 inreg, <4 x float>, i32, i32) #0 { +define void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) #0 { main_body: - call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 %3, i32 %4, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i1 0, i1 0) ret void } ;CHECK-LABEL: {{^}}buffer_store_both_reversed: ;CHECK: v_mov_b32_e32 v6, v4 ;CHECK: buffer_store_format_xyzw v[0:3], v[5:6], s[0:3], 0 idxen offen -define void @buffer_store_both_reversed(<4 x i32> inreg, i32 inreg, <4 x float>, i32, i32) #0 { +define void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) #0 { main_body: - call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 %4, i32 %3, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i1 0, i1 0) ret void } @@ -62,16 +62,16 @@ ;CHECK: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_store_format_xyzw v[0:3], v6, s[0:3], 0 idxen -define void @buffer_store_wait(<4 x i32> inreg, i32 inreg, <4 x float>, i32, i32, i32) #0 { +define void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) #0 { main_body: - call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 %3, i32 0, i1 0, i1 0) - %data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 0, i32 %4, i32 0, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %data, <4 x i32> %0, i32 0, i32 0, i32 %5, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0) + %data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 %3, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %data, <4 x i32> %0, i32 %4, i32 0, i1 0, i1 0) ret void } -declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32, i1, i1) #1 -declare <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32>, i32, i32, i32, i32, i1, i1) #2 +declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1 +declare <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32>, i32, i32, i1, i1) #2 attributes #0 = { "ShaderType"="0" } attributes #1 = { nounwind }