diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -198,11 +198,13 @@
   /// global value \p GV, false otherwise.
   bool shouldEmitPCReloc(const GlobalValue *GV) const;
 
-  // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
-  // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
-  // pointed to by Offsets.
-  void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG,
-                        SDValue *Offsets, unsigned Align = 4) const;
+  /// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
+  /// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
+  /// pointed to by Offsets.
+  /// \returns 0 if there is a non-constant offset or if the offset is 0.
+  /// Otherwise returns the constant offset.
+  unsigned setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG,
+                            SDValue *Offsets, unsigned Align = 4) const;
 
   // Handle 8 bit and 16 bit buffer loads
   SDValue handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT, SDLoc DL,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6069,6 +6069,25 @@
   }
 }
 
+static unsigned getBufferOffsetForMMO(const SDValue &VOffset,
+                                      const SDValue &SOffset,
+                                      const SDValue &Offset,
+                                      const SDValue *VIndex = nullptr) {
+
+  if (!isa<ConstantSDNode>(VOffset) || !isa<ConstantSDNode>(SOffset) ||
+      !isa<ConstantSDNode>(Offset))
+    return 0;
+
+  if (VIndex) {
+    if (!isa<ConstantSDNode>(*VIndex) || !cast<ConstantSDNode>(*VIndex)->isNullValue())
+      return 0;
+  }
+
+  return cast<ConstantSDNode>(VOffset)->getSExtValue() +
+         cast<ConstantSDNode>(SOffset)->getSExtValue() +
+         cast<ConstantSDNode>(Offset)->getSExtValue();
+}
+
 SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
                                                  SelectionDAG &DAG) const {
   unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
@@ -6215,13 +6234,18 @@
       DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
     };
 
-    setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
+    unsigned Offset = setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
+    // We don't know the offset if vindex is non-zero, so clear it.
+    if (IdxEn)
+      Offset = 0;
+
     unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
         AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
 
     EVT VT = Op.getValueType();
     EVT IntVT = VT.changeTypeToInteger();
     auto *M = cast<MemSDNode>(Op);
+    M->getMemOperand()->setOffset(Offset);
     EVT LoadVT = Op.getValueType();
 
     if (LoadVT.getScalarType() == MVT::f16)
@@ -6252,7 +6276,9 @@
       DAG.getConstant(0, DL, MVT::i1), // idxen
     };
 
-    return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
+    auto *M = cast<MemSDNode>(Op);
+    M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[3], Ops[4], Ops[5]));
+    return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
   }
   case Intrinsic::amdgcn_struct_buffer_load:
   case Intrinsic::amdgcn_struct_buffer_load_format: {
@@ -6270,6 +6296,8 @@
       DAG.getConstant(1, DL, MVT::i1), // idxen
     };
 
+    auto *M = cast<MemSDNode>(Op);
+    M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[3], Ops[4], Ops[5], &Ops[2]));
     return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
   }
   case Intrinsic::amdgcn_tbuffer_load: {
@@ -6375,10 +6403,14 @@
       DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
       DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
     };
-    setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
+    unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
+    // We don't know the offset if vindex is non-zero, so clear it.
+    if (IdxEn)
+      Offset = 0;
     EVT VT = Op.getValueType();
 
     auto *M = cast<MemSDNode>(Op);
+    M->getMemOperand()->setOffset(Offset);
     unsigned Opcode = 0;
 
     switch (IntrID) {
@@ -6446,6 +6478,7 @@
     EVT VT = Op.getValueType();
 
     auto *M = cast<MemSDNode>(Op);
+    M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6]));
     unsigned Opcode = 0;
 
     switch (IntrID) {
@@ -6519,6 +6552,7 @@
     EVT VT = Op.getValueType();
 
     auto *M = cast<MemSDNode>(Op);
+    M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6], &Ops[3]));
     unsigned Opcode = 0;
 
     switch (IntrID) {
@@ -6582,9 +6616,13 @@
       DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
       DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
     };
-    setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
+    unsigned Offset = setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
+    // We don't know the offset if vindex is non-zero, so clear it.
+    if (IdxEn)
+      Offset = 0;
     EVT VT = Op.getValueType();
     auto *M = cast<MemSDNode>(Op);
+    M->getMemOperand()->setOffset(Offset);
 
     return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
                                    Op->getVTList(), Ops, VT, M->getMemOperand());
@@ -6605,6 +6643,7 @@
     };
     EVT VT = Op.getValueType();
     auto *M = cast<MemSDNode>(Op);
+    M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[5], Ops[6], Ops[7]));
 
     return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
                                    Op->getVTList(), Ops, VT, M->getMemOperand());
@@ -6625,6 +6664,7 @@
     };
     EVT VT = Op.getValueType();
     auto *M = cast<MemSDNode>(Op);
+    M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[5], Ops[6], Ops[7], &Ops[4]));
 
     return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
                                    Op->getVTList(), Ops, VT, M->getMemOperand());
@@ -6874,11 +6914,15 @@
       DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
       DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
     };
-    setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
+    unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
+    // We don't know the offset if vindex is non-zero, so clear it.
+    if (IdxEn)
+      Offset = 0;
     unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
                    AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
     Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
     MemSDNode *M = cast<MemSDNode>(Op);
+    M->getMemOperand()->setOffset(Offset);
 
     // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
     EVT VDataType = VData.getValueType().getScalarType();
@@ -6923,6 +6967,7 @@
         IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
     Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
     MemSDNode *M = cast<MemSDNode>(Op);
+    M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6]));
 
     // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
     if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
@@ -6967,6 +7012,7 @@
                     AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
     Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
     MemSDNode *M = cast<MemSDNode>(Op);
+    M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6], &Ops[3]));
 
     // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
     EVT VDataType = VData.getValueType().getScalarType();
@@ -6993,10 +7039,14 @@
      DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
      DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
    };
-    setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
+    unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
+    // We don't know the offset if vindex is non-zero, so clear it.
+    if (IdxEn)
+      Offset = 0;
     EVT VT = Op.getOperand(2).getValueType();
 
     auto *M = cast<MemSDNode>(Op);
+    M->getMemOperand()->setOffset(Offset);
     unsigned Opcode = VT.isVector() ? AMDGPUISD::BUFFER_ATOMIC_PK_FADD
                                     : AMDGPUISD::BUFFER_ATOMIC_FADD;
 
@@ -7090,7 +7140,7 @@
 // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
 // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
 // pointed to by Offsets.
-void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
+unsigned SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
                                         SelectionDAG &DAG, SDValue *Offsets,
                                         unsigned Align) const {
   SDLoc DL(CombinedOffset);
@@ -7101,7 +7151,7 @@
       Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
       Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
       Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
-      return;
+      return SOffset + ImmOffset;
     }
   }
   if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
@@ -7114,12 +7164,13 @@
       Offsets[0] = N0;
       Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
       Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
-      return;
+      return 0;
     }
   }
   Offsets[0] = CombinedOffset;
   Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
   Offsets[2] = DAG.getConstant(0, DL, MVT::i32);
+  return 0;
 }
 
 // Handle 8 bit and 16 bit buffer loads
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll
@@ -0,0 +1,414 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; RUN: llc -march=amdgcn -verify-machineinstrs -stop-after=amdgpu-isel -o - %s | FileCheck -check-prefix=GCN %s
+
+define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias dereferenceable(18446744073709551615) %arg0, i32 %arg1) {
+  ; GCN-LABEL: name: mmo_offsets0
+  ; GCN: bb.0.bb.0:
+  ; GCN:   liveins: $sgpr0, $vgpr0
+  ; GCN:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GCN:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; GCN:   [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 0
+  ; GCN:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
+  ; GCN:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sreg_128 = S_LOAD_DWORDX4_IMM killed [[REG_SEQUENCE]], 0, 0, 0 :: (dereferenceable invariant load 16 from %ir.arg0, addrspace 6)
+  ; GCN:   [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 16, align 1, addrspace 4)
+  ; GCN:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+  ; GCN:   [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4)
+  ; GCN:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4)
+  ; GCN:   [[BUFFER_LOAD_DWORDX4_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4)
+  ; GCN:   INLINEASM &"", 1
+  ; GCN:   BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET]],
[[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 32, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: INLINEASM &"", 1 + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 48, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: INLINEASM &"", 1 + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 64, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: INLINEASM &"", 1 + ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 80, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom 
TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: INLINEASM &"", 1 + ; GCN: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GCN: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 96, align 1, addrspace 4) + ; GCN: [[COPY2:%[0-9]+]]:sreg_32_xm0 = COPY [[DEF]].sub0 + ; GCN: [[DEF1:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[COPY3:%[0-9]+]]:sreg_32_xm0 = COPY [[DEF1]].sub0 + ; GCN: [[DEF2:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[COPY4:%[0-9]+]]:sreg_32_xm0 = COPY [[DEF2]].sub0 + ; GCN: [[DEF3:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[COPY5:%[0-9]+]]:sreg_32_xm0 = COPY [[DEF3]].sub0 + ; GCN: INLINEASM &"", 1 + ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec + ; GCN: BUFFER_ATOMIC_ADD_F32_OFFSET [[V_MOV_B32_e32_1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom TargetCustom7 + 112, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_F32_OFFEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (load store 4 on custom TargetCustom7, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom TargetCustom7, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom TargetCustom7, addrspace 4) + ; GCN: INLINEASM &"", 1 + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 128, align 1, addrspace 4) + ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 64 + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_1]], 64, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 128, align 1, addrspace 4) + ; GCN: [[S_MOV_B32_2:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 128 + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 128, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], 
[[S_MOV_B32_2]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY6]], 128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: INLINEASM &"", 1 + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 144, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 144, align 1, addrspace 4) + ; GCN: [[S_MOV_B32_3:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 72 + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_3]], 72, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 144, align 1, addrspace 4) + ; GCN: [[S_MOV_B32_4:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 144 + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 144, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY7]], 144, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: INLINEASM &"", 1 + ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 160, align 1, addrspace 4) + ; GCN: [[S_MOV_B32_5:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 80 + ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_5]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 160, align 1, addrspace 4) + ; GCN: [[S_MOV_B32_6:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 160 + ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 160, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[COPY8]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: INLINEASM &"", 1 + ; GCN: [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 176, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 176, align 1, addrspace 4) + ; GCN: [[COPY9:%[0-9]+]]:sreg_32_xm0 = COPY [[DEF4]].sub0 + ; GCN: [[S_MOV_B32_7:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 88 + ; GCN: [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET 
[[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_7]], 88, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 176, align 1, addrspace 4) + ; GCN: [[COPY10:%[0-9]+]]:sreg_32_xm0 = COPY [[DEF5]].sub0 + ; GCN: [[S_MOV_B32_8:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 176 + ; GCN: [[DEF6:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 176, align 1, addrspace 4) + ; GCN: [[COPY11:%[0-9]+]]:sreg_32_xm0 = COPY [[DEF6]].sub0 + ; GCN: [[DEF7:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[COPY12:%[0-9]+]]:sreg_32_xm0 = COPY [[DEF7]].sub0 + ; GCN: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN: [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[COPY13]], 176, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[COPY14:%[0-9]+]]:sreg_32_xm0 = COPY [[DEF8]].sub0 + ; GCN: INLINEASM &"", 1 + ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 192, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 192, align 1, addrspace 4) + ; GCN: [[S_MOV_B32_9:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 96 + ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_9]], 96, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 192, align 1, addrspace 4) + ; GCN: [[S_MOV_B32_10:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 192 + ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 192, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY15]], 192, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: INLINEASM &"", 1 + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 208, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 208, align 1, addrspace 4) + ; GCN: [[S_MOV_B32_11:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 104 + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_11]], 104, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 208, align 1, addrspace 4) + ; GCN: [[S_MOV_B32_12:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 208 + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable 
store 16 into custom TargetCustom7 + 208, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[COPY16:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY16]], 208, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: INLINEASM &"", 1 + ; GCN: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY17]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 224, align 1, addrspace 4) + ; GCN: [[S_MOV_B32_13:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 112 + ; GCN: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY18]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_13]], 112, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 224, align 1, addrspace 4) + ; GCN: [[S_MOV_B32_14:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 224 + ; GCN: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY19]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 224, align 1, addrspace 4) + ; GCN: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GCN: [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: [[COPY21:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY20]], [[S_LOAD_DWORDX4_IMM]], [[COPY21]], 224, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: INLINEASM &"", 1 + ; GCN: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY22]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 240, align 1, addrspace 4) + ; GCN: [[S_MOV_B32_15:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 120 + ; GCN: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY23]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_15]], 120, 0, 0, 0, 0, implicit 
$exec :: (dereferenceable load 16 from custom TargetCustom7 + 240, align 1, addrspace 4) + ; GCN: [[S_MOV_B32_16:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 240 + ; GCN: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY24]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 240, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: [[COPY26:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY25]], [[S_LOAD_DWORDX4_IMM]], [[COPY26]], 240, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: INLINEASM &"", 1 + ; GCN: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY27]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 256, align 1, addrspace 4) + ; GCN: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY28]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 128, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 256, align 1, addrspace 4) + ; GCN: [[S_MOV_B32_17:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 256 + ; GCN: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY29]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 256, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_BOTHEN [[COPY]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[COPY30:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: [[COPY31:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY30]], [[S_LOAD_DWORDX4_IMM]], [[COPY31]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: INLINEASM &"", 1 + ; GCN: [[COPY32:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: 
[[DEF9:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY32]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 272, align 1, addrspace 4) + ; GCN: [[COPY33:%[0-9]+]]:sreg_32_xm0 = COPY [[DEF9]].sub0 + ; GCN: [[S_MOV_B32_18:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 136 + ; GCN: [[COPY34:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: [[DEF10:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY34]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_18]], 136, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 272, align 1, addrspace 4) + ; GCN: [[COPY35:%[0-9]+]]:sreg_32_xm0 = COPY [[DEF10]].sub0 + ; GCN: [[S_MOV_B32_19:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 272 + ; GCN: [[COPY36:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: [[DEF11:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY36]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 272, align 1, addrspace 4) + ; GCN: [[COPY37:%[0-9]+]]:sreg_32_xm0 = COPY [[DEF11]].sub0 + ; GCN: [[DEF12:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; GCN: BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[COPY38:%[0-9]+]]:sreg_32_xm0 = COPY [[DEF12]].sub0 + ; GCN: [[COPY39:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: [[COPY40:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN: [[DEF13:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY39]], [[S_LOAD_DWORDX4_IMM]], [[COPY40]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[COPY41:%[0-9]+]]:sreg_32_xm0 = COPY [[DEF13]].sub0 + ; GCN: [[DEF14:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[COPY42:%[0-9]+]]:sreg_32_xm0 = COPY [[DEF14]].sub0 + ; GCN: [[DEF15:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[COPY43:%[0-9]+]]:sreg_32_xm0 = COPY [[DEF15]].sub0 + ; GCN: INLINEASM &"", 1 + ; GCN: [[COPY44:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN2]], [[COPY44]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 288, align 1, addrspace 4) + ; GCN: [[COPY45:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN3]], [[COPY45]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 144, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 288, align 1, addrspace 4) + ; GCN: [[S_MOV_B32_20:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 288 + ; GCN: [[COPY46:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN4]], [[COPY46]], 
[[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 288, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_BOTHEN_exact killed [[BUFFER_LOAD_DWORDX4_BOTHEN]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[COPY47:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: [[COPY48:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN5]], [[COPY47]], [[S_LOAD_DWORDX4_IMM]], [[COPY48]], 288, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: INLINEASM &"", 1 + ; GCN: [[COPY49:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]], [[COPY49]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 304, align 1, addrspace 4) + ; GCN: [[S_MOV_B32_21:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 152 + ; GCN: [[COPY50:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3]], [[COPY50]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_21]], 152, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 304, align 1, addrspace 4) + ; GCN: [[S_MOV_B32_22:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 304 + ; GCN: [[COPY51:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4]], [[COPY51]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 304, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[COPY52:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: [[COPY53:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5]], [[COPY52]], [[S_LOAD_DWORDX4_IMM]], [[COPY53]], 304, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: S_ENDPGM 0 +bb.0: + %tmp0 = load <4 x i32>, <4 x i32> addrspace(6)* 
%arg0, align 16, !invariant.load !0 + %buffer0 = call nsz <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %tmp0, i32 0, i32 16, i1 false, i1 false) #0 + %buffer1 = call nsz <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %tmp0, i32 0, i32 %arg1, i1 false, i1 false) #0 + %buffer2 = call nsz <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %tmp0, i32 1, i32 16, i1 false, i1 false) #0 + %buffer3 = call nsz <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %tmp0, i32 %arg1, i32 16, i1 false, i1 false) #0 + + ; Insert inline asm to keep the different instruction types from being mixed. This makes the output easier to read. + call void asm sideeffect "", "" () + + call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %buffer0, <4 x i32> %tmp0, i32 0, i32 32, i1 false, i1 false) #1 + call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %buffer1, <4 x i32> %tmp0, i32 0, i32 %arg1, i1 false, i1 false) #1 + call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %buffer2, <4 x i32> %tmp0, i32 1, i32 32, i1 false, i1 false) #1 + call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %buffer3, <4 x i32> %tmp0, i32 %arg1, i32 32, i1 false, i1 false) #1 + + call void asm sideeffect "", "" () + + %buffer_format0 = call nsz <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 0, i32 48, i1 false, i1 false) #0 + %buffer_format1 = call nsz <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 0, i32 %arg1, i1 false, i1 false) #0 + %buffer_format2 = call nsz <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 1, i32 48, i1 false, i1 false) #0 + %buffer_format3 = call nsz <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 %arg1, i32 48, i1 false, i1 false) #0 + + call void asm sideeffect "", "" () + + call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %buffer_format0, <4 x i32> %tmp0, i32 0, i32 64, i1 false, i1 false) #1 + call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %buffer_format1, <4 x i32> %tmp0, i32 0, i32 %arg1, i1 false, i1 false) #1 + call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %buffer_format2, <4 x i32> %tmp0, i32 1, i32 64, i1 false, i1 false) #1 + call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %buffer_format3, <4 x i32> %tmp0, i32 %arg1, i32 64, i1 false, i1 false) #1 + + call void asm sideeffect "", "" () + + %atomic_add0 = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 0, i32 80, i1 false) #2 + %atomic_add1 = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 0, i32 %arg1, i1 false) #2 + %atomic_add2 = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 1, i32 80, i1 false) #2 + %atomic_add3 = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 %arg1, i32 80, i1 false) #2 + + call void asm sideeffect "", "" () + + %atomic_cmpswap0 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 0, i32 96, i1 false) #2 + %atomic_cmpswap1 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 0, i32 %arg1, i1 false) #2 + %atomic_cmpswap2 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 1, i32 96, i1 false) #2 + %atomic_cmpswap3 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 %arg1, i32 96, i1 false) #2 + + call void asm sideeffect "", "" () + + call void @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, 
i32 0, i32 112, i1 false) #2 + call void @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 0, i32 %arg1, i1 false) #2 + call void @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 1, i32 112, i1 false) #2 + call void @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 %arg1, i32 112, i1 false) #2 + + call void asm sideeffect "", "" () + + ; rsrc, offset, soffset, cachepolicy + %raw_buffer0 = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %tmp0, i32 128, i32 0, i32 0) #0 + %raw_buffer1 = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %tmp0, i32 64, i32 64, i32 0) #0 + %raw_buffer2 = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %tmp0, i32 0, i32 128, i32 0) #0 + %raw_buffer3 = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %tmp0, i32 %arg1, i32 128, i32 0) #0 + %raw_buffer4 = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %tmp0, i32 128, i32 %arg1, i32 0) #0 + + call void asm sideeffect "", "" () + + %raw_buffer_format0 = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 144, i32 0, i32 0) #0 + %raw_buffer_format1 = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 72, i32 72, i32 0) #0 + %raw_buffer_format2 = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 0, i32 144, i32 0) #0 + %raw_buffer_format3 = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 %arg1, i32 144, i32 0) #0 + %raw_buffer_format4 = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 144, i32 %arg1, i32 0) #0 + + call void asm sideeffect "", "" () + + %raw_atomic_add0 = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 160, i32 0, i32 0) #2 + %raw_atomic_add1 = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 80, i32 80, i32 0) #2 + %raw_atomic_add2 = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 0, i32 160, i32 0) #2 + %raw_atomic_add3 = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 %arg1, i32 160, i32 0) #2 + %raw_atomic_add4 = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 160, i32 %arg1, i32 0) #2 + + call void asm sideeffect "", "" () + + %raw_atomic_cmpswap0 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 176, i32 0, i32 0) #2 + %raw_atomic_cmpswap1 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 88, i32 88, i32 0) #2 + %raw_atomic_cmpswap2 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 0, i32 176, i32 0) #2 + %raw_atomic_cmpswap3 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 %arg1, i32 176, i32 0) #2 + %raw_atomic_cmpswap4 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 176, i32 %arg1, i32 0) #2 + + call void asm sideeffect "", "" () + + call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %raw_buffer0, <4 x i32> %tmp0, i32 192, i32 0, i32 0) #2 + call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %raw_buffer1, <4 x i32> %tmp0, i32 96, i32 96, i32 0) #2 + call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %raw_buffer2, <4 x i32> %tmp0, i32 0, i32 192, i32 0) #2 + call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> 
%raw_buffer3, <4 x i32> %tmp0, i32 %arg1, i32 192, i32 0) #2 + call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %raw_buffer4, <4 x i32> %tmp0, i32 192, i32 %arg1, i32 0) #2 + + call void asm sideeffect "", "" () + + call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %raw_buffer_format0, <4 x i32> %tmp0, i32 208, i32 0, i32 0) #2 + call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %raw_buffer_format1, <4 x i32> %tmp0, i32 104, i32 104, i32 0) #2 + call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %raw_buffer_format2, <4 x i32> %tmp0, i32 0, i32 208, i32 0) #2 + call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %raw_buffer_format3, <4 x i32> %tmp0, i32 %arg1, i32 208, i32 0) #2 + call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %raw_buffer_format4, <4 x i32> %tmp0, i32 208, i32 %arg1, i32 0) #2 + + call void asm sideeffect "", "" () + + ; rsrc, vindex, offset, soffset, cachepolicy + %struct_buffer0 = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %tmp0, i32 0, i32 224, i32 0, i32 0) #0 + %struct_buffer1 = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %tmp0, i32 0, i32 112, i32 112, i32 0) #0 + %struct_buffer2 = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %tmp0, i32 0, i32 0, i32 224, i32 0) #0 + %struct_buffer3 = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %tmp0, i32 0, i32 %arg1, i32 224, i32 0) #0 + %struct_buffer4 = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %tmp0, i32 0, i32 224, i32 %arg1, i32 0) #0 + %struct_buffer5 = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %tmp0, i32 1, i32 224, i32 0, i32 0) #0 + %struct_buffer6 = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %tmp0, i32 %arg1, i32 224, i32 0, i32 0) #0 + + call void asm sideeffect "", "" () + + %struct_buffer_format0 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 0, i32 240, i32 0, i32 0) #0 + %struct_buffer_format1 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 0, i32 120, i32 120, i32 0) #0 + %struct_buffer_format2 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 0, i32 0, i32 240, i32 0) #0 + %struct_buffer_format3 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 0, i32 %arg1, i32 240, i32 0) #0 + %struct_buffer_format4 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 0, i32 240, i32 %arg1, i32 0) #0 + %struct_buffer_format5 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 1, i32 240, i32 0, i32 0) #0 + %struct_buffer_format6 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 %arg1, i32 240, i32 0, i32 0) #0 + + call void asm sideeffect "", "" () + + %struct_atomic_add0 = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 0, i32 256, i32 0, i32 0) #2 + %struct_atomic_add1 = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 0, i32 128, i32 128, i32 0) #2 + %struct_atomic_add2 = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 0, i32 0, i32 256, i32 0) #2 + %struct_atomic_add3 = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 0, i32 %arg1, i32 256, i32 0) #2 + %struct_atomic_add4 = call i32 
@llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 0, i32 256, i32 %arg1, i32 0) #2 + %struct_atomic_add5 = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 1, i32 256, i32 0, i32 0) #2 + %struct_atomic_add6 = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 %arg1, i32 256, i32 0, i32 0) #2 + + call void asm sideeffect "", "" () + + %struct_atomic_cmpswap0 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 0, i32 272, i32 0, i32 0) #2 + %struct_atomic_cmpswap1 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 0, i32 136, i32 136, i32 0) #2 + %struct_atomic_cmpswap2 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 0, i32 0, i32 272, i32 0) #2 + %struct_atomic_cmpswap3 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 0, i32 %arg1, i32 272, i32 0) #2 + %struct_atomic_cmpswap4 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 0, i32 272, i32 %arg1, i32 0) #2 + %struct_atomic_cmpswap5 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 1, i32 272, i32 0, i32 0) #2 + %struct_atomic_cmpswap6 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 %arg1, i32 272, i32 0, i32 0) #2 + + call void asm sideeffect "", "" () + + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %struct_buffer0, <4 x i32> %tmp0, i32 0, i32 288, i32 0, i32 0) #2 + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %struct_buffer1, <4 x i32> %tmp0, i32 0, i32 144, i32 144, i32 0) #2 + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %struct_buffer2, <4 x i32> %tmp0, i32 0, i32 0, i32 288, i32 0) #2 + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %struct_buffer3, <4 x i32> %tmp0, i32 0, i32 %arg1, i32 288, i32 0) #2 + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %struct_buffer4, <4 x i32> %tmp0, i32 0, i32 288, i32 %arg1, i32 0) #2 + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %struct_buffer5, <4 x i32> %tmp0, i32 1, i32 288, i32 0, i32 0) #2 + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %struct_buffer6, <4 x i32> %tmp0, i32 %arg1, i32 288, i32 0, i32 0) #2 + + call void asm sideeffect "", "" () + + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %struct_buffer_format0, <4 x i32> %tmp0, i32 0, i32 304, i32 0, i32 0) #2 + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %struct_buffer_format1, <4 x i32> %tmp0, i32 0, i32 152, i32 152, i32 0) #2 + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %struct_buffer_format2, <4 x i32> %tmp0, i32 0, i32 0, i32 304, i32 0) #2 + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %struct_buffer_format3, <4 x i32> %tmp0, i32 0, i32 %arg1, i32 304, i32 0) #2 + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %struct_buffer_format4, <4 x i32> %tmp0, i32 0, i32 304, i32 %arg1, i32 0) #2 + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %struct_buffer_format5, <4 x i32> %tmp0, i32 1, i32 304, i32 0, i32 0) #2 + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %struct_buffer_format6, <4 x i32> %tmp0, i32 %arg1, i32 304, i32 0, i32 0) #2 + + 
ret void +} + +declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0 +declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1 +declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #0 +declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1 +declare i32 @llvm.amdgcn.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i1) #2 +declare i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i1) #2 +declare void @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1) #2 +declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #0 +declare <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i32) #2 +declare i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32, i32, <4 x i32>, i32, i32, i32) #2 +declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #2 +declare void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #2 + +declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i32, i32) #2 +declare i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32, i32, <4 x i32>, i32, i32, i32, i32) #2 +declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #2 +declare void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #2 + +attributes #0 = { nounwind readonly } +attributes #1 = { nounwind writeonly } +attributes #2 = { nounwind } + +!0 = !{}
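Note (not part of the patch): the MMO offset set by this change is only meaningful when voffset, soffset and the immediate offset are all constants and, for the struct/idxen forms, when vindex is known to be zero; in every other case the offset stays 0. That is why only some memory operands in the test above print as `custom TargetCustom7 + N`, for example the raw load with voffset 64 and soffset 64 still prints `TargetCustom7 + 128`, while any variable or non-zero vindex falls back to plain `TargetCustom7`. A minimal standalone sketch of that decision, using std::optional<int64_t> in place of constant SDValue operands (the function name and signature below are illustrative only, not LLVM API):

// Illustrative sketch: mirrors the logic of getBufferOffsetForMMO in the
// patch, with std::optional standing in for "operand is a ConstantSDNode".
#include <cstdint>
#include <optional>

static uint64_t combinedMMOOffset(std::optional<int64_t> VOffset,
                                  std::optional<int64_t> SOffset,
                                  std::optional<int64_t> InstOffset,
                                  const std::optional<int64_t> *VIndex = nullptr) {
  // Any non-constant component means no usable constant offset.
  if (!VOffset || !SOffset || !InstOffset)
    return 0;
  // For the struct (idxen) forms the offset is only usable when vindex is a
  // known zero, since a non-zero index adds an unknown stride multiple.
  if (VIndex && (!*VIndex || **VIndex != 0))
    return 0;
  return uint64_t(*VOffset + *SOffset + *InstOffset);
}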