Index: lib/Target/AMDGPU/AMDGPU.td =================================================================== --- lib/Target/AMDGPU/AMDGPU.td +++ lib/Target/AMDGPU/AMDGPU.td @@ -721,6 +721,10 @@ def HasD16LoadStore : Predicate<"Subtarget->hasD16LoadStore()">, AssemblerPredicate<"FeatureGFX9Insts">; + +def LDSRequiresM0Init : Predicate<"Subtarget->ldsRequiresM0Init()">; +def NotLDSRequiresM0Init : Predicate<"!Subtarget->ldsRequiresM0Init()">; + def HasDSAddTid : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">, AssemblerPredicate<"FeatureGFX9Insts">; Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -337,7 +337,8 @@ } SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const { - if (cast(N)->getAddressSpace() != AMDGPUASI.LOCAL_ADDRESS) + if (cast(N)->getAddressSpace() != AMDGPUASI.LOCAL_ADDRESS || + !Subtarget->ldsRequiresM0Init()) return N; const SITargetLowering& Lowering = Index: lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.h +++ lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -462,6 +462,12 @@ return getGeneration() >= GFX9; } + /// Return true if most LDS instructions have an m0 use that requires m0 to be + /// initialized. + bool ldsRequiresM0Init() const { + return getGeneration() < GFX9; + } + bool hasAddNoCarry() const { return AddNoCarryInsts; } Index: lib/Target/AMDGPU/DSInstructions.td =================================================================== --- lib/Target/AMDGPU/DSInstructions.td +++ lib/Target/AMDGPU/DSInstructions.td @@ -600,6 +600,20 @@ (inst $ptr, (as_i16imm $offset), (i1 0)) >; +// FIXME: Passing name of PatFrag in workaround. Why doesn't +// !cast(frag.NAME#"_m0") work!? 
+multiclass DSReadPat_mc { + + let OtherPredicates = [LDSRequiresM0Init] in { + def : DSReadPat(frag#"_m0")>; + } + + let OtherPredicates = [NotLDSRequiresM0Init] in { + def : DSReadPat(inst.NAME#"_gfx9"), vt, !cast(frag)>; + } +} + + multiclass DSReadPat_Hi16 { def : GCNPat < (build_vector vt:$lo, (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset)))), @@ -624,30 +638,22 @@ >; } - -def : DSReadPat ; -def : DSReadPat ; -def : DSReadPat ; -def : DSReadPat ; -def : DSReadPat ; -def : DSReadPat ; -def : DSReadPat ; -def : DSReadPat ; -def : DSReadPat ; +defm : DSReadPat_mc ; +defm : DSReadPat_mc ; +defm : DSReadPat_mc ; +defm : DSReadPat_mc ; +defm : DSReadPat_mc ; +defm : DSReadPat_mc ; +defm : DSReadPat_mc ; +defm : DSReadPat_mc ; +defm : DSReadPat_mc ; let AddedComplexity = 100 in { -def : DSReadPat ; +defm : DSReadPat_mc ; } // End AddedComplexity = 100 -def : GCNPat < - (v2i32 (load_local_m0 (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, - i8:$offset1))), - (DS_READ2_B32 $ptr, $offset0, $offset1, (i1 0)) ->; - - let OtherPredicates = [HasD16LoadStore] in { let AddedComplexity = 100 in { defm : DSReadPat_Hi16; @@ -666,71 +672,119 @@ (inst $ptr, $value, (as_i16imm $offset), (i1 0)) >; -def : DSWritePat ; -def : DSWritePat ; -def : DSWritePat ; -def : DSWritePat ; -def : DSWritePat ; +multiclass DSWritePat_mc { + let OtherPredicates = [LDSRequiresM0Init] in { + def : DSWritePat(frag#"_m0")>; + } + + let OtherPredicates = [NotLDSRequiresM0Init] in { + def : DSWritePat(inst.NAME#"_gfx9"), vt, !cast(frag)>; + } +} + +defm : DSWritePat_mc ; +defm : DSWritePat_mc ; +defm : DSWritePat_mc ; +defm : DSWritePat_mc ; +defm : DSWritePat_mc ; let OtherPredicates = [HasD16LoadStore] in { def : DSWritePat ; def : DSWritePat ; } -let AddedComplexity = 100 in { -def : DSWritePat ; -} // End AddedComplexity = 100 +class DS64Bit4ByteAlignedReadPat : GCNPat < + (v2i32 (frag (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1))), + (inst $ptr, $offset0, $offset1, (i1 0)) +>; -def : 
GCNPat < - (store_local_m0 v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, - i8:$offset1)), - (DS_WRITE2_B32 $ptr, (i32 (EXTRACT_SUBREG $value, sub0)), - (i32 (EXTRACT_SUBREG $value, sub1)), $offset0, $offset1, - (i1 0)) +class DS64Bit4ByteAlignedWritePat : GCNPat< + (frag v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1)), + (inst $ptr, (i32 (EXTRACT_SUBREG $value, sub0)), + (i32 (EXTRACT_SUBREG $value, sub1)), $offset0, $offset1, + (i1 0)) >; +let OtherPredicates = [LDSRequiresM0Init] in { +def : DS64Bit4ByteAlignedReadPat; +def : DS64Bit4ByteAlignedWritePat; +} + +let OtherPredicates = [NotLDSRequiresM0Init] in { +def : DS64Bit4ByteAlignedReadPat; +def : DS64Bit4ByteAlignedWritePat; +} + + +let AddedComplexity = 100 in { + +defm : DSWritePat_mc ; +} // End AddedComplexity = 100 class DSAtomicRetPat : GCNPat < (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value), (inst $ptr, $value, (as_i16imm $offset), (i1 0)) >; +multiclass DSAtomicRetPat_mc { + let OtherPredicates = [LDSRequiresM0Init] in { + def : DSAtomicRetPat(frag#"_m0")>; + } + + let OtherPredicates = [NotLDSRequiresM0Init] in { + def : DSAtomicRetPat(inst.NAME#"_gfx9"), vt, !cast(frag)>; + } +} + + + class DSAtomicCmpXChg : GCNPat < (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap), (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 0)) >; +multiclass DSAtomicCmpXChg_mc { + let OtherPredicates = [LDSRequiresM0Init] in { + def : DSAtomicCmpXChg(frag#"_m0")>; + } + + let OtherPredicates = [NotLDSRequiresM0Init] in { + def : DSAtomicCmpXChg(inst.NAME#"_gfx9"), vt, !cast(frag)>; + } +} + + // 32-bit atomics. 
-def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicCmpXChg; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicCmpXChg_mc; // 64-bit atomics. -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; - -def : DSAtomicCmpXChg; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; + +defm : DSAtomicCmpXChg_mc; //===----------------------------------------------------------------------===// // Real instructions Index: lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.td +++ lib/Target/AMDGPU/SIInstrInfo.td @@ -161,6 +161,9 @@ defm atomic_inc_global : global_binary_atomic_op; defm atomic_dec_global : global_binary_atomic_op; +def atomic_inc_local : local_binary_atomic_op; +def atomic_dec_local : local_binary_atomic_op; + //===----------------------------------------------------------------------===// // SDNodes PatFrags for loads/stores with a glue input. 
// This is for SDNodes and PatFrag for local loads and stores to Index: lib/Target/AMDGPU/SILoadStoreOptimizer.cpp =================================================================== --- lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -112,7 +112,13 @@ static bool offsetsCanBeCombined(CombineInfo &CI); bool findMatchingInst(CombineInfo &CI); + + unsigned read2Opcode(unsigned EltSize) const; + unsigned read2ST64Opcode(unsigned EltSize) const; MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI); + + unsigned write2Opcode(unsigned EltSize) const; + unsigned write2ST64Opcode(unsigned EltSize) const; MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI); MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI); MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI); @@ -435,6 +441,20 @@ return false; } +unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const { + if (STM->ldsRequiresM0Init()) + return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64; + return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9; +} + +unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const { + if (STM->ldsRequiresM0Init()) + return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; + + return (EltSize == 4) ? + AMDGPU::DS_READ2ST64_B32_gfx9 : AMDGPU::DS_READ2ST64_B64_gfx9; +} + MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( CombineInfo &CI) { MachineBasicBlock *MBB = CI.I->getParent(); @@ -448,12 +468,8 @@ unsigned NewOffset0 = CI.Offset0; unsigned NewOffset1 = CI.Offset1; - unsigned Opc = (CI.EltSize == 4) ? AMDGPU::DS_READ2_B32 - : AMDGPU::DS_READ2_B64; - - if (CI.UseST64) - Opc = (CI.EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 - : AMDGPU::DS_READ2ST64_B64; + unsigned Opc = CI.UseST64 ? + read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize); unsigned SubRegIdx0 = (CI.EltSize == 4) ? 
AMDGPU::sub0 : AMDGPU::sub0_sub1; unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; @@ -516,6 +532,20 @@ return Next; } +unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { + if (STM->ldsRequiresM0Init()) + return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; + return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 : AMDGPU::DS_WRITE2_B64_gfx9; +} + +unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { + if (STM->ldsRequiresM0Init()) + return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64; + + return (EltSize == 4) ? + AMDGPU::DS_WRITE2ST64_B32_gfx9 : AMDGPU::DS_WRITE2ST64_B64_gfx9; +} + MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( CombineInfo &CI) { MachineBasicBlock *MBB = CI.I->getParent(); @@ -529,12 +559,8 @@ unsigned NewOffset0 = CI.Offset0; unsigned NewOffset1 = CI.Offset1; - unsigned Opc = (CI.EltSize == 4) ? AMDGPU::DS_WRITE2_B32 - : AMDGPU::DS_WRITE2_B64; - - if (CI.UseST64) - Opc = (CI.EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 - : AMDGPU::DS_WRITE2ST64_B64; + unsigned Opc = CI.UseST64 ? + write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize); if (NewOffset0 > NewOffset1) { // Canonicalize the merged instruction so the smaller offset comes first. @@ -785,9 +811,13 @@ CombineInfo CI; CI.I = I; unsigned Opc = MI.getOpcode(); - if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) { + if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64 || + Opc == AMDGPU::DS_READ_B32_gfx9 || Opc == AMDGPU::DS_READ_B64_gfx9) { + CI.InstClass = DS_READ_WRITE; - CI.EltSize = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4; + CI.EltSize = + (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 
8 : 4; + if (findMatchingInst(CI)) { Modified = true; I = mergeRead2Pair(CI); @@ -796,10 +826,13 @@ } continue; - } - if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) { + } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64 || + Opc == AMDGPU::DS_WRITE_B32_gfx9 || + Opc == AMDGPU::DS_WRITE_B64_gfx9) { CI.InstClass = DS_READ_WRITE; - CI.EltSize = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4; + CI.EltSize + = (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 : 4; + if (findMatchingInst(CI)) { Modified = true; I = mergeWrite2Pair(CI); Index: test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll =================================================================== --- test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll +++ test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll @@ -1,13 +1,17 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SICI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SICI -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI,SICI,SICIVI,GCN %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SICI,CIVI,SICIVI,GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,CIVI,SICIVI,GFX89,GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX89,GCN %s -; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_offset: -; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SICI: s_load_dword 
[[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 -; GCN: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7 +; GCN-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_offset: +; GFX9-NOT: m0 +; SICIVI-DAG: s_mov_b32 m0 + +; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; GFX89-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7 ; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] ; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]] ; GCN: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16 @@ -20,18 +24,21 @@ ret void } -; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i64_offset: +; GCN-LABEL: {{^}}lds_atomic_cmpxchg_ret_i64_offset: +; GFX9-NOT: m0 +; SICIVI-DAG: s_mov_b32 m0 + ; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb ; SICI-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd -; VI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34 +; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; GFX89-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34 ; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7 ; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0 ; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] ; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]] ; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]] ; GCN: ds_cmpst_rtn_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32 -; GCN: buffer_store_dwordx2 
[[RESULT]], +; GCN: [[RESULT]] ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr, i64 %swap) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 @@ -41,9 +48,11 @@ ret void } -; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_bad_si_offset +; GCN-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_bad_si_offset +; GFX9-NOT: m0 ; SI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; CIVI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GFX9: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap, i32 %a, i32 %b) nounwind { %sub = sub i32 %a, %b @@ -55,11 +64,15 @@ ret void } -; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_noret_i32_offset: -; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9 -; SICI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xa -; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24 -; VI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28 +; GCN-LABEL: {{^}}lds_atomic_cmpxchg_noret_i32_offset: +; GFX9-NOT: m0 +; SICIVI-DAG: s_mov_b32 m0 + + +; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9 +; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xa +; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24 +; GFX89-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28 ; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7 ; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] ; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]] @@ -72,11 +85,14 @@ ret void } -; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_noret_i64_offset: -; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9 -; SICI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb -; 
VI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24 -; VI-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; GCN-LABEL: {{^}}lds_atomic_cmpxchg_noret_i64_offset: +; GFX9-NOT: m0 +; SICIVI-DAG: s_mov_b32 m0 + +; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9 +; SICI-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb +; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24 +; GFX89-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7 ; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0 ; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] Index: test/CodeGen/AMDGPU/atomic_load_add.ll =================================================================== --- test/CodeGen/AMDGPU/atomic_load_add.ll +++ test/CodeGen/AMDGPU/atomic_load_add.ll @@ -1,18 +1,24 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=R600,FUNC %s ; FUNC-LABEL: {{^}}atomic_add_local: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 ; R600: LDS_ADD * -; SI: ds_add_u32 +; GCN: ds_add_u32 define amdgpu_kernel void @atomic_add_local(i32 addrspace(3)* %local) { %unused = atomicrmw volatile 
add i32 addrspace(3)* %local, i32 5 seq_cst ret void } ; FUNC-LABEL: {{^}}atomic_add_local_const_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; R600: LDS_ADD * -; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 define amdgpu_kernel void @atomic_add_local_const_offset(i32 addrspace(3)* %local) { %gep = getelementptr i32, i32 addrspace(3)* %local, i32 4 %val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst @@ -20,8 +26,11 @@ } ; FUNC-LABEL: {{^}}atomic_add_ret_local: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; R600: LDS_ADD_RET * -; SI: ds_add_rtn_u32 +; GCN: ds_add_rtn_u32 define amdgpu_kernel void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { %val = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst store i32 %val, i32 addrspace(1)* %out @@ -29,8 +38,11 @@ } ; FUNC-LABEL: {{^}}atomic_add_ret_local_const_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; R600: LDS_ADD_RET * -; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20 +; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20 define amdgpu_kernel void @atomic_add_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { %gep = getelementptr i32, i32 addrspace(3)* %local, i32 5 %val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst Index: test/CodeGen/AMDGPU/atomic_load_sub.ll =================================================================== --- test/CodeGen/AMDGPU/atomic_load_sub.ll +++ test/CodeGen/AMDGPU/atomic_load_sub.ll @@ -1,18 +1,25 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | 
FileCheck -enable-var-scope -check-prefixes=GCN,SICIVI,FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SICIVI,FUNC %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefixes=R600,FUNC %s ; FUNC-LABEL: {{^}}atomic_sub_local: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; R600: LDS_SUB * -; SI: ds_sub_u32 +; GCN: ds_sub_u32 define amdgpu_kernel void @atomic_sub_local(i32 addrspace(3)* %local) { %unused = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst ret void } ; FUNC-LABEL: {{^}}atomic_sub_local_const_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; R600: LDS_SUB * -; SI: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 define amdgpu_kernel void @atomic_sub_local_const_offset(i32 addrspace(3)* %local) { %gep = getelementptr i32, i32 addrspace(3)* %local, i32 4 %val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst @@ -20,8 +27,11 @@ } ; FUNC-LABEL: {{^}}atomic_sub_ret_local: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; R600: LDS_SUB_RET * -; SI: ds_sub_rtn_u32 +; GCN: ds_sub_rtn_u32 define amdgpu_kernel void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { %val = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst store i32 %val, i32 addrspace(1)* %out @@ -29,8 +39,11 @@ } ; FUNC-LABEL: {{^}}atomic_sub_ret_local_const_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; R600: LDS_SUB_RET * -; SI: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20 +; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20 define amdgpu_kernel void @atomic_sub_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { %gep = getelementptr i32, i32 
addrspace(3)* %local, i32 5 %val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst Index: test/CodeGen/AMDGPU/ds_read2.ll =================================================================== --- test/CodeGen/AMDGPU/ds_read2.ll +++ test/CodeGen/AMDGPU/ds_read2.ll @@ -1,4 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -strict-whitespace -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -strict-whitespace -check-prefixes=GCN,CI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -strict-whitespace -check-prefixes=GCN,GFX9 %s ; FIXME: We don't get cases where the address was an SGPR because we ; get a copy to the address register for each one. @@ -6,12 +7,16 @@ @lds = addrspace(3) global [512 x float] undef, align 4 @lds.f64 = addrspace(3) global [512 x double] undef, align 8 -; SI-LABEL: @simple_read2_f32 -; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:8 -; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]] -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm +; GCN-LABEL: {{^}}simple_read2_f32: +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:8 +; GCN: s_waitcnt lgkmcnt(0) +; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]] +; CI: buffer_store_dword [[RESULT]] +; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GCN: s_endpgm define amdgpu_kernel void @simple_read2_f32(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i @@ -25,12 +30,16 @@ ret void } -; SI-LABEL: @simple_read2_f32_max_offset -; SI: ds_read2_b32 
v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:255 -; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]] -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm +; GCN-LABEL: {{^}}simple_read2_f32_max_offset: +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:255 +; GCN: s_waitcnt lgkmcnt(0) +; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]] + +; CI: buffer_store_dword [[RESULT]] +; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i @@ -44,11 +53,14 @@ ret void } -; SI-LABEL: @simple_read2_f32_too_far -; SI-NOT ds_read2_b32 -; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} -; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028 -; SI: s_endpgm +; GCN-LABEL: @simple_read2_f32_too_far +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-NOT: ds_read2_b32 +; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} +; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028 +; GCN: s_endpgm define amdgpu_kernel void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i @@ -62,10 +74,13 @@ ret void } -; SI-LABEL: @simple_read2_f32_x2 -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8 -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27 -; SI: s_endpgm +; GCN-LABEL: @simple_read2_f32_x2 +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8 +; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27 +; GCN: 
s_endpgm define amdgpu_kernel void @simple_read2_f32_x2(float addrspace(1)* %out) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 0 @@ -93,11 +108,14 @@ } ; Make sure there is an instruction between the two sets of reads. -; SI-LABEL: @simple_read2_f32_x2_barrier -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8 -; SI: s_barrier -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27 -; SI: s_endpgm +; GCN-LABEL: @simple_read2_f32_x2_barrier +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8 +; GCN: s_barrier +; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27 +; GCN: s_endpgm define amdgpu_kernel void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 0 @@ -129,10 +147,13 @@ ; For some reason adding something to the base address for the first ; element results in only folding the inner pair. -; SI-LABEL: @simple_read2_f32_x2_nonzero_base -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset0:2 offset1:8 -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27 -; SI: s_endpgm +; GCN-LABEL: @simple_read2_f32_x2_nonzero_base +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset0:2 offset1:8 +; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27 +; GCN: s_endpgm define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 @@ -165,11 +186,14 @@ ; Base pointers come from different subregister of same super ; register. We can't safely merge this. 
-; SI-LABEL: @read2_ptr_is_subreg_arg_f32 -; SI-NOT: ds_read2_b32 -; SI: ds_read_b32 -; SI: ds_read_b32 -; SI: s_endpgm +; GCN-LABEL: @read2_ptr_is_subreg_arg_f32 +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-NOT: ds_read2_b32 +; GCN: ds_read_b32 +; GCN: ds_read_b32 +; GCN: s_endpgm define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0 @@ -191,11 +215,14 @@ ; sure we are really rejecting it because of the different ; subregisters. -; SI-LABEL: @read2_ptr_is_subreg_arg_offset_f32 -; SI-NOT: ds_read2_b32 -; SI: ds_read_b32 -; SI: ds_read_b32 -; SI: s_endpgm +; GCN-LABEL: @read2_ptr_is_subreg_arg_offset_f32 +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-NOT: ds_read2_b32 +; GCN: ds_read_b32 +; GCN: ds_read_b32 +; GCN: s_endpgm define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0 @@ -216,9 +243,12 @@ ret void } -; SI-LABEL: {{^}}read2_ptr_is_subreg_f32: -; SI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:8{{$}} -; SI: s_endpgm +; GCN-LABEL: {{^}}read2_ptr_is_subreg_f32: +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:8{{$}} +; GCN: s_endpgm define amdgpu_kernel void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %ptr.0 = insertelement <2 x [512 x float] addrspace(3)*> undef, [512 x float] addrspace(3)* @lds, i32 0 @@ -238,11 +268,14 @@ ret void } -; SI-LABEL: @simple_read2_f32_volatile_0 -; SI-NOT ds_read2_b32 -; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} -; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32 -; SI: s_endpgm +; GCN-LABEL: @simple_read2_f32_volatile_0 +; CI-DAG: 
s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-NOT: ds_read2_b32 +; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} +; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32 +; GCN: s_endpgm define amdgpu_kernel void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i @@ -256,11 +289,14 @@ ret void } -; SI-LABEL: @simple_read2_f32_volatile_1 -; SI-NOT ds_read2_b32 -; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} -; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32 -; SI: s_endpgm +; GCN-LABEL: @simple_read2_f32_volatile_1 +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-NOT: ds_read2_b32 +; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} +; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32 +; GCN: s_endpgm define amdgpu_kernel void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i @@ -277,9 +313,12 @@ ; Can't fold since not correctly aligned. ; XXX: This isn't really testing anything useful now. I think CI ; allows unaligned LDS accesses, which would be a problem here. 
-; SI-LABEL: @unaligned_read2_f32 -; SI-NOT: ds_read2_b32 -; SI: s_endpgm +; GCN-LABEL: @unaligned_read2_f32 +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-NOT: ds_read2_b32 +; GCN: s_endpgm define amdgpu_kernel void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i @@ -293,9 +332,12 @@ ret void } -; SI-LABEL: @misaligned_2_simple_read2_f32 -; SI-NOT: ds_read2_b32 -; SI: s_endpgm +; GCN-LABEL: @misaligned_2_simple_read2_f32 +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-NOT: ds_read2_b32 +; GCN: s_endpgm define amdgpu_kernel void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i @@ -309,12 +351,16 @@ ret void } -; SI-LABEL: @simple_read2_f64 -; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, {{v[0-9]+}} -; SI: ds_read2_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, [[VPTR]] offset1:8 -; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} -; SI: buffer_store_dwordx2 [[RESULT]] -; SI: s_endpgm +; GCN-LABEL: @simple_read2_f64 +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, {{v[0-9]+}} +; GCN: ds_read2_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, [[VPTR]] offset1:8 +; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} + +; CI: buffer_store_dwordx2 [[RESULT]] +; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @simple_read2_f64(double addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i @@ -328,9 
+374,12 @@ ret void } -; SI-LABEL: @simple_read2_f64_max_offset -; SI: ds_read2_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:255 -; SI: s_endpgm +; GCN-LABEL: @simple_read2_f64_max_offset +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN: ds_read2_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:255 +; GCN: s_endpgm define amdgpu_kernel void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i @@ -344,11 +393,14 @@ ret void } -; SI-LABEL: @simple_read2_f64_too_far -; SI-NOT ds_read2_b64 -; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} -; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:2056 -; SI: s_endpgm +; GCN-LABEL: @simple_read2_f64_too_far +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-NOT: ds_read2_b64 +; GCN: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} +; GCN: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:2056 +; GCN: s_endpgm define amdgpu_kernel void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i @@ -363,10 +415,13 @@ } ; Alignment only 4 -; SI-LABEL: @misaligned_read2_f64 -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1 -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:14 offset1:15 -; SI: s_endpgm +; GCN-LABEL: @misaligned_read2_f64 +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1 +; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:14 offset1:15 +; GCN: s_endpgm define amdgpu_kernel void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds,
i32 %x.i @@ -382,9 +437,12 @@ @foo = addrspace(3) global [4 x i32] undef, align 4 -; SI-LABEL: @load_constant_adjacent_offsets -; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1 +; GCN-LABEL: @load_constant_adjacent_offsets +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1 define amdgpu_kernel void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) { %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4 @@ -393,9 +451,12 @@ ret void } -; SI-LABEL: @load_constant_disjoint_offsets -; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:2 +; GCN-LABEL: @load_constant_disjoint_offsets +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:2 define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) { %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4 @@ -406,10 +467,13 @@ @bar = addrspace(3) global [4 x i64] undef, align 4 -; SI-LABEL: @load_misaligned64_constant_offsets -; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1 -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:2 offset1:3 +; GCN-LABEL: @load_misaligned64_constant_offsets +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] 
offset1:1 +; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:2 offset1:3 define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) { %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4 %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4 @@ -420,12 +484,15 @@ @bar.large = addrspace(3) global [4096 x i64] undef, align 4 -; SI-LABEL: @load_misaligned64_constant_large_offsets -; SI-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}} -; SI-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000 -; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE0]] offset1:1 -; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE1]] offset1:1 -; SI: s_endpgm +; GCN-LABEL: @load_misaligned64_constant_large_offsets +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}} +; GCN-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000 +; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE0]] offset1:1 +; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE1]] offset1:1 +; GCN: s_endpgm define amdgpu_kernel void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) { %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4 %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4 @@ -437,6 +504,10 @@ @sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4 @sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4 +; GCN-LABEL: {{^}}sgemm_inner_loop_read2_sequence: +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb) #0 { %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1 %y.i 
= tail call i32 @llvm.amdgcn.workitem.id.y() #1 @@ -481,20 +552,29 @@ ret void } +; GCN-LABEL: {{^}}misaligned_read2_v2i32: +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void @misaligned_read2_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(3)* %in) #0 { %load = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 4 store <2 x i32> %load, <2 x i32> addrspace(1)* %out, align 8 ret void } +; GCN-LABEL: {{^}}misaligned_read2_i64: +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %in) #0 { %load = load i64, i64 addrspace(3)* %in, align 4 store i64 %load, i64 addrspace(1)* %out, align 8 ret void } -; SI-LABEL: ds_read_diff_base_interleaving -; SI-NOT: ds_read_b32 +; GCN-LABEL: ds_read_diff_base_interleaving +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-NOT: ds_read_b32 define amdgpu_kernel void @ds_read_diff_base_interleaving( float addrspace(1)* nocapture %arg, [4 x [4 x float]] addrspace(3)* %arg1, @@ -533,19 +613,10 @@ ret void } -; Function Attrs: nounwind readnone declare i32 @llvm.amdgcn.workgroup.id.x() #1 - -; Function Attrs: nounwind readnone declare i32 @llvm.amdgcn.workgroup.id.y() #1 - -; Function Attrs: nounwind readnone declare i32 @llvm.amdgcn.workitem.id.x() #1 - -; Function Attrs: nounwind readnone declare i32 @llvm.amdgcn.workitem.id.y() #1 - -; Function Attrs: convergent nounwind declare void @llvm.amdgcn.s.barrier() #2 attributes #0 = { nounwind } Index: test/CodeGen/AMDGPU/ds_read2st64.ll =================================================================== --- test/CodeGen/AMDGPU/ds_read2st64.ll +++ test/CodeGen/AMDGPU/ds_read2st64.ll @@ -1,15 +1,19 @@ -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefixes=GCN,CI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 
-verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefixes=GCN,GFX9 %s @lds = addrspace(3) global [512 x float] undef, align 4 @lds.f64 = addrspace(3) global [512 x double] undef, align 8 -; SI-LABEL: @simple_read2st64_f32_0_1 -; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1 -; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]] -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm +; GCN-LABEL: @simple_read2st64_f32_0_1 +; CI: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1 +; GCN: s_waitcnt lgkmcnt(0) +; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]] +; CI: buffer_store_dword [[RESULT]] +; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i @@ -23,12 +27,15 @@ ret void } -; SI-LABEL: @simple_read2st64_f32_1_2 -; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2 -; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]] -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm +; GCN-LABEL: @simple_read2st64_f32_1_2 +; CI: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2 +; GCN: s_waitcnt lgkmcnt(0) +; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]] +; CI: buffer_store_dword [[RESULT]] +; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %add.x.0 = add nsw 
i32 %x.i, 64 @@ -43,12 +50,15 @@ ret void } -; SI-LABEL: @simple_read2st64_f32_max_offset -; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:255 -; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]] -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm +; GCN-LABEL: @simple_read2st64_f32_max_offset +; CI: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:255 +; GCN: s_waitcnt lgkmcnt(0) +; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]] +; CI: buffer_store_dword [[RESULT]] +; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %add.x.0 = add nsw i32 %x.i, 64 @@ -63,12 +73,15 @@ ret void } -; SI-LABEL: @simple_read2st64_f32_over_max_offset -; SI-NOT: ds_read2st64_b32 -; SI-DAG: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}} -; SI-DAG: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256 -; SI-DAG: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]{{$}} -; SI: s_endpgm +; GCN-LABEL: @simple_read2st64_f32_over_max_offset +; CI: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-NOT: ds_read2st64_b32 +; GCN-DAG: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}} +; GCN-DAG: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256 +; GCN-DAG: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]{{$}} +; GCN: s_endpgm define amdgpu_kernel void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %add.x.0 = add nsw i32 %x.i, 64 @@ -83,9 +96,12 @@ ret void } -; SI-LABEL: @odd_invalid_read2st64_f32_0 -; SI-NOT: ds_read2st64_b32 -; SI: s_endpgm +; GCN-LABEL: @odd_invalid_read2st64_f32_0 +; CI: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-NOT: 
ds_read2st64_b32 +; GCN: s_endpgm define amdgpu_kernel void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i @@ -99,9 +115,12 @@ ret void } -; SI-LABEL: @odd_invalid_read2st64_f32_1 -; SI-NOT: ds_read2st64_b32 -; SI: s_endpgm +; GCN-LABEL: @odd_invalid_read2st64_f32_1 +; CI: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-NOT: ds_read2st64_b32 +; GCN: s_endpgm define amdgpu_kernel void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %add.x.0 = add nsw i32 %x.i, 64 @@ -116,12 +135,15 @@ ret void } -; SI-LABEL: @simple_read2st64_f64_0_1 -; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1 -; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} -; SI: buffer_store_dwordx2 [[RESULT]] -; SI: s_endpgm +; GCN-LABEL: @simple_read2st64_f64_0_1 +; CI: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1 +; GCN: s_waitcnt lgkmcnt(0) +; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} +; CI: buffer_store_dwordx2 [[RESULT]] +; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i @@ -135,12 +157,16 @@ ret void } -; SI-LABEL: @simple_read2st64_f64_1_2 -; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2 -; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], 
v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} -; SI: buffer_store_dwordx2 [[RESULT]] -; SI: s_endpgm +; GCN-LABEL: @simple_read2st64_f64_1_2 +; CI: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2 +; GCN: s_waitcnt lgkmcnt(0) +; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} + +; CI: buffer_store_dwordx2 [[RESULT]] +; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %add.x.0 = add nsw i32 %x.i, 64 @@ -157,10 +183,13 @@ ; Alignment only -; SI-LABEL: @misaligned_read2st64_f64 -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1 -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:128 offset1:129 -; SI: s_endpgm +; GCN-LABEL: @misaligned_read2st64_f64 +; CI: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1 +; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:128 offset1:129 +; GCN: s_endpgm define amdgpu_kernel void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i @@ -175,12 +204,16 @@ } ; The maximum is not the usual 0xff because 0xff * 8 * 64 > 0xffff -; SI-LABEL: @simple_read2st64_f64_max_offset -; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:4 offset1:127 -; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} -; SI: buffer_store_dwordx2 [[RESULT]] -; SI: s_endpgm +; GCN-LABEL: @simple_read2st64_f64_max_offset +; CI: s_mov_b32 m0 +; 
GFX9-NOT: m0 + +; GCN: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:4 offset1:127 +; GCN: s_waitcnt lgkmcnt(0) +; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} + +; CI: buffer_store_dwordx2 [[RESULT]] +; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %add.x.0 = add nsw i32 %x.i, 256 @@ -195,12 +228,15 @@ ret void } -; SI-LABEL: @simple_read2st64_f64_over_max_offset -; SI-NOT: ds_read2st64_b64 -; SI-DAG: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512 -; SI-DAG: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}} -; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, [[BIGADD]] -; SI: s_endpgm +; GCN-LABEL: @simple_read2st64_f64_over_max_offset +; CI: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-NOT: ds_read2st64_b64 +; GCN-DAG: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512 +; GCN-DAG: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}} +; GCN: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, [[BIGADD]] +; GCN: s_endpgm define amdgpu_kernel void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %add.x.0 = add nsw i32 %x.i, 64 @@ -215,9 +251,12 @@ ret void } -; SI-LABEL: @invalid_read2st64_f64_odd_offset -; SI-NOT: ds_read2st64_b64 -; SI: s_endpgm +; GCN-LABEL: @invalid_read2st64_f64_odd_offset +; CI: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-NOT: ds_read2st64_b64 +; GCN: s_endpgm define amdgpu_kernel void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %add.x.0 = add nsw i32 %x.i, 64 @@ -235,10 +274,13 @@ ; The stride of 8 elements is 8 * 8 bytes. 
We need to make sure the ; stride in elements, not bytes, is a multiple of 64. -; SI-LABEL: @byte_size_only_divisible_64_read2_f64 -; SI-NOT: ds_read2st_b64 -; SI: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:8 -; SI: s_endpgm +; GCN-LABEL: @byte_size_only_divisible_64_read2_f64 +; CI: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-NOT: ds_read2st64_b64 +; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:8 +; GCN: s_endpgm define amdgpu_kernel void @byte_size_only_divisible_64_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i @@ -252,10 +294,7 @@ ret void } -; Function Attrs: nounwind readnone declare i32 @llvm.amdgcn.workitem.id.x() #1 - -; Function Attrs: nounwind readnone declare i32 @llvm.amdgcn.workitem.id.y() #1 attributes #0 = { nounwind } Index: test/CodeGen/AMDGPU/ds_write2.ll =================================================================== --- test/CodeGen/AMDGPU/ds_write2.ll +++ test/CodeGen/AMDGPU/ds_write2.ll @@ -1,14 +1,18 @@ -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -strict-whitespace -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -strict-whitespace -check-prefixes=GCN,CI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -strict-whitespace -check-prefixes=GCN,GFX9 %s @lds = addrspace(3) global [512 x float] undef, align 4 @lds.f64 = addrspace(3) global [512 x double] undef, align 8 -; SI-LABEL: @simple_write2_one_val_f32 -; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]] -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:8 -; SI: s_endpgm +; GCN-LABEL: {{^}}simple_write2_one_val_f32: +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-DAG:
{{buffer|global}}_load_dword [[VAL:v[0-9]+]] +; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; GCN: ds_write2_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:8 +; GCN: s_endpgm define amdgpu_kernel void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i @@ -21,12 +25,19 @@ ret void } -; SI-LABEL: @simple_write2_two_val_f32 -; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8 -; SI: s_endpgm +; GCN-LABEL: {{^}}simple_write2_two_val_f32: +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 + +; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4 + +; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; GCN: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8 +; GCN: s_endpgm define amdgpu_kernel void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i @@ -41,11 +52,14 @@ ret void } -; SI-LABEL: @simple_write2_two_val_f32_volatile_0 -; SI-NOT: ds_write2_b32 -; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} -; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32 -; SI: s_endpgm +; GCN-LABEL: @simple_write2_two_val_f32_volatile_0 +; CI-DAG: s_mov_b32 m0 +; 
GFX9-NOT: m0 + +; GCN-NOT: ds_write2_b32 +; GCN: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} +; GCN: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32 +; GCN: s_endpgm define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i @@ -60,11 +74,14 @@ ret void } -; SI-LABEL: @simple_write2_two_val_f32_volatile_1 -; SI-NOT: ds_write2_b32 -; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} -; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32 -; SI: s_endpgm +; GCN-LABEL: @simple_write2_two_val_f32_volatile_1 +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-NOT: ds_write2_b32 +; GCN: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} +; GCN: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32 +; GCN: s_endpgm define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i @@ -80,12 +97,19 @@ } ; 2 data subregisters from different super registers. 
-; SI-LABEL: @simple_write2_two_val_subreg2_mixed_f32 -; SI: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}} -; SI: buffer_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}} -; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 -; SI: s_endpgm +; GCN-LABEL: {{^}}simple_write2_two_val_subreg2_mixed_f32: +; GFX9-NOT: m0 + +; CI: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}} +; CI: buffer_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}} +; CI-DAG: s_mov_b32 m0 + +; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} + +; GFX9: global_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}} +; GFX9: global_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}} +; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 +; GCN: s_endpgm define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i @@ -102,11 +126,14 @@ ret void } -; SI-LABEL: @simple_write2_two_val_subreg2_f32 -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}} -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 -; SI: s_endpgm +; GCN-LABEL: @simple_write2_two_val_subreg2_f32 +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-DAG: {{buffer|global}}_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}} +; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 +; GCN: s_endpgm define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i @@ -121,11 +148,14 @@ ret void } -; SI-LABEL: 
@simple_write2_two_val_subreg4_f32 -; SI-DAG: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}} -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 -; SI: s_endpgm +; GCN-LABEL: @simple_write2_two_val_subreg4_f32 +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-DAG: {{buffer|global}}_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}} +; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 +; GCN: s_endpgm define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %x.i @@ -140,12 +170,19 @@ ret void } -; SI-LABEL: @simple_write2_two_val_max_offset_f32 -; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255 -; SI: s_endpgm +; GCN-LABEL: @simple_write2_two_val_max_offset_f32 +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 + +; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4 + +; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; GCN: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255 +; GCN: s_endpgm define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float 
addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i @@ -160,10 +197,13 @@ ret void } -; SI-LABEL: @simple_write2_two_val_too_far_f32 -; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} -; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028 -; SI: s_endpgm +; GCN-LABEL: @simple_write2_two_val_too_far_f32 +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} +; GCN: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028 +; GCN: s_endpgm define amdgpu_kernel void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i @@ -178,10 +218,13 @@ ret void } -; SI-LABEL: @simple_write2_two_val_f32_x2 -; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8 -; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27 -; SI: s_endpgm +; GCN-LABEL: @simple_write2_two_val_f32_x2 +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8 +; GCN: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27 +; GCN: s_endpgm define amdgpu_kernel void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x @@ -208,10 +251,13 @@ ret void } -; SI-LABEL: @simple_write2_two_val_f32_x2_nonzero_base -; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8 -; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27 -; SI: s_endpgm +; GCN-LABEL: @simple_write2_two_val_f32_x2_nonzero_base +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: 
m0 + +; GCN: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8 +; GCN: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27 +; GCN: s_endpgm define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x @@ -238,11 +284,14 @@ ret void } -; SI-LABEL: @write2_ptr_subreg_arg_two_val_f32 -; SI-NOT: ds_write2_b32 -; SI: ds_write_b32 -; SI: ds_write_b32 -; SI: s_endpgm +; GCN-LABEL: @write2_ptr_subreg_arg_two_val_f32 +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-NOT: ds_write2_b32 +; GCN: ds_write_b32 +; GCN: ds_write_b32 +; GCN: s_endpgm define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i @@ -265,11 +314,14 @@ ret void } -; SI-LABEL: @simple_write2_one_val_f64 -; SI-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} -; SI: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset1:8 -; SI: s_endpgm +; GCN-LABEL: @simple_write2_one_val_f64 +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-DAG: {{buffer|global}}_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], +; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} +; GCN: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset1:8 +; GCN: s_endpgm define amdgpu_kernel void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i @@ -282,12 +334,15 @@ ret void } -; SI-LABEL: @misaligned_simple_write2_one_val_f64 -; 
SI-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}} -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} -; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:1 -; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:14 offset1:15 -; SI: s_endpgm +; GCN-LABEL: @misaligned_simple_write2_one_val_f64 +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-DAG: {{buffer|global}}_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}} +; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} +; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:1 +; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:14 offset1:15 +; GCN: s_endpgm define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i @@ -300,12 +355,20 @@ ret void } -; SI-LABEL: @simple_write2_two_val_f64 -; SI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} -; SI: ds_write2_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8 -; SI: s_endpgm +; GCN-LABEL: @simple_write2_two_val_f64 +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; CI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; CI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 + +; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off offset:8 + + +; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, 
v{{[0-9]+}} +; GCN: ds_write2_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8 +; GCN: s_endpgm define amdgpu_kernel void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i @@ -322,19 +385,25 @@ @foo = addrspace(3) global [4 x i32] undef, align 4 -; SI-LABEL: @store_constant_adjacent_offsets -; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 +; GCN-LABEL: @store_constant_adjacent_offsets +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; GCN: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 define amdgpu_kernel void @store_constant_adjacent_offsets() { store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4 ret void } -; SI-LABEL: @store_constant_disjoint_offsets -; SI-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}} -; SI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; SI: ds_write2_b32 [[ZERO]], [[VAL]], [[VAL]] offset1:2 +; GCN-LABEL: @store_constant_disjoint_offsets +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}} +; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; GCN: ds_write2_b32 [[ZERO]], [[VAL]], [[VAL]] offset1:2 define amdgpu_kernel void @store_constant_disjoint_offsets() { store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4 @@ -343,11 +412,14 @@ @bar = addrspace(3) global [4 x i64] undef, align 4 -; SI-LABEL: @store_misaligned64_constant_offsets -; SI: v_mov_b32_e32 
[[ZERO:v[0-9]+]], 0{{$}} -; SI-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 -; SI-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3 -; SI: s_endpgm +; GCN-LABEL: @store_misaligned64_constant_offsets +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; GCN-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 +; GCN-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3 +; GCN: s_endpgm define amdgpu_kernel void @store_misaligned64_constant_offsets() { store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4 store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4 @@ -356,12 +428,15 @@ @bar.large = addrspace(3) global [4096 x i64] undef, align 4 -; SI-LABEL: @store_misaligned64_constant_large_offsets -; SI-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}} -; SI-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000{{$}} -; SI-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 -; SI-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 -; SI: s_endpgm +; GCN-LABEL: @store_misaligned64_constant_large_offsets +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}} +; GCN-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000{{$}} +; GCN-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 +; GCN-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 +; GCN: s_endpgm define amdgpu_kernel void @store_misaligned64_constant_large_offsets() { store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4 store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4 @@ -406,10 +481,12 @@ ret void } -; CI-LABEL: 
{{^}}simple_write2_v4f32_superreg_align4: -; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:3 offset1:2{{$}} -; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:1{{$}} -; CI: s_endpgm +; GCN-LABEL: {{^}}simple_write2_v4f32_superreg_align4: +; CI: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:2 offset1:3{{$}} +; GCN: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset1:1{{$}} define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in @@ -419,16 +496,9 @@ ret void } -; Function Attrs: nounwind readnone declare i32 @llvm.amdgcn.workgroup.id.x() #1 - -; Function Attrs: nounwind readnone declare i32 @llvm.amdgcn.workgroup.id.y() #1 - -; Function Attrs: nounwind readnone declare i32 @llvm.amdgcn.workitem.id.x() #1 - -; Function Attrs: nounwind readnone declare i32 @llvm.amdgcn.workitem.id.y() #1 attributes #0 = { nounwind } Index: test/CodeGen/AMDGPU/ds_write2st64.ll =================================================================== --- test/CodeGen/AMDGPU/ds_write2st64.ll +++ test/CodeGen/AMDGPU/ds_write2st64.ll @@ -1,12 +1,16 @@ -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s @lds = addrspace(3) global [512 x float] undef, align 4 -; SI-LABEL: @simple_write2st64_one_val_f32_0_1 -; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]] -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2st64_b32 [[VPTR]], 
[[VAL]], [[VAL]] offset1:1 -; SI: s_endpgm +; GCN-LABEL: @simple_write2st64_one_val_f32_0_1 +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-DAG: {{buffer|global}}_load_dword [[VAL:v[0-9]+]] +; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; GCN: ds_write2st64_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:1 +; GCN: s_endpgm define amdgpu_kernel void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* %C, float addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i @@ -19,12 +23,20 @@ ret void } -; SI-LABEL: @simple_write2st64_two_val_f32_2_5 -; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:2 offset1:5 -; SI: s_endpgm +; GCN-LABEL: @simple_write2st64_two_val_f32_2_5 +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 + +; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4 + + +; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; GCN: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:2 offset1:5 +; GCN: s_endpgm define amdgpu_kernel void @simple_write2st64_two_val_f32_2_5(float addrspace(1)* %C, float addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i @@ -40,12 +52,20 @@ ret void } -; SI-LABEL: @simple_write2st64_two_val_max_offset_f32 -; SI-DAG:
buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255 -; SI: s_endpgm +; GCN-LABEL: @simple_write2st64_two_val_max_offset_f32 +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 + +; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4 + +; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, v{{[0-9]+}} +; GCN: v_add_i32_e32 [[VPTR:v[0-9]+]], vcc, s{{[0-9]+}}, [[SHL]] +; GCN: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255 +; GCN: s_endpgm define amdgpu_kernel void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in, float addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i @@ -60,12 +80,20 @@ ret void } -; SI-LABEL: @simple_write2st64_two_val_max_offset_f64 -; SI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; SI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], -; SI: ds_write2st64_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset0:4 offset1:127 -; SI: s_endpgm +; GCN-LABEL: @simple_write2st64_two_val_max_offset_f64 +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; CI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, 
{{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; CI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 + +; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off offset:8 + +; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 3, v{{[0-9]+}} +; GCN: v_add_i32_e32 [[VPTR:v[0-9]+]], vcc, s{{[0-9]+}}, [[SHL]] +; GCN: ds_write2st64_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset0:4 offset1:127 +; GCN: s_endpgm define amdgpu_kernel void @simple_write2st64_two_val_max_offset_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i @@ -81,10 +109,13 @@ ret void } -; SI-LABEL: @byte_size_only_divisible_64_write2st64_f64 -; SI-NOT: ds_write2st64_b64 -; SI: ds_write2_b64 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:8 -; SI: s_endpgm +; GCN-LABEL: @byte_size_only_divisible_64_write2st64_f64 +; CI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-NOT: ds_write2st64_b64 +; GCN: ds_write2_b64 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:8 +; GCN: s_endpgm define amdgpu_kernel void @byte_size_only_divisible_64_write2st64_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i @@ -97,10 +128,7 @@ ret void } -; Function Attrs: nounwind readnone declare i32 @llvm.amdgcn.workitem.id.x() #1 - -; Function Attrs: nounwind readnone declare i32 @llvm.amdgcn.workitem.id.y() #1 attributes #0 = { nounwind } Index: test/CodeGen/AMDGPU/hsa-metadata-kernel-debug-props.ll =================================================================== --- 
test/CodeGen/AMDGPU/hsa-metadata-kernel-debug-props.ll +++ test/CodeGen/AMDGPU/hsa-metadata-kernel-debug-props.ll @@ -15,7 +15,7 @@ ; CHECK: ReservedNumVGPRs: 4 ; GFX700: ReservedFirstVGPR: 8 ; GFX800: ReservedFirstVGPR: 8 -; GFX900: ReservedFirstVGPR: 11 +; GFX900: ReservedFirstVGPR: 10 ; CHECK: PrivateSegmentBufferSGPR: 0 ; CHECK: WavefrontPrivateSegmentOffsetSGPR: 11 define amdgpu_kernel void @test(i32 addrspace(1)* %A) #0 !dbg !7 !kernel_arg_addr_space !12 !kernel_arg_access_qual !13 !kernel_arg_type !14 !kernel_arg_base_type !14 !kernel_arg_type_qual !15 { Index: test/CodeGen/AMDGPU/indirect-addressing-si.ll =================================================================== --- test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -1,7 +1,7 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL,PREGFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL,PREGFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode 
-verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE,PREGFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE,GFX9 %s ; Tests for indirect addressing on SI, which is implemented using dynamic ; indexing of vectors. @@ -603,7 +603,8 @@ ; IDXMODE: v_mov_b32_e32 v[[VEC0_ELT2]], -4.0 ; IDXMODE: s_set_gpr_idx_off -; GCN: s_mov_b32 m0, -1 +; PREGFX9: s_mov_b32 m0, -1 +; GFX9-NOT: s_mov_b32 m0 ; GCN: ds_write_b32 ; GCN: ds_write_b32 ; GCN: s_endpgm Index: test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll @@ -14,6 +14,8 @@ ; Make sure no crash on invalid non-constant ; GCN-LABEL: {{^}}invalid_variable_order_lds_atomic_dec_ret_i32: +; CIVI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void @invalid_variable_order_lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %order.var) #0 { %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 %order.var, i32 0, i1 false) store i32 %result, i32 addrspace(1)* %out @@ -22,6 +24,8 @@ ; Make sure no crash on invalid non-constant ; GCN-LABEL: {{^}}invalid_variable_scope_lds_atomic_dec_ret_i32: +; CIVI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void @invalid_variable_scope_lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %scope.var) #0 { %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 %scope.var, i1 false) store i32 %result, i32 addrspace(1)* %out @@ -37,7 +41,10 @@ } ; GCN-LABEL: {{^}}lds_atomic_dec_ret_i32: -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 +; CIVI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] define 
amdgpu_kernel void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 { %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false) @@ -46,7 +53,10 @@ } ; GCN-LABEL: {{^}}lds_atomic_dec_ret_i32_offset: -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 +; CIVI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] offset:16 define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 @@ -56,9 +66,12 @@ } ; GCN-LABEL: {{^}}lds_atomic_dec_noret_i32: -; GCN: s_load_dword [[SPTR:s[0-9]+]], -; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 -; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] +; CIVI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-DAG: s_load_dword [[SPTR:s[0-9]+]], +; GCN-DAG: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 +; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] ; GCN: ds_dec_u32 [[VPTR]], [[DATA]] define amdgpu_kernel void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) nounwind { %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false) @@ -66,7 +79,10 @@ } ; GCN-LABEL: {{^}}lds_atomic_dec_noret_i32_offset: -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 +; CIVI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: ds_dec_u32 v{{[0-9]+}}, [[K]] offset:16 define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 @@ -277,7 +293,10 @@ @lds0 = addrspace(3) global [512 x i32] undef ; GCN-LABEL: {{^}}atomic_dec_shl_base_lds_0: -; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; CIVI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-DAG: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; GCN: ds_dec_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 define 
amdgpu_kernel void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -290,6 +309,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_dec_ret_i64: +; CIVI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}} @@ -300,6 +322,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_dec_ret_i64_offset: +; CIVI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32 @@ -311,6 +336,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_dec_noret_i64: +; CIVI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: ds_dec_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}} @@ -320,6 +348,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_dec_noret_i64_offset: +; CIVI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: ds_dec_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32{{$}} @@ -406,7 +437,10 @@ @lds1 = addrspace(3) global [512 x i64] undef, align 8 ; GCN-LABEL: {{^}}atomic_dec_shl_base_lds_0_i64: -; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}} +; CIVI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-DAG: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}} ; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16 define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 Index: test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll =================================================================== --- 
test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll @@ -13,7 +13,10 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 ; GCN-LABEL: {{^}}lds_atomic_inc_ret_i32: -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 +; CIVI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] define amdgpu_kernel void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 { %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false) @@ -22,7 +25,10 @@ } ; GCN-LABEL: {{^}}lds_atomic_inc_ret_i32_offset: -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 +; CIVI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] offset:16 define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 @@ -32,9 +38,12 @@ } ; GCN-LABEL: {{^}}lds_atomic_inc_noret_i32: -; GCN: s_load_dword [[SPTR:s[0-9]+]], -; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 -; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] +; CIVI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-DAG: s_load_dword [[SPTR:s[0-9]+]], +; GCN-DAG: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 +; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] ; GCN: ds_inc_u32 [[VPTR]], [[DATA]] define amdgpu_kernel void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind { %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false) @@ -42,7 +51,10 @@ } ; GCN-LABEL: {{^}}lds_atomic_inc_noret_i32_offset: -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 +; CIVI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: ds_inc_u32 v{{[0-9]+}}, [[K]] offset:16 define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr 
i32, i32 addrspace(3)* %ptr, i32 4 Index: test/CodeGen/AMDGPU/load-hi16.ll =================================================================== --- test/CodeGen/AMDGPU/load-hi16.ll +++ test/CodeGen/AMDGPU/load-hi16.ll @@ -69,7 +69,6 @@ ; FIXME: Remove m0 initialization ; GCN-LABEL: {{^}}load_local_hi_v2i16_zerolo_shift: ; GCN: s_waitcnt -; GFX9-NEXT: s_mov_b32 m0, -1 ; GFX9-NEXT: ds_read_u16 v0, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -563,7 +562,6 @@ ; FIXME: Is there a cost to using the extload over not? ; GCN-LABEL: {{^}}load_local_v2i16_split: ; GCN: s_waitcnt -; GFX9-NEXT: s_mov_b32 m0, -1 ; GFX9-NEXT: ds_read_u16 v1, v0 ; GFX9-NEXT: s_waitcnt ; GFX9-NEXT: ds_read_u16_d16_hi v1, v0 offset:2 Index: test/CodeGen/AMDGPU/load-local-f32.ll =================================================================== --- test/CodeGen/AMDGPU/load-local-f32.ll +++ test/CodeGen/AMDGPU/load-local-f32.ll @@ -1,9 +1,10 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG,FUNC %s ; FUNC-LABEL: {{^}}load_f32_local: -; GCN: s_mov_b32 m0 +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 ; GCN: ds_read_b32 ; EG: LDS_READ_RET @@ -15,7 +16,9 @@ } ; FUNC-LABEL: {{^}}load_v2f32_local: -; GCN: s_mov_b32 m0 +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_read_b64 ; EG: LDS_READ_RET @@ -29,6 +32,9 @@ ; FIXME: should this do a read2_b64? 
; FUNC-LABEL: {{^}}local_load_v3f32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:8 ; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}} ; GCN: s_waitcnt @@ -46,6 +52,9 @@ } ; FUNC-LABEL: {{^}}local_load_v4f32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_read2_b64 ; EG: LDS_READ_RET @@ -60,6 +69,9 @@ } ; FUNC-LABEL: {{^}}local_load_v8f32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_read2_b64 ; GCN: ds_read2_b64 @@ -79,6 +91,9 @@ } ; FUNC-LABEL: {{^}}local_load_v16f32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_read2_b64 ; GCN: ds_read2_b64 ; GCN: ds_read2_b64 Index: test/CodeGen/AMDGPU/load-local-f64.ll =================================================================== --- test/CodeGen/AMDGPU/load-local-f64.ll +++ test/CodeGen/AMDGPU/load-local-f64.ll @@ -1,9 +1,13 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG,FUNC %s ; FUNC-LABEL: {{^}}local_load_f64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_read_b64 [[VAL:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}{{$}} ; GCN: ds_write_b64
v{{[0-9]+}}, [[VAL]] @@ -16,6 +20,9 @@ } ; FUNC-LABEL: {{^}}local_load_v2f64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_read2_b64 ; EG: LDS_READ_RET @@ -30,6 +37,9 @@ } ; FUNC-LABEL: {{^}}local_load_v3f64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN-DAG: ds_read2_b64 ; GCN-DAG: ds_read_b64 @@ -47,6 +57,9 @@ } ; FUNC-LABEL: {{^}}local_load_v4f64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_read2_b64 ; GCN: ds_read2_b64 @@ -67,6 +80,9 @@ } ; FUNC-LABEL: {{^}}local_load_v8f64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_read2_b64 ; GCN: ds_read2_b64 ; GCN: ds_read2_b64 @@ -96,6 +112,9 @@ } ; FUNC-LABEL: {{^}}local_load_v16f64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_read2_b64 ; GCN: ds_read2_b64 ; GCN: ds_read2_b64 Index: test/CodeGen/AMDGPU/load-local-i1.ll =================================================================== --- test/CodeGen/AMDGPU/load-local-i1.ll +++ test/CodeGen/AMDGPU/load-local-i1.ll @@ -1,8 +1,12 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=r600 -mtriple=r600---amdgiz -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s +; RUN: llc -march=r600 -mtriple=r600---amdgiz -mcpu=cypress < %s | FileCheck -check-prefixes=EG,FUNC %s ; FUNC-LABEL: {{^}}local_load_i1: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_read_u8 ; GCN: v_and_b32_e32 v{{[0-9]+}}, 1 ; GCN: 
ds_write_b8 @@ -17,6 +21,8 @@ } ; FUNC-LABEL: {{^}}local_load_v2i1: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void @local_load_v2i1(<2 x i1> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(3)* %in store <2 x i1> %load, <2 x i1> addrspace(3)* %out @@ -24,6 +30,8 @@ } ; FUNC-LABEL: {{^}}local_load_v3i1: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void @local_load_v3i1(<3 x i1> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(3)* %in store <3 x i1> %load, <3 x i1> addrspace(3)* %out @@ -31,6 +39,8 @@ } ; FUNC-LABEL: {{^}}local_load_v4i1: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void @local_load_v4i1(<4 x i1> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(3)* %in store <4 x i1> %load, <4 x i1> addrspace(3)* %out @@ -38,6 +48,8 @@ } ; FUNC-LABEL: {{^}}local_load_v8i1: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void @local_load_v8i1(<8 x i1> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(3)* %in store <8 x i1> %load, <8 x i1> addrspace(3)* %out @@ -45,6 +57,8 @@ } ; FUNC-LABEL: {{^}}local_load_v16i1: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void @local_load_v16i1(<16 x i1> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(3)* %in store <16 x i1> %load, <16 x i1> addrspace(3)* %out @@ -52,6 +66,8 @@ } ; FUNC-LABEL: {{^}}local_load_v32i1: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void @local_load_v32i1(<32 x i1> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(3)* %in store <32 x i1> %load, <32 x i1> addrspace(3)* %out @@ -59,6 +75,8 @@ } ; FUNC-LABEL: {{^}}local_load_v64i1: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void @local_load_v64i1(<64 x i1> addrspace(3)* 
%out, <64 x i1> addrspace(3)* %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(3)* %in store <64 x i1> %load, <64 x i1> addrspace(3)* %out @@ -66,6 +84,9 @@ } ; FUNC-LABEL: {{^}}local_zextload_i1_to_i32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_read_u8 ; GCN: ds_write_b32 define amdgpu_kernel void @local_zextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* %in) #0 { @@ -76,6 +97,9 @@ } ; FUNC-LABEL: {{^}}local_sextload_i1_to_i32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_read_u8 ; GCN: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1{{$}} ; GCN: ds_write_b32 @@ -90,6 +114,8 @@ } ; FUNC-LABEL: {{^}}local_zextload_v1i1_to_v1i32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void @local_zextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 { %load = load <1 x i1>, <1 x i1> addrspace(3)* %in %ext = zext <1 x i1> %load to <1 x i32> @@ -98,6 +124,8 @@ } ; FUNC-LABEL: {{^}}local_sextload_v1i1_to_v1i32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void @local_sextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 { %load = load <1 x i1>, <1 x i1> addrspace(3)* %in %ext = sext <1 x i1> %load to <1 x i32> @@ -106,6 +134,8 @@ } ; FUNC-LABEL: {{^}}local_zextload_v2i1_to_v2i32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void @local_zextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(3)* %in %ext = zext <2 x i1> %load to <2 x i32> @@ -114,6 +144,8 @@ } ; FUNC-LABEL: {{^}}local_sextload_v2i1_to_v2i32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void @local_sextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(3)* %in %ext = sext <2 x i1> %load to <2 x i32> @@ -122,6 +154,8 @@ } ; FUNC-LABEL: {{^}}local_zextload_v3i1_to_v3i32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel 
void @local_zextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(3)* %in %ext = zext <3 x i1> %load to <3 x i32> @@ -130,6 +164,8 @@ } ; FUNC-LABEL: {{^}}local_sextload_v3i1_to_v3i32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void @local_sextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(3)* %in %ext = sext <3 x i1> %load to <3 x i32> @@ -138,6 +174,8 @@ } ; FUNC-LABEL: {{^}}local_zextload_v4i1_to_v4i32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void @local_zextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(3)* %in %ext = zext <4 x i1> %load to <4 x i32> @@ -146,6 +184,8 @@ } ; FUNC-LABEL: {{^}}local_sextload_v4i1_to_v4i32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void @local_sextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(3)* %in %ext = sext <4 x i1> %load to <4 x i32> @@ -154,6 +194,8 @@ } ; FUNC-LABEL: {{^}}local_zextload_v8i1_to_v8i32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void @local_zextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(3)* %in %ext = zext <8 x i1> %load to <8 x i32> @@ -162,6 +204,8 @@ } ; FUNC-LABEL: {{^}}local_sextload_v8i1_to_v8i32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void @local_sextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(3)* %in %ext = sext <8 x i1> %load to <8 x i32> @@ -170,6 +214,8 @@ } ; FUNC-LABEL: {{^}}local_zextload_v16i1_to_v16i32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void @local_zextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i1> 
addrspace(3)* %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(3)* %in %ext = zext <16 x i1> %load to <16 x i32> @@ -178,6 +224,8 @@ } ; FUNC-LABEL: {{^}}local_sextload_v16i1_to_v16i32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void @local_sextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(3)* %in %ext = sext <16 x i1> %load to <16 x i32> @@ -186,6 +234,8 @@ } ; FUNC-LABEL: {{^}}local_zextload_v32i1_to_v32i32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void @local_zextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(3)* %in %ext = zext <32 x i1> %load to <32 x i32> @@ -194,6 +244,8 @@ } ; FUNC-LABEL: {{^}}local_sextload_v32i1_to_v32i32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void @local_sextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(3)* %in %ext = sext <32 x i1> %load to <32 x i32> @@ -202,6 +254,8 @@ } ; FUNC-LABEL: {{^}}local_zextload_v64i1_to_v64i32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void @local_zextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(3)* %in %ext = zext <64 x i1> %load to <64 x i32> @@ -210,6 +264,8 @@ } ; FUNC-LABEL: {{^}}local_sextload_v64i1_to_v64i32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void @local_sextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(3)* %in %ext = sext <64 x i1> %load to <64 x i32> @@ -218,6 +274,9 @@ } ; FUNC-LABEL: {{^}}local_zextload_i1_to_i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN-DAG: ds_read_u8 [[LOAD:v[0-9]+]], ; GCN-DAG: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}} ; GCN: ds_write_b64 @@ -229,6 +288,9 
@@ } ; FUNC-LABEL: {{^}}local_sextload_i1_to_i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_read_u8 [[LOAD:v[0-9]+]], ; GCN: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}} ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]] @@ -241,6 +303,8 @@ } ; FUNC-LABEL: {{^}}local_zextload_v1i1_to_v1i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void @local_zextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 { %load = load <1 x i1>, <1 x i1> addrspace(3)* %in %ext = zext <1 x i1> %load to <1 x i64> @@ -249,6 +313,8 @@ } ; FUNC-LABEL: {{^}}local_sextload_v1i1_to_v1i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void @local_sextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 { %load = load <1 x i1>, <1 x i1> addrspace(3)* %in %ext = sext <1 x i1> %load to <1 x i64> @@ -257,6 +323,8 @@ } ; FUNC-LABEL: {{^}}local_zextload_v2i1_to_v2i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void @local_zextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(3)* %in %ext = zext <2 x i1> %load to <2 x i64> @@ -265,6 +333,8 @@ } ; FUNC-LABEL: {{^}}local_sextload_v2i1_to_v2i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void @local_sextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(3)* %in %ext = sext <2 x i1> %load to <2 x i64> @@ -273,6 +343,8 @@ } ; FUNC-LABEL: {{^}}local_zextload_v3i1_to_v3i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void @local_zextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(3)* %in %ext = zext <3 x i1> %load to <3 x i64> @@ -281,6 +353,8 @@ } ; FUNC-LABEL: {{^}}local_sextload_v3i1_to_v3i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void 
@local_sextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(3)* %in %ext = sext <3 x i1> %load to <3 x i64> @@ -289,6 +363,8 @@ } ; FUNC-LABEL: {{^}}local_zextload_v4i1_to_v4i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void @local_zextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(3)* %in %ext = zext <4 x i1> %load to <4 x i64> @@ -297,6 +373,8 @@ } ; FUNC-LABEL: {{^}}local_sextload_v4i1_to_v4i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void @local_sextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(3)* %in %ext = sext <4 x i1> %load to <4 x i64> @@ -305,6 +383,8 @@ } ; FUNC-LABEL: {{^}}local_zextload_v8i1_to_v8i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void @local_zextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(3)* %in %ext = zext <8 x i1> %load to <8 x i64> @@ -313,6 +393,8 @@ } ; FUNC-LABEL: {{^}}local_sextload_v8i1_to_v8i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void @local_sextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(3)* %in %ext = sext <8 x i1> %load to <8 x i64> @@ -321,6 +403,8 @@ } ; FUNC-LABEL: {{^}}local_zextload_v16i1_to_v16i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void @local_zextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(3)* %in %ext = zext <16 x i1> %load to <16 x i64> @@ -329,6 +413,8 @@ } ; FUNC-LABEL: {{^}}local_sextload_v16i1_to_v16i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void @local_sextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 
x i1> addrspace(3)* %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(3)* %in %ext = sext <16 x i1> %load to <16 x i64> @@ -337,6 +423,8 @@ } ; FUNC-LABEL: {{^}}local_zextload_v32i1_to_v32i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void @local_zextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(3)* %in %ext = zext <32 x i1> %load to <32 x i64> @@ -345,6 +433,8 @@ } ; FUNC-LABEL: {{^}}local_sextload_v32i1_to_v32i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void @local_sextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(3)* %in %ext = sext <32 x i1> %load to <32 x i64> @@ -353,6 +443,8 @@ } ; FUNC-LABEL: {{^}}local_zextload_v64i1_to_v64i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void @local_zextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(3)* %in %ext = zext <64 x i1> %load to <64 x i64> @@ -361,6 +453,8 @@ } ; FUNC-LABEL: {{^}}local_sextload_v64i1_to_v64i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 define amdgpu_kernel void @local_sextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(3)* %in %ext = sext <64 x i1> %load to <64 x i64> Index: test/CodeGen/AMDGPU/load-local-i16.ll =================================================================== --- test/CodeGen/AMDGPU/load-local-i16.ll +++ test/CodeGen/AMDGPU/load-local-i16.ll @@ -1,8 +1,12 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,SICIVI,FUNC %s +; RUN: llc -march=amdgcn 
-mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,GFX89,FUNC %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX89,FUNC %s ; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}local_load_i16: +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 + ; GCN: ds_read_u16 v{{[0-9]+}} ; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z @@ -18,6 +22,9 @@ } ; FUNC-LABEL: {{^}}local_load_v2i16: +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 + ; GCN: ds_read_b32 ; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z @@ -33,6 +40,9 @@ } ; FUNC-LABEL: {{^}}local_load_v3i16: +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 + ; GCN: ds_read_b64 ; GCN-DAG: ds_write_b32 ; GCN-DAG: ds_write_b16 @@ -47,6 +57,9 @@ } ; FUNC-LABEL: {{^}}local_load_v4i16: +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 + ; GCN: ds_read_b64 ; EG: LDS_READ_RET @@ -59,6 +72,9 @@ } ; FUNC-LABEL: {{^}}local_load_v8i16: +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 + ; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} ; EG: LDS_READ_RET @@ -73,6 +89,9 @@ } ; FUNC-LABEL: {{^}}local_load_v16i16: +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 + ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:3{{$}} ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}} @@ -94,6 +113,9 @@ } ; FUNC-LABEL: {{^}}local_zextload_i16_to_i32: +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 + ; GCN: ds_read_u16 ; GCN: ds_write_b32 @@ -111,7 +133,10 @@ ; FUNC-LABEL: {{^}}local_sextload_i16_to_i32: ; GCN-NOT: s_wqm_b64 -; GCN: s_mov_b32 m0 + +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 + ; GCN: ds_read_i16 ; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z @@ -129,6 +154,9 @@ } ; FUNC-LABEL: {{^}}local_zextload_v1i16_to_v1i32: +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 + ; GCN: ds_read_u16 ; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z @@ -144,6 +172,9 @@ } ; 
FUNC-LABEL: {{^}}local_sextload_v1i16_to_v1i32: +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 + ; GCN: ds_read_i16 ; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z @@ -162,7 +193,9 @@ ; FUNC-LABEL: {{^}}local_zextload_v2i16_to_v2i32: ; GCN-NOT: s_wqm_b64 -; GCN: s_mov_b32 m0 +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 + ; GCN: ds_read_b32 ; EG: LDS_READ_RET @@ -175,7 +208,9 @@ ; FUNC-LABEL: {{^}}local_sextload_v2i16_to_v2i32: ; GCN-NOT: s_wqm_b64 -; GCN: s_mov_b32 m0 +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 + ; GCN: ds_read_b32 ; EG: LDS_READ_RET @@ -189,6 +224,9 @@ } ; FUNC-LABEL: {{^}}local_local_zextload_v3i16_to_v3i32: +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 + ; GCN: ds_read_b64 ; GCN-DAG: ds_write_b32 ; GCN-DAG: ds_write_b64 @@ -203,6 +241,9 @@ } ; FUNC-LABEL: {{^}}local_local_sextload_v3i16_to_v3i32: +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 + ; GCN: ds_read_b64 ; GCN-DAG: ds_write_b32 ; GCN-DAG: ds_write_b64 @@ -221,7 +262,9 @@ ; FUNC-LABEL: {{^}}local_local_zextload_v4i16_to_v4i32: ; GCN-NOT: s_wqm_b64 -; GCN: s_mov_b32 m0 +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 + ; GCN: ds_read_b64 ; EG: LDS_READ_RET @@ -235,7 +278,9 @@ ; FUNC-LABEL: {{^}}local_sextload_v4i16_to_v4i32: ; GCN-NOT: s_wqm_b64 -; GCN: s_mov_b32 m0 +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 + ; GCN: ds_read_b64 ; EG: LDS_READ_RET @@ -252,6 +297,9 @@ } ; FUNC-LABEL: {{^}}local_zextload_v8i16_to_v8i32: +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 + ; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} ; EG: LDS_READ_RET @@ -266,6 +314,9 @@ } ; FUNC-LABEL: {{^}}local_sextload_v8i16_to_v8i32: +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 + ; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} ; EG: LDS_READ_RET @@ -288,6 +339,9 @@ } ; FUNC-LABEL: {{^}}local_zextload_v16i16_to_v16i32: +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 + ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} @@ 
-312,6 +366,9 @@ } ; FUNC-LABEL: {{^}}local_sextload_v16i16_to_v16i32: +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 + ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} @@ -348,6 +405,9 @@ } ; FUNC-LABEL: {{^}}local_zextload_v32i16_to_v32i32: +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 + ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5 @@ -377,6 +437,9 @@ } ; FUNC-LABEL: {{^}}local_sextload_v32i16_to_v32i32: +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 + ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} @@ -414,6 +477,9 @@ } ; FUNC-LABEL: {{^}}local_zextload_v64i16_to_v64i32: +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 + ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:14 offset1:15 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3 @@ -479,6 +545,8 @@ } ; FUNC-LABEL: {{^}}local_sextload_v64i16_to_v64i32: +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -520,6 +588,9 @@ } ; FUNC-LABEL: {{^}}local_zextload_i16_to_i64: +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 + ; GCN-DAG: ds_read_u16 v[[LO:[0-9]+]], ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} @@ -538,13 +609,16 @@ } ; FUNC-LABEL: {{^}}local_sextload_i16_to_i64: +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 + ; FIXME: Need to optimize this sequence to avoid an extra shift. 
; t25: i32,ch = load t12, t10, undef:i32 ; t28: i64 = any_extend t25 ; t30: i64 = sign_extend_inreg t28, ValueType:ch:i16 ; SI: ds_read_i16 v[[LO:[0-9]+]], -; VI: ds_read_u16 v[[ULO:[0-9]+]] -; VI: v_bfe_i32 v[[LO:[0-9]+]], v[[ULO]], 0, 16 +; GFX89: ds_read_u16 v[[ULO:[0-9]+]] +; GFX89: v_bfe_i32 v[[LO:[0-9]+]], v[[ULO]], 0, 16 ; GCN-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] ; GCN: ds_write_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]] @@ -565,6 +639,9 @@ } ; FUNC-LABEL: {{^}}local_zextload_v1i16_to_v1i64: +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 + ; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z ; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]] @@ -579,6 +656,9 @@ } ; FUNC-LABEL: {{^}}local_sextload_v1i16_to_v1i64: +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 + ; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z ; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]] @@ -596,6 +676,9 @@ } ; FUNC-LABEL: {{^}}local_zextload_v2i16_to_v2i64: +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 + ; EG: LDS_READ_RET define amdgpu_kernel void @local_zextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 { @@ -606,6 +689,9 @@ } ; FUNC-LABEL: {{^}}local_sextload_v2i16_to_v2i64: +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 + ; EG: LDS_READ_RET ; EG-DAG: BFE_INT @@ -618,6 +704,9 @@ } ; FUNC-LABEL: {{^}}local_zextload_v4i16_to_v4i64: +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 + ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -629,6 +718,9 @@ } ; FUNC-LABEL: {{^}}local_sextload_v4i16_to_v4i64: +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 + ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -644,6 +736,9 @@ } ; FUNC-LABEL: {{^}}local_zextload_v8i16_to_v8i64: +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 + ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -657,6 +752,9 @@ } ; FUNC-LABEL: {{^}}local_sextload_v8i16_to_v8i64: +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 + ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -678,6 +776,9 @@ } ; FUNC-LABEL: {{^}}local_zextload_v16i16_to_v16i64: +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 + ; EG: 
LDS_READ_RET ; EG: LDS_READ_RET @@ -695,6 +796,9 @@ } ; FUNC-LABEL: {{^}}local_sextload_v16i16_to_v16i64: +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 + ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -728,6 +832,9 @@ } ; FUNC-LABEL: {{^}}local_zextload_v32i16_to_v32i64: +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 + ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -753,6 +860,9 @@ } ; FUNC-LABEL: {{^}}local_sextload_v32i16_to_v32i64: +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 + ; EG: LDS_READ_RET ; EG: LDS_READ_RET Index: test/CodeGen/AMDGPU/load-local-i32.ll =================================================================== --- test/CodeGen/AMDGPU/load-local-i32.ll +++ test/CodeGen/AMDGPU/load-local-i32.ll @@ -1,11 +1,12 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - ; FUNC-LABEL: {{^}}local_load_i32: ; GCN-NOT: s_wqm_b64 -; GCN: s_mov_b32 m0, -1 +; SICIVI: s_mov_b32 m0, -1 +; GFX9-NOT: m0 ; GCN: ds_read_b32 ; EG: LDS_READ_RET @@ -17,6 +18,9 @@ } ; FUNC-LABEL: {{^}}local_load_v2i32: +; SICIVI: s_mov_b32 m0, -1 +; GFX9-NOT: m0 + ; GCN: ds_read_b64 define amdgpu_kernel void @local_load_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 { entry: @@ -26,6 +30,9 @@ } ; FUNC-LABEL: {{^}}local_load_v3i32: +; SICIVI: s_mov_b32 m0, -1 +; GFX9-NOT: m0 + ; GCN-DAG: ds_read_b64 ; GCN-DAG: ds_read_b32 define amdgpu_kernel void 
@local_load_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> addrspace(3)* %in) #0 { @@ -36,6 +43,9 @@ } ; FUNC-LABEL: {{^}}local_load_v4i32: +; SICIVI: s_mov_b32 m0, -1 +; GFX9-NOT: m0 + ; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} define amdgpu_kernel void @local_load_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 { @@ -46,6 +56,9 @@ } ; FUNC-LABEL: {{^}}local_load_v8i32: +; SICIVI: s_mov_b32 m0, -1 +; GFX9-NOT: m0 + ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} define amdgpu_kernel void @local_load_v8i32(<8 x i32> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 { @@ -56,6 +69,9 @@ } ; FUNC-LABEL: {{^}}local_load_v16i32: +; SICIVI: s_mov_b32 m0, -1 +; GFX9-NOT: m0 + ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7{{$}} ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5{{$}} ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} @@ -72,6 +88,9 @@ } ; FUNC-LABEL: {{^}}local_zextload_i32_to_i64: +; SICIVI: s_mov_b32 m0, -1 +; GFX9-NOT: m0 + define amdgpu_kernel void @local_zextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 { %ld = load i32, i32 addrspace(3)* %in %ext = zext i32 %ld to i64 @@ -80,6 +99,9 @@ } ; FUNC-LABEL: {{^}}local_sextload_i32_to_i64: +; SICIVI: s_mov_b32 m0, -1 +; GFX9-NOT: m0 + define amdgpu_kernel void @local_sextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 { %ld = load i32, i32 addrspace(3)* %in %ext = sext i32 %ld to i64 @@ -88,6 +110,9 @@ } ; FUNC-LABEL: {{^}}local_zextload_v1i32_to_v1i64: +; SICIVI: s_mov_b32 m0, -1 +; GFX9-NOT: m0 + define amdgpu_kernel void @local_zextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 { %ld = load <1 x i32>, <1 x i32> addrspace(3)* %in %ext = zext <1 x i32> %ld to <1 x i64> @@ -96,6 
+121,9 @@ } ; FUNC-LABEL: {{^}}local_sextload_v1i32_to_v1i64: +; SICIVI: s_mov_b32 m0, -1 +; GFX9-NOT: m0 + define amdgpu_kernel void @local_sextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 { %ld = load <1 x i32>, <1 x i32> addrspace(3)* %in %ext = sext <1 x i32> %ld to <1 x i64> @@ -104,6 +132,9 @@ } ; FUNC-LABEL: {{^}}local_zextload_v2i32_to_v2i64: +; SICIVI: s_mov_b32 m0, -1 +; GFX9-NOT: m0 + define amdgpu_kernel void @local_zextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 { %ld = load <2 x i32>, <2 x i32> addrspace(3)* %in %ext = zext <2 x i32> %ld to <2 x i64> @@ -112,6 +143,9 @@ } ; FUNC-LABEL: {{^}}local_sextload_v2i32_to_v2i64: +; SICIVI: s_mov_b32 m0, -1 +; GFX9-NOT: m0 + define amdgpu_kernel void @local_sextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 { %ld = load <2 x i32>, <2 x i32> addrspace(3)* %in %ext = sext <2 x i32> %ld to <2 x i64> @@ -120,6 +154,9 @@ } ; FUNC-LABEL: {{^}}local_zextload_v4i32_to_v4i64: +; SICIVI: s_mov_b32 m0, -1 +; GFX9-NOT: m0 + define amdgpu_kernel void @local_zextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 { %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in %ext = zext <4 x i32> %ld to <4 x i64> @@ -128,6 +165,9 @@ } ; FUNC-LABEL: {{^}}local_sextload_v4i32_to_v4i64: +; SICIVI: s_mov_b32 m0, -1 +; GFX9-NOT: m0 + define amdgpu_kernel void @local_sextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 { %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in %ext = sext <4 x i32> %ld to <4 x i64> @@ -136,6 +176,9 @@ } ; FUNC-LABEL: {{^}}local_zextload_v8i32_to_v8i64: +; SICIVI: s_mov_b32 m0, -1 +; GFX9-NOT: m0 + define amdgpu_kernel void @local_zextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 { %ld = load <8 x i32>, <8 x i32> addrspace(3)* %in %ext = zext <8 x i32> %ld to <8 x i64> @@ -144,6 +187,9 @@ } ; FUNC-LABEL: 
{{^}}local_sextload_v8i32_to_v8i64: +; SICIVI: s_mov_b32 m0, -1 +; GFX9-NOT: m0 + define amdgpu_kernel void @local_sextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 { %ld = load <8 x i32>, <8 x i32> addrspace(3)* %in %ext = sext <8 x i32> %ld to <8 x i64> @@ -152,6 +198,9 @@ } ; FUNC-LABEL: {{^}}local_sextload_v16i32_to_v16i64: +; SICIVI: s_mov_b32 m0, -1 +; GFX9-NOT: m0 + define amdgpu_kernel void @local_sextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 { %ld = load <16 x i32>, <16 x i32> addrspace(3)* %in %ext = sext <16 x i32> %ld to <16 x i64> @@ -160,6 +209,9 @@ } ; FUNC-LABEL: {{^}}local_zextload_v16i32_to_v16i64 +; SICIVI: s_mov_b32 m0, -1 +; GFX9-NOT: m0 + define amdgpu_kernel void @local_zextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 { %ld = load <16 x i32>, <16 x i32> addrspace(3)* %in %ext = zext <16 x i32> %ld to <16 x i64> @@ -168,6 +220,9 @@ } ; FUNC-LABEL: {{^}}local_sextload_v32i32_to_v32i64: +; SICIVI: s_mov_b32 m0, -1 +; GFX9-NOT: m0 + define amdgpu_kernel void @local_sextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 { %ld = load <32 x i32>, <32 x i32> addrspace(3)* %in %ext = sext <32 x i32> %ld to <32 x i64> @@ -176,6 +231,9 @@ } ; FUNC-LABEL: {{^}}local_zextload_v32i32_to_v32i64: +; SICIVI: s_mov_b32 m0, -1 +; GFX9-NOT: m0 + define amdgpu_kernel void @local_zextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 { %ld = load <32 x i32>, <32 x i32> addrspace(3)* %in %ext = zext <32 x i32> %ld to <32 x i64> Index: test/CodeGen/AMDGPU/load-local-i64.ll =================================================================== --- test/CodeGen/AMDGPU/load-local-i64.ll +++ test/CodeGen/AMDGPU/load-local-i64.ll @@ -1,9 +1,13 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -mtriple=amdgcn--amdhsa 
-mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG,FUNC %s ; FUNC-LABEL: {{^}}local_load_i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_read_b64 [[VAL:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}{{$}} ; GCN: ds_write_b64 v{{[0-9]+}}, [[VAL]] @@ -16,6 +20,9 @@ } ; FUNC-LABEL: {{^}}local_load_v2i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_read2_b64 ; EG: LDS_READ_RET @@ -30,6 +37,9 @@ } ; FUNC-LABEL: {{^}}local_load_v3i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN-DAG: ds_read2_b64 ; GCN-DAG: ds_read_b64 @@ -47,6 +57,9 @@ } ; FUNC-LABEL: {{^}}local_load_v4i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_read2_b64 ; GCN: ds_read2_b64 @@ -67,6 +80,9 @@ } ; FUNC-LABEL: {{^}}local_load_v8i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_read2_b64 ; GCN: ds_read2_b64 ; GCN: ds_read2_b64 @@ -96,6 +112,9 @@ } ; FUNC-LABEL: {{^}}local_load_v16i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_read2_b64 ; GCN: ds_read2_b64 ; GCN: ds_read2_b64 Index: test/CodeGen/AMDGPU/load-local-i8.ll =================================================================== --- test/CodeGen/AMDGPU/load-local-i8.ll +++ test/CodeGen/AMDGPU/load-local-i8.ll @@ -1,11 +1,13 @@ -; RUN: llc -march=amdgcn 
-mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s -; RUN: llc -march=r600 -mtriple=r600---amdgiz -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,SICIVI,FUNC %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,SICIVI,FUNC %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s +; RUN: llc -march=r600 -mtriple=r600---amdgiz -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}local_load_i8: ; GCN-NOT: s_wqm_b64 -; GCN: s_mov_b32 m0 +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 ; GCN: ds_read_u8 ; EG: LDS_UBYTE_READ_RET @@ -18,7 +20,8 @@ ; FUNC-LABEL: {{^}}local_load_v2i8: ; GCN-NOT: s_wqm_b64 -; GCN: s_mov_b32 m0 +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 ; GCN: ds_read_u16 ; EG: LDS_USHORT_READ_RET @@ -30,6 +33,7 @@ } ; FUNC-LABEL: {{^}}local_load_v3i8: +; GFX9-NOT: m0 ; GCN: ds_read_b32 ; EG: DS_READ_RET @@ -41,6 +45,7 @@ } ; FUNC-LABEL: {{^}}local_load_v4i8: +; GFX9-NOT: m0 ; GCN: ds_read_b32 ; EG: LDS_READ_RET @@ -52,6 +57,7 @@ } ; FUNC-LABEL: {{^}}local_load_v8i8: +; GFX9-NOT: m0 ; GCN: ds_read_b64 ; EG: LDS_READ_RET @@ -64,6 +70,7 @@ } ; FUNC-LABEL: {{^}}local_load_v16i8: +; GFX9-NOT: m0 ; GCN: ds_read2_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}} ; GCN: ds_write2_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:{{[0-9]+}}], v[{{[0-9]+}}:[[HI]]{{\]}} offset1:1{{$}} @@ -79,8 +86,9 @@ } ; FUNC-LABEL: {{^}}local_zextload_i8_to_i32: +; GFX9-NOT: m0 ; GCN-NOT: s_wqm_b64 -; GCN: s_mov_b32 m0 +; SICIVI: s_mov_b32 m0 ; GCN:
ds_read_u8 ; EG: LDS_UBYTE_READ_RET @@ -93,7 +101,8 @@ ; FUNC-LABEL: {{^}}local_sextload_i8_to_i32: ; GCN-NOT: s_wqm_b64 -; GCN: s_mov_b32 m0 +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 ; GCN: ds_read_i8 ; EG: LDS_UBYTE_READ_RET @@ -116,6 +125,7 @@ } ; FUNC-LABEL: {{^}}local_sextload_v1i8_to_v1i32: +; GFX9-NOT: m0 ; EG: LDS_UBYTE_READ_RET ; EG: BFE_INT @@ -127,6 +137,7 @@ } ; FUNC-LABEL: {{^}}local_zextload_v2i8_to_v2i32: +; GFX9-NOT: m0 ; GCN: ds_read_u16 ; EG: LDS_USHORT_READ_RET @@ -139,7 +150,8 @@ ; FUNC-LABEL: {{^}}local_sextload_v2i8_to_v2i32: ; GCN-NOT: s_wqm_b64 -; GCN: s_mov_b32 m0 +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 ; GCN: ds_read_u16 ; FIXME: Need to optimize this sequence to avoid extra shift on VI. ; t23: i16 = srl t39, Constant:i32<8> @@ -164,6 +176,7 @@ } ; FUNC-LABEL: {{^}}local_zextload_v3i8_to_v3i32: +; GFX9-NOT: m0 ; GCN: ds_read_b32 ; SI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8 @@ -182,7 +195,8 @@ ; FUNC-LABEL: {{^}}local_sextload_v3i8_to_v3i32: ; GCN-NOT: s_wqm_b64 -; GCN: s_mov_b32 m0 +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 ; GCN: ds_read_b32 ; GCN-DAG: v_bfe_i32 @@ -207,7 +221,8 @@ ; FUNC-LABEL: {{^}}local_zextload_v4i8_to_v4i32: ; GCN-NOT: s_wqm_b64 -; GCN: s_mov_b32 m0 +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 ; GCN: ds_read_b32 ; EG: LDS_READ_RET @@ -223,7 +238,8 @@ ; FUNC-LABEL: {{^}}local_sextload_v4i8_to_v4i32: ; GCN-NOT: s_wqm_b64 -; GCN: s_mov_b32 m0 +; GFX9-NOT: m0 +; SICIVI: s_mov_b32 m0 ; GCN: ds_read_b32 ; EG-DAG: LDS_READ_RET @@ -239,6 +255,8 @@ } ; FUNC-LABEL: {{^}}local_zextload_v8i8_to_v8i32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 ; EG-DAG: LDS_READ_RET ; EG-DAG: LDS_READ_RET @@ -256,6 +274,8 @@ } ; FUNC-LABEL: {{^}}local_sextload_v8i8_to_v8i32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 ; EG-DAG: LDS_READ_RET ; EG-DAG: LDS_READ_RET @@ -275,6 +295,8 @@ } ; FUNC-LABEL: {{^}}local_zextload_v16i8_to_v16i32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 ; EG-DAG: LDS_READ_RET ; EG-DAG: LDS_READ_RET @@ -300,6 +322,8 @@ } ; FUNC-LABEL: 
{{^}}local_sextload_v16i8_to_v16i32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 ; EG-DAG: LDS_READ_RET ; EG-DAG: LDS_READ_RET @@ -329,6 +353,8 @@ } ; FUNC-LABEL: {{^}}local_zextload_v32i8_to_v32i32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 ; EG-DAG: LDS_READ_RET ; EG-DAG: LDS_READ_RET @@ -346,6 +372,8 @@ } ; FUNC-LABEL: {{^}}local_sextload_v32i8_to_v32i32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 ; EG-DAG: LDS_READ_RET ; EG-DAG: LDS_READ_RET @@ -363,6 +391,8 @@ } ; FUNC-LABEL: {{^}}local_zextload_v64i8_to_v64i32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 ; EG-DAG: LDS_READ_RET ; EG-DAG: LDS_READ_RET @@ -388,6 +418,8 @@ } ; FUNC-LABEL: {{^}}local_sextload_v64i8_to_v64i32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 ; EG-DAG: LDS_READ_RET ; EG-DAG: LDS_READ_RET @@ -413,6 +445,9 @@ } ; FUNC-LABEL: {{^}}local_zextload_i8_to_i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} ; GCN-DAG: ds_read_u8 v[[LO:[0-9]+]], ; GCN: ds_write_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]] @@ -428,6 +463,9 @@ } ; FUNC-LABEL: {{^}}local_sextload_i8_to_i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_read_i8 v[[LO:[0-9]+]], ; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] @@ -445,6 +483,8 @@ } ; FUNC-LABEL: {{^}}local_zextload_v1i8_to_v1i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 ; EG: LDS_UBYTE_READ_RET ; EG: MOV {{.*}}, literal @@ -458,6 +498,8 @@ } ; FUNC-LABEL: {{^}}local_sextload_v1i8_to_v1i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 ; EG: LDS_UBYTE_READ_RET ; EG: ASHR @@ -471,6 +513,8 @@ } ; FUNC-LABEL: {{^}}local_zextload_v2i8_to_v2i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 ; EG: LDS_USHORT_READ_RET define amdgpu_kernel void @local_zextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { @@ -481,6 +525,8 @@ } ; FUNC-LABEL: {{^}}local_sextload_v2i8_to_v2i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 ; EG: LDS_USHORT_READ_RET ; EG: BFE_INT @@ -493,6 +539,8 @@ } ; FUNC-LABEL: {{^}}local_zextload_v4i8_to_v4i64: +; 
SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 ; EG: LDS_READ_RET define amdgpu_kernel void @local_zextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { @@ -503,6 +551,8 @@ } ; FUNC-LABEL: {{^}}local_sextload_v4i8_to_v4i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 ; EG: LDS_READ_RET define amdgpu_kernel void @local_sextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { @@ -513,6 +563,8 @@ } ; FUNC-LABEL: {{^}}local_zextload_v8i8_to_v8i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -524,6 +576,8 @@ } ; FUNC-LABEL: {{^}}local_sextload_v8i8_to_v8i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -544,6 +598,8 @@ } ; FUNC-LABEL: {{^}}local_zextload_v16i8_to_v16i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -557,6 +613,8 @@ } ; FUNC-LABEL: {{^}}local_sextload_v16i8_to_v16i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -570,6 +628,8 @@ } ; FUNC-LABEL: {{^}}local_zextload_v32i8_to_v32i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -587,6 +647,8 @@ } ; FUNC-LABEL: {{^}}local_sextload_v32i8_to_v32i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -620,6 +682,8 @@ ; } ; FUNC-LABEL: {{^}}local_zextload_i8_to_i16: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 ; GCN: ds_read_u8 v[[VAL:[0-9]+]], ; GCN: ds_write_b16 v[[VAL:[0-9]+]] @@ -633,6 +697,8 @@ } ; FUNC-LABEL: {{^}}local_sextload_i8_to_i16: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 ; GCN: ds_read_i8 v[[VAL:[0-9]+]], ; GCN: ds_write_b16 v{{[0-9]+}}, v[[VAL]] @@ -647,6 +713,8 @@ } ; FUNC-LABEL: {{^}}local_zextload_v1i8_to_v1i16: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 ; EG: LDS_UBYTE_READ_RET ; EG: LDS_SHORT_WRITE @@ -658,6 +726,8 @@ } ; FUNC-LABEL: {{^}}local_sextload_v1i8_to_v1i16: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 ; EG: LDS_UBYTE_READ_RET ; EG: BFE_INT 
@@ -670,6 +740,8 @@ } ; FUNC-LABEL: {{^}}local_zextload_v2i8_to_v2i16: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 ; EG: LDS_USHORT_READ_RET ; EG: LDS_WRITE @@ -681,6 +753,8 @@ } ; FUNC-LABEL: {{^}}local_sextload_v2i8_to_v2i16: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 ; EG: LDS_USHORT_READ_RET ; EG: BFE_INT @@ -694,6 +768,8 @@ } ; FUNC-LABEL: {{^}}local_zextload_v4i8_to_v4i16: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 ; EG: LDS_READ_RET ; EG: LDS_WRITE @@ -706,6 +782,8 @@ } ; FUNC-LABEL: {{^}}local_sextload_v4i8_to_v4i16: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 ; EG: LDS_READ_RET ; TODO: these do LSHR + BFE_INT, instead of just BFE_INT/ASHR @@ -723,6 +801,8 @@ } ; FUNC-LABEL: {{^}}local_zextload_v8i8_to_v8i16: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -738,6 +818,8 @@ } ; FUNC-LABEL: {{^}}local_sextload_v8i8_to_v8i16: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -762,6 +844,8 @@ } ; FUNC-LABEL: {{^}}local_zextload_v16i8_to_v16i16: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -783,6 +867,8 @@ } ; FUNC-LABEL: {{^}}local_sextload_v16i8_to_v16i16: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -821,6 +907,8 @@ } ; FUNC-LABEL: {{^}}local_zextload_v32i8_to_v32i16: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 ; EG: LDS_READ_RET ; EG: LDS_READ_RET @@ -854,6 +942,8 @@ } ; FUNC-LABEL: {{^}}local_sextload_v32i8_to_v32i16: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 ; EG: LDS_READ_RET ; EG: LDS_READ_RET Index: test/CodeGen/AMDGPU/local-64.ll =================================================================== --- test/CodeGen/AMDGPU/local-64.ll +++ test/CodeGen/AMDGPU/local-64.ll @@ -1,10 +1,14 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck --check-prefix=SI --check-prefix=BOTH %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=BOTH %s -; RUN: llc -march=amdgcn 
-mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=BOTH %s +; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SICIVI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs< %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,SICIVI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SICIVI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s -; BOTH-LABEL: {{^}}local_i32_load -; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} offset:28 -; BOTH: buffer_store_dword [[REG]], +; GCN-LABEL: {{^}}local_i32_load +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} offset:28 +; GCN: buffer_store_dword [[REG]], define amdgpu_kernel void @local_i32_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %in, i32 7 %val = load i32, i32 addrspace(3)* %gep, align 4 @@ -12,19 +16,25 @@ ret void } -; BOTH-LABEL: {{^}}local_i32_load_0_offset -; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} -; BOTH: buffer_store_dword [[REG]], +; GCN-LABEL: {{^}}local_i32_load_0_offset +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} +; GCN: buffer_store_dword [[REG]], define amdgpu_kernel void @local_i32_load_0_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind { %val = load i32, i32 addrspace(3)* %in, align 4 store i32 %val, i32 addrspace(1)* %out, align 4 ret void } -; BOTH-LABEL: {{^}}local_i8_load_i16_max_offset: -; BOTH-NOT: ADD -; BOTH: ds_read_u8 [[REG:v[0-9]+]], {{v[0-9]+}} offset:65535 -; BOTH: buffer_store_byte [[REG]], +; GCN-LABEL: {{^}}local_i8_load_i16_max_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-NOT: add +; GCN: 
ds_read_u8 [[REG:v[0-9]+]], {{v[0-9]+}} offset:65535 +; GCN: buffer_store_byte [[REG]], define amdgpu_kernel void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind { %gep = getelementptr i8, i8 addrspace(3)* %in, i32 65535 %val = load i8, i8 addrspace(3)* %gep, align 4 @@ -32,14 +42,20 @@ ret void } -; BOTH-LABEL: {{^}}local_i8_load_over_i16_max_offset: +; GCN-LABEL: {{^}}local_i8_load_over_i16_max_offset: +; SICIVI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + ; The LDS offset will be 65536 bytes, which is larger than the size of LDS on ; SI, which is why it is being OR'd with the base pointer. -; SI: s_or_b32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000 -; CI: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000 -; BOTH: v_mov_b32_e32 [[VREGADDR:v[0-9]+]], [[ADDR]] -; BOTH: ds_read_u8 [[REG:v[0-9]+]], [[VREGADDR]] -; BOTH: buffer_store_byte [[REG]], +; SI-DAG: s_or_b32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000 +; CI-DAG: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000 +; VI-DAG: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000 +; GFX9-DAG: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000 + +; GCN-DAG: v_mov_b32_e32 [[VREGADDR:v[0-9]+]], [[ADDR]] +; GCN: ds_read_u8 [[REG:v[0-9]+]], [[VREGADDR]] +; GCN: buffer_store_byte [[REG]], define amdgpu_kernel void @local_i8_load_over_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind { %gep = getelementptr i8, i8 addrspace(3)* %in, i32 65536 %val = load i8, i8 addrspace(3)* %gep, align 4 @@ -47,10 +63,13 @@ ret void } -; BOTH-LABEL: {{^}}local_i64_load: -; BOTH-NOT: ADD -; BOTH: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56 -; BOTH: buffer_store_dwordx2 [[REG]], +; GCN-LABEL: {{^}}local_i64_load: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-NOT: add +; GCN: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56 +; GCN: buffer_store_dwordx2 [[REG]], define amdgpu_kernel void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind { %gep = getelementptr i64, 
i64 addrspace(3)* %in, i32 7 %val = load i64, i64 addrspace(3)* %gep, align 8 @@ -58,19 +77,25 @@ ret void } -; BOTH-LABEL: {{^}}local_i64_load_0_offset -; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} -; BOTH: buffer_store_dwordx2 [[REG]], +; GCN-LABEL: {{^}}local_i64_load_0_offset +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} +; GCN: buffer_store_dwordx2 [[REG]], define amdgpu_kernel void @local_i64_load_0_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind { %val = load i64, i64 addrspace(3)* %in, align 8 store i64 %val, i64 addrspace(1)* %out, align 8 ret void } -; BOTH-LABEL: {{^}}local_f64_load: -; BOTH-NOT: ADD -; BOTH: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56 -; BOTH: buffer_store_dwordx2 [[REG]], +; GCN-LABEL: {{^}}local_f64_load: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-NOT: add +; GCN: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56 +; GCN: buffer_store_dwordx2 [[REG]], define amdgpu_kernel void @local_f64_load(double addrspace(1)* %out, double addrspace(3)* %in) nounwind { %gep = getelementptr double, double addrspace(3)* %in, i32 7 %val = load double, double addrspace(3)* %gep, align 8 @@ -78,83 +103,110 @@ ret void } -; BOTH-LABEL: {{^}}local_f64_load_0_offset -; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} -; BOTH: buffer_store_dwordx2 [[REG]], +; GCN-LABEL: {{^}}local_f64_load_0_offset +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} +; GCN: buffer_store_dwordx2 [[REG]], define amdgpu_kernel void @local_f64_load_0_offset(double addrspace(1)* %out, double addrspace(3)* %in) nounwind { %val = load double, double addrspace(3)* %in, align 8 store double %val, double addrspace(1)* %out, align 8 ret void } -; BOTH-LABEL: {{^}}local_i64_store: -; BOTH-NOT: ADD -; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56 +; GCN-LABEL: {{^}}local_i64_store: +; SICIVI: 
s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-NOT: add +; GCN: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56 define amdgpu_kernel void @local_i64_store(i64 addrspace(3)* %out) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %out, i32 7 store i64 5678, i64 addrspace(3)* %gep, align 8 ret void } -; BOTH-LABEL: {{^}}local_i64_store_0_offset: -; BOTH-NOT: ADD -; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} +; GCN-LABEL: {{^}}local_i64_store_0_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-NOT: add +; GCN: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @local_i64_store_0_offset(i64 addrspace(3)* %out) nounwind { store i64 1234, i64 addrspace(3)* %out, align 8 ret void } -; BOTH-LABEL: {{^}}local_f64_store: -; BOTH-NOT: ADD -; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56 +; GCN-LABEL: {{^}}local_f64_store: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-NOT: add +; GCN: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56 define amdgpu_kernel void @local_f64_store(double addrspace(3)* %out) nounwind { %gep = getelementptr double, double addrspace(3)* %out, i32 7 store double 16.0, double addrspace(3)* %gep, align 8 ret void } -; BOTH-LABEL: {{^}}local_f64_store_0_offset -; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} +; GCN-LABEL: {{^}}local_f64_store_0_offset +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind { store double 20.0, double addrspace(3)* %out, align 8 ret void } -; BOTH-LABEL: {{^}}local_v2i64_store: -; BOTH-NOT: ADD -; BOTH: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15 -; BOTH: s_endpgm +; GCN-LABEL: {{^}}local_v2i64_store: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-NOT: add +; GCN: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:14 
offset1:15 +; GCN: s_endpgm define amdgpu_kernel void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind { %gep = getelementptr <2 x i64>, <2 x i64> addrspace(3)* %out, i32 7 store <2 x i64> , <2 x i64> addrspace(3)* %gep, align 16 ret void } -; BOTH-LABEL: {{^}}local_v2i64_store_0_offset: -; BOTH-NOT: ADD -; BOTH: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:1 -; BOTH: s_endpgm +; GCN-LABEL: {{^}}local_v2i64_store_0_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-NOT: add +; GCN: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:1 +; GCN: s_endpgm define amdgpu_kernel void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind { store <2 x i64> , <2 x i64> addrspace(3)* %out, align 16 ret void } -; BOTH-LABEL: {{^}}local_v4i64_store: -; BOTH-NOT: ADD -; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:30 offset1:31 -; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:28 offset1:29 -; BOTH: s_endpgm +; GCN-LABEL: {{^}}local_v4i64_store: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-NOT: add +; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:30 offset1:31 +; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:28 offset1:29 +; GCN: s_endpgm define amdgpu_kernel void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind { %gep = getelementptr <4 x i64>, <4 x i64> addrspace(3)* %out, i32 7 store <4 x i64> , <4 x i64> addrspace(3)* %gep, align 16 ret void } -; BOTH-LABEL: {{^}}local_v4i64_store_0_offset: -; BOTH-NOT: ADD -; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3 -; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:1 -; BOTH: s_endpgm +; GCN-LABEL: {{^}}local_v4i64_store_0_offset: +; SICIVI: 
s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-NOT: add +; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3 +; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:1 +; GCN: s_endpgm define amdgpu_kernel void @local_v4i64_store_0_offset(<4 x i64> addrspace(3)* %out) nounwind { store <4 x i64> , <4 x i64> addrspace(3)* %out, align 16 ret void Index: test/CodeGen/AMDGPU/local-atomics.ll =================================================================== --- test/CodeGen/AMDGPU/local-atomics.ll +++ test/CodeGen/AMDGPU/local-atomics.ll @@ -1,13 +1,18 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SICIVI,FUNC %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,SICIVI,FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,SICIVI,FUNC %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=EG,FUNC %s ; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i32: ; EG: LDS_WRXCHG_RET * -; GCN: s_load_dword [[SPTR:s[0-9]+]], -; GCN: v_mov_b32_e32
[[DATA:v[0-9]+]], 4 -; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] + +; SICIVI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-DAG: s_load_dword [[SPTR:s[0-9]+]], +; GCN-DAG: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 +; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] ; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm @@ -18,6 +23,9 @@ } ; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i32_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; EG: LDS_WRXCHG_RET * ; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm @@ -31,9 +39,13 @@ ; XXX - Is it really necessary to load 4 into VGPR? ; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32: ; EG: LDS_ADD_RET * -; GCN: s_load_dword [[SPTR:s[0-9]+]], -; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 -; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] + +; SICIVI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-DAG: s_load_dword [[SPTR:s[0-9]+]], +; GCN-DAG: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 +; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] ; GCN: ds_add_rtn_u32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm @@ -44,6 +56,9 @@ } ; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; EG: LDS_ADD_RET * ; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm @@ -55,6 +70,9 @@ } ; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32_bad_si_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; EG: LDS_ADD_RET * ; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; CIVI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 @@ -70,7 +88,11 @@ ; FUNC-LABEL: {{^}}lds_atomic_add1_ret_i32: ; EG: LDS_ADD_RET * -; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}} + +; SICIVI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}} ; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]] ; GCN: s_endpgm define amdgpu_kernel void 
@lds_atomic_add1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { @@ -81,7 +103,11 @@ ; FUNC-LABEL: {{^}}lds_atomic_add1_ret_i32_offset: ; EG: LDS_ADD_RET * -; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}} + +; SICIVI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}} ; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]] offset:16 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_add1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { @@ -92,6 +118,9 @@ } ; FUNC-LABEL: {{^}}lds_atomic_add1_ret_i32_bad_si_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; EG: LDS_ADD_RET * ; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; CIVI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 @@ -107,6 +136,10 @@ ; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i32: ; EG: LDS_SUB_RET * + +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_sub_rtn_u32 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_sub_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { @@ -117,6 +150,10 @@ ; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i32_offset: ; EG: LDS_SUB_RET * + +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_sub_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { @@ -128,7 +165,11 @@ ; FUNC-LABEL: {{^}}lds_atomic_sub1_ret_i32: ; EG: LDS_SUB_RET * -; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}} + +; SICIVI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}} ; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]] ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_sub1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { @@ -139,7 +180,11 @@ ; FUNC-LABEL: {{^}}lds_atomic_sub1_ret_i32_offset: ; EG: LDS_SUB_RET * -; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}} + +; SICIVI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + 
+; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}} ; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]] offset:16 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_sub1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { @@ -151,6 +196,10 @@ ; FUNC-LABEL: {{^}}lds_atomic_and_ret_i32: ; EG: LDS_AND_RET * + +; SICIVI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_and_rtn_b32 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_and_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { @@ -160,6 +209,9 @@ } ; FUNC-LABEL: {{^}}lds_atomic_and_ret_i32_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; EG: LDS_AND_RET * ; GCN: ds_and_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm @@ -171,6 +223,9 @@ } ; FUNC-LABEL: {{^}}lds_atomic_or_ret_i32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; EG: LDS_OR_RET * ; GCN: ds_or_rtn_b32 ; GCN: s_endpgm @@ -181,6 +236,9 @@ } ; FUNC-LABEL: {{^}}lds_atomic_or_ret_i32_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; EG: LDS_OR_RET * ; GCN: ds_or_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm @@ -192,6 +250,9 @@ } ; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; EG: LDS_XOR_RET * ; GCN: ds_xor_rtn_b32 ; GCN: s_endpgm @@ -202,6 +263,9 @@ } ; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i32_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; EG: LDS_XOR_RET * ; GCN: ds_xor_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm @@ -221,6 +285,9 @@ ; } ; FUNC-LABEL: {{^}}lds_atomic_min_ret_i32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; EG: LDS_MIN_INT_RET * ; GCN: ds_min_rtn_i32 ; GCN: s_endpgm @@ -231,6 +298,9 @@ } ; FUNC-LABEL: {{^}}lds_atomic_min_ret_i32_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; EG: LDS_MIN_INT_RET * ; GCN: ds_min_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm @@ -242,6 +312,9 @@ } ; FUNC-LABEL: {{^}}lds_atomic_max_ret_i32: +; SICIVI: s_mov_b32 m0 +; 
GFX9-NOT: m0 + ; EG: LDS_MAX_INT_RET * ; GCN: ds_max_rtn_i32 ; GCN: s_endpgm @@ -252,6 +325,9 @@ } ; FUNC-LABEL: {{^}}lds_atomic_max_ret_i32_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; EG: LDS_MAX_INT_RET * ; GCN: ds_max_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm @@ -263,6 +339,9 @@ } ; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; EG: LDS_MIN_UINT_RET * ; GCN: ds_min_rtn_u32 ; GCN: s_endpgm @@ -273,6 +352,9 @@ } ; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i32_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; EG: LDS_MIN_UINT_RET * ; GCN: ds_min_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm @@ -284,6 +366,9 @@ } ; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; EG: LDS_MAX_UINT_RET * ; GCN: ds_max_rtn_u32 ; GCN: s_endpgm @@ -294,6 +379,9 @@ } ; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i32_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; EG: LDS_MAX_UINT_RET * ; GCN: ds_max_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm @@ -305,9 +393,12 @@ } ; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i32: -; GCN: s_load_dword [[SPTR:s[0-9]+]], -; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 -; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] +; SICIVI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-DAG: s_load_dword [[SPTR:s[0-9]+]], +; GCN-DAG: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 +; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] ; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_xchg_noret_i32(i32 addrspace(3)* %ptr) nounwind { @@ -316,6 +407,9 @@ } ; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i32_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_xchg_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { @@ -325,9 +419,12 @@ } ; FUNC-LABEL: 
{{^}}lds_atomic_add_noret_i32: -; GCN: s_load_dword [[SPTR:s[0-9]+]], -; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 -; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] +; SICIVI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-DAG: s_load_dword [[SPTR:s[0-9]+]], +; GCN-DAG: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 +; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] ; GCN: ds_add_u32 [[VPTR]], [[DATA]] ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_add_noret_i32(i32 addrspace(3)* %ptr) nounwind { @@ -336,6 +433,9 @@ } ; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_add_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { @@ -345,6 +445,9 @@ } ; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32_bad_si_offset +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} ; CIVI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm @@ -357,7 +460,10 @@ } ; FUNC-LABEL: {{^}}lds_atomic_add1_noret_i32: -; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}} +; SICIVI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}} ; GCN: ds_add_u32 v{{[0-9]+}}, [[ONE]] ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_add1_noret_i32(i32 addrspace(3)* %ptr) nounwind { @@ -366,7 +472,10 @@ } ; FUNC-LABEL: {{^}}lds_atomic_add1_noret_i32_offset: -; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}} +; SICIVI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}} ; GCN: ds_add_u32 v{{[0-9]+}}, [[ONE]] offset:16 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_add1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { @@ -376,6 +485,9 @@ } ; FUNC-LABEL: {{^}}lds_atomic_add1_noret_i32_bad_si_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} ; CIVI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm @@ -388,6 +500,9 @@ } ; 
FUNC-LABEL: {{^}}lds_atomic_sub_noret_i32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_sub_u32 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_sub_noret_i32(i32 addrspace(3)* %ptr) nounwind { @@ -396,6 +511,9 @@ } ; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i32_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_sub_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { @@ -405,7 +523,10 @@ } ; FUNC-LABEL: {{^}}lds_atomic_sub1_noret_i32: -; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}} +; SICIVI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}} ; GCN: ds_sub_u32 v{{[0-9]+}}, [[ONE]] ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_sub1_noret_i32(i32 addrspace(3)* %ptr) nounwind { @@ -414,7 +535,10 @@ } ; FUNC-LABEL: {{^}}lds_atomic_sub1_noret_i32_offset: -; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}} +; SICIVI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}} ; GCN: ds_sub_u32 v{{[0-9]+}}, [[ONE]] offset:16 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_sub1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { @@ -424,6 +548,9 @@ } ; FUNC-LABEL: {{^}}lds_atomic_and_noret_i32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_and_b32 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_and_noret_i32(i32 addrspace(3)* %ptr) nounwind { @@ -432,6 +559,9 @@ } ; FUNC-LABEL: {{^}}lds_atomic_and_noret_i32_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_and_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_and_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { @@ -441,6 +571,9 @@ } ; FUNC-LABEL: {{^}}lds_atomic_or_noret_i32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_or_b32 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_or_noret_i32(i32 addrspace(3)* %ptr) nounwind { @@ -449,6 +582,9 @@ } ; FUNC-LABEL: 
{{^}}lds_atomic_or_noret_i32_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_or_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_or_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { @@ -458,6 +594,9 @@ } ; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_xor_b32 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_xor_noret_i32(i32 addrspace(3)* %ptr) nounwind { @@ -466,6 +605,9 @@ } ; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i32_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_xor_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_xor_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { @@ -482,6 +624,9 @@ ; } ; FUNC-LABEL: {{^}}lds_atomic_min_noret_i32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_min_i32 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_min_noret_i32(i32 addrspace(3)* %ptr) nounwind { @@ -490,6 +635,9 @@ } ; FUNC-LABEL: {{^}}lds_atomic_min_noret_i32_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_min_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_min_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { @@ -499,6 +647,9 @@ } ; FUNC-LABEL: {{^}}lds_atomic_max_noret_i32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_max_i32 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_max_noret_i32(i32 addrspace(3)* %ptr) nounwind { @@ -507,6 +658,9 @@ } ; FUNC-LABEL: {{^}}lds_atomic_max_noret_i32_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_max_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_max_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { @@ -516,6 +670,9 @@ } ; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_min_u32 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_umin_noret_i32(i32 addrspace(3)* %ptr) nounwind 
{ @@ -524,6 +681,9 @@ } ; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i32_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_min_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_umin_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { @@ -533,6 +693,9 @@ } ; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_max_u32 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_umax_noret_i32(i32 addrspace(3)* %ptr) nounwind { @@ -541,6 +704,9 @@ } ; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i32_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_max_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_umax_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { Index: test/CodeGen/AMDGPU/local-atomics64.ll =================================================================== --- test/CodeGen/AMDGPU/local-atomics64.ll +++ test/CodeGen/AMDGPU/local-atomics64.ll @@ -1,7 +1,11 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=VI -check-prefix=GCN %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,SI,SICIVI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,VI,SICIVI,GFX89 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9,GFX89 %s ; GCN-LABEL: {{^}}lds_atomic_xchg_ret_i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_wrxchg_rtn_b64 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* 
%ptr) nounwind { @@ -11,6 +15,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_xchg_ret_i64_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { @@ -21,6 +28,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_add_ret_i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_add_rtn_u64 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { @@ -30,10 +40,13 @@ } ; GCN-LABEL: {{^}}lds_atomic_add_ret_i64_offset: -; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9 -; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0 +; SICIVI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; SI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; GCN-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9 +; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0 ; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] ; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32 ; GCN: buffer_store_dwordx2 [[RESULT]], @@ -46,9 +59,12 @@ } ; GCN-LABEL: {{^}}lds_atomic_add1_ret_i64: -; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 1{{$}} -; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}} -; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} +; SICIVI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 1{{$}} +; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}} +; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} ; GCN: buffer_store_dwordx2 [[RESULT]], ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_add1_ret_i64(i64 addrspace(1)* %out, i64 
addrspace(3)* %ptr) nounwind { @@ -58,6 +74,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_add1_ret_i64_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_add_rtn_u64 {{.*}} offset:32 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_add1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { @@ -68,6 +87,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_sub_ret_i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_sub_rtn_u64 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_sub_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { @@ -77,6 +99,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_sub_ret_i64_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_sub_rtn_u64 {{.*}} offset:32 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { @@ -87,9 +112,12 @@ } ; GCN-LABEL: {{^}}lds_atomic_sub1_ret_i64: -; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 1{{$}} -; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}} -; GCN: ds_sub_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} +; SICIVI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 1{{$}} +; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}} +; GCN: ds_sub_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} ; GCN: buffer_store_dwordx2 [[RESULT]], ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_sub1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { @@ -99,6 +127,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_sub1_ret_i64_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_sub_rtn_u64 {{.*}} offset:32 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_sub1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { @@ -109,6 +140,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_and_ret_i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_and_rtn_b64 ; GCN: s_endpgm define amdgpu_kernel void 
@lds_atomic_and_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { @@ -118,6 +152,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_and_ret_i64_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_and_rtn_b64 {{.*}} offset:32 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_and_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { @@ -128,6 +165,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_or_ret_i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_or_rtn_b64 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_or_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { @@ -137,6 +177,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_or_ret_i64_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_or_rtn_b64 {{.*}} offset:32 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_or_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { @@ -147,6 +190,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_xor_ret_i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_xor_rtn_b64 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_xor_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { @@ -156,6 +202,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_xor_ret_i64_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_xor_rtn_b64 {{.*}} offset:32 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { @@ -174,6 +223,9 @@ ; } ; GCN-LABEL: {{^}}lds_atomic_min_ret_i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_min_rtn_i64 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_min_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { @@ -183,6 +235,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_min_ret_i64_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_min_rtn_i64 {{.*}} offset:32 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_min_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { @@ -193,6 +248,9 @@ } ; 
GCN-LABEL: {{^}}lds_atomic_max_ret_i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_max_rtn_i64 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_max_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { @@ -202,6 +260,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_max_ret_i64_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_max_rtn_i64 {{.*}} offset:32 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_max_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { @@ -212,6 +273,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_umin_ret_i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_min_rtn_u64 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_umin_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { @@ -221,6 +285,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_umin_ret_i64_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_min_rtn_u64 {{.*}} offset:32 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_umin_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { @@ -231,6 +298,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_umax_ret_i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_max_rtn_u64 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_umax_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { @@ -240,6 +310,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_umax_ret_i64_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_max_rtn_u64 {{.*}} offset:32 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_umax_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { @@ -250,6 +323,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_xchg_noret_i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_wrxchg_rtn_b64 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_xchg_noret_i64(i64 addrspace(3)* %ptr) nounwind { @@ -258,6 +334,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_xchg_noret_i64_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32 ; GCN: 
s_endpgm define amdgpu_kernel void @lds_atomic_xchg_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { @@ -267,6 +346,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_add_noret_i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_add_u64 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) nounwind { @@ -275,12 +357,15 @@ } ; GCN-LABEL: {{^}}lds_atomic_add_noret_i64_offset: -; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9 -; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24 -; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9 -; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0 +; SICIVI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; SI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9 +; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24 +; GCN-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9 +; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0 ; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] -; GCN: ds_add_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32 +; GCN: ds_add_u64 {{v[0-9]+}}, v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i64 4 @@ -289,9 +374,12 @@ } ; GCN-LABEL: {{^}}lds_atomic_add1_noret_i64: +; SICIVI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 1{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}} -; GCN: ds_add_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} +; GCN: ds_add_u64 {{v[0-9]+}}, v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_add1_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst @@ -299,6 +387,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_add1_noret_i64_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_add_u64 {{.*}} offset:32 ; GCN: s_endpgm define amdgpu_kernel void 
@lds_atomic_add1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { @@ -308,6 +399,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_sub_noret_i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_sub_u64 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_sub_noret_i64(i64 addrspace(3)* %ptr) nounwind { @@ -316,6 +410,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_sub_noret_i64_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_sub_u64 {{.*}} offset:32 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_sub_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { @@ -325,9 +422,12 @@ } ; GCN-LABEL: {{^}}lds_atomic_sub1_noret_i64: -; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 1{{$}} -; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}} -; GCN: ds_sub_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} +; SICIVI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 + +; GCN-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 1{{$}} +; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}} +; GCN: ds_sub_u64 {{v[0-9]+}}, v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_sub1_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst @@ -335,6 +435,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_sub1_noret_i64_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_sub_u64 {{.*}} offset:32 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_sub1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { @@ -344,6 +447,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_and_noret_i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_and_b64 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_and_noret_i64(i64 addrspace(3)* %ptr) nounwind { @@ -352,6 +458,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_and_noret_i64_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_and_b64 {{.*}} offset:32 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_and_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { @@ -361,6 +470,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_or_noret_i64: +; 
SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_or_b64 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_or_noret_i64(i64 addrspace(3)* %ptr) nounwind { @@ -369,6 +481,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_or_noret_i64_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_or_b64 {{.*}} offset:32 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_or_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { @@ -378,6 +493,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_xor_noret_i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_xor_b64 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_xor_noret_i64(i64 addrspace(3)* %ptr) nounwind { @@ -386,6 +504,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_xor_noret_i64_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_xor_b64 {{.*}} offset:32 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { @@ -402,6 +523,9 @@ ; } ; GCN-LABEL: {{^}}lds_atomic_min_noret_i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_min_i64 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_min_noret_i64(i64 addrspace(3)* %ptr) nounwind { @@ -410,6 +534,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_min_noret_i64_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_min_i64 {{.*}} offset:32 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_min_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { @@ -419,6 +546,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_max_noret_i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_max_i64 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_max_noret_i64(i64 addrspace(3)* %ptr) nounwind { @@ -427,6 +557,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_max_noret_i64_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_max_i64 {{.*}} offset:32 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_max_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { @@ -436,6 +569,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_umin_noret_i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: 
ds_min_u64 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_umin_noret_i64(i64 addrspace(3)* %ptr) nounwind { @@ -444,6 +580,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_umin_noret_i64_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_min_u64 {{.*}} offset:32 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_umin_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { @@ -453,6 +592,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_umax_noret_i64: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_max_u64 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_umax_noret_i64(i64 addrspace(3)* %ptr) nounwind { @@ -461,6 +603,9 @@ } ; GCN-LABEL: {{^}}lds_atomic_umax_noret_i64_offset: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; GCN: ds_max_u64 {{.*}} offset:32 ; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_umax_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { Index: test/CodeGen/AMDGPU/packed-op-sel.ll =================================================================== --- test/CodeGen/AMDGPU/packed-op-sel.ll +++ test/CodeGen/AMDGPU/packed-op-sel.ll @@ -233,7 +233,7 @@ ; GCN: ds_read_b32 [[VEC0:v[0-9]+]] ; GCN: ds_read_b32 [[VEC1:v[0-9]+]] ; GCN: ds_read_u16 [[PACKED:v[0-9]+]] -; GCN-NEXT: s_waitcnt +; GCN: s_waitcnt ; GCN: ds_read_u16_d16_hi [[PACKED]] ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[PACKED]] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}} Index: test/CodeGen/AMDGPU/store-local.ll =================================================================== --- test/CodeGen/AMDGPU/store-local.ll +++ test/CodeGen/AMDGPU/store-local.ll @@ -1,9 +1,13 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=r600 -mtriple=r600---amdgiz -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc 
-march=r600 -mtriple=r600---amdgiz -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s +; RUN: llc -march=r600 -mtriple=r600---amdgiz -mcpu=redwood < %s | FileCheck -check-prefixes=EG,FUNC %s +; RUN: llc -march=r600 -mtriple=r600---amdgiz -mcpu=cayman < %s | FileCheck -check-prefixes=CM,FUNC %s ; FUNC-LABEL: {{^}}store_local_i1: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; EG: LDS_BYTE_WRITE ; CM: LDS_BYTE_WRITE @@ -16,6 +20,9 @@ } ; FUNC-LABEL: {{^}}store_local_i8: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; EG: LDS_BYTE_WRITE ; CM: LDS_BYTE_WRITE @@ -27,6 +34,9 @@ } ; FUNC-LABEL: {{^}}store_local_i16: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; EG: LDS_SHORT_WRITE ; CM: LDS_SHORT_WRITE @@ -38,6 +48,9 @@ } ; FUNC-LABEL: {{^}}store_local_v2i16: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; EG: LDS_WRITE ; CM: LDS_WRITE @@ -50,6 +63,9 @@ } ; FUNC-LABEL: {{^}}store_local_v4i8: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; EG: LDS_WRITE ; CM: LDS_WRITE @@ -62,6 +78,9 @@ } ; FUNC-LABEL: {{^}}store_local_v4i8_unaligned: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; EG: LDS_BYTE_WRITE ; EG: LDS_BYTE_WRITE ; EG: LDS_BYTE_WRITE @@ -85,6 +104,9 @@ } ; FUNC-LABEL: {{^}}store_local_v4i8_halfaligned: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; EG: LDS_SHORT_WRITE ; EG: LDS_SHORT_WRITE ; EG-NOT: LDS_WRITE @@ -102,6 +124,9 @@ } ; FUNC-LABEL: {{^}}store_local_v2i32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; EG: LDS_WRITE ; EG: LDS_WRITE ; EG-NOT: LDS_WRITE @@ -118,6 +143,9 @@ } ; FUNC-LABEL: {{^}}store_local_v4i32: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; EG: 
LDS_WRITE ; EG: LDS_WRITE ; EG: LDS_WRITE @@ -136,6 +164,9 @@ } ; FUNC-LABEL: {{^}}store_local_v4i32_align4: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; EG: LDS_WRITE ; EG: LDS_WRITE ; EG: LDS_WRITE @@ -155,6 +186,9 @@ } ; FUNC-LABEL: {{^}}store_local_i64_i8: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; EG: LDS_BYTE_WRITE ; GCN: ds_write_b8 define amdgpu_kernel void @store_local_i64_i8(i8 addrspace(3)* %out, i64 %in) { @@ -165,6 +199,9 @@ } ; FUNC-LABEL: {{^}}store_local_i64_i16: +; SICIVI: s_mov_b32 m0 +; GFX9-NOT: m0 + ; EG: LDS_SHORT_WRITE ; GCN: ds_write_b16 define amdgpu_kernel void @store_local_i64_i16(i16 addrspace(3)* %out, i64 %in) {