diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -1288,7 +1288,7 @@ AMD_HSA_BITS_SET(Out.code_properties, AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE, - getElementByteSizeValue(STM.getMaxPrivateElementSize())); + getElementByteSizeValue(STM.getMaxPrivateElementSize(true))); if (MFI->hasPrivateSegmentBuffer()) { Out.code_properties |= diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -493,8 +493,8 @@ return LDSBankCount; } - unsigned getMaxPrivateElementSize() const { - return MaxPrivateElementSize; + unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const { + return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16; } unsigned getConstantBusLimit(unsigned Opcode) const; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -8604,7 +8604,8 @@ return SplitVectorStore(Op, DAG); return SDValue(); case 16: - if (NumElements > 4 || NumElements == 3) + if (NumElements > 4 || + (NumElements == 3 && !Subtarget->enableFlatScratch())) return SplitVectorStore(Op, DAG); return SDValue(); default: diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -6569,7 +6569,7 @@ // GFX9 doesn't have ELEMENT_SIZE. 
if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1; + uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1; Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT; } diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -6,40 +6,23 @@ ; GFX9-LABEL: zero_init_kernel: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_mov_b32 s1, s0 +; GFX9-NEXT: s_mov_b32 s2, s0 +; GFX9-NEXT: s_mov_b32 s3, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:76 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64 ; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:72 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 ; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:68 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 ; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:64 -; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:60 -; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:56 -; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:52 -; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:48 -; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:44 -; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; 
GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:40 -; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:36 -; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:32 -; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:28 -; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:24 -; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:20 -; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:16 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: zero_init_kernel: @@ -48,24 +31,19 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_mov_b32 s0, 0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: scratch_store_dword off, v0, off offset:76 -; GFX10-NEXT: scratch_store_dword off, v0, off offset:72 -; GFX10-NEXT: scratch_store_dword off, v0, off offset:68 -; GFX10-NEXT: scratch_store_dword off, v0, off offset:64 -; GFX10-NEXT: scratch_store_dword off, v0, off offset:60 -; GFX10-NEXT: scratch_store_dword off, v0, off offset:56 -; GFX10-NEXT: scratch_store_dword off, v0, off offset:52 -; GFX10-NEXT: scratch_store_dword off, v0, off offset:48 -; GFX10-NEXT: scratch_store_dword off, v0, off offset:44 -; GFX10-NEXT: scratch_store_dword off, v0, off offset:40 -; GFX10-NEXT: scratch_store_dword off, v0, off offset:36 -; GFX10-NEXT: scratch_store_dword off, v0, off offset:32 -; GFX10-NEXT: scratch_store_dword off, v0, off offset:28 -; GFX10-NEXT: scratch_store_dword off, v0, off offset:24 -; GFX10-NEXT: scratch_store_dword off, v0, off offset:20 -; GFX10-NEXT: scratch_store_dword off, v0, off offset:16 +; GFX10-NEXT: s_mov_b32 s1, s0 +; GFX10-NEXT: 
s_mov_b32 s2, s0 +; GFX10-NEXT: s_mov_b32 s3, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:64 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:48 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:32 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:16 ; GFX10-NEXT: s_endpgm %alloca = alloca [32 x i16], align 2, addrspace(5) %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* @@ -77,23 +55,18 @@ ; GFX9-LABEL: zero_init_foo: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:60 -; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:56 -; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:52 -; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:48 -; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:44 -; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:40 -; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:36 -; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:32 -; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:28 -; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:24 -; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:20 -; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:16 -; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:12 -; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:8 -; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4 -; GFX9-NEXT: scratch_store_dword off, v0, s32 +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_mov_b32 s1, s0 +; GFX9-NEXT: s_mov_b32 s2, s0 +; GFX9-NEXT: s_mov_b32 s3, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:48 +; GFX9-NEXT: 
scratch_store_dwordx4 off, v[0:3], s32 offset:32 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -101,24 +74,19 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_mov_b32 s0, 0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:60 -; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:56 -; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:52 -; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:48 -; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:44 -; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:40 -; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:36 -; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:32 -; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:28 -; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:24 -; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:20 -; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:16 -; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:12 -; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:8 -; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4 -; GFX10-NEXT: scratch_store_dword off, v0, s32 +; GFX10-NEXT: s_mov_b32 s1, s0 +; GFX10-NEXT: s_mov_b32 s2, s0 +; GFX10-NEXT: s_mov_b32 s3, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:48 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %alloca = alloca [32 x i16], align 2, addrspace(5) 
@@ -332,40 +300,23 @@ ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_mov_b32 s1, s0 +; GFX9-NEXT: s_mov_b32 s2, s0 +; GFX9-NEXT: s_mov_b32 s3, s0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:284 -; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:280 -; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:276 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:272 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272 ; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:300 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288 ; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:296 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304 ; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:292 -; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:288 -; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:316 -; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:312 -; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:308 -; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:304 -; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:332 -; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; GFX9-NEXT: scratch_store_dword off, v0, 
vcc_hi offset:328 -; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:324 -; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:320 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: zero_init_small_offset_kernel: @@ -375,25 +326,20 @@ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_mov_b32 s0, 0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: scratch_store_dword off, v0, off offset:284 -; GFX10-NEXT: scratch_store_dword off, v0, off offset:280 -; GFX10-NEXT: scratch_store_dword off, v0, off offset:276 -; GFX10-NEXT: scratch_store_dword off, v0, off offset:272 -; GFX10-NEXT: scratch_store_dword off, v0, off offset:300 -; GFX10-NEXT: scratch_store_dword off, v0, off offset:296 -; GFX10-NEXT: scratch_store_dword off, v0, off offset:292 -; GFX10-NEXT: scratch_store_dword off, v0, off offset:288 -; GFX10-NEXT: scratch_store_dword off, v0, off offset:316 -; GFX10-NEXT: scratch_store_dword off, v0, off offset:312 -; GFX10-NEXT: scratch_store_dword off, v0, off offset:308 -; GFX10-NEXT: scratch_store_dword off, v0, off offset:304 -; GFX10-NEXT: scratch_store_dword off, v0, off offset:332 -; GFX10-NEXT: scratch_store_dword off, v0, off offset:328 -; GFX10-NEXT: scratch_store_dword off, v0, off offset:324 -; GFX10-NEXT: scratch_store_dword off, v0, off offset:320 +; GFX10-NEXT: s_mov_b32 s1, s0 +; GFX10-NEXT: s_mov_b32 s2, s0 +; GFX10-NEXT: s_mov_b32 s3, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:272 +; GFX10-NEXT: scratch_store_dwordx4 off, 
v[0:3], off offset:288 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:304 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:320 ; GFX10-NEXT: s_endpgm %padding = alloca [64 x i32], align 4, addrspace(5) %alloca = alloca [32 x i16], align 2, addrspace(5) @@ -409,24 +355,19 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: scratch_load_dword v0, off, s32 +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_mov_b32 s1, s0 +; GFX9-NEXT: s_mov_b32 s2, s0 +; GFX9-NEXT: s_mov_b32 s3, s0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:268 -; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:264 -; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:260 -; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:256 -; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:284 -; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:280 -; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:276 -; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:272 -; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:300 -; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:296 -; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:292 -; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:288 -; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:316 -; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:312 -; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:308 -; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:304 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ 
-435,25 +376,20 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, s32 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_mov_b32 s0, 0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:268 -; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:264 -; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:260 -; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:256 -; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:284 -; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:280 -; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:276 -; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:272 -; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:300 -; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:296 -; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:292 -; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:288 -; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:316 -; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:312 -; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:308 -; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:304 +; GFX10-NEXT: s_mov_b32 s1, s0 +; GFX10-NEXT: s_mov_b32 s2, s0 +; GFX10-NEXT: s_mov_b32 s3, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %padding = alloca [64 x i32], align 4, addrspace(5) @@ -678,40 +614,23 @@ ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: s_mov_b32 
vcc_hi, 0 ; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_mov_b32 s1, s0 +; GFX9-NEXT: s_mov_b32 s2, s0 +; GFX9-NEXT: s_mov_b32 s3, s0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:12 -; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:8 -; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi ; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:28 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 ; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:24 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 ; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:20 -; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:16 -; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:44 -; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:40 -; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:36 -; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:32 -; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:60 -; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:56 -; GFX9-NEXT: s_movk_i32 
vcc_hi, 0x4010 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:52 -; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:48 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: zero_init_large_offset_kernel: @@ -721,41 +640,24 @@ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_mov_b32 s0, 0 ; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: s_mov_b32 s1, s0 +; GFX10-NEXT: s_mov_b32 s2, s0 +; GFX10-NEXT: s_mov_b32 s3, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:12 -; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 -; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:8 -; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 -; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:4 -; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 -; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo -; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 -; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:28 -; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 -; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:24 -; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 -; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:20 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo ; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 -; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:16 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 ; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 -; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:44 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 
offset:32 ; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 -; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:40 -; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 -; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:36 -; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 -; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:32 -; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 -; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:60 -; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 -; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:56 -; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 -; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:52 -; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 -; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:48 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 ; GFX10-NEXT: s_endpgm %padding = alloca [4096 x i32], align 4, addrspace(5) %alloca = alloca [32 x i16], align 2, addrspace(5) @@ -771,40 +673,23 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: scratch_load_dword v0, off, s32 +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_mov_b32 s1, s0 +; GFX9-NEXT: s_mov_b32 s2, s0 +; GFX9-NEXT: s_mov_b32 s3, s0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:12 -; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:8 -; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:4 -; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi -; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:28 -; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:24 -; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:20 -; GFX9-NEXT: s_add_u32 vcc_hi, s32, 
0x4000 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:16 -; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:44 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:40 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi ; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:36 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 ; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:32 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 ; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:60 -; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:56 -; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:52 -; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 -; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:48 +; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -813,41 +698,24 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, off, s32 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_mov_b32 s0, 0 ; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: s_mov_b32 s1, s0 +; GFX10-NEXT: s_mov_b32 s2, s0 +; GFX10-NEXT: s_mov_b32 s3, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: 
scratch_store_dword off, v0, vcc_lo offset:12 -; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 -; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:8 -; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 -; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:4 -; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 -; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo -; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 -; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:28 -; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 -; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:24 -; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 -; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:20 -; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 -; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:16 -; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 -; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:44 -; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 -; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:40 -; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 -; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:36 -; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 -; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:32 -; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 -; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:60 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo ; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 -; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:56 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 ; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 -; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:52 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 ; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 -; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:48 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %padding = alloca [4096 x i32], align 4, 
addrspace(5) @@ -1185,17 +1053,14 @@ ret void } -; FIXME: Multi-DWORD scratch shall be supported define void @store_load_i64_aligned(i64 addrspace(5)* nocapture %arg) { ; GFX9-LABEL: store_load_i64_aligned: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: scratch_store_dword v0, v1, off offset:4 ; GFX9-NEXT: v_mov_b32_e32 v1, 15 -; GFX9-NEXT: scratch_store_dword v0, v1, off -; GFX9-NEXT: scratch_load_dword v1, v0, off offset:4 -; GFX9-NEXT: scratch_load_dword v0, v0, off +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: scratch_store_dwordx2 v0, v[1:2], off +; GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1203,14 +1068,11 @@ ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: v_mov_b32_e32 v1, 15 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: scratch_store_dword v0, v1, off offset:4 -; GFX10-NEXT: scratch_store_dword v0, v2, off -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: scratch_load_dword v1, v0, off offset:4 -; GFX10-NEXT: scratch_load_dword v0, v0, off +; GFX10-NEXT: scratch_store_dwordx2 v0, v[1:2], off +; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1220,35 +1082,14 @@ ret void } -; FIXME: Multi-DWORD unaligned scratch shall be supported define void @store_load_i64_unaligned(i64 addrspace(5)* nocapture %arg) { ; GFX9-LABEL: store_load_i64_unaligned: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: scratch_store_byte v0, v1, off offset:7 -; GFX9-NEXT: scratch_store_byte v0, v1, off offset:6 -; GFX9-NEXT: scratch_store_byte v0, v1, off 
offset:5 -; GFX9-NEXT: scratch_store_byte v0, v1, off offset:4 -; GFX9-NEXT: scratch_store_byte v0, v1, off offset:3 -; GFX9-NEXT: scratch_store_byte v0, v1, off offset:2 -; GFX9-NEXT: scratch_store_byte v0, v1, off offset:1 ; GFX9-NEXT: v_mov_b32_e32 v1, 15 -; GFX9-NEXT: scratch_store_byte v0, v1, off -; GFX9-NEXT: scratch_load_ubyte v1, v0, off offset:6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: scratch_load_ubyte v1, v0, off offset:7 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: scratch_load_ubyte v1, v0, off offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: scratch_load_ubyte v1, v0, off offset:5 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: scratch_load_ubyte v1, v0, off offset:2 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: scratch_load_ubyte v1, v0, off offset:3 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: scratch_load_ubyte v1, v0, off -; GFX9-NEXT: scratch_load_ubyte v0, v0, off offset:1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: scratch_store_dwordx2 v0, v[1:2], off +; GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1256,32 +1097,11 @@ ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: v_mov_b32_e32 v1, 15 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: scratch_store_byte v0, v1, off offset:7 -; GFX10-NEXT: scratch_store_byte v0, v1, off offset:6 -; GFX10-NEXT: scratch_store_byte v0, v1, off offset:5 -; GFX10-NEXT: scratch_store_byte v0, v1, off offset:4 -; GFX10-NEXT: scratch_store_byte v0, v1, off offset:3 -; GFX10-NEXT: scratch_store_byte v0, v1, off offset:2 -; GFX10-NEXT: scratch_store_byte v0, v1, off offset:1 -; GFX10-NEXT: scratch_store_byte v0, v2, off -; GFX10-NEXT: scratch_load_ubyte v1, v0, off offset:6 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: 
scratch_load_ubyte v1, v0, off offset:7 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: scratch_load_ubyte v1, v0, off offset:4 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: scratch_load_ubyte v1, v0, off offset:5 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: scratch_load_ubyte v1, v0, off offset:2 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: scratch_load_ubyte v1, v0, off offset:3 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: scratch_load_ubyte v1, v0, off -; GFX10-NEXT: scratch_load_ubyte v0, v0, off offset:1 +; GFX10-NEXT: scratch_store_dwordx2 v0, v[1:2], off +; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1291,5 +1111,69 @@ ret void } +define void @store_load_v3i32_unaligned(<3 x i32> addrspace(5)* nocapture %arg) { +; GFX9-LABEL: store_load_v3i32_unaligned: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 1 +; GFX9-NEXT: v_mov_b32_e32 v2, 2 +; GFX9-NEXT: v_mov_b32_e32 v3, 3 +; GFX9-NEXT: scratch_store_dwordx3 v0, v[1:3], off +; GFX9-NEXT: scratch_load_dwordx3 v[0:2], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: store_load_v3i32_unaligned: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v1, 1 +; GFX10-NEXT: v_mov_b32_e32 v2, 2 +; GFX10-NEXT: v_mov_b32_e32 v3, 3 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: scratch_store_dwordx3 v0, v[1:3], off +; GFX10-NEXT: scratch_load_dwordx3 v[0:2], v0, off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + store volatile <3 x i32> <i32 1, i32 2, i32 3>, <3 x i32> addrspace(5)* %arg, align 1 + %load = load volatile <3 x i32>, <3 x i32> addrspace(5)* %arg, align 1 + ret void +} + +define void 
@store_load_v4i32_unaligned(<4 x i32> addrspace(5)* nocapture %arg) { +; GFX9-LABEL: store_load_v4i32_unaligned: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 1 +; GFX9-NEXT: v_mov_b32_e32 v2, 2 +; GFX9-NEXT: v_mov_b32_e32 v3, 3 +; GFX9-NEXT: v_mov_b32_e32 v4, 4 +; GFX9-NEXT: scratch_store_dwordx4 v0, v[1:4], off +; GFX9-NEXT: scratch_load_dwordx4 v[0:3], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: store_load_v4i32_unaligned: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v1, 1 +; GFX10-NEXT: v_mov_b32_e32 v2, 2 +; GFX10-NEXT: v_mov_b32_e32 v3, 3 +; GFX10-NEXT: v_mov_b32_e32 v4, 4 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: scratch_store_dwordx4 v0, v[1:4], off +; GFX10-NEXT: scratch_load_dwordx4 v[0:3], v0, off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %arg, align 1 + %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %arg, align 1 + ret void +} + declare void @llvm.memset.p5i8.i64(i8 addrspace(5)* nocapture writeonly, i8, i64, i1 immarg) declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll --- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll @@ -71,19 +71,14 @@ ; FLATSCR-NEXT: scratch_store_byte off, v0, s3 ; FLATSCR-NEXT: s_cbranch_scc1 BB0_1 ; FLATSCR-NEXT: ; %bb.2: ; %split -; FLATSCR-NEXT: s_movk_i32 s2, 0x20d0 -; FLATSCR-NEXT: s_add_u32 s2, 0x3000, s2 -; FLATSCR-NEXT: scratch_load_dword v1, off, s2 offset:4 ; FLATSCR-NEXT: s_movk_i32 s2, 0x2000 ; FLATSCR-NEXT: s_add_u32 s2, 0x3000, s2 
-; FLATSCR-NEXT: scratch_load_dword v0, off, s2 offset:208 -; FLATSCR-NEXT: s_movk_i32 s2, 0x3000 -; FLATSCR-NEXT: scratch_load_dword v2, off, s2 offset:68 +; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s2 offset:208 ; FLATSCR-NEXT: s_movk_i32 s2, 0x3000 -; FLATSCR-NEXT: scratch_load_dword v3, off, s2 offset:64 +; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s2 offset:64 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 -; FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc +; FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; FLATSCR-NEXT: v_mov_b32_e32 v3, s1 ; FLATSCR-NEXT: v_mov_b32_e32 v2, s0 @@ -162,23 +157,17 @@ ; FLATSCR-NEXT: scratch_store_byte off, v2, s1 ; FLATSCR-NEXT: s_cbranch_scc1 BB1_1 ; FLATSCR-NEXT: ; %bb.2: ; %split -; FLATSCR-NEXT: s_movk_i32 s0, 0x20d0 -; FLATSCR-NEXT: s_add_u32 s1, s33, 0x1000 -; FLATSCR-NEXT: s_add_u32 s0, s1, s0 -; FLATSCR-NEXT: scratch_load_dword v3, off, s0 offset:4 ; FLATSCR-NEXT: s_movk_i32 s0, 0x2000 ; FLATSCR-NEXT: s_add_u32 s1, s33, 0x1000 ; FLATSCR-NEXT: s_add_u32 s0, s1, s0 -; FLATSCR-NEXT: scratch_load_dword v2, off, s0 offset:208 -; FLATSCR-NEXT: s_add_u32 s0, s33, 0x1000 -; FLATSCR-NEXT: scratch_load_dword v4, off, s0 offset:68 +; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:208 ; FLATSCR-NEXT: s_add_u32 s0, s33, 0x1000 -; FLATSCR-NEXT: scratch_load_dword v5, off, s0 offset:64 +; FLATSCR-NEXT: scratch_load_dwordx2 v[4:5], off, s0 offset:64 ; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x6000 ; FLATSCR-NEXT: s_mov_b32 s33, s2 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; FLATSCR-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc +; FLATSCR-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; FLATSCR-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc ; FLATSCR-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: 
s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll b/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll --- a/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll @@ -31,24 +31,15 @@ ; FLATSCR-LABEL: memcpy_fixed_align: ; FLATSCR: ; %bb.0: ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FLATSCR-NEXT: global_load_dword v0, v[1:2], off offset:36 -; FLATSCR-NEXT: global_load_dword v11, v[1:2], off offset:32 +; FLATSCR-NEXT: global_load_dwordx2 v[11:12], v[1:2], off offset:32 ; FLATSCR-NEXT: global_load_dwordx4 v[3:6], v[1:2], off offset:16 ; FLATSCR-NEXT: global_load_dwordx4 v[7:10], v[1:2], off -; FLATSCR-NEXT: s_waitcnt vmcnt(3) -; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:36 -; FLATSCR-NEXT: s_waitcnt vmcnt(3) -; FLATSCR-NEXT: scratch_store_dword off, v11, s32 offset:32 -; FLATSCR-NEXT: s_waitcnt vmcnt(3) -; FLATSCR-NEXT: scratch_store_dword off, v6, s32 offset:28 -; FLATSCR-NEXT: scratch_store_dword off, v5, s32 offset:24 -; FLATSCR-NEXT: scratch_store_dword off, v4, s32 offset:20 -; FLATSCR-NEXT: scratch_store_dword off, v3, s32 offset:16 -; FLATSCR-NEXT: s_waitcnt vmcnt(6) -; FLATSCR-NEXT: scratch_store_dword off, v10, s32 offset:12 -; FLATSCR-NEXT: scratch_store_dword off, v9, s32 offset:8 -; FLATSCR-NEXT: scratch_store_dword off, v8, s32 offset:4 -; FLATSCR-NEXT: scratch_store_dword off, v7, s32 +; FLATSCR-NEXT: s_waitcnt vmcnt(2) +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[11:12], s32 offset:32 +; FLATSCR-NEXT: s_waitcnt vmcnt(2) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s32 offset:16 +; FLATSCR-NEXT: s_waitcnt vmcnt(2) +; FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s32 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_setpc_b64 s[30:31] %alloca = alloca [40 x i8], addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll --- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll +++ 
b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll @@ -69,13 +69,12 @@ ; FLATSCR-NEXT: s_mov_b32 s2, s32 ; FLATSCR-NEXT: s_movk_i32 s3, 0x1000 ; FLATSCR-NEXT: s_add_i32 s4, s2, s3 -; FLATSCR-NEXT: s_add_u32 s2, s2, s3 ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 -; FLATSCR-NEXT: scratch_store_dword off, v1, s2 -; FLATSCR-NEXT: v_mov_b32_e32 v1, 1 +; FLATSCR-NEXT: v_mov_b32_e32 v2, 1 +; FLATSCR-NEXT: s_add_u32 s2, s2, s3 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s2 ; FLATSCR-NEXT: s_lshl_b32 s2, s6, 2 ; FLATSCR-NEXT: s_mov_b32 s32, s4 -; FLATSCR-NEXT: scratch_store_dword off, v1, s4 offset:4 ; FLATSCR-NEXT: s_add_i32 s4, s4, s2 ; FLATSCR-NEXT: scratch_load_dword v1, off, s4 ; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 @@ -174,11 +173,10 @@ ; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000 ; FLATSCR-NEXT: s_and_b32 s2, s2, 0xfffff000 ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 -; FLATSCR-NEXT: scratch_store_dword off, v1, s2 -; FLATSCR-NEXT: v_mov_b32_e32 v1, 1 +; FLATSCR-NEXT: v_mov_b32_e32 v2, 1 ; FLATSCR-NEXT: s_lshl_b32 s3, s3, 2 ; FLATSCR-NEXT: s_mov_b32 s32, s2 -; FLATSCR-NEXT: scratch_store_dword off, v1, s2 offset:4 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s2 ; FLATSCR-NEXT: s_add_i32 s2, s2, s3 ; FLATSCR-NEXT: scratch_load_dword v1, off, s2 ; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 @@ -275,11 +273,10 @@ ; FLATSCR-NEXT: s_mov_b32 s2, s32 ; FLATSCR-NEXT: s_movk_i32 s3, 0x1000 ; FLATSCR-NEXT: s_add_i32 s4, s2, s3 -; FLATSCR-NEXT: s_add_u32 s2, s2, s3 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 0 -; FLATSCR-NEXT: scratch_store_dword off, v2, s2 -; FLATSCR-NEXT: v_mov_b32_e32 v2, 1 -; FLATSCR-NEXT: scratch_store_dword off, v2, s4 offset:4 +; FLATSCR-NEXT: v_mov_b32_e32 v3, 1 +; FLATSCR-NEXT: s_add_u32 s2, s2, s3 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[2:3], s2 ; FLATSCR-NEXT: v_lshl_add_u32 v2, v4, 2, s4 ; FLATSCR-NEXT: scratch_load_dword v2, v2, off ; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v5 @@ -371,11 +368,10 @@ ; FLATSCR-NEXT: ; %bb.1: ; %bb.0 ; 
FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000 ; FLATSCR-NEXT: s_and_b32 s2, s2, 0xfffff000 -; FLATSCR-NEXT: v_mov_b32_e32 v2, 0 -; FLATSCR-NEXT: scratch_store_dword off, v2, s2 -; FLATSCR-NEXT: v_mov_b32_e32 v2, 1 -; FLATSCR-NEXT: scratch_store_dword off, v2, s2 offset:4 +; FLATSCR-NEXT: v_mov_b32_e32 v5, 0 +; FLATSCR-NEXT: v_mov_b32_e32 v6, 1 ; FLATSCR-NEXT: v_lshl_add_u32 v2, v3, 2, s2 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[5:6], s2 ; FLATSCR-NEXT: scratch_load_dword v2, v2, off ; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v4 ; FLATSCR-NEXT: s_mov_b32 s32, s2 diff --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll --- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll +++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll @@ -16,8 +16,9 @@ ; ; GCN-LABEL: {{^}}ps_main: -; GFX9-FLATSCR: s_add_u32 flat_scratch_lo, s0, s2 -; GFX9-FLATSCR: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-FLATSCR-DAG: s_add_u32 flat_scratch_lo, s0, s2 +; GFX9-FLATSCR-DAG: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-FLATSCR-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, v0 ; GFX10-FLATSCR: s_add_u32 s0, s0, s2 ; GFX10-FLATSCR: s_addc_u32 s1, s1, 0 @@ -36,15 +37,13 @@ ; FLATSCR-NOT: SCRATCH_RSRC_DWORD ; GFX9-FLATSCR: s_mov_b32 [[SP:[^,]+]], 0 -; GFX9-FLATSCR: scratch_store_dword off, v2, [[SP]] offset: -; GFX9-FLATSCR: s_mov_b32 [[SP:[^,]+]], 0 -; GFX9-FLATSCR: scratch_store_dword off, v2, [[SP]] offset: +; GFX9-FLATSCR: scratch_store_dwordx4 off, v[{{[0-9:]+}}], [[SP]] offset: -; GFX10-FLATSCR: scratch_store_dword off, v2, off offset: -; GFX10-FLATSCR: scratch_store_dword off, v2, off offset: +; GFX10-FLATSCR: scratch_store_dwordx4 off, v[{{[0-9:]+}}], off offset: -; GCN-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0 -; GCN-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]] +; MUBUF-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0 +; MUBUF-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]] +; GFX10-FLATSCR: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 
0x1fc, v0 ; GCN-NOT: s_mov_b32 s0 ; GCN-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[HI_OFF:v[0-9]+]],{{.*}} 0x280, [[CLAMP_IDX]] @@ -53,7 +52,6 @@ ; MUBUF: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; MUBUF: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; FLATSCR: scratch_load_dword {{v[0-9]+}}, [[LO_OFF]], off -; FLATSCR: scratch_load_dword {{v[0-9]+}}, [[HI_OFF]], off define amdgpu_ps float @ps_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -79,9 +77,7 @@ ; MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; GFX9-FLATSCR: s_mov_b32 [[SP:[^,]+]], 0 -; GFX9-FLATSCR: scratch_store_dword off, v2, [[SP]] offset: -; GFX9-FLATSCR: s_mov_b32 [[SP:[^,]+]], 0 -; GFX9-FLATSCR: scratch_store_dword off, v2, [[SP]] offset: +; GFX9-FLATSCR: scratch_store_dwordx4 off, v[{{[0-9:]+}}], [[SP]] offset: ; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off ; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off diff --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll --- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll @@ -78,10 +78,7 @@ ; FLATSCR-NEXT: ; %bb.1: ; %if.then4.i ; FLATSCR-NEXT: s_movk_i32 vcc_lo, 0x4000 ; FLATSCR-NEXT: s_nop 1 -; FLATSCR-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 -; FLATSCR-NEXT: s_waitcnt_depctr 0xffe3 -; FLATSCR-NEXT: s_movk_i32 vcc_lo, 0x4000 -; FLATSCR-NEXT: scratch_load_dword v1, off, vcc_lo offset:8 +; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, vcc_lo offset:4 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: v_add_nc_u32_e32 v0, v1, v0 ; FLATSCR-NEXT: v_mul_lo_u32 v0, 0x41c64e6d, v0