Index: lib/Target/AMDGPU/SIFrameLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIFrameLowering.cpp
+++ lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -773,22 +773,17 @@
       !AllSGPRSpilledToVGPRs || !allStackObjectsAreDead(MFI)) {
     assert(RS && "RegScavenger required if spilling");
 
-    // We force this to be at offset 0 so no user object ever has 0 as an
-    // address, so we may use 0 as an invalid pointer value. This is because
-    // LLVM assumes 0 is an invalid pointer in address space 0. Because alloca
-    // is required to be address space 0, we are forced to accept this for
-    // now. Ideally we could have the stack in another address space with 0 as a
-    // valid pointer, and -1 as the null value.
-    //
-    // This will also waste additional space when user stack objects require > 4
-    // byte alignment.
-    //
-    // The main cost here is losing the offset for addressing modes. However
-    // this also ensures we shouldn't need a register for the offset when
-    // emergency scavenging.
-    int ScavengeFI = MFI.CreateFixedObject(
-      TRI.getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
-    RS->addScavengingFrameIndex(ScavengeFI);
+    if (FuncInfo->isEntryFunction()) {
+      int ScavengeFI = MFI.CreateFixedObject(
+        TRI.getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
+      RS->addScavengingFrameIndex(ScavengeFI);
+    } else {
+      int ScavengeFI = MFI.CreateStackObject(
+        TRI.getSpillSize(AMDGPU::SGPR_32RegClass),
+        TRI.getSpillAlignment(AMDGPU::SGPR_32RegClass),
+        false);
+      RS->addScavengingFrameIndex(ScavengeFI);
+    }
   }
 }
 
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1940,12 +1940,6 @@
   bool IsKernel = AMDGPU::isKernel(CallConv);
   bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
 
-  if (!IsEntryFunc) {
-    // 4 bytes are reserved at offset 0 for the emergency stack slot. Skip over
-    // this when allocating argument fixed offsets.
-    CCInfo.AllocateStack(4, 4);
-  }
-
   if (IsShader) {
     processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
 
@@ -2551,7 +2545,6 @@
       "unsupported call from graphics shader of function ");
   }
 
-  // The first 4 bytes are reserved for the callee's emergency stack slot.
   if (IsTailCall) {
     IsTailCall = isEligibleForTailCallOptimization(
       Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
@@ -2578,9 +2571,6 @@
   CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
   CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
 
-  // The first 4 bytes are reserved for the callee's emergency stack slot.
-  CCInfo.AllocateStack(4, 4);
-
   CCInfo.AnalyzeCallOperands(Outs, AssignFn);
 
   // Get a count of how many bytes are to be pushed on the stack.
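The two hunks above are the whole behavioral change; the remaining hunks only update expected stack offsets in the tests. Condensed, the new SIFrameLowering.cpp logic reads roughly as follows (a sketch, not a verbatim copy of the hunk; MFI, TRI, RS, and FuncInfo are assumed to be the surrounding locals of processFunctionBeforeFrameFinalized):

    // Entry functions keep the emergency scavenge slot as a fixed object
    // pinned at offset 0, where it still guarantees no user object ends up
    // at address 0. Callable functions instead allocate an ordinary stack
    // object, freeing offset 0 for incoming stack arguments.
    const unsigned SpillSize = TRI.getSpillSize(AMDGPU::SGPR_32RegClass);
    int ScavengeFI;
    if (FuncInfo->isEntryFunction()) {
      ScavengeFI = MFI.CreateFixedObject(SpillSize, /*SPOffset=*/0,
                                         /*IsImmutable=*/false);
    } else {
      ScavengeFI = MFI.CreateStackObject(
          SpillSize, TRI.getSpillAlignment(AMDGPU::SGPR_32RegClass),
          /*isSpillSlot=*/false);
    }
    RS->addScavengingFrameIndex(ScavengeFI);

With the matching CCInfo.AllocateStack(4, 4) reservations deleted from SIISelLowering.cpp, the first stack-passed argument of a callable function now lands at s32 + 0 instead of s32 + 4, which is why the stack offsets in the test updates below each shrink by one 4-byte slot (or by more where the old scheme also padded for alignment, as in the align-8 byval cases).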
Index: test/CodeGen/AMDGPU/byval-frame-setup.ll =================================================================== --- test/CodeGen/AMDGPU/byval-frame-setup.ll +++ test/CodeGen/AMDGPU/byval-frame-setup.ll @@ -4,14 +4,14 @@ %struct.ByValStruct = type { [4 x i32] } ; GCN-LABEL: {{^}}void_func_byval_struct: -; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s32 offset:4{{$}} +; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s32{{$}} ; GCN-NOT: s32 -; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:4{{$}} +; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}} ; GCN-NOT: s32 -; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s32 offset:20{{$}} +; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s32 offset:16{{$}} ; GCN-NOT: s32 -; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:20{{$}} +; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:16{{$}} ; GCN-NOT: s32 define hidden void @void_func_byval_struct(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg1) #1 { entry: @@ -34,16 +34,16 @@ ; GCN-DAG: buffer_store_dword v33 ; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, s32 ; GCN-DAG: v_writelane_b32 -; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:4{{$}} +; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5{{$}} ; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD0:v[0-9]+]], vcc, 1, [[LOAD0]] -; GCN-DAG: buffer_store_dword [[ADD0]], off, s[0:3], s5 offset:4{{$}} +; GCN-DAG: buffer_store_dword [[ADD0]], off, s[0:3], s5{{$}} -; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:20{{$}} +; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:16{{$}} ; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD1:v[0-9]+]], vcc, 2, [[LOAD1]] ; GCN: s_swappc_b64 -; GCN: buffer_store_dword [[ADD1]], off, s[0:3], s5 offset:20{{$}} +; GCN: buffer_store_dword [[ADD1]], off, s[0:3], s5 offset:16{{$}} ; GCN: v_readlane_b32 ; GCN-NOT: v_readlane_b32 s32 @@ -74,31 +74,31 @@ ; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 ; GCN-DAG: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13 -; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:8 -; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s5 offset:24 +; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s5{{$}} +; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s5 offset:16 -; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:8 -; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:12 -; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:16 -; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s5 offset:20 +; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5{{$}} +; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:4 +; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:8 +; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s5 offset:12 ; GCN-NOT: s_add_u32 s32, s32, 0x800 -; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:4{{$}} -; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:8 -; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:12 -; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:16 +; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}} +; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:4 +; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8 +; GCN-DAG: 
buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12 -; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s5 offset:24 -; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s5 offset:28 -; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s5 offset:32 -; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s5 offset:36 +; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s5 offset:16 +; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s5 offset:20 +; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s5 offset:24 +; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s5 offset:28 -; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:20 -; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:24 -; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:28 -; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:32 +; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16 +; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:20 +; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:24 +; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:28 ; GCN: s_swappc_b64 ; GCN-NOT: v_readlane_b32 s32 @@ -144,20 +144,20 @@ ; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s33 offset:16 ; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s33 offset:20 -; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:4{{$}} -; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:8 -; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:12 -; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:16 +; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}} +; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:4 +; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8 +; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12 ; GCN-DAG: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s33 offset:24 ; GCN-DAG: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s33 offset:28 ; GCN-DAG: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s33 offset:32 ; GCN-DAG: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s33 offset:36 -; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:20 -; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:24 -; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:28 -; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:32 +; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16 +; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:20 +; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:24 +; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:28 ; GCN: s_swappc_b64 @@ -182,14 +182,14 @@ } ; GCN-LABEL: {{^}}void_func_byval_struct_align8: -; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s32 offset:8{{$}} +; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s32{{$}} ; GCN-NOT: s32 -; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:8{{$}} +; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}} ; GCN-NOT: s32 -; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s32 offset:24{{$}} +; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s32 offset:16{{$}} ; GCN-NOT: s32 -; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:24{{$}} +; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:16{{$}} ; GCN-NOT: s32 define hidden void 
@void_func_byval_struct_align8(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 8 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 8 %arg1) #1 { entry: @@ -222,20 +222,20 @@ ; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s33 offset:16 ; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s33 offset:20 -; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:8{{$}} -; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:12 -; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:16 -; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:20 +; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}} +; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:4 +; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8 +; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12 ; GCN-DAG: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s33 offset:24 ; GCN-DAG: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s33 offset:28 ; GCN-DAG: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s33 offset:32 ; GCN-DAG: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s33 offset:36 -; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:24 -; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:28 -; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:32 -; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:36 +; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16 +; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:20 +; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:24 +; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:28 ; GCN: s_swappc_b64 @@ -267,30 +267,30 @@ ; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 ; GCN-DAG: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13 -; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:8 -; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s5 offset:24 +; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s5{{$}} +; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s5 offset:16 -; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:8 -; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:12 -; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:16 -; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s5 offset:20 +; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5{{$}} +; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:4 +; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:8 +; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s5 offset:12 ; GCN-NOT: s_add_u32 s32, s32, 0x800 -; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:8{{$}} -; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:12 -; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:16 -; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:20 - -; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s5 offset:24 -; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s5 offset:28 -; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s5 offset:32 -; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s5 offset:36 - -; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:24 -; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:28 -; GCN-DAG: buffer_store_dword [[LOAD6]], off, 
s[0:3], s32 offset:32 -; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:36 +; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}} +; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:4 +; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8 +; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12 + +; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s5 offset:16 +; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s5 offset:20 +; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s5 offset:24 +; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s5 offset:28 +; GCN: s_waitcnt vmcnt(0) +; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16 +; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:20 +; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:24 +; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:28 ; GCN: s_swappc_b64 ; GCN-NOT: v_readlane_b32 s32 Index: test/CodeGen/AMDGPU/call-argument-types.ll =================================================================== --- test/CodeGen/AMDGPU/call-argument-types.ll +++ test/CodeGen/AMDGPU/call-argument-types.ll @@ -636,7 +636,7 @@ ; GCN-DAG: buffer_load_dwordx4 v[28:31], off ; GCN: s_waitcnt -; GCN: buffer_store_dword [[VAL1]], off, s[{{[0-9]+}}:{{[0-9]+}}], s32 offset:4{{$}} +; GCN: buffer_store_dword [[VAL1]], off, s[{{[0-9]+}}:{{[0-9]+}}], s32{{$}} ; GCN: s_swappc_b64 ; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { @@ -687,15 +687,15 @@ ; HSA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[0:3], s33 offset:8 ; HSA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[0:3], s33 offset:12 -; HSA-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s[0:3], [[SP]] offset:4 -; HSA-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s[0:3], [[SP]] offset:8 +; HSA-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s[0:3], [[SP]]{{$}} +; HSA-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s[0:3], [[SP]] offset:4 ; MESA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[36:39], s33 offset:8 ; MESA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[36:39], s33 offset:12 -; MESA-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s[36:39], [[SP]] offset:4 -; MESA-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s[36:39], [[SP]] offset:8 +; MESA-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s[36:39], [[SP]]{{$}} +; MESA-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s[36:39], [[SP]] offset:4 ; GCN-NEXT: s_swappc_b64 ; GCN-NOT: [[SP]] @@ -722,8 +722,8 @@ ; GCN-DAG: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:12 ; GCN-NOT: s_add_u32 [[SP]] -; GCN-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:4 -; GCN-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:8 +; GCN-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]]{{$}} +; GCN-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:4 ; GCN: s_swappc_b64 ; GCN-DAG: buffer_load_ubyte [[LOAD_OUT_VAL0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:16 ; GCN-DAG: buffer_load_dword [[LOAD_OUT_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:20 @@ -758,8 +758,8 @@ } ; GCN-LABEL: {{^}}stack_passed_arg_alignment_v32i32_f64: -; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:8 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 
s32 offset:4 +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s32{{$}} ; GCN: s_swappc_b64 define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, double %tmp) #0 { entry: @@ -769,15 +769,15 @@ ; GCN-LABEL: {{^}}tail_call_byval_align16: ; GCN-NOT: s32 -; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:32 -; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:36 -; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:20 -; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:16 +; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:16 +; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:4 +; GCN: buffer_store_dword v32, off, s[0:3], s32{{$}} ; GCN: s_getpc_b64 -; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GCN-NOT: s32 ; GCN: s_setpc_b64 define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { @@ -789,15 +789,15 @@ ; GCN-LABEL: {{^}}tail_call_stack_passed_arg_alignment_v32i32_f64: ; GCN-NOT: s32 -; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4 -; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:8 +; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN: buffer_load_dword v32, off, s[0:3], s32{{$}} +; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:4 +; GCN: buffer_store_dword v32, off, s[0:3], s32{{$}} +; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; GCN: s_getpc_b64 -; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GCN-NOT: s32 ; GCN: s_setpc_b64 define void @tail_call_stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, double %tmp) #0 { @@ -808,13 +808,13 @@ ; GCN-LABEL: {{^}}stack_12xv3i32: ; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 -; GCN: buffer_store_dword [[REG12]], {{.*}} offset:4 +; GCN: buffer_store_dword [[REG12]], {{.*$}} ; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 -; GCN: buffer_store_dword [[REG13]], {{.*}} offset:8 +; GCN: buffer_store_dword [[REG13]], {{.*}} offset:4 ; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 -; GCN: buffer_store_dword [[REG14]], {{.*}} offset:12 +; GCN: buffer_store_dword [[REG14]], {{.*}} offset:8 ; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 -; GCN: buffer_store_dword [[REG15]], {{.*}} offset:16 +; GCN: buffer_store_dword [[REG15]], {{.*}} offset:12 ; GCN: 
v_mov_b32_e32 v31, 11 ; GCN: s_getpc define void @stack_12xv3i32() #0 { @@ -837,13 +837,13 @@ ; GCN-LABEL: {{^}}stack_12xv3f32: ; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000 -; GCN: buffer_store_dword [[REG12]], {{.*}} offset:4 +; GCN: buffer_store_dword [[REG12]], {{.*$}} ; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000 -; GCN: buffer_store_dword [[REG13]], {{.*}} offset:8 +; GCN: buffer_store_dword [[REG13]], {{.*}} offset:4 ; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000 -; GCN: buffer_store_dword [[REG14]], {{.*}} offset:12 +; GCN: buffer_store_dword [[REG14]], {{.*}} offset:8 ; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000 -; GCN: buffer_store_dword [[REG15]], {{.*}} offset:16 +; GCN: buffer_store_dword [[REG15]], {{.*}} offset:12 ; GCN: v_mov_b32_e32 v31, 0x41300000 ; GCN: s_getpc define void @stack_12xv3f32() #0 { @@ -867,21 +867,21 @@ ; GCN-LABEL: {{^}}stack_8xv5i32: ; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 8 -; GCN: buffer_store_dword [[REG8]], {{.*}} offset:4 +; GCN: buffer_store_dword [[REG8]], {{.*$}} ; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 9 -; GCN: buffer_store_dword [[REG9]], {{.*}} offset:8 +; GCN: buffer_store_dword [[REG9]], {{.*}} offset:4 ; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 10 -; GCN: buffer_store_dword [[REG10]], {{.*}} offset:12 +; GCN: buffer_store_dword [[REG10]], {{.*}} offset:8 ; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 11 -; GCN: buffer_store_dword [[REG11]], {{.*}} offset:16 +; GCN: buffer_store_dword [[REG11]], {{.*}} offset:12 ; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 -; GCN: buffer_store_dword [[REG12]], {{.*}} offset:20 +; GCN: buffer_store_dword [[REG12]], {{.*}} offset:16 ; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 -; GCN: buffer_store_dword [[REG13]], {{.*}} offset:24 +; GCN: buffer_store_dword [[REG13]], {{.*}} offset:20 ; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 -; GCN: buffer_store_dword [[REG14]], {{.*}} offset:28 +; GCN: buffer_store_dword [[REG14]], {{.*}} offset:24 ; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 -; GCN: buffer_store_dword [[REG15]], {{.*}} offset:32 +; GCN: buffer_store_dword [[REG15]], {{.*}} offset:28 ; GCN: v_mov_b32_e32 v31, 7 ; GCN: s_getpc @@ -901,21 +901,21 @@ ; GCN-LABEL: {{^}}stack_8xv5f32: ; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 0x41000000 -; GCN: buffer_store_dword [[REG8]], {{.*}} offset:4 +; GCN: buffer_store_dword [[REG8]], {{.*$}} ; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 0x41100000 -; GCN: buffer_store_dword [[REG9]], {{.*}} offset:8 +; GCN: buffer_store_dword [[REG9]], {{.*}} offset:4 ; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000 -; GCN: buffer_store_dword [[REG10]], {{.*}} offset:12 +; GCN: buffer_store_dword [[REG10]], {{.*}} offset:8 ; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000 -; GCN: buffer_store_dword [[REG11]], {{.*}} offset:16 +; GCN: buffer_store_dword [[REG11]], {{.*}} offset:12 ; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000 -; GCN: buffer_store_dword [[REG12]], {{.*}} offset:20 +; GCN: buffer_store_dword [[REG12]], {{.*}} offset:16 ; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000 -; GCN: buffer_store_dword [[REG13]], {{.*}} offset:24 +; GCN: buffer_store_dword [[REG13]], {{.*}} offset:20 ; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000 -; GCN: buffer_store_dword [[REG14]], {{.*}} offset:28 +; GCN: buffer_store_dword [[REG14]], {{.*}} offset:24 ; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000 -; GCN: buffer_store_dword [[REG15]], {{.*}} offset:32 +; GCN: buffer_store_dword [[REG15]], {{.*}} offset:28 ; GCN: v_mov_b32_e32 v31, 0x40e00000 ; GCN: s_getpc Index: 
test/CodeGen/AMDGPU/callee-frame-setup.ll =================================================================== --- test/CodeGen/AMDGPU/callee-frame-setup.ll +++ test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -23,7 +23,7 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt ; GCN-NEXT: v_mov_b32_e32 v0, 0{{$}} -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4{{$}} +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @callee_with_stack() #0 { @@ -37,13 +37,13 @@ ; GCN-NEXT: s_waitcnt ; GCN: s_mov_b32 s5, s32 ; GCN-DAG: s_add_u32 s32, s32, 0x400{{$}} -; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 +; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; GCN-DAG: v_writelane_b32 v32, s33, ; GCN-DAG: v_writelane_b32 v32, s34, ; GCN-DAG: v_writelane_b32 v32, s35, ; GCN-DAG: v_mov_b32_e32 v0, 0{{$}} -; GCN-DAG: buffer_store_dword v0, off, s[0:3], s5 offset:4{{$}} +; GCN-DAG: buffer_store_dword v0, off, s[0:3], s5{{$}} ; GCN-DAG: s_mov_b32 s33, s5 @@ -52,7 +52,7 @@ ; GCN-DAG: v_readlane_b32 s35, ; GCN-DAG: v_readlane_b32 s34, ; GCN-DAG: v_readlane_b32 s33, -; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8 +; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; GCN: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @callee_with_stack_and_call() #0 { @@ -72,7 +72,7 @@ ; GCN: s_waitcnt ; GCN: s_mov_b32 s5, s32 ; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GCN-DAG: v_writelane_b32 v32, s33, 0 ; GCN-DAG: v_writelane_b32 v32, s34, 1 @@ -84,7 +84,7 @@ ; GCN-DAG: v_readlane_b32 s33, v32, 0 ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN: s_sub_u32 s32, s32, 0x400 @@ -99,7 +99,7 @@ ; Make sure if a CSR vgpr is used for SGPR spilling, it is saved and restored ; GCN-LABEL: {{^}}callee_func_sgpr_spill_no_calls: ; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GCN: v_writelane_b32 v32 @@ -107,7 +107,7 @@ ; GCN: v_readlane_b32 s{{[0-9]+}}, v32 ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NEXT: s_waitcnt Index: test/CodeGen/AMDGPU/callee-special-input-sgprs.ll =================================================================== --- test/CodeGen/AMDGPU/callee-special-input-sgprs.ll +++ test/CodeGen/AMDGPU/callee-special-input-sgprs.ll @@ -116,7 +116,7 @@ ; GCN-LABEL: {{^}}use_stack_workgroup_id_x: ; GCN: s_waitcnt ; GCN-NOT: s32 -; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}} ; GCN: ; use s6 ; GCN: s_setpc_b64 define void @use_stack_workgroup_id_x() #1 { @@ -429,7 +429,7 @@ } ; GCN-LABEL: {{^}}use_every_sgpr_input: -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4 +; GCN: buffer_store_dword v{{[0-9]+}}, off, 
s[0:3], s32{{$}} ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s7 ; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} @@ -577,7 +577,7 @@ ; GCN: s_swappc_b64 -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:4 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO1:[0-9]+]], s[[LO_X]] ; GCN-DAG: v_mov_b32_e32 v[[HI1:[0-9]+]], s[[HI_X]] ; GCN-DAG: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO1]]:[[HI1]]{{\]}} Index: test/CodeGen/AMDGPU/callee-special-input-vgprs.ll =================================================================== --- test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -230,11 +230,11 @@ } ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x: -; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4{{$}} +; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN: buffer_load_dword v32, off, s[0:3], s32{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32 -; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @too_many_args_use_workitem_id_x( @@ -289,7 +289,7 @@ ; GCN: s_mov_b32 s33, s7 ; GCN: s_mov_b32 s32, s33 -; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}} ; GCN: s_mov_b32 s4, s33 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x() #1 { @@ -307,7 +307,7 @@ ; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x: ; GCN: s_mov_b32 s5, s32 -; GCN: buffer_store_dword v1, off, s[0:3], s32 offset: +; GCN: buffer_store_dword v1, off, s[0:3], s32{{$}} ; GCN: s_swappc_b64 define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 { store volatile i32 %arg0, i32 addrspace(1)* undef @@ -326,14 +326,14 @@ ; Requires loading and storing to stack slot. 
; GCN-LABEL: {{^}}too_many_args_call_too_many_args_use_workitem_id_x: ; GCN: s_add_u32 s32, s32, 0x400{{$}} -; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill -; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4 +; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill +; GCN: buffer_load_dword v32, off, s[0:3], s5{{$}} -; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4{{$}} +; GCN: buffer_store_dword v32, off, s[0:3], s32{{$}} ; GCN: s_swappc_b64 -; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Reload +; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload ; GCN: s_sub_u32 s32, s32, 0x400{{$}} ; GCN: s_setpc_b64 define void @too_many_args_call_too_many_args_use_workitem_id_x( @@ -350,18 +350,17 @@ } ; stack layout: -; frame[0] = emergency stack slot -; frame[1] = byval arg32 -; frame[2] = stack passed workitem ID x -; frame[3] = VGPR spill slot +; frame[0] = byval arg32 +; frame[1] = stack passed workitem ID x +; frame[2] = VGPR spill slot ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_byval: -; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8 +; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GCN-NEXT: s_waitcnt ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32 -; GCN: buffer_load_dword v0, off, s[0:3], s32 offset:4 -; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN: buffer_load_dword v0, off, s[0:3], s32{{$}} +; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GCN: s_setpc_b64 define void @too_many_args_use_workitem_id_x_byval( i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, @@ -410,13 +409,9 @@ ret void } -; frame[0] = emergency stack slot -; frame[1] = - -; sp[0] = callee emergency stack slot reservation -; sp[1] = byval -; sp[2] = ?? -; sp[3] = stack passed workitem ID x +; sp[0] = byval +; sp[1] = ?? 
+; sp[2] = stack passed workitem ID x ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_byval: ; GCN: enable_vgpr_workitem_id = 0 @@ -427,10 +422,10 @@ ; GCN-NOT: s32 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} ; GCN: buffer_store_dword [[K]], off, s[0:3], s33 offset:4 -; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33 offset:4 -; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}} +; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}} ; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]], ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1 { @@ -451,11 +446,11 @@ ; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x_byval: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} -; GCN: buffer_store_dword [[K]], off, s[0:3], s5 offset:4 -; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; GCN: buffer_store_dword [[K]], off, s[0:3], s5{{$}} +; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4 -; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s5 offset:4 -; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}} +; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s5{{$}} +; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}} ; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]], ; GCN: s_swappc_b64 define void @func_call_too_many_args_use_workitem_id_x_byval() #1 { @@ -475,15 +470,15 @@ } ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_xyz: -; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN: buffer_load_dword v32, off, s[0:3], s32{{$}} +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32 ; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32 ; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32 -; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12{{$}} -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32 -; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @too_many_args_use_workitem_id_xyz( @@ -537,10 +532,9 @@ ret void } -; frame[0] = callee emergency stack slot -; frame[1] = ID X -; frame[2] = ID Y -; frame[3] = ID Z +; frame[0] = ID X +; frame[1] = ID Y +; frame[2] = ID Z ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_xyz: ; GCN: enable_vgpr_workitem_id = 2 @@ -548,9 +542,9 @@ ; GCN: s_mov_b32 s33, s7 ; GCN: s_mov_b32 s32, s33 -; GCN-DAG: buffer_store_dword v0, off, s[0:3], s32 offset:4 -; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:8 -; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:12 +; GCN-DAG: buffer_store_dword v0, off, s[0:3], s32{{$}} +; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:4 +; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 { call void @too_many_args_use_workitem_id_xyz( @@ -567,15 +561,14 @@ ; workitem ID X in register, yz on stack ; v31 = workitem ID X -; frame[0] = emergency slot -; frame[1] = workitem Y -; frame[2] = workitem Z +; frame[0] = workitem Y +; frame[1] = 
workitem Z ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_stack_yz: ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31 -; GCN: buffer_load_dword v31, off, s[0:3], s32 offset:4{{$}} +; GCN: buffer_load_dword v31, off, s[0:3], s32{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31 -; GCN: buffer_load_dword v31, off, s[0:3], s32 offset:8{{$}} +; GCN: buffer_load_dword v31, off, s[0:3], s32 offset:4{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31 ; GCN: s_waitcnt @@ -631,9 +624,8 @@ ret void } -; frame[0] = callee emergency stack slot -; frame[1] = ID Y -; frame[2] = ID Z +; frame[0] = ID Y +; frame[1] = ID Z ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_stack_yz: ; GCN: enable_vgpr_workitem_id = 2 @@ -642,8 +634,8 @@ ; GCN: s_mov_b32 s32, s33 ; GCN-DAG: v_mov_b32_e32 v31, v0 -; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:4 -; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:8 +; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32{{$}} +; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_stack_yz() #1 { call void @too_many_args_use_workitem_id_x_stack_yz( Index: test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll =================================================================== --- test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll +++ test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -30,7 +30,7 @@ ; GCN-NEXT: s_mov_b32 s5, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x400 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: v_writelane_b32 v32, s33, 0 ; GCN-NEXT: v_writelane_b32 v32, s34, 1 @@ -47,7 +47,7 @@ ; GCN-NEXT: v_readlane_b32 s34, v32, 1 ; GCN-NEXT: v_readlane_b32 s33, v32, 0 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -68,7 +68,7 @@ ; GCN-NEXT: s_mov_b32 s5, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x400 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: v_writelane_b32 v32, s33, 0 ; GCN-NEXT: v_writelane_b32 v32, s34, 1 @@ -85,7 +85,7 @@ ; GCN-NEXT: v_readlane_b32 s34, v32, 1 ; GCN-NEXT: v_readlane_b32 s33, v32, 0 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -106,7 +106,7 @@ ; GCN-NEXT: s_mov_b32 s5, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x400 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: v_writelane_b32 v32, s33, 0 ; GCN-NEXT: v_writelane_b32 v32, s34, 1 @@ -123,7 +123,7 @@ ; GCN-NEXT: v_readlane_b32 s34, v32, 1 ; GCN-NEXT: v_readlane_b32 s33, 
v32, 0 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -144,7 +144,7 @@ ; GCN-NEXT: s_mov_b32 s5, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x400 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: v_writelane_b32 v32, s33, 0 ; GCN-NEXT: v_writelane_b32 v32, s34, 1 @@ -162,7 +162,7 @@ ; GCN-NEXT: v_mov_b32_e32 v1, v4 ; GCN-NEXT: v_readlane_b32 s33, v32, 0 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 ; GCN-NEXT: s_waitcnt vmcnt(0) Index: test/CodeGen/AMDGPU/frame-index-elimination.ll =================================================================== --- test/CodeGen/AMDGPU/frame-index-elimination.ll +++ test/CodeGen/AMDGPU/frame-index-elimination.ll @@ -9,11 +9,8 @@ ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN: s_sub_u32 s6, s32, s4 -; CI-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6 -; CI-NEXT: v_add_i32_e64 v0, s[6:7], 4, [[SCALED]] - -; GFX9-NEXT: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s6 -; GFX9-NEXT: v_add_u32_e32 v0, 4, [[SCALED]] +; CI-NEXT: v_lshr_b32_e64 v0, s6, 6 +; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s6 ; GCN-NOT: v_mov ; GCN: ds_write_b32 v0, v0 @@ -23,6 +20,36 @@ ret void } +; Offset due to different objects +; GCN-LABEL: {{^}}func_mov_fi_i32_offset: +; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) + +; CI: s_sub_u32 s6, s32, s4 +; CI-NEXT: v_lshr_b32_e64 v0, s6, 6 + +; CI: s_sub_u32 s6, s32, s4 +; CI-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6 +; CI-NEXT: v_add_i32_e64 v1, s[6:7], 4, [[SCALED]] +; CI-NOT: v_mov +; CI: ds_write_b32 v0, v0 +; CI-NEXT: ds_write_b32 v0, v1 + +; GFX9: s_sub_u32 s6, s32, s4 +; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s6 +; GFX9-DAG: ds_write_b32 v0, v0 + +; GFX9-DAG: s_sub_u32 s6, s32, s4 +; GFX9-NEXT: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s6 +; GFX9-NEXT: v_add_u32_e32 v0, 4, [[SCALED]] +; GFX9-NEXT: ds_write_b32 v0, v0 +define void @func_mov_fi_i32_offset() #0 { + %alloca0 = alloca i32, addrspace(5) + %alloca1 = alloca i32, addrspace(5) + store volatile i32 addrspace(5)* %alloca0, i32 addrspace(5)* addrspace(3)* undef + store volatile i32 addrspace(5)* %alloca1, i32 addrspace(5)* addrspace(3)* undef + ret void +} + ; Materialize into an add of a constant offset from the FI. 
; FIXME: Should be able to merge adds @@ -31,12 +58,10 @@ ; GCN: s_sub_u32 s6, s32, s4 ; CI-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6 -; CI-NEXT: v_add_i32_e64 v0, s[6:7], 4, [[SCALED]] -; CI-NEXT: v_add_i32_e32 v0, vcc, 4, v0 +; CI-NEXT: v_add_i32_e32 v0, vcc, 4, [[SCALED]] ; GFX9-NEXT: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s6 ; GFX9-NEXT: v_add_u32_e32 v0, 4, [[SCALED]] -; GFX9-NEXT: v_add_u32_e32 v0, 4, v0 ; GCN-NOT: v_mov @@ -54,11 +79,9 @@ ; GCN-LABEL: {{^}}func_other_fi_user_i32: ; GCN: s_sub_u32 s6, s32, s4 -; CI-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6 -; CI-NEXT: v_add_i32_e64 v0, s[6:7], 4, [[SCALED]] +; CI-NEXT: v_lshr_b32_e64 v0, s6, 6 -; GFX9-NEXT: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s6 -; GFX9-NEXT: v_add_u32_e32 v0, 4, [[SCALED]] +; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s6 ; GCN-NEXT: v_mul_u32_u24_e32 v0, 9, v0 ; GCN-NOT: v_mov @@ -92,12 +115,10 @@ ; GCN-NEXT: s_sub_u32 [[SUB_OFFSET:s[0-9]+]], s32, s4 ; CI-NEXT: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], [[SUB_OFFSET]], 6 -; CI-NEXT: v_add_i32_e64 [[ADD:v[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 4, [[SHIFT]] -; CI-NEXT: v_add_i32_e32 v0, vcc, 4, [[ADD]] +; CI-NEXT: v_or_b32_e32 v0, 4, [[SHIFT]] ; GFX9-NEXT: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, [[SUB_OFFSET]] -; GFX9-NEXT: v_add_u32_e32 [[ADD:v[0-9]+]], 4, [[SHIFT]] -; GFX9-NEXT: v_add_u32_e32 v0, 4, [[ADD]] +; GFX9-NEXT: v_or_b32_e32 v0, 4, [[SHIFT]] ; GCN-NOT: v_mov ; GCN: ds_write_b32 v0, v0 @@ -130,17 +151,15 @@ ; GCN: s_sub_u32 [[SUB_OFFSET:s[0-9]+]], s32, s4 ; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], [[SUB_OFFSET]], 6 -; CI: v_add_i32_e64 [[ADD:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 4, [[SHIFT]] ; GFX9: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, [[SUB_OFFSET]] -; GFX9: v_add_u32_e32 [[ADD:v[0-9]+]], 4, [[SHIFT]] ; GCN: s_and_saveexec_b64 -; CI: v_add_i32_e32 v0, vcc, 4, [[ADD]] +; CI: v_add_i32_e32 v0, vcc, 4, [[SHIFT]] ; CI: buffer_load_dword v1, v1, s[0:3], s4 offen offset:4{{$}} -; GFX9: v_add_u32_e32 v0, 4, [[ADD]] +; GFX9: v_add_u32_e32 v0, 4, [[SHIFT]] ; GFX9: buffer_load_dword v1, v{{[0-9]+}}, s[0:3], s4 offen offset:4{{$}} ; GCN: ds_write_b32 @@ -162,7 +181,7 @@ ; Added offset can't be used with VOP3 add ; GCN-LABEL: {{^}}func_other_fi_user_non_inline_imm_offset_i32: ; GCN: s_sub_u32 s6, s32, s4 -; GCN-DAG: s_movk_i32 s6, 0x204 +; GCN-DAG: s_movk_i32 s6, 0x200 ; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6 ; CI: v_add_i32_e64 [[VZ:v[0-9]+]], s[6:7], s6, [[SCALED]] @@ -186,7 +205,7 @@ ; GCN-LABEL: {{^}}func_other_fi_user_non_inline_imm_offset_i32_vcc_live: ; GCN: s_sub_u32 [[DIFF:s[0-9]+]], s32, s4 -; GCN-DAG: s_movk_i32 [[OFFSET:s[0-9]+]], 0x204 +; GCN-DAG: s_movk_i32 [[OFFSET:s[0-9]+]], 0x200 ; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[DIFF]], 6 ; CI: v_add_i32_e64 [[VZ:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, [[OFFSET]], [[SCALED]] @@ -241,7 +260,16 @@ ; GCN-LABEL: {{^}}alloca_ptr_nonentry_block: ; GCN: s_and_saveexec_b64 -; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:12 +; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4 +; GCN: s_sub_u32 [[SUB_OFFSET:s[0-9]+]], s32, s4 + +; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], [[SUB_OFFSET]], 6 +; CI-NEXT: v_or_b32_e32 [[PTR:v[0-9]+]], 4, [[SHIFT]] + +; GFX9: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, [[SUB_OFFSET]] +; GFX9-NEXT: v_or_b32_e32 [[PTR:v[0-9]+]], 4, [[SHIFT]] + +; GCN: ds_write_b32 v{{[0-9]+}}, [[PTR]] define void @alloca_ptr_nonentry_block(i32 %arg0) #0 { %alloca0 = alloca { i8, i32 }, align 4, addrspace(5) %cmp = icmp eq i32 %arg0, 0 Index: test/CodeGen/AMDGPU/function-args.ll 
=================================================================== --- test/CodeGen/AMDGPU/function-args.ll +++ test/CodeGen/AMDGPU/function-args.ll @@ -516,8 +516,8 @@ } ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32: -; GCN-DAG: buffer_load_ubyte v[[ELT0:[0-9]+]], off, s[0:3], s32 offset:4{{$}} -; GCN-DAG: buffer_load_dword v[[ELT1:[0-9]+]], off, s[0:3], s32 offset:8{{$}} +; GCN-DAG: buffer_load_ubyte v[[ELT0:[0-9]+]], off, s[0:3], s32{{$}} +; GCN-DAG: buffer_load_dword v[[ELT1:[0-9]+]], off, s[0:3], s32 offset:4{{$}} ; GCN-DAG: buffer_store_dword v[[ELT1]] ; GCN-DAG: buffer_store_byte v[[ELT0]] define void @void_func_byval_struct_i8_i32({ i8, i32 } addrspace(5)* byval %arg0) #0 { @@ -527,10 +527,10 @@ } ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_x2: -; GCN: buffer_load_ubyte v[[ELT0_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}} -; GCN: buffer_load_dword v[[ELT1_0:[0-9]+]], off, s[0:3], s32 offset:8{{$}} -; GCN: buffer_load_ubyte v[[ELT0_1:[0-9]+]], off, s[0:3], s32 offset:12{{$}} -; GCN: buffer_load_dword v[[ELT1_1:[0-9]+]], off, s[0:3], s32 offset:16{{$}} +; GCN: buffer_load_ubyte v[[ELT0_0:[0-9]+]], off, s[0:3], s32{{$}} +; GCN: buffer_load_dword v[[ELT1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}} +; GCN: buffer_load_ubyte v[[ELT0_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}} +; GCN: buffer_load_dword v[[ELT1_1:[0-9]+]], off, s[0:3], s32 offset:12{{$}} ; GCN: ds_write_b32 v0, v0 ; GCN: s_setpc_b64 @@ -544,7 +544,7 @@ } ; GCN-LABEL: {{^}}void_func_byval_i32_byval_i64: -; GCN-DAG: buffer_load_dword v[[ARG0_LOAD:[0-9]+]], off, s[0:3], s32 offset:4{{$}} +; GCN-DAG: buffer_load_dword v[[ARG0_LOAD:[0-9]+]], off, s[0:3], s32{{$}} ; GCN-DAG: buffer_load_dword v[[ARG1_LOAD0:[0-9]+]], off, s[0:3], s32 offset:8{{$}} ; GCN-DAG: buffer_load_dword v[[ARG1_LOAD1:[0-9]+]], off, s[0:3], s32 offset:12{{$}} ; GCN-DAG: buffer_store_dword v[[ARG0_LOAD]], off @@ -566,9 +566,9 @@ ; GCN-DAG: buffer_store_dwordx4 v[20:23], off ; GCN-DAG: buffer_store_dwordx4 v[24:27], off ; GCN-DAG: buffer_store_dwordx4 v[28:31], off -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1:[0-9]+]], off, s[0:3], s32 offset:4{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:8 -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:12 +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1:[0-9]+]], off, s[0:3], s32{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:4 +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:8 ; GCN: buffer_store_dword v[[LOAD_ARG1]] ; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_1]]{{\]}}, off @@ -581,14 +581,14 @@ ; FIXME: Different ext load types on CI vs. 
VI ; GCN-LABEL: {{^}}void_func_v32i32_i1_i8_i16: -; GCN-DAG: buffer_load_ubyte [[LOAD_ARG1:v[0-9]+]], off, s[0:3], s32 offset:4{{$}} -; VI-DAG: buffer_load_ushort [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s32 offset:8{{$}} -; VI-DAG: buffer_load_ushort [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s32 offset:12{{$}} -; VI-DAG: buffer_load_ushort [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s32 offset:16{{$}} +; GCN-DAG: buffer_load_ubyte [[LOAD_ARG1:v[0-9]+]], off, s[0:3], s32{{$}} +; VI-DAG: buffer_load_ushort [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s32 offset:4{{$}} +; VI-DAG: buffer_load_ushort [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s32 offset:8{{$}} +; VI-DAG: buffer_load_ushort [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s32 offset:12{{$}} -; CI-DAG: buffer_load_dword [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s32 offset:8{{$}} -; CI-DAG: buffer_load_dword [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s32 offset:12{{$}} -; CI-DAG: buffer_load_dword [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s32 offset:16{{$}} +; CI-DAG: buffer_load_dword [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s32 offset:4{{$}} +; CI-DAG: buffer_load_dword [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s32 offset:8{{$}} +; CI-DAG: buffer_load_dword [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s32 offset:12{{$}} ; GCN-DAG: v_and_b32_e32 [[TRUNC_ARG1_I1:v[0-9]+]], 1, [[LOAD_ARG1]] ; CI-DAG: v_cvt_f16_f32_e32 [[CVT_ARG4:v[0-9]+]], [[LOAD_ARG4]] @@ -609,10 +609,10 @@ } ; GCN-LABEL: {{^}}void_func_v32i32_v2i32_v2f32: -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:12{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:16{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:4{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:8{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:12{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_1]]{{\]}}, off ; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_1]]{{\]}}, off @@ -636,15 +636,15 @@ } ; GCN-LABEL: {{^}}void_func_v32i32_v2i64_v2f64: -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:12{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s32 offset:16{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:4{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:8{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s32 offset:12{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:20{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:24{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:28{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:32{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:16{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:20{{$}} +; GCN-DAG: 
buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:24{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:28{{$}} ; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off ; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]{{\]}}, off @@ -656,15 +656,15 @@ } ; GCN-LABEL: {{^}}void_func_v32i32_v4i32_v4f32: -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:12{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s32 offset:16{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:4{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:8{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s32 offset:12{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:20{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:24{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:28{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:32{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:16{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:20{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:24{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:28{{$}} ; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off ; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]{{\]}}, off @@ -676,23 +676,23 @@ } ; GCN-LABEL: {{^}}void_func_v32i32_v8i32_v8f32: -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:12{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s32 offset:16{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s32 offset:20{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s32 offset:24{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s32 offset:28{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s32 offset:32{{$}} - -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:36{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:40{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:44{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:48{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s32 offset:52{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s32 offset:56{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s32 offset:60{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], off, s[0:3], s32 offset:64{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:4{{$}} +; 
GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:8{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s32 offset:12{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s32 offset:16{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s32 offset:20{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s32 offset:24{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s32 offset:28{{$}} + +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:32{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:36{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:40{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:44{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s32 offset:48{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s32 offset:52{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s32 offset:56{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], off, s[0:3], s32 offset:60{{$}} ; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_4]]:[[LOAD_ARG1_7]]{{\]}}, off ; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off @@ -706,39 +706,39 @@ } ; GCN-LABEL: {{^}}void_func_v32i32_v16i32_v16f32: -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:12{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_:[0-9]+]], off, s[0:3], s32 offset:16{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s32 offset:20{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s32 offset:24{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s32 offset:28{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s32 offset:32{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_8:[0-9]+]], off, s[0:3], s32 offset:36{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_9:[0-9]+]], off, s[0:3], s32 offset:40{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_10:[0-9]+]], off, s[0:3], s32 offset:44{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_11:[0-9]+]], off, s[0:3], s32 offset:48{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_12:[0-9]+]], off, s[0:3], s32 offset:52{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_13:[0-9]+]], off, s[0:3], s32 offset:56{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_14:[0-9]+]], off, s[0:3], s32 offset:60{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_15:[0-9]+]], off, s[0:3], s32 offset:64{{$}} - -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:68{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:72{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:76{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:80{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s32 offset:84{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s32 offset:88{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s32 offset:92{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], off, s[0:3], s32 offset:96{{$}} -; GCN-DAG: 
buffer_load_dword v[[LOAD_ARG2_8:[0-9]+]], off, s[0:3], s32 offset:100{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_9:[0-9]+]], off, s[0:3], s32 offset:104{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_10:[0-9]+]], off, s[0:3], s32 offset:108{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_11:[0-9]+]], off, s[0:3], s32 offset:112{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_12:[0-9]+]], off, s[0:3], s32 offset:116{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_13:[0-9]+]], off, s[0:3], s32 offset:120{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_14:[0-9]+]], off, s[0:3], s32 offset:124{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_15:[0-9]+]], off, s[0:3], s32 offset:128{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:8{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s32 offset:12{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s32 offset:16{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s32 offset:20{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s32 offset:24{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s32 offset:28{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_8:[0-9]+]], off, s[0:3], s32 offset:32{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_9:[0-9]+]], off, s[0:3], s32 offset:36{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_10:[0-9]+]], off, s[0:3], s32 offset:40{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_11:[0-9]+]], off, s[0:3], s32 offset:44{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_12:[0-9]+]], off, s[0:3], s32 offset:48{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_13:[0-9]+]], off, s[0:3], s32 offset:52{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_14:[0-9]+]], off, s[0:3], s32 offset:56{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_15:[0-9]+]], off, s[0:3], s32 offset:60{{$}}
+
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:64{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:68{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:72{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:76{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s32 offset:80{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s32 offset:84{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s32 offset:88{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], off, s[0:3], s32 offset:92{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_8:[0-9]+]], off, s[0:3], s32 offset:96{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_9:[0-9]+]], off, s[0:3], s32 offset:100{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_10:[0-9]+]], off, s[0:3], s32 offset:104{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_11:[0-9]+]], off, s[0:3], s32 offset:108{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_12:[0-9]+]], off, s[0:3], s32 offset:112{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_13:[0-9]+]], off, s[0:3], s32 offset:116{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_14:[0-9]+]], off, s[0:3], s32 offset:120{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_15:[0-9]+]], off, s[0:3], s32 offset:124{{$}}
define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, 
<16 x i32> %arg1, <16 x float> %arg2) #0 { store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef store volatile <16 x i32> %arg1, <16 x i32> addrspace(1)* undef Index: test/CodeGen/AMDGPU/load-hi16.ll =================================================================== --- test/CodeGen/AMDGPU/load-hi16.ll +++ test/CodeGen/AMDGPU/load-hi16.ll @@ -503,7 +503,7 @@ ; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}} define void @load_private_hi_v2i16_reglo_vreg(i16 addrspace(5)* byval %in, i16 %reg) #0 { entry: - %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2045 + %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2047 %load = load i16, i16 addrspace(5)* %gep %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1 @@ -522,7 +522,7 @@ ; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}} define void @load_private_hi_v2f16_reglo_vreg(half addrspace(5)* byval %in, half %reg) #0 { entry: - %gep = getelementptr inbounds half, half addrspace(5)* %in, i64 2045 + %gep = getelementptr inbounds half, half addrspace(5)* %in, i64 2047 %load = load half, half addrspace(5)* %gep %build0 = insertelement <2 x half> undef, half %reg, i32 0 %build1 = insertelement <2 x half> %build0, half %load, i32 1 @@ -577,7 +577,7 @@ ; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}} define void @load_private_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, i16 %reg) #0 { entry: - %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091 + %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095 %load = load i8, i8 addrspace(5)* %gep %ext = zext i8 %load to i16 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 @@ -597,7 +597,7 @@ ; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}} define void @load_private_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, half %reg) #0 { entry: - %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091 + %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095 %load = load i8, i8 addrspace(5)* %gep %ext = zext i8 %load to i16 %bitcast = bitcast i16 %ext to half @@ -618,7 +618,7 @@ ; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}} define void @load_private_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, half %reg) #0 { entry: - %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091 + %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095 %load = load i8, i8 addrspace(5)* %gep %ext = sext i8 %load to i16 %bitcast = bitcast i16 %ext to half @@ -639,7 +639,7 @@ ; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}} define void @load_private_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, i16 %reg) #0 { entry: - %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091 + %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095 %load = load i8, i8 addrspace(5)* %gep %ext = sext i8 %load to i16 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 @@ -796,7 +796,7 @@ %obj1 = alloca [4096 x i16], align 2, addrspace(5) %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)* store volatile i32 123, i32 addrspace(5)* %bc - %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2025 + %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2027 %load = load i16, i16 
addrspace(5)* %gep %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1 @@ -813,7 +813,7 @@ %obj1 = alloca [4096 x i8], align 2, addrspace(5) %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)* store volatile i32 123, i32 addrspace(5)* %bc - %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4051 + %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055 %load = load i8, i8 addrspace(5)* %gep %ext = sext i8 %load to i16 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 @@ -831,7 +831,7 @@ %obj1 = alloca [4096 x i8], align 2, addrspace(5) %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)* store volatile i32 123, i32 addrspace(5)* %bc - %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4051 + %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055 %load = load i8, i8 addrspace(5)* %gep %ext = zext i8 %load to i16 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 @@ -975,9 +975,9 @@ ; FIXME: Is there a cost to using the extload over not? ; GCN-LABEL: {{^}}load_private_v2i16_split: ; GCN: s_waitcnt -; GFX900: buffer_load_ushort v0, off, s[0:3], s32 offset:4{{$}} +; GFX900: buffer_load_ushort v0, off, s[0:3], s32{{$}} ; GFX900-NEXT: s_waitcnt -; GFX900-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:6 +; GFX900-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:2 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 define <2 x i16> @load_private_v2i16_split(i16 addrspace(5)* byval %in) #0 { Index: test/CodeGen/AMDGPU/load-lo16.ll =================================================================== --- test/CodeGen/AMDGPU/load-lo16.ll +++ test/CodeGen/AMDGPU/load-lo16.ll @@ -600,7 +600,7 @@ define void @load_private_lo_v2i16_reglo_vreg(i16 addrspace(5)* byval %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> - %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2045 + %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2047 %load = load i16, i16 addrspace(5)* %gep %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef @@ -621,7 +621,7 @@ ; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}} define void @load_private_lo_v2i16_reghi_vreg(i16 addrspace(5)* byval %in, i16 %reg) #0 { entry: - %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2045 + %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2047 %load = load i16, i16 addrspace(5)* %gep %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 0 @@ -641,7 +641,7 @@ define void @load_private_lo_v2f16_reglo_vreg(half addrspace(5)* byval %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x half> - %gep = getelementptr inbounds half, half addrspace(5)* %in, i64 2045 + %gep = getelementptr inbounds half, half addrspace(5)* %in, i64 2047 %load = load half, half addrspace(5)* %gep %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0 store <2 x half> %build1, <2 x half> addrspace(1)* undef @@ -714,7 +714,7 @@ define void @load_private_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> - %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091 + %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, 
i64 4095 %load = load i8, i8 addrspace(5)* %gep %ext = zext i8 %load to i16 %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0 @@ -734,7 +734,7 @@ define void @load_private_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> - %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091 + %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095 %load = load i8, i8 addrspace(5)* %gep %ext = sext i8 %load to i16 %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0 @@ -905,7 +905,7 @@ %reg.bc = bitcast i32 %reg to <2 x i16> %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)* store volatile i32 123, i32 addrspace(5)* %bc - %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2025 + %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2027 %load = load volatile i16, i16 addrspace(5)* %gep %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef @@ -924,7 +924,7 @@ %reg.bc = bitcast i32 %reg to <2 x i16> %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)* store volatile i32 123, i32 addrspace(5)* %bc - %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4051 + %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055 %load = load volatile i8, i8 addrspace(5)* %gep %load.ext = sext i8 %load to i16 %build1 = insertelement <2 x i16> %reg.bc, i16 %load.ext, i32 0 @@ -944,7 +944,7 @@ %reg.bc = bitcast i32 %reg to <2 x i16> %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)* store volatile i32 123, i32 addrspace(5)* %bc - %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4051 + %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055 %load = load volatile i8, i8 addrspace(5)* %gep %load.ext = zext i8 %load to i16 %build1 = insertelement <2 x i16> %reg.bc, i16 %load.ext, i32 0 @@ -964,7 +964,7 @@ %reg.bc = bitcast i32 %reg to <2 x half> %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)* store volatile i32 123, i32 addrspace(5)* %bc - %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4051 + %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055 %load = load volatile i8, i8 addrspace(5)* %gep %load.ext = sext i8 %load to i16 %bitcast = bitcast i16 %load.ext to half @@ -985,7 +985,7 @@ %reg.bc = bitcast i32 %reg to <2 x half> %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)* store volatile i32 123, i32 addrspace(5)* %bc - %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4051 + %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055 %load = load volatile i8, i8 addrspace(5)* %gep %load.ext = zext i8 %load to i16 %bitcast = bitcast i16 %load.ext to half Index: test/CodeGen/AMDGPU/multi-dword-vgpr-spill.ll =================================================================== --- test/CodeGen/AMDGPU/multi-dword-vgpr-spill.ll +++ test/CodeGen/AMDGPU/multi-dword-vgpr-spill.ll @@ -1,12 +1,12 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s ; CHECK-LABEL: spill_v2i32: -; CHECK-DAG: buffer_store_dword v{{.*}} offset:24 ; 4-byte Folded Spill -; CHECK-DAG: 
buffer_store_dword v{{.*}} offset:28 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:16 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:20 ; 4-byte Folded Spill ; CHECK: ;;#ASMSTART ; CHECK-NEXT: ;;#ASMEND -; CHECK-DAG: buffer_load_dword v{{.*}} offset:24 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:28 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:16 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:20 ; 4-byte Folded Reload define void @spill_v2i32() { entry: @@ -25,12 +25,12 @@ } ; CHECK-LABEL: spill_v2f32: -; CHECK-DAG: buffer_store_dword v{{.*}} offset:24 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:28 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:16 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:20 ; 4-byte Folded Spill ; CHECK: ;;#ASMSTART ; CHECK-NEXT: ;;#ASMEND -; CHECK-DAG: buffer_load_dword v{{.*}} offset:24 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:28 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:16 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:20 ; 4-byte Folded Reload define void @spill_v2f32() { entry: @@ -49,14 +49,14 @@ } ; CHECK-LABEL: spill_v3i32: -; CHECK-DAG: buffer_store_dword v{{.*}} offset:48 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:52 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:56 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:32 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:36 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:40 ; 4-byte Folded Spill ; CHECK: ;;#ASMSTART ; CHECK-NEXT: ;;#ASMEND -; CHECK-DAG: buffer_load_dword v{{.*}} offset:48 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:52 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:56 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload define void @spill_v3i32() { entry: @@ -75,14 +75,14 @@ } ; CHECK-LABEL: spill_v3f32: -; CHECK-DAG: buffer_store_dword v{{.*}} offset:48 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:52 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:56 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:32 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:36 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:40 ; 4-byte Folded Spill ; CHECK: ;;#ASMSTART ; CHECK-NEXT: ;;#ASMEND -; CHECK-DAG: buffer_load_dword v{{.*}} offset:48 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:52 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:56 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload define void @spill_v3f32() { entry: @@ -101,16 +101,16 @@ } ; CHECK-LABEL: spill_v4i32: -; CHECK-DAG: buffer_store_dword v{{.*}} offset:48 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:52 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} 
offset:56 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:60 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:32 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:36 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:40 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:44 ; 4-byte Folded Spill ; CHECK: ;;#ASMSTART ; CHECK-NEXT: ;;#ASMEND -; CHECK-DAG: buffer_load_dword v{{.*}} offset:48 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:52 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:56 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:60 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:44 ; 4-byte Folded Reload define void @spill_v4i32() { entry: @@ -129,16 +129,16 @@ } ; CHECK-LABEL: spill_v4f32: -; CHECK-DAG: buffer_store_dword v{{.*}} offset:48 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:52 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:56 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:60 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:32 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:36 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:40 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:44 ; 4-byte Folded Spill ; CHECK: ;;#ASMSTART ; CHECK-NEXT: ;;#ASMEND -; CHECK-DAG: buffer_load_dword v{{.*}} offset:48 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:52 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:56 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:60 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:44 ; 4-byte Folded Reload define void @spill_v4f32() { entry: @@ -157,17 +157,16 @@ } ; CHECK-LABEL: spill_v5i32: -; CHECK-DAG: buffer_store_dword v{{.*}} offset:96 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:100 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:104 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:108 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:64 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:68 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:72 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:76 ; 4-byte Folded Spill ; CHECK: ;;#ASMSTART ; CHECK-NEXT: ;;#ASMEND -; CHECK-DAG: buffer_load_dword v{{.*}} offset:96 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:100 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:104 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:108 ; 4-byte Folded Reload - +; CHECK-DAG: buffer_load_dword v{{.*}} offset:64 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:68 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:72 ; 4-byte 
Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:76 ; 4-byte Folded Reload define void @spill_v5i32() { entry: %alloca = alloca <5 x i32>, i32 2, align 4, addrspace(5) @@ -185,17 +184,16 @@ } ; CHECK-LABEL: spill_v5f32: -; CHECK-DAG: buffer_store_dword v{{.*}} offset:96 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:100 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:104 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:108 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:64 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:68 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:72 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:76 ; 4-byte Folded Spill ; CHECK: ;;#ASMSTART ; CHECK-NEXT: ;;#ASMEND -; CHECK-DAG: buffer_load_dword v{{.*}} offset:96 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:100 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:104 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:108 ; 4-byte Folded Reload - +; CHECK-DAG: buffer_load_dword v{{.*}} offset:64 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:68 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:72 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:76 ; 4-byte Folded Reload define void @spill_v5f32() { entry: %alloca = alloca <5 x i32>, i32 2, align 4, addrspace(5) @@ -211,6 +209,3 @@ ret void } - - - Index: test/CodeGen/AMDGPU/nested-calls.ll =================================================================== --- test/CodeGen/AMDGPU/nested-calls.ll +++ test/CodeGen/AMDGPU/nested-calls.ll @@ -13,7 +13,7 @@ ; GCN-DAG: s_add_u32 s32, s32, 0x400 ; Spill CSR VGPR used for SGPR spilling ; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GCN-DAG: v_writelane_b32 v32, s33, 0 @@ -26,7 +26,7 @@ ; GCN: v_readlane_b32 s34, v32, 1 ; GCN: v_readlane_b32 s33, v32, 0 ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN: s_sub_u32 s32, s32, 0x400 Index: test/CodeGen/AMDGPU/sibling-call.ll =================================================================== --- test/CodeGen/AMDGPU/sibling-call.ll +++ test/CodeGen/AMDGPU/sibling-call.ll @@ -19,7 +19,7 @@ ; GCN-NEXT: v_mov_b32_e32 [[K:v[0-9]+]], 9 ; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GCN: buffer_store_dword [[K]], off, s[0:3], s32 offset:24 +; GCN: buffer_store_dword [[K]], off, s[0:3], s32 offset:20 ; GCN: s_waitcnt vmcnt(0) ; GCN: s_setpc_b64 ; GCN: ; ScratchSize: 68 @@ -40,7 +40,7 @@ ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_stack_object: ; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 -; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:24 +; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:20 ; GCN: s_setpc_b64 ; GCN: ; ScratchSize: 68 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b, i32 %c) #1 { @@ -54,7 +54,7 @@ ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_callee_stack_object: ; GCN: v_mov_b32_e32 
[[NINE:v[0-9]+]], 9
-; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:24
+; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:20
; GCN: s_setpc_b64
; GCN: ; ScratchSize: 136
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_callee_stack_object(i32 %a, i32 %b, i32 %c) #1 {
@@ -84,7 +84,7 @@
; GCN-LABEL: {{^}}i32_fastcc_i32_byval_i32:
; GCN: s_waitcnt
-; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32{{$}}
; GCN-NEXT: s_waitcnt vmcnt(0)
; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
@@ -100,7 +100,7 @@
; Tail call disallowed with byval in parent.
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32_byval_parent:
; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, s32
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32{{$}}
; GCN: s_swappc_b64
; GCN-NOT: v_readlane_b32 s32
; GCN: s_setpc_b64
@@ -110,14 +110,17 @@
ret i32 %ret
}
-; Tail call disallowed with byval in parent, not callee.
+; Tail call disallowed with byval in parent, not callee. The stack
+; area of the incoming arguments must be at least as large as that
+; of the outgoing stack arguments (see the C sketch below).
+
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32:
; GCN-NOT: v0
; GCN-NOT: s32
; GCN: buffer_load_dword v1, off, s[0:3], s4 offset:16
-; GCN: buffer_store_dword v1, off, s[0:3], s32 offset:4
+; GCN: buffer_store_dword v1, off, s[0:3], s32{{$}}
; GCN-NEXT: s_setpc_b64
-define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32(i32 %a, [16 x i32] %large) #1 {
+define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32(i32 %a, [32 x i32] %large) #1 {
entry:
%ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, i32 addrspace(5)* inttoptr (i32 16 to i32 addrspace(5)*))
ret i32 %ret
@@ -125,8 +128,8 @@
; GCN-LABEL: {{^}}i32_fastcc_i32_i32_a32i32:
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32 offset:4
-; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:8
+; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32{{$}}
+; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:4
; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_0]]
@@ -149,19 +152,19 @@
; FIXME: Why load and store same location for stack args? 
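(Aside, before returning to the diff: the byval sibling-call rule noted above reduces to a single comparison between the caller's incoming stack-argument area and the call's outgoing one. The sketch below is an illustration only; the helper name outgoingArgsFit and the byte counts are assumptions for this note, not LLVM's actual isEligibleForTailCallOptimization code.)

/* Illustrative sketch (not part of the patch): a sibling call reuses the
 * caller's incoming stack-argument area for its outgoing stack arguments,
 * so the outgoing area must fit inside the incoming one. */
#include <stdbool.h>
#include <stdio.h>

static bool outgoingArgsFit(unsigned incomingStackBytes,
                            unsigned outgoingStackBytes) {
  /* The tail call is only eligible when this holds. */
  return outgoingStackBytes <= incomingStackBytes;
}

int main(void) {
  /* With [16 x i32] %large, i32 %a plus 16 dwords fit in argument
   * registers, leaving a 0-byte incoming stack area, so the callee's
   * 4-byte byval store cannot be folded into a tail call. [32 x i32]
   * forces some arguments onto the stack; 64 is an illustrative size. */
  printf("[16 x i32]: %d\n", outgoingArgsFit(0, 4));  /* 0: rejected */
  printf("[32 x i32]: %d\n", outgoingArgsFit(64, 4)); /* 1: allowed  */
  return 0;
}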
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32: -; GCN-DAG: buffer_store_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-DAG: buffer_store_dword v33, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-DAG: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-DAG: buffer_store_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32 offset:4 -; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:8 +; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32{{$}} +; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:4 ; GCN-NOT: s32 -; GCN-DAG: buffer_store_dword [[LOAD_0]], off, s[0:3], s32 offset:4 -; GCN-DAG: buffer_store_dword [[LOAD_1]], off, s[0:3], s32 offset:8 +; GCN-DAG: buffer_store_dword [[LOAD_0]], off, s[0:3], s32{{$}} +; GCN-DAG: buffer_store_dword [[LOAD_1]], off, s[0:3], s32 offset:4 -; GCN-DAG: buffer_load_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-DAG: buffer_load_dword v33, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-DAG: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-DAG: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GCN-NOT: s32 ; GCN: s_setpc_b64 @@ -173,7 +176,7 @@ ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32_stack_object: ; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 -; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:44 +; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:40 ; GCN: s_setpc_b64 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i32 %b, [32 x i32] %c) #1 { entry: @@ -203,11 +206,11 @@ ; GCN: s_add_u32 s32, s32, 0x400 ; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1 -; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s5 offset:12 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s5 offset:8 ; GCN-NEXT: s_mov_b64 exec -; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill -; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill +; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill +; GCN: buffer_store_dword v33, off, s[0:3], s5 ; 4-byte Folded Spill ; GCN-DAG: v_writelane_b32 v34, s33, 0 ; GCN-DAG: v_writelane_b32 v34, s34, 1 @@ -221,10 +224,10 @@ ; GCN-DAG: v_readlane_b32 s33, v34, 0 ; GCN-DAG: v_readlane_b32 s34, v34, 1 -; GCN: buffer_load_dword v33, off, s[0:3], s5 offset:4 -; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8 +; GCN: buffer_load_dword v33, off, s[0:3], s5 ; 4-byte Folded Reload +; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload ; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s5 offset:12 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s5 offset:8 ; GCN-NEXT: s_mov_b64 exec ; GCN: s_sub_u32 s32, s32, 0x400 @@ -258,7 +261,7 @@ ; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area: ; GCN-NOT: s33 -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:48 +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:44 ; GCN-NOT: s33 ; GCN: s_setpc_b64 s[6:7] Index: test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll =================================================================== --- test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll +++ test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll @@ -4,13 +4,13 @@ ; 
storeRegToStackSlot. ; GCN-LABEL: {{^}}spill_csr_s5_copy: -; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill +; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill ; GCN: v_writelane_b32 v32, s5, 2 ; GCN: s_swappc_b64 ; GCN: v_readlane_b32 s5, v32, 2 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 9 -; GCN: buffer_store_dword [[K]], off, s[0:3], s5 offset:4 -; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Reload +; GCN: buffer_store_dword [[K]], off, s[0:3], s5{{$}} +; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload ; GCN: s_setpc_b64 define void @spill_csr_s5_copy() #0 { bb: Index: test/CodeGen/AMDGPU/spill-offset-calculation.ll =================================================================== --- test/CodeGen/AMDGPU/spill-offset-calculation.ll +++ test/CodeGen/AMDGPU/spill-offset-calculation.ll @@ -82,6 +82,55 @@ ret void } +; CHECK-LABEL: test_sgpr_offset_function_scavenge_fail +define void @test_sgpr_offset_function_scavenge_fail() #2 { +entry: + ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not + ; fit in the instruction, and has to live in the SGPR offset. + %alloca = alloca i8, i32 4096, align 4, addrspace(5) + %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* + + %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 + + %asm.0 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"() + %asm0.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 0 + %asm1.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 1 + %asm2.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 2 + %asm3.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 3 + %asm4.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 4 + %asm5.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 5 + %asm6.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 6 + %asm7.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 7 + + ; 0x40000 / 64 = 4096 (for wave64) + %a = load volatile i32, i32 addrspace(5)* %aptr + + ; CHECK: s_add_u32 s32, s32, 0x40000 + ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Spill + ; CHECK: s_sub_u32 s32, s32, 0x40000 + call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a) + + %asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"() + %asm0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 0 + %asm1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 1 + %asm2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 2 + %asm3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 3 + %asm4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 4 + %asm5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 5 + %asm6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 6 + %asm7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 7 + + call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0 + + ; CHECK: s_add_u32 s32, s32, 0x40000 + ; CHECK: buffer_load_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Reload + ; CHECK: s_sub_u32 s32, s32, 0x40000 + + ; Force %a to spill with no free SGPRs + call void 
asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a) + ret void +} + ; CHECK-LABEL: test_sgpr_offset_subregs_kernel define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() { entry: @@ -145,7 +194,7 @@ entry: ; Occupy 4092 bytes of scratch, so the offset of the spill of %a just fits in ; the instruction offset field. - %alloca = alloca i8, i32 4088, align 4, addrspace(5) + %alloca = alloca i8, i32 4092, align 4, addrspace(5) %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 @@ -166,7 +215,7 @@ entry: ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not ; fit in the instruction, and has to live in the SGPR offset. - %alloca = alloca i8, i32 4092, align 4, addrspace(5) + %alloca = alloca i8, i32 4096, align 4, addrspace(5) %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 @@ -190,7 +239,7 @@ ; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a ; still fits below offset 4096 (4088 + 8 - 4 = 4092), and can be placed in ; the instruction offset field. - %alloca = alloca i8, i32 4084, align 4, addrspace(5) + %alloca = alloca i8, i32 4088, align 4, addrspace(5) %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)* @@ -218,7 +267,7 @@ ; Occupy 4092 bytes of scratch, so that the spill of the last subreg of %a ; does not fit below offset 4096 (4092 + 8 - 4 = 4096), and has to live ; in the SGPR offset. - %alloca = alloca i8, i32 4088, align 4, addrspace(5) + %alloca = alloca i8, i32 4092, align 4, addrspace(5) %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)* @@ -244,3 +293,4 @@ attributes #0 = { nounwind } attributes #1 = { nounwind "amdgpu-num-sgpr"="18" "amdgpu-num-vgpr"="8" } +attributes #2 = { nounwind "amdgpu-num-sgpr"="16" "amdgpu-num-vgpr"="8" } Index: test/CodeGen/AMDGPU/stack-realign.ll =================================================================== --- test/CodeGen/AMDGPU/stack-realign.ll +++ test/CodeGen/AMDGPU/stack-realign.ll @@ -10,8 +10,9 @@ ; GCN-LABEL: {{^}}needs_align16_default_stack_align: ; GCN: s_sub_u32 [[SUB:s[0-9]+]], s32, s4 -; GCN-NEXT: v_lshrrev_b32_e64 [[FRAMEDIFF:v[0-9]+]], 6, [[SUB]] -; GCN: v_add_u32_e64 [[FI:v[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 16, [[FRAMEDIFF]] +; GCN-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, v0 +; GCN-DAG: v_lshrrev_b32_e64 [[FRAMEDIFF:v[0-9]+]], 6, [[SUB]] +; GCN: v_add_u32_e32 [[FI:v[0-9]+]], vcc, [[FRAMEDIFF]], [[SCALED_IDX]] ; GCN-NOT: s32 @@ -126,10 +127,10 @@ ; GCN-LABEL: {{^}}default_realign_align128: ; GCN: s_add_u32 [[TMP:s[0-9]+]], s32, 0x1fc0 ; GCN-NEXT: s_and_b32 s5, [[TMP]], 0xffffe000 -; GCN-NEXT: s_add_u32 s32, s32, 0x6000 +; GCN-NEXT: s_add_u32 s32, s32, 0x4000 ; GCN-NOT: s5 -; GCN: buffer_store_dword v0, off, s[0:3], s5 offset:128 -; GCN: s_sub_u32 s32, s32, 0x6000 +; GCN: buffer_store_dword v0, off, s[0:3], s5{{$}} +; GCN: s_sub_u32 s32, s32, 0x4000 define void @default_realign_align128(i32 %idx) #0 { %alloca.align = alloca i32, align 128, addrspace(5) store volatile i32 9, i32 addrspace(5)* %alloca.align, align 128 @@ -138,7 +139,7 @@ ; GCN-LABEL: {{^}}disable_realign_align128: ; GCN-NOT: s32 -; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}} ; 
GCN-NOT: s32 define void @disable_realign_align128(i32 %idx) #3 { %alloca.align = alloca i32, align 128, addrspace(5) Index: test/CodeGen/AMDGPU/store-hi16.ll =================================================================== --- test/CodeGen/AMDGPU/store-hi16.ll +++ test/CodeGen/AMDGPU/store-hi16.ll @@ -492,7 +492,7 @@ entry: %value = bitcast i32 %arg to <2 x i16> %hi = extractelement <2 x i16> %value, i32 1 - %gep = getelementptr inbounds i16, i16 addrspace(5)* %out, i64 2045 + %gep = getelementptr inbounds i16, i16 addrspace(5)* %out, i64 2047 store i16 %hi, i16 addrspace(5)* %gep ret void } @@ -644,7 +644,7 @@ store volatile i32 123, i32 addrspace(5)* %bc %value = bitcast i32 %arg to <2 x i16> %hi = extractelement <2 x i16> %value, i32 1 - %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2025 + %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2027 store i16 %hi, i16 addrspace(5)* %gep ret void } @@ -661,7 +661,7 @@ store volatile i32 123, i32 addrspace(5)* %bc %value = bitcast i32 %arg to <2 x i16> %hi = extractelement <2 x i16> %value, i32 1 - %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4051 + %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055 %trunc = trunc i16 %hi to i8 store i8 %trunc, i8 addrspace(5)* %gep ret void
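(One closing note on the arithmetic behind test_sgpr_offset_function_scavenge_fail in spill-offset-calculation.ll above: the MUBUF immediate offset field only reaches 4095, and the SGPR offset added to s32 counts bytes across the whole wave, so the 4096-byte per-lane offset becomes 0x40000, per the test's own comment "0x40000 / 64 = 4096 (for wave64)". A minimal sketch of that calculation, with the constants taken from the test:)

#include <assert.h>
#include <stdio.h>

int main(void) {
  const unsigned MaxImmOffset = 4095;  /* largest MUBUF immediate offset */
  const unsigned WaveSize = 64;        /* wave64, per the test comment */
  const unsigned PerLaneBytes = 4096;  /* scratch occupied by the alloca */

  /* The per-lane offset no longer fits the immediate field, so it must
   * live in the SGPR offset, pre-scaled by the wave size. */
  assert(PerLaneBytes > MaxImmOffset);
  printf("s_add_u32 s32, s32, 0x%x\n", PerLaneBytes * WaveSize); /* 0x40000 */
  return 0;
}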