Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -2592,24 +2592,31 @@ if (!IsSibCall) { Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL); + SmallVector<SDValue, 4> CopyFromChains; + unsigned OffsetReg = Info->getScratchWaveOffsetReg(); // In the HSA case, this should be an identity copy. SDValue ScratchRSrcReg = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32); RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg); + CopyFromChains.push_back(ScratchRSrcReg.getValue(1)); // TODO: Don't hardcode these registers and get from the callee function. SDValue ScratchWaveOffsetReg = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32); RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg); + CopyFromChains.push_back(ScratchWaveOffsetReg.getValue(1)); if (!Info->isEntryFunction()) { // Avoid clobbering this function's FP value. In the current convention // callee will overwrite this, so do save/restore around the call site. 
CallerSavedFP = DAG.getCopyFromReg(Chain, DL, Info->getFrameOffsetReg(), MVT::i32); + CopyFromChains.push_back(CallerSavedFP.getValue(1)); } + + Chain = DAG.getTokenFactor(DL, CopyFromChains); } SmallVector<SDValue, 8> MemOpChains; Index: test/CodeGen/AMDGPU/byval-frame-setup.ll =================================================================== --- test/CodeGen/AMDGPU/byval-frame-setup.ll +++ test/CodeGen/AMDGPU/byval-frame-setup.ll @@ -78,16 +78,6 @@ ; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:8 ; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s5 offset:24 -; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s5 offset:24 -; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s5 offset:28 -; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s5 offset:32 -; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s5 offset:36 - -; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:20 -; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:24 -; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:28 -; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:32 - ; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:8 ; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:12 ; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:16 @@ -95,11 +85,21 @@ ; GCN-NOT: s_add_u32 s32, s32, 0x800 + ; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:4{{$}} ; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:8 ; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:12 ; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:16 +; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s5 offset:24 +; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s5 offset:28 +; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s5 offset:32 +; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s5 offset:36 + +; GCN-DAG: 
buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:20 +; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:24 +; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:28 +; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:32 ; GCN: s_swappc_b64 ; GCN-NOT: v_readlane_b32 s32 @@ -272,16 +272,6 @@ ; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:8 ; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s5 offset:24 -; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s5 offset:24 -; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s5 offset:28 -; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s5 offset:32 -; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s5 offset:36 - -; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:24 -; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:28 -; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:32 -; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:36 - ; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:8 ; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:12 ; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:16 @@ -294,7 +284,15 @@ ; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:16 ; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:20 +; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s5 offset:24 +; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s5 offset:28 +; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s5 offset:32 +; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s5 offset:36 +; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:24 +; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:28 +; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:32 +; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:36 ; GCN: s_swappc_b64 ; 
GCN-NOT: v_readlane_b32 s32 Index: test/CodeGen/AMDGPU/call-argument-types.ll =================================================================== --- test/CodeGen/AMDGPU/call-argument-types.ll +++ test/CodeGen/AMDGPU/call-argument-types.ll @@ -805,14 +805,14 @@ } ; GCN-LABEL: {{^}}stack_12xv3i32: -; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 -; GCN: buffer_store_dword [[REG15]], {{.*}} offset:16 -; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 -; GCN: buffer_store_dword [[REG14]], {{.*}} offset:12 -; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 -; GCN: buffer_store_dword [[REG13]], {{.*}} offset:8 ; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 ; GCN: buffer_store_dword [[REG12]], {{.*}} offset:4 +; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 +; GCN: buffer_store_dword [[REG13]], {{.*}} offset:8 +; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 +; GCN: buffer_store_dword [[REG14]], {{.*}} offset:12 +; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 +; GCN: buffer_store_dword [[REG15]], {{.*}} offset:16 ; GCN: v_mov_b32_e32 v31, 11 ; GCN: s_getpc define void @stack_12xv3i32() #0 { @@ -834,14 +834,14 @@ } ; GCN-LABEL: {{^}}stack_12xv3f32: -; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000 -; GCN: buffer_store_dword [[REG15]], {{.*}} offset:16 -; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000 -; GCN: buffer_store_dword [[REG14]], {{.*}} offset:12 -; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000 -; GCN: buffer_store_dword [[REG13]], {{.*}} offset:8 ; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000 ; GCN: buffer_store_dword [[REG12]], {{.*}} offset:4 +; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000 +; GCN: buffer_store_dword [[REG13]], {{.*}} offset:8 +; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000 +; GCN: buffer_store_dword [[REG14]], {{.*}} offset:12 +; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000 +; GCN: buffer_store_dword [[REG15]], {{.*}} offset:16 ; GCN: v_mov_b32_e32 v31, 0x41300000 ; GCN: s_getpc define void @stack_12xv3f32() #0 { @@ -863,22 +863,24 @@ } ; GCN-LABEL: 
{{^}}stack_8xv5i32: -; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 -; GCN: buffer_store_dword [[REG15]], {{.*}} offset:32 -; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 -; GCN: buffer_store_dword [[REG14]], {{.*}} offset:28 -; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 -; GCN: buffer_store_dword [[REG13]], {{.*}} offset:24 -; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 -; GCN: buffer_store_dword [[REG12]], {{.*}} offset:20 -; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 11 -; GCN: buffer_store_dword [[REG11]], {{.*}} offset:16 -; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 10 -; GCN: buffer_store_dword [[REG10]], {{.*}} offset:12 -; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 9 -; GCN: buffer_store_dword [[REG9]], {{.*}} offset:8 + ; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 8 ; GCN: buffer_store_dword [[REG8]], {{.*}} offset:4 +; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 9 +; GCN: buffer_store_dword [[REG9]], {{.*}} offset:8 +; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 10 +; GCN: buffer_store_dword [[REG10]], {{.*}} offset:12 +; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 11 +; GCN: buffer_store_dword [[REG11]], {{.*}} offset:16 +; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 +; GCN: buffer_store_dword [[REG12]], {{.*}} offset:20 +; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 +; GCN: buffer_store_dword [[REG13]], {{.*}} offset:24 +; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 +; GCN: buffer_store_dword [[REG14]], {{.*}} offset:28 +; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 +; GCN: buffer_store_dword [[REG15]], {{.*}} offset:32 + ; GCN: v_mov_b32_e32 v31, 7 ; GCN: s_getpc define void @stack_8xv5i32() #0 { @@ -896,22 +898,23 @@ } ; GCN-LABEL: {{^}}stack_8xv5f32: -; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000 -; GCN: buffer_store_dword [[REG15]], {{.*}} offset:32 -; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000 -; GCN: buffer_store_dword [[REG14]], {{.*}} offset:28 -; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000 -; GCN: buffer_store_dword [[REG13]], {{.*}} offset:24 -; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 
0x41400000 -; GCN: buffer_store_dword [[REG12]], {{.*}} offset:20 -; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000 -; GCN: buffer_store_dword [[REG11]], {{.*}} offset:16 -; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000 -; GCN: buffer_store_dword [[REG10]], {{.*}} offset:12 -; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 0x41100000 -; GCN: buffer_store_dword [[REG9]], {{.*}} offset:8 ; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 0x41000000 ; GCN: buffer_store_dword [[REG8]], {{.*}} offset:4 +; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 0x41100000 +; GCN: buffer_store_dword [[REG9]], {{.*}} offset:8 +; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000 +; GCN: buffer_store_dword [[REG10]], {{.*}} offset:12 +; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000 +; GCN: buffer_store_dword [[REG11]], {{.*}} offset:16 +; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000 +; GCN: buffer_store_dword [[REG12]], {{.*}} offset:20 +; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000 +; GCN: buffer_store_dword [[REG13]], {{.*}} offset:24 +; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000 +; GCN: buffer_store_dword [[REG14]], {{.*}} offset:28 +; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000 +; GCN: buffer_store_dword [[REG15]], {{.*}} offset:32 + ; GCN: v_mov_b32_e32 v31, 0x40e00000 ; GCN: s_getpc define void @stack_8xv5f32() #0 {