Index: llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -829,9 +829,12 @@ if (IncomingArg) { LI->loadInputValue(InputReg, MIRBuilder, IncomingArg, ArgRC, ArgTy); - } else { - assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); + } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) { LI->getImplicitArgPtr(InputReg, MRI, MIRBuilder); + } else { + // We may have proven the input wasn't needed, although the ABI is + // requiring it. We just need to allocate the register appropriately. + MIRBuilder.buildUndef(InputReg); } if (OutgoingArg->isRegister()) { Index: llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -235,7 +235,7 @@ "amdgpu-fixed-function-abi", cl::desc("Enable all implicit function arguments"), cl::location(AMDGPUTargetMachine::EnableFixedFunctionABI), - cl::init(false), + cl::init(true), cl::Hidden); // Enable lib calls simplifications Index: llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -110,20 +110,7 @@ else if (ST.isMesaGfxShader(F)) ImplicitBufferPtr = true; - if (UseFixedABI) { - DispatchPtr = true; - QueuePtr = true; - ImplicitArgPtr = true; - WorkGroupIDX = true; - WorkGroupIDY = true; - WorkGroupIDZ = true; - WorkItemIDX = true; - WorkItemIDY = true; - WorkItemIDZ = true; - - // FIXME: We don't need this? 
- DispatchID = true; - } else if (!AMDGPU::isGraphics(CC)) { + if (!AMDGPU::isGraphics(CC)) { if (IsKernel || !F.hasFnAttribute("amdgpu-no-workgroup-id-x")) WorkGroupIDX = true; Index: llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -481,6 +481,7 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_ashrrev_i32_e32 v0, v16, v0 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; GCN-NEXT: v_ashrrev_i32_e32 v1, v17, v1 ; GCN-NEXT: v_ashrrev_i32_e32 v2, v18, v2 ; GCN-NEXT: v_ashrrev_i32_e32 v3, v19, v3 @@ -495,13 +496,15 @@ ; GCN-NEXT: v_ashrrev_i32_e32 v12, v28, v12 ; GCN-NEXT: v_ashrrev_i32_e32 v13, v29, v13 ; GCN-NEXT: v_ashrrev_i32_e32 v14, v30, v14 -; GCN-NEXT: v_ashrrev_i32_e32 v15, v31, v15 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_ashrrev_i32_e32 v15, v16, v15 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_ashr_v16i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX10-NEXT: v_ashrrev_i32_e32 v0, v16, v0 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, v17, v1 ; GFX10-NEXT: v_ashrrev_i32_e32 v2, v18, v2 @@ -517,6 +520,7 @@ ; GFX10-NEXT: v_ashrrev_i32_e32 v12, v28, v12 ; GFX10-NEXT: v_ashrrev_i32_e32 v13, v29, v13 ; GFX10-NEXT: v_ashrrev_i32_e32 v14, v30, v14 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ashrrev_i32_e32 v15, v31, v15 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = ashr <16 x i32> %value, %amount Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll @@ -499,50 +499,52 @@ ; GFX9-CONTRACT-LABEL: test_f64_add_mul: ; 
GFX9-CONTRACT: ; %bb.0: ; %.entry ; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-CONTRACT-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; GFX9-CONTRACT-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 -; GFX9-CONTRACT-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 -; GFX9-CONTRACT-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; GFX9-CONTRACT-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; GFX9-CONTRACT-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 -; GFX9-CONTRACT-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 -; GFX9-CONTRACT-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 -; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(6) -; GFX9-CONTRACT-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33] -; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(4) -; GFX9-CONTRACT-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35] -; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(2) -; GFX9-CONTRACT-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37] -; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX9-CONTRACT-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39] +; GFX9-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-CONTRACT-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 +; GFX9-CONTRACT-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 +; GFX9-CONTRACT-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 +; GFX9-CONTRACT-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 +; GFX9-CONTRACT-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 +; GFX9-CONTRACT-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:28 +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(5) +; GFX9-CONTRACT-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[31:32] +; GFX9-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-CONTRACT-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(5) +; GFX9-CONTRACT-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[33:34] +; 
GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(3) +; GFX9-CONTRACT-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[35:36] ; GFX9-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17] ; GFX9-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19] ; GFX9-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21] +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX9-CONTRACT-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[37:38] ; GFX9-CONTRACT-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23] ; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-DENORM-LABEL: test_f64_add_mul: ; GFX9-DENORM: ; %bb.0: ; %.entry ; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-DENORM-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; GFX9-DENORM-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 -; GFX9-DENORM-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 -; GFX9-DENORM-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; GFX9-DENORM-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; GFX9-DENORM-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 -; GFX9-DENORM-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 -; GFX9-DENORM-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 -; GFX9-DENORM-NEXT: s_waitcnt vmcnt(6) -; GFX9-DENORM-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33] -; GFX9-DENORM-NEXT: s_waitcnt vmcnt(4) -; GFX9-DENORM-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35] -; GFX9-DENORM-NEXT: s_waitcnt vmcnt(2) -; GFX9-DENORM-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37] -; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) -; GFX9-DENORM-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39] +; GFX9-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-DENORM-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 +; GFX9-DENORM-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 +; GFX9-DENORM-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 +; GFX9-DENORM-NEXT: buffer_load_dword v35, off, 
s[0:3], s32 offset:20 +; GFX9-DENORM-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 +; GFX9-DENORM-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:28 +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(5) +; GFX9-DENORM-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[31:32] +; GFX9-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-DENORM-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(5) +; GFX9-DENORM-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[33:34] +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(3) +; GFX9-DENORM-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[35:36] ; GFX9-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17] ; GFX9-DENORM-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19] ; GFX9-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21] +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[37:38] ; GFX9-DENORM-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23] ; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; @@ -550,15 +552,16 @@ ; GFX10-CONTRACT: ; %bb.0: ; %.entry ; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CONTRACT-NEXT: s_clause 0x7 -; GFX10-CONTRACT-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; GFX10-CONTRACT-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 -; GFX10-CONTRACT-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 -; GFX10-CONTRACT-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; GFX10-CONTRACT-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; GFX10-CONTRACT-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 -; GFX10-CONTRACT-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 -; GFX10-CONTRACT-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; GFX10-CONTRACT-NEXT: s_clause 0x8 +; GFX10-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX10-CONTRACT-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 
+; GFX10-CONTRACT-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX10-CONTRACT-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; GFX10-CONTRACT-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 +; GFX10-CONTRACT-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 +; GFX10-CONTRACT-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; GFX10-CONTRACT-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28 +; GFX10-CONTRACT-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 ; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(6) ; GFX10-CONTRACT-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33] ; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(4) @@ -577,15 +580,16 @@ ; GFX10-DENORM: ; %bb.0: ; %.entry ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-DENORM-NEXT: s_clause 0x7 -; GFX10-DENORM-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; GFX10-DENORM-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 -; GFX10-DENORM-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 -; GFX10-DENORM-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; GFX10-DENORM-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; GFX10-DENORM-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 -; GFX10-DENORM-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 -; GFX10-DENORM-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; GFX10-DENORM-NEXT: s_clause 0x8 +; GFX10-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX10-DENORM-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX10-DENORM-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX10-DENORM-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; GFX10-DENORM-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 +; GFX10-DENORM-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 +; GFX10-DENORM-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; GFX10-DENORM-NEXT: 
buffer_load_dword v38, off, s[0:3], s32 offset:28 +; GFX10-DENORM-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(6) ; GFX10-DENORM-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33] ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(4) @@ -610,50 +614,52 @@ ; GFX9-CONTRACT-LABEL: test_f64_add_mul_rhs: ; GFX9-CONTRACT: ; %bb.0: ; %.entry ; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-CONTRACT-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; GFX9-CONTRACT-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 -; GFX9-CONTRACT-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 -; GFX9-CONTRACT-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; GFX9-CONTRACT-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; GFX9-CONTRACT-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 -; GFX9-CONTRACT-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 -; GFX9-CONTRACT-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 -; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(6) -; GFX9-CONTRACT-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33] -; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(4) -; GFX9-CONTRACT-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35] -; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(2) -; GFX9-CONTRACT-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37] -; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX9-CONTRACT-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[38:39] +; GFX9-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-CONTRACT-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 +; GFX9-CONTRACT-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 +; GFX9-CONTRACT-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 +; GFX9-CONTRACT-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 +; GFX9-CONTRACT-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 +; GFX9-CONTRACT-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:28 +; GFX9-CONTRACT-NEXT: 
s_waitcnt vmcnt(5) +; GFX9-CONTRACT-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[31:32] +; GFX9-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-CONTRACT-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(5) +; GFX9-CONTRACT-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[33:34] +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(3) +; GFX9-CONTRACT-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[35:36] ; GFX9-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17] ; GFX9-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19] ; GFX9-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21] +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX9-CONTRACT-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[37:38] ; GFX9-CONTRACT-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23] ; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-DENORM-LABEL: test_f64_add_mul_rhs: ; GFX9-DENORM: ; %bb.0: ; %.entry ; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-DENORM-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; GFX9-DENORM-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 -; GFX9-DENORM-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 -; GFX9-DENORM-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; GFX9-DENORM-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; GFX9-DENORM-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 -; GFX9-DENORM-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 -; GFX9-DENORM-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 -; GFX9-DENORM-NEXT: s_waitcnt vmcnt(6) -; GFX9-DENORM-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33] -; GFX9-DENORM-NEXT: s_waitcnt vmcnt(4) -; GFX9-DENORM-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[34:35] -; GFX9-DENORM-NEXT: s_waitcnt vmcnt(2) -; GFX9-DENORM-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[36:37] -; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) -; GFX9-DENORM-NEXT: v_fma_f64 v[22:23], v[22:23], 
v[30:31], v[38:39] +; GFX9-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-DENORM-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 +; GFX9-DENORM-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 +; GFX9-DENORM-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 +; GFX9-DENORM-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 +; GFX9-DENORM-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 +; GFX9-DENORM-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:28 +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(5) +; GFX9-DENORM-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[31:32] +; GFX9-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-DENORM-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(5) +; GFX9-DENORM-NEXT: v_fma_f64 v[18:19], v[18:19], v[26:27], v[33:34] +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(3) +; GFX9-DENORM-NEXT: v_fma_f64 v[20:21], v[20:21], v[28:29], v[35:36] ; GFX9-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17] ; GFX9-DENORM-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19] ; GFX9-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21] +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: v_fma_f64 v[22:23], v[22:23], v[30:31], v[37:38] ; GFX9-DENORM-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23] ; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; @@ -661,15 +667,16 @@ ; GFX10-CONTRACT: ; %bb.0: ; %.entry ; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CONTRACT-NEXT: s_clause 0x7 -; GFX10-CONTRACT-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; GFX10-CONTRACT-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 -; GFX10-CONTRACT-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 -; GFX10-CONTRACT-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; GFX10-CONTRACT-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; GFX10-CONTRACT-NEXT: buffer_load_dword v37, 
off, s[0:3], s32 offset:20 -; GFX10-CONTRACT-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 -; GFX10-CONTRACT-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; GFX10-CONTRACT-NEXT: s_clause 0x8 +; GFX10-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX10-CONTRACT-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX10-CONTRACT-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX10-CONTRACT-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; GFX10-CONTRACT-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 +; GFX10-CONTRACT-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 +; GFX10-CONTRACT-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; GFX10-CONTRACT-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28 +; GFX10-CONTRACT-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 ; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(6) ; GFX10-CONTRACT-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33] ; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(4) @@ -688,15 +695,16 @@ ; GFX10-DENORM: ; %bb.0: ; %.entry ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-DENORM-NEXT: s_clause 0x7 -; GFX10-DENORM-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; GFX10-DENORM-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 -; GFX10-DENORM-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 -; GFX10-DENORM-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; GFX10-DENORM-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; GFX10-DENORM-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 -; GFX10-DENORM-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 -; GFX10-DENORM-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; GFX10-DENORM-NEXT: s_clause 0x8 +; GFX10-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX10-DENORM-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX10-DENORM-NEXT: buffer_load_dword v33, 
off, s[0:3], s32 offset:8 +; GFX10-DENORM-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; GFX10-DENORM-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 +; GFX10-DENORM-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 +; GFX10-DENORM-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; GFX10-DENORM-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28 +; GFX10-DENORM-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(6) ; GFX10-DENORM-NEXT: v_fma_f64 v[16:17], v[16:17], v[24:25], v[32:33] ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(4) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll @@ -33,8 +33,8 @@ ; CHECK-NEXT: bb.2.atomicrmw.start: ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s64) = G_PHI %9(s64), %bb.2, [[C1]](s64), %bb.1 - ; CHECK-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[LOAD]](s32), %bb.1, %7(s32), %bb.2 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s64) = G_PHI %16(s64), %bb.2, [[C1]](s64), %bb.1 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[LOAD]](s32), %bb.1, %14(s32), %bb.2 ; CHECK-NEXT: [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[PHI1]], [[C]] ; CHECK-NEXT: [[ATOMIC_CMPXCHG_WITH_SUCCESS:%[0-9]+]]:_(s32), [[ATOMIC_CMPXCHG_WITH_SUCCESS1:%[0-9]+]]:_(s1) = G_ATOMIC_CMPXCHG_WITH_SUCCESS [[COPY]](p3), [[PHI1]], [[FSUB]] :: (load store seq_cst seq_cst (s32) on %ir.2, addrspace 3) ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[ATOMIC_CMPXCHG_WITH_SUCCESS1]](s1), [[PHI]](s64) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constrained-fp.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constrained-fp.ll 
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constrained-fp.ll @@ -41,8 +41,8 @@ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK-NEXT: %3:_(s32) = nofpexcept G_STRICT_FADD [[COPY]], [[COPY1]] - ; CHECK-NEXT: $vgpr0 = COPY %3(s32) + ; CHECK-NEXT: %10:_(s32) = nofpexcept G_STRICT_FADD [[COPY]], [[COPY1]] + ; CHECK-NEXT: $vgpr0 = COPY %10(s32) ; CHECK-NEXT: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]] ; CHECK-NEXT: S_SETPC_B64_return [[COPY3]], implicit $vgpr0 %val = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") @@ -57,8 +57,8 @@ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK-NEXT: %3:_(s32) = nsz nofpexcept G_STRICT_FADD [[COPY]], [[COPY1]] - ; CHECK-NEXT: $vgpr0 = COPY %3(s32) + ; CHECK-NEXT: %10:_(s32) = nsz nofpexcept G_STRICT_FADD [[COPY]], [[COPY1]] + ; CHECK-NEXT: $vgpr0 = COPY %10(s32) ; CHECK-NEXT: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]] ; CHECK-NEXT: S_SETPC_B64_return [[COPY3]], implicit $vgpr0 %val = call nsz float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") @@ -115,8 +115,8 @@ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK-NEXT: %7:_(<2 x s32>) = nofpexcept G_STRICT_FADD [[BUILD_VECTOR]], [[BUILD_VECTOR1]] - ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES %7(<2 x s32>) + ; CHECK-NEXT: %14:_(<2 x s32>) = nofpexcept G_STRICT_FADD [[BUILD_VECTOR]], [[BUILD_VECTOR1]] + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) 
= G_UNMERGE_VALUES %14(<2 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) ; CHECK-NEXT: [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]] @@ -155,8 +155,8 @@ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK-NEXT: %3:_(s32) = nsz nofpexcept G_STRICT_FSUB [[COPY]], [[COPY1]] - ; CHECK-NEXT: $vgpr0 = COPY %3(s32) + ; CHECK-NEXT: %10:_(s32) = nsz nofpexcept G_STRICT_FSUB [[COPY]], [[COPY1]] + ; CHECK-NEXT: $vgpr0 = COPY %10(s32) ; CHECK-NEXT: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]] ; CHECK-NEXT: S_SETPC_B64_return [[COPY3]], implicit $vgpr0 %val = call nsz float @llvm.experimental.constrained.fsub.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") @@ -171,8 +171,8 @@ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK-NEXT: %3:_(s32) = nsz nofpexcept G_STRICT_FMUL [[COPY]], [[COPY1]] - ; CHECK-NEXT: $vgpr0 = COPY %3(s32) + ; CHECK-NEXT: %10:_(s32) = nsz nofpexcept G_STRICT_FMUL [[COPY]], [[COPY1]] + ; CHECK-NEXT: $vgpr0 = COPY %10(s32) ; CHECK-NEXT: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]] ; CHECK-NEXT: S_SETPC_B64_return [[COPY3]], implicit $vgpr0 %val = call nsz float @llvm.experimental.constrained.fmul.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") @@ -187,8 +187,8 @@ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK-NEXT: %3:_(s32) = nsz nofpexcept G_STRICT_FDIV [[COPY]], [[COPY1]] - ; CHECK-NEXT: $vgpr0 = COPY %3(s32) + ; CHECK-NEXT: %10:_(s32) = nsz nofpexcept G_STRICT_FDIV [[COPY]], [[COPY1]] + ; CHECK-NEXT: $vgpr0 = COPY %10(s32) ; CHECK-NEXT: [[COPY3:%[0-9]+]]:ccr_sgpr_64 
= COPY [[COPY2]] ; CHECK-NEXT: S_SETPC_B64_return [[COPY3]], implicit $vgpr0 %val = call nsz float @llvm.experimental.constrained.fdiv.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") @@ -203,8 +203,8 @@ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK-NEXT: %3:_(s32) = nsz nofpexcept G_STRICT_FREM [[COPY]], [[COPY1]] - ; CHECK-NEXT: $vgpr0 = COPY %3(s32) + ; CHECK-NEXT: %10:_(s32) = nsz nofpexcept G_STRICT_FREM [[COPY]], [[COPY1]] + ; CHECK-NEXT: $vgpr0 = COPY %10(s32) ; CHECK-NEXT: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]] ; CHECK-NEXT: S_SETPC_B64_return [[COPY3]], implicit $vgpr0 %val = call nsz float @llvm.experimental.constrained.frem.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") @@ -220,8 +220,8 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK-NEXT: %4:_(s32) = nsz nofpexcept G_STRICT_FMA [[COPY]], [[COPY1]], [[COPY2]] - ; CHECK-NEXT: $vgpr0 = COPY %4(s32) + ; CHECK-NEXT: %11:_(s32) = nsz nofpexcept G_STRICT_FMA [[COPY]], [[COPY1]], [[COPY2]] + ; CHECK-NEXT: $vgpr0 = COPY %11(s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]] ; CHECK-NEXT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0 %val = call nsz float @llvm.experimental.constrained.fma.f32(float %x, float %y, float %z, metadata !"round.tonearest", metadata !"fpexcept.ignore") Index: llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll @@ -108,6 +108,7 @@ ; CHECK: bb.2.bb1: ; CHECK: successors: %bb.3(0x80000000) ; CHECK: G_STORE [[C1]](s32), 
[[DEF]](p1) :: (volatile store (s32) into `i32 addrspace(1)* undef`, addrspace 1) + ; CHECK: G_BR %bb.3 ; CHECK: bb.3.bb2: ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INT1]](s64) ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] @@ -817,7 +818,7 @@ define void @void_func_v32i32(<32 x i32> %arg0) #0 { ; CHECK-LABEL: name: void_func_v32i32 ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $sgpr30_sgpr31 + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 @@ -849,13 +850,14 @@ ; CHECK: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr28 ; CHECK: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr29 ; CHECK: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr30 - ; CHECK: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr31 - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32) - ; CHECK: 
[[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0 + ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[LOAD]](s32) + ; CHECK: [[COPY31:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK: G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (store (<32 x s32>) into `<32 x i32> addrspace(1)* undef`, addrspace 1) - ; CHECK: [[COPY33:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY32]] - ; CHECK: S_SETPC_B64_return [[COPY33]] + ; CHECK: [[COPY32:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY31]] + ; CHECK: S_SETPC_B64_return [[COPY32]] store <32 x i32> %arg0, <32 x i32> addrspace(1)* undef ret void } @@ -864,7 +866,7 @@ define void @void_func_v33i32(<33 x i32> %arg0) #0 { ; CHECK-LABEL: name: void_func_v33i32 ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $sgpr30_sgpr31 + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, 
$vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 @@ -896,15 +898,16 @@ ; CHECK: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr28 ; CHECK: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr29 ; CHECK: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr30 - ; CHECK: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr31 - ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0 - ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.0, align 16, addrspace 5) - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<33 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32), [[LOAD]](s32) - ; CHECK: [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1 + ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.1, align 16, addrspace 5) + ; CHECK: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0 + ; CHECK: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s32) from %fixed-stack.0, addrspace 5) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<33 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), 
[[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[LOAD]](s32), [[LOAD1]](s32) + ; CHECK: [[COPY31:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK: G_STORE [[BUILD_VECTOR]](<33 x s32>), [[DEF]](p1) :: (store (<33 x s32>) into `<33 x i32> addrspace(1)* undef`, align 256, addrspace 1) - ; CHECK: [[COPY33:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY32]] - ; CHECK: S_SETPC_B64_return [[COPY33]] + ; CHECK: [[COPY32:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY31]] + ; CHECK: S_SETPC_B64_return [[COPY32]] store <33 x i32> %arg0, <33 x i32> addrspace(1)* undef ret void } @@ -1088,7 +1091,7 @@ define void @void_func_v16i64(<16 x i64> %arg0) #0 { ; CHECK-LABEL: name: void_func_v16i64 ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $sgpr30_sgpr31 + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 @@ -1120,7 +1123,8 @@ ; CHECK: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr28 ; CHECK: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr29 ; CHECK: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr30 - ; CHECK: 
[[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr31 + ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0 + ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.0, align 16, addrspace 5) ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) ; CHECK: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) ; CHECK: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) @@ -1136,13 +1140,13 @@ ; CHECK: [[MV12:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY24]](s32), [[COPY25]](s32) ; CHECK: [[MV13:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY26]](s32), [[COPY27]](s32) ; CHECK: [[MV14:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY28]](s32), [[COPY29]](s32) - ; CHECK: [[MV15:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY30]](s32), [[COPY31]](s32) + ; CHECK: [[MV15:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY30]](s32), [[LOAD]](s32) ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<16 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64), [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64), [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64), [[MV12]](s64), [[MV13]](s64), [[MV14]](s64), [[MV15]](s64) - ; CHECK: [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[COPY31:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK: G_STORE [[BUILD_VECTOR]](<16 x s64>), [[DEF]](p1) :: (store (<16 x s64>) into `<16 x i64> addrspace(1)* undef`, addrspace 1) - ; CHECK: [[COPY33:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY32]] - ; CHECK: S_SETPC_B64_return [[COPY33]] + ; CHECK: [[COPY32:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY31]] + ; CHECK: S_SETPC_B64_return [[COPY32]] store <16 x i64> %arg0, <16 x i64> addrspace(1)* undef ret void } @@ -1259,7 +1263,7 @@ define void @void_func_v65i16(<65 x i16> %arg0) #0 { ; CHECK-LABEL: name: void_func_v65i16 ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, 
$vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $sgpr30_sgpr31 + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31 ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; CHECK: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 @@ -1291,17 +1295,18 @@ ; CHECK: [[COPY28:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr28 ; CHECK: [[COPY29:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr29 ; CHECK: [[COPY30:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr30 - ; CHECK: [[COPY31:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr31 - ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0 - ; CHECK: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (<2 x s16>) from %fixed-stack.0, align 16, addrspace 5) + ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1 + ; CHECK: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (<2 x s16>) from %fixed-stack.1, align 16, addrspace 5) + ; CHECK: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0 + ; CHECK: [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (<2 x s16>) from %fixed-stack.0, addrspace 5) ; CHECK: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<130 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[COPY6]](<2 x s16>), [[COPY7]](<2 x s16>), [[COPY8]](<2 x s16>), [[COPY9]](<2 x s16>), [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>), 
[[COPY12]](<2 x s16>), [[COPY13]](<2 x s16>), [[COPY14]](<2 x s16>), [[COPY15]](<2 x s16>), [[COPY16]](<2 x s16>), [[COPY17]](<2 x s16>), [[COPY18]](<2 x s16>), [[COPY19]](<2 x s16>), [[COPY20]](<2 x s16>), [[COPY21]](<2 x s16>), [[COPY22]](<2 x s16>), [[COPY23]](<2 x s16>), [[COPY24]](<2 x s16>), [[COPY25]](<2 x s16>), [[COPY26]](<2 x s16>), [[COPY27]](<2 x s16>), [[COPY28]](<2 x s16>), [[COPY29]](<2 x s16>), [[COPY30]](<2 x s16>), [[COPY31]](<2 x s16>), [[LOAD]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>) + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<130 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[COPY6]](<2 x s16>), [[COPY7]](<2 x s16>), [[COPY8]](<2 x s16>), [[COPY9]](<2 x s16>), [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>), [[COPY12]](<2 x s16>), [[COPY13]](<2 x s16>), [[COPY14]](<2 x s16>), [[COPY15]](<2 x s16>), [[COPY16]](<2 x s16>), [[COPY17]](<2 x s16>), [[COPY18]](<2 x s16>), [[COPY19]](<2 x s16>), [[COPY20]](<2 x s16>), [[COPY21]](<2 x s16>), [[COPY22]](<2 x s16>), [[COPY23]](<2 x s16>), [[COPY24]](<2 x s16>), [[COPY25]](<2 x s16>), [[COPY26]](<2 x s16>), [[COPY27]](<2 x s16>), [[COPY28]](<2 x s16>), [[COPY29]](<2 x s16>), [[COPY30]](<2 x s16>), [[LOAD]](<2 x s16>), [[LOAD1]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), 
[[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>) ; CHECK: [[UV:%[0-9]+]]:_(<65 x s16>), [[UV1:%[0-9]+]]:_(<65 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<130 x s16>) - ; CHECK: [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[COPY31:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 ; CHECK: [[DEF1:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK: G_STORE [[UV]](<65 x s16>), [[DEF1]](p1) :: (store (<65 x s16>) into `<65 x i16> addrspace(1)* undef`, align 256, addrspace 1) - ; CHECK: [[COPY33:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY32]] - ; CHECK: S_SETPC_B64_return [[COPY33]] + ; CHECK: [[COPY32:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY31]] + ; CHECK: S_SETPC_B64_return [[COPY32]] store <65 x i16> %arg0, <65 x i16> addrspace(1)* undef ret void } @@ -1519,7 +1524,7 @@ define void @void_func_v16f64(<16 x double> %arg0) #0 { ; CHECK-LABEL: name: void_func_v16f64 ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $sgpr30_sgpr31 + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, 
$vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 @@ -1551,7 +1556,8 @@ ; CHECK: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr28 ; CHECK: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr29 ; CHECK: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr30 - ; CHECK: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr31 + ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0 + ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.0, align 16, addrspace 5) ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) ; CHECK: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) ; CHECK: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) @@ -1567,13 +1573,13 @@ ; CHECK: [[MV12:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY24]](s32), [[COPY25]](s32) ; CHECK: [[MV13:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY26]](s32), [[COPY27]](s32) ; CHECK: [[MV14:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY28]](s32), [[COPY29]](s32) - ; CHECK: [[MV15:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY30]](s32), [[COPY31]](s32) + ; CHECK: [[MV15:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY30]](s32), [[LOAD]](s32) ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<16 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64), [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64), [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64), [[MV12]](s64), [[MV13]](s64), [[MV14]](s64), [[MV15]](s64) - ; CHECK: [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[COPY31:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK: G_STORE [[BUILD_VECTOR]](<16 x s64>), [[DEF]](p1) :: (store (<16 x s64>) into `<16 x double> addrspace(1)* undef`, addrspace 1) - ; CHECK: [[COPY33:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY32]] - ; CHECK: S_SETPC_B64_return [[COPY33]] + ; 
CHECK: [[COPY32:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY31]] + ; CHECK: S_SETPC_B64_return [[COPY32]] store <16 x double> %arg0, <16 x double> addrspace(1)* undef ret void } @@ -1873,7 +1879,7 @@ define void @void_func_v32i32_i32_byval_i8(<32 x i32> %arg0, i32 %arg1, i8 addrspace(5)* byval(i8) align 8 %arg2) #0 { ; CHECK-LABEL: name: void_func_v32i32_i32_byval_i8 ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $sgpr30_sgpr31 + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 @@ -1905,20 +1911,21 @@ ; CHECK: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr28 ; CHECK: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr29 ; CHECK: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr30 - ; CHECK: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr31 - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32) - ; CHECK: 
[[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1 - ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.1, align 16, addrspace 5) - ; CHECK: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0 - ; CHECK: [[COPY32:%[0-9]+]]:_(p5) = COPY [[FRAME_INDEX1]](p5) - ; CHECK: [[COPY33:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2 + ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.2, align 16, addrspace 5) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[LOAD]](s32) + ; CHECK: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1 + ; CHECK: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s32) from %fixed-stack.1, addrspace 5) + ; CHECK: [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0 + ; CHECK: [[COPY31:%[0-9]+]]:_(p5) = COPY [[FRAME_INDEX2]](p5) + ; CHECK: [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 ; CHECK: [[C:%[0-9]+]]:_(p1) = G_CONSTANT i64 0 - ; CHECK: [[COPY34:%[0-9]+]]:_(p1) = COPY [[C]](p1) - ; CHECK: G_STORE [[LOAD]](s32), [[C]](p1) :: (store (s32) into `i32 addrspace(1)* null`, addrspace 1) - ; CHECK: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[COPY32]](p5) :: (dereferenceable load (s8) from %ir.arg2, addrspace 5) - ; CHECK: G_STORE [[LOAD1]](s8), [[COPY34]](p1) :: (store (s8) into `i8 addrspace(1)* 
null`, addrspace 1) - ; CHECK: [[COPY35:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY33]] - ; CHECK: S_SETPC_B64_return [[COPY35]] + ; CHECK: [[COPY33:%[0-9]+]]:_(p1) = COPY [[C]](p1) + ; CHECK: G_STORE [[LOAD1]](s32), [[C]](p1) :: (store (s32) into `i32 addrspace(1)* null`, addrspace 1) + ; CHECK: [[LOAD2:%[0-9]+]]:_(s8) = G_LOAD [[COPY31]](p5) :: (dereferenceable load (s8) from %ir.arg2, addrspace 5) + ; CHECK: G_STORE [[LOAD2]](s8), [[COPY33]](p1) :: (store (s8) into `i8 addrspace(1)* null`, addrspace 1) + ; CHECK: [[COPY34:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY32]] + ; CHECK: S_SETPC_B64_return [[COPY34]] store i32 %arg1, i32 addrspace(1)* null %arg2.load = load i8, i8 addrspace(5)* %arg2 store i8 %arg2.load, i8 addrspace(1)* null @@ -1929,7 +1936,7 @@ define void @void_func_v32i32_byval_i8_i32(<32 x i32> %arg0, i8 addrspace(5)* byval(i8) %arg1, i32 %arg2) #0 { ; CHECK-LABEL: name: void_func_v32i32_byval_i8_i32 ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $sgpr30_sgpr31 + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 @@ -1961,20 +1968,21 @@ ; CHECK: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr28 ; CHECK: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr29 ; CHECK: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr30 - ; CHECK: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr31 - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR 
[[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32) - ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1 - ; CHECK: [[COPY32:%[0-9]+]]:_(p5) = COPY [[FRAME_INDEX]](p5) - ; CHECK: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0 - ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s32) from %fixed-stack.0, addrspace 5) - ; CHECK: [[COPY33:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2 + ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.2, align 16, addrspace 5) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[LOAD]](s32) + ; CHECK: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1 + ; CHECK: [[COPY31:%[0-9]+]]:_(p5) = COPY [[FRAME_INDEX1]](p5) + ; CHECK: [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0 + ; CHECK: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD 
[[FRAME_INDEX2]](p5) :: (invariant load (s32) from %fixed-stack.0, align 8, addrspace 5) + ; CHECK: [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 ; CHECK: [[C:%[0-9]+]]:_(p1) = G_CONSTANT i64 0 - ; CHECK: [[COPY34:%[0-9]+]]:_(p1) = COPY [[C]](p1) - ; CHECK: G_STORE [[LOAD]](s32), [[C]](p1) :: (store (s32) into `i32 addrspace(1)* null`, addrspace 1) - ; CHECK: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[COPY32]](p5) :: (dereferenceable load (s8) from %ir.arg1, addrspace 5) - ; CHECK: G_STORE [[LOAD1]](s8), [[COPY34]](p1) :: (store (s8) into `i8 addrspace(1)* null`, addrspace 1) - ; CHECK: [[COPY35:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY33]] - ; CHECK: S_SETPC_B64_return [[COPY35]] + ; CHECK: [[COPY33:%[0-9]+]]:_(p1) = COPY [[C]](p1) + ; CHECK: G_STORE [[LOAD1]](s32), [[C]](p1) :: (store (s32) into `i32 addrspace(1)* null`, addrspace 1) + ; CHECK: [[LOAD2:%[0-9]+]]:_(s8) = G_LOAD [[COPY31]](p5) :: (dereferenceable load (s8) from %ir.arg1, addrspace 5) + ; CHECK: G_STORE [[LOAD2]](s8), [[COPY33]](p1) :: (store (s8) into `i8 addrspace(1)* null`, addrspace 1) + ; CHECK: [[COPY34:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY32]] + ; CHECK: S_SETPC_B64_return [[COPY34]] store i32 %arg2, i32 addrspace(1)* null %arg1.load = load i8, i8 addrspace(5)* %arg1 store i8 %arg1.load, i8 addrspace(1)* null @@ -1984,7 +1992,7 @@ define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0 { ; CHECK-LABEL: name: void_func_v32i32_i32_i64 ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $sgpr30_sgpr31 + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, 
$vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 @@ -2016,24 +2024,25 @@ ; CHECK: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr28 ; CHECK: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr29 ; CHECK: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr30 - ; CHECK: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr31 - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32) - ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2 - ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.2, align 16, addrspace 5) - ; CHECK: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1 - ; CHECK: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s32) from %fixed-stack.1, addrspace 5) - ; CHECK: [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0 - ; CHECK: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load (s32) from %fixed-stack.0, align 8, addrspace 5) - ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD1]](s32), [[LOAD2]](s32) - ; CHECK: [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3 + ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.3, align 16, 
addrspace 5) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[LOAD]](s32) + ; CHECK: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2 + ; CHECK: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s32) from %fixed-stack.2, addrspace 5) + ; CHECK: [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1 + ; CHECK: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load (s32) from %fixed-stack.1, align 8, addrspace 5) + ; CHECK: [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0 + ; CHECK: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX3]](p5) :: (invariant load (s32) from %fixed-stack.0, addrspace 5) + ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD2]](s32), [[LOAD3]](s32) + ; CHECK: [[COPY31:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; CHECK: [[COPY32:%[0-9]+]]:_(p1) = COPY [[DEF]](p1) ; CHECK: [[COPY33:%[0-9]+]]:_(p1) = COPY [[DEF]](p1) - ; CHECK: [[COPY34:%[0-9]+]]:_(p1) = COPY [[DEF]](p1) ; CHECK: G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (volatile store (<32 x s32>) into `<32 x i32> addrspace(1)* undef`, addrspace 1) - ; CHECK: G_STORE [[LOAD]](s32), [[COPY33]](p1) :: (volatile store (s32) into `i32 addrspace(1)* undef`, addrspace 1) - ; CHECK: G_STORE [[MV]](s64), [[COPY34]](p1) :: (volatile store (s64) into `i64 addrspace(1)* undef`, addrspace 1) - ; CHECK: [[COPY35:%[0-9]+]]:ccr_sgpr_64 = COPY 
[[COPY32]] - ; CHECK: S_SETPC_B64_return [[COPY35]] + ; CHECK: G_STORE [[LOAD1]](s32), [[COPY32]](p1) :: (volatile store (s32) into `i32 addrspace(1)* undef`, addrspace 1) + ; CHECK: G_STORE [[MV]](s64), [[COPY33]](p1) :: (volatile store (s64) into `i64 addrspace(1)* undef`, addrspace 1) + ; CHECK: [[COPY34:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY31]] + ; CHECK: S_SETPC_B64_return [[COPY34]] store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef store volatile i32 %arg1, i32 addrspace(1)* undef store volatile i64 %arg2, i64 addrspace(1)* undef @@ -2044,7 +2053,7 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i16 %arg3, half %arg4) #0 { ; CHECK-LABEL: name: void_func_v32i32_i1_i8_i16 ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $sgpr30_sgpr31 + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 @@ -2076,31 +2085,32 @@ ; CHECK: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr28 ; CHECK: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr29 ; CHECK: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr30 - ; CHECK: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr31 - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), 
[[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32) - ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3 - ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s1) from %fixed-stack.3, align 16, addrspace 5) - ; CHECK: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[LOAD]](s32) - ; CHECK: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2 - ; CHECK: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s16) from %fixed-stack.2, align 4, addrspace 5) - ; CHECK: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD1]](s16) - ; CHECK: [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1 - ; CHECK: [[LOAD2:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load (s16) from %fixed-stack.1, align 8, addrspace 5) - ; CHECK: [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0 - ; CHECK: [[LOAD3:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX3]](p5) :: (invariant load (s16) from %fixed-stack.0, align 4, addrspace 5) - ; CHECK: [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.4 + ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.4, align 16, addrspace 5) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), 
[[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[LOAD]](s32) + ; CHECK: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3 + ; CHECK: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s1) from %fixed-stack.3, align 4, addrspace 5) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[LOAD1]](s32) + ; CHECK: [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2 + ; CHECK: [[LOAD2:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load (s16) from %fixed-stack.2, align 8, addrspace 5) + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[LOAD2]](s16) + ; CHECK: [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1 + ; CHECK: [[LOAD3:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX3]](p5) :: (invariant load (s16) from %fixed-stack.1, align 4, addrspace 5) + ; CHECK: [[FRAME_INDEX4:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0 + ; CHECK: [[LOAD4:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX4]](p5) :: (invariant load (s16) from %fixed-stack.0, align 16, addrspace 5) + ; CHECK: [[COPY31:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; CHECK: [[COPY32:%[0-9]+]]:_(p1) = COPY [[DEF]](p1) ; CHECK: [[COPY33:%[0-9]+]]:_(p1) = COPY [[DEF]](p1) ; CHECK: [[COPY34:%[0-9]+]]:_(p1) = COPY [[DEF]](p1) ; CHECK: [[COPY35:%[0-9]+]]:_(p1) = COPY [[DEF]](p1) - ; CHECK: [[COPY36:%[0-9]+]]:_(p1) = COPY [[DEF]](p1) ; CHECK: G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (volatile store (<32 x s32>) into `<32 x i32> addrspace(1)* undef`, addrspace 1) - ; CHECK: G_STORE [[TRUNC]](s1), [[COPY33]](p1) :: (volatile store (s1) into `i1 addrspace(1)* undef`, addrspace 1) - ; CHECK: G_STORE [[TRUNC1]](s8), [[COPY34]](p1) :: (volatile store (s8) into `i8 addrspace(1)* undef`, addrspace 1) - ; CHECK: G_STORE [[LOAD2]](s16), [[COPY35]](p1) :: (volatile store (s16) 
into `i16 addrspace(1)* undef`, addrspace 1) - ; CHECK: G_STORE [[LOAD3]](s16), [[COPY36]](p1) :: (volatile store (s16) into `half addrspace(1)* undef`, addrspace 1) - ; CHECK: [[COPY37:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY32]] - ; CHECK: S_SETPC_B64_return [[COPY37]] + ; CHECK: G_STORE [[TRUNC]](s1), [[COPY32]](p1) :: (volatile store (s1) into `i1 addrspace(1)* undef`, addrspace 1) + ; CHECK: G_STORE [[TRUNC1]](s8), [[COPY33]](p1) :: (volatile store (s8) into `i8 addrspace(1)* undef`, addrspace 1) + ; CHECK: G_STORE [[LOAD3]](s16), [[COPY34]](p1) :: (volatile store (s16) into `i16 addrspace(1)* undef`, addrspace 1) + ; CHECK: G_STORE [[LOAD4]](s16), [[COPY35]](p1) :: (volatile store (s16) into `half addrspace(1)* undef`, addrspace 1) + ; CHECK: [[COPY36:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY31]] + ; CHECK: S_SETPC_B64_return [[COPY36]] store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef store volatile i1 %arg1, i1 addrspace(1)* undef store volatile i8 %arg2, i8 addrspace(1)* undef @@ -2112,7 +2122,7 @@ define void @void_func_v32i32_p3_p5_i16(<32 x i32> %arg0, i8 addrspace(3)* %arg1, i8 addrspace(5)* %arg2) #0 { ; CHECK-LABEL: name: void_func_v32i32_p3_p5_i16 ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $sgpr30_sgpr31 + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 @@ -2144,21 +2154,22 @@ ; 
CHECK: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr28 ; CHECK: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr29 ; CHECK: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr30 - ; CHECK: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr31 - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32) - ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1 - ; CHECK: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (p3) from %fixed-stack.1, align 16, addrspace 5) - ; CHECK: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0 - ; CHECK: [[LOAD1:%[0-9]+]]:_(p5) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (p5) from %fixed-stack.0, addrspace 5) - ; CHECK: [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2 + ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.2, align 16, addrspace 5) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), 
[[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[LOAD]](s32) + ; CHECK: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1 + ; CHECK: [[LOAD1:%[0-9]+]]:_(p3) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (p3) from %fixed-stack.1, addrspace 5) + ; CHECK: [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0 + ; CHECK: [[LOAD2:%[0-9]+]]:_(p5) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load (p5) from %fixed-stack.0, align 8, addrspace 5) + ; CHECK: [[COPY31:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; CHECK: [[COPY32:%[0-9]+]]:_(p1) = COPY [[DEF]](p1) ; CHECK: [[COPY33:%[0-9]+]]:_(p1) = COPY [[DEF]](p1) - ; CHECK: [[COPY34:%[0-9]+]]:_(p1) = COPY [[DEF]](p1) ; CHECK: G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (volatile store (<32 x s32>) into `<32 x i32> addrspace(1)* undef`, addrspace 1) - ; CHECK: G_STORE [[LOAD]](p3), [[COPY33]](p1) :: (volatile store (p3) into `i8 addrspace(3)* addrspace(1)* undef`, addrspace 1) - ; CHECK: G_STORE [[LOAD1]](p5), [[COPY34]](p1) :: (volatile store (p5) into `i8 addrspace(5)* addrspace(1)* undef`, addrspace 1) - ; CHECK: [[COPY35:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY32]] - ; CHECK: S_SETPC_B64_return [[COPY35]] + ; CHECK: G_STORE [[LOAD1]](p3), [[COPY32]](p1) :: (volatile store (p3) into `i8 addrspace(3)* addrspace(1)* undef`, addrspace 1) + ; CHECK: G_STORE [[LOAD2]](p5), [[COPY33]](p1) :: (volatile store (p5) into `i8 addrspace(5)* addrspace(1)* undef`, addrspace 1) + ; CHECK: [[COPY34:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY31]] + ; CHECK: S_SETPC_B64_return [[COPY34]] store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef store volatile i8 addrspace(3)* %arg1, i8 addrspace(3)* addrspace(1)* undef store volatile i8 addrspace(5)* %arg2, i8 addrspace(5)* addrspace(1)* undef @@ -2168,7 +2179,7 @@ define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2 x float> %arg2) #0 { ; CHECK-LABEL: name: void_func_v32i32_v2i32_v2f32 ; 
CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $sgpr30_sgpr31 + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 @@ -2200,27 +2211,28 @@ ; CHECK: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr28 ; CHECK: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr29 ; CHECK: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr30 - ; CHECK: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr31 - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32) - ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3 - ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.3, align 16, addrspace 5) - ; CHECK: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2 - ; CHECK: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s32) from 
%fixed-stack.2, addrspace 5) - ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) - ; CHECK: [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1 - ; CHECK: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load (s32) from %fixed-stack.1, align 8, addrspace 5) - ; CHECK: [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0 - ; CHECK: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX3]](p5) :: (invariant load (s32) from %fixed-stack.0, addrspace 5) - ; CHECK: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD2]](s32), [[LOAD3]](s32) - ; CHECK: [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.4 + ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.4, align 16, addrspace 5) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[LOAD]](s32) + ; CHECK: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3 + ; CHECK: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s32) from %fixed-stack.3, addrspace 5) + ; CHECK: [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2 + ; CHECK: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load (s32) from %fixed-stack.2, align 8, addrspace 5) + ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = 
G_BUILD_VECTOR [[LOAD1]](s32), [[LOAD2]](s32) + ; CHECK: [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1 + ; CHECK: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX3]](p5) :: (invariant load (s32) from %fixed-stack.1, addrspace 5) + ; CHECK: [[FRAME_INDEX4:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0 + ; CHECK: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX4]](p5) :: (invariant load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; CHECK: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD3]](s32), [[LOAD4]](s32) + ; CHECK: [[COPY31:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; CHECK: [[COPY32:%[0-9]+]]:_(p1) = COPY [[DEF]](p1) ; CHECK: [[COPY33:%[0-9]+]]:_(p1) = COPY [[DEF]](p1) - ; CHECK: [[COPY34:%[0-9]+]]:_(p1) = COPY [[DEF]](p1) ; CHECK: G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (volatile store (<32 x s32>) into `<32 x i32> addrspace(1)* undef`, addrspace 1) - ; CHECK: G_STORE [[BUILD_VECTOR1]](<2 x s32>), [[COPY33]](p1) :: (volatile store (<2 x s32>) into `<2 x i32> addrspace(1)* undef`, addrspace 1) - ; CHECK: G_STORE [[BUILD_VECTOR2]](<2 x s32>), [[COPY34]](p1) :: (volatile store (<2 x s32>) into `<2 x float> addrspace(1)* undef`, addrspace 1) - ; CHECK: [[COPY35:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY32]] - ; CHECK: S_SETPC_B64_return [[COPY35]] + ; CHECK: G_STORE [[BUILD_VECTOR1]](<2 x s32>), [[COPY32]](p1) :: (volatile store (<2 x s32>) into `<2 x i32> addrspace(1)* undef`, addrspace 1) + ; CHECK: G_STORE [[BUILD_VECTOR2]](<2 x s32>), [[COPY33]](p1) :: (volatile store (<2 x s32>) into `<2 x float> addrspace(1)* undef`, addrspace 1) + ; CHECK: [[COPY34:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY31]] + ; CHECK: S_SETPC_B64_return [[COPY34]] store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef store volatile <2 x i32> %arg1, <2 x i32> addrspace(1)* undef store volatile <2 x float> %arg2, <2 x float> addrspace(1)* undef @@ -2230,7 +2242,7 @@ define void 
@void_func_v32i32_v2i16_v2f16(<32 x i32> %arg0, <2 x i16> %arg1, <2 x half> %arg2) #0 { ; CHECK-LABEL: name: void_func_v32i32_v2i16_v2f16 ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $sgpr30_sgpr31 + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 @@ -2262,21 +2274,22 @@ ; CHECK: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr28 ; CHECK: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr29 ; CHECK: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr30 - ; CHECK: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr31 - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32) - ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1 - ; CHECK: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (<2 x s16>) from %fixed-stack.1, align 16, addrspace 5) - ; CHECK: 
[[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0 - ; CHECK: [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (<2 x s16>) from %fixed-stack.0, addrspace 5) - ; CHECK: [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2 + ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.2, align 16, addrspace 5) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[LOAD]](s32) + ; CHECK: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1 + ; CHECK: [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (<2 x s16>) from %fixed-stack.1, addrspace 5) + ; CHECK: [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0 + ; CHECK: [[LOAD2:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load (<2 x s16>) from %fixed-stack.0, align 8, addrspace 5) + ; CHECK: [[COPY31:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; CHECK: [[COPY32:%[0-9]+]]:_(p1) = COPY [[DEF]](p1) ; CHECK: [[COPY33:%[0-9]+]]:_(p1) = COPY [[DEF]](p1) - ; CHECK: [[COPY34:%[0-9]+]]:_(p1) = COPY [[DEF]](p1) ; CHECK: G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (volatile store (<32 x s32>) into `<32 x i32> addrspace(1)* undef`, addrspace 1) - ; CHECK: G_STORE [[LOAD]](<2 x s16>), [[COPY33]](p1) :: (volatile store (<2 x s16>) into `<2 
x i16> addrspace(1)* undef`, addrspace 1) - ; CHECK: G_STORE [[LOAD1]](<2 x s16>), [[COPY34]](p1) :: (volatile store (<2 x s16>) into `<2 x half> addrspace(1)* undef`, addrspace 1) - ; CHECK: [[COPY35:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY32]] - ; CHECK: S_SETPC_B64_return [[COPY35]] + ; CHECK: G_STORE [[LOAD1]](<2 x s16>), [[COPY32]](p1) :: (volatile store (<2 x s16>) into `<2 x i16> addrspace(1)* undef`, addrspace 1) + ; CHECK: G_STORE [[LOAD2]](<2 x s16>), [[COPY33]](p1) :: (volatile store (<2 x s16>) into `<2 x half> addrspace(1)* undef`, addrspace 1) + ; CHECK: [[COPY34:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY31]] + ; CHECK: S_SETPC_B64_return [[COPY34]] store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef store volatile <2 x i16> %arg1, <2 x i16> addrspace(1)* undef store volatile <2 x half> %arg2, <2 x half> addrspace(1)* undef @@ -2286,7 +2299,7 @@ define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2 x double> %arg2) #0 { ; CHECK-LABEL: name: void_func_v32i32_v2i64_v2f64 ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $sgpr30_sgpr31 + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 @@ -2318,39 +2331,40 @@ ; CHECK: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr28 ; CHECK: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr29 ; CHECK: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr30 - ; CHECK: 
[[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr31 - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32) - ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.7 - ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.7, align 16, addrspace 5) - ; CHECK: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.6 - ; CHECK: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s32) from %fixed-stack.6, addrspace 5) - ; CHECK: [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.5 - ; CHECK: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load (s32) from %fixed-stack.5, align 8, addrspace 5) - ; CHECK: [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.4 - ; CHECK: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX3]](p5) :: (invariant load (s32) from %fixed-stack.4, addrspace 5) - ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) - ; CHECK: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD2]](s32), [[LOAD3]](s32) + ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.8 + ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.8, align 16, addrspace 5) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), 
[[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[LOAD]](s32) + ; CHECK: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.7 + ; CHECK: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s32) from %fixed-stack.7, addrspace 5) + ; CHECK: [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.6 + ; CHECK: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load (s32) from %fixed-stack.6, align 8, addrspace 5) + ; CHECK: [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.5 + ; CHECK: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX3]](p5) :: (invariant load (s32) from %fixed-stack.5, addrspace 5) + ; CHECK: [[FRAME_INDEX4:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.4 + ; CHECK: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX4]](p5) :: (invariant load (s32) from %fixed-stack.4, align 16, addrspace 5) + ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD1]](s32), [[LOAD2]](s32) + ; CHECK: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD3]](s32), [[LOAD4]](s32) ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) - ; CHECK: [[FRAME_INDEX4:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3 - ; CHECK: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX4]](p5) :: (invariant load (s32) from %fixed-stack.3, align 16, addrspace 5) - ; CHECK: [[FRAME_INDEX5:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2 - ; CHECK: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX5]](p5) :: (invariant load (s32) from %fixed-stack.2, addrspace 5) - ; CHECK: [[FRAME_INDEX6:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1 - ; CHECK: [[LOAD6:%[0-9]+]]:_(s32) 
= G_LOAD [[FRAME_INDEX6]](p5) :: (invariant load (s32) from %fixed-stack.1, align 8, addrspace 5) - ; CHECK: [[FRAME_INDEX7:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0 - ; CHECK: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX7]](p5) :: (invariant load (s32) from %fixed-stack.0, addrspace 5) - ; CHECK: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD4]](s32), [[LOAD5]](s32) - ; CHECK: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD6]](s32), [[LOAD7]](s32) + ; CHECK: [[FRAME_INDEX5:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3 + ; CHECK: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX5]](p5) :: (invariant load (s32) from %fixed-stack.3, addrspace 5) + ; CHECK: [[FRAME_INDEX6:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2 + ; CHECK: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX6]](p5) :: (invariant load (s32) from %fixed-stack.2, align 8, addrspace 5) + ; CHECK: [[FRAME_INDEX7:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1 + ; CHECK: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX7]](p5) :: (invariant load (s32) from %fixed-stack.1, addrspace 5) + ; CHECK: [[FRAME_INDEX8:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0 + ; CHECK: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX8]](p5) :: (invariant load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; CHECK: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD5]](s32), [[LOAD6]](s32) + ; CHECK: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD7]](s32), [[LOAD8]](s32) ; CHECK: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV2]](s64), [[MV3]](s64) - ; CHECK: [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[COPY31:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; CHECK: [[COPY32:%[0-9]+]]:_(p1) = COPY [[DEF]](p1) ; CHECK: [[COPY33:%[0-9]+]]:_(p1) = COPY [[DEF]](p1) - ; CHECK: [[COPY34:%[0-9]+]]:_(p1) = COPY [[DEF]](p1) ; CHECK: G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (volatile store (<32 x s32>) into `<32 x i32> addrspace(1)* undef`, addrspace 
1) - ; CHECK: G_STORE [[BUILD_VECTOR1]](<2 x s64>), [[COPY33]](p1) :: (volatile store (<2 x s64>) into `<2 x i64> addrspace(1)* undef`, addrspace 1) - ; CHECK: G_STORE [[BUILD_VECTOR2]](<2 x s64>), [[COPY34]](p1) :: (volatile store (<2 x s64>) into `<2 x double> addrspace(1)* undef`, addrspace 1) - ; CHECK: [[COPY35:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY32]] - ; CHECK: S_SETPC_B64_return [[COPY35]] + ; CHECK: G_STORE [[BUILD_VECTOR1]](<2 x s64>), [[COPY32]](p1) :: (volatile store (<2 x s64>) into `<2 x i64> addrspace(1)* undef`, addrspace 1) + ; CHECK: G_STORE [[BUILD_VECTOR2]](<2 x s64>), [[COPY33]](p1) :: (volatile store (<2 x s64>) into `<2 x double> addrspace(1)* undef`, addrspace 1) + ; CHECK: [[COPY34:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY31]] + ; CHECK: S_SETPC_B64_return [[COPY34]] store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef store volatile <2 x i64> %arg1, <2 x i64> addrspace(1)* undef store volatile <2 x double> %arg2, <2 x double> addrspace(1)* undef @@ -2360,7 +2374,7 @@ define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) #0 { ; CHECK-LABEL: name: void_func_v32i32_v4i32_v4f32 ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $sgpr30_sgpr31 + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 @@ -2392,35 +2406,36 @@ ; CHECK: 
[[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr28 ; CHECK: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr29 ; CHECK: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr30 - ; CHECK: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr31 - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32) - ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.7 - ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.7, align 16, addrspace 5) - ; CHECK: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.6 - ; CHECK: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s32) from %fixed-stack.6, addrspace 5) - ; CHECK: [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.5 - ; CHECK: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load (s32) from %fixed-stack.5, align 8, addrspace 5) - ; CHECK: [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.4 - ; CHECK: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX3]](p5) :: (invariant load (s32) from %fixed-stack.4, addrspace 5) - ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) - ; CHECK: [[FRAME_INDEX4:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3 - ; CHECK: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX4]](p5) :: (invariant load (s32) from %fixed-stack.3, align 16, addrspace 5) - ; CHECK: [[FRAME_INDEX5:%[0-9]+]]:_(p5) = G_FRAME_INDEX 
%fixed-stack.2 - ; CHECK: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX5]](p5) :: (invariant load (s32) from %fixed-stack.2, addrspace 5) - ; CHECK: [[FRAME_INDEX6:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1 - ; CHECK: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX6]](p5) :: (invariant load (s32) from %fixed-stack.1, align 8, addrspace 5) - ; CHECK: [[FRAME_INDEX7:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0 - ; CHECK: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX7]](p5) :: (invariant load (s32) from %fixed-stack.0, addrspace 5) - ; CHECK: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD4]](s32), [[LOAD5]](s32), [[LOAD6]](s32), [[LOAD7]](s32) - ; CHECK: [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.8 + ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.8, align 16, addrspace 5) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[LOAD]](s32) + ; CHECK: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.7 + ; CHECK: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s32) from %fixed-stack.7, addrspace 5) + ; CHECK: [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.6 + ; CHECK: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load (s32) from %fixed-stack.6, align 8, addrspace 5) + ; 
CHECK: [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.5 + ; CHECK: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX3]](p5) :: (invariant load (s32) from %fixed-stack.5, addrspace 5) + ; CHECK: [[FRAME_INDEX4:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.4 + ; CHECK: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX4]](p5) :: (invariant load (s32) from %fixed-stack.4, align 16, addrspace 5) + ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32), [[LOAD4]](s32) + ; CHECK: [[FRAME_INDEX5:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3 + ; CHECK: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX5]](p5) :: (invariant load (s32) from %fixed-stack.3, addrspace 5) + ; CHECK: [[FRAME_INDEX6:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2 + ; CHECK: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX6]](p5) :: (invariant load (s32) from %fixed-stack.2, align 8, addrspace 5) + ; CHECK: [[FRAME_INDEX7:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1 + ; CHECK: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX7]](p5) :: (invariant load (s32) from %fixed-stack.1, addrspace 5) + ; CHECK: [[FRAME_INDEX8:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0 + ; CHECK: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX8]](p5) :: (invariant load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; CHECK: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD5]](s32), [[LOAD6]](s32), [[LOAD7]](s32), [[LOAD8]](s32) + ; CHECK: [[COPY31:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; CHECK: [[COPY32:%[0-9]+]]:_(p1) = COPY [[DEF]](p1) ; CHECK: [[COPY33:%[0-9]+]]:_(p1) = COPY [[DEF]](p1) - ; CHECK: [[COPY34:%[0-9]+]]:_(p1) = COPY [[DEF]](p1) ; CHECK: G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (volatile store (<32 x s32>) into `<32 x i32> addrspace(1)* undef`, addrspace 1) - ; CHECK: G_STORE [[BUILD_VECTOR1]](<4 x s32>), [[COPY33]](p1) :: (volatile store (<4 x s32>) into `<4 x i32> 
addrspace(1)* undef`, addrspace 1) - ; CHECK: G_STORE [[BUILD_VECTOR2]](<4 x s32>), [[COPY34]](p1) :: (volatile store (<4 x s32>) into `<4 x float> addrspace(1)* undef`, addrspace 1) - ; CHECK: [[COPY35:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY32]] - ; CHECK: S_SETPC_B64_return [[COPY35]] + ; CHECK: G_STORE [[BUILD_VECTOR1]](<4 x s32>), [[COPY32]](p1) :: (volatile store (<4 x s32>) into `<4 x i32> addrspace(1)* undef`, addrspace 1) + ; CHECK: G_STORE [[BUILD_VECTOR2]](<4 x s32>), [[COPY33]](p1) :: (volatile store (<4 x s32>) into `<4 x float> addrspace(1)* undef`, addrspace 1) + ; CHECK: [[COPY34:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY31]] + ; CHECK: S_SETPC_B64_return [[COPY34]] store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef store volatile <4 x i32> %arg1, <4 x i32> addrspace(1)* undef store volatile <4 x float> %arg2, <4 x float> addrspace(1)* undef @@ -2430,7 +2445,7 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 x float> %arg2) #0 { ; CHECK-LABEL: name: void_func_v32i32_v8i32_v8f32 ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $sgpr30_sgpr31 + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 @@ -2462,51 +2477,52 @@ ; CHECK: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr28 ; CHECK: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr29 ; CHECK: [[COPY30:%[0-9]+]]:_(s32) = COPY 
$vgpr30 - ; CHECK: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr31 - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32) - ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.15 - ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.15, align 16, addrspace 5) - ; CHECK: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.14 - ; CHECK: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s32) from %fixed-stack.14, addrspace 5) - ; CHECK: [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.13 - ; CHECK: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load (s32) from %fixed-stack.13, align 8, addrspace 5) - ; CHECK: [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.12 - ; CHECK: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX3]](p5) :: (invariant load (s32) from %fixed-stack.12, addrspace 5) - ; CHECK: [[FRAME_INDEX4:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.11 - ; CHECK: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX4]](p5) :: (invariant load (s32) from %fixed-stack.11, align 16, addrspace 5) - ; CHECK: [[FRAME_INDEX5:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.10 - ; CHECK: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX5]](p5) :: (invariant load (s32) from %fixed-stack.10, addrspace 5) - ; CHECK: [[FRAME_INDEX6:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.9 - ; CHECK: [[LOAD6:%[0-9]+]]:_(s32) = 
G_LOAD [[FRAME_INDEX6]](p5) :: (invariant load (s32) from %fixed-stack.9, align 8, addrspace 5) - ; CHECK: [[FRAME_INDEX7:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.8 - ; CHECK: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX7]](p5) :: (invariant load (s32) from %fixed-stack.8, addrspace 5) - ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32), [[LOAD4]](s32), [[LOAD5]](s32), [[LOAD6]](s32), [[LOAD7]](s32) - ; CHECK: [[FRAME_INDEX8:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.7 - ; CHECK: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX8]](p5) :: (invariant load (s32) from %fixed-stack.7, align 16, addrspace 5) - ; CHECK: [[FRAME_INDEX9:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.6 - ; CHECK: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX9]](p5) :: (invariant load (s32) from %fixed-stack.6, addrspace 5) - ; CHECK: [[FRAME_INDEX10:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.5 - ; CHECK: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX10]](p5) :: (invariant load (s32) from %fixed-stack.5, align 8, addrspace 5) - ; CHECK: [[FRAME_INDEX11:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.4 - ; CHECK: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX11]](p5) :: (invariant load (s32) from %fixed-stack.4, addrspace 5) - ; CHECK: [[FRAME_INDEX12:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3 - ; CHECK: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX12]](p5) :: (invariant load (s32) from %fixed-stack.3, align 16, addrspace 5) - ; CHECK: [[FRAME_INDEX13:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2 - ; CHECK: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX13]](p5) :: (invariant load (s32) from %fixed-stack.2, addrspace 5) - ; CHECK: [[FRAME_INDEX14:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1 - ; CHECK: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX14]](p5) :: (invariant load (s32) from %fixed-stack.1, align 8, addrspace 5) - ; CHECK: [[FRAME_INDEX15:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0 
- ; CHECK: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX15]](p5) :: (invariant load (s32) from %fixed-stack.0, addrspace 5) - ; CHECK: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[LOAD8]](s32), [[LOAD9]](s32), [[LOAD10]](s32), [[LOAD11]](s32), [[LOAD12]](s32), [[LOAD13]](s32), [[LOAD14]](s32), [[LOAD15]](s32) - ; CHECK: [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.16 + ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.16, align 16, addrspace 5) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[LOAD]](s32) + ; CHECK: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.15 + ; CHECK: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s32) from %fixed-stack.15, addrspace 5) + ; CHECK: [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.14 + ; CHECK: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load (s32) from %fixed-stack.14, align 8, addrspace 5) + ; CHECK: [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.13 + ; CHECK: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX3]](p5) :: (invariant load (s32) from %fixed-stack.13, addrspace 5) + ; CHECK: [[FRAME_INDEX4:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.12 + ; CHECK: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX4]](p5) :: 
(invariant load (s32) from %fixed-stack.12, align 16, addrspace 5) + ; CHECK: [[FRAME_INDEX5:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.11 + ; CHECK: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX5]](p5) :: (invariant load (s32) from %fixed-stack.11, addrspace 5) + ; CHECK: [[FRAME_INDEX6:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.10 + ; CHECK: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX6]](p5) :: (invariant load (s32) from %fixed-stack.10, align 8, addrspace 5) + ; CHECK: [[FRAME_INDEX7:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.9 + ; CHECK: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX7]](p5) :: (invariant load (s32) from %fixed-stack.9, addrspace 5) + ; CHECK: [[FRAME_INDEX8:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.8 + ; CHECK: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX8]](p5) :: (invariant load (s32) from %fixed-stack.8, align 16, addrspace 5) + ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32), [[LOAD4]](s32), [[LOAD5]](s32), [[LOAD6]](s32), [[LOAD7]](s32), [[LOAD8]](s32) + ; CHECK: [[FRAME_INDEX9:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.7 + ; CHECK: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX9]](p5) :: (invariant load (s32) from %fixed-stack.7, addrspace 5) + ; CHECK: [[FRAME_INDEX10:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.6 + ; CHECK: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX10]](p5) :: (invariant load (s32) from %fixed-stack.6, align 8, addrspace 5) + ; CHECK: [[FRAME_INDEX11:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.5 + ; CHECK: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX11]](p5) :: (invariant load (s32) from %fixed-stack.5, addrspace 5) + ; CHECK: [[FRAME_INDEX12:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.4 + ; CHECK: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX12]](p5) :: (invariant load (s32) from %fixed-stack.4, align 16, addrspace 5) + ; CHECK: [[FRAME_INDEX13:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3 + ; CHECK: 
[[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX13]](p5) :: (invariant load (s32) from %fixed-stack.3, addrspace 5) + ; CHECK: [[FRAME_INDEX14:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2 + ; CHECK: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX14]](p5) :: (invariant load (s32) from %fixed-stack.2, align 8, addrspace 5) + ; CHECK: [[FRAME_INDEX15:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1 + ; CHECK: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX15]](p5) :: (invariant load (s32) from %fixed-stack.1, addrspace 5) + ; CHECK: [[FRAME_INDEX16:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0 + ; CHECK: [[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX16]](p5) :: (invariant load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; CHECK: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[LOAD9]](s32), [[LOAD10]](s32), [[LOAD11]](s32), [[LOAD12]](s32), [[LOAD13]](s32), [[LOAD14]](s32), [[LOAD15]](s32), [[LOAD16]](s32) + ; CHECK: [[COPY31:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; CHECK: [[COPY32:%[0-9]+]]:_(p1) = COPY [[DEF]](p1) ; CHECK: [[COPY33:%[0-9]+]]:_(p1) = COPY [[DEF]](p1) - ; CHECK: [[COPY34:%[0-9]+]]:_(p1) = COPY [[DEF]](p1) ; CHECK: G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (volatile store (<32 x s32>) into `<32 x i32> addrspace(1)* undef`, addrspace 1) - ; CHECK: G_STORE [[BUILD_VECTOR1]](<8 x s32>), [[COPY33]](p1) :: (volatile store (<8 x s32>) into `<8 x i32> addrspace(1)* undef`, addrspace 1) - ; CHECK: G_STORE [[BUILD_VECTOR2]](<8 x s32>), [[COPY34]](p1) :: (volatile store (<8 x s32>) into `<8 x float> addrspace(1)* undef`, addrspace 1) - ; CHECK: [[COPY35:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY32]] - ; CHECK: S_SETPC_B64_return [[COPY35]] + ; CHECK: G_STORE [[BUILD_VECTOR1]](<8 x s32>), [[COPY32]](p1) :: (volatile store (<8 x s32>) into `<8 x i32> addrspace(1)* undef`, addrspace 1) + ; CHECK: G_STORE [[BUILD_VECTOR2]](<8 x s32>), [[COPY33]](p1) :: (volatile store (<8 x s32>) into `<8 
x float> addrspace(1)* undef`, addrspace 1) + ; CHECK: [[COPY34:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY31]] + ; CHECK: S_SETPC_B64_return [[COPY34]] store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef store volatile <8 x i32> %arg1, <8 x i32> addrspace(1)* undef store volatile <8 x float> %arg2, <8 x float> addrspace(1)* undef @@ -2516,7 +2532,7 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, <16 x float> %arg2) #0 { ; CHECK-LABEL: name: void_func_v32i32_v16i32_v16f32 ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $sgpr30_sgpr31 + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 @@ -2548,83 +2564,84 @@ ; CHECK: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr28 ; CHECK: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr29 ; CHECK: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr30 - ; CHECK: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr31 - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), 
[[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32) - ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.31 - ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.31, align 16, addrspace 5) - ; CHECK: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.30 - ; CHECK: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s32) from %fixed-stack.30, addrspace 5) - ; CHECK: [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.29 - ; CHECK: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load (s32) from %fixed-stack.29, align 8, addrspace 5) - ; CHECK: [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.28 - ; CHECK: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX3]](p5) :: (invariant load (s32) from %fixed-stack.28, addrspace 5) - ; CHECK: [[FRAME_INDEX4:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.27 - ; CHECK: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX4]](p5) :: (invariant load (s32) from %fixed-stack.27, align 16, addrspace 5) - ; CHECK: [[FRAME_INDEX5:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.26 - ; CHECK: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX5]](p5) :: (invariant load (s32) from %fixed-stack.26, addrspace 5) - ; CHECK: [[FRAME_INDEX6:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.25 - ; CHECK: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX6]](p5) :: (invariant load (s32) from %fixed-stack.25, align 8, addrspace 5) - ; CHECK: [[FRAME_INDEX7:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.24 - ; CHECK: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX7]](p5) :: (invariant load (s32) from %fixed-stack.24, addrspace 5) - ; CHECK: [[FRAME_INDEX8:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.23 - ; CHECK: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX8]](p5) :: (invariant load (s32) from %fixed-stack.23, align 16, addrspace 5) - ; CHECK: 
[[FRAME_INDEX9:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.22 - ; CHECK: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX9]](p5) :: (invariant load (s32) from %fixed-stack.22, addrspace 5) - ; CHECK: [[FRAME_INDEX10:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.21 - ; CHECK: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX10]](p5) :: (invariant load (s32) from %fixed-stack.21, align 8, addrspace 5) - ; CHECK: [[FRAME_INDEX11:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.20 - ; CHECK: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX11]](p5) :: (invariant load (s32) from %fixed-stack.20, addrspace 5) - ; CHECK: [[FRAME_INDEX12:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.19 - ; CHECK: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX12]](p5) :: (invariant load (s32) from %fixed-stack.19, align 16, addrspace 5) - ; CHECK: [[FRAME_INDEX13:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.18 - ; CHECK: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX13]](p5) :: (invariant load (s32) from %fixed-stack.18, addrspace 5) - ; CHECK: [[FRAME_INDEX14:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.17 - ; CHECK: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX14]](p5) :: (invariant load (s32) from %fixed-stack.17, align 8, addrspace 5) - ; CHECK: [[FRAME_INDEX15:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.16 - ; CHECK: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX15]](p5) :: (invariant load (s32) from %fixed-stack.16, addrspace 5) - ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:_(<16 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32), [[LOAD4]](s32), [[LOAD5]](s32), [[LOAD6]](s32), [[LOAD7]](s32), [[LOAD8]](s32), [[LOAD9]](s32), [[LOAD10]](s32), [[LOAD11]](s32), [[LOAD12]](s32), [[LOAD13]](s32), [[LOAD14]](s32), [[LOAD15]](s32) - ; CHECK: [[FRAME_INDEX16:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.15 - ; CHECK: [[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX16]](p5) :: (invariant load (s32) from %fixed-stack.15, align 16, addrspace 5) - ; CHECK: 
[[FRAME_INDEX17:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.14 - ; CHECK: [[LOAD17:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX17]](p5) :: (invariant load (s32) from %fixed-stack.14, addrspace 5) - ; CHECK: [[FRAME_INDEX18:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.13 - ; CHECK: [[LOAD18:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX18]](p5) :: (invariant load (s32) from %fixed-stack.13, align 8, addrspace 5) - ; CHECK: [[FRAME_INDEX19:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.12 - ; CHECK: [[LOAD19:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX19]](p5) :: (invariant load (s32) from %fixed-stack.12, addrspace 5) - ; CHECK: [[FRAME_INDEX20:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.11 - ; CHECK: [[LOAD20:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX20]](p5) :: (invariant load (s32) from %fixed-stack.11, align 16, addrspace 5) - ; CHECK: [[FRAME_INDEX21:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.10 - ; CHECK: [[LOAD21:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX21]](p5) :: (invariant load (s32) from %fixed-stack.10, addrspace 5) - ; CHECK: [[FRAME_INDEX22:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.9 - ; CHECK: [[LOAD22:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX22]](p5) :: (invariant load (s32) from %fixed-stack.9, align 8, addrspace 5) - ; CHECK: [[FRAME_INDEX23:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.8 - ; CHECK: [[LOAD23:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX23]](p5) :: (invariant load (s32) from %fixed-stack.8, addrspace 5) - ; CHECK: [[FRAME_INDEX24:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.7 - ; CHECK: [[LOAD24:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX24]](p5) :: (invariant load (s32) from %fixed-stack.7, align 16, addrspace 5) - ; CHECK: [[FRAME_INDEX25:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.6 - ; CHECK: [[LOAD25:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX25]](p5) :: (invariant load (s32) from %fixed-stack.6, addrspace 5) - ; CHECK: [[FRAME_INDEX26:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.5 - ; CHECK: [[LOAD26:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX26]](p5) :: 
(invariant load (s32) from %fixed-stack.5, align 8, addrspace 5) - ; CHECK: [[FRAME_INDEX27:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.4 - ; CHECK: [[LOAD27:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX27]](p5) :: (invariant load (s32) from %fixed-stack.4, addrspace 5) - ; CHECK: [[FRAME_INDEX28:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3 - ; CHECK: [[LOAD28:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX28]](p5) :: (invariant load (s32) from %fixed-stack.3, align 16, addrspace 5) - ; CHECK: [[FRAME_INDEX29:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2 - ; CHECK: [[LOAD29:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX29]](p5) :: (invariant load (s32) from %fixed-stack.2, addrspace 5) - ; CHECK: [[FRAME_INDEX30:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1 - ; CHECK: [[LOAD30:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX30]](p5) :: (invariant load (s32) from %fixed-stack.1, align 8, addrspace 5) - ; CHECK: [[FRAME_INDEX31:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0 - ; CHECK: [[LOAD31:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX31]](p5) :: (invariant load (s32) from %fixed-stack.0, addrspace 5) - ; CHECK: [[BUILD_VECTOR2:%[0-9]+]]:_(<16 x s32>) = G_BUILD_VECTOR [[LOAD16]](s32), [[LOAD17]](s32), [[LOAD18]](s32), [[LOAD19]](s32), [[LOAD20]](s32), [[LOAD21]](s32), [[LOAD22]](s32), [[LOAD23]](s32), [[LOAD24]](s32), [[LOAD25]](s32), [[LOAD26]](s32), [[LOAD27]](s32), [[LOAD28]](s32), [[LOAD29]](s32), [[LOAD30]](s32), [[LOAD31]](s32) - ; CHECK: [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.32 + ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.32, align 16, addrspace 5) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), 
[[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[LOAD]](s32) + ; CHECK: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.31 + ; CHECK: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s32) from %fixed-stack.31, addrspace 5) + ; CHECK: [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.30 + ; CHECK: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load (s32) from %fixed-stack.30, align 8, addrspace 5) + ; CHECK: [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.29 + ; CHECK: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX3]](p5) :: (invariant load (s32) from %fixed-stack.29, addrspace 5) + ; CHECK: [[FRAME_INDEX4:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.28 + ; CHECK: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX4]](p5) :: (invariant load (s32) from %fixed-stack.28, align 16, addrspace 5) + ; CHECK: [[FRAME_INDEX5:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.27 + ; CHECK: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX5]](p5) :: (invariant load (s32) from %fixed-stack.27, addrspace 5) + ; CHECK: [[FRAME_INDEX6:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.26 + ; CHECK: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX6]](p5) :: (invariant load (s32) from %fixed-stack.26, align 8, addrspace 5) + ; CHECK: [[FRAME_INDEX7:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.25 + ; CHECK: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX7]](p5) :: (invariant load (s32) from %fixed-stack.25, addrspace 5) + ; CHECK: [[FRAME_INDEX8:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.24 + ; CHECK: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX8]](p5) :: (invariant load (s32) from %fixed-stack.24, align 16, addrspace 5) + ; 
CHECK: [[FRAME_INDEX9:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.23 + ; CHECK: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX9]](p5) :: (invariant load (s32) from %fixed-stack.23, addrspace 5) + ; CHECK: [[FRAME_INDEX10:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.22 + ; CHECK: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX10]](p5) :: (invariant load (s32) from %fixed-stack.22, align 8, addrspace 5) + ; CHECK: [[FRAME_INDEX11:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.21 + ; CHECK: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX11]](p5) :: (invariant load (s32) from %fixed-stack.21, addrspace 5) + ; CHECK: [[FRAME_INDEX12:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.20 + ; CHECK: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX12]](p5) :: (invariant load (s32) from %fixed-stack.20, align 16, addrspace 5) + ; CHECK: [[FRAME_INDEX13:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.19 + ; CHECK: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX13]](p5) :: (invariant load (s32) from %fixed-stack.19, addrspace 5) + ; CHECK: [[FRAME_INDEX14:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.18 + ; CHECK: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX14]](p5) :: (invariant load (s32) from %fixed-stack.18, align 8, addrspace 5) + ; CHECK: [[FRAME_INDEX15:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.17 + ; CHECK: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX15]](p5) :: (invariant load (s32) from %fixed-stack.17, addrspace 5) + ; CHECK: [[FRAME_INDEX16:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.16 + ; CHECK: [[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX16]](p5) :: (invariant load (s32) from %fixed-stack.16, align 16, addrspace 5) + ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:_(<16 x s32>) = G_BUILD_VECTOR [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32), [[LOAD4]](s32), [[LOAD5]](s32), [[LOAD6]](s32), [[LOAD7]](s32), [[LOAD8]](s32), [[LOAD9]](s32), [[LOAD10]](s32), [[LOAD11]](s32), [[LOAD12]](s32), [[LOAD13]](s32), [[LOAD14]](s32), [[LOAD15]](s32), [[LOAD16]](s32) + ; CHECK: 
[[FRAME_INDEX17:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.15 + ; CHECK: [[LOAD17:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX17]](p5) :: (invariant load (s32) from %fixed-stack.15, addrspace 5) + ; CHECK: [[FRAME_INDEX18:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.14 + ; CHECK: [[LOAD18:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX18]](p5) :: (invariant load (s32) from %fixed-stack.14, align 8, addrspace 5) + ; CHECK: [[FRAME_INDEX19:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.13 + ; CHECK: [[LOAD19:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX19]](p5) :: (invariant load (s32) from %fixed-stack.13, addrspace 5) + ; CHECK: [[FRAME_INDEX20:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.12 + ; CHECK: [[LOAD20:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX20]](p5) :: (invariant load (s32) from %fixed-stack.12, align 16, addrspace 5) + ; CHECK: [[FRAME_INDEX21:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.11 + ; CHECK: [[LOAD21:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX21]](p5) :: (invariant load (s32) from %fixed-stack.11, addrspace 5) + ; CHECK: [[FRAME_INDEX22:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.10 + ; CHECK: [[LOAD22:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX22]](p5) :: (invariant load (s32) from %fixed-stack.10, align 8, addrspace 5) + ; CHECK: [[FRAME_INDEX23:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.9 + ; CHECK: [[LOAD23:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX23]](p5) :: (invariant load (s32) from %fixed-stack.9, addrspace 5) + ; CHECK: [[FRAME_INDEX24:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.8 + ; CHECK: [[LOAD24:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX24]](p5) :: (invariant load (s32) from %fixed-stack.8, align 16, addrspace 5) + ; CHECK: [[FRAME_INDEX25:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.7 + ; CHECK: [[LOAD25:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX25]](p5) :: (invariant load (s32) from %fixed-stack.7, addrspace 5) + ; CHECK: [[FRAME_INDEX26:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.6 + ; CHECK: [[LOAD26:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX26]](p5) :: 
(invariant load (s32) from %fixed-stack.6, align 8, addrspace 5) + ; CHECK: [[FRAME_INDEX27:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.5 + ; CHECK: [[LOAD27:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX27]](p5) :: (invariant load (s32) from %fixed-stack.5, addrspace 5) + ; CHECK: [[FRAME_INDEX28:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.4 + ; CHECK: [[LOAD28:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX28]](p5) :: (invariant load (s32) from %fixed-stack.4, align 16, addrspace 5) + ; CHECK: [[FRAME_INDEX29:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3 + ; CHECK: [[LOAD29:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX29]](p5) :: (invariant load (s32) from %fixed-stack.3, addrspace 5) + ; CHECK: [[FRAME_INDEX30:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2 + ; CHECK: [[LOAD30:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX30]](p5) :: (invariant load (s32) from %fixed-stack.2, align 8, addrspace 5) + ; CHECK: [[FRAME_INDEX31:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1 + ; CHECK: [[LOAD31:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX31]](p5) :: (invariant load (s32) from %fixed-stack.1, addrspace 5) + ; CHECK: [[FRAME_INDEX32:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0 + ; CHECK: [[LOAD32:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX32]](p5) :: (invariant load (s32) from %fixed-stack.0, align 16, addrspace 5) + ; CHECK: [[BUILD_VECTOR2:%[0-9]+]]:_(<16 x s32>) = G_BUILD_VECTOR [[LOAD17]](s32), [[LOAD18]](s32), [[LOAD19]](s32), [[LOAD20]](s32), [[LOAD21]](s32), [[LOAD22]](s32), [[LOAD23]](s32), [[LOAD24]](s32), [[LOAD25]](s32), [[LOAD26]](s32), [[LOAD27]](s32), [[LOAD28]](s32), [[LOAD29]](s32), [[LOAD30]](s32), [[LOAD31]](s32), [[LOAD32]](s32) + ; CHECK: [[COPY31:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; CHECK: [[COPY32:%[0-9]+]]:_(p1) = COPY [[DEF]](p1) ; CHECK: [[COPY33:%[0-9]+]]:_(p1) = COPY [[DEF]](p1) - ; CHECK: [[COPY34:%[0-9]+]]:_(p1) = COPY [[DEF]](p1) ; CHECK: G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (volatile store (<32 x 
s32>) into `<32 x i32> addrspace(1)* undef`, addrspace 1) - ; CHECK: G_STORE [[BUILD_VECTOR1]](<16 x s32>), [[COPY33]](p1) :: (volatile store (<16 x s32>) into `<16 x i32> addrspace(1)* undef`, addrspace 1) - ; CHECK: G_STORE [[BUILD_VECTOR2]](<16 x s32>), [[COPY34]](p1) :: (volatile store (<16 x s32>) into `<16 x float> addrspace(1)* undef`, addrspace 1) - ; CHECK: [[COPY35:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY32]] - ; CHECK: S_SETPC_B64_return [[COPY35]] + ; CHECK: G_STORE [[BUILD_VECTOR1]](<16 x s32>), [[COPY32]](p1) :: (volatile store (<16 x s32>) into `<16 x i32> addrspace(1)* undef`, addrspace 1) + ; CHECK: G_STORE [[BUILD_VECTOR2]](<16 x s32>), [[COPY33]](p1) :: (volatile store (<16 x s32>) into `<16 x float> addrspace(1)* undef`, addrspace 1) + ; CHECK: [[COPY34:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY31]] + ; CHECK: S_SETPC_B64_return [[COPY34]] store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef store volatile <16 x i32> %arg1, <16 x i32> addrspace(1)* undef store volatile <16 x float> %arg2, <16 x float> addrspace(1)* undef @@ -2751,7 +2768,7 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { ; CHECK-LABEL: name: void_func_v32i32_v16i8 ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $sgpr30_sgpr31 + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $sgpr30_sgpr31 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 @@ -2783,49 
+2800,50 @@ ; CHECK: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr28 ; CHECK: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr29 ; CHECK: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr30 - ; CHECK: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr31 - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32) - ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.15 - ; CHECK: [[LOAD:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s16) from %fixed-stack.15, align 16, addrspace 5) - ; CHECK: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.14 - ; CHECK: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s16) from %fixed-stack.14, align 4, addrspace 5) - ; CHECK: [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.13 - ; CHECK: [[LOAD2:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load (s16) from %fixed-stack.13, align 8, addrspace 5) - ; CHECK: [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.12 - ; CHECK: [[LOAD3:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX3]](p5) :: (invariant load (s16) from %fixed-stack.12, align 4, addrspace 5) - ; CHECK: [[FRAME_INDEX4:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.11 - ; CHECK: [[LOAD4:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX4]](p5) :: (invariant load (s16) from %fixed-stack.11, align 16, addrspace 5) - ; CHECK: [[FRAME_INDEX5:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.10 - ; CHECK: [[LOAD5:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX5]](p5) :: 
(invariant load (s16) from %fixed-stack.10, align 4, addrspace 5) - ; CHECK: [[FRAME_INDEX6:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.9 - ; CHECK: [[LOAD6:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX6]](p5) :: (invariant load (s16) from %fixed-stack.9, align 8, addrspace 5) - ; CHECK: [[FRAME_INDEX7:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.8 - ; CHECK: [[LOAD7:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX7]](p5) :: (invariant load (s16) from %fixed-stack.8, align 4, addrspace 5) - ; CHECK: [[FRAME_INDEX8:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.7 - ; CHECK: [[LOAD8:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX8]](p5) :: (invariant load (s16) from %fixed-stack.7, align 16, addrspace 5) - ; CHECK: [[FRAME_INDEX9:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.6 - ; CHECK: [[LOAD9:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX9]](p5) :: (invariant load (s16) from %fixed-stack.6, align 4, addrspace 5) - ; CHECK: [[FRAME_INDEX10:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.5 - ; CHECK: [[LOAD10:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX10]](p5) :: (invariant load (s16) from %fixed-stack.5, align 8, addrspace 5) - ; CHECK: [[FRAME_INDEX11:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.4 - ; CHECK: [[LOAD11:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX11]](p5) :: (invariant load (s16) from %fixed-stack.4, align 4, addrspace 5) - ; CHECK: [[FRAME_INDEX12:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3 - ; CHECK: [[LOAD12:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX12]](p5) :: (invariant load (s16) from %fixed-stack.3, align 16, addrspace 5) - ; CHECK: [[FRAME_INDEX13:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2 - ; CHECK: [[LOAD13:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX13]](p5) :: (invariant load (s16) from %fixed-stack.2, align 4, addrspace 5) - ; CHECK: [[FRAME_INDEX14:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1 - ; CHECK: [[LOAD14:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX14]](p5) :: (invariant load (s16) from %fixed-stack.1, align 8, addrspace 5) - ; CHECK: [[FRAME_INDEX15:%[0-9]+]]:_(p5) = 
G_FRAME_INDEX %fixed-stack.0 - ; CHECK: [[LOAD15:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX15]](p5) :: (invariant load (s16) from %fixed-stack.0, align 4, addrspace 5) - ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:_(<16 x s16>) = G_BUILD_VECTOR [[LOAD]](s16), [[LOAD1]](s16), [[LOAD2]](s16), [[LOAD3]](s16), [[LOAD4]](s16), [[LOAD5]](s16), [[LOAD6]](s16), [[LOAD7]](s16), [[LOAD8]](s16), [[LOAD9]](s16), [[LOAD10]](s16), [[LOAD11]](s16), [[LOAD12]](s16), [[LOAD13]](s16), [[LOAD14]](s16), [[LOAD15]](s16) + ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.16 + ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load (s32) from %fixed-stack.16, align 16, addrspace 5) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[LOAD]](s32) + ; CHECK: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.15 + ; CHECK: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load (s16) from %fixed-stack.15, align 4, addrspace 5) + ; CHECK: [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.14 + ; CHECK: [[LOAD2:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load (s16) from %fixed-stack.14, align 8, addrspace 5) + ; CHECK: [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.13 + ; CHECK: [[LOAD3:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX3]](p5) :: (invariant load (s16) from %fixed-stack.13, align 4, addrspace 5) + ; CHECK: [[FRAME_INDEX4:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.12 + 
; CHECK: [[LOAD4:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX4]](p5) :: (invariant load (s16) from %fixed-stack.12, align 16, addrspace 5) + ; CHECK: [[FRAME_INDEX5:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.11 + ; CHECK: [[LOAD5:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX5]](p5) :: (invariant load (s16) from %fixed-stack.11, align 4, addrspace 5) + ; CHECK: [[FRAME_INDEX6:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.10 + ; CHECK: [[LOAD6:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX6]](p5) :: (invariant load (s16) from %fixed-stack.10, align 8, addrspace 5) + ; CHECK: [[FRAME_INDEX7:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.9 + ; CHECK: [[LOAD7:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX7]](p5) :: (invariant load (s16) from %fixed-stack.9, align 4, addrspace 5) + ; CHECK: [[FRAME_INDEX8:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.8 + ; CHECK: [[LOAD8:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX8]](p5) :: (invariant load (s16) from %fixed-stack.8, align 16, addrspace 5) + ; CHECK: [[FRAME_INDEX9:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.7 + ; CHECK: [[LOAD9:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX9]](p5) :: (invariant load (s16) from %fixed-stack.7, align 4, addrspace 5) + ; CHECK: [[FRAME_INDEX10:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.6 + ; CHECK: [[LOAD10:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX10]](p5) :: (invariant load (s16) from %fixed-stack.6, align 8, addrspace 5) + ; CHECK: [[FRAME_INDEX11:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.5 + ; CHECK: [[LOAD11:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX11]](p5) :: (invariant load (s16) from %fixed-stack.5, align 4, addrspace 5) + ; CHECK: [[FRAME_INDEX12:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.4 + ; CHECK: [[LOAD12:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX12]](p5) :: (invariant load (s16) from %fixed-stack.4, align 16, addrspace 5) + ; CHECK: [[FRAME_INDEX13:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3 + ; CHECK: [[LOAD13:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX13]](p5) :: (invariant load (s16) from %fixed-stack.3, align 
4, addrspace 5) + ; CHECK: [[FRAME_INDEX14:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2 + ; CHECK: [[LOAD14:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX14]](p5) :: (invariant load (s16) from %fixed-stack.2, align 8, addrspace 5) + ; CHECK: [[FRAME_INDEX15:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1 + ; CHECK: [[LOAD15:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX15]](p5) :: (invariant load (s16) from %fixed-stack.1, align 4, addrspace 5) + ; CHECK: [[FRAME_INDEX16:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0 + ; CHECK: [[LOAD16:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX16]](p5) :: (invariant load (s16) from %fixed-stack.0, align 16, addrspace 5) + ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:_(<16 x s16>) = G_BUILD_VECTOR [[LOAD1]](s16), [[LOAD2]](s16), [[LOAD3]](s16), [[LOAD4]](s16), [[LOAD5]](s16), [[LOAD6]](s16), [[LOAD7]](s16), [[LOAD8]](s16), [[LOAD9]](s16), [[LOAD10]](s16), [[LOAD11]](s16), [[LOAD12]](s16), [[LOAD13]](s16), [[LOAD14]](s16), [[LOAD15]](s16), [[LOAD16]](s16) ; CHECK: [[TRUNC:%[0-9]+]]:_(<16 x s8>) = G_TRUNC [[BUILD_VECTOR1]](<16 x s16>) - ; CHECK: [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[COPY31:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF - ; CHECK: [[COPY33:%[0-9]+]]:_(p1) = COPY [[DEF]](p1) + ; CHECK: [[COPY32:%[0-9]+]]:_(p1) = COPY [[DEF]](p1) ; CHECK: G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (volatile store (<32 x s32>) into `<32 x i32> addrspace(1)* undef`, addrspace 1) - ; CHECK: G_STORE [[TRUNC]](<16 x s8>), [[COPY33]](p1) :: (volatile store (<16 x s8>) into `<16 x i8> addrspace(1)* undef`, addrspace 1) - ; CHECK: [[COPY34:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY32]] - ; CHECK: S_SETPC_B64_return [[COPY34]] + ; CHECK: G_STORE [[TRUNC]](<16 x s8>), [[COPY32]](p1) :: (volatile store (<16 x s8>) into `<16 x i8> addrspace(1)* undef`, addrspace 1) + ; CHECK: [[COPY33:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY31]] + ; CHECK: S_SETPC_B64_return [[COPY33]] store volatile <32 x i32> %arg0, <32 x i32> 
addrspace(1)* undef store volatile <16 x i8> %arg1, <16 x i8> addrspace(1)* undef ret void Index: llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-inline-asm.ll @@ -54,9 +54,9 @@ ; CHECK-NEXT: liveins: $sgpr30_sgpr31 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7; v_mov_b32 $1, 7", 1 /* sideeffect attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %1, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %2, !0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %2 + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7; v_mov_b32 $1, 7", 1 /* sideeffect attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %8, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %9, !0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %8 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %9 ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY1]], [[COPY2]] ; CHECK-NEXT: $vgpr0 = COPY [[ADD]](s32) ; CHECK-NEXT: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] @@ -90,8 +90,8 @@ ; CHECK-NEXT: liveins: $sgpr30_sgpr31 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %1 + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 7", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %8 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: $vgpr0 = COPY [[COPY1]](s32) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] ; CHECK-NEXT: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 @@ -106,8 +106,8 @@ ; CHECK-NEXT: liveins: $sgpr30_sgpr31 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: 
[[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %1 + ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %8 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: $vgpr0 = COPY [[COPY1]](s32) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] ; CHECK-NEXT: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 @@ -123,9 +123,9 @@ ; CHECK-NEXT: liveins: $sgpr30_sgpr31 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_mov_b32 $1, 1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %1, 1835018 /* regdef:VGPR_32 */, def %2 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %2 + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_mov_b32 $1, 1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %8, 1835018 /* regdef:VGPR_32 */, def %9 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %8 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %9 ; CHECK-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY1]], [[COPY2]] ; CHECK-NEXT: $vgpr0 = COPY [[FADD]](s32) ; CHECK-NEXT: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] @@ -144,9 +144,9 @@ ; CHECK-NEXT: liveins: $sgpr30_sgpr31 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %1, 2949130 /* regdef:VReg_64 */, def %2 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY %2 + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0; v_add_f64 $1, 0, 0", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %8, 2949130 /* regdef:VReg_64 */, def %9 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %8 + ; CHECK-NEXT: 
[[COPY2:%[0-9]+]]:_(s64) = COPY %9 ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](s64) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) @@ -217,8 +217,8 @@ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32) - ; CHECK-NEXT: INLINEASM &"v_add_f32 $0, 1.0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %2, 1835017 /* reguse:VGPR_32 */, [[COPY2]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY %2 + ; CHECK-NEXT: INLINEASM &"v_add_f32 $0, 1.0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %9, 1835017 /* reguse:VGPR_32 */, [[COPY2]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY %9 ; CHECK-NEXT: $vgpr0 = COPY [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] ; CHECK-NEXT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0 @@ -234,8 +234,8 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK-NEXT: INLINEASM &"ds_read_b32 $0, $1", 8 /* mayload attdialect */, 1835018 /* regdef:VGPR_32 */, def %2, 196622 /* mem:m */, [[COPY]](p3) - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %2 + ; CHECK-NEXT: INLINEASM &"ds_read_b32 $0, $1", 8 /* mayload attdialect */, 1835018 /* regdef:VGPR_32 */, def %9, 196622 /* mem:m */, [[COPY]](p3) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %9 ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32) ; CHECK-NEXT: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] ; CHECK-NEXT: S_SETPC_B64_return [[COPY3]], implicit $vgpr0 @@ -253,8 +253,8 @@ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]] ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[AND]](s32) - ; CHECK-NEXT: INLINEASM &";", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %4, 2147483657 /* 
reguse tiedto:$0 */, [[COPY2]](tied-def 3) - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY %4 + ; CHECK-NEXT: INLINEASM &";", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %11, 2147483657 /* reguse tiedto:$0 */, [[COPY2]](tied-def 3) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY %11 ; CHECK-NEXT: $vgpr0 = COPY [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] ; CHECK-NEXT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0 @@ -269,14 +269,14 @@ ; CHECK-NEXT: liveins: $sgpr30_sgpr31 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %1 - ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 8", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %3 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %3 + ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %8 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %8 + ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 8", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %10 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %10 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY1]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[COPY2]](s32) - ; CHECK-NEXT: INLINEASM &"s_add_u32 $0, $1, $2", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %5, 1966089 /* reguse:SReg_32 */, [[COPY3]], 2147483657 /* reguse tiedto:$0 */, [[COPY4]](tied-def 3) - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY %5 + ; CHECK-NEXT: INLINEASM &"s_add_u32 $0, $1, $2", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %12, 1966089 /* reguse:SReg_32 */, [[COPY3]], 2147483657 /* reguse tiedto:$0 */, [[COPY4]](tied-def 3) + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY %12 ; CHECK-NEXT: $vgpr0 = COPY [[COPY5]](s32) ; CHECK-NEXT: [[COPY6:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] ; CHECK-NEXT: 
S_SETPC_B64_return [[COPY6]], implicit $vgpr0 @@ -300,10 +300,10 @@ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]](s32) ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]](s32) ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY1]](s32) - ; CHECK-NEXT: INLINEASM &"; ", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %4, 1835018 /* regdef:VGPR_32 */, def %5, 1835018 /* regdef:VGPR_32 */, def %6, 2147483657 /* reguse tiedto:$0 */, [[COPY4]](tied-def 3), 2147614729 /* reguse tiedto:$2 */, [[COPY5]](tied-def 7), 2147549193 /* reguse tiedto:$1 */, [[COPY6]](tied-def 5) - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY %4 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY %5 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY %6 + ; CHECK-NEXT: INLINEASM &"; ", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def %11, 1835018 /* regdef:VGPR_32 */, def %12, 1835018 /* regdef:VGPR_32 */, def %13, 2147483657 /* reguse tiedto:$0 */, [[COPY4]](tied-def 3), 2147614729 /* reguse tiedto:$2 */, [[COPY5]](tied-def 7), 2147549193 /* reguse tiedto:$1 */, [[COPY6]](tied-def 5) + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY %11 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY %12 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY %13 ; CHECK-NEXT: G_STORE [[COPY7]](s32), [[DEF]](p1) :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1) ; CHECK-NEXT: G_STORE [[COPY8]](s32), [[DEF]](p1) :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1) ; CHECK-NEXT: G_STORE [[COPY9]](s32), [[DEF]](p1) :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1) @@ -325,11 +325,11 @@ ; CHECK-NEXT: liveins: $sgpr30_sgpr31 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %1 + ; CHECK-NEXT: INLINEASM &"s_mov_b32 $0, 7", 0 /* attdialect */, 1966090 /* regdef:SReg_32 */, def %8 
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %8 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]](s32) - ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %3, 2147483657 /* reguse tiedto:$0 */, [[COPY2]](tied-def 3) - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY %3 + ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, $1", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %10, 2147483657 /* reguse tiedto:$0 */, [[COPY2]](tied-def 3) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY %10 ; CHECK-NEXT: $vgpr0 = COPY [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] ; CHECK-NEXT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sibling-call.ll @@ -142,9 +142,9 @@ define amdgpu_kernel void @kernel_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 { ; GCN-LABEL: name: kernel_call_i32_fastcc_i32_i32_unused_result ; GCN: bb.1.entry: - ; GCN-NEXT: liveins: $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $sgpr4_sgpr5 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; GCN-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll @@ -14,7 +14,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: s_mov_b32 m0, -1 
-; GFX8-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], s[6:7] ; GFX8-NEXT: s_trap 2 ; GFX8-NEXT: ds_write_b32 v0, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -37,7 +37,7 @@ ; GFX8-LABEL: func_use_lds_global_constexpr_cast: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], s[6:7] ; GFX8-NEXT: s_trap 2 ; GFX8-NEXT: flat_store_dword v[0:1], v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll @@ -99,8 +99,8 @@ ; ALL-LABEL: {{^}}test_workitem_id_x_func: ; ALL: s_waitcnt -; HSA-NEXT: v_and_b32_e32 v2, 0x3ff, v2 -; MESA-NEXT: v_and_b32_e32 v2, 0x3ff, v2 +; HSA-NEXT: v_and_b32_e32 v2, 0x3ff, v31 +; MESA-NEXT: v_and_b32_e32 v2, 0x3ff, v31 define void @test_workitem_id_x_func(i32 addrspace(1)* %out) #1 { %id = call i32 @llvm.amdgcn.workitem.id.x() store i32 %id, i32 addrspace(1)* %out @@ -108,8 +108,8 @@ } ; ALL-LABEL: {{^}}test_workitem_id_y_func: -; HSA: v_bfe_u32 v2, v2, 10, 10 -; MESA: v_bfe_u32 v2, v2, 10, 10 +; HSA: v_bfe_u32 v2, v31, 10, 10 +; MESA: v_bfe_u32 v2, v31, 10, 10 define void @test_workitem_id_y_func(i32 addrspace(1)* %out) #1 { %id = call i32 @llvm.amdgcn.workitem.id.y() store i32 %id, i32 addrspace(1)* %out @@ -117,8 +117,8 @@ } ; ALL-LABEL: {{^}}test_workitem_id_z_func: -; HSA: v_bfe_u32 v2, v2, 20, 10 -; MESA: v_bfe_u32 v2, v2, 20, 10 +; HSA: v_bfe_u32 v2, v31, 20, 10 +; MESA: v_bfe_u32 v2, v31, 20, 10 define void @test_workitem_id_z_func(i32 addrspace(1)* %out) #1 { %id = call i32 @llvm.amdgcn.workitem.id.z() store i32 %id, i32 addrspace(1)* %out Index: llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll =================================================================== --- 
llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -461,6 +461,7 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_lshrrev_b32_e32 v0, v16, v0 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; GCN-NEXT: v_lshrrev_b32_e32 v1, v17, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v2, v18, v2 ; GCN-NEXT: v_lshrrev_b32_e32 v3, v19, v3 @@ -475,13 +476,15 @@ ; GCN-NEXT: v_lshrrev_b32_e32 v12, v28, v12 ; GCN-NEXT: v_lshrrev_b32_e32 v13, v29, v13 ; GCN-NEXT: v_lshrrev_b32_e32 v14, v30, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v15, v31, v15 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v15, v16, v15 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_lshr_v16i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v16, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, v17, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v18, v2 @@ -497,6 +500,7 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v12, v28, v12 ; GFX10-NEXT: v_lshrrev_b32_e32 v13, v29, v13 ; GFX10-NEXT: v_lshrrev_b32_e32 v14, v30, v14 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v15, v31, v15 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = lshr <16 x i32> %value, %amount Index: llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll @@ -170,7 +170,7 @@ ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v4 ; GCN-NEXT: v_add_u32_e32 v2, s6, v2 ; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v5 +; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v31 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_u32_e32 v2, v2, v3 ; GCN-NEXT: global_store_dword v[0:1], v2, off @@ -225,14 +225,14 @@ ; GCN-NEXT: s_add_u32 
s6, s32, 0x1000 ; GCN-NEXT: s_and_b32 s6, s6, 0xfffff000 ; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: v_mov_b32_e32 v5, s6 -; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v4, s6 +; GCN-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; GCN-NEXT: v_mov_b32_e32 v2, 1 -; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen offset:4 +; GCN-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:4 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v3 ; GCN-NEXT: v_add_u32_e32 v2, s6, v2 ; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v4 +; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v31 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_u32_e32 v2, v2, v3 ; GCN-NEXT: global_store_dword v[0:1], v2, off Index: llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -1916,13 +1916,13 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_brev_b32 s4, 1 -; GFX6-NEXT: v_min_i32_e32 v32, 0, v0 -; GFX6-NEXT: v_sub_i32_e32 v32, vcc, s4, v32 -; GFX6-NEXT: v_max_i32_e32 v16, v32, v16 +; GFX6-NEXT: v_min_i32_e32 v31, 0, v0 +; GFX6-NEXT: v_sub_i32_e32 v31, vcc, s4, v31 +; GFX6-NEXT: v_max_i32_e32 v16, v31, v16 ; GFX6-NEXT: s_brev_b32 s5, -2 -; GFX6-NEXT: v_max_i32_e32 v32, 0, v0 -; GFX6-NEXT: v_sub_i32_e32 v32, vcc, s5, v32 -; GFX6-NEXT: v_min_i32_e32 v16, v16, v32 +; GFX6-NEXT: v_max_i32_e32 v31, 0, v0 +; GFX6-NEXT: v_sub_i32_e32 v31, vcc, s5, v31 +; GFX6-NEXT: v_min_i32_e32 v16, v16, v31 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v16 ; GFX6-NEXT: v_min_i32_e32 v16, 0, v1 ; GFX6-NEXT: v_sub_i32_e32 v16, vcc, s4, v16 @@ -1949,8 +1949,9 @@ ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v17 ; GFX6-NEXT: v_min_i32_e32 v17, 0, v4 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17 -; GFX6-NEXT: v_max_i32_e32 v19, 0, v4 ; GFX6-NEXT: v_max_i32_e32 v17, v17, 
v20 +; GFX6-NEXT: buffer_load_dword v20, off, s[0:3], s32 +; GFX6-NEXT: v_max_i32_e32 v19, 0, v4 ; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v18, v19 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v17 @@ -2028,7 +2029,8 @@ ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v18, v17 ; GFX6-NEXT: v_min_i32_e32 v18, 0, v15 ; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v18 -; GFX6-NEXT: v_max_i32_e32 v16, v16, v31 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_max_i32_e32 v16, v16, v20 ; GFX6-NEXT: v_min_i32_e32 v16, v16, v17 ; GFX6-NEXT: v_add_i32_e32 v15, vcc, v15, v16 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2037,13 +2039,13 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_brev_b32 s4, 1 -; GFX8-NEXT: v_min_i32_e32 v32, 0, v0 -; GFX8-NEXT: v_sub_u32_e32 v32, vcc, s4, v32 -; GFX8-NEXT: v_max_i32_e32 v16, v32, v16 +; GFX8-NEXT: v_min_i32_e32 v31, 0, v0 +; GFX8-NEXT: v_sub_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_max_i32_e32 v16, v31, v16 ; GFX8-NEXT: s_brev_b32 s5, -2 -; GFX8-NEXT: v_max_i32_e32 v32, 0, v0 -; GFX8-NEXT: v_sub_u32_e32 v32, vcc, s5, v32 -; GFX8-NEXT: v_min_i32_e32 v16, v16, v32 +; GFX8-NEXT: v_max_i32_e32 v31, 0, v0 +; GFX8-NEXT: v_sub_u32_e32 v31, vcc, s5, v31 +; GFX8-NEXT: v_min_i32_e32 v16, v16, v31 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v16 ; GFX8-NEXT: v_min_i32_e32 v16, 0, v1 ; GFX8-NEXT: v_sub_u32_e32 v16, vcc, s4, v16 @@ -2070,8 +2072,9 @@ ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v17 ; GFX8-NEXT: v_min_i32_e32 v17, 0, v4 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17 -; GFX8-NEXT: v_max_i32_e32 v19, 0, v4 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v20 +; GFX8-NEXT: buffer_load_dword v20, off, s[0:3], s32 +; GFX8-NEXT: v_max_i32_e32 v19, 0, v4 ; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v18, v19 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v17 @@ -2149,7 +2152,8 @@ ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v18, v17 ; GFX8-NEXT: v_min_i32_e32 v18, 0, v15 ; GFX8-NEXT: v_sub_u32_e32 
v16, vcc, v16, v18 -; GFX8-NEXT: v_max_i32_e32 v16, v16, v31 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_i32_e32 v16, v16, v20 ; GFX8-NEXT: v_min_i32_e32 v16, v16, v17 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v16 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -2158,6 +2162,7 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_i32 v0, v0, v16 clamp +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; GFX9-NEXT: v_add_i32 v1, v1, v17 clamp ; GFX9-NEXT: v_add_i32 v2, v2, v18 clamp ; GFX9-NEXT: v_add_i32 v3, v3, v19 clamp @@ -2172,13 +2177,15 @@ ; GFX9-NEXT: v_add_i32 v12, v12, v28 clamp ; GFX9-NEXT: v_add_i32 v13, v13, v29 clamp ; GFX9-NEXT: v_add_i32 v14, v14, v30 clamp -; GFX9-NEXT: v_add_i32 v15, v15, v31 clamp +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_i32 v15, v15, v16 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_saddsat_v16i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX10-NEXT: v_add_nc_i32 v0, v0, v16 clamp ; GFX10-NEXT: v_add_nc_i32 v1, v1, v17 clamp ; GFX10-NEXT: v_add_nc_i32 v2, v2, v18 clamp @@ -2194,6 +2201,7 @@ ; GFX10-NEXT: v_add_nc_i32 v12, v12, v28 clamp ; GFX10-NEXT: v_add_nc_i32 v13, v13, v29 clamp ; GFX10-NEXT: v_add_nc_i32 v14, v14, v30 clamp +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_i32 v15, v15, v31 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll @@ -473,6 +473,7 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v0, v16, v0 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; GCN-NEXT: v_lshlrev_b32_e32 
v1, v17, v1 ; GCN-NEXT: v_lshlrev_b32_e32 v2, v18, v2 ; GCN-NEXT: v_lshlrev_b32_e32 v3, v19, v3 @@ -487,13 +488,15 @@ ; GCN-NEXT: v_lshlrev_b32_e32 v12, v28, v12 ; GCN-NEXT: v_lshlrev_b32_e32 v13, v29, v13 ; GCN-NEXT: v_lshlrev_b32_e32 v14, v30, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v15, v31, v15 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v15, v16, v15 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_shl_v16i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, v16, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, v17, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, v18, v2 @@ -509,6 +512,7 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v12, v28, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v13, v29, v13 ; GFX10-NEXT: v_lshlrev_b32_e32 v14, v30, v14 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v15, v31, v15 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = shl <16 x i32> %value, %amount Index: llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -1902,13 +1902,13 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: v_max_i32_e32 v32, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v32, vcc, s4, v32 -; GFX6-NEXT: v_max_i32_e32 v16, v32, v16 +; GFX6-NEXT: v_max_i32_e32 v31, -1, v0 +; GFX6-NEXT: v_subrev_i32_e32 v31, vcc, s4, v31 +; GFX6-NEXT: v_max_i32_e32 v16, v31, v16 ; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: v_min_i32_e32 v32, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v32, vcc, s5, v32 -; GFX6-NEXT: v_min_i32_e32 v16, v16, v32 +; GFX6-NEXT: v_min_i32_e32 v31, -1, v0 +; GFX6-NEXT: v_subrev_i32_e32 v31, vcc, s5, v31 +; GFX6-NEXT: v_min_i32_e32 v16, v16, v31 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 ; 
GFX6-NEXT: v_max_i32_e32 v16, -1, v1 ; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, s4, v16 @@ -1935,8 +1935,9 @@ ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v4 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v16 -; GFX6-NEXT: v_min_i32_e32 v19, -1, v4 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v20 +; GFX6-NEXT: buffer_load_dword v20, off, s[0:3], s32 +; GFX6-NEXT: v_min_i32_e32 v19, -1, v4 ; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v19, v18 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v17 @@ -2014,7 +2015,8 @@ ; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v17, v16 ; GFX6-NEXT: v_min_i32_e32 v17, -1, v15 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v18 -; GFX6-NEXT: v_max_i32_e32 v16, v16, v31 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_max_i32_e32 v16, v16, v20 ; GFX6-NEXT: v_min_i32_e32 v16, v16, v17 ; GFX6-NEXT: v_sub_i32_e32 v15, vcc, v15, v16 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2023,13 +2025,13 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_brev_b32 s4, -2 -; GFX8-NEXT: v_max_i32_e32 v32, -1, v0 -; GFX8-NEXT: v_subrev_u32_e32 v32, vcc, s4, v32 -; GFX8-NEXT: v_max_i32_e32 v16, v32, v16 +; GFX8-NEXT: v_max_i32_e32 v31, -1, v0 +; GFX8-NEXT: v_subrev_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_max_i32_e32 v16, v31, v16 ; GFX8-NEXT: s_brev_b32 s5, 1 -; GFX8-NEXT: v_min_i32_e32 v32, -1, v0 -; GFX8-NEXT: v_subrev_u32_e32 v32, vcc, s5, v32 -; GFX8-NEXT: v_min_i32_e32 v16, v16, v32 +; GFX8-NEXT: v_min_i32_e32 v31, -1, v0 +; GFX8-NEXT: v_subrev_u32_e32 v31, vcc, s5, v31 +; GFX8-NEXT: v_min_i32_e32 v16, v16, v31 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v16 ; GFX8-NEXT: v_max_i32_e32 v16, -1, v1 ; GFX8-NEXT: v_subrev_u32_e32 v16, vcc, s4, v16 @@ -2056,8 +2058,9 @@ ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v4 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v16 -; GFX8-NEXT: v_min_i32_e32 v19, -1, v4 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v20 +; GFX8-NEXT: 
buffer_load_dword v20, off, s[0:3], s32 +; GFX8-NEXT: v_min_i32_e32 v19, -1, v4 ; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v19, v18 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v17 @@ -2135,7 +2138,8 @@ ; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v17, v16 ; GFX8-NEXT: v_min_i32_e32 v17, -1, v15 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v18 -; GFX8-NEXT: v_max_i32_e32 v16, v16, v31 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_i32_e32 v16, v16, v20 ; GFX8-NEXT: v_min_i32_e32 v16, v16, v17 ; GFX8-NEXT: v_sub_u32_e32 v15, vcc, v15, v16 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -2144,6 +2148,7 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_i32 v0, v0, v16 clamp +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; GFX9-NEXT: v_sub_i32 v1, v1, v17 clamp ; GFX9-NEXT: v_sub_i32 v2, v2, v18 clamp ; GFX9-NEXT: v_sub_i32 v3, v3, v19 clamp @@ -2158,13 +2163,15 @@ ; GFX9-NEXT: v_sub_i32 v12, v12, v28 clamp ; GFX9-NEXT: v_sub_i32 v13, v13, v29 clamp ; GFX9-NEXT: v_sub_i32 v14, v14, v30 clamp -; GFX9-NEXT: v_sub_i32 v15, v15, v31 clamp +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_sub_i32 v15, v15, v16 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_ssubsat_v16i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX10-NEXT: v_sub_nc_i32 v0, v0, v16 clamp ; GFX10-NEXT: v_sub_nc_i32 v1, v1, v17 clamp ; GFX10-NEXT: v_sub_nc_i32 v2, v2, v18 clamp @@ -2180,6 +2187,7 @@ ; GFX10-NEXT: v_sub_nc_i32 v12, v12, v28 clamp ; GFX10-NEXT: v_sub_nc_i32 v13, v13, v29 clamp ; GFX10-NEXT: v_sub_nc_i32 v14, v14, v30 clamp +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sub_nc_i32 v15, v15, v31 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll 
=================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -1307,8 +1307,9 @@ ; GFX6-LABEL: v_uaddsat_v16i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_xor_b32_e32 v32, -1, v0 -; GFX6-NEXT: v_min_u32_e32 v16, v32, v16 +; GFX6-NEXT: v_xor_b32_e32 v31, -1, v0 +; GFX6-NEXT: v_min_u32_e32 v16, v31, v16 +; GFX6-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v16 ; GFX6-NEXT: v_xor_b32_e32 v16, -1, v1 ; GFX6-NEXT: v_min_u32_e32 v16, v16, v17 @@ -1353,6 +1354,7 @@ ; GFX6-NEXT: v_min_u32_e32 v16, v16, v30 ; GFX6-NEXT: v_add_i32_e32 v14, vcc, v14, v16 ; GFX6-NEXT: v_xor_b32_e32 v16, -1, v15 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_min_u32_e32 v16, v16, v31 ; GFX6-NEXT: v_add_i32_e32 v15, vcc, v15, v16 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -1361,6 +1363,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v16 clamp +; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v17 clamp ; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v2, v18 clamp ; GFX8-NEXT: v_add_u32_e64 v3, s[4:5], v3, v19 clamp @@ -1375,13 +1378,15 @@ ; GFX8-NEXT: v_add_u32_e64 v12, s[4:5], v12, v28 clamp ; GFX8-NEXT: v_add_u32_e64 v13, s[4:5], v13, v29 clamp ; GFX8-NEXT: v_add_u32_e64 v14, s[4:5], v14, v30 clamp -; GFX8-NEXT: v_add_u32_e64 v15, s[4:5], v15, v31 clamp +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e64 v15, s[4:5], v15, v16 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uaddsat_v16i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_u32_e64 v0, v0, v16 clamp +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; GFX9-NEXT: v_add_u32_e64 v1, v1, v17 clamp ; GFX9-NEXT: v_add_u32_e64 v2, v2, v18 clamp ; GFX9-NEXT: v_add_u32_e64 v3, v3, v19 
clamp @@ -1396,13 +1401,15 @@ ; GFX9-NEXT: v_add_u32_e64 v12, v12, v28 clamp ; GFX9-NEXT: v_add_u32_e64 v13, v13, v29 clamp ; GFX9-NEXT: v_add_u32_e64 v14, v14, v30 clamp -; GFX9-NEXT: v_add_u32_e64 v15, v15, v31 clamp +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e64 v15, v15, v16 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uaddsat_v16i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, v16 clamp ; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, v17 clamp ; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, v18 clamp @@ -1418,6 +1425,7 @@ ; GFX10-NEXT: v_add_nc_u32_e64 v12, v12, v28 clamp ; GFX10-NEXT: v_add_nc_u32_e64 v13, v13, v29 clamp ; GFX10-NEXT: v_add_nc_u32_e64 v14, v14, v30 clamp +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e64 v15, v15, v31 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -1265,29 +1265,31 @@ ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v16 ; GFX6-NEXT: v_min_u32_e32 v16, v3, v19 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v4, v20 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v5, v21 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v6, v22 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v7, v23 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v8, v24 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v9, v25 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v10, v26 -; 
GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v11, v27 -; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v11, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v12, v28 -; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v12, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v13, v29 -; GFX6-NEXT: v_sub_i32_e32 v13, vcc, v13, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v14, v30 -; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v15, v31 +; GFX6-NEXT: buffer_load_dword v16, off, s[0:3], s32 +; GFX6-NEXT: v_min_u32_e32 v17, v4, v20 +; GFX6-NEXT: v_min_u32_e32 v18, v5, v21 +; GFX6-NEXT: v_min_u32_e32 v19, v6, v22 +; GFX6-NEXT: v_min_u32_e32 v20, v7, v23 +; GFX6-NEXT: v_min_u32_e32 v21, v8, v24 +; GFX6-NEXT: v_min_u32_e32 v22, v9, v25 +; GFX6-NEXT: v_min_u32_e32 v23, v10, v26 +; GFX6-NEXT: v_min_u32_e32 v24, v11, v27 +; GFX6-NEXT: v_min_u32_e32 v25, v12, v28 +; GFX6-NEXT: v_min_u32_e32 v26, v13, v29 +; GFX6-NEXT: v_min_u32_e32 v27, v14, v30 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v17 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v18 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v19 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v20 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v21 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v22 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v23 +; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v11, v24 +; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v12, v25 +; GFX6-NEXT: v_sub_i32_e32 v13, vcc, v13, v26 +; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v27 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_min_u32_e32 v16, v15, v16 ; GFX6-NEXT: v_sub_i32_e32 v15, vcc, v15, v16 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -1295,6 +1297,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v16 clamp +; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v17 clamp ; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v18 clamp ; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v3, v19 clamp @@ -1309,13 +1312,15 @@ ; 
GFX8-NEXT: v_sub_u32_e64 v12, s[4:5], v12, v28 clamp ; GFX8-NEXT: v_sub_u32_e64 v13, s[4:5], v13, v29 clamp ; GFX8-NEXT: v_sub_u32_e64 v14, s[4:5], v14, v30 clamp -; GFX8-NEXT: v_sub_u32_e64 v15, s[4:5], v15, v31 clamp +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_sub_u32_e64 v15, s[4:5], v15, v16 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_v16i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_u32_e64 v0, v0, v16 clamp +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; GFX9-NEXT: v_sub_u32_e64 v1, v1, v17 clamp ; GFX9-NEXT: v_sub_u32_e64 v2, v2, v18 clamp ; GFX9-NEXT: v_sub_u32_e64 v3, v3, v19 clamp @@ -1330,13 +1335,15 @@ ; GFX9-NEXT: v_sub_u32_e64 v12, v12, v28 clamp ; GFX9-NEXT: v_sub_u32_e64 v13, v13, v29 clamp ; GFX9-NEXT: v_sub_u32_e64 v14, v14, v30 clamp -; GFX9-NEXT: v_sub_u32_e64 v15, v15, v31 clamp +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_sub_u32_e64 v15, v15, v16 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_usubsat_v16i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v16 clamp ; GFX10-NEXT: v_sub_nc_u32_e64 v1, v1, v17 clamp ; GFX10-NEXT: v_sub_nc_u32_e64 v2, v2, v18 clamp @@ -1352,6 +1359,7 @@ ; GFX10-NEXT: v_sub_nc_u32_e64 v12, v12, v28 clamp ; GFX10-NEXT: v_sub_nc_u32_e64 v13, v13, v29 clamp ; GFX10-NEXT: v_sub_nc_u32_e64 v14, v14, v30 clamp +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sub_nc_u32_e64 v15, v15, v31 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) Index: llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll +++ 
llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll @@ -85,42 +85,46 @@ ; ; FIXEDABI-SDAG-LABEL: parent_kernel_missing_inputs: ; FIXEDABI-SDAG: ; %bb.0: -; FIXEDABI-SDAG-NEXT: s_add_i32 s10, s10, s15 -; FIXEDABI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; FIXEDABI-SDAG-NEXT: s_add_i32 s4, s4, s9 +; FIXEDABI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; FIXEDABI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; FIXEDABI-SDAG-NEXT: s_add_u32 s0, s0, s15 +; FIXEDABI-SDAG-NEXT: s_add_u32 s0, s0, s9 ; FIXEDABI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; FIXEDABI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 -; FIXEDABI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s11 ; FIXEDABI-SDAG-NEXT: s_addc_u32 s1, s1, 0 -; FIXEDABI-SDAG-NEXT: s_mov_b64 s[10:11], s[8:9] +; FIXEDABI-SDAG-NEXT: s_mov_b32 s14, s8 ; FIXEDABI-SDAG-NEXT: v_or_b32_e32 v31, v0, v2 ; FIXEDABI-SDAG-NEXT: s_mov_b64 s[8:9], 0 +; FIXEDABI-SDAG-NEXT: s_mov_b32 s12, s6 +; FIXEDABI-SDAG-NEXT: s_mov_b32 s13, s7 ; FIXEDABI-SDAG-NEXT: s_mov_b32 s32, 0 -; FIXEDABI-SDAG-NEXT: s_getpc_b64 s[16:17] -; FIXEDABI-SDAG-NEXT: s_add_u32 s16, s16, requires_all_inputs@rel32@lo+4 -; FIXEDABI-SDAG-NEXT: s_addc_u32 s17, s17, requires_all_inputs@rel32@hi+12 -; FIXEDABI-SDAG-NEXT: s_swappc_b64 s[30:31], s[16:17] +; FIXEDABI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s5 +; FIXEDABI-SDAG-NEXT: s_getpc_b64 s[4:5] +; FIXEDABI-SDAG-NEXT: s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4 +; FIXEDABI-SDAG-NEXT: s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12 +; FIXEDABI-SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] ; FIXEDABI-SDAG-NEXT: s_endpgm ; ; FIXEDABI-GISEL-LABEL: parent_kernel_missing_inputs: ; FIXEDABI-GISEL: ; %bb.0: -; FIXEDABI-GISEL-NEXT: s_add_i32 s10, s10, s15 -; FIXEDABI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8 +; FIXEDABI-GISEL-NEXT: s_add_i32 s4, s4, s9 +; FIXEDABI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; FIXEDABI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; FIXEDABI-GISEL-NEXT: s_add_u32 s0, s0, s15 +; FIXEDABI-GISEL-NEXT: 
s_add_u32 s0, s0, s9 ; FIXEDABI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; FIXEDABI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 20, v2 -; FIXEDABI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s11 ; FIXEDABI-GISEL-NEXT: s_addc_u32 s1, s1, 0 -; FIXEDABI-GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] +; FIXEDABI-GISEL-NEXT: s_mov_b32 s14, s8 ; FIXEDABI-GISEL-NEXT: v_or_b32_e32 v31, v0, v1 ; FIXEDABI-GISEL-NEXT: s_mov_b64 s[8:9], 0 +; FIXEDABI-GISEL-NEXT: s_mov_b32 s12, s6 +; FIXEDABI-GISEL-NEXT: s_mov_b32 s13, s7 ; FIXEDABI-GISEL-NEXT: s_mov_b32 s32, 0 -; FIXEDABI-GISEL-NEXT: s_getpc_b64 s[16:17] -; FIXEDABI-GISEL-NEXT: s_add_u32 s16, s16, requires_all_inputs@rel32@lo+4 -; FIXEDABI-GISEL-NEXT: s_addc_u32 s17, s17, requires_all_inputs@rel32@hi+12 -; FIXEDABI-GISEL-NEXT: s_swappc_b64 s[30:31], s[16:17] +; FIXEDABI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s5 +; FIXEDABI-GISEL-NEXT: s_getpc_b64 s[4:5] +; FIXEDABI-GISEL-NEXT: s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4 +; FIXEDABI-GISEL-NEXT: s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12 +; FIXEDABI-GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] ; FIXEDABI-GISEL-NEXT: s_endpgm call void @requires_all_inputs() ret void Index: llvm/test/CodeGen/AMDGPU/addrspacecast.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/addrspacecast.ll +++ llvm/test/CodeGen/AMDGPU/addrspacecast.ll @@ -42,7 +42,7 @@ ; Test handling inside a non-kernel ; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast_func: -; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}} +; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[6:7], 0x10{{$}} ; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]] ; CI-DAG: v_cmp_ne_u32_e32 vcc, -1, v0 ; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc Index: llvm/test/CodeGen/AMDGPU/amdpal-callable.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/amdpal-callable.ll +++ llvm/test/CodeGen/AMDGPU/amdpal-callable.ll @@ -147,8 +147,7 @@ 
; GCN: amdpal.pipelines: ; GCN-NEXT: - .registers: -; SDAG-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf01ca{{$}} -; GISEL-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf01ce{{$}} +; GCN-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf01ca{{$}} ; GCN-NEXT: 0x2e13 (COMPUTE_PGM_RSRC2): 0x8001{{$}} ; GCN-NEXT: .shader_functions: ; GCN-NEXT: dynamic_stack: @@ -190,15 +189,13 @@ ; GFX8-NEXT: .sgpr_count: 0x28{{$}} ; GFX9-NEXT: .sgpr_count: 0x2c{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x90{{$}} -; SDAG-NEXT: .vgpr_count: 0x2a{{$}} -; GISEL-NEXT: .vgpr_count: 0x34{{$}} +; GCN-NEXT: .vgpr_count: 0x2a{{$}} ; GCN-NEXT: no_stack_indirect_call: ; GCN-NEXT: .lds_size: 0{{$}} ; GFX8-NEXT: .sgpr_count: 0x28{{$}} ; GFX9-NEXT: .sgpr_count: 0x2c{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}} -; SDAG-NEXT: .vgpr_count: 0x2a{{$}} -; GISEL-NEXT: .vgpr_count: 0x34{{$}} +; GCN-NEXT: .vgpr_count: 0x2a{{$}} ; GCN-NEXT: simple_lds: ; GCN-NEXT: .lds_size: 0x100{{$}} ; GCN-NEXT: .sgpr_count: 0x20{{$}} @@ -230,8 +227,7 @@ ; GFX8-NEXT: .sgpr_count: 0x28{{$}} ; GFX9-NEXT: .sgpr_count: 0x2c{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}} -; SDAG-NEXT: .vgpr_count: 0x2b{{$}} -; GISEL-NEXT: .vgpr_count: 0x34{{$}} +; GCN-NEXT: .vgpr_count: 0x2b{{$}} ; GCN-NEXT: simple_stack_recurse: ; GCN-NEXT: .lds_size: 0{{$}} ; GCN-NEXT: .sgpr_count: 0x26{{$}} Index: llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll +++ llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll @@ -898,10 +898,10 @@ ; CHECK-LABEL: spill_func: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 +; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[6:7] +; CHECK-NEXT: s_mov_b64 exec, 
s[4:5] ; CHECK-NEXT: s_waitcnt expcnt(1) ; CHECK-NEXT: v_writelane_b32 v0, s33, 0 ; CHECK-NEXT: v_writelane_b32 v0, s34, 1 @@ -973,7 +973,7 @@ ; CHECK-NEXT: v_writelane_b32 v1, s101, 4 ; CHECK-NEXT: v_writelane_b32 v0, s95, 62 ; CHECK-NEXT: v_writelane_b32 v1, s30, 5 -; CHECK-NEXT: s_mov_b32 s29, s4 +; CHECK-NEXT: s_mov_b32 s29, s12 ; CHECK-NEXT: v_writelane_b32 v0, s96, 63 ; CHECK-NEXT: v_writelane_b32 v1, s31, 6 ; CHECK-NEXT: s_cmp_eq_u32 s29, 0 Index: llvm/test/CodeGen/AMDGPU/call-argument-types.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -481,7 +481,7 @@ ; GCN-LABEL: {{^}}test_call_external_void_func_v3i32_imm: {{.*}} -; GCN-NOT: v3 +; GCN-NOT: v3{{$}} ; GCN-DAG: v_mov_b32_e32 v0, 3 ; GCN-DAG: v_mov_b32_e32 v1, 4 ; GCN-DAG: v_mov_b32_e32 v2, 5 @@ -586,7 +586,7 @@ ; GCN-DAG: buffer_load_dwordx4 v[20:23], off ; GCN-DAG: buffer_load_dwordx4 v[24:27], off ; GCN-DAG: buffer_load_dwordx4 v[28:31], off -; GCN-NOT: s_waitcnt +; GCN: buffer_store_dword v31, off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { %ptr = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef @@ -611,7 +611,8 @@ ; GCN-DAG: buffer_load_dwordx4 v[28:31], off ; GCN: s_waitcnt -; GCN: buffer_store_dword [[VAL1]], off, s[{{[0-9]+}}:{{[0-9]+}}], s32{{$}} +; GCN-DAG: buffer_store_dword [[VAL1]], off, s[{{[0-9]+}}:{{[0-9]+}}], s32 offset:4{{$}} +; GCN-DAG: buffer_store_dword v31, off, s[{{[0-9]+}}:{{[0-9]+}}], s32{{$}} ; GCN: s_swappc_b64 ; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { @@ -636,7 +637,6 @@ ; GCN-LABEL: {{^}}test_call_external_void_func_struct_i8_i32: ; GCN: buffer_load_ubyte v0, off ; GCN: buffer_load_dword v1, off -; GCN-NOT: s_waitcnt ; GCN: s_swappc_b64 define amdgpu_kernel void 
@test_call_external_void_func_struct_i8_i32() #0 { %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef @@ -739,13 +739,15 @@ ; GCN-LABEL: {{^}}tail_call_byval_align16: ; GCN-NOT: s32 -; GCN: buffer_load_dword [[VREG2:v[0-9]+]], off, s[0:3], s32 offset:12 -; GCN: buffer_load_dword [[VREG1:v[0-9]+]], off, s[0:3], s32 offset:8 +; GCN: buffer_load_dword [[VREG2:v[0-9]+]], off, s[0:3], s32 offset:28 +; GCN: buffer_load_dword [[VREG1:v[0-9]+]], off, s[0:3], s32{{$}} ; GCN: s_getpc_b64 -; GCN: buffer_store_dword [[VREG2]], off, s[0:3], s32 offset:4 +; GCN: buffer_store_dword [[VREG2]], off, s[0:3], s32 offset:20 +; GCN: buffer_load_dword [[VREG3:v[0-9]+]], off, s[0:3], s32 offset:24{{$}} ; GCN: buffer_store_dword [[VREG1]], off, s[0:3], s32{{$}} +; GCN: buffer_store_dword [[VREG3]], off, s[0:3], s32 offset:16{{$}} ; GCN-NOT: s32 ; GCN: s_setpc_b64 define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { @@ -757,11 +759,13 @@ ; GCN-LABEL: {{^}}tail_call_stack_passed_arg_alignment_v32i32_f64: ; GCN-NOT: s32 -; GCN: buffer_load_dword v32, off, s[0:3], s32{{$}} -; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:4 +; GCN-DAG: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GCN-DAG: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GCN-DAG: buffer_load_dword v31, off, s[0:3], s32{{$}} ; GCN: s_getpc_b64 -; GCN: buffer_store_dword v32, off, s[0:3], s32{{$}} -; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:4 +; GCN: buffer_store_dword v31, off, s[0:3], s32{{$}} +; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:8 ; GCN-NOT: s32 ; GCN: s_setpc_b64 define void @tail_call_stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, double %tmp) #0 { @@ -771,15 +775,16 @@ } ; GCN-LABEL: {{^}}stack_12xv3i32: +; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 11 +; GCN: buffer_store_dword [[REG11]], off, s[0:3], s32{{$}} ; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 ; GCN: 
buffer_store_dword [[REG12]], {{.*$}} ; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 -; GCN: buffer_store_dword [[REG13]], {{.*}} offset:4 +; GCN: buffer_store_dword [[REG13]], {{.*}} offset:8 ; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 -; GCN: buffer_store_dword [[REG14]], {{.*}} offset:8 +; GCN: buffer_store_dword [[REG14]], {{.*}} offset:12 ; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 -; GCN: buffer_store_dword [[REG15]], {{.*}} offset:12 -; GCN: v_mov_b32_e32 v31, 11 +; GCN: buffer_store_dword [[REG15]], {{.*}} offset:16 ; GCN: s_getpc define void @stack_12xv3i32() #0 { entry: @@ -800,15 +805,16 @@ } ; GCN-LABEL: {{^}}stack_12xv3f32: +; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000 +; GCN: buffer_store_dword [[REG11]], {{.*$}} ; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000 -; GCN: buffer_store_dword [[REG12]], {{.*$}} +; GCN: buffer_store_dword [[REG12]], {{.*}} offset:4 ; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000 -; GCN: buffer_store_dword [[REG13]], {{.*}} offset:4 +; GCN: buffer_store_dword [[REG13]], {{.*}} offset:8 ; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000 -; GCN: buffer_store_dword [[REG14]], {{.*}} offset:8 +; GCN: buffer_store_dword [[REG14]], {{.*}} offset:12 ; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000 -; GCN: buffer_store_dword [[REG15]], {{.*}} offset:12 -; GCN: v_mov_b32_e32 v31, 0x41300000 +; GCN: buffer_store_dword [[REG15]], {{.*}} offset:16 ; GCN: s_getpc define void @stack_12xv3f32() #0 { entry: @@ -829,25 +835,24 @@ } ; GCN-LABEL: {{^}}stack_8xv5i32: - +; GCN: v_mov_b32_e32 [[REG7:v[0-9]+]], 7 +; GCN: buffer_store_dword [[REG7]], {{.*$}} ; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 8 -; GCN: buffer_store_dword [[REG8]], {{.*$}} +; GCN: buffer_store_dword [[REG8]], {{.*}} offset:4 ; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 9 -; GCN: buffer_store_dword [[REG9]], {{.*}} offset:4 +; GCN: buffer_store_dword [[REG9]], {{.*}} offset:8 ; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 10 -; GCN: buffer_store_dword [[REG10]], {{.*}} offset:8 +; GCN: 
buffer_store_dword [[REG10]], {{.*}} offset:12 ; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 11 -; GCN: buffer_store_dword [[REG11]], {{.*}} offset:12 +; GCN: buffer_store_dword [[REG11]], {{.*}} offset:16 ; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 -; GCN: buffer_store_dword [[REG12]], {{.*}} offset:16 +; GCN: buffer_store_dword [[REG12]], {{.*}} offset:20 ; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 -; GCN: buffer_store_dword [[REG13]], {{.*}} offset:20 +; GCN: buffer_store_dword [[REG13]], {{.*}} offset:24 ; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 -; GCN: buffer_store_dword [[REG14]], {{.*}} offset:24 +; GCN: buffer_store_dword [[REG14]], {{.*}} offset:28 ; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 -; GCN: buffer_store_dword [[REG15]], {{.*}} offset:28 - -; GCN: v_mov_b32_e32 v31, 7 +; GCN: buffer_store_dword [[REG15]], {{.*}} offset:32 ; GCN: s_getpc define void @stack_8xv5i32() #0 { entry: @@ -864,24 +869,24 @@ } ; GCN-LABEL: {{^}}stack_8xv5f32: +; GCN: v_mov_b32_e32 [[REG7:v[0-9]+]], 0x40e00000 +; GCN: buffer_store_dword [[REG7]], {{.*$}} ; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 0x41000000 -; GCN: buffer_store_dword [[REG8]], {{.*$}} +; GCN: buffer_store_dword [[REG8]], {{.*}} offset:4 ; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 0x41100000 -; GCN: buffer_store_dword [[REG9]], {{.*}} offset:4 +; GCN: buffer_store_dword [[REG9]], {{.*}} offset:8 ; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000 -; GCN: buffer_store_dword [[REG10]], {{.*}} offset:8 +; GCN: buffer_store_dword [[REG10]], {{.*}} offset:12 ; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000 -; GCN: buffer_store_dword [[REG11]], {{.*}} offset:12 +; GCN: buffer_store_dword [[REG11]], {{.*}} offset:16 ; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000 -; GCN: buffer_store_dword [[REG12]], {{.*}} offset:16 +; GCN: buffer_store_dword [[REG12]], {{.*}} offset:20 ; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000 -; GCN: buffer_store_dword [[REG13]], {{.*}} offset:20 +; GCN: buffer_store_dword [[REG13]], {{.*}} offset:24 
; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000 -; GCN: buffer_store_dword [[REG14]], {{.*}} offset:24 +; GCN: buffer_store_dword [[REG14]], {{.*}} offset:28 ; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000 -; GCN: buffer_store_dword [[REG15]], {{.*}} offset:28 - -; GCN: v_mov_b32_e32 v31, 0x40e00000 +; GCN: buffer_store_dword [[REG15]], {{.*}} offset:32 ; GCN: s_getpc define void @stack_8xv5f32() #0 { entry: @@ -907,6 +912,9 @@ <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>) #0 declare hidden void @external_void_func_8xv5f32(<5 x float>, <5 x float>, <5 x float>, <5 x float>, <5 x float>, <5 x float>, <5 x float>, <5 x float>) #0 -attributes #0 = { nounwind } +attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } attributes #1 = { nounwind readnone } attributes #2 = { nounwind noinline } + + + Index: llvm/test/CodeGen/AMDGPU/call-constexpr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/call-constexpr.ll +++ llvm/test/CodeGen/AMDGPU/call-constexpr.ll @@ -65,8 +65,8 @@ ; GCN-LABEL: {{^}}use_workitem_id_x: ; GCN: s_waitcnt -; GCN-NEXT: v_and_b32_e32 v1, 0x3ff, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GCN-NEXT: v_and_b32_e32 [[TMP:v[0-9]+]], 0x3ff, v31 +; GCN-NEXT: v_add_i32_e32 v0, vcc, [[TMP]], v0 ; GCN-NEXT: s_setpc_b64 define hidden i32 @use_workitem_id_x(i32 %arg0) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -75,7 +75,7 @@ } ; GCN-LABEL: {{^}}test_bitcast_use_workitem_id_x: -; GCN: v_mov_b32_e32 v1, v0 +; GCN: v_mov_b32_e32 v31, v0 ; GCN: s_getpc_b64 ; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, use_workitem_id_x@rel32@lo+4 ; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, use_workitem_id_x@rel32@hi+12 Index: 
llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll +++ llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll @@ -3,7 +3,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MUBUF %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FLATSCR %s -declare hidden void @external_void_func_void() #0 +declare hidden void @external_void_func_void() #3 ; GCN-LABEL: {{^}}test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: ; GCN: s_getpc_b64 s[34:35] @@ -341,3 +341,4 @@ attributes #0 = { nounwind } attributes #1 = { nounwind readnone } attributes #2 = { nounwind noinline } +attributes #3 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } Index: llvm/test/CodeGen/AMDGPU/call-waitcnt.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/call-waitcnt.ll +++ llvm/test/CodeGen/AMDGPU/call-waitcnt.ll @@ -5,19 +5,20 @@ define amdgpu_kernel void @call_memory_arg_load(i32 addrspace(3)* %ptr, i32) #0 { ; GCN-LABEL: call_memory_arg_load: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[8:9], 0x0 -; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GCN-NEXT: s_add_u32 s0, s0, s17 +; GCN-NEXT: s_load_dword s6, s[6:7], 0x0 +; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11 +; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; GCN-NEXT: s_add_u32 s0, s0, s11 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 
+; GCN-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NEXT: ds_read_b32 v0, v0 +; GCN-NEXT: s_mov_b64 s[6:7], s[4:5] ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, func@rel32@hi+12 -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_getpc_b64 s[8:9] +; GCN-NEXT: s_add_u32 s8, s8, func@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s9, s9, func@rel32@hi+12 +; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GCN-NEXT: s_endpgm %vgpr = load volatile i32, i32 addrspace(3)* %ptr call void @func(i32 %vgpr) @@ -28,20 +29,21 @@ define amdgpu_kernel void @call_memory_no_dep(i32 addrspace(1)* %ptr, i32) #0 { ; GCN-LABEL: call_memory_no_dep: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GCN-NEXT: s_add_u32 s0, s0, s17 +; GCN-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11 +; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; GCN-NEXT: s_add_u32 s0, s0, s11 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_store_dword v0, v0, s[4:5] +; GCN-NEXT: global_store_dword v0, v0, s[6:7] +; GCN-NEXT: s_mov_b64 s[6:7], s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_getpc_b64 s[6:7] -; GCN-NEXT: s_add_u32 s6, s6, func@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s7, s7, func@rel32@hi+12 -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_getpc_b64 s[8:9] +; GCN-NEXT: s_add_u32 s8, s8, func@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s9, s9, func@rel32@hi+12 +; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GCN-NEXT: s_endpgm store i32 0, i32 addrspace(1)* %ptr call void @func(i32 0) @@ -52,18 +54,19 @@ define amdgpu_kernel void @call_no_wait_after_call(i32 addrspace(1)* %ptr, i32) #0 { ; GCN-LABEL: call_no_wait_after_call: ; GCN: ; %bb.0: -; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17 
-; GCN-NEXT: s_load_dwordx2 s[34:35], s[8:9], 0x0 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GCN-NEXT: s_add_u32 s0, s0, s17 +; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11 +; GCN-NEXT: s_load_dwordx2 s[34:35], s[6:7], 0x0 +; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; GCN-NEXT: s_add_u32 s0, s0, s11 ; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b64 s[6:7], s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, func@rel32@hi+12 +; GCN-NEXT: s_getpc_b64 s[8:9] +; GCN-NEXT: s_add_u32 s8, s8, func@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s9, s9, func@rel32@hi+12 ; GCN-NEXT: v_mov_b32_e32 v40, 0 -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GCN-NEXT: global_store_dword v40, v40, s[34:35] ; GCN-NEXT: s_endpgm call void @func(i32 0) @@ -74,18 +77,19 @@ define amdgpu_kernel void @call_no_wait_after_call_return_val(i32 addrspace(1)* %ptr, i32) #0 { ; GCN-LABEL: call_no_wait_after_call_return_val: ; GCN: ; %bb.0: -; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GCN-NEXT: s_load_dwordx2 s[34:35], s[8:9], 0x0 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GCN-NEXT: s_add_u32 s0, s0, s17 +; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11 +; GCN-NEXT: s_load_dwordx2 s[34:35], s[6:7], 0x0 +; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; GCN-NEXT: s_add_u32 s0, s0, s11 ; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b64 s[6:7], s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, func.return@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, func.return@rel32@hi+12 +; GCN-NEXT: s_getpc_b64 s[8:9] +; GCN-NEXT: s_add_u32 s8, s8, func.return@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s9, s9, func.return@rel32@hi+12 ; GCN-NEXT: v_mov_b32_e32 v40, 0 -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9] ; 
GCN-NEXT: global_store_dword v40, v0, s[34:35] ; GCN-NEXT: s_endpgm %rv = call i32 @func.return(i32 0) @@ -97,18 +101,19 @@ define amdgpu_kernel void @call_got_load(i32 addrspace(1)* %ptr, i32) #0 { ; GCN-LABEL: call_got_load: ; GCN: ; %bb.0: -; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GCN-NEXT: s_add_u32 s0, s0, s17 +; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11 +; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; GCN-NEXT: s_add_u32 s0, s0, s11 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, got.func@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, got.func@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_getpc_b64 s[6:7] +; GCN-NEXT: s_add_u32 s6, s6, got.func@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s7, s7, got.func@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GCN-NEXT: s_mov_b64 s[6:7], s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GCN-NEXT: s_endpgm call void @got.func(i32 0) ret void @@ -149,4 +154,4 @@ declare hidden i32 @func.return(i32) #0 declare void @got.func(i32) #0 -attributes #0 = { nounwind } +attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } Index: llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll +++ llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll @@ -256,7 +256,7 @@ ; GCN: .amdhsa_system_sgpr_workgroup_id_y 1 ; GCN: .amdhsa_system_sgpr_workgroup_id_z 1 ; GCN: 
.amdhsa_system_sgpr_workgroup_info 0 -; GCN: .amdhsa_system_vgpr_workitem_id 2 +; GCN: .amdhsa_system_vgpr_workitem_id 0 define amdgpu_kernel void @kern_indirect_use_every_sgpr_input(i8) #1 { call void @use_every_sgpr_input() ret void @@ -282,7 +282,7 @@ ; GCN: .amdhsa_system_sgpr_workgroup_id_y 1 ; GCN: .amdhsa_system_sgpr_workgroup_id_z 1 ; GCN: .amdhsa_system_sgpr_workgroup_info 0 -; GCN: .amdhsa_system_vgpr_workitem_id 2 +; GCN: .amdhsa_system_vgpr_workitem_id 0 define amdgpu_kernel void @kern_indirect_use_every_sgpr_input_no_kernargs() #2 { call void @use_every_sgpr_input() ret void Index: llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll +++ llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,CIVI %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,GFX9 %s +; RUN: llc -amdgpu-fixed-function-abi=0 -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,CIVI %s +; RUN: llc -amdgpu-fixed-function-abi=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,GFX9 %s ; GCN-LABEL: {{^}}use_dispatch_ptr: ; GCN: s_load_dword s{{[0-9]+}}, s[4:5] Index: llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll =================================================================== --- 
llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll +++ llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}use_workitem_id_x: ; GCN: s_waitcnt -; GCN: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v0 +; GCN: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v31 ; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]] ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -15,7 +15,7 @@ ; GCN-LABEL: {{^}}use_workitem_id_y: ; GCN: s_waitcnt -; GCN: v_bfe_u32 [[ID:v[0-9]+]], v0, 10, 10 +; GCN: v_bfe_u32 [[ID:v[0-9]+]], v31, 10, 10 ; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]] ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -27,7 +27,7 @@ ; GCN-LABEL: {{^}}use_workitem_id_z: ; GCN: s_waitcnt -; GCN: v_bfe_u32 [[ID:v[0-9]+]], v0, 20, 10 +; GCN: v_bfe_u32 [[ID:v[0-9]+]], v31, 20, 10 ; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]] ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -39,8 +39,8 @@ ; GCN-LABEL: {{^}}use_workitem_id_xy: ; GCN: s_waitcnt -; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0 -; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10 +; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31 +; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]] ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]] ; GCN-NEXT: s_waitcnt @@ -55,9 +55,9 @@ ; GCN-LABEL: {{^}}use_workitem_id_xyz: ; GCN: s_waitcnt -; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0 -; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10 -; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10 +; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31 +; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10 +; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]] ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]] ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, 
[[IDZ]] @@ -75,8 +75,8 @@ ; GCN-LABEL: {{^}}use_workitem_id_xz: ; GCN: s_waitcnt -; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0 -; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10 +; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31 +; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]] ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]] ; GCN-NEXT: s_waitcnt @@ -91,8 +91,8 @@ ; GCN-LABEL: {{^}}use_workitem_id_yz: ; GCN: s_waitcnt -; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10 -; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10 +; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10 +; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]] ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]] ; GCN-NEXT: s_waitcnt @@ -106,10 +106,9 @@ } ; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_x: - -; GCN-NOT: v0 +; GCN: v_mov_b32_e32 v31, v0 ; GCN: s_swappc_b64 -; GCN-NOT: v0 +; GCN-NOT: v31 ; GCN: .amdhsa_system_vgpr_workitem_id 0 define amdgpu_kernel void @kern_indirect_use_workitem_id_x() #1 { @@ -121,9 +120,11 @@ ; GCN-NOT: v0 ; GCN-NOT: v1 -; UNPACKED-TID: v_lshlrev_b32_e32 v0, 10, v1 -; UNPACKED-TID-NOT: v0 -; UNPACKED-TID-NOT: v1 +; GCN-NOT: v31 +; PACKED-TID: v_mov_b32_e32 v31, v0 +; UNPACKED-TID: v_lshlrev_b32_e32 v31, 10, v1 +; GCN-NOT: v0 +; GCN-NOT: v1 ; GCN: s_swappc_b64 ; GCN: .amdhsa_system_vgpr_workitem_id 1 @@ -136,9 +137,11 @@ ; GCN-NOT: v0 ; GCN-NOT: v2 -; UNPACKED-TID: v_lshlrev_b32_e32 v0, 20, v2 -; UNPACKED-TID-NOT: v0 -; UNPACKED-TID-NOT: v1 +; GCN-NOT: v31 +; PACKED-TID: v_mov_b32_e32 v31, v0 +; UNPACKED-TID: v_lshlrev_b32_e32 v31, 20, v2 +; GCN-NOT: v0 +; GCN-NOT: v1 ; GCN: s_swappc_b64 ; GCN: .amdhsa_system_vgpr_workitem_id 2 @@ -148,10 +151,11 @@ } ; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xy: -; UNPACKED-TID-NOT: v0 -; UNPACKED-TID-NOT: v1 +; GCN-NOT: v0 +; GCN-NOT: v1 +; 
PACKED-TID: v_mov_b32_e32 v31, v0 ; UNPACKED-TID: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1 -; UNPACKED-TID: v_or_b32_e32 v0, v0, [[IDY]] +; UNPACKED-TID: v_or_b32_e32 v31, v0, [[IDY]] ; GCN-NOT: v0 ; GCN-NOT: v1 ; GCN: s_swappc_b64 @@ -161,10 +165,12 @@ } ; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xz: -; UNPACKED-TID-NOT: v0 -; UNPACKED-TID-NOT: v2 +; GCN-NOT: v0 +; GCN-NOT: v2 + +; PACKED-TID: v_mov_b32_e32 v31, v0 ; UNPACKED-TID: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2 -; UNPACKED-TID: v_or_b32_e32 v0, v0, [[IDZ]] +; UNPACKED-TID: v_or_b32_e32 v31, v0, [[IDZ]] ; GCN-NOT: v0 ; GCN-NOT: v2 ; GCN: s_swappc_b64 @@ -174,11 +180,12 @@ } ; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_yz: -; UNPACKED-TID-NOT: v1 -; UNPACKED-TID-NOT: v2 +; GCN-NOT: v1 +; GCN-NOT: v2 +; PACKED-TID: v_mov_b32_e32 v31, v0 ; UNPACKED-TID-DAG: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1 ; UNPACKED-TID-DAG: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2 -; UNPACKED-TID: v_or_b32_e32 v0, [[IDY]], [[IDZ]] +; UNPACKED-TID: v_or_b32_e32 v31, [[IDY]], [[IDZ]] ; GCN-NOT: v1 ; GCN-NOT: v2 ; GCN: s_swappc_b64 @@ -188,13 +195,16 @@ } ; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xyz: -; UNPACKED-TID-NOT: v0 -; UNPACKED-TID-NOT: v1 -; UNPACKED-TID-NOT: v2 +; GCN-NOT: v0 +; GCN-NOT: v1 +; GCN-NOT: v2 + +; PACKED-TID: v_mov_b32_e32 v31, v0 + ; UNPACKED-TID-DAG: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1 ; UNPACKED-TID-DAG: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2 ; UNPACKED-TID-DAG: v_or_b32_e32 v0, v0, [[IDY]] -; UNPACKED-TID-DAG: v_or_b32_e32 v0, v0, [[IDZ]] +; UNPACKED-TID-DAG: v_or_b32_e32 v31, v0, [[IDZ]] ; GCN-NOT: v0 ; GCN-NOT: v1 ; GCN-NOT: v2 @@ -233,7 +243,7 @@ ; GCN-LABEL: {{^}}other_arg_use_workitem_id_x: ; GCN: s_waitcnt -; GCN-DAG: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v1 +; GCN-DAG: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v31 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]] define void 
@other_arg_use_workitem_id_x(i32 %arg0) #1 { @@ -245,7 +255,7 @@ ; GCN-LABEL: {{^}}other_arg_use_workitem_id_y: ; GCN: s_waitcnt -; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v1, 10, 10 +; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v31, 10, 10 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]] define void @other_arg_use_workitem_id_y(i32 %arg0) #1 { @@ -257,7 +267,7 @@ ; GCN-LABEL: {{^}}other_arg_use_workitem_id_z: ; GCN: s_waitcnt -; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v1, 20, 10 +; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v31, 20, 10 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]] define void @other_arg_use_workitem_id_z(i32 %arg0) #1 { @@ -270,7 +280,7 @@ ; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_x: -; GCN: v_mov_b32_e32 v1, v0 +; GCN: v_mov_b32_e32 v31, v0 ; GCN: v_mov_b32_e32 v0, 0x22b ; GCN: s_swappc_b64 @@ -283,8 +293,8 @@ ; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_y: -; UNPACKED-TID: v_lshlrev_b32_e32 v1, 10, v1 -; PACKED-TID: v_mov_b32_e32 v1, v0 +; UNPACKED-TID: v_lshlrev_b32_e32 v31, 10, v1 +; PACKED-TID: v_mov_b32_e32 v31, v0 ; GCN-NOT: v1 ; GCN: v_mov_b32_e32 v0, 0x22b ; GCN-NOT: v1 @@ -300,8 +310,8 @@ ; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_z: ; GCN-DAG: v_mov_b32_e32 v0, 0x22b -; UNPACKED-TID-DAG: v_lshlrev_b32_e32 v1, 20, v2 -; PACKED-TID-DAG: v_mov_b32_e32 v1, v0 +; UNPACKED-TID-DAG: v_lshlrev_b32_e32 v31, 20, v2 +; PACKED-TID-DAG: v_mov_b32_e32 v31, v0 ; GCN: s_swappc_b64 ; GCN-NOT: v0 @@ -312,10 +322,11 @@ } ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x: -; GCN: buffer_load_dword v32, off, s[0:3], s32{{$}} -; GCN: v_and_b32_e32 v32, 0x3ff, v32 -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, v32 -; GCN: s_setpc_b64 +; GCN-DAG: v_and_b32_e32 v31, 0x3ff, v31 +; GCN-DAG: buffer_load_dword [[LOAD_ARG31:v[0-9]+]], off, s[0:3], s32{{$}} +; 
GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[LOAD_ARG31]] +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 define void @too_many_args_use_workitem_id_x( i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, @@ -366,7 +377,8 @@ ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x: ; GCN: s_mov_b32 s32, 0 -; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}} +; GCN: buffer_store_dword v1, off, s[0:3], s32{{$}} +; GCN: v_mov_b32_e32 v31, v0 ; GCN: s_swappc_b64 ; GCN: .amdhsa_system_vgpr_workitem_id 0 @@ -384,9 +396,12 @@ } ; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x: +; GCN-NOT: v31 ; GCN: s_mov_b32 s33, s32 -; GCN: buffer_store_dword v1, off, s[0:3], s32{{$}} +; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}} +; GCN-NOT: v31 ; GCN: s_swappc_b64 +; GCN-NOT: v31 define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 { store volatile i32 %arg0, i32 addrspace(1)* undef call void @too_many_args_use_workitem_id_x( @@ -428,19 +443,17 @@ } ; stack layout: -; frame[0] = byval arg32 -; frame[1] = stack passed workitem ID x -; frame[2] = VGPR spill slot +; frame[0] = stack passed arg31 +; frame[1] = byval arg32 ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_byval: -; GFX7: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX90A: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-DAG: s_waitcnt -; GFX7: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 -; GFX90A: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32, -; GFX7: buffer_load_dword v0, off, s[0:3], s32 glc{{$}} -; GFX90A: buffer_load_dword v0, off, s[0:3], s32 glc{{$}} -; GCN: s_setpc_b64 +; GCN-DAG: v_and_b32_e32 v31, 0x3ff, v31 +; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v31 +; GCN-DAG: buffer_load_dword [[LOAD_ARG31:v[0-9]+]], off, s[0:3], s32{{$}} +; GCN: {{flat|global}}_store_dword
v{{\[[0-9]+:[0-9]+\]}}, [[LOAD_ARG31]] +; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4 glc +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 define void @too_many_args_use_workitem_id_x_byval( i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, @@ -488,19 +501,22 @@ ret void } -; sp[0] = byval -; sp[1] = ?? -; sp[2] = stack passed workitem ID x +; sp[0] = stack passed %arg31 +; sp[1] = byval ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_byval: -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} +; Local stack object initialization. Offset 0 is the emergency spill slot. +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} +; GCN-DAG: s_movk_i32 s32, 0x400 ; GCN: buffer_store_dword [[K]], off, s[0:3], 0 offset:4 -; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], 0 offset:4 -; GCN: s_movk_i32 s32, 0x400 -; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4 -; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}} +; Pass %arg31 on stack +; GCN: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140{{$}} +; GCN: buffer_store_dword [[K1:v[0-9]+]], off, s[0:3], s32{{$}} + +; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], 0 offset:4 +; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}} ; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]], ; GCN: s_swappc_b64 @@ -526,7 +542,7 @@ ; GFX7: buffer_store_dword [[K]], off, s[0:3], s33{{$}} ; GFX90A: buffer_store_dword [[K]], off, s[0:3], s33{{$}} ; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33{{$}} -; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}} +; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}} ; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]], ; GCN: s_swappc_b64 define void @func_call_too_many_args_use_workitem_id_x_byval() #1 { @@ -546,22 +562,24 @@ } ; GCN-LABEL:
{{^}}too_many_args_use_workitem_id_xyz: -; GFX90A: buffer_load_dword v32, off, s[0:3], s32{{$}} -; GFX90A: v_and_b32_e32 v33, 0x3ff, v32 -; GFX90A: v_bfe_u32 v34, v32, 10, 10 -; GCN90A: v_bfe_u32 v32, v32, 20, 10 -; GFX7: buffer_load_dword v32, off, s[0:3], s32{{$}} -; GFX7: v_and_b32_e32 v33, 0x3ff, v32 -; GFX7: v_bfe_u32 v33, v32, 10, 10 -; GCN7: v_bfe_u32 v32, v32, 20, 10 -; GFX7: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v33{{$}} +; GFX90A: buffer_load_dword [[LOAD_ARG31:v[0-9]+]], off, s[0:3], s32{{$}} +; GFX90A: v_and_b32_e32 [[ID_X:v[0-9]+]], 0x3ff, v31 +; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, [[ID_X]], off{{$}} +; GFX90A: v_bfe_u32 [[ID_Y:v[0-9]+]], v31, 10, 10 +; GFX90A: v_bfe_u32 [[ID_Z:v[0-9]+]], v31, 20, 10 +; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, [[ID_Y]], off{{$}} +; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, [[ID_Z]], off{{$}} + +; GFX7: v_and_b32_e32 v32, 0x3ff, v31 +; GFX7: v_bfe_u32 v32, v31, 10, 10 +; GCN7: v_bfe_u32 v31, v31, 20, 10 ; GFX7: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32{{$}} -; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, v33, off{{$}} -; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, v34, off{{$}} -; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, v32, off{{$}} +; GFX7: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31{{$}} +; GFX7: buffer_load_dword [[LOAD_ARG31:v[0-9]+]], off, s[0:3], s32{{$}} + +; GFX7: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, [[LOAD_ARG31]] +; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, [[LOAD_ARG31]] -; GFX7-COUNT-32: flat_store_dword v{{\[[0-9]+:[0-9]+]}} -; GFX90A-COUNT-32: global_store_dword v{{\[[0-9]+:[0-9]+]}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @too_many_args_use_workitem_id_xyz( @@ -624,11 +642,12 @@ ; UNPACKED-TID-DAG: v_lshlrev_b32_e32 v1, 10, v1 ; UNPACKED-TID-DAG: v_or_b32_e32 v0, v0, v1 ; UNPACKED-TID-DAG: v_lshlrev_b32_e32 v2, 20, v2 -; UNPACKED-TID-DAG: v_or_b32_e32 v0, v0, v2 +; UNPACKED-TID-DAG: v_or_b32_e32 v31, v0, v2 ; 
PACKED-TID-NOT: v0 ; PACKED-TID-NOT: v1 ; PACKED-TID-NOT: v2 -; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}} +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x140 +; GCN-DAG: buffer_store_dword [[K]], off, s[0:3], s32{{$}} ; GCN: s_swappc_b64 ; GCN: .amdhsa_system_vgpr_workitem_id 2 Index: llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VARABI %s -; RUN: llc -amdgpu-fixed-function-abi -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FIXEDABI %s +; RUN: llc -amdgpu-fixed-function-abi=0 -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VARABI %s +; RUN: llc -amdgpu-fixed-function-abi=1 -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FIXEDABI %s ; GCN-LABEL: {{^}}use_workitem_id_x: ; GCN: s_waitcnt @@ -128,8 +128,7 @@ } ; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_x: -; VARABI: enable_vgpr_workitem_id = 0 -; FIXEDABI: enable_vgpr_workitem_id = 2 +; GCN: enable_vgpr_workitem_id = 0 ; FIXEDABI-NOT: v0 ; FIXEDABI-NOT: v31 @@ -146,9 +145,7 @@ } ; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_y: -; VARABI: enable_vgpr_workitem_id = 1 -; FIXEDABI: enable_vgpr_workitem_id = 2 - +; GCN: enable_vgpr_workitem_id = 1 ; VARABI-NOT: v31 ; VARABI: v_lshlrev_b32_e32 v0, 10, v1 @@ -354,8 +351,7 @@ ; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_x: -; VARABI: 
enable_vgpr_workitem_id = 0 -; FIXEDABI: enable_vgpr_workitem_id = 2 +; GCN: enable_vgpr_workitem_id = 0 ; VARABI: v_mov_b32_e32 v1, v0 ; VARABI: v_mov_b32_e32 v0, 0x22b @@ -372,7 +368,7 @@ ; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_y: -; VARABI: enable_vgpr_workitem_id = 1 +; GCN: enable_vgpr_workitem_id = 1 ; VARABI: v_lshlrev_b32_e32 v1, 10, v1 ; VARABI-NOT: v1 @@ -381,8 +377,6 @@ ; VARABI: s_swappc_b64 ; VARABI-NOT: v0 -; FIXEDABI: enable_vgpr_workitem_id = 2 - ; FIXEDABI-NOT: v0 ; FIXEDABI-NOT: v1 ; FIXEDABI-NOT: v2 @@ -467,14 +461,13 @@ } ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x: -; VARABI: enable_vgpr_workitem_id = 0 +; GCN: enable_vgpr_workitem_id = 0 ; VARABI: s_mov_b32 s32, 0 ; VARABI: buffer_store_dword v0, off, s[0:3], s32{{$}} ; VARABI: s_swappc_b64 -; FIXEDABI: enable_vgpr_workitem_id = 2 ; FIXEDABI-NOT: v0 ; FIXEDABI-NOT: v1 ; FIXEDABI-NOT: v2 Index: llvm/test/CodeGen/AMDGPU/cc-update.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/cc-update.ll +++ llvm/test/CodeGen/AMDGPU/cc-update.ll @@ -57,14 +57,21 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_add_i32 s12, s12, s17 ; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX803-NEXT: s_add_u32 s0, s0, s17 +; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX803-NEXT: s_addc_u32 s1, s1, 0 +; GFX803-NEXT: s_mov_b32 s13, s15 +; GFX803-NEXT: s_mov_b32 s12, s14 +; GFX803-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX803-NEXT: s_mov_b32 s14, s16 ; GFX803-NEXT: s_mov_b32 s32, 0 -; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 -; GFX803-NEXT: s_getpc_b64 s[4:5] -; GFX803-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 -; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 -; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX803-NEXT: s_getpc_b64 s[18:19] +; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 +; GFX803-NEXT: 
s_addc_u32 s19, s19, ex@rel32@hi+12 +; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX803-NEXT: s_endpgm ; ; GFX900-LABEL: test_kern_call: @@ -72,12 +79,18 @@ ; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX900-NEXT: s_add_u32 s0, s0, s17 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX900-NEXT: s_addc_u32 s1, s1, 0 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX900-NEXT: s_mov_b32 s14, s16 ; GFX900-NEXT: s_mov_b32 s32, 0 -; GFX900-NEXT: s_getpc_b64 s[4:5] -; GFX900-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 -; GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 -; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX900-NEXT: s_getpc_b64 s[18:19] +; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 +; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 +; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX900-NEXT: s_endpgm ; ; GFX1010-LABEL: test_kern_call: @@ -87,12 +100,18 @@ ; GFX1010-NEXT: s_addc_u32 s13, s13, 0 ; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1010-NEXT: s_add_u32 s0, s0, s17 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0 -; GFX1010-NEXT: s_getpc_b64 s[4:5] -; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 -; GFX1010-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 -; GFX1010-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1010-NEXT: s_mov_b32 s13, s15 +; GFX1010-NEXT: s_mov_b32 s12, s14 +; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1010-NEXT: s_mov_b32 s14, s16 +; GFX1010-NEXT: s_getpc_b64 s[18:19] +; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 +; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 +; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX1010-NEXT: s_endpgm entry: tail call void @ex() #0 @@ -104,17 +123,24 @@ ; GFX803: ; %bb.0: 
; %entry ; GFX803-NEXT: s_add_i32 s12, s12, s17 ; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX803-NEXT: s_add_u32 s0, s0, s17 +; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX803-NEXT: s_addc_u32 s1, s1, 0 -; GFX803-NEXT: v_mov_b32_e32 v0, 0 +; GFX803-NEXT: s_mov_b32 s13, s15 +; GFX803-NEXT: s_mov_b32 s12, s14 +; GFX803-NEXT: v_mov_b32_e32 v3, 0 +; GFX803-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX803-NEXT: s_mov_b32 s14, s16 ; GFX803-NEXT: s_movk_i32 s32, 0x400 -; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 -; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; GFX803-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: s_getpc_b64 s[4:5] -; GFX803-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 -; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 -; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX803-NEXT: s_getpc_b64 s[18:19] +; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 +; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 +; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX803-NEXT: s_endpgm ; ; GFX900-LABEL: test_kern_stack_and_call: @@ -122,15 +148,21 @@ ; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX900-NEXT: s_add_u32 s0, s0, s17 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX900-NEXT: s_addc_u32 s1, s1, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX900-NEXT: s_mov_b32 s14, s16 ; GFX900-NEXT: s_movk_i32 s32, 0x400 -; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4 ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_getpc_b64 s[4:5] -; GFX900-NEXT: 
s_add_u32 s4, s4, ex@rel32@lo+4 -; GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 -; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX900-NEXT: s_getpc_b64 s[18:19] +; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 +; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 +; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX900-NEXT: s_endpgm ; ; GFX1010-LABEL: test_kern_stack_and_call: @@ -140,15 +172,21 @@ ; GFX1010-NEXT: s_addc_u32 s13, s13, 0 ; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX1010-NEXT: v_mov_b32_e32 v0, 0 +; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1010-NEXT: v_mov_b32_e32 v3, 0 ; GFX1010-NEXT: s_add_u32 s0, s0, s17 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0 -; GFX1010-NEXT: s_getpc_b64 s[4:5] -; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 -; GFX1010-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 -; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; GFX1010-NEXT: s_mov_b32 s13, s15 +; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1010-NEXT: s_mov_b32 s12, s14 +; GFX1010-NEXT: s_mov_b32 s14, s16 +; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4 ; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1010-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1010-NEXT: s_getpc_b64 s[18:19] +; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 +; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 +; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX1010-NEXT: s_endpgm entry: %x = alloca i32, align 4, addrspace(5) @@ -217,15 +255,22 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_add_i32 s12, s12, s17 ; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX803-NEXT: s_add_u32 s0, s0, s17 +; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX803-NEXT: s_addc_u32 s1, s1, 0 +; GFX803-NEXT: s_mov_b32 s13, s15 +; 
GFX803-NEXT: s_mov_b32 s12, s14 +; GFX803-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX803-NEXT: s_mov_b32 s14, s16 ; GFX803-NEXT: s_mov_b32 s32, 0 ; GFX803-NEXT: s_mov_b32 s33, 0 -; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 -; GFX803-NEXT: s_getpc_b64 s[4:5] -; GFX803-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 -; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 -; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX803-NEXT: s_getpc_b64 s[18:19] +; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 +; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 +; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX803-NEXT: s_endpgm ; ; GFX900-LABEL: test_force_fp_kern_call: @@ -233,13 +278,19 @@ ; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX900-NEXT: s_add_u32 s0, s0, s17 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX900-NEXT: s_addc_u32 s1, s1, 0 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX900-NEXT: s_mov_b32 s14, s16 ; GFX900-NEXT: s_mov_b32 s32, 0 ; GFX900-NEXT: s_mov_b32 s33, 0 -; GFX900-NEXT: s_getpc_b64 s[4:5] -; GFX900-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 -; GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 -; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX900-NEXT: s_getpc_b64 s[18:19] +; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 +; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 +; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX900-NEXT: s_endpgm ; ; GFX1010-LABEL: test_force_fp_kern_call: @@ -250,12 +301,18 @@ ; GFX1010-NEXT: s_addc_u32 s13, s13, 0 ; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX1010-NEXT: s_add_u32 s0, s0, s17 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0 -; GFX1010-NEXT: s_getpc_b64 s[4:5] -; GFX1010-NEXT: s_add_u32 s4, 
s4, ex@rel32@lo+4 -; GFX1010-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 -; GFX1010-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1010-NEXT: s_mov_b32 s13, s15 +; GFX1010-NEXT: s_mov_b32 s12, s14 +; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1010-NEXT: s_mov_b32 s14, s16 +; GFX1010-NEXT: s_getpc_b64 s[18:19] +; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 +; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 +; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX1010-NEXT: s_endpgm entry: tail call void @ex() #2 @@ -267,18 +324,25 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_add_i32 s12, s12, s17 ; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX803-NEXT: s_add_u32 s0, s0, s17 +; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: s_mov_b32 s33, 0 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX803-NEXT: s_addc_u32 s1, s1, 0 -; GFX803-NEXT: v_mov_b32_e32 v0, 0 +; GFX803-NEXT: s_mov_b32 s13, s15 +; GFX803-NEXT: s_mov_b32 s12, s14 +; GFX803-NEXT: v_mov_b32_e32 v3, 0 +; GFX803-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX803-NEXT: s_mov_b32 s14, s16 ; GFX803-NEXT: s_movk_i32 s32, 0x400 -; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 -; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 +; GFX803-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: s_getpc_b64 s[4:5] -; GFX803-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 -; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 -; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX803-NEXT: s_getpc_b64 s[18:19] +; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 +; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 +; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX803-NEXT: s_endpgm ; ; GFX900-LABEL: test_force_fp_kern_stack_and_call: @@ -286,16 +350,22 @@ ; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX900-NEXT: s_add_u32 s0, s0, 
s17 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX900-NEXT: s_mov_b32 s33, 0 ; GFX900-NEXT: s_addc_u32 s1, s1, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX900-NEXT: s_mov_b32 s14, s16 ; GFX900-NEXT: s_movk_i32 s32, 0x400 -; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 +; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4 ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_getpc_b64 s[4:5] -; GFX900-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 -; GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 -; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX900-NEXT: s_getpc_b64 s[18:19] +; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 +; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 +; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX900-NEXT: s_endpgm ; ; GFX1010-LABEL: test_force_fp_kern_stack_and_call: @@ -306,15 +376,21 @@ ; GFX1010-NEXT: s_addc_u32 s13, s13, 0 ; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 -; GFX1010-NEXT: v_mov_b32_e32 v0, 0 +; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1010-NEXT: v_mov_b32_e32 v3, 0 ; GFX1010-NEXT: s_add_u32 s0, s0, s17 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0 -; GFX1010-NEXT: s_getpc_b64 s[4:5] -; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 -; GFX1010-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 -; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 +; GFX1010-NEXT: s_mov_b32 s13, s15 +; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1010-NEXT: s_mov_b32 s12, s14 +; GFX1010-NEXT: s_mov_b32 s14, s16 +; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4 ; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1010-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1010-NEXT: s_getpc_b64 s[18:19] +; 
GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 +; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 +; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX1010-NEXT: s_endpgm entry: %x = alloca i32, align 4, addrspace(5) Index: llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll +++ llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -27,18 +27,18 @@ ; GCN-LABEL: call_split_type_used_outside_block_v2f32: ; GCN: ; %bb.0: ; %bb0 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b64 exec, s[16:17] ; GCN-NEXT: v_writelane_b32 v40, s33, 2 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, func_v2f32@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, func_v2f32@rel32@hi+12 -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, func_v2f32@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, func_v2f32@rel32@hi+12 +; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_readlane_b32 s4, v40, 0 ; GCN-NEXT: v_readlane_b32 s5, v40, 1 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 @@ -61,18 +61,18 @@ ; GCN-LABEL: call_split_type_used_outside_block_v3f32: ; GCN: ; %bb.0: ; %bb0 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b64 exec, s[16:17] ; GCN-NEXT: v_writelane_b32 v40, s33, 2 ; GCN-NEXT: s_mov_b32 s33, s32 ; 
GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, func_v3f32@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, func_v3f32@rel32@hi+12 -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, func_v3f32@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, func_v3f32@rel32@hi+12 +; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_readlane_b32 s4, v40, 0 ; GCN-NEXT: v_readlane_b32 s5, v40, 1 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 @@ -95,18 +95,18 @@ ; GCN-LABEL: call_split_type_used_outside_block_v4f16: ; GCN: ; %bb.0: ; %bb0 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b64 exec, s[16:17] ; GCN-NEXT: v_writelane_b32 v40, s33, 2 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, func_v4f16@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, func_v4f16@rel32@hi+12 -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, func_v4f16@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, func_v4f16@rel32@hi+12 +; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_readlane_b32 s4, v40, 0 ; GCN-NEXT: v_readlane_b32 s5, v40, 1 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 @@ -129,18 +129,18 @@ ; GCN-LABEL: call_split_type_used_outside_block_struct: ; GCN: ; %bb.0: ; %bb0 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, 
s[4:5] +; GCN-NEXT: s_mov_b64 exec, s[16:17] ; GCN-NEXT: v_writelane_b32 v40, s33, 2 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, func_struct@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, func_struct@rel32@hi+12 -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, func_struct@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, func_struct@rel32@hi+12 +; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_readlane_b32 s4, v40, 0 ; GCN-NEXT: v_mov_b32_e32 v1, v4 ; GCN-NEXT: v_readlane_b32 s5, v40, 1 @@ -168,22 +168,30 @@ define amdgpu_kernel void @v3i16_registers(i1 %cond) #0 { ; GCN-LABEL: v3i16_registers: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[8:9], 0x0 ; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GCN-NEXT: s_load_dword s12, s[8:9], 0x0 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GCN-NEXT: s_add_u32 s0, s0, s17 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bitcmp1_b32 s4, 0 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_and_b64 vcc, exec, s[4:5] ; GCN-NEXT: s_mov_b32 s32, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_bitcmp1_b32 s12, 0 +; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GCN-NEXT: s_and_b64 vcc, exec, s[12:13] ; GCN-NEXT: s_cbranch_vccnz .LBB4_2 ; GCN-NEXT: ; %bb.1: ; %if.else -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, func_v3i16@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, func_v3i16@rel32@hi+12 -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_add_u32 s8, s8, 8 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: v_or3_b32 v31, v0, v1, v2 +; GCN-NEXT: s_mov_b32 s12, s14 +; GCN-NEXT: s_mov_b32 s13, s15 +; GCN-NEXT: s_mov_b32 s14, s16 +; GCN-NEXT: s_getpc_b64 
s[18:19] +; GCN-NEXT: s_add_u32 s18, s18, func_v3i16@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s19, s19, func_v3i16@rel32@hi+12 +; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GCN-NEXT: s_branch .LBB4_3 ; GCN-NEXT: .LBB4_2: ; GCN-NEXT: s_mov_b32 s4, 0 @@ -213,22 +221,30 @@ define amdgpu_kernel void @v3f16_registers(i1 %cond) #0 { ; GCN-LABEL: v3f16_registers: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[8:9], 0x0 ; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GCN-NEXT: s_load_dword s12, s[8:9], 0x0 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GCN-NEXT: s_add_u32 s0, s0, s17 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bitcmp1_b32 s4, 0 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_and_b64 vcc, exec, s[4:5] ; GCN-NEXT: s_mov_b32 s32, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_bitcmp1_b32 s12, 0 +; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GCN-NEXT: s_and_b64 vcc, exec, s[12:13] ; GCN-NEXT: s_cbranch_vccnz .LBB5_2 ; GCN-NEXT: ; %bb.1: ; %if.else -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, func_v3f16@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, func_v3f16@rel32@hi+12 -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_add_u32 s8, s8, 8 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: v_or3_b32 v31, v0, v1, v2 +; GCN-NEXT: s_mov_b32 s12, s14 +; GCN-NEXT: s_mov_b32 s13, s15 +; GCN-NEXT: s_mov_b32 s14, s16 +; GCN-NEXT: s_getpc_b64 s[18:19] +; GCN-NEXT: s_add_u32 s18, s18, func_v3f16@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s19, s19, func_v3f16@rel32@hi+12 +; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GCN-NEXT: s_branch .LBB5_3 ; GCN-NEXT: .LBB5_2: ; GCN-NEXT: s_mov_b32 s4, 0 Index: llvm/test/CodeGen/AMDGPU/ds_read2.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/ds_read2.ll +++ llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -1341,26 +1341,39 @@ ; CI-NEXT: 
s_getpc_b64 s[40:41] ; CI-NEXT: s_mov_b32 s40, s0 ; CI-NEXT: s_load_dwordx4 s[40:43], s[40:41], 0x0 -; CI-NEXT: s_load_dword s0, s[4:5], 0xb -; CI-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 -; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_mov_b32 s14, s10 +; CI-NEXT: s_mov_b32 s12, s8 +; CI-NEXT: s_mov_b32 s13, s9 +; CI-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s40, s40, s11 +; CI-NEXT: s_mov_b64 s[10:11], s[6:7] +; CI-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9 +; CI-NEXT: s_load_dword s6, s[4:5], 0xb ; CI-NEXT: s_addc_u32 s41, s41, 0 -; CI-NEXT: v_add_i32_e32 v40, vcc, s0, v0 -; CI-NEXT: s_getpc_b64 s[0:1] -; CI-NEXT: s_add_u32 s0, s0, void_func_void@gotpcrel32@lo+4 -; CI-NEXT: s_addc_u32 s1, s1, void_func_void@gotpcrel32@hi+12 -; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; CI-NEXT: s_add_u32 s8, s4, 48 +; CI-NEXT: s_addc_u32 s9, s5, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, void_func_void@gotpcrel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, void_func_void@gotpcrel32@hi+12 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_add_i32_e32 v40, vcc, s6, v3 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 ; CI-NEXT: ds_read_b32 v41, v40 +; CI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; CI-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 +; CI-NEXT: s_mov_b64 s[4:5], s[0:1] +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_mov_b64 s[0:1], s[40:41] +; CI-NEXT: v_or_b32_e32 v31, v0, v2 ; CI-NEXT: s_mov_b64 s[2:3], s[42:43] +; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_mov_b32 s39, 0xf000 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CI-NEXT: ds_read_b32 v0, v40 offset:4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_add_i32_e32 v0, vcc, v41, v0 @@ -1372,28 +1385,39 @@ ; GFX9-NEXT: s_getpc_b64 s[36:37] ; GFX9-NEXT: 
s_mov_b32 s36, s0 ; GFX9-NEXT: s_load_dwordx4 s[36:39], s[36:37], 0x0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 -; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_mov_b32_e32 v42, 0 +; GFX9-NEXT: s_mov_b32 s14, s10 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_mov_b32 s13, s9 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_lshl_add_u32 v40, v0, 2, s2 ; GFX9-NEXT: s_add_u32 s36, s36, s11 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_getpc_b64 s[0:1] -; GFX9-NEXT: s_add_u32 s0, s0, void_func_void@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s1, s1, void_func_void@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX9-NEXT: ds_read_b32 v41, v40 +; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX9-NEXT: s_add_u32 s8, s4, 48 +; GFX9-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, void_func_void@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, void_func_void@gotpcrel32@hi+12 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_lshl_add_u32 v41, v0, 2, s6 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-NEXT: ds_read_b32 v42, v41 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: ds_read_b32 v0, v40 offset:4 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: ds_read_b32 v0, v41 offset:4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v0, v41, v0 -; GFX9-NEXT: global_store_dword v42, v0, s[34:35] +; GFX9-NEXT: v_add_u32_e32 v0, 
v42, v0 +; GFX9-NEXT: global_store_dword v40, v0, s[34:35] ; GFX9-NEXT: s_endpgm %x = call i32 @llvm.amdgcn.workitem.id.x() %arrayidx0 = getelementptr i32, i32 addrspace(3)* %arg, i32 %x Index: llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll +++ llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll @@ -12,25 +12,25 @@ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN-NEXT: %6:vgpr_32, %7:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY2]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %8:vgpr_32, %9:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %10:vgpr_32 = nofpexcept V_RCP_F32_e64 0, %8, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %13:vgpr_32, %14:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY2]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %15:vgpr_32, %16:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %17:vgpr_32 = nofpexcept V_RCP_F32_e64 0, %15, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 3 ; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216 ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GCN-NEXT: S_SETREG_B32_mode killed [[S_MOV_B32_]], 2305, implicit-def $mode, implicit $mode - ; GCN-NEXT: %14:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %8, 0, %10, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %15:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %14, 0, %10, 0, %10, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %16:vgpr_32 = nofpexcept V_MUL_F32_e64 0, %6, 0, %15, 0, 0, implicit $mode, 
implicit $exec - ; GCN-NEXT: %17:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %8, 0, %16, 0, %6, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %18:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %17, 0, %15, 0, %16, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %19:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %8, 0, %18, 0, %6, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %21:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %15, 0, %17, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %22:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %21, 0, %17, 0, %17, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %23:vgpr_32 = nofpexcept V_MUL_F32_e64 0, %13, 0, %22, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %24:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %15, 0, %23, 0, %13, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %25:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %24, 0, %22, 0, %23, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %26:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %15, 0, %25, 0, %13, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_SETREG_B32_mode killed [[S_MOV_B32_2]], 2305, implicit-def dead $mode, implicit $mode - ; GCN-NEXT: $vcc = COPY %7 - ; GCN-NEXT: %20:vgpr_32 = nofpexcept V_DIV_FMAS_F32_e64 0, killed %19, 0, %15, 0, %18, 0, 0, implicit $mode, implicit $vcc, implicit $exec - ; GCN-NEXT: %21:vgpr_32 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed %20, 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vcc = COPY %14 + ; GCN-NEXT: %27:vgpr_32 = nofpexcept V_DIV_FMAS_F32_e64 0, killed %26, 0, %22, 0, %25, 0, 0, implicit $mode, implicit $vcc, implicit $exec + ; GCN-NEXT: %28:vgpr_32 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed %27, 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] - ; GCN-NEXT: $vgpr0 = COPY %21 + ; GCN-NEXT: $vgpr0 = COPY %28 ; GCN-NEXT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]] ; GCN-NEXT: S_SETPC_B64_return 
[[COPY4]], implicit $vgpr0 entry: @@ -46,25 +46,25 @@ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN-NEXT: %6:vgpr_32, %7:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY2]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %8:vgpr_32, %9:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %10:vgpr_32 = nnan nofpexcept V_RCP_F32_e64 0, %8, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %13:vgpr_32, %14:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY2]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %15:vgpr_32, %16:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %17:vgpr_32 = nnan nofpexcept V_RCP_F32_e64 0, %15, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 3 ; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216 ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GCN-NEXT: S_SETREG_B32_mode killed [[S_MOV_B32_]], 2305, implicit-def $mode, implicit $mode - ; GCN-NEXT: %14:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %8, 0, %10, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %15:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed %14, 0, %10, 0, %10, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %16:vgpr_32 = nnan nofpexcept V_MUL_F32_e64 0, %6, 0, %15, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %17:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %8, 0, %16, 0, %6, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %18:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed %17, 0, %15, 0, %16, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %19:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %8, 
0, %18, 0, %6, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %21:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %15, 0, %17, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %22:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed %21, 0, %17, 0, %17, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %23:vgpr_32 = nnan nofpexcept V_MUL_F32_e64 0, %13, 0, %22, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %24:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %15, 0, %23, 0, %13, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %25:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed %24, 0, %22, 0, %23, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %26:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %15, 0, %25, 0, %13, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_SETREG_B32_mode killed [[S_MOV_B32_2]], 2305, implicit-def dead $mode, implicit $mode - ; GCN-NEXT: $vcc = COPY %7 - ; GCN-NEXT: %20:vgpr_32 = nnan nofpexcept V_DIV_FMAS_F32_e64 0, killed %19, 0, %15, 0, %18, 0, 0, implicit $mode, implicit $vcc, implicit $exec - ; GCN-NEXT: %21:vgpr_32 = nnan nofpexcept V_DIV_FIXUP_F32_e64 0, killed %20, 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vcc = COPY %14 + ; GCN-NEXT: %27:vgpr_32 = nnan nofpexcept V_DIV_FMAS_F32_e64 0, killed %26, 0, %22, 0, %25, 0, 0, implicit $mode, implicit $vcc, implicit $exec + ; GCN-NEXT: %28:vgpr_32 = nnan nofpexcept V_DIV_FIXUP_F32_e64 0, killed %27, 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] - ; GCN-NEXT: $vgpr0 = COPY %21 + ; GCN-NEXT: $vgpr0 = COPY %28 ; GCN-NEXT: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]] ; GCN-NEXT: S_SETPC_B64_return [[COPY4]], implicit $vgpr0 entry: Index: llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll +++ llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll @@ 
-73,23 +73,43 @@ ; FLAT_SCR_OPT-NEXT: s_addc_u32 s9, s9, 0 ; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 ; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 -; FLAT_SCR_OPT-NEXT: s_getpc_b64 s[0:1] -; FLAT_SCR_OPT-NEXT: s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4 -; FLAT_SCR_OPT-NEXT: s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12 -; FLAT_SCR_OPT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; FLAT_SCR_OPT-NEXT: s_mov_b64 s[8:9], s[4:5] +; FLAT_SCR_OPT-NEXT: s_getpc_b64 s[4:5] +; FLAT_SCR_OPT-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 +; FLAT_SCR_OPT-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 +; FLAT_SCR_OPT-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; FLAT_SCR_OPT-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; FLAT_SCR_OPT-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; FLAT_SCR_OPT-NEXT: s_mov_b32 s14, s12 +; FLAT_SCR_OPT-NEXT: s_mov_b32 s13, s11 +; FLAT_SCR_OPT-NEXT: s_mov_b32 s12, s10 +; FLAT_SCR_OPT-NEXT: s_mov_b64 s[10:11], s[6:7] +; FLAT_SCR_OPT-NEXT: v_or3_b32 v31, v0, v1, v2 +; FLAT_SCR_OPT-NEXT: s_mov_b64 s[4:5], s[0:1] +; FLAT_SCR_OPT-NEXT: s_mov_b64 s[6:7], s[2:3] ; FLAT_SCR_OPT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT_SCR_OPT-NEXT: s_swappc_b64 s[30:31], s[0:1] +; FLAT_SCR_OPT-NEXT: s_swappc_b64 s[30:31], s[16:17] ; FLAT_SCR_OPT-NEXT: s_endpgm ; ; FLAT_SCR_ARCH-LABEL: kernel_calls_no_stack: ; FLAT_SCR_ARCH: ; %bb.0: -; FLAT_SCR_ARCH-NEXT: s_getpc_b64 s[0:1] -; FLAT_SCR_ARCH-NEXT: s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4 -; FLAT_SCR_ARCH-NEXT: s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12 +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s13, s9 +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s12, s8 +; FLAT_SCR_ARCH-NEXT: s_mov_b64 s[8:9], s[4:5] +; FLAT_SCR_ARCH-NEXT: s_getpc_b64 s[4:5] +; FLAT_SCR_ARCH-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 +; FLAT_SCR_ARCH-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 +; FLAT_SCR_ARCH-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; FLAT_SCR_ARCH-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; 
FLAT_SCR_ARCH-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s14, s10 +; FLAT_SCR_ARCH-NEXT: s_mov_b64 s[10:11], s[6:7] +; FLAT_SCR_ARCH-NEXT: s_mov_b64 s[4:5], s[0:1] +; FLAT_SCR_ARCH-NEXT: s_mov_b64 s[6:7], s[2:3] +; FLAT_SCR_ARCH-NEXT: v_or3_b32 v31, v0, v1, v2 ; FLAT_SCR_ARCH-NEXT: s_mov_b32 s32, 0 -; FLAT_SCR_ARCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; FLAT_SCR_ARCH-NEXT: s_waitcnt lgkmcnt(0) -; FLAT_SCR_ARCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; FLAT_SCR_ARCH-NEXT: s_swappc_b64 s[30:31], s[16:17] ; FLAT_SCR_ARCH-NEXT: s_endpgm call void @extern_func() ret void Index: llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll +++ llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll @@ -42,7 +42,7 @@ ; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; NO-SPILL-TO-VGPR-NEXT: s_mov_b32 s33, s32 ; NO-SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0x800 -; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 s[6:7], exec +; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 s[8:9], exec ; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, 3 ; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:16 ; NO-SPILL-TO-VGPR-NEXT: v_writelane_b32 v1, s30, 0 @@ -50,7 +50,7 @@ ; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:16 ; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) -; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[6:7] +; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[8:9] ; NO-SPILL-TO-VGPR-NEXT: v_mov_b32_e32 v0, 0 ; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) @@ -79,4 +79,4 @@ ret void } -attributes #0 = { nounwind } +attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" 
"amdgpu-no-implicitarg-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } Index: llvm/test/CodeGen/AMDGPU/function-args.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/function-args.ll +++ llvm/test/CodeGen/AMDGPU/function-args.ll @@ -220,7 +220,8 @@ ; GCN-DAG: buffer_store_dwordx4 v[4:7], off ; GCN-DAG: buffer_store_dwordx4 v[8:11], off ; GCN-DAG: buffer_store_dwordx4 v[12:15], off -; GCN-DAG: buffer_load_dword [[STACKLOAD:v[0-9]+]], off, s[0:3], s32 +; GCN-DAG: buffer_load_dword v31, off, s[0:3], s32{{$}} +; GCN-DAG: buffer_load_dword [[STACKLOAD:v[0-9]+]], off, s[0:3], s32 offset:4 ; GCN-DAG: buffer_store_dwordx4 v[16:19], off ; GCN-DAG: buffer_store_dwordx4 v[20:23], off ; GCN-DAG: buffer_store_dwordx4 v[24:27], off @@ -576,9 +577,10 @@ ; GCN-DAG: buffer_store_dwordx4 v[20:23], off ; GCN-DAG: buffer_store_dwordx4 v[24:27], off ; GCN-DAG: buffer_store_dwordx4 v[28:31], off -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1:[0-9]+]], off, s[0:3], s32{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:4 -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:8 +; GCN-DAG: buffer_load_dword v[[LOAD_ARG0_31:[0-9]+]], off, s[0:3], s32{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1:[0-9]+]], off, s[0:3], s32 offset:4{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:8 +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:12 ; GCN: buffer_store_dword v[[LOAD_ARG1]] ; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_1]]{{\]}}, off @@ -591,14 +593,14 @@ ; FIXME: Different ext load types on CI vs. 
VI ; GCN-LABEL: {{^}}void_func_v32i32_i1_i8_i16: -; GCN-DAG: buffer_load_ubyte [[LOAD_ARG1:v[0-9]+]], off, s[0:3], s32{{$}} -; VI-DAG: buffer_load_ushort [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s32 offset:4{{$}} -; VI-DAG: buffer_load_ushort [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s32 offset:8{{$}} -; VI-DAG: buffer_load_ushort [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s32 offset:12{{$}} +; GCN-DAG: buffer_load_ubyte [[LOAD_ARG1:v[0-9]+]], off, s[0:3], s32 offset:4{{$}} +; VI-DAG: buffer_load_ushort [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s32 offset:8{{$}} +; VI-DAG: buffer_load_ushort [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s32 offset:12{{$}} +; VI-DAG: buffer_load_ushort [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s32 offset:16{{$}} -; CI-DAG: buffer_load_dword [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s32 offset:4{{$}} -; CI-DAG: buffer_load_dword [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s32 offset:8{{$}} -; CI-DAG: buffer_load_dword [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s32 offset:12{{$}} +; CI-DAG: buffer_load_dword [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s32 offset:8{{$}} +; CI-DAG: buffer_load_dword [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s32 offset:12{{$}} +; CI-DAG: buffer_load_dword [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s32 offset:16{{$}} ; GCN-DAG: v_and_b32_e32 [[TRUNC_ARG1_I1:v[0-9]+]], 1, [[LOAD_ARG1]] ; CI-DAG: v_cvt_f16_f32_e32 [[CVT_ARG4:v[0-9]+]], [[LOAD_ARG4]] @@ -619,10 +621,10 @@ } ; GCN-LABEL: {{^}}void_func_v32i32_v2i32_v2f32: -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:4{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:8{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:12{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], 
s32 offset:12{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:16{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_1]]{{\]}}, off ; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_1]]{{\]}}, off @@ -646,15 +648,15 @@ } ; GCN-LABEL: {{^}}void_func_v32i32_v2i64_v2f64: -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:4{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:8{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s32 offset:12{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:12{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s32 offset:16{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:16{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:20{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:24{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:28{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:20{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:24{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:28{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:32{{$}} ; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off ; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]{{\]}}, off @@ -666,15 +668,15 @@ } ; GCN-LABEL: {{^}}void_func_v32i32_v4i32_v4f32: -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, 
s[0:3], s32{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:4{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:8{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s32 offset:12{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:12{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s32 offset:16{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:16{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:20{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:24{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:28{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:20{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:24{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:28{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:32{{$}} ; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off ; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]{{\]}}, off @@ -686,23 +688,23 @@ } ; GCN-LABEL: {{^}}void_func_v32i32_v8i32_v8f32: -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:4{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:8{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s32 offset:12{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s32 offset:16{{$}} -; GCN-DAG: buffer_load_dword 
v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s32 offset:20{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s32 offset:24{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s32 offset:28{{$}} - -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:32{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:36{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:40{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:44{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s32 offset:48{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s32 offset:52{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s32 offset:56{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], off, s[0:3], s32 offset:60{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:12{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s32 offset:16{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s32 offset:20{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s32 offset:24{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s32 offset:28{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s32 offset:32{{$}} + +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:36{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:40{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:44{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:48{{$}} +; GCN-DAG: 
buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s32 offset:52{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s32 offset:56{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s32 offset:60{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], off, s[0:3], s32 offset:64{{$}} ; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_4]]:[[LOAD_ARG1_7]]{{\]}}, off ; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off Index: llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -4845,12 +4845,13 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 30 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:12 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:8 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:20 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:16 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 ; GFX9-NEXT: v_writelane_b32 v40, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s35, 1 ; GFX9-NEXT: v_writelane_b32 v40, s36, 2 @@ -4886,9 +4887,9 @@ ; GFX9-NEXT: s_getpc_b64 s[6:7] ; GFX9-NEXT: s_add_u32 s6, s6, byval_align16_f64_arg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s7, s7, byval_align16_f64_arg@rel32@hi+12 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-NEXT: v_readlane_b32 s63, v40, 29 @@ -4924,7 +4925,7 @@ ; GFX9-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-NEXT: v_readlane_b32 s33, v40, 30 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[4:5] @@ -4934,23 +4935,24 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 30 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:12 -; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:8 +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:20 +; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:16 +; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s33 ; GFX10-NEXT: s_addk_i32 s32, 0x400 ; GFX10-NEXT: v_writelane_b32 v40, s34, 0 ; GFX10-NEXT: s_mov_b64 s[4:5], s[30:31] ; GFX10-NEXT: s_getpc_b64 s[6:7] ; GFX10-NEXT: s_add_u32 s6, s6, byval_align16_f64_arg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s7, s7, byval_align16_f64_arg@rel32@hi+12 -; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: s_waitcnt vmcnt(2) ; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32 ; GFX10-NEXT: v_writelane_b32 v40, s35, 1 ; GFX10-NEXT: 
v_writelane_b32 v40, s36, 2 @@ -5015,7 +5017,7 @@ ; GFX10-NEXT: s_addk_i32 s32, 0xfc00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 30 ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -5026,15 +5028,17 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 offset:16 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 offset:24 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 30 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dwordx2 v[32:33], off, s33 offset:16 +; GFX10-SCRATCH-NEXT: scratch_load_dword v31, off, s33 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 -; GFX10-SCRATCH-NEXT: scratch_load_dwordx2 v[32:33], off, s33 offset:8 -; GFX10-SCRATCH-NEXT: s_mov_b64 s[4:5], s[30:31] ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s34, 0 +; GFX10-SCRATCH-NEXT: s_mov_b64 s[4:5], s[30:31] ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, byval_align16_f64_arg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, byval_align16_f64_arg@rel32@hi+12 @@ -5067,7 +5071,7 @@ ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s61, 27 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s62, 28 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s63, 29 -; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) +; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(1) ; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[32:33], s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; 
GFX10-SCRATCH-NEXT: v_readlane_b32 s63, v40, 29 @@ -5103,7 +5107,7 @@ ; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 30 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:16 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:24 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -9774,29 +9778,30 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:4 -; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:4 +; GFX9-NEXT: s_addk_i32 s32, 0x800 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[30:31] ; GFX9-NEXT: s_add_u32 s30, s30, stack_passed_f64_arg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s31, s31, stack_passed_f64_arg@rel32@hi+12 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] ; GFX9-NEXT: 
v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -9806,19 +9811,20 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:4 -; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:8 +; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s33 +; GFX10-NEXT: s_addk_i32 s32, 0x400 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: s_waitcnt vmcnt(2) ; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32 -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[30:31] @@ -9827,10 +9833,10 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: 
s_addk_i32 s32, 0xfc00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -9841,27 +9847,29 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 offset:8 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 offset:12 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: scratch_load_dwordx2 v[32:33], off, s33 +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dwordx2 v[32:33], off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v31, off, s33 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, stack_passed_f64_arg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, stack_passed_f64_arg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) +; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(1) ; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[32:33], s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 ; GFX10-SCRATCH-NEXT: 
v_readlane_b32 s33, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:8 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:12 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) Index: llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll +++ llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll @@ -10,7 +10,6 @@ ; CHECK-DAG: s_cselect_b64 [[CALL_TARGET:s\[[0-9]+:[0-9]+\]]] ; CHECK-DAG: s_mov_b64 s[8:9], 0 -; CHECK-DAG: s_mov_b32 s12, s6 ; CHECK-DAG: v_mov_b32_e32 v31, v0 ; CHECK: s_swappc_b64 s[30:31], [[CALL_TARGET]] Index: llvm/test/CodeGen/AMDGPU/ipra.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/ipra.ll +++ llvm/test/CodeGen/AMDGPU/ipra.ll @@ -106,4 +106,4 @@ } attributes #0 = { nounwind } -attributes #1 = { nounwind noinline } +attributes #1 = { nounwind noinline "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } Index: llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll +++ llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll @@ -13,9 +13,9 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_write_b32 v0, v0 -; GFX8-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], s[6:7] ; GFX8-NEXT: s_trap 2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -37,7 +37,7 @@ ; GFX8-LABEL: func_use_lds_global_constexpr_cast: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], s[6:7] ; GFX8-NEXT: s_trap 2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll @@ -91,7 +91,7 @@ ; GCN-LABEL: {{^}}func_implicitarg_ptr: ; GCN: s_waitcnt -; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x0 ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @func_implicitarg_ptr() #0 { @@ -103,7 +103,7 @@ ; GCN-LABEL: {{^}}opencl_func_implicitarg_ptr: ; GCN: s_waitcnt -; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x0 ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @opencl_func_implicitarg_ptr() #0 { @@ -122,9 +122,7 @@ ; MESA: kernarg_segment_byte_size = 16 ; MESA: kernarg_segment_alignment = 4 -; GCN-NOT: s[4:5] -; GCN-NOT: s4 -; GCN-NOT: s5 +; GCN: s_mov_b64 s[8:9], s[4:5] ; GCN: s_swappc_b64 define amdgpu_kernel void @kernel_call_implicitarg_ptr_func_empty() #0 { call void @func_implicitarg_ptr() @@ -140,10 +138,8 @@ ; MESA: kernarg_segment_byte_size = 16 ; MESA: kernarg_segment_alignment = 4 -; HSA: s_mov_b64 s[4:5], 0{{$}} -; MESA-NOT: s[4:5] -; MESA-NOT: s4 -; MESA-NOT: s5 +; HSA: s_mov_b64 s[8:9], 0{{$}} +; MESA: s_mov_b64 s[8:9], s[4:5]{{$}} ; GCN: s_swappc_b64 define amdgpu_kernel void @kernel_call_implicitarg_ptr_func_empty_implicit0() #3 { call void @func_implicitarg_ptr() @@ -155,8 +151,7 @@ ; HSA: kernarg_segment_byte_size = 48 ; HSA: kernarg_segment_alignment = 4 ; MESA: kernarg_segment_byte_size = 16 -; MESA: kernarg_segment_alignment = 4 -; GCN-NOT: s[4:5] +; GCN: s_mov_b64 s[8:9], s[4:5] ; GCN-NOT: s4 ; GCN-NOT: s5 ; GCN: s_swappc_b64 @@ -173,10 +168,10 @@ ; MESA: kernarg_segment_byte_size = 128 ; MESA: kernarg_segment_alignment 
= 4 -; HSA: s_add_u32 s4, s4, 0x70 -; MESA: s_add_u32 s4, s4, 0x70 +; HSA: s_add_u32 s8, s4, 0x70 +; MESA: s_add_u32 s8, s4, 0x70 -; GCN: s_addc_u32 s5, s5, 0{{$}} +; GCN: s_addc_u32 s9, s5, 0{{$}} ; GCN: s_swappc_b64 define amdgpu_kernel void @kernel_call_implicitarg_ptr_func([112 x i8]) #0 { call void @func_implicitarg_ptr() @@ -190,8 +185,8 @@ ; MESA: kernarg_segment_byte_size = 128 ; MESA: kernarg_segment_alignment = 4 -; GCN: s_add_u32 s4, s4, 0x70 -; GCN: s_addc_u32 s5, s5, 0{{$}} +; GCN: s_add_u32 s8, s4, 0x70 +; GCN: s_addc_u32 s9, s5, 0{{$}} ; GCN: s_swappc_b64 define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func([112 x i8]) #1 { call void @func_implicitarg_ptr() @@ -199,18 +194,18 @@ } ; GCN-LABEL: {{^}}func_call_implicitarg_ptr_func: -; GCN-NOT: s4 -; GCN-NOT: s5 -; GCN-NOT: s[4:5] +; GCN-NOT: s8 +; GCN-NOT: s9 +; GCN-NOT: s[8:9] define void @func_call_implicitarg_ptr_func() #0 { call void @func_implicitarg_ptr() ret void } ; GCN-LABEL: {{^}}opencl_func_call_implicitarg_ptr_func: -; GCN-NOT: s4 -; GCN-NOT: s5 -; GCN-NOT: s[4:5] +; GCN-NOT: s8 +; GCN-NOT: s9 +; GCN-NOT: s[8:9] define void @opencl_func_call_implicitarg_ptr_func() #0 { call void @func_implicitarg_ptr() ret void @@ -220,7 +215,7 @@ ; GCN: s_waitcnt ; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0 ; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0 -; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x0 ; GCN: s_waitcnt lgkmcnt(0) define void @func_kernarg_implicitarg_ptr() #0 { %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() @@ -236,7 +231,7 @@ ; GCN: s_waitcnt ; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0 ; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0 -; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x0 ; GCN: s_waitcnt lgkmcnt(0) define void @opencl_func_kernarg_implicitarg_ptr() #0 { %kernarg.segment.ptr = call i8 addrspace(4)* 
@llvm.amdgcn.kernarg.segment.ptr() @@ -249,8 +244,8 @@ } ; GCN-LABEL: {{^}}kernel_call_kernarg_implicitarg_ptr_func: -; GCN: s_add_u32 s4, s4, 0x70 -; GCN: s_addc_u32 s5, s5, 0 +; GCN: s_add_u32 s8, s4, 0x70 +; GCN: s_addc_u32 s9, s5, 0 ; GCN: s_swappc_b64 define amdgpu_kernel void @kernel_call_kernarg_implicitarg_ptr_func([112 x i8]) #0 { call void @func_kernarg_implicitarg_ptr() Index: llvm/test/CodeGen/AMDGPU/memory_clause.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -159,7 +159,7 @@ ; GCN-LABEL: mubuf_clause: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0x3ff, v2 +; GCN-NEXT: v_and_b32_e32 v2, 0x3ff, v31 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 4, v2 ; GCN-NEXT: v_add_u32_e32 v0, v0, v2 ; GCN-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:12 @@ -219,7 +219,7 @@ ; GCN-SCRATCH: ; %bb.0: ; %bb ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-SCRATCH-NEXT: v_and_b32_e32 v2, 0x3ff, v2 +; GCN-SCRATCH-NEXT: v_and_b32_e32 v2, 0x3ff, v31 ; GCN-SCRATCH-NEXT: v_lshlrev_b32_e32 v18, 4, v2 ; GCN-SCRATCH-NEXT: v_add_nc_u32_e32 v0, v0, v18 ; GCN-SCRATCH-NEXT: s_clause 0x3 Index: llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -254,10 +254,11 @@ ret void } -declare void @foo(i32) #0 +declare void @foo(i32) #2 declare float @llvm.fmuladd.f32(float, float, float) #1 attributes #0 = { nounwind willreturn "denormal-fp-math-f32"="preserve-sign,preserve-sign" } attributes #1 = { nounwind readnone speculatable } +attributes #2 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" 
"amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } !0 = !{float 2.500000e+00} Index: llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll +++ llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll @@ -13,26 +13,26 @@ ; CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: s_or_saveexec_b64 s[16:17], -1 ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: s_mov_b64 exec, s[16:17] ; CHECK-NEXT: v_writelane_b32 v40, s33, 2 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_add_i32 s32, s32, 0x400 -; CHECK-NEXT: .Ltmp0: -; CHECK-NEXT: .loc 0 31 3 prologue_end ; lane-info.cpp:31:3 ; CHECK-NEXT: v_writelane_b32 v40, s30, 0 ; CHECK-NEXT: v_writelane_b32 v40, s31, 1 -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, _ZL13sleep_foreverv@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, _ZL13sleep_foreverv@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; CHECK-NEXT: s_mov_b64 s[10:11], s[2:3] -; CHECK-NEXT: s_mov_b64 s[8:9], s[0:1] -; CHECK-NEXT: s_mov_b64 s[0:1], s[8:9] -; CHECK-NEXT: s_mov_b64 s[2:3], s[10:11] +; CHECK-NEXT: .Ltmp0: +; CHECK-NEXT: .loc 0 31 3 prologue_end ; lane-info.cpp:31:3 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, _ZL13sleep_foreverv@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, _ZL13sleep_foreverv@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3] +; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] +; CHECK-NEXT: s_mov_b64 s[0:1], 
s[20:21] +; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: .Ltmp1: ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 ; CHECK-NEXT: v_readlane_b32 s31, v40, 1 Index: llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll +++ llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll @@ -12,7 +12,7 @@ ; FIXME: FunctionLoweringInfo unhelpfully doesn't preserve an ; alignment less than the stack alignment. -define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align4(i32 addrspace(1)* %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) { +define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align4(i32 addrspace(1)* %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) #1 { ; MUBUF-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4: ; MUBUF: ; %bb.0: ; %entry ; MUBUF-NEXT: s_add_u32 s0, s0, s9 @@ -231,7 +231,7 @@ ; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:4 ; MUBUF-NEXT: v_lshl_add_u32 v2, v4, 2, s6 ; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v5 +; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v31 ; MUBUF-NEXT: s_mov_b32 s32, s6 ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: v_add_u32_e32 v2, v2, v3 @@ -265,7 +265,7 @@ ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[2:3], s2 ; FLATSCR-NEXT: v_lshl_add_u32 v2, v4, 2, s2 ; FLATSCR-NEXT: scratch_load_dword v2, v2, off -; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v5 +; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v31 ; FLATSCR-NEXT: s_mov_b32 s32, s2 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3 @@ -321,13 +321,13 @@ ; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000 ; MUBUF-NEXT: s_and_b32 s6, s6, 0xfffff000 ; MUBUF-NEXT: v_mov_b32_e32 v2, 0 -; MUBUF-NEXT: v_mov_b32_e32 v5, s6 
-; MUBUF-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; MUBUF-NEXT: v_mov_b32_e32 v4, s6 +; MUBUF-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; MUBUF-NEXT: v_mov_b32_e32 v2, 1 -; MUBUF-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:4 ; MUBUF-NEXT: v_lshl_add_u32 v2, v3, 2, s6 ; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v4 +; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v31 ; MUBUF-NEXT: s_mov_b32 s32, s6 ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: v_add_u32_e32 v2, v2, v3 @@ -354,12 +354,12 @@ ; FLATSCR-NEXT: ; %bb.1: ; %bb.0 ; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000 ; FLATSCR-NEXT: s_and_b32 s2, s2, 0xfffff000 -; FLATSCR-NEXT: v_mov_b32_e32 v5, 0 -; FLATSCR-NEXT: v_mov_b32_e32 v6, 1 -; FLATSCR-NEXT: scratch_store_dwordx2 off, v[5:6], s2 +; FLATSCR-NEXT: v_mov_b32_e32 v4, 0 +; FLATSCR-NEXT: v_mov_b32_e32 v5, 1 +; FLATSCR-NEXT: scratch_store_dwordx2 off, v[4:5], s2 ; FLATSCR-NEXT: v_lshl_add_u32 v2, v3, 2, s2 ; FLATSCR-NEXT: scratch_load_dword v2, v2, off -; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v4 +; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v31 ; FLATSCR-NEXT: s_mov_b32 s32, s2 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3 @@ -397,3 +397,4 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 attributes #0 = { nounwind readnone speculatable } +attributes #1 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } Index: llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -4,7 
+4,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX90A %s -declare i64 @_Z13get_global_idj(i32) +declare i64 @_Z13get_global_idj(i32) #0 define amdgpu_kernel void @clmem_read_simplified(i8 addrspace(1)* %buffer) { ; GFX8-LABEL: clmem_read_simplified: @@ -13,19 +13,20 @@ ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s38, -1 ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 -; GFX8-NEXT: s_add_u32 s36, s36, s11 +; GFX8-NEXT: s_add_u32 s36, s36, s3 ; GFX8-NEXT: s_addc_u32 s37, s37, 0 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX8-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX8-NEXT: v_mov_b32_e32 v31, v0 ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: s_mov_b32 s32, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 @@ -94,19 +95,20 @@ ; GFX900-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX900-NEXT: s_mov_b32 s38, -1 ; GFX900-NEXT: s_mov_b32 s39, 0xe00000 -; GFX900-NEXT: s_add_u32 s36, s36, s11 +; GFX900-NEXT: s_add_u32 s36, s36, s3 ; GFX900-NEXT: s_addc_u32 s37, s37, 0 +; GFX900-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX900-NEXT: s_getpc_b64 s[0:1] ; GFX900-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX900-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX900-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX900-NEXT: 
s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX900-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX900-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX900-NEXT: v_mov_b32_e32 v31, v0 ; GFX900-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: s_mov_b32 s32, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX900-NEXT: v_and_b32_e32 v1, 0xff, v0 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GFX900-NEXT: v_mov_b32_e32 v2, 0 @@ -164,19 +166,20 @@ ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s38, -1 ; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX10-NEXT: s_add_u32 s36, s36, s11 +; GFX10-NEXT: s_add_u32 s36, s36, s3 ; GFX10-NEXT: s_addc_u32 s37, s37, 0 -; GFX10-NEXT: s_getpc_b64 s[0:1] -; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[2:3] +; GFX10-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: v_mov_b32_e32 v31, v0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX10-NEXT: s_mov_b32 s32, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 7, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -236,19 +239,20 @@ ; GFX90A-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX90A-NEXT: s_mov_b32 s38, -1 ; GFX90A-NEXT: s_mov_b32 s39, 0xe00000 -; GFX90A-NEXT: s_add_u32 s36, s36, s11 +; GFX90A-NEXT: s_add_u32 s36, s36, s3 ; GFX90A-NEXT: 
s_addc_u32 s37, s37, 0 +; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX90A-NEXT: s_getpc_b64 s[0:1] ; GFX90A-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX90A-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX90A-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX90A-NEXT: v_mov_b32_e32 v31, v0 ; GFX90A-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_mov_b32 s32, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX90A-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 @@ -345,19 +349,20 @@ ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s38, -1 ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 -; GFX8-NEXT: s_add_u32 s36, s36, s11 +; GFX8-NEXT: s_add_u32 s36, s36, s3 ; GFX8-NEXT: s_addc_u32 s37, s37, 0 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX8-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX8-NEXT: v_mov_b32_e32 v31, v0 ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: s_mov_b32 s32, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 17, v0 @@ -471,19 +476,20 @@ ; GFX900-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX900-NEXT: s_mov_b32 s38, -1 ; 
GFX900-NEXT: s_mov_b32 s39, 0xe00000 -; GFX900-NEXT: s_add_u32 s36, s36, s11 +; GFX900-NEXT: s_add_u32 s36, s36, s3 ; GFX900-NEXT: s_addc_u32 s37, s37, 0 +; GFX900-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX900-NEXT: s_getpc_b64 s[0:1] ; GFX900-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX900-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX900-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX900-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX900-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX900-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX900-NEXT: v_mov_b32_e32 v31, v0 ; GFX900-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: s_mov_b32 s32, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX900-NEXT: v_and_b32_e32 v1, 0xff, v0 ; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 17, v0 @@ -589,19 +595,20 @@ ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s38, -1 ; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX10-NEXT: s_add_u32 s36, s36, s11 +; GFX10-NEXT: s_add_u32 s36, s36, s3 ; GFX10-NEXT: s_addc_u32 s37, s37, 0 -; GFX10-NEXT: s_getpc_b64 s[0:1] -; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[2:3] +; GFX10-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: v_mov_b32_e32 v31, v0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX10-NEXT: s_mov_b32 s32, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) 
-; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 17, v0 @@ -702,19 +709,20 @@ ; GFX90A-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX90A-NEXT: s_mov_b32 s38, -1 ; GFX90A-NEXT: s_mov_b32 s39, 0xe00000 -; GFX90A-NEXT: s_add_u32 s36, s36, s11 +; GFX90A-NEXT: s_add_u32 s36, s36, s3 ; GFX90A-NEXT: s_addc_u32 s37, s37, 0 +; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX90A-NEXT: s_getpc_b64 s[0:1] ; GFX90A-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX90A-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX90A-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX90A-NEXT: v_mov_b32_e32 v31, v0 ; GFX90A-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_mov_b32 s32, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX90A-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 17, v0 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0 @@ -914,19 +922,20 @@ ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s38, -1 ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 -; GFX8-NEXT: s_add_u32 s36, s36, s11 +; GFX8-NEXT: s_add_u32 s36, s36, s3 ; GFX8-NEXT: s_addc_u32 s37, s37, 0 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX8-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX8-NEXT: v_mov_b32_e32 v31, v0 ; GFX8-NEXT: 
s_mov_b64 s[2:3], s[38:39] ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: s_mov_b32 s32, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 @@ -1000,19 +1009,20 @@ ; GFX900-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX900-NEXT: s_mov_b32 s38, -1 ; GFX900-NEXT: s_mov_b32 s39, 0xe00000 -; GFX900-NEXT: s_add_u32 s36, s36, s11 +; GFX900-NEXT: s_add_u32 s36, s36, s3 ; GFX900-NEXT: s_addc_u32 s37, s37, 0 +; GFX900-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX900-NEXT: s_getpc_b64 s[0:1] ; GFX900-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX900-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX900-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX900-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX900-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX900-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX900-NEXT: v_mov_b32_e32 v31, v0 ; GFX900-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: s_mov_b32 s32, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX900-NEXT: v_and_b32_e32 v1, 0xff, v0 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GFX900-NEXT: v_mov_b32_e32 v2, 0 @@ -1057,19 +1067,20 @@ ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s38, -1 ; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX10-NEXT: s_add_u32 s36, s36, s11 +; GFX10-NEXT: s_add_u32 s36, s36, s3 ; GFX10-NEXT: s_addc_u32 s37, s37, 0 -; GFX10-NEXT: s_getpc_b64 s[0:1] -; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[2:3] +; GFX10-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s3, s3, 
_Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: v_mov_b32_e32 v31, v0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX10-NEXT: s_mov_b32 s32, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 7, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -1123,19 +1134,20 @@ ; GFX90A-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX90A-NEXT: s_mov_b32 s38, -1 ; GFX90A-NEXT: s_mov_b32 s39, 0xe00000 -; GFX90A-NEXT: s_add_u32 s36, s36, s11 +; GFX90A-NEXT: s_add_u32 s36, s36, s3 ; GFX90A-NEXT: s_addc_u32 s37, s37, 0 +; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX90A-NEXT: s_getpc_b64 s[0:1] ; GFX90A-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX90A-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX90A-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX90A-NEXT: v_mov_b32_e32 v31, v0 ; GFX90A-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_mov_b32 s32, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX90A-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 @@ -1231,19 +1243,20 @@ ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s38, -1 ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 -; GFX8-NEXT: s_add_u32 s36, s36, s11 +; GFX8-NEXT: s_add_u32 s36, s36, s3 ; GFX8-NEXT: 
s_addc_u32 s37, s37, 0 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX8-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX8-NEXT: v_mov_b32_e32 v31, v0 ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: s_mov_b32 s32, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 @@ -1284,19 +1297,20 @@ ; GFX900-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX900-NEXT: s_mov_b32 s38, -1 ; GFX900-NEXT: s_mov_b32 s39, 0xe00000 -; GFX900-NEXT: s_add_u32 s36, s36, s11 +; GFX900-NEXT: s_add_u32 s36, s36, s3 ; GFX900-NEXT: s_addc_u32 s37, s37, 0 +; GFX900-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX900-NEXT: s_getpc_b64 s[0:1] ; GFX900-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX900-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX900-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX900-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX900-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX900-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX900-NEXT: v_mov_b32_e32 v31, v0 ; GFX900-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: s_mov_b32 s32, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX900-NEXT: v_and_b32_e32 v1, 0xff, v0 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GFX900-NEXT: v_mov_b32_e32 v2, 0 @@ -1333,19 +1347,20 @@ ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s38, -1 ; 
GFX10-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX10-NEXT: s_add_u32 s36, s36, s11 +; GFX10-NEXT: s_add_u32 s36, s36, s3 ; GFX10-NEXT: s_addc_u32 s37, s37, 0 -; GFX10-NEXT: s_getpc_b64 s[0:1] -; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[2:3] +; GFX10-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: v_mov_b32_e32 v31, v0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX10-NEXT: s_mov_b32 s32, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 7, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -1383,19 +1398,20 @@ ; GFX90A-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX90A-NEXT: s_mov_b32 s38, -1 ; GFX90A-NEXT: s_mov_b32 s39, 0xe00000 -; GFX90A-NEXT: s_add_u32 s36, s36, s11 +; GFX90A-NEXT: s_add_u32 s36, s36, s3 ; GFX90A-NEXT: s_addc_u32 s37, s37, 0 +; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX90A-NEXT: s_getpc_b64 s[0:1] ; GFX90A-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX90A-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX90A-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX90A-NEXT: v_mov_b32_e32 v31, v0 ; GFX90A-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_mov_b32 s32, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 
-; GFX90A-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX90A-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 @@ -1462,19 +1478,20 @@ ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s38, -1 ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 -; GFX8-NEXT: s_add_u32 s36, s36, s11 +; GFX8-NEXT: s_add_u32 s36, s36, s3 ; GFX8-NEXT: s_addc_u32 s37, s37, 0 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX8-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX8-NEXT: v_mov_b32_e32 v31, v0 ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: s_mov_b32 s32, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 @@ -1512,19 +1529,20 @@ ; GFX900-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX900-NEXT: s_mov_b32 s38, -1 ; GFX900-NEXT: s_mov_b32 s39, 0xe00000 -; GFX900-NEXT: s_add_u32 s36, s36, s11 +; GFX900-NEXT: s_add_u32 s36, s36, s3 ; GFX900-NEXT: s_addc_u32 s37, s37, 0 +; GFX900-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX900-NEXT: s_getpc_b64 s[0:1] ; GFX900-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX900-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX900-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX900-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX900-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX900-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX900-NEXT: v_mov_b32_e32 v31, v0 ; GFX900-NEXT: 
s_mov_b64 s[2:3], s[38:39] ; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: s_mov_b32 s32, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX900-NEXT: v_and_b32_e32 v1, 0xff, v0 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GFX900-NEXT: v_mov_b32_e32 v2, 0 @@ -1556,19 +1574,20 @@ ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s38, -1 ; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX10-NEXT: s_add_u32 s36, s36, s11 +; GFX10-NEXT: s_add_u32 s36, s36, s3 ; GFX10-NEXT: s_addc_u32 s37, s37, 0 -; GFX10-NEXT: s_getpc_b64 s[0:1] -; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[2:3] +; GFX10-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: v_mov_b32_e32 v31, v0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX10-NEXT: s_mov_b32 s32, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 7, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -1600,19 +1619,20 @@ ; GFX90A-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX90A-NEXT: s_mov_b32 s38, -1 ; GFX90A-NEXT: s_mov_b32 s39, 0xe00000 -; GFX90A-NEXT: s_add_u32 s36, s36, s11 +; GFX90A-NEXT: s_add_u32 s36, s36, s3 ; GFX90A-NEXT: s_addc_u32 s37, s37, 0 +; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX90A-NEXT: s_getpc_b64 s[0:1] ; GFX90A-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; 
GFX90A-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX90A-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX90A-NEXT: v_mov_b32_e32 v31, v0 ; GFX90A-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_mov_b32 s32, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX90A-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 @@ -1673,19 +1693,20 @@ ; GFX8-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s42, -1 ; GFX8-NEXT: s_mov_b32 s43, 0xe80000 -; GFX8-NEXT: s_add_u32 s40, s40, s11 +; GFX8-NEXT: s_add_u32 s40, s40, s3 ; GFX8-NEXT: s_addc_u32 s41, s41, 0 +; GFX8-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX8-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX8-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX8-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX8-NEXT: v_mov_b32_e32 v31, v0 ; GFX8-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: s_mov_b32 s32, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff8000, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s37 @@ -1735,19 +1756,20 @@ ; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s42, -1 ; GFX9-NEXT: s_mov_b32 s43, 0xe00000 -; GFX9-NEXT: s_add_u32 s40, s40, s11 +; GFX9-NEXT: s_add_u32 s40, s40, s3 ; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_load_dwordx4 
s[36:39], s[0:1], 0x24 ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-NEXT: v_mov_b32_e32 v31, v0 ; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff8000, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s37 @@ -1795,19 +1817,20 @@ ; GFX10-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s42, -1 ; GFX10-NEXT: s_mov_b32 s43, 0x31c16000 -; GFX10-NEXT: s_add_u32 s40, s40, s11 +; GFX10-NEXT: s_add_u32 s40, s40, s3 ; GFX10-NEXT: s_addc_u32 s41, s41, 0 -; GFX10-NEXT: s_getpc_b64 s[0:1] -; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[2:3] +; GFX10-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: v_mov_b32_e32 v31, v0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 ; GFX10-NEXT: s_mov_b64 s[0:1], s[40:41] ; GFX10-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX10-NEXT: s_mov_b32 s32, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff8000, v0 ; GFX10-NEXT: v_add_co_u32 v0, 
s0, s36, v2 @@ -1891,19 +1914,20 @@ ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s38, -1 ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 -; GFX8-NEXT: s_add_u32 s36, s36, s11 +; GFX8-NEXT: s_add_u32 s36, s36, s3 ; GFX8-NEXT: s_addc_u32 s37, s37, 0 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX8-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX8-NEXT: v_mov_b32_e32 v31, v0 ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: s_mov_b32 s32, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 @@ -1972,19 +1996,20 @@ ; GFX900-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX900-NEXT: s_mov_b32 s38, -1 ; GFX900-NEXT: s_mov_b32 s39, 0xe00000 -; GFX900-NEXT: s_add_u32 s36, s36, s11 +; GFX900-NEXT: s_add_u32 s36, s36, s3 ; GFX900-NEXT: s_addc_u32 s37, s37, 0 +; GFX900-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX900-NEXT: s_getpc_b64 s[0:1] ; GFX900-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX900-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX900-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX900-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX900-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX900-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX900-NEXT: v_mov_b32_e32 v31, v0 ; GFX900-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: s_mov_b32 s32, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX900-NEXT: s_swappc_b64 
s[30:31], s[4:5] ; GFX900-NEXT: v_and_b32_e32 v1, 0xff, v0 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GFX900-NEXT: v_mov_b32_e32 v2, 0 @@ -2041,19 +2066,20 @@ ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s38, -1 ; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX10-NEXT: s_add_u32 s36, s36, s11 +; GFX10-NEXT: s_add_u32 s36, s36, s3 ; GFX10-NEXT: s_addc_u32 s37, s37, 0 -; GFX10-NEXT: s_getpc_b64 s[0:1] -; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[2:3] +; GFX10-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: v_mov_b32_e32 v31, v0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX10-NEXT: s_mov_b32 s32, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 7, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -2117,19 +2143,20 @@ ; GFX90A-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX90A-NEXT: s_mov_b32 s38, -1 ; GFX90A-NEXT: s_mov_b32 s39, 0xe00000 -; GFX90A-NEXT: s_add_u32 s36, s36, s11 +; GFX90A-NEXT: s_add_u32 s36, s36, s3 ; GFX90A-NEXT: s_addc_u32 s37, s37, 0 +; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX90A-NEXT: s_getpc_b64 s[0:1] ; GFX90A-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX90A-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX90A-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x0 ; GFX90A-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX90A-NEXT: v_mov_b32_e32 v31, v0 ; GFX90A-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_mov_b32 s32, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX90A-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 @@ -2229,19 +2256,20 @@ ; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s38, -1 ; GFX8-NEXT: s_mov_b32 s39, 0xe80000 -; GFX8-NEXT: s_add_u32 s36, s36, s11 +; GFX8-NEXT: s_add_u32 s36, s36, s3 ; GFX8-NEXT: s_addc_u32 s37, s37, 0 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX8-NEXT: s_getpc_b64 s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX8-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX8-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX8-NEXT: v_mov_b32_e32 v31, v0 ; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: s_mov_b32 s32, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 @@ -2271,19 +2299,20 @@ ; GFX900-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX900-NEXT: s_mov_b32 s38, -1 ; GFX900-NEXT: s_mov_b32 s39, 0xe00000 -; GFX900-NEXT: s_add_u32 s36, s36, s11 +; GFX900-NEXT: s_add_u32 s36, s36, s3 ; GFX900-NEXT: s_addc_u32 s37, s37, 0 +; GFX900-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX900-NEXT: s_getpc_b64 s[0:1] ; GFX900-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX900-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; 
GFX900-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX900-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX900-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX900-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX900-NEXT: v_mov_b32_e32 v31, v0 ; GFX900-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX900-NEXT: v_mov_b32_e32 v0, 0 ; GFX900-NEXT: s_mov_b32 s32, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX900-NEXT: v_and_b32_e32 v1, 0xff, v0 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GFX900-NEXT: v_mov_b32_e32 v2, 0 @@ -2313,19 +2342,20 @@ ; GFX10-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX10-NEXT: s_mov_b32 s38, -1 ; GFX10-NEXT: s_mov_b32 s39, 0x31c16000 -; GFX10-NEXT: s_add_u32 s36, s36, s11 +; GFX10-NEXT: s_add_u32 s36, s36, s3 ; GFX10-NEXT: s_addc_u32 s37, s37, 0 -; GFX10-NEXT: s_getpc_b64 s[0:1] -; GFX10-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[2:3] +; GFX10-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX10-NEXT: v_mov_b32_e32 v31, v0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX10-NEXT: s_mov_b32 s32, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 7, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -2354,19 +2384,20 @@ ; GFX90A-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX90A-NEXT: s_mov_b32 s38, -1 ; GFX90A-NEXT: s_mov_b32 s39, 0xe00000 -; GFX90A-NEXT: s_add_u32 
s36, s36, s11 +; GFX90A-NEXT: s_add_u32 s36, s36, s3 ; GFX90A-NEXT: s_addc_u32 s37, s37, 0 +; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 ; GFX90A-NEXT: s_getpc_b64 s[0:1] ; GFX90A-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 ; GFX90A-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX90A-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX90A-NEXT: v_mov_b32_e32 v31, v0 ; GFX90A-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_mov_b32 s32, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX90A-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 @@ -2411,3 +2442,5 @@ store i64 %add, i64 addrspace(1)* %buffer_head, align 8 ret void } + +attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } Index: llvm/test/CodeGen/AMDGPU/ptr-arg-dbg-value.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/ptr-arg-dbg-value.ll +++ llvm/test/CodeGen/AMDGPU/ptr-arg-dbg-value.ll @@ -37,22 +37,20 @@ ; are totally misleading. The former represent part of the incoming argument in register ; while the latter was emitted for the parameter copy to a virtual register inserted ; at the function entry by DAGBuilder. 
-define hidden void @ptr_arg_split_reg_mem(<31 x i32>, %struct.A* %arg2) #0 !dbg !25 { +define hidden void @ptr_arg_split_reg_mem(<30 x i32>, %struct.A* %arg2) #0 !dbg !25 { ; CHECK-LABEL: ptr_arg_split_reg_mem: ; CHECK: .Lfunc_begin1: ; CHECK-NEXT: .loc 1 10 0 ; example.cpp:10:0 ; CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: ; %bb.0: -;; NOTE: One dbg_value (DEBUG_VALUE: ptr_arg_split_reg_mem:b <- [$vgpr31+0]) will be considered as -;; redundant after the virtregrewrite, so it will be removed. -; CHECK-NEXT: ;DEBUG_VALUE: ptr_arg_split_reg_mem:b <- [$vgpr31+0] +; CHECK-NEXT: ;DEBUG_VALUE: ptr_arg_split_reg_mem:b <- [$vgpr30+0] ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; CHECK-NEXT: buffer_load_dword v31, off, s[0:3], s32{{$}} ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: .Ltmp2: ; CHECK-NEXT: .loc 1 12 13 prologue_end ; example.cpp:12:13 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dword v[31:32], v0 offset:396 +; CHECK-NEXT: flat_store_dword v[30:31], v0 offset:396 ; CHECK-NEXT: .loc 1 13 5 ; example.cpp:13:5 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -73,8 +71,8 @@ ; CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; CHECK-NEXT: v_mov_b32_e32 v2, 1 ; CHECK-NEXT: .Ltmp4: ; CHECK-NEXT: .loc 1 17 13 prologue_end ; example.cpp:17:13 Index: llvm/test/CodeGen/AMDGPU/sibling-call.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -128,8 +128,8 @@ ; GCN-LABEL: {{^}}i32_fastcc_i32_i32_a32i32: ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAG: 
buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32{{$}} -; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:4 +; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32 offset:4{{$}} +; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:8{{$}} ; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1 ; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_0]] @@ -154,11 +154,13 @@ ; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32{{$}} ; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:4 +; GCN-DAG: buffer_load_dword [[LOAD_2:v[0-9]+]], off, s[0:3], s32 offset:8 ; GCN-NOT: s32 ; GCN-DAG: buffer_store_dword [[LOAD_0]], off, s[0:3], s32{{$}} ; GCN-DAG: buffer_store_dword [[LOAD_1]], off, s[0:3], s32 offset:4 +; GCN-DAG: buffer_store_dword [[LOAD_2]], off, s[0:3], s32 offset:8 ; GCN-NOT: s32 ; GCN: s_setpc_b64 @@ -170,7 +172,7 @@ ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32_stack_object: ; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 -; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:28 +; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:32 ; GCN: s_setpc_b64 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i32 %b, [32 x i32] %c) #1 { entry: @@ -259,7 +261,7 @@ ; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area: ; GCN-NOT: s33 -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:44 +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:48 ; GCN-NOT: s33 ; GCN: s_setpc_b64 s[4:5] @@ -349,7 +351,8 @@ ; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32{{$}} ; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:4{{$}} ; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:8{{$}} -; GCN-DAG: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:12{{$}} +; GCN-DAG: buffer_store_dword v0, off, s[0:3], s32 
offset:16 ; GCN: v_mov_b32_e32 v0, 0 ; GCN: v_mov_b32_e32 v30, 0 Index: llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll +++ llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll @@ -26,7 +26,7 @@ ret void } -declare i64 @func() +declare i64 @func() #0 -attributes #0 = { nounwind } +attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } attributes #1 = { nounwind readnone } Index: llvm/test/CodeGen/AMDGPU/ssubsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/ssubsat.ll +++ llvm/test/CodeGen/AMDGPU/ssubsat.ll @@ -781,78 +781,80 @@ ; GFX6-NEXT: v_xor_b32_e32 v4, s6, v4 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc -; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v5, v21 +; GFX6-NEXT: buffer_load_dword v16, off, s[0:3], s32 +; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v5, v21 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v5 -; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v16 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v5 +; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v17 ; GFX6-NEXT: v_xor_b32_e32 v5, s6, v5 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc -; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v6, v22 +; GFX6-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc +; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v6, v22 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v22 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v6 -; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v16 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v6 +; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v17 ; GFX6-NEXT: v_xor_b32_e32 v6, s6, v6 
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc -; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v7, v23 +; GFX6-NEXT: v_cndmask_b32_e32 v6, v17, v6, vcc +; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v7, v23 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v23 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v7 -; GFX6-NEXT: v_ashrrev_i32_e32 v7, 31, v16 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v7 +; GFX6-NEXT: v_ashrrev_i32_e32 v7, 31, v17 ; GFX6-NEXT: v_xor_b32_e32 v7, s6, v7 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v7, v16, v7, vcc -; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v8, v24 +; GFX6-NEXT: v_cndmask_b32_e32 v7, v17, v7, vcc +; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v8, v24 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v24 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v8 -; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v16 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v8 +; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v17 ; GFX6-NEXT: v_xor_b32_e32 v8, s6, v8 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc -; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v9, v25 +; GFX6-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc +; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v9, v25 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v25 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v9 -; GFX6-NEXT: v_ashrrev_i32_e32 v9, 31, v16 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v9 +; GFX6-NEXT: v_ashrrev_i32_e32 v9, 31, v17 ; GFX6-NEXT: v_xor_b32_e32 v9, s6, v9 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v9, v16, v9, vcc -; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v10, v26 +; GFX6-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc +; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v10, v26 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v26 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v10 -; GFX6-NEXT: v_ashrrev_i32_e32 v10, 31, v16 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v10 +; GFX6-NEXT: v_ashrrev_i32_e32 v10, 31, v17 ; GFX6-NEXT: v_xor_b32_e32 v10, s6, v10 
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v10, v16, v10, vcc -; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v11, v27 +; GFX6-NEXT: v_cndmask_b32_e32 v10, v17, v10, vcc +; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v11, v27 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v27 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v11 -; GFX6-NEXT: v_ashrrev_i32_e32 v11, 31, v16 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v11 +; GFX6-NEXT: v_ashrrev_i32_e32 v11, 31, v17 ; GFX6-NEXT: v_xor_b32_e32 v11, s6, v11 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v11, v16, v11, vcc -; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v12, v28 +; GFX6-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc +; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v12, v28 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v28 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v12 -; GFX6-NEXT: v_ashrrev_i32_e32 v12, 31, v16 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v12 +; GFX6-NEXT: v_ashrrev_i32_e32 v12, 31, v17 ; GFX6-NEXT: v_xor_b32_e32 v12, s6, v12 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v12, v16, v12, vcc -; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v13, v29 +; GFX6-NEXT: v_cndmask_b32_e32 v12, v17, v12, vcc +; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v13, v29 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v29 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v13 -; GFX6-NEXT: v_ashrrev_i32_e32 v13, 31, v16 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v13 +; GFX6-NEXT: v_ashrrev_i32_e32 v13, 31, v17 ; GFX6-NEXT: v_xor_b32_e32 v13, s6, v13 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v13, v16, v13, vcc -; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v14, v30 +; GFX6-NEXT: v_cndmask_b32_e32 v13, v17, v13, vcc +; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v14, v30 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v30 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v14 -; GFX6-NEXT: v_ashrrev_i32_e32 v14, 31, v16 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v14 +; GFX6-NEXT: v_ashrrev_i32_e32 v14, 31, v17 
; GFX6-NEXT: v_xor_b32_e32 v14, s6, v14 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v14, v16, v14, vcc -; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v15, v31 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v31 +; GFX6-NEXT: v_cndmask_b32_e32 v14, v17, v14, vcc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v16 +; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v15, v16 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v15 ; GFX6-NEXT: v_ashrrev_i32_e32 v15, 31, v16 ; GFX6-NEXT: v_xor_b32_e32 v15, s6, v15 @@ -899,78 +901,80 @@ ; GFX8-NEXT: v_xor_b32_e32 v4, s6, v4 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc -; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v5, v21 +; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 +; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v5, v21 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v5 -; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v16 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v5 +; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v17 ; GFX8-NEXT: v_xor_b32_e32 v5, s6, v5 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc -; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v6, v22 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc +; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v6, v22 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v22 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v6 -; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v16 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v6 +; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v17 ; GFX8-NEXT: v_xor_b32_e32 v6, s6, v6 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc -; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v7, v23 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v17, v6, vcc +; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v7, v23 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v23 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v7 -; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v16 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], 
v17, v7 +; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v17 ; GFX8-NEXT: v_xor_b32_e32 v7, s6, v7 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v16, v7, vcc -; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v8, v24 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v17, v7, vcc +; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v8, v24 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v24 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v8 -; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v16 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v8 +; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v17 ; GFX8-NEXT: v_xor_b32_e32 v8, s6, v8 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc -; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v9, v25 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc +; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v9, v25 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v25 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v9 -; GFX8-NEXT: v_ashrrev_i32_e32 v9, 31, v16 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v9 +; GFX8-NEXT: v_ashrrev_i32_e32 v9, 31, v17 ; GFX8-NEXT: v_xor_b32_e32 v9, s6, v9 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v9, v16, v9, vcc -; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v10, v26 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc +; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v10, v26 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v26 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v10 -; GFX8-NEXT: v_ashrrev_i32_e32 v10, 31, v16 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v10 +; GFX8-NEXT: v_ashrrev_i32_e32 v10, 31, v17 ; GFX8-NEXT: v_xor_b32_e32 v10, s6, v10 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v10, v16, v10, vcc -; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v11, v27 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v10, vcc +; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v11, v27 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v27 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v11 -; GFX8-NEXT: v_ashrrev_i32_e32 v11, 31, v16 +; GFX8-NEXT: v_cmp_lt_i32_e64 
s[4:5], v17, v11 +; GFX8-NEXT: v_ashrrev_i32_e32 v11, 31, v17 ; GFX8-NEXT: v_xor_b32_e32 v11, s6, v11 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v11, v16, v11, vcc -; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v12, v28 +; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc +; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v12, v28 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v28 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v12 -; GFX8-NEXT: v_ashrrev_i32_e32 v12, 31, v16 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v12 +; GFX8-NEXT: v_ashrrev_i32_e32 v12, 31, v17 ; GFX8-NEXT: v_xor_b32_e32 v12, s6, v12 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v12, v16, v12, vcc -; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v13, v29 +; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v12, vcc +; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v13, v29 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v29 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v13 -; GFX8-NEXT: v_ashrrev_i32_e32 v13, 31, v16 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v13 +; GFX8-NEXT: v_ashrrev_i32_e32 v13, 31, v17 ; GFX8-NEXT: v_xor_b32_e32 v13, s6, v13 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v13, v16, v13, vcc -; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v14, v30 +; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v13, vcc +; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v14, v30 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v30 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v14 -; GFX8-NEXT: v_ashrrev_i32_e32 v14, 31, v16 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v14 +; GFX8-NEXT: v_ashrrev_i32_e32 v14, 31, v17 ; GFX8-NEXT: v_xor_b32_e32 v14, s6, v14 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v14, v16, v14, vcc -; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v15, v31 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v31 +; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v14, vcc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v16 +; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v15, v16 ; 
GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v15 ; GFX8-NEXT: v_ashrrev_i32_e32 v15, 31, v16 ; GFX8-NEXT: v_xor_b32_e32 v15, s6, v15 @@ -982,6 +986,7 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_i32 v0, v0, v16 clamp +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; GFX9-NEXT: v_sub_i32 v1, v1, v17 clamp ; GFX9-NEXT: v_sub_i32 v2, v2, v18 clamp ; GFX9-NEXT: v_sub_i32 v3, v3, v19 clamp @@ -996,13 +1001,15 @@ ; GFX9-NEXT: v_sub_i32 v12, v12, v28 clamp ; GFX9-NEXT: v_sub_i32 v13, v13, v29 clamp ; GFX9-NEXT: v_sub_i32 v14, v14, v30 clamp -; GFX9-NEXT: v_sub_i32 v15, v15, v31 clamp +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_sub_i32 v15, v15, v16 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_ssubsat_v16i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX10-NEXT: v_sub_nc_i32 v0, v0, v16 clamp ; GFX10-NEXT: v_sub_nc_i32 v1, v1, v17 clamp ; GFX10-NEXT: v_sub_nc_i32 v2, v2, v18 clamp @@ -1018,6 +1025,7 @@ ; GFX10-NEXT: v_sub_nc_i32 v12, v12, v28 clamp ; GFX10-NEXT: v_sub_nc_i32 v13, v13, v29 clamp ; GFX10-NEXT: v_sub_nc_i32 v14, v14, v30 clamp +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sub_nc_i32 v15, v15, v31 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) Index: llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll +++ llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll @@ -8,12 +8,12 @@ define amdgpu_kernel void @kernel_background_evaluate(float addrspace(5)* %kg, <4 x i32> addrspace(1)* %input, <4 x float> addrspace(1)* %output, i32 %i) { ; MUBUF-LABEL: kernel_background_evaluate: ; MUBUF: ; %bb.0: ; %entry -; MUBUF-NEXT: 
s_load_dword s0, s[4:5], 0x24 +; MUBUF-NEXT: s_load_dword s0, s[0:1], 0x24 ; MUBUF-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; MUBUF-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; MUBUF-NEXT: s_mov_b32 s38, -1 ; MUBUF-NEXT: s_mov_b32 s39, 0x31c16000 -; MUBUF-NEXT: s_add_u32 s36, s36, s11 +; MUBUF-NEXT: s_add_u32 s36, s36, s3 ; MUBUF-NEXT: s_addc_u32 s37, s37, 0 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0x2000 ; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000 @@ -46,12 +46,12 @@ ; ; FLATSCR-LABEL: kernel_background_evaluate: ; FLATSCR: ; %bb.0: ; %entry -; FLATSCR-NEXT: s_add_u32 s8, s8, s13 +; FLATSCR-NEXT: s_add_u32 s2, s2, s5 ; FLATSCR-NEXT: s_movk_i32 s32, 0x6000 -; FLATSCR-NEXT: s_addc_u32 s9, s9, 0 -; FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 -; FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 -; FLATSCR-NEXT: s_load_dword s2, s[4:5], 0x24 +; FLATSCR-NEXT: s_addc_u32 s3, s3, 0 +; FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; FLATSCR-NEXT: s_load_dword s2, s[0:1], 0x24 ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0x2000 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 0x4000 ; FLATSCR-NEXT: v_mov_b32_e32 v3, 0 @@ -98,4 +98,6 @@ ret void } -declare hidden i32 @svm_eval_nodes(float addrspace(5)*, <1339 x i32> addrspace(5)*, <4 x i32> addrspace(5)*, i32, i32) local_unnamed_addr +declare hidden i32 @svm_eval_nodes(float addrspace(5)*, <1339 x i32> addrspace(5)*, <4 x i32> addrspace(5)*, i32, i32) local_unnamed_addr #0 + +attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } Index: llvm/test/CodeGen/AMDGPU/stack-realign.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/stack-realign.ll +++ llvm/test/CodeGen/AMDGPU/stack-realign.ll @@ -157,19 
+157,19 @@ ; GCN-LABEL: func_call_align1024_bp_gets_vgpr_spill: ; GCN: buffer_store_dword [[VGPR_REG:v[0-9]+]], off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b64 exec, s[16:17] ; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], s33, 2 ; GCN-DAG: s_add_i32 [[SCRATCH_REG:s[0-9]+]], s32, 0xffc0 -; GCN-DAG: v_writelane_b32 [[VGPR_REG]], s34, 3 ; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000 -; GCN: s_mov_b32 s34, s32 ; GCN: v_mov_b32_e32 v32, 0 +; GCN-DAG: v_writelane_b32 [[VGPR_REG]], s34, 3 +; GCN: s_mov_b32 s34, s32 ; GCN: buffer_store_dword v32, off, s[0:3], s33 offset:1024 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34 ; GCN-DAG: s_add_i32 s32, s32, 0x30000 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 -; GCN: s_swappc_b64 s[30:31], s[4:5] +; GCN: s_swappc_b64 s[30:31], ; GCN: s_add_i32 s32, s32, 0xfffd0000 ; GCN-NEXT: v_readlane_b32 s33, [[VGPR_REG]], 2 Index: llvm/test/CodeGen/AMDGPU/token-factor-inline-limit-test.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/token-factor-inline-limit-test.ll +++ llvm/test/CodeGen/AMDGPU/token-factor-inline-limit-test.ll @@ -4,41 +4,44 @@ ; GCN-LABEL: {{^}}token_factor_inline_limit_test: +; GCN-TFILD: v_mov_b32_e32 [[REG7:v[0-9]+]], 7 +; GCN-TFILD: buffer_store_dword [[REG7]], {{.*$}} ; GCN-TFILD: v_mov_b32_e32 [[REG8:v[0-9]+]], 8 -; GCN-TFILD: buffer_store_dword [[REG8]], {{.*$}} +; GCN-TFILD: buffer_store_dword [[REG8]], {{.*}} offset:4 ; GCN-TFILD: v_mov_b32_e32 [[REG9:v[0-9]+]], 9 -; GCN-TFILD: buffer_store_dword [[REG9]], {{.*}} offset:4 +; GCN-TFILD: buffer_store_dword [[REG9]], {{.*}} offset:8 ; GCN-TFILD: v_mov_b32_e32 [[REG10:v[0-9]+]], 10 -; GCN-TFILD: buffer_store_dword [[REG10]], {{.*}} offset:8 +; GCN-TFILD: buffer_store_dword [[REG10]], {{.*}} offset:12 ; GCN-TFILD: v_mov_b32_e32 [[REG11:v[0-9]+]], 11 -; GCN-TFILD: buffer_store_dword [[REG11]], {{.*}} 
offset:12 +; GCN-TFILD: buffer_store_dword [[REG11]], {{.*}} offset:16 ; GCN-TFILD: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 -; GCN-TFILD: buffer_store_dword [[REG12]], {{.*}} offset:16 +; GCN-TFILD: buffer_store_dword [[REG12]], {{.*}} offset:20 ; GCN-TFILD: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 -; GCN-TFILD: buffer_store_dword [[REG13]], {{.*}} offset:20 +; GCN-TFILD: buffer_store_dword [[REG13]], {{.*}} offset:24 ; GCN-TFILD: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 -; GCN-TFILD: buffer_store_dword [[REG14]], {{.*}} offset:24 +; GCN-TFILD: buffer_store_dword [[REG14]], {{.*}} offset:28 ; GCN-TFILD: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 -; GCN-TFILD: buffer_store_dword [[REG15]], {{.*}} offset:28 +; GCN-TFILD: buffer_store_dword [[REG15]], {{.*}} offset:32 ; GCN-TFIL7: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 -; GCN-TFIL7: buffer_store_dword [[REG15]], {{.*}} offset:28 +; GCN-TFIL7: buffer_store_dword [[REG15]], {{.*}} offset:32 ; GCN-TFIL7: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 -; GCN-TFIL7: buffer_store_dword [[REG14]], {{.*}} offset:24 +; GCN-TFIL7: buffer_store_dword [[REG14]], {{.*}} offset:28 ; GCN-TFIL7: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 -; GCN-TFIL7: buffer_store_dword [[REG13]], {{.*}} offset:20 +; GCN-TFIL7: buffer_store_dword [[REG13]], {{.*}} offset:24 ; GCN-TFIL7: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 -; GCN-TFIL7: buffer_store_dword [[REG12]], {{.*}} offset:16 +; GCN-TFIL7: buffer_store_dword [[REG12]], {{.*}} offset:20 ; GCN-TFIL7: v_mov_b32_e32 [[REG11:v[0-9]+]], 11 -; GCN-TFIL7: buffer_store_dword [[REG11]], {{.*}} offset:12 +; GCN-TFIL7: buffer_store_dword [[REG11]], {{.*}} offset:16 ; GCN-TFIL7: v_mov_b32_e32 [[REG10:v[0-9]+]], 10 -; GCN-TFIL7: buffer_store_dword [[REG10]], {{.*}} offset:8 +; GCN-TFIL7: buffer_store_dword [[REG10]], {{.*}} offset:12 ; GCN-TFIL7: v_mov_b32_e32 [[REG9:v[0-9]+]], 9 -; GCN-TFIL7: buffer_store_dword [[REG9]], {{.*}} offset:4 +; GCN-TFIL7: buffer_store_dword [[REG9]], {{.*}} offset:8 ; GCN-TFIL7: v_mov_b32_e32 [[REG8:v[0-9]+]], 8 -; 
GCN-TFIL7: buffer_store_dword [[REG8]], {{.*$}} +; GCN-TFIL7: buffer_store_dword [[REG8]], {{.*}} offset:4 +; GCN-TFIL7: v_mov_b32_e32 [[REG7:v[0-9]+]], 7 +; GCN-TFIL7: buffer_store_dword [[REG7]], {{.*$}} -; GCN: v_mov_b32_e32 v31, 7 ; GCN: s_getpc define void @token_factor_inline_limit_test() { entry: Index: llvm/test/CodeGen/AMDGPU/uaddsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/uaddsat.ll +++ llvm/test/CodeGen/AMDGPU/uaddsat.ll @@ -462,8 +462,9 @@ ; GFX6-LABEL: v_uaddsat_v16i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_not_b32_e32 v32, v16 -; GFX6-NEXT: v_min_u32_e32 v0, v0, v32 +; GFX6-NEXT: v_not_b32_e32 v31, v16 +; GFX6-NEXT: v_min_u32_e32 v0, v0, v31 +; GFX6-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v16 ; GFX6-NEXT: v_not_b32_e32 v16, v17 ; GFX6-NEXT: v_min_u32_e32 v1, v1, v16 @@ -493,8 +494,6 @@ ; GFX6-NEXT: v_min_u32_e32 v13, v13, v16 ; GFX6-NEXT: v_not_b32_e32 v16, v30 ; GFX6-NEXT: v_min_u32_e32 v14, v14, v16 -; GFX6-NEXT: v_not_b32_e32 v16, v31 -; GFX6-NEXT: v_min_u32_e32 v15, v15, v16 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v17 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v18 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v19 @@ -509,6 +508,9 @@ ; GFX6-NEXT: v_add_i32_e32 v12, vcc, v12, v28 ; GFX6-NEXT: v_add_i32_e32 v13, vcc, v13, v29 ; GFX6-NEXT: v_add_i32_e32 v14, vcc, v14, v30 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_not_b32_e32 v16, v31 +; GFX6-NEXT: v_min_u32_e32 v15, v15, v16 ; GFX6-NEXT: v_add_i32_e32 v15, vcc, v15, v31 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -516,6 +518,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v16 clamp +; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v17 clamp ; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v2, v18 clamp ; GFX8-NEXT: v_add_u32_e64 v3, s[4:5], v3, v19 
clamp @@ -530,13 +533,15 @@ ; GFX8-NEXT: v_add_u32_e64 v12, s[4:5], v12, v28 clamp ; GFX8-NEXT: v_add_u32_e64 v13, s[4:5], v13, v29 clamp ; GFX8-NEXT: v_add_u32_e64 v14, s[4:5], v14, v30 clamp -; GFX8-NEXT: v_add_u32_e64 v15, s[4:5], v15, v31 clamp +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e64 v15, s[4:5], v15, v16 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uaddsat_v16i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_u32_e64 v0, v0, v16 clamp +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; GFX9-NEXT: v_add_u32_e64 v1, v1, v17 clamp ; GFX9-NEXT: v_add_u32_e64 v2, v2, v18 clamp ; GFX9-NEXT: v_add_u32_e64 v3, v3, v19 clamp @@ -551,13 +556,15 @@ ; GFX9-NEXT: v_add_u32_e64 v12, v12, v28 clamp ; GFX9-NEXT: v_add_u32_e64 v13, v13, v29 clamp ; GFX9-NEXT: v_add_u32_e64 v14, v14, v30 clamp -; GFX9-NEXT: v_add_u32_e64 v15, v15, v31 clamp +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e64 v15, v15, v16 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uaddsat_v16i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, v16 clamp ; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, v17 clamp ; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, v18 clamp @@ -573,6 +580,7 @@ ; GFX10-NEXT: v_add_nc_u32_e64 v12, v12, v28 clamp ; GFX10-NEXT: v_add_nc_u32_e64 v13, v13, v29 clamp ; GFX10-NEXT: v_add_nc_u32_e64 v14, v14, v30 clamp +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e64 v15, v15, v31 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) Index: llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll +++ 
llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll @@ -6,40 +6,40 @@ ; GCN-LABEL: widget: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b64 exec, s[16:17] ; GCN-NEXT: v_writelane_b32 v40, s33, 2 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: flat_load_dword v0, v[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 21, v0 ; GCN-NEXT: s_and_b64 vcc, exec, vcc -; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_cbranch_vccz .LBB0_3 ; GCN-NEXT: ; %bb.1: ; %bb4 ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 9, v0 ; GCN-NEXT: s_and_b64 vcc, exec, vcc ; GCN-NEXT: s_cbranch_vccnz .LBB0_4 ; GCN-NEXT: ; %bb.2: ; %bb7 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, wibble@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, wibble@rel32@hi+12 -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, wibble@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, wibble@rel32@hi+12 +; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: s_branch .LBB0_7 ; GCN-NEXT: .LBB0_3: ; %bb2 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 21, v0 ; GCN-NEXT: s_and_b64 vcc, exec, vcc ; GCN-NEXT: s_cbranch_vccnz .LBB0_6 ; GCN-NEXT: .LBB0_4: ; %bb9 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, wibble@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, wibble@rel32@hi+12 -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, wibble@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, wibble@rel32@hi+12 +; GCN-NEXT: s_swappc_b64 s[30:31], 
s[16:17] ; GCN-NEXT: v_cmp_lt_f32_e32 vcc, 0, v0 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_cbranch_execnz .LBB0_7 @@ -185,41 +185,61 @@ ; GCN-LABEL: blam: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_writelane_b32 v40, s33, 4 +; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[16:17] +; GCN-NEXT: v_writelane_b32 v40, s33, 15 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x800 -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: v_writelane_b32 v40, s34, 0 ; GCN-NEXT: v_writelane_b32 v40, s35, 1 ; GCN-NEXT: v_writelane_b32 v40, s36, 2 ; GCN-NEXT: v_writelane_b32 v40, s37, 3 +; GCN-NEXT: v_writelane_b32 v40, s38, 4 +; GCN-NEXT: v_writelane_b32 v40, s39, 5 +; GCN-NEXT: v_writelane_b32 v40, s40, 6 +; GCN-NEXT: v_writelane_b32 v40, s41, 7 +; GCN-NEXT: v_writelane_b32 v40, s42, 8 +; GCN-NEXT: v_writelane_b32 v40, s43, 9 +; GCN-NEXT: v_writelane_b32 v40, s44, 10 +; GCN-NEXT: v_writelane_b32 v40, s45, 11 +; GCN-NEXT: v_writelane_b32 v40, 
s46, 12 +; GCN-NEXT: v_writelane_b32 v40, s48, 13 +; GCN-NEXT: v_writelane_b32 v40, s49, 14 +; GCN-NEXT: v_mov_b32_e32 v41, v31 +; GCN-NEXT: s_mov_b32 s44, s14 +; GCN-NEXT: s_mov_b32 s45, s13 +; GCN-NEXT: s_mov_b32 s46, s12 +; GCN-NEXT: s_mov_b64 s[36:37], s[10:11] +; GCN-NEXT: s_mov_b64 s[38:39], s[8:9] +; GCN-NEXT: s_mov_b64 s[40:41], s[6:7] +; GCN-NEXT: s_mov_b64 s[42:43], s[4:5] ; GCN-NEXT: s_mov_b64 s[4:5], 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: v_mov_b32_e32 v42, 0 -; GCN-NEXT: flat_load_dword v43, v[1:2] -; GCN-NEXT: v_mov_b32_e32 v44, 0x7fc00000 -; GCN-NEXT: s_getpc_b64 s[36:37] -; GCN-NEXT: s_add_u32 s36, s36, spam@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s37, s37, spam@rel32@hi+12 -; GCN-NEXT: v_lshlrev_b32_e32 v41, 2, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0x3ff, v41 +; GCN-NEXT: v_mov_b32_e32 v43, 0 +; GCN-NEXT: flat_load_dword v44, v[0:1] +; GCN-NEXT: v_mov_b32_e32 v45, 0x7fc00000 +; GCN-NEXT: s_getpc_b64 s[48:49] +; GCN-NEXT: s_add_u32 s48, s48, spam@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s49, s49, spam@rel32@hi+12 +; GCN-NEXT: v_lshlrev_b32_e32 v42, 2, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_eq_f32_e64 s[34:35], 0, v43 +; GCN-NEXT: v_cmp_eq_f32_e64 s[34:35], 0, v44 ; GCN-NEXT: s_branch .LBB1_3 ; GCN-NEXT: .LBB1_1: ; %bb10 ; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 ; GCN-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], 0 ; GCN-NEXT: .LBB1_2: ; %bb18 ; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], 0 ; GCN-NEXT: s_mov_b64 s[4:5], 0 ; GCN-NEXT: .LBB1_3: ; %bb2 ; GCN-NEXT: ; =>This Loop Header: Depth=1 @@ -228,8 +248,8 @@ ; GCN-NEXT: .LBB1_4: ; %bb2 ; GCN-NEXT: ; Parent Loop BB1_3 Depth=1 ; GCN-NEXT: ; => This Inner Loop Header: Depth=2 -; GCN-NEXT: 
flat_load_dword v0, v[41:42] -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], 0 +; GCN-NEXT: flat_load_dword v0, v[42:43] +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], 0 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 3, v0 ; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc @@ -253,7 +273,15 @@ ; GCN-NEXT: ; %bb.7: ; %bb11 ; GCN-NEXT: ; in Loop: Header=BB1_4 Depth=2 ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_swappc_b64 s[30:31], s[36:37] +; GCN-NEXT: s_mov_b64 s[4:5], s[42:43] +; GCN-NEXT: s_mov_b64 s[6:7], s[40:41] +; GCN-NEXT: s_mov_b64 s[8:9], s[38:39] +; GCN-NEXT: s_mov_b64 s[10:11], s[36:37] +; GCN-NEXT: s_mov_b32 s12, s46 +; GCN-NEXT: s_mov_b32 s13, s45 +; GCN-NEXT: s_mov_b32 s14, s44 +; GCN-NEXT: v_mov_b32_e32 v31, v41 +; GCN-NEXT: s_swappc_b64 s[30:31], s[48:49] ; GCN-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 ; GCN-NEXT: s_mov_b64 s[4:5], 0 ; GCN-NEXT: s_mov_b64 s[6:7], 0 @@ -267,10 +295,10 @@ ; GCN-NEXT: ; %bb.9: ; %bb16 ; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], 0 ; GCN-NEXT: .LBB1_10: ; %bb17 ; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 ; GCN-NEXT: s_branch .LBB1_2 bb: %tmp = load float, float* null, align 16 Index: llvm/test/CodeGen/AMDGPU/usubsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/usubsat.ll +++ llvm/test/CodeGen/AMDGPU/usubsat.ll @@ -549,6 +549,8 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_max_u32_e32 v0, v0, v16 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 +; GFX6-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; GFX6-NEXT: v_max_u32_e32 v1, v1, v17 ; GFX6-NEXT: v_max_u32_e32 v2, v2, v18 ; GFX6-NEXT: v_max_u32_e32 v3, v3, v19 @@ -563,8 +565,6 @@ ; GFX6-NEXT: v_max_u32_e32 v12, v12, v28 
; GFX6-NEXT: v_max_u32_e32 v13, v13, v29 ; GFX6-NEXT: v_max_u32_e32 v14, v14, v30 -; GFX6-NEXT: v_max_u32_e32 v15, v15, v31 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v17 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v18 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v19 @@ -579,13 +579,16 @@ ; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v12, v28 ; GFX6-NEXT: v_sub_i32_e32 v13, vcc, v13, v29 ; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v30 -; GFX6-NEXT: v_sub_i32_e32 v15, vcc, v15, v31 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_max_u32_e32 v15, v15, v16 +; GFX6-NEXT: v_sub_i32_e32 v15, vcc, v15, v16 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_usubsat_v16i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v16 clamp +; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v17 clamp ; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v18 clamp ; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v3, v19 clamp @@ -600,13 +603,15 @@ ; GFX8-NEXT: v_sub_u32_e64 v12, s[4:5], v12, v28 clamp ; GFX8-NEXT: v_sub_u32_e64 v13, s[4:5], v13, v29 clamp ; GFX8-NEXT: v_sub_u32_e64 v14, s[4:5], v14, v30 clamp -; GFX8-NEXT: v_sub_u32_e64 v15, s[4:5], v15, v31 clamp +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_sub_u32_e64 v15, s[4:5], v15, v16 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_v16i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_u32_e64 v0, v0, v16 clamp +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; GFX9-NEXT: v_sub_u32_e64 v1, v1, v17 clamp ; GFX9-NEXT: v_sub_u32_e64 v2, v2, v18 clamp ; GFX9-NEXT: v_sub_u32_e64 v3, v3, v19 clamp @@ -621,13 +626,15 @@ ; GFX9-NEXT: v_sub_u32_e64 v12, v12, v28 clamp ; GFX9-NEXT: v_sub_u32_e64 v13, v13, v29 clamp ; GFX9-NEXT: v_sub_u32_e64 v14, v14, v30 clamp -; GFX9-NEXT: v_sub_u32_e64 v15, v15, v31 clamp +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: 
v_sub_u32_e64 v15, v15, v16 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_usubsat_v16i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v16 clamp ; GFX10-NEXT: v_sub_nc_u32_e64 v1, v1, v17 clamp ; GFX10-NEXT: v_sub_nc_u32_e64 v2, v2, v18 clamp @@ -643,6 +650,7 @@ ; GFX10-NEXT: v_sub_nc_u32_e64 v12, v12, v28 clamp ; GFX10-NEXT: v_sub_nc_u32_e64 v13, v13, v29 clamp ; GFX10-NEXT: v_sub_nc_u32_e64 v14, v14, v30 clamp +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sub_nc_u32_e64 v15, v15, v31 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) Index: llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll +++ llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll @@ -1,7 +1,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s -declare void @extern_func() +declare void @extern_func() #2 define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, float %bias, float %zcompare, float %s, float %t, float %clamp) { ; The vgpr tuple8 operand in image_gather4_c_b_cl instruction needs not be @@ -167,3 +167,4 @@ attributes #0 = { nounwind writeonly } attributes #1 = { nounwind readonly } +attributes #2 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } Index: llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir 
=================================================================== --- llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir +++ llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir @@ -20,10 +20,18 @@ # FULL-NEXT: stackPtrOffsetReg: '$sgpr13' # FULL-NEXT: argumentInfo: # FULL-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +# FULL-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +# FULL-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } # FULL-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } -# FULL-NEXT: workGroupIDX: { reg: '$sgpr6' } +# FULL-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +# FULL-NEXT: workGroupIDX: { reg: '$sgpr6' } +# FULL-NEXT: workGroupIDY: { reg: '$sgpr13' } +# FULL-NEXT: workGroupIDZ: { reg: '$sgpr14' } # FULL-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr7' } -# FULL-NEXT: workItemIDX: { reg: '$vgpr0' } +# FULL-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' } +# FULL-NEXT: workItemIDX: { reg: '$vgpr0' } +# FULL-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 } +# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } # FULL-NEXT: mode: # FULL-NEXT: ieee: true # FULL-NEXT: dx10-clamp: true @@ -47,10 +55,18 @@ # SIMPLE-NEXT: stackPtrOffsetReg: '$sgpr13' # SIMPLE-NEXT: argumentInfo: # SIMPLE-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +# SIMPLE-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +# SIMPLE-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } # SIMPLE-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } -# SIMPLE-NEXT: workGroupIDX: { reg: '$sgpr6' } +# SIMPLE-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +# SIMPLE-NEXT: workGroupIDX: { reg: '$sgpr6' } +# SIMPLE-NEXT: workGroupIDY: { reg: '$sgpr13' } +# SIMPLE-NEXT: workGroupIDZ: { reg: '$sgpr14' } # SIMPLE-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr7' } +# SIMPLE-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' } # SIMPLE-NEXT: workItemIDX: { reg: '$vgpr0' } +# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 } +# SIMPLE-NEXT: workItemIDZ: { reg: 
'$vgpr31', mask: 1072693248 } # SIMPLE-NEXT: occupancy: 10 # SIMPLE-NEXT: body: name: kernel0 @@ -96,6 +112,16 @@ # FULL-NEXT: stackPtrOffsetReg: '$sp_reg' # FULL-NEXT: argumentInfo: # FULL-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +# FULL-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +# FULL-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +# FULL-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +# FULL-NEXT: workGroupIDX: { reg: '$sgpr12' } +# FULL-NEXT: workGroupIDY: { reg: '$sgpr13' } +# FULL-NEXT: workGroupIDZ: { reg: '$sgpr14' } +# FULL-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' } +# FULL-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 } +# FULL-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 } +# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } # FULL-NEXT: mode: # FULL-NEXT: ieee: true # FULL-NEXT: dx10-clamp: true @@ -111,6 +137,16 @@ # SIMPLE-NEXT: maxKernArgAlign: 1 # SIMPLE-NEXT: argumentInfo: # SIMPLE-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +# SIMPLE-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +# SIMPLE-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +# SIMPLE-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +# SIMPLE-NEXT: workGroupIDX: { reg: '$sgpr12' } +# SIMPLE-NEXT: workGroupIDY: { reg: '$sgpr13' } +# SIMPLE-NEXT: workGroupIDZ: { reg: '$sgpr14' } +# SIMPLE-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' } +# SIMPLE-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 } +# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 } +# SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } # SIMPLE-NEXT: occupancy: 10 # SIMPLE-NEXT: body: @@ -139,6 +175,16 @@ # FULL-NEXT: stackPtrOffsetReg: '$sp_reg' # FULL-NEXT: argumentInfo: # FULL-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +# FULL-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +# FULL-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +# FULL-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +# FULL-NEXT: workGroupIDX: { reg: '$sgpr12' } +# FULL-NEXT: workGroupIDY: 
{ reg: '$sgpr13' } +# FULL-NEXT: workGroupIDZ: { reg: '$sgpr14' } +# FULL-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' } +# FULL-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 } +# FULL-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 } +# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } # FULL-NEXT: mode: # FULL-NEXT: ieee: true # FULL-NEXT: dx10-clamp: true @@ -154,6 +200,16 @@ # SIMPLE-NEXT: maxKernArgAlign: 1 # SIMPLE-NEXT: argumentInfo: # SIMPLE-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +# SIMPLE-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +# SIMPLE-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +# SIMPLE-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +# SIMPLE-NEXT: workGroupIDX: { reg: '$sgpr12' } +# SIMPLE-NEXT: workGroupIDY: { reg: '$sgpr13' } +# SIMPLE-NEXT: workGroupIDZ: { reg: '$sgpr14' } +# SIMPLE-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' } +# SIMPLE-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 } +# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 } +# SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } # SIMPLE-NEXT: occupancy: 10 # SIMPLE-NEXT: body: @@ -183,6 +239,16 @@ # FULL-NEXT: stackPtrOffsetReg: '$sp_reg' # FULL-NEXT: argumentInfo: # FULL-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +# FULL-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +# FULL-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +# FULL-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +# FULL-NEXT: workGroupIDX: { reg: '$sgpr12' } +# FULL-NEXT: workGroupIDY: { reg: '$sgpr13' } +# FULL-NEXT: workGroupIDZ: { reg: '$sgpr14' } +# FULL-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' } +# FULL-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 } +# FULL-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 } +# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } # FULL-NEXT: mode: # FULL-NEXT: ieee: true # FULL-NEXT: dx10-clamp: true @@ -199,6 +265,16 @@ # SIMPLE-NEXT: isEntryFunction: true # SIMPLE-NEXT: argumentInfo: # SIMPLE-NEXT: 
privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +# SIMPLE-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +# SIMPLE-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +# SIMPLE-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +# SIMPLE-NEXT: workGroupIDX: { reg: '$sgpr12' } +# SIMPLE-NEXT: workGroupIDY: { reg: '$sgpr13' } +# SIMPLE-NEXT: workGroupIDZ: { reg: '$sgpr14' } +# SIMPLE-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' } +# SIMPLE-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 } +# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 } +# SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } # SIMPLE-NEXT: occupancy: 10 # SIMPLE-NEXT: body: @@ -234,14 +310,23 @@ # ALL-LABEL: name: fake_stack_arginfo # FULL: argumentInfo: -# FULL-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -# FULL-NEXT: flatScratchInit: { offset: 4 } -# FULL-NEXT: workItemIDY: { reg: '$vgpr0', mask: 65280 } +# FULL: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +# FULL: flatScratchInit: { offset: 4 } +# FULL: workItemIDY: { reg: '$vgpr0', mask: 65280 } # SIMPLE: argumentInfo: # SIMPLE-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +# SIMPLE-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +# SIMPLE-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +# SIMPLE-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } # SIMPLE-NEXT: flatScratchInit: { offset: 4 } -# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr0', mask: 65280 } +# SIMPLE-NEXT: workGroupIDX: { reg: '$sgpr12' } +# SIMPLE-NEXT: workGroupIDY: { reg: '$sgpr13' } +# SIMPLE-NEXT: workGroupIDZ: { reg: '$sgpr14' } +# SIMPLE-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' } +# SIMPLE-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 } +# SIMPLE-NEXT: workItemIDY: { reg: '$vgpr0', mask: 65280 } +# SIMPLE-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } name: fake_stack_arginfo machineFunctionInfo: argumentInfo: Index: llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll 
=================================================================== --- llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll +++ llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll @@ -92,6 +92,16 @@ ; CHECK-NEXT: stackPtrOffsetReg: '$sgpr32' ; CHECK-NEXT: argumentInfo: ; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; CHECK-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; CHECK-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; CHECK-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; CHECK-NEXT: workGroupIDX: { reg: '$sgpr12' } +; CHECK-NEXT: workGroupIDY: { reg: '$sgpr13' } +; CHECK-NEXT: workGroupIDZ: { reg: '$sgpr14' } +; CHECK-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' } +; CHECK-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 } +; CHECK-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 } +; CHECK-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } ; CHECK-NEXT: mode: ; CHECK-NEXT: ieee: true ; CHECK-NEXT: dx10-clamp: true @@ -123,6 +133,16 @@ ; CHECK-NEXT: stackPtrOffsetReg: '$sgpr32' ; CHECK-NEXT: argumentInfo: ; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } +; CHECK-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' } +; CHECK-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' } +; CHECK-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' } +; CHECK-NEXT: workGroupIDX: { reg: '$sgpr12' } +; CHECK-NEXT: workGroupIDY: { reg: '$sgpr13' } +; CHECK-NEXT: workGroupIDZ: { reg: '$sgpr14' } +; CHECK-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' } +; CHECK-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 } +; CHECK-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 } +; CHECK-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } ; CHECK-NEXT: mode: ; CHECK-NEXT: ieee: true ; CHECK-NEXT: dx10-clamp: true