Index: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2234,16 +2234,13 @@
     SDValue ReturnAddrReg = CreateLiveInRegister(
       DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);

-    // FIXME: Should be able to use a vreg here, but need a way to prevent it
-    // from being allcoated to a CSR.
-
-    SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
-                                                MVT::i64);
-
-    Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag);
+    SDValue ReturnAddrVirtualReg = DAG.getRegister(
+        MF.getRegInfo().createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass),
+        MVT::i64);
+    Chain =
+        DAG.getCopyToReg(Chain, DL, ReturnAddrVirtualReg, ReturnAddrReg, Flag);
     Flag = Chain.getValue(1);
-
-    RetOps.push_back(PhysReturnAddrReg);
+    RetOps.push_back(ReturnAddrVirtualReg);
   }

   // Copy the result values into the output registers.
Index: llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.td
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.td
+++ llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -488,6 +488,12 @@
   let AllocationPriority = 9;
 }

+// CCR (call clobbered registers) SGPR 64-bit registers
+def CCR_SGPR_64 : RegisterClass<"AMDGPU", SGPR_64.RegTypes, 32, (add (trunc SGPR_64, 16))> {
+  let CopyCost = SGPR_64.CopyCost;
+  let AllocationPriority = SGPR_64.AllocationPriority;
+}
+
 def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, (add TTMP_64Regs)> {
   let isAllocatable = 0;
 }
Index: llvm/trunk/lib/Target/AMDGPU/SOPInstructions.td
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SOPInstructions.td
+++ llvm/trunk/lib/Target/AMDGPU/SOPInstructions.td
@@ -125,8 +125,8 @@
 }

 // 64-bit input, no output
-class SOP1_1 <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
-  opName, (outs), (ins SReg_64:$src0), "$src0", pattern> {
+class SOP1_1 <string opName, RegisterClass rc = SReg_64, list<dag> pattern=[]> : SOP1_Pseudo <
+  opName, (outs), (ins rc:$src0), "$src0", pattern> {
   let has_sdst = 0;
 }

@@ -224,7 +224,7 @@

 let isReturn = 1 in {
 // Define variant marked as return rather than branch.
-def S_SETPC_B64_return : SOP1_1<"", [(AMDGPUret_flag i64:$src0)]>;
+def S_SETPC_B64_return : SOP1_1<"", CCR_SGPR_64, [(AMDGPUret_flag i64:$src0)]>;
 }
 } // End isTerminator = 1, isBarrier = 1
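Note on the source change above: LowerReturn no longer pins the return address to the physical register pair; it creates a virtual register constrained to the new CCR_SGPR_64 class, and S_SETPC_B64_return now draws its source operand from that class, so the allocator can place the return address in any call-clobbered SGPR pair. TableGen's (trunc SGPR_64, 16) keeps only the first 16 pairs of SGPR_64's allocation order, i.e. s[0:1] through s[30:31]; the callee-saved pairs from s[32:33] upward are excluded. A standalone toy C++ illustration of that enumeration (not LLVM API; the pair list is inferred from the class definition):

    // (trunc SGPR_64, 16): the first 16 aligned SGPR pairs, which are
    // exactly the call-clobbered ones; callee-saved pairs start at s[32:33].
    #include <cstdio>

    int main() {
      for (int I = 0; I < 16; ++I)
        std::printf("s[%d:%d]\n", 2 * I, 2 * I + 1);
      return 0;
    }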
Index: llvm/trunk/test/CodeGen/AMDGPU/call-graph-register-usage.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -14,13 +14,13 @@

 ; GCN-LABEL: {{^}}indirect_use_vcc:
 ; GCN: v_writelane_b32 v32, s34, 2
-; GCN: v_writelane_b32 v32, s36, 0
-; GCN: v_writelane_b32 v32, s37, 1
+; GCN: v_writelane_b32 v32, s30, 0
+; GCN: v_writelane_b32 v32, s31, 1
 ; GCN: s_swappc_b64
-; GCN: v_readlane_b32 s37, v32, 1
-; GCN: v_readlane_b32 s36, v32, 0
+; GCN: v_readlane_b32 s4, v32, 0
+; GCN: v_readlane_b32 s5, v32, 1
 ; GCN: v_readlane_b32 s34, v32, 2
-; GCN: ; NumSgprs: 40
+; GCN: ; NumSgprs: 37
 ; GCN: ; NumVgprs: 33
 define void @indirect_use_vcc() #1 {
   call void @use_vcc()
@@ -29,8 +29,8 @@

 ; GCN-LABEL: {{^}}indirect_2level_use_vcc_kernel:
 ; GCN: is_dynamic_callstack = 0
-; CI: ; NumSgprs: 42
-; VI-NOBUG: ; NumSgprs: 44
+; CI: ; NumSgprs: 39
+; VI-NOBUG: ; NumSgprs: 41
 ; VI-BUG: ; NumSgprs: 96
 ; GCN: ; NumVgprs: 33
 define amdgpu_kernel void @indirect_2level_use_vcc_kernel(i32 addrspace(1)* %out) #0 {
@@ -48,8 +48,8 @@
 }

 ; GCN-LABEL: {{^}}indirect_use_flat_scratch:
-; CI: ; NumSgprs: 42
-; VI: ; NumSgprs: 44
+; CI: ; NumSgprs: 39
+; VI: ; NumSgprs: 41
 ; GCN: ; NumVgprs: 33
 define void @indirect_use_flat_scratch() #1 {
   call void @use_flat_scratch()
@@ -58,8 +58,8 @@

 ; GCN-LABEL: {{^}}indirect_2level_use_flat_scratch_kernel:
 ; GCN: is_dynamic_callstack = 0
-; CI: ; NumSgprs: 42
-; VI-NOBUG: ; NumSgprs: 44
+; CI: ; NumSgprs: 39
+; VI-NOBUG: ; NumSgprs: 41
 ; VI-BUG: ; NumSgprs: 96
 ; GCN: ; NumVgprs: 33
 define amdgpu_kernel void @indirect_2level_use_flat_scratch_kernel(i32 addrspace(1)* %out) #0 {
Index: llvm/trunk/test/CodeGen/AMDGPU/call-preserved-registers.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/call-preserved-registers.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/call-preserved-registers.ll
@@ -27,14 +27,15 @@
 ; GCN: v_writelane_b32 v32, s34, 4
 ; GCN: v_writelane_b32 v32, s36, 0
 ; GCN: v_writelane_b32 v32, s37, 1
-; GCN: v_writelane_b32 v32, s38, 2
+; GCN: v_writelane_b32 v32, s30, 2
+; GCN: v_writelane_b32 v32, s31, 3
 ; GCN: s_swappc_b64
 ; GCN-NEXT: ;;#ASMSTART
 ; GCN-NEXT: ;;#ASMEND
 ; GCN-NEXT: s_swappc_b64
-; GCN-DAG: v_readlane_b32 s39, v32, 3
-; GCN-DAG: v_readlane_b32 s38, v32, 2
+; GCN-DAG: v_readlane_b32 s4, v32, 2
+; GCN-DAG: v_readlane_b32 s5, v32, 3
 ; GCN: v_readlane_b32 s37, v32, 1
 ; GCN: v_readlane_b32 s36, v32, 0

@@ -71,8 +72,7 @@
 ; GCN-NEXT: #ASMSTART
 ; GCN: ; clobber
 ; GCN-NEXT: #ASMEND
-; GCN-NEXT: s_mov_b64 s[30:31], [[SAVEPC]]
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: s_setpc_b64 [[SAVEPC]]
 define void @void_func_void_clobber_s30_s31() #2 {
   call void asm sideeffect "; clobber", "~{s[30:31]}"() #0
   ret void
Index: llvm/trunk/test/CodeGen/AMDGPU/callee-frame-setup.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/callee-frame-setup.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/callee-frame-setup.ll
@@ -84,15 +84,15 @@
 ; GCN-DAG: s_mov_b32 s34, s32
 ; GCN-DAG: s_add_u32 s32, s32, 0x400{{$}}
 ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s36,
-; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s37,
+; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s30,
+; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31,
 ; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s34{{$}}

 ; GCN: s_swappc_b64

-; GCN-DAG: v_readlane_b32 s36, [[CSR_VGPR]]
-; GCN-DAG: v_readlane_b32 s37, [[CSR_VGPR]]
+; GCN-DAG: v_readlane_b32 s5, [[CSR_VGPR]]
+; GCN-DAG: v_readlane_b32 s4, [[CSR_VGPR]]

 ; GCN: s_sub_u32 s32, s32, 0x400{{$}}
 ; GCN-NEXT: v_readlane_b32 s34, [[CSR_VGPR]], 2
@@ -123,12 +123,12 @@
 ; GCN-DAG: s_add_u32 s32, s32, 0x400

 ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s34, [[FP_SPILL_LANE:[0-9]+]]
-; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s36, 0
-; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s37, 1
+; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s30, 0
+; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1
 ; GCN: s_swappc_b64
-; GCN-DAG: v_readlane_b32 s36, v32, 0
-; GCN-DAG: v_readlane_b32 s37, v32, 1
+; GCN-DAG: v_readlane_b32 s4, v32, 0
+; GCN-DAG: v_readlane_b32 s5, v32, 1

 ; GCN: s_sub_u32 s32, s32, 0x400
 ; GCN-NEXT: v_readlane_b32 s34, [[CSR_VGPR]], [[FP_SPILL_LANE]]
@@ -313,17 +313,20 @@
 ; GCN-LABEL: {{^}}no_unused_non_csr_sgpr_for_fp:
 ; GCN: s_waitcnt
-; GCN-NEXT: v_writelane_b32 v1, s34, 0
+; GCN-NEXT: v_writelane_b32 v1, s34, 2
+; GCN-NEXT: v_writelane_b32 v1, s30, 0
 ; GCN-NEXT: s_mov_b32 s34, s32
 ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
+; GCN: v_writelane_b32 v1, s31, 1
 ; GCN: buffer_store_dword [[ZERO]], off, s[0:3], s34 offset:4
 ; GCN: ;;#ASMSTART
-; GCN: s_add_u32 s32, s32, 0x200
-; GCN-NEXT: s_mov_b64 s[30:31], vcc
+; GCN: v_readlane_b32 s4, v1, 0
+; GCN-NEXT: s_add_u32 s32, s32, 0x200
+; GCN-NEXT: v_readlane_b32 s5, v1, 1
 ; GCN-NEXT: s_sub_u32 s32, s32, 0x200
-; GCN-NEXT: v_readlane_b32 s34, v1, 0
+; GCN-NEXT: v_readlane_b32 s34, v1, 2
 ; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: s_setpc_b64 s[4:5]
 define void @no_unused_non_csr_sgpr_for_fp() #1 {
   %alloca = alloca i32, addrspace(5)
   store volatile i32 0, i32 addrspace(5)* %alloca
@@ -344,18 +347,20 @@
 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
 ; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
-; GCN-NEXT: v_writelane_b32 v32, s34, 0
+; GCN-NEXT: v_writelane_b32 v32, s34, 2
+; GCN-NEXT: v_writelane_b32 v32, s30, 0
 ; GCN-NEXT: s_mov_b32 s34, s32
-; GCN: s_add_u32 s32, s32, 0x300{{$}}
-; GCN-DAG: s_mov_b64 vcc, s[30:31]
+; GCN-DAG: v_writelane_b32 v32, s31, 1
 ; GCN-DAG: buffer_store_dword
+; GCN: s_add_u32 s32, s32, 0x300{{$}}
 ; GCN: ;;#ASMSTART
-; GCN: s_mov_b64 s[30:31], vcc
-; GCN: s_sub_u32 s32, s32, 0x300{{$}}
-; GCN-NEXT: v_readlane_b32 s34, v32, 0
+; GCN: v_readlane_b32 s4, v32, 0
+; GCN-NEXT: v_readlane_b32 s5, v32, 1
+; GCN-NEXT: s_sub_u32 s32, s32, 0x300{{$}}
+; GCN-NEXT: v_readlane_b32 s34, v32, 2
 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
 ; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
@@ -389,17 +394,19 @@
 ; GCN-NEXT: v_mov_b32_e32 [[SCRATCH_VGPR:v[0-9]+]], 0x1008
 ; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], [[SCRATCH_VGPR]], s[0:3], s32 offen ; 4-byte Folded Spill
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
-; GCN-NEXT: v_writelane_b32 v32, s34, 0
+; GCN-NEXT: v_writelane_b32 v32, s34, 2
+; GCN-NEXT: v_writelane_b32 v32, s30, 0
 ; GCN-NEXT: s_mov_b32 s34, s32
+; GCN-DAG: v_writelane_b32 v32, s31, 1
 ; GCN-DAG: s_add_u32 s32, s32, 0x40300{{$}}
-; GCN-DAG: s_mov_b64 vcc, s[30:31]
 ; GCN-DAG: buffer_store_dword
 ; GCN: ;;#ASMSTART
-; GCN: s_mov_b64 s[30:31], vcc
-; GCN: s_sub_u32 s32, s32, 0x40300{{$}}
-; GCN-NEXT: v_readlane_b32 s34, v32, 0
+; GCN: v_readlane_b32 s4, v32, 0
+; GCN-NEXT: v_readlane_b32 s5, v32, 1
+; GCN-NEXT: s_sub_u32 s32, s32, 0x40300{{$}}
+; GCN-NEXT: v_readlane_b32 s34, v32, 2
 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
 ; GCN-NEXT: v_mov_b32_e32 [[SCRATCH_VGPR:v[0-9]+]], 0x1008
 ; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], [[SCRATCH_VGPR]], s[0:3], s32 offen ; 4-byte Folded Reload
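The check updates in this file and in the files below all follow one pattern: the return address in s[30:31] is now spilled to lanes of the reserved CSR VGPR around calls and reloaded into whichever call-clobbered pair the allocator picks (s[4:5] in these runs), instead of being parked in callee-saved SGPRs such as s[36:37] or in vcc. A toy C++ model of the lane traffic the checks encode (pretend values, illustrative only; not real hardware semantics):

    #include <array>
    #include <cstdint>
    #include <cstdio>

    // A VGPR holds one 32-bit value per lane; v_writelane/v_readlane move
    // scalar values into and out of individual lanes.
    struct VGPR { std::array<uint32_t, 64> lane{}; };

    int main() {
      uint32_t s30 = 0x100, s31 = 0x200, s34 = 0x300; // pretend contents
      VGPR v32;
      v32.lane[0] = s30;         // v_writelane_b32 v32, s30, 0
      v32.lane[1] = s31;         // v_writelane_b32 v32, s31, 1
      v32.lane[2] = s34;         // v_writelane_b32 v32, s34, 2
      // ... s_swappc_b64 clobbers s[30:31] during the call ...
      uint32_t s4 = v32.lane[0]; // v_readlane_b32 s4, v32, 0
      uint32_t s5 = v32.lane[1]; // v_readlane_b32 s5, v32, 1
      std::printf("s_setpc_b64 s[4:5] -> %x:%x\n",
                  static_cast<unsigned>(s4), static_cast<unsigned>(s5));
      return 0;
    }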
Index: llvm/trunk/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
@@ -311,6 +311,7 @@
 ; Argument is in right place already
 ; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_x:
 ; GCN-NOT: s4
+; GCN: v_readlane_b32 s4, v32, 0
 define hidden void @func_indirect_use_workgroup_id_x() #1 {
   call void @use_workgroup_id_x()
   ret void
@@ -318,6 +319,7 @@

 ; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_y:
 ; GCN-NOT: s4
+; GCN: v_readlane_b32 s4, v32, 0
 define hidden void @func_indirect_use_workgroup_id_y() #1 {
   call void @use_workgroup_id_y()
   ret void
@@ -325,6 +327,7 @@

 ; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_z:
 ; GCN-NOT: s4
+; GCN: v_readlane_b32 s4, v32, 0
 define hidden void @func_indirect_use_workgroup_id_z() #1 {
   call void @use_workgroup_id_z()
   ret void
@@ -490,7 +493,7 @@
 ; GCN-NOT: s[8:9]
 ; GCN-NOT: s[10:11]
 ; GCN-NOT: s[12:13]
-; GCN: s_or_saveexec_b64 s[4:5], -1
+; GCN: s_or_saveexec_b64 s[16:17], -1
 define hidden void @func_indirect_use_every_sgpr_input() #1 {
   call void @use_every_sgpr_input()
   ret void
Index: llvm/trunk/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -281,12 +281,12 @@

 ; GCN-LABEL: {{^}}chain_hi_to_lo_group_may_alias_store:
 ; GFX900: v_mov_b32_e32 [[K:v[0-9]+]], 0x7b
-; GFX900-NEXT: ds_read_u16 v3, v0
+; GFX900-NEXT: ds_read_u16 v2, v0
 ; GFX900-NEXT: ds_write_b16 v1, [[K]]
 ; GFX900-NEXT: ds_read_u16 v0, v0 offset:2
 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX900-NEXT: v_lshl_or_b32 v0, v3, 16, v0
+; GFX900-NEXT: v_lshl_or_b32 v0, v2, 16, v0
 ; GFX900-NEXT: s_setpc_b64
 define <2 x i16> @chain_hi_to_lo_group_may_alias_store(i16 addrspace(3)* %ptr, i16 addrspace(3)* %may.alias) {
 bb:
Index: llvm/trunk/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -31,25 +31,23 @@
 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GCN-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN-NEXT: v_writelane_b32 v32, s34, 2
-; GCN-NEXT: v_writelane_b32 v32, s36, 0
+; GCN-NEXT: v_writelane_b32 v32, s30, 0
 ; GCN-NEXT: s_mov_b32 s34, s32
 ; GCN-NEXT: s_add_u32 s32, s32, 0x400
-; GCN-NEXT: v_writelane_b32 v32, s37, 1
 ; GCN-NEXT: s_getpc_b64 s[4:5]
 ; GCN-NEXT: s_add_u32 s4, s4, func_v2f32@rel32@lo+4
 ; GCN-NEXT: s_addc_u32 s5, s5, func_v2f32@rel32@hi+4
-; GCN-NEXT: s_mov_b64 s[36:37], s[30:31]
+; GCN-NEXT: v_writelane_b32 v32, s31, 1
 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT: s_mov_b64 s[30:31], s[36:37]
-; GCN-NEXT: v_readlane_b32 s37, v32, 1
-; GCN-NEXT: v_readlane_b32 s36, v32, 0
+; GCN-NEXT: v_readlane_b32 s4, v32, 0
+; GCN-NEXT: v_readlane_b32 s5, v32, 1
 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400
 ; GCN-NEXT: v_readlane_b32 s34, v32, 2
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
 ; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: s_setpc_b64 s[4:5]
 bb0:
   %split.ret.type = call <2 x float> @func_v2f32()
   br label %bb1
@@ -67,25 +65,23 @@
 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GCN-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN-NEXT: v_writelane_b32 v32, s34, 2
-; GCN-NEXT: v_writelane_b32 v32, s36, 0
+; GCN-NEXT: v_writelane_b32 v32, s30, 0
 ; GCN-NEXT: s_mov_b32 s34, s32
 ; GCN-NEXT: s_add_u32 s32, s32, 0x400
-; GCN-NEXT: v_writelane_b32 v32, s37, 1
 ; GCN-NEXT: s_getpc_b64 s[4:5]
 ; GCN-NEXT: s_add_u32 s4, s4, func_v3f32@rel32@lo+4
 ; GCN-NEXT: s_addc_u32 s5, s5, func_v3f32@rel32@hi+4
-; GCN-NEXT: s_mov_b64 s[36:37], s[30:31]
+; GCN-NEXT: v_writelane_b32 v32, s31, 1
 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT: s_mov_b64 s[30:31], s[36:37]
-; GCN-NEXT: v_readlane_b32 s37, v32, 1
-; GCN-NEXT: v_readlane_b32 s36, v32, 0
+; GCN-NEXT: v_readlane_b32 s4, v32, 0
+; GCN-NEXT: v_readlane_b32 s5, v32, 1
 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400
 ; GCN-NEXT: v_readlane_b32 s34, v32, 2
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
 ; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: s_setpc_b64 s[4:5]
 bb0:
   %split.ret.type = call <3 x float> @func_v3f32()
   br label %bb1
@@ -103,25 +99,23 @@
 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GCN-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN-NEXT: v_writelane_b32 v32, s34, 2
-; GCN-NEXT: v_writelane_b32 v32, s36, 0
+; GCN-NEXT: v_writelane_b32 v32, s30, 0
 ; GCN-NEXT: s_mov_b32 s34, s32
 ; GCN-NEXT: s_add_u32 s32, s32, 0x400
-; GCN-NEXT: v_writelane_b32 v32, s37, 1
 ; GCN-NEXT: s_getpc_b64 s[4:5]
 ; GCN-NEXT: s_add_u32 s4, s4, func_v4f16@rel32@lo+4
 ; GCN-NEXT: s_addc_u32 s5, s5, func_v4f16@rel32@hi+4
-; GCN-NEXT: s_mov_b64 s[36:37], s[30:31]
+; GCN-NEXT: v_writelane_b32 v32, s31, 1
 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT: s_mov_b64 s[30:31], s[36:37]
-; GCN-NEXT: v_readlane_b32 s37, v32, 1
-; GCN-NEXT: v_readlane_b32 s36, v32, 0
+; GCN-NEXT: v_readlane_b32 s4, v32, 0
+; GCN-NEXT: v_readlane_b32 s5, v32, 1
 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400
 ; GCN-NEXT: v_readlane_b32 s34, v32, 2
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
 ; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: s_setpc_b64 s[4:5]
 bb0:
   %split.ret.type = call <4 x half> @func_v4f16()
   br label %bb1
@@ -139,26 +133,24 @@
 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GCN-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN-NEXT: v_writelane_b32 v32, s34, 2
-; GCN-NEXT: v_writelane_b32 v32, s36, 0
+; GCN-NEXT: v_writelane_b32 v32, s30, 0
 ; GCN-NEXT: s_mov_b32 s34, s32
 ; GCN-NEXT: s_add_u32 s32, s32, 0x400
-; GCN-NEXT: v_writelane_b32 v32, s37, 1
 ; GCN-NEXT: s_getpc_b64 s[4:5]
 ; GCN-NEXT: s_add_u32 s4, s4, func_struct@rel32@lo+4
 ; GCN-NEXT: s_addc_u32 s5, s5, func_struct@rel32@hi+4
-; GCN-NEXT: s_mov_b64 s[36:37], s[30:31]
+; GCN-NEXT: v_writelane_b32 v32, s31, 1
 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT: s_mov_b64 s[30:31], s[36:37]
-; GCN-NEXT: v_readlane_b32 s37, v32, 1
-; GCN-NEXT: v_readlane_b32 s36, v32, 0
+; GCN-NEXT: v_readlane_b32 s4, v32, 0
+; GCN-NEXT: v_readlane_b32 s5, v32, 1
 ; GCN-NEXT: v_mov_b32_e32 v1, v4
 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400
 ; GCN-NEXT: v_readlane_b32 s34, v32, 2
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
 ; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: s_setpc_b64 s[4:5]
 bb0:
   %split.ret.type = call { <4 x i32>, <4 x half> } @func_struct()
   br label %bb1
Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.log.f16.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.log.f16.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.log.f16.ll
@@ -35,15 +35,15 @@
 ; SI: s_mov_b32 [[A_F32_2:s[0-9]+]], 0x3f317218
 ; VIGFX9: s_movk_i32 [[A_F32_2:s[0-9]+]], 0x398c
 ; VI: v_mov_b32_e32 [[A_F32_2_V:v[0-9]+]], [[A_F32_2]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_0]]
 ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_F16_0]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_0]]
-; SI: v_log_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_1]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_0]]
 ; SI: v_log_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
-; SI: v_mul_f32_e32 v[[R_F32_6:[0-9]+]], [[A_F32_2]], v[[R_F32_1]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_6]]
+; SI: v_log_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
 ; SI: v_mul_f32_e32 v[[R_F32_5:[0-9]+]], [[A_F32_2]], v[[R_F32_0]]
 ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_5]]
+; SI: v_mul_f32_e32 v[[R_F32_6:[0-9]+]], [[A_F32_2]], v[[R_F32_1]]
+; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_6]]
 ; GFX9: v_log_f16_e32 v[[R_F16_2:[0-9]+]], v[[A_F16_0]]
 ; VIGFX9: v_log_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_F16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_F16_0]]
Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.log10.f16.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.log10.f16.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.log10.f16.ll
@@ -35,15 +35,15 @@
 ; SI: s_mov_b32 [[A_F32_2:s[0-9]+]], 0x3e9a209a
 ; VIGFX9: s_movk_i32 [[A_F32_2:s[0-9]+]], 0x34d1
 ; VI: v_mov_b32_e32 [[A_F32_2_V:v[0-9]+]], [[A_F32_2]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_0]]
 ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_F16_0]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_0]]
-; SI: v_log_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_1]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_0]]
 ; SI: v_log_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
-; SI: v_mul_f32_e32 v[[R_F32_6:[0-9]+]], [[A_F32_2]], v[[R_F32_1]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_6]]
+; SI: v_log_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
 ; SI: v_mul_f32_e32 v[[R_F32_5:[0-9]+]], [[A_F32_2]], v[[R_F32_0]]
 ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_5]]
+; SI: v_mul_f32_e32 v[[R_F32_6:[0-9]+]], [[A_F32_2]], v[[R_F32_1]]
+; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_6]]
 ; GFX9: v_log_f16_e32 v[[R_F16_2:[0-9]+]], v[[A_F16_0]]
 ; VIGFX9: v_log_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_F16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_F16_0]]
Index: llvm/trunk/test/CodeGen/AMDGPU/load-lo16.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/load-lo16.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/load-lo16.ll
@@ -271,12 +271,12 @@
 ; GCN-LABEL: {{^}}load_local_lo_v2i16_reghi_vreg_multi_use_lohi:
 ; GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900: ds_read_u16 v0, v0
-; GFX900: v_lshrrev_b32_e32 v4, 16, v1
+; GFX900: v_lshrrev_b32_e32 v[[A_F16:[0-9]+]], 16, v1
+; GFX900: v_mov_b32_e32 v[[A_F32:[0-9]+]], 0xffff
 ; GFX900: s_waitcnt lgkmcnt(0)
 ; GFX900: ds_write_b16 v2, v0
-; GFX900: ds_write_b16 v3, v4
-; GFX900: v_mov_b32_e32 v2, 0xffff
-; GFX900: v_bfi_b32 v0, v2, v0, v1
+; GFX900: ds_write_b16 v3, v[[A_F16]]
+; GFX900: v_bfi_b32 v0, v[[A_F32]], v0, v1
 ; GFX900: global_store_dword v[0:1], v0, off
 ; GFX900: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX900: s_setpc_b64 s[30:31]
Index: llvm/trunk/test/CodeGen/AMDGPU/nested-calls.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/nested-calls.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/nested-calls.ll
@@ -17,13 +17,13 @@
 ; GCN-DAG: v_writelane_b32 v32, s34, 2
 ; GCN-DAG: s_mov_b32 s34, s32
 ; GCN-DAG: s_add_u32 s32, s32, 0x400
-; GCN-DAG: v_writelane_b32 v32, s36, 0
-; GCN-DAG: v_writelane_b32 v32, s37, 1
+; GCN-DAG: v_writelane_b32 v32, s30, 0
+; GCN-DAG: v_writelane_b32 v32, s31, 1
 ; GCN: s_swappc_b64
-; GCN: v_readlane_b32 s37, v32, 1
-; GCN: v_readlane_b32 s36, v32, 0
+; GCN: v_readlane_b32 s4, v32, 0
+; GCN: v_readlane_b32 s5, v32, 1
 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400
 ; GCN-NEXT: v_readlane_b32 s34, v32, 2
@@ -31,7 +31,7 @@
 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
 ; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64
+; GCN-NEXT: s_setpc_b64 s[4:5]
 define void @test_func_call_external_void_func_i32_imm() #0 {
   call void @external_void_func_i32(i32 42)
   ret void
Index: llvm/trunk/test/CodeGen/AMDGPU/wave32.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/wave32.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/wave32.ll
@@ -1082,11 +1082,11 @@
 ; GFX1032: s_add_u32 s32, s32, 0x200
-; GCN-DAG: v_writelane_b32 v32, s36, 0
-; GCN-DAG: v_writelane_b32 v32, s37, 1
+; GCN-DAG: v_writelane_b32 v32, s30, 0
+; GCN-DAG: v_writelane_b32 v32, s31, 1
 ; GCN: s_swappc_b64
-; GCN-DAG: v_readlane_b32 s36, v32, 0
-; GCN-DAG: v_readlane_b32 s37, v32, 1
+; GCN-DAG: v_readlane_b32 s4, v32, 0
+; GCN-DAG: v_readlane_b32 s5, v32, 1
 ; GFX1064: s_sub_u32 s32, s32, 0x400