Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -2617,20 +2617,12 @@ SmallVector CopyFromChains; - unsigned OffsetReg = Info->getScratchWaveOffsetReg(); - // In the HSA case, this should be an identity copy. SDValue ScratchRSrcReg = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32); RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg); CopyFromChains.push_back(ScratchRSrcReg.getValue(1)); - // TODO: Don't hardcode these registers and get from the callee function. - SDValue ScratchWaveOffsetReg - = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32); - RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg); - CopyFromChains.push_back(ScratchWaveOffsetReg.getValue(1)); - if (!Info->isEntryFunction()) { // Avoid clobbering this function's FP value. In the current convention // callee will overwrite this, so do save/restore around the call site. Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -69,7 +69,7 @@ // Non-entry functions have no special inputs for now, other registers // required for scratch access. ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3; - ScratchWaveOffsetReg = AMDGPU::SGPR4; + ScratchWaveOffsetReg = AMDGPU::SGPR33; FrameOffsetReg = AMDGPU::SGPR5; StackPtrOffsetReg = AMDGPU::SGPR32; Index: test/CodeGen/AMDGPU/bswap.ll =================================================================== --- test/CodeGen/AMDGPU/bswap.ll +++ test/CodeGen/AMDGPU/bswap.ll @@ -723,8 +723,8 @@ ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v0, v0, 8 ; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24 -; SI-NEXT: s_mov_b32 s6, 0xff00ff -; SI-NEXT: v_bfi_b32 v0, s6, v0, v1 +; SI-NEXT: s_mov_b32 s4, 0xff00ff +; SI-NEXT: v_bfi_b32 v0, s4, v0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -735,8 +735,8 @@ ; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; VI-NEXT: v_alignbit_b32 v1, v0, v0, 8 ; VI-NEXT: v_alignbit_b32 v0, v0, v0, 24 -; VI-NEXT: s_mov_b32 s6, 0xff00ff -; VI-NEXT: v_bfi_b32 v0, s6, v0, v1 +; VI-NEXT: s_mov_b32 s4, 0xff00ff +; VI-NEXT: v_bfi_b32 v0, s4, v0, v1 ; VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: s_setpc_b64 s[30:31] bb: Index: test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll =================================================================== --- test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll +++ test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll @@ -80,8 +80,8 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_movk_i32 s6, 0x63 -; GFX9-NEXT: v_pk_add_u16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: s_movk_i32 s4, 0x63 +; GFX9-NEXT: v_pk_add_u16 v0, v0, s4 op_sel_hi:[1,0] ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v0 ; GFX9-NEXT: ;;#ASMEND @@ -273,8 +273,8 @@ ; GFX9-LABEL: undef_hi_op_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s6, 0x63 -; GFX9-NEXT: v_pk_add_u16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: s_movk_i32 s4, 0x63 +; GFX9-NEXT: v_pk_add_u16 v0, v0, s4 op_sel_hi:[1,0] ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v0 ; GFX9-NEXT: ;;#ASMEND Index: test/CodeGen/AMDGPU/call-argument-types.ll =================================================================== --- test/CodeGen/AMDGPU/call-argument-types.ll +++ test/CodeGen/AMDGPU/call-argument-types.ll @@ -81,14 +81,14 @@ ; MESA: s_mov_b32 s33, s3{{$}} ; HSA: s_mov_b32 s33, s9{{$}} +; HSA: buffer_load_ubyte [[VAR:v[0-9]+]] +; HSA: s_mov_b32 s32, s33 + ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_signext@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_signext@rel32@hi+4 -; GCN-NEXT: buffer_load_ubyte [[VAR:v[0-9]+]] -; HSA-NEXT: s_mov_b32 s4, s33 -; HSA-NEXT: s_mov_b32 s32, s33 -; MESA-DAG: s_mov_b32 s4, s33{{$}} +; MESA-DAG: buffer_load_ubyte [[VAR:v[0-9]+]] ; MESA-DAG: s_mov_b32 s32, s33{{$}} ; GCN: s_waitcnt vmcnt(0) @@ -105,13 +105,15 @@ ; GCN-LABEL: {{^}}test_call_external_void_func_i1_zeroext: ; MESA: s_mov_b32 s33, s3{{$}} +; HSA: buffer_load_ubyte v0 +; HSA-DAG: s_mov_b32 s32, s33{{$}} + ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_zeroext@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_zeroext@rel32@hi+4 -; GCN-NEXT: buffer_load_ubyte v0 -; GCN-DAG: s_mov_b32 s4, s33{{$}} -; GCN-DAG: s_mov_b32 s32, s33{{$}} +; MESA: buffer_load_ubyte v0 +; MESA-DAG: s_mov_b32 s32, s33{{$}} ; GCN: s_waitcnt vmcnt(0) ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 @@ -131,7 +133,6 @@ ; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8@rel32@hi+4 ; GCN-DAG: v_mov_b32_e32 v0, 0x7b -; HSA-DAG: s_mov_b32 s4, s33{{$}} ; GCN-DAG: s_mov_b32 s32, s33{{$}} ; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} @@ -151,7 +152,6 @@ ; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_signext@rel32@lo+4 ; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_signext@rel32@hi+4 -; GCN-DAG: s_mov_b32 s4, s33 ; GCN-DAG: s_mov_b32 s32, s3 ; GCN-NOT: s_waitcnt @@ -186,7 +186,6 @@ ; GCN-LABEL: {{^}}test_call_external_void_func_i16_imm: ; GCN-DAG: v_mov_b32_e32 v0, 0x7b{{$}} -; GCN-DAG: s_mov_b32 s4, s33 ; GCN-DAG: s_mov_b32 s32, s33 ; GCN: s_swappc_b64 @@ -240,7 +239,6 @@ ; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i32@rel32@lo+4 ; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i32@rel32@hi+4 ; GCN-DAG: v_mov_b32_e32 v0, 42 -; GCN-DAG: s_mov_b32 s4, s33 ; GCN-DAG: s_mov_b32 s32, s33 ; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} Index: test/CodeGen/AMDGPU/call-graph-register-usage.ll =================================================================== --- test/CodeGen/AMDGPU/call-graph-register-usage.ll +++ test/CodeGen/AMDGPU/call-graph-register-usage.ll @@ -13,14 +13,14 @@ } ; GCN-LABEL: {{^}}indirect_use_vcc: -; GCN: v_writelane_b32 v32, s33, 0 -; GCN: v_writelane_b32 v32, s34, 1 -; GCN: v_writelane_b32 v32, s35, 2 +; GCN: v_writelane_b32 v32, s34, 0 +; GCN: v_writelane_b32 v32, s35, 1 +; GCN: v_writelane_b32 v32, s36, 2 ; GCN: s_swappc_b64 -; GCN: v_readlane_b32 s35, v32, 2 -; GCN: v_readlane_b32 s34, v32, 1 -; GCN: v_readlane_b32 s33, v32, 0 -; GCN: ; NumSgprs: 38 +; GCN: v_readlane_b32 s36, v32, 2 +; GCN: v_readlane_b32 s35, v32, 1 +; GCN: v_readlane_b32 s34, v32, 0 +; GCN: ; NumSgprs: 39 ; GCN: ; NumVgprs: 33 define void @indirect_use_vcc() #1 { call void @use_vcc() @@ -29,8 +29,8 @@ ; GCN-LABEL: {{^}}indirect_2level_use_vcc_kernel: ; GCN: is_dynamic_callstack = 0 -; CI: ; NumSgprs: 40 -; VI-NOBUG: ; NumSgprs: 42 +; CI: ; NumSgprs: 41 +; VI-NOBUG: ; NumSgprs: 43 ; VI-BUG: ; NumSgprs: 96 ; GCN: ; NumVgprs: 33 define amdgpu_kernel void @indirect_2level_use_vcc_kernel(i32 addrspace(1)* %out) #0 { @@ -48,8 +48,8 @@ } ; GCN-LABEL: {{^}}indirect_use_flat_scratch: -; CI: ; NumSgprs: 40 -; VI: ; NumSgprs: 42 +; CI: ; NumSgprs: 41 +; VI: ; NumSgprs: 43 ; GCN: ; NumVgprs: 33 define void @indirect_use_flat_scratch() #1 { call void @use_flat_scratch() @@ -58,8 +58,8 @@ ; GCN-LABEL: {{^}}indirect_2level_use_flat_scratch_kernel: ; GCN: is_dynamic_callstack = 0 -; CI: ; NumSgprs: 40 -; VI-NOBUG: ; NumSgprs: 42 +; CI: ; NumSgprs: 41 +; VI-NOBUG: ; NumSgprs: 43 ; VI-BUG: ; NumSgprs: 96 ; GCN: ; NumVgprs: 33 define amdgpu_kernel void @indirect_2level_use_flat_scratch_kernel(i32 addrspace(1)* %out) #0 { Index: test/CodeGen/AMDGPU/call-preserved-registers.ll =================================================================== --- test/CodeGen/AMDGPU/call-preserved-registers.ll +++ test/CodeGen/AMDGPU/call-preserved-registers.ll @@ -6,14 +6,12 @@ ; GCN-LABEL: {{^}}test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: ; GCN: s_mov_b32 s33, s7 -; GCN: s_mov_b32 s4, s33 -; GCN-NEXT: s_getpc_b64 s[34:35] +; GCN: s_getpc_b64 s[34:35] ; GCN-NEXT: s_add_u32 s34, s34, ; GCN-NEXT: s_addc_u32 s35, s35, ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN: s_swappc_b64 s[30:31], s[34:35] -; GCN-NEXT: s_mov_b32 s4, s33 ; GCN-NEXT: #ASMSTART ; GCN-NEXT: #ASMEND ; GCN-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -25,25 +23,25 @@ } ; GCN-LABEL: {{^}}test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: -; GCN: v_writelane_b32 v32, s33, 0 -; GCN: v_writelane_b32 v32, s34, 1 -; GCN: v_writelane_b32 v32, s35, 2 -; GCN: v_writelane_b32 v32, s36, 3 -; GCN: v_writelane_b32 v32, s37, 4 +; GCN: v_writelane_b32 v32, s34, 0 +; GCN: v_writelane_b32 v32, s35, 1 +; GCN: v_writelane_b32 v32, s36, 2 +; GCN: v_writelane_b32 v32, s37, 3 +; GCN: v_writelane_b32 v32, s38, 4 -; GCN: s_mov_b32 s33, s5 +; GCN: s_mov_b32 [[COPY_FP:s[0-9]+]], s5 ; GCN-NEXT: s_swappc_b64 -; GCN-NEXT: s_mov_b32 s5, s33 -; GCN-NEXT: s_mov_b32 s33, s5 +; GCN-NEXT: s_mov_b32 s5, [[COPY_FP]] +; GCN-NEXT: s_mov_b32 [[COPY_FP]], s5 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_swappc_b64 -; GCN-DAG: s_mov_b32 s5, s33 -; GCN-DAG: v_readlane_b32 s37, v32, 4 -; GCN: v_readlane_b32 s36, v32, 3 -; GCN: v_readlane_b32 s35, v32, 2 -; GCN: v_readlane_b32 s34, v32, 1 -; GCN: v_readlane_b32 s33, v32, 0 +; GCN-DAG: s_mov_b32 s5, [[COPY_FP]] +; GCN-DAG: v_readlane_b32 s38, v32, 4 +; GCN: v_readlane_b32 s37, v32, 3 +; GCN: v_readlane_b32 s36, v32, 2 +; GCN: v_readlane_b32 s35, v32, 1 +; GCN: v_readlane_b32 s34, v32, 0 ; GCN: s_setpc_b64 define void @test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 { call void @external_void_func_void() @@ -54,12 +52,12 @@ ; FIXME: Avoid extra restore of FP in between calls. ; GCN-LABEL: {{^}}test_func_call_external_void_funcx2: -; GCN: s_mov_b32 s33, s5 +; GCN: s_mov_b32 [[COPY_FP:s[0-9]+]], s5 ; GCN-NEXT: s_swappc_b64 -; GCN-NEXT: s_mov_b32 s5, s33 -; GCN-NEXT: s_mov_b32 s33, s5 +; GCN-NEXT: s_mov_b32 s5, [[COPY_FP]] +; GCN-NEXT: s_mov_b32 [[COPY_FP]], s5 ; GCN-NEXT: s_swappc_b64 -; GCN: s_mov_b32 s5, s33 +; GCN: s_mov_b32 s5, [[COPY_FP]] define void @test_func_call_external_void_funcx2() #0 { call void @external_void_func_void() call void @external_void_func_void() @@ -127,19 +125,23 @@ ret void } +; FIXME: What is the expected behavior for reserved registers here? + ; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s33: ; GCN: s_mov_b32 s34, s9 -; GCN: s_mov_b32 s4, s34 -; GCN-DAG: s_mov_b32 s32, s34 -; GCN-DAG: ; def s33 -; GCN-DAG: #ASMEND -; GCN-DAG: s_getpc_b64 s[6:7] -; GCN-DAG: s_add_u32 s6, s6, external_void_func_void@rel32@lo+4 -; GCN-DAG: s_addc_u32 s7, s7, external_void_func_void@rel32@hi+4 -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GCN-NEXT: ;;#ASMSTART +; GCN-NOT: s33 +; GCN: #ASMSTART +; GCN-NEXT: ; def s33 +; GCN-NEXT: #ASMEND +; GCN: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+4 +; GCN: s_mov_b32 s32, s34 +; GCN: s_swappc_b64 s[30:31], s[4:5] +; GCN: ;;#ASMSTART ; GCN-NEXT: ; use s33 ; GCN-NEXT: ;;#ASMEND +; GCN-NOT: s33 ; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_void_func_void_preserves_s33(i32 addrspace(1)* %out) #0 { %s33 = call i32 asm sideeffect "; def $0", "={s33}"() @@ -148,17 +150,54 @@ ret void } +; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s34: +; GCN: s_mov_b32 s33, s9 +; GCN-NOT: s34 +; GCN: ;;#ASMSTART +; GCN-NEXT: ; def s34 +; GCN-NEXT: ;;#ASMEND + +; GCN-NOT: s34 + +; GCN: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+4 + +; GCN-NOT: s34 +; GCN: s_swappc_b64 s[30:31], s[4:5] + +; GCN-NOT: s34 + +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; use s34 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_endpgm +define amdgpu_kernel void @test_call_void_func_void_preserves_s34(i32 addrspace(1)* %out) #0 { + %s34 = call i32 asm sideeffect "; def $0", "={s34}"() + call void @external_void_func_void() + call void asm sideeffect "; use $0", "{s34}"(i32 %s34) + ret void +} + ; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v32: ; GCN: s_mov_b32 s33, s9 -; GCN: s_mov_b32 s4, s33 + +; GCN: ;;#ASMSTART +; GCN-NEXT: ; def v32 +; GCN-NEXT: ;;#ASMEND + +; GCN-NOT: v32 +; GCN: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+4 +; GCN-NOT: v32 ; GCN-DAG: s_mov_b32 s32, s33 -; GCN-DAG: ; def v32 -; GCN-DAG: #ASMEND -; GCN-DAG: s_getpc_b64 s[6:7] -; GCN-DAG: s_add_u32 s6, s6, external_void_func_void@rel32@lo+4 -; GCN-DAG: s_addc_u32 s7, s7, external_void_func_void@rel32@hi+4 -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GCN-NEXT: ;;#ASMSTART + +; GCN: s_swappc_b64 s[30:31], s[4:5] + +; GCN-NOT: v32 + +; GCN: ;;#ASMSTART ; GCN-NEXT: ; use v32 ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_endpgm @@ -181,10 +220,22 @@ ret void } +; GCN-LABEL: {{^}}void_func_void_clobber_s34: +; GCN: v_writelane_b32 v0, s34, 0 +; GCN-NEXT: #ASMSTART +; GCN-NEXT: ; clobber +; GCN-NEXT: #ASMEND +; GCN-NEXT: v_readlane_b32 s34, v0, 0 +; GCN: s_setpc_b64 +define hidden void @void_func_void_clobber_s34() #2 { + call void asm sideeffect "; clobber", "~{s34}"() #0 + ret void +} + ; GCN-LABEL: {{^}}test_call_void_func_void_clobber_s33: ; GCN: s_mov_b32 s33, s7 -; GCN: s_mov_b32 s4, s33 -; GCN-NEXT: s_getpc_b64 + +; GCN: s_getpc_b64 ; GCN-NEXT: s_add_u32 ; GCN-NEXT: s_addc_u32 ; GCN-NEXT: s_mov_b32 s32, s33 @@ -195,6 +246,19 @@ ret void } +; GCN-LABEL: {{^}}test_call_void_func_void_clobber_s34: +; GCN: s_mov_b32 s33, s7 +; GCN: s_getpc_b64 +; GCN-NEXT: s_add_u32 +; GCN-NEXT: s_addc_u32 +; GCN-NEXT: s_mov_b32 s32, s33 +; GCN: s_swappc_b64 +; GCN-NEXT: s_endpgm +define amdgpu_kernel void @test_call_void_func_void_clobber_s34() #0 { + call void @void_func_void_clobber_s34() + ret void +} + ; GCN-LABEL: {{^}}callee_saved_sgpr_func: ; GCN-NOT: s40 ; GCN: v_writelane_b32 v32, s40 Index: test/CodeGen/AMDGPU/call-waitcnt.ll =================================================================== --- test/CodeGen/AMDGPU/call-waitcnt.ll +++ test/CodeGen/AMDGPU/call-waitcnt.ll @@ -13,11 +13,10 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: ds_read_b32 v0, v0 -; GCN-NEXT: s_mov_b32 s4, s33 -; GCN-NEXT: s_getpc_b64 s[6:7] -; GCN-NEXT: s_add_u32 s6, s6, func@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s7, s7, func@rel32@hi+4 -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, func@rel32@hi+4 +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: s_endpgm %vgpr = load volatile i32, i32 addrspace(3)* %ptr call void @func(i32 %vgpr) @@ -40,7 +39,6 @@ ; GCN-NEXT: s_add_u32 s6, s6, func@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s7, s7, func@rel32@hi+4 ; GCN-NEXT: global_store_dword v[0:1], v2, off -; GCN-NEXT: s_mov_b32 s4, s33 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] @@ -58,14 +56,13 @@ ; GCN-NEXT: s_mov_b32 s33, s9 ; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s33 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_mov_b32 s4, s33 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_getpc_b64 s[6:7] -; GCN-NEXT: s_add_u32 s6, s6, func@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s7, s7, func@rel32@hi+4 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, func@rel32@hi+4 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_mov_b32_e32 v32, 0 -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, s34 ; GCN-NEXT: v_mov_b32_e32 v1, s35 ; GCN-NEXT: global_store_dword v[0:1], v32, off @@ -82,13 +79,12 @@ ; GCN-NEXT: s_mov_b32 s33, s9 ; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s33 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_mov_b32 s4, s33 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_getpc_b64 s[6:7] -; GCN-NEXT: s_add_u32 s6, s6, func.return@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s7, s7, func.return@rel32@hi+4 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, func.return@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, func.return@rel32@hi+4 ; GCN-NEXT: s_mov_b32 s32, s33 -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, s34 ; GCN-NEXT: v_mov_b32_e32 v2, s35 ; GCN-NEXT: global_store_dword v[1:2], v0, off @@ -108,12 +104,11 @@ ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, got.func@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, got.func@gotpcrel32@hi+4 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: s_endpgm call void @got.func(i32 0) ret void Index: test/CodeGen/AMDGPU/callee-frame-setup.ll =================================================================== --- test/CodeGen/AMDGPU/callee-frame-setup.ll +++ test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -39,19 +39,19 @@ ; GCN-DAG: s_add_u32 s32, s32, 0x400{{$}} ; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4 -; GCN-DAG: v_writelane_b32 v32, s33, ; GCN-DAG: v_writelane_b32 v32, s34, ; GCN-DAG: v_writelane_b32 v32, s35, +; GCN-DAG: v_writelane_b32 v32, s36, ; GCN-DAG: v_mov_b32_e32 v0, 0{{$}} ; GCN-DAG: buffer_store_dword v0, off, s[0:3], s5{{$}} -; GCN-DAG: s_mov_b32 s33, s5 +; GCN-DAG: s_mov_b32 [[COPY_FP:s[0-9]+]], s5 ; GCN: s_swappc_b64 -; GCN-DAG: s_mov_b32 s5, s33 -; GCN-DAG: v_readlane_b32 s35, +; GCN-DAG: s_mov_b32 s5, [[COPY_FP]] ; GCN-DAG: v_readlane_b32 s34, -; GCN-DAG: v_readlane_b32 s33, +; GCN-DAG: v_readlane_b32 s35, +; GCN-DAG: v_readlane_b32 s36, ; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; GCN: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -74,14 +74,16 @@ ; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-DAG: v_writelane_b32 v32, s33, 0 -; GCN-DAG: v_writelane_b32 v32, s34, 1 -; GCN: s_mov_b32 s33, s5 +; GCN-DAG: v_writelane_b32 v32, s34, 0 +; GCN-DAG: v_writelane_b32 v32, s35, 1 +; GCN-DAG: v_writelane_b32 v32, s36, 2 +; GCN-DAG: s_mov_b32 [[COPY_FP:s[0-9]+]], s5 ; GCN: s_swappc_b64 -; GCN: s_mov_b32 s5, s33 +; GCN: s_mov_b32 s5, [[COPY_FP]] -; GCN-DAG: v_readlane_b32 s34, v32, 1 -; GCN-DAG: v_readlane_b32 s33, v32, 0 +; GCN-DAG: v_readlane_b32 s34, v32, 0 +; GCN-DAG: v_readlane_b32 s35, v32, 1 +; GCN-DAG: v_readlane_b32 s36, v32, 2 ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload Index: test/CodeGen/AMDGPU/callee-special-input-sgprs.ll =================================================================== --- test/CodeGen/AMDGPU/callee-special-input-sgprs.ll +++ test/CodeGen/AMDGPU/callee-special-input-sgprs.ll @@ -106,7 +106,7 @@ ; GCN-LABEL: {{^}}use_workgroup_id_x: ; GCN: s_waitcnt -; GCN: ; use s6 +; GCN: ; use s4 define hidden void @use_workgroup_id_x() #1 { %val = call i32 @llvm.amdgcn.workgroup.id.x() call void asm sideeffect "; use $0", "s"(i32 %val) @@ -117,7 +117,7 @@ ; GCN: s_waitcnt ; GCN-NOT: s32 ; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}} -; GCN: ; use s6 +; GCN: ; use s4 ; GCN: s_setpc_b64 define hidden void @use_stack_workgroup_id_x() #1 { %alloca = alloca i32, addrspace(5) @@ -129,7 +129,7 @@ ; GCN-LABEL: {{^}}use_workgroup_id_y: ; GCN: s_waitcnt -; GCN: ; use s6 +; GCN: ; use s4 define hidden void @use_workgroup_id_y() #1 { %val = call i32 @llvm.amdgcn.workgroup.id.y() call void asm sideeffect "; use $0", "s"(i32 %val) @@ -138,7 +138,7 @@ ; GCN-LABEL: {{^}}use_workgroup_id_z: ; GCN: s_waitcnt -; GCN: ; use s6 +; GCN: ; use s4 define hidden void @use_workgroup_id_z() #1 { %val = call i32 @llvm.amdgcn.workgroup.id.z() call void asm sideeffect "; use $0", "s"(i32 %val) @@ -146,8 +146,8 @@ } ; GCN-LABEL: {{^}}use_workgroup_id_xy: +; GCN: ; use s4 ; GCN: ; use s6 -; GCN: ; use s7 define hidden void @use_workgroup_id_xy() #1 { %val0 = call i32 @llvm.amdgcn.workgroup.id.x() %val1 = call i32 @llvm.amdgcn.workgroup.id.y() @@ -157,9 +157,9 @@ } ; GCN-LABEL: {{^}}use_workgroup_id_xyz: +; GCN: ; use s4 ; GCN: ; use s6 ; GCN: ; use s7 -; GCN: ; use s8 define hidden void @use_workgroup_id_xyz() #1 { %val0 = call i32 @llvm.amdgcn.workgroup.id.x() %val1 = call i32 @llvm.amdgcn.workgroup.id.y() @@ -171,8 +171,8 @@ } ; GCN-LABEL: {{^}}use_workgroup_id_xz: +; GCN: ; use s4 ; GCN: ; use s6 -; GCN: ; use s7 define hidden void @use_workgroup_id_xz() #1 { %val0 = call i32 @llvm.amdgcn.workgroup.id.x() %val1 = call i32 @llvm.amdgcn.workgroup.id.z() @@ -182,8 +182,8 @@ } ; GCN-LABEL: {{^}}use_workgroup_id_yz: +; GCN: ; use s4 ; GCN: ; use s6 -; GCN: ; use s7 define hidden void @use_workgroup_id_yz() #1 { %val0 = call i32 @llvm.amdgcn.workgroup.id.y() %val1 = call i32 @llvm.amdgcn.workgroup.id.z() @@ -198,12 +198,13 @@ ; GCN: enable_sgpr_workgroup_id_z = 0 ; GCN-NOT: s6 -; GCN: s_mov_b32 s33, s7 -; GCN-NOT: s6 -; GCN: s_mov_b32 s4, s33 -; GCN-NOT: s6 +; GCN: s_mov_b32 s4, s6 +; GCN-NEXT: s_getpc_b64 s[6:7] +; GCN-NEXT: s_add_u32 s6, s6, use_workgroup_id_x@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s7, s7, use_workgroup_id_x@rel32@hi+4 ; GCN: s_mov_b32 s32, s33 ; GCN: s_swappc_b64 +; GCN-NEXT: s_endpgm define amdgpu_kernel void @kern_indirect_use_workgroup_id_x() #1 { call void @use_workgroup_id_x() ret void @@ -215,8 +216,7 @@ ; GCN: enable_sgpr_workgroup_id_z = 0 ; GCN: s_mov_b32 s33, s8 -; GCN-DAG: s_mov_b32 s4, s33 -; GCN-DAG: s_mov_b32 s6, s7 +; GCN-DAG: s_mov_b32 s4, s7 ; GCN: s_mov_b32 s32, s33 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workgroup_id_y() #1 { @@ -228,10 +228,9 @@ ; GCN: enable_sgpr_workgroup_id_x = 1 ; GCN: enable_sgpr_workgroup_id_y = 0 ; GCN: enable_sgpr_workgroup_id_z = 1 - ; GCN: s_mov_b32 s33, s8 -; GCN-DAG: s_mov_b32 s4, s33 -; GCN-DAG: s_mov_b32 s6, s7 +; GCN: s_mov_b32 s4, s7 + ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workgroup_id_z() #1 { call void @use_workgroup_id_z() @@ -244,14 +243,10 @@ ; GCN: enable_sgpr_workgroup_id_z = 0 ; GCN: s_mov_b32 s33, s8 -; GCN-NOT: s6 -; GCN-NOT: s7 -; GCN: s_mov_b32 s4, s33 -; GCN-NOT: s6 -; GCN-NOT: s7 + +; GCN: s_mov_b32 s4, s6 +; GCN: s_mov_b32 s6, s7 ; GCN: s_mov_b32 s32, s33 -; GCN-NOT: s6 -; GCN-NOT: s7 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workgroup_id_xy() #1 { call void @use_workgroup_id_xy() @@ -265,22 +260,11 @@ ; GCN: s_mov_b32 s33, s9 -; GCN-NOT: s6 -; GCN-NOT: s7 -; GCN-NOT: s8 - -; GCN: s_mov_b32 s4, s33 - -; GCN-NOT: s6 -; GCN-NOT: s7 -; GCN-NOT: s8 +; GCN: s_mov_b32 s4, s6 +; GCN: s_mov_b32 s6, s7 +; GCN: s_mov_b32 s7, s8 ; GCN: s_mov_b32 s32, s33 - -; GCN-NOT: s6 -; GCN-NOT: s7 -; GCN-NOT: s8 - ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workgroup_id_xyz() #1 { call void @use_workgroup_id_xyz() @@ -293,16 +277,10 @@ ; GCN: enable_sgpr_workgroup_id_z = 1 ; GCN: s_mov_b32 s33, s8 -; GCN-NOT: s6 -; GCN-NOT: s7 - -; GCN: s_mov_b32 s4, s33 -; GCN-NOT: s6 -; GCN-NOT: s7 +; GCN: s_mov_b32 s4, s6 +; GCN: s_mov_b32 s6, s7 ; GCN: s_mov_b32 s32, s33 -; GCN-NOT: s6 -; GCN-NOT: s7 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workgroup_id_xz() #1 { @@ -316,9 +294,8 @@ ; GCN: enable_sgpr_workgroup_id_z = 1 ; GCN: s_mov_b32 s33, s9 -; GCN: s_mov_b32 s6, s7 -; GCN: s_mov_b32 s4, s33 -; GCN: s_mov_b32 s7, s8 +; GCN: s_mov_b32 s6, s8 +; GCN: s_mov_b32 s4, s7 ; GCN: s_mov_b32 s32, s33 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workgroup_id_yz() #1 { @@ -328,21 +305,21 @@ ; Argument is in right place already ; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_x: -; GCN-NOT: s6 +; GCN-NOT: s4 define hidden void @func_indirect_use_workgroup_id_x() #1 { call void @use_workgroup_id_x() ret void } ; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_y: -; GCN-NOT: s6 +; GCN-NOT: s4 define hidden void @func_indirect_use_workgroup_id_y() #1 { call void @use_workgroup_id_y() ret void } ; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_z: -; GCN-NOT: s6 +; GCN-NOT: s4 define hidden void @func_indirect_use_workgroup_id_z() #1 { call void @use_workgroup_id_z() ret void @@ -350,7 +327,7 @@ ; GCN-LABEL: {{^}}other_arg_use_workgroup_id_x: ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 -; GCN: ; use s6 +; GCN: ; use s4 define hidden void @other_arg_use_workgroup_id_x(i32 %arg0) #1 { %val = call i32 @llvm.amdgcn.workgroup.id.x() store volatile i32 %arg0, i32 addrspace(1)* undef @@ -360,7 +337,7 @@ ; GCN-LABEL: {{^}}other_arg_use_workgroup_id_y: ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 -; GCN: ; use s6 +; GCN: ; use s4 define hidden void @other_arg_use_workgroup_id_y(i32 %arg0) #1 { %val = call i32 @llvm.amdgcn.workgroup.id.y() store volatile i32 %arg0, i32 addrspace(1)* undef @@ -370,7 +347,7 @@ ; GCN-LABEL: {{^}}other_arg_use_workgroup_id_z: ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 -; GCN: ; use s6 +; GCN: ; use s4 define hidden void @other_arg_use_workgroup_id_z(i32 %arg0) #1 { %val = call i32 @llvm.amdgcn.workgroup.id.z() store volatile i32 %arg0, i32 addrspace(1)* undef @@ -383,12 +360,11 @@ ; GCN: enable_sgpr_workgroup_id_y = 0 ; GCN: enable_sgpr_workgroup_id_z = 0 -; GCN-NOT: s6 ; GCN-DAG: s_mov_b32 s33, s7 ; GCN-DAG: v_mov_b32_e32 v0, 0x22b -; GCN-DAG: s_mov_b32 s4, s33 +; GCN-DAG: s_mov_b32 s4, s6 ; GCN-DAG: s_mov_b32 s32, s33 -; GCN-NOT: s6 +; GCN-NOT: s4 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_x() #1 { call void @other_arg_use_workgroup_id_x(i32 555) @@ -402,8 +378,8 @@ ; GCN-DAG: s_mov_b32 s33, s8 ; GCN-DAG: v_mov_b32_e32 v0, 0x22b -; GCN-DAG: s_mov_b32 s4, s33 -; GCN-DAG: s_mov_b32 s6, s7 +; GCN-DAG: s_mov_b32 s4, s7 + ; GCN-DAG: s_mov_b32 s32, s33 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_y() #1 { @@ -416,10 +392,8 @@ ; GCN: enable_sgpr_workgroup_id_y = 0 ; GCN: enable_sgpr_workgroup_id_z = 1 -; GCN: s_mov_b32 s33, s8 +; GCN-DAG: s_mov_b32 s33, s8 ; GCN-DAG: v_mov_b32_e32 v0, 0x22b -; GCN-DAG: s_mov_b32 s4, s33 -; GCN-DAG: s_mov_b32 s6, s7 ; GCN: s_mov_b32 s32, s33 ; GCN: s_swappc_b64 @@ -440,9 +414,9 @@ ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s11 ; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} ; GCN: ; use s[12:13] +; GCN: ; use s4 ; GCN: ; use s14 ; GCN: ; use s15 -; GCN: ; use s16 define hidden void @use_every_sgpr_input() #1 { %alloca = alloca i32, align 4, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca @@ -492,7 +466,9 @@ ; GCN: s_mov_b64 s[10:11], s[8:9] ; GCN: s_mov_b64 s[8:9], s[6:7] ; GCN: s_mov_b64 s[6:7], s[4:5] -; GCN: s_mov_b32 s4, s33 +; GCN: s_mov_b32 s4, s14 +; GCN: s_mov_b32 s14, s15 +; GCN: s_mov_b32 s15, s16 ; GCN: s_mov_b32 s32, s33 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_every_sgpr_input() #1 { @@ -519,9 +495,11 @@ } ; GCN-LABEL: {{^}}func_use_every_sgpr_input_call_use_workgroup_id_xyz: -; GCN-DAG: s_mov_b32 s6, s14 -; GCN-DAG: s_mov_b32 s7, s15 -; GCN-DAG: s_mov_b32 s8, s16 +; GCN-NOT: s_mov_b32 s4 +; GCN: s_mov_b32 s6, s14 +; GCN-NEXT: s_mov_b32 s7, s15 +; GCN-NOT: s_mov_b32 s4 + ; GCN: s_swappc_b64 define hidden void @func_use_every_sgpr_input_call_use_workgroup_id_xyz() #1 { %alloca = alloca i32, align 4, addrspace(5) @@ -560,20 +538,18 @@ ; GCN-DAG: s_add_u32 s32, s32, 0x400 -; GCN-DAG: s_mov_b32 [[SAVE_X:s[0-57-9][0-9]*]], s14 -; GCN-DAG: s_mov_b32 [[SAVE_Y:s[0-68-9][0-9]*]], s15 -; GCN-DAG: s_mov_b32 [[SAVE_Z:s[0-79][0-9]*]], s16 -; GCN-DAG: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, s[6:7] -; GCN-DAG: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, s[8:9] -; GCN-DAG: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, s[10:11] - +; GCN: s_mov_b64 s{{\[}}[[LO_X:[0-9]+]]{{\:}}[[HI_X:[0-9]+]]{{\]}}, s[6:7] +; GCN-NOT: s_mov_b32 s4, ; GCN-DAG: s_mov_b32 s6, s14 ; GCN-DAG: s_mov_b32 s7, s15 -; GCN-DAG: s_mov_b32 s8, s16 -; GCN-DAG: s_mov_b64 s{{\[}}[[LO_X:[0-9]+]]{{\:}}[[HI_X:[0-9]+]]{{\]}}, s[6:7] -; GCN-DAG: s_mov_b64 s{{\[}}[[LO_Y:[0-9]+]]{{\:}}[[HI_Y:[0-9]+]]{{\]}}, s[8:9] -; GCN-DAG: s_mov_b64 s{{\[}}[[LO_Z:[0-9]+]]{{\:}}[[HI_Z:[0-9]+]]{{\]}}, s[10:11] +; GCN: s_mov_b64 s{{\[}}[[LO_Y:[0-9]+]]{{\:}}[[HI_Y:[0-9]+]]{{\]}}, s[8:9] + +; GCN-DAG: s_mov_b32 [[SAVE_X:s[0-57-9][0-9]*]], s4 +; GCN-DAG: s_mov_b32 [[SAVE_Y:s[0-57-9][0-9]*]], s14 +; GCN-DAG: s_mov_b32 [[SAVE_Z:s[0-68-9][0-9]*]], s15 + +; GCN: s_mov_b64 s{{\[}}[[LO_Z:[0-9]+]]{{\:}}[[HI_Z:[0-9]+]]{{\]}}, s[10:11] ; GCN: s_swappc_b64 Index: test/CodeGen/AMDGPU/callee-special-input-vgprs.ll =================================================================== --- test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -290,7 +290,6 @@ ; GCN: s_mov_b32 s33, s7 ; GCN: s_mov_b32 s32, s33 ; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}} -; GCN: s_mov_b32 s4, s33 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x() #1 { call void @too_many_args_use_workitem_id_x( Index: test/CodeGen/AMDGPU/chain-hi-to-lo.ll =================================================================== --- test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -220,10 +220,10 @@ } ; GCN-LABEL: {{^}}chain_hi_to_lo_private_other_dep: -; GFX900: buffer_load_short_d16_hi v1, v0, s[0:3], s4 offen +; GFX900: buffer_load_short_d16_hi v1, v0, s[0:3], s33 offen ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0] -; GFX900-NEXT: buffer_load_short_d16 v1, v0, s[0:3], s4 offen offset:2 +; GFX900-NEXT: buffer_load_short_d16 v1, v0, s[0:3], s33 offen offset:2 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v0, v1 ; GFX900-NEXT: s_setpc_b64 Index: test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll =================================================================== --- test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll +++ test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -32,20 +32,20 @@ ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: v_writelane_b32 v32, s33, 0 -; GCN-NEXT: v_writelane_b32 v32, s34, 1 -; GCN-NEXT: v_writelane_b32 v32, s35, 2 +; GCN-NEXT: v_writelane_b32 v32, s34, 0 +; GCN-NEXT: v_writelane_b32 v32, s35, 1 +; GCN-NEXT: v_writelane_b32 v32, s36, 2 ; GCN-NEXT: s_getpc_b64 s[6:7] ; GCN-NEXT: s_add_u32 s6, s6, func_v2f32@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s7, s7, func_v2f32@rel32@hi+4 ; GCN-NEXT: s_mov_b64 s[34:35], s[30:31] -; GCN-NEXT: s_mov_b32 s33, s5 +; GCN-NEXT: s_mov_b32 s36, s5 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_mov_b32 s5, s36 ; GCN-NEXT: s_mov_b64 s[30:31], s[34:35] -; GCN-NEXT: v_readlane_b32 s35, v32, 2 -; GCN-NEXT: s_mov_b32 s5, s33 -; GCN-NEXT: v_readlane_b32 s34, v32, 1 -; GCN-NEXT: v_readlane_b32 s33, v32, 0 +; GCN-NEXT: v_readlane_b32 s36, v32, 2 +; GCN-NEXT: v_readlane_b32 s35, v32, 1 +; GCN-NEXT: v_readlane_b32 s34, v32, 0 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] @@ -70,20 +70,20 @@ ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: v_writelane_b32 v32, s33, 0 -; GCN-NEXT: v_writelane_b32 v32, s34, 1 -; GCN-NEXT: v_writelane_b32 v32, s35, 2 +; GCN-NEXT: v_writelane_b32 v32, s34, 0 +; GCN-NEXT: v_writelane_b32 v32, s35, 1 +; GCN-NEXT: v_writelane_b32 v32, s36, 2 ; GCN-NEXT: s_getpc_b64 s[6:7] ; GCN-NEXT: s_add_u32 s6, s6, func_v3f32@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s7, s7, func_v3f32@rel32@hi+4 ; GCN-NEXT: s_mov_b64 s[34:35], s[30:31] -; GCN-NEXT: s_mov_b32 s33, s5 +; GCN-NEXT: s_mov_b32 s36, s5 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_mov_b32 s5, s36 ; GCN-NEXT: s_mov_b64 s[30:31], s[34:35] -; GCN-NEXT: v_readlane_b32 s35, v32, 2 -; GCN-NEXT: s_mov_b32 s5, s33 -; GCN-NEXT: v_readlane_b32 s34, v32, 1 -; GCN-NEXT: v_readlane_b32 s33, v32, 0 +; GCN-NEXT: v_readlane_b32 s36, v32, 2 +; GCN-NEXT: v_readlane_b32 s35, v32, 1 +; GCN-NEXT: v_readlane_b32 s34, v32, 0 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] @@ -108,20 +108,20 @@ ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: v_writelane_b32 v32, s33, 0 -; GCN-NEXT: v_writelane_b32 v32, s34, 1 -; GCN-NEXT: v_writelane_b32 v32, s35, 2 +; GCN-NEXT: v_writelane_b32 v32, s34, 0 +; GCN-NEXT: v_writelane_b32 v32, s35, 1 +; GCN-NEXT: v_writelane_b32 v32, s36, 2 ; GCN-NEXT: s_getpc_b64 s[6:7] ; GCN-NEXT: s_add_u32 s6, s6, func_v4f16@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s7, s7, func_v4f16@rel32@hi+4 ; GCN-NEXT: s_mov_b64 s[34:35], s[30:31] -; GCN-NEXT: s_mov_b32 s33, s5 +; GCN-NEXT: s_mov_b32 s36, s5 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: s_mov_b32 s5, s36 ; GCN-NEXT: s_mov_b64 s[30:31], s[34:35] -; GCN-NEXT: v_readlane_b32 s35, v32, 2 -; GCN-NEXT: s_mov_b32 s5, s33 -; GCN-NEXT: v_readlane_b32 s34, v32, 1 -; GCN-NEXT: v_readlane_b32 s33, v32, 0 +; GCN-NEXT: v_readlane_b32 s36, v32, 2 +; GCN-NEXT: v_readlane_b32 s35, v32, 1 +; GCN-NEXT: v_readlane_b32 s34, v32, 0 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] @@ -146,21 +146,21 @@ ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: v_writelane_b32 v32, s33, 0 -; GCN-NEXT: v_writelane_b32 v32, s34, 1 -; GCN-NEXT: v_writelane_b32 v32, s35, 2 +; GCN-NEXT: v_writelane_b32 v32, s34, 0 +; GCN-NEXT: v_writelane_b32 v32, s35, 1 +; GCN-NEXT: v_writelane_b32 v32, s36, 2 ; GCN-NEXT: s_getpc_b64 s[6:7] ; GCN-NEXT: s_add_u32 s6, s6, func_struct@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s7, s7, func_struct@rel32@hi+4 ; GCN-NEXT: s_mov_b64 s[34:35], s[30:31] -; GCN-NEXT: s_mov_b32 s33, s5 +; GCN-NEXT: s_mov_b32 s36, s5 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GCN-NEXT: s_mov_b64 s[30:31], s[34:35] -; GCN-NEXT: v_readlane_b32 s35, v32, 2 -; GCN-NEXT: s_mov_b32 s5, s33 -; GCN-NEXT: v_readlane_b32 s34, v32, 1 +; GCN-NEXT: s_mov_b32 s5, s36 ; GCN-NEXT: v_mov_b32_e32 v1, v4 -; GCN-NEXT: v_readlane_b32 s33, v32, 0 +; GCN-NEXT: s_mov_b64 s[30:31], s[34:35] +; GCN-NEXT: v_readlane_b32 s36, v32, 2 +; GCN-NEXT: v_readlane_b32 s35, v32, 1 +; GCN-NEXT: v_readlane_b32 s34, v32, 0 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] Index: test/CodeGen/AMDGPU/frame-index-elimination.ll =================================================================== --- test/CodeGen/AMDGPU/frame-index-elimination.ll +++ test/CodeGen/AMDGPU/frame-index-elimination.ll @@ -7,10 +7,10 @@ ; Materialize into a mov. Make sure there isn't an unnecessary copy. ; GCN-LABEL: {{^}}func_mov_fi_i32: ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN: s_sub_u32 s6, s32, s4 +; GCN: s_sub_u32 [[SUB:s[0-9]+]], s32, s33 -; CI-NEXT: v_lshr_b32_e64 v0, s6, 6 -; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s6 +; CI-NEXT: v_lshr_b32_e64 v0, [[SUB]], 6 +; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, [[SUB]] ; GCN-NOT: v_mov ; GCN: ds_write_b32 v0, v0 @@ -24,22 +24,22 @@ ; GCN-LABEL: {{^}}func_mov_fi_i32_offset: ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI: s_sub_u32 s6, s32, s4 -; CI-NEXT: v_lshr_b32_e64 v0, s6, 6 +; CI: s_sub_u32 [[SUB:s[0-9]+]], s32, s33 +; CI-NEXT: v_lshr_b32_e64 v0, [[SUB]], 6 -; CI: s_sub_u32 s6, s32, s4 -; CI-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6 +; CI: s_sub_u32 [[SUB:s[0-9]+]], s32, s33 +; CI-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[SUB]], 6 ; CI-NEXT: v_add_i32_e64 v1, s[6:7], 4, [[SCALED]] ; CI-NOT: v_mov ; CI: ds_write_b32 v0, v0 ; CI-NEXT: ds_write_b32 v0, v1 -; GFX9: s_sub_u32 s6, s32, s4 -; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s6 +; GFX9: s_sub_u32 [[SUB:s[0-9]+]], s32, s33 +; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, [[SUB]] ; GFX9-DAG: ds_write_b32 v0, v0 -; GFX9-DAG: s_sub_u32 s6, s32, s4 -; GFX9-NEXT: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s6 +; GFX9-DAG: s_sub_u32 [[SUB:s[0-9]+]], s32, s33 +; GFX9-NEXT: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, [[SUB]] ; GFX9-NEXT: v_add_u32_e32 v0, 4, [[SCALED]] ; GFX9-NEXT: ds_write_b32 v0, v0 define void @func_mov_fi_i32_offset() #0 { @@ -55,12 +55,12 @@ ; GCN-LABEL: {{^}}func_add_constant_to_fi_i32: ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN: s_sub_u32 s6, s32, s4 +; GCN: s_sub_u32 [[SUB:s[0-9]+]], s32, s33 -; CI-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6 +; CI-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[SUB]], 6 ; CI-NEXT: v_add_i32_e32 v0, vcc, 4, [[SCALED]] -; GFX9-NEXT: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s6 +; GFX9-NEXT: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, [[SUB]] ; GFX9-NEXT: v_add_u32_e32 v0, 4, [[SCALED]] @@ -77,11 +77,11 @@ ; into. ; GCN-LABEL: {{^}}func_other_fi_user_i32: -; GCN: s_sub_u32 s6, s32, s4 +; GCN: s_sub_u32 [[SUB:s[0-9]+]], s32, s33 -; CI-NEXT: v_lshr_b32_e64 v0, s6, 6 +; CI-NEXT: v_lshr_b32_e64 v0, [[SUB]], 6 -; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s6 +; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, [[SUB]] ; GCN-NEXT: v_mul_u32_u24_e32 v0, 9, v0 ; GCN-NOT: v_mov @@ -96,7 +96,7 @@ ; GCN-LABEL: {{^}}func_store_private_arg_i32_ptr: ; GCN: v_mov_b32_e32 v1, 15{{$}} -; GCN: buffer_store_dword v1, v0, s[0:3], s4 offen{{$}} +; GCN: buffer_store_dword v1, v0, s[0:3], s33 offen{{$}} define void @func_store_private_arg_i32_ptr(i32 addrspace(5)* %ptr) #0 { store volatile i32 15, i32 addrspace(5)* %ptr ret void @@ -104,7 +104,7 @@ ; GCN-LABEL: {{^}}func_load_private_arg_i32_ptr: ; GCN: s_waitcnt -; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], s4 offen{{$}} +; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], s33 offen{{$}} define void @func_load_private_arg_i32_ptr(i32 addrspace(5)* %ptr) #0 { %val = load volatile i32, i32 addrspace(5)* %ptr ret void @@ -112,7 +112,7 @@ ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr: ; GCN: s_waitcnt -; GCN-NEXT: s_sub_u32 [[SUB_OFFSET:s[0-9]+]], s32, s4 +; GCN-NEXT: s_sub_u32 [[SUB_OFFSET:s[0-9]+]], s32, s33 ; CI-NEXT: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], [[SUB_OFFSET]], 6 ; CI-NEXT: v_or_b32_e32 v0, 4, [[SHIFT]] @@ -148,7 +148,7 @@ ; FrameIndex is hidden behind a CopyFromReg in the second block. ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_nonentry_block: -; GCN: s_sub_u32 [[SUB_OFFSET:s[0-9]+]], s32, s4 +; GCN: s_sub_u32 [[SUB_OFFSET:s[0-9]+]], s32, s33 ; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], [[SUB_OFFSET]], 6 @@ -157,10 +157,10 @@ ; GCN: s_and_saveexec_b64 ; CI: v_add_i32_e32 v0, vcc, 4, [[SHIFT]] -; CI: buffer_load_dword v1, v1, s[0:3], s4 offen offset:4{{$}} +; CI: buffer_load_dword v1, v1, s[0:3], s33 offen offset:4{{$}} ; GFX9: v_add_u32_e32 v0, 4, [[SHIFT]] -; GFX9: buffer_load_dword v1, v{{[0-9]+}}, s[0:3], s4 offen offset:4{{$}} +; GFX9: buffer_load_dword v1, v{{[0-9]+}}, s[0:3], s33 offen offset:4{{$}} ; GCN: ds_write_b32 define void @void_func_byval_struct_i8_i32_ptr_nonentry_block({ i8, i32 } addrspace(5)* byval %arg0, i32 %arg2) #0 { @@ -180,14 +180,14 @@ ; Added offset can't be used with VOP3 add ; GCN-LABEL: {{^}}func_other_fi_user_non_inline_imm_offset_i32: -; GCN: s_sub_u32 s6, s32, s4 -; GCN-DAG: s_movk_i32 s6, 0x200 +; GCN: s_sub_u32 [[SUB:s[0-9]+]], s32, s33 +; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x200 -; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6 -; CI: v_add_i32_e64 [[VZ:v[0-9]+]], s[6:7], s6, [[SCALED]] +; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[SUB]], 6 +; CI: v_add_i32_e64 [[VZ:v[0-9]+]], s[6:7], [[K]], [[SCALED]] -; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s6 -; GFX9: v_add_u32_e32 [[VZ:v[0-9]+]], s6, [[SCALED]] +; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, [[SUB]] +; GFX9: v_add_u32_e32 [[VZ:v[0-9]+]], [[K]], [[SCALED]] ; GCN: v_mul_u32_u24_e32 [[VZ]], 9, [[VZ]] ; GCN: ds_write_b32 v0, [[VZ]] @@ -204,7 +204,7 @@ } ; GCN-LABEL: {{^}}func_other_fi_user_non_inline_imm_offset_i32_vcc_live: -; GCN: s_sub_u32 [[DIFF:s[0-9]+]], s32, s4 +; GCN: s_sub_u32 [[DIFF:s[0-9]+]], s32, s33 ; GCN-DAG: s_movk_i32 [[OFFSET:s[0-9]+]], 0x200 ; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[DIFF]], 6 @@ -261,7 +261,7 @@ ; GCN-LABEL: {{^}}alloca_ptr_nonentry_block: ; GCN: s_and_saveexec_b64 ; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4 -; GCN: s_sub_u32 [[SUB_OFFSET:s[0-9]+]], s32, s4 +; GCN: s_sub_u32 [[SUB_OFFSET:s[0-9]+]], s32, s33 ; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], [[SUB_OFFSET]], 6 ; CI-NEXT: v_or_b32_e32 [[PTR:v[0-9]+]], 4, [[SHIFT]] Index: test/CodeGen/AMDGPU/function-returns.ll =================================================================== --- test/CodeGen/AMDGPU/function-returns.ll +++ test/CodeGen/AMDGPU/function-returns.ll @@ -389,8 +389,8 @@ ; GCN-LABEL: {{^}}void_func_sret_struct_i8_i32: ; GCN: buffer_load_ubyte [[VAL0:v[0-9]+]] ; GCN: buffer_load_dword [[VAL1:v[0-9]+]] -; GCN: buffer_store_byte [[VAL0]], v0, s[0:3], s4 offen{{$}} -; GCN: buffer_store_dword [[VAL1]], v0, s[0:3], s4 offen offset:4{{$}} +; GCN: buffer_store_byte [[VAL0]], v0, s[0:3], s33 offen{{$}} +; GCN: buffer_store_dword [[VAL1]], v0, s[0:3], s33 offen offset:4{{$}} define void @void_func_sret_struct_i8_i32({ i8, i32 } addrspace(5)* sret %arg0) #0 { %val0 = load volatile i8, i8 addrspace(1)* undef %val1 = load volatile i32, i32 addrspace(1)* undef @@ -406,39 +406,39 @@ ; AssertZext inserted. Not using it introduces the spills. ; GCN-LABEL: {{^}}v33i32_func_void: -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:8{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:12{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:16{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:20{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:24{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:28{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:32{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:36{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:40{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:44{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:48{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:52{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:56{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:60{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:64{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:68{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:72{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:76{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:80{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:84{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:88{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:92{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:96{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:100{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:104{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:108{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:112{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:116{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:120{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:124{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:128{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:4{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:8{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:12{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:16{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:20{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:24{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:28{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:32{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:36{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:40{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:44{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:48{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:52{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:56{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:60{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:64{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:68{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:72{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:76{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:80{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:84{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:88{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:92{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:96{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:100{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:104{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:108{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:112{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:116{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:120{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:124{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:128{{$}} ; GFX9: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 define <33 x i32> @v33i32_func_void() #0 { @@ -448,39 +448,39 @@ } ; GCN-LABEL: {{^}}struct_v32i32_i32_func_void: -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:8{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:12{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:16{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:20{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:24{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:28{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:32{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:36{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:40{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:44{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:48{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:52{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:56{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:60{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:64{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:68{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:72{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:76{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:80{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:84{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:88{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:92{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:96{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:100{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:104{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:108{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:112{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:116{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:120{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:124{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:128{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:4{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:8{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:12{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:16{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:20{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:24{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:28{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:32{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:36{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:40{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:44{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:48{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:52{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:56{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:60{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:64{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:68{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:72{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:76{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:80{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:84{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:88{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:92{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:96{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:100{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:104{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:108{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:112{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:116{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:120{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:124{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:128{{$}} ; GFX9: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 { @@ -490,39 +490,39 @@ } ; GCN-LABEL: {{^}}struct_i32_v32i32_func_void: -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:128{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:132{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:136{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:140{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:144{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:148{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:152{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:156{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:160{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:164{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:168{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:172{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:176{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:180{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:184{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:188{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:192{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:196{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:200{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:204{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:208{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:212{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:216{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:220{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:224{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:228{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:232{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:236{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:240{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:244{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:248{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:252{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:128{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:132{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:136{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:140{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:144{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:148{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:152{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:156{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:160{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:164{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:168{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:172{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:176{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:180{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:184{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:188{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:192{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:196{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:200{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:204{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:208{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:212{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:216{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:220{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:224{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:228{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:232{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:236{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:240{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:244{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:248{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:252{{$}} ; GFX9: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 { Index: test/CodeGen/AMDGPU/known-never-snan.ll =================================================================== --- test/CodeGen/AMDGPU/known-never-snan.ll +++ test/CodeGen/AMDGPU/known-never-snan.ll @@ -67,8 +67,8 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_brev_b32 s6, -2 -; GCN-NEXT: v_bfi_b32 v0, s6, v0, v1 +; GCN-NEXT: s_brev_b32 s4, -2 +; GCN-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] %a.nnan.add = fdiv nnan float 1.0, %a @@ -455,9 +455,9 @@ ; GCN-LABEL: v_test_known_not_snan_round_input_fmed3_r_i_i_f32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_brev_b32 s6, -2 +; GCN-NEXT: s_brev_b32 s4, -2 ; GCN-NEXT: v_trunc_f32_e32 v2, v0 -; GCN-NEXT: v_bfi_b32 v1, s6, 1.0, v0 +; GCN-NEXT: v_bfi_b32 v1, s4, 1.0, v0 ; GCN-NEXT: v_sub_f32_e32 v0, v0, v2 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, 0.5 ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc Index: test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll @@ -217,9 +217,10 @@ } ; GCN-LABEL: {{^}}kernel_call_kernarg_implicitarg_ptr_func: +; GCN: s_add_u32 s8, s4, 0x70 +; GCN: s_addc_u32 s9, s5, 0 + ; GCN: s_mov_b64 s[6:7], s[4:5] -; GCN: s_add_u32 s8, s6, 0x70 -; GCN: s_addc_u32 s9, s7, 0 ; GCN: s_swappc_b64 define amdgpu_kernel void @kernel_call_kernarg_implicitarg_ptr_func([112 x i8]) #0 { call void @func_kernarg_implicitarg_ptr() Index: test/CodeGen/AMDGPU/load-hi16.ll =================================================================== --- test/CodeGen/AMDGPU/load-hi16.ll +++ test/CodeGen/AMDGPU/load-hi16.ll @@ -532,13 +532,13 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff: ; GCN: s_waitcnt -; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s4 offset:4094{{$}} +; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s33 offset:4094{{$}} ; GFX900: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}} +; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s33 offset:4094{{$}} define void @load_private_hi_v2i16_reglo_vreg_nooff(i16 addrspace(5)* byval %in, i16 %reg) #0 { entry: %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*) @@ -550,13 +550,13 @@ ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], s4 offset:4094{{$}} +; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], s33 offset:4094{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}} +; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s33 offset:4094{{$}} define void @load_private_hi_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, half %reg) #0 { entry: %load = load volatile half, half addrspace(5)* inttoptr (i32 4094 to half addrspace(5)*) @@ -650,13 +650,13 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_zexti8: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], s4 offset:4094{{$}} +; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], s33 offset:4094{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], s4 offset:4094{{$}} +; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], s33 offset:4094{{$}} define void @load_private_hi_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i16 %reg) #0 { entry: %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*) @@ -669,13 +669,13 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_sexti8: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], s4 offset:4094{{$}} +; GFX900-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], s33 offset:4094{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_sbyte v0, off, s[0:3], s4 offset:4094{{$}} +; NO-D16-HI: buffer_load_sbyte v0, off, s[0:3], s33 offset:4094{{$}} define void @load_private_hi_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, i16 %reg) #0 { entry: %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*) @@ -688,13 +688,13 @@ ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff_zexti8: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], s4 offset:4094{{$}} +; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], s33 offset:4094{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], s4 offset:4094{{$}} +; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], s33 offset:4094{{$}} define void @load_private_hi_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, half %reg) #0 { entry: %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*) Index: test/CodeGen/AMDGPU/load-lo16.ll =================================================================== --- test/CodeGen/AMDGPU/load-lo16.ll +++ test/CodeGen/AMDGPU/load-lo16.ll @@ -650,13 +650,13 @@ ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_nooff: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], s4 offset:4094{{$}} +; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], s33 offset:4094{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}} +; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s33 offset:4094{{$}} define void @load_private_lo_v2i16_reglo_vreg_nooff(i16 addrspace(5)* %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -668,13 +668,13 @@ ; GCN-LABEL: {{^}}load_private_lo_v2i16_reghi_vreg_nooff: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], s4 offset:4094{{$}} +; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], s33 offset:4094{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}} +; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s33 offset:4094{{$}} define void @load_private_lo_v2i16_reghi_vreg_nooff(i16 addrspace(5)* %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -686,13 +686,13 @@ ; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg_nooff: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], s4 offset:4094{{$}} +; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], s33 offset:4094{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}} +; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s33 offset:4094{{$}} define void @load_private_lo_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x half> @@ -744,13 +744,13 @@ ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_nooff_zexti8: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], s4 offset:4094{{$}} +; GFX900-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], s33 offset:4094{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], s4 offset:4094{{$}} +; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], s33 offset:4094{{$}} define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -763,13 +763,13 @@ ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_nooff_sexti8: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_load_sbyte_d16 v1, off, s[0:3], s4 offset:4094{{$}} +; GFX900-NEXT: buffer_load_sbyte_d16 v1, off, s[0:3], s33 offset:4094{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_sbyte v0, off, s[0:3], s4 offset:4094{{$}} +; NO-D16-HI: buffer_load_sbyte v0, off, s[0:3], s33 offset:4094{{$}} define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -782,13 +782,13 @@ ; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg_nooff_zexti8: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], s4 offset:4094{{$}} +; GFX900-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], s33 offset:4094{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], s4 offset:4094{{$}} +; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], s33 offset:4094{{$}} define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x half> Index: test/CodeGen/AMDGPU/mubuf-legalize-operands.ll =================================================================== --- test/CodeGen/AMDGPU/mubuf-legalize-operands.ll +++ test/CodeGen/AMDGPU/mubuf-legalize-operands.ll @@ -124,7 +124,7 @@ ; CHECK-O0-LABEL: mubuf_vgpr_outside_entry ; CHECK-O0-DAG: s_mov_b32 [[IDX_S:s[0-9]+]], s4 -; CHECK-O0-DAG: v_mov_b32_e32 [[IDX_V:v[0-9]+]], [[IDX_S]] +; CHECK-O0-DAG: v_mov_b32_e32 [[IDX_V:v[0-9]+]], s4 ; CHECK-O0-DAG: s_mov_b64 s{{\[}}[[SAVEEXEC0:[0-9]+]]:[[SAVEEXEC1:[0-9]+]]{{\]}}, exec ; CHECK-O0-DAG: buffer_store_dword [[IDX_V]], off, s[0:3], s32 offset:[[IDX_OFF:[0-9]+]] ; 4-byte Folded Spill ; CHECK-O0-DAG: v_writelane_b32 [[VSAVEEXEC:v[0-9]+]], s[[SAVEEXEC0]], [[SAVEEXEC_IDX0:[0-9]+]] Index: test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll =================================================================== --- test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll +++ test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll @@ -249,7 +249,7 @@ ; GCN-DAG: v_and_b32_e32 v1, [[U23_MASK]], v1 ; GCN-DAG: v_mul_u32_u24_e32 v0, 0xea, v0 ; GCN-DAG: v_mul_u32_u24_e32 v1, 0x39b, v1 -; GCN: v_and_b32_e32 v1, s6, v1 +; GCN: v_and_b32_e32 v1, s4, v1 ; GCN: v_and_b32_e32 v0, 0x7ffffe, v0 ; GCN: v_mul_u32_u24_e32 v0, v0, v1 ; GCN: v_and_b32_e32 v0, 0x1fffe, v0 Index: test/CodeGen/AMDGPU/nested-calls.ll =================================================================== --- test/CodeGen/AMDGPU/nested-calls.ll +++ test/CodeGen/AMDGPU/nested-calls.ll @@ -16,15 +16,15 @@ ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-DAG: v_writelane_b32 v32, s33, 0 -; GCN-DAG: v_writelane_b32 v32, s34, 1 -; GCN-DAG: v_writelane_b32 v32, s35, 2 +; GCN-DAG: v_writelane_b32 v32, s34, 0 +; GCN-DAG: v_writelane_b32 v32, s35, 1 +; GCN-DAG: v_writelane_b32 v32, s36, 2 ; GCN: s_swappc_b64 -; GCN: v_readlane_b32 s35, v32, 2 -; GCN: v_readlane_b32 s34, v32, 1 -; GCN: v_readlane_b32 s33, v32, 0 +; GCN: v_readlane_b32 s36, v32, 2 +; GCN: v_readlane_b32 s35, v32, 1 +; GCN: v_readlane_b32 s34, v32, 0 ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] Index: test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir =================================================================== --- test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir +++ test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir @@ -17,7 +17,7 @@ machineFunctionInfo: isEntryFunction: true scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr5 + scratchWaveOffsetReg: $sgpr33 frameOffsetReg: $sgpr5 stackPtrOffsetReg: $sgpr32 @@ -25,9 +25,9 @@ ; CHECK-LABEL: name: scavenge_register_position ; CHECK: bb.0: ; CHECK: successors: %bb.1(0x80000000) - ; CHECK: liveins: $sgpr4, $sgpr0_sgpr1_sgpr2_sgpr3 - ; CHECK: $sgpr6 = S_ADD_U32 $sgpr32, 524288, implicit-def $scc - ; CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, align 8192, addrspace 5) + ; CHECK: liveins: $sgpr33, $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK: $sgpr4 = S_ADD_U32 $sgpr32, 524288, implicit-def $scc + ; CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, align 8192, addrspace 5) ; CHECK: S_BRANCH %bb.1 ; CHECK: bb.1: ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 Index: test/CodeGen/AMDGPU/shl_add_ptr.ll =================================================================== --- test/CodeGen/AMDGPU/shl_add_ptr.ll +++ test/CodeGen/AMDGPU/shl_add_ptr.ll @@ -333,10 +333,10 @@ ; GCN-LABEL: {{^}}shl_add_ptr_combine_2use_private: ; GCN: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 2, v0 -; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], s4 offen offset:16 +; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], s33 offen offset:16 ; GCN: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 3, v0 -; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE1]], s[0:3], s4 offen offset:32 +; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE1]], s[0:3], s33 offen offset:32 define void @shl_add_ptr_combine_2use_private(i16 zeroext %idx.arg) #0 { %idx = zext i16 %idx.arg to i32 %idx.add = add nuw i32 %idx, 4 @@ -352,9 +352,9 @@ ; GCN-LABEL: {{^}}shl_add_ptr_combine_2use_max_private_offset: ; GCN-DAG: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 3, v0 ; GCN-DAG: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 4, v0 -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], s4 offen offset:4088 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], s33 offen offset:4088 ; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD:v[0-9]+]], vcc, 0x1ff0, [[SCALE1]] -; GCN: buffer_store_dword v{{[0-9]+}}, [[ADD]], s[0:3], s4 offen{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, [[ADD]], s[0:3], s33 offen{{$}} define void @shl_add_ptr_combine_2use_max_private_offset(i16 zeroext %idx.arg) #0 { %idx = zext i16 %idx.arg to i32 %idx.add = add nuw i32 %idx, 511 @@ -370,8 +370,8 @@ ; GCN: v_add_{{[iu]}}32_e32 [[ADD:v[0-9]+]], vcc, 0x100, v0 ; GCN-DAG: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 4, [[ADD]] ; GCN-DAG: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 5, [[ADD]] -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], s4 offen{{$}} -; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE1]], s[0:3], s4 offen{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], s33 offen{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE1]], s[0:3], s33 offen{{$}} define void @shl_add_ptr_combine_2use_both_max_private_offset(i16 zeroext %idx.arg) #0 { %idx = zext i16 %idx.arg to i32 %idx.add = add nuw i32 %idx, 256 Index: test/CodeGen/AMDGPU/sibling-call.ll =================================================================== --- test/CodeGen/AMDGPU/sibling-call.ll +++ test/CodeGen/AMDGPU/sibling-call.ll @@ -117,7 +117,7 @@ ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32: ; GCN-NOT: v0 ; GCN-NOT: s32 -; GCN: buffer_load_dword v1, off, s[0:3], s4 offset:16 +; GCN: buffer_load_dword v1, off, s[0:3], s33 offset:16 ; GCN: buffer_store_dword v1, off, s[0:3], s32{{$}} ; GCN-NEXT: s_setpc_b64 define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32(i32 %a, [32 x i32] %large) #1 { @@ -211,8 +211,8 @@ ; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill ; GCN: buffer_store_dword v33, off, s[0:3], s5 ; 4-byte Folded Spill -; GCN-DAG: v_writelane_b32 v34, s33, 0 -; GCN-DAG: v_writelane_b32 v34, s34, 1 +; GCN-DAG: v_writelane_b32 v34, s34, 0 +; GCN-DAG: v_writelane_b32 v34, s35, 1 ; GCN-DAG: s_getpc_b64 ; GCN: s_swappc_b64 @@ -221,8 +221,8 @@ ; GCN: s_add_u32 s6, s6, sibling_call_i32_fastcc_i32_i32@rel32@lo+4 ; GCN: s_addc_u32 s7, s7, sibling_call_i32_fastcc_i32_i32@rel32@hi+4 -; GCN-DAG: v_readlane_b32 s33, v34, 0 -; GCN-DAG: v_readlane_b32 s34, v34, 1 +; GCN-DAG: v_readlane_b32 s34, v34, 0 +; GCN-DAG: v_readlane_b32 s35, v34, 1 ; GCN: buffer_load_dword v33, off, s[0:3], s5 ; 4-byte Folded Reload ; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload Index: test/CodeGen/AMDGPU/spill-offset-calculation.ll =================================================================== --- test/CodeGen/AMDGPU/spill-offset-calculation.ll +++ test/CodeGen/AMDGPU/spill-offset-calculation.ll @@ -220,8 +220,8 @@ %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 ; 0x40000 / 64 = 4096 (for wave64) - ; CHECK: s_add_u32 s6, s32, 0x40000 - ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill + ; CHECK: s_add_u32 s4, s32, 0x40000 + ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill %a = load volatile i32, i32 addrspace(5)* %aptr ; Force %a to spill @@ -272,9 +272,9 @@ %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)* ; 0x3ff00 / 64 = 4092 (for wave64) - ; CHECK: s_add_u32 s6, s32, 0x3ff00 - ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill - ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 offset:4 ; 4-byte Folded Spill + ; CHECK: s_add_u32 s4, s32, 0x3ff00 + ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill + ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 offset:4 ; 4-byte Folded Spill %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1 %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr @@ -293,4 +293,4 @@ attributes #0 = { nounwind } attributes #1 = { nounwind "amdgpu-num-sgpr"="18" "amdgpu-num-vgpr"="8" } -attributes #2 = { nounwind "amdgpu-num-sgpr"="16" "amdgpu-num-vgpr"="8" } +attributes #2 = { nounwind "amdgpu-num-sgpr"="15" "amdgpu-num-vgpr"="8" } Index: test/CodeGen/AMDGPU/stack-realign.ll =================================================================== --- test/CodeGen/AMDGPU/stack-realign.ll +++ test/CodeGen/AMDGPU/stack-realign.ll @@ -9,18 +9,18 @@ ; = 144 bytes with padding between them ; GCN-LABEL: {{^}}needs_align16_default_stack_align: -; GCN: s_sub_u32 [[SUB:s[0-9]+]], s32, s4 +; GCN: s_sub_u32 [[SUB:s[0-9]+]], s32, s33 ; GCN-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, v0 ; GCN-DAG: v_lshrrev_b32_e64 [[FRAMEDIFF:v[0-9]+]], 6, [[SUB]] ; GCN: v_add_u32_e32 [[FI:v[0-9]+]], vcc, [[FRAMEDIFF]], [[SCALED_IDX]] ; GCN-NOT: s32 -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen ; GCN: v_or_b32_e32 v{{[0-9]+}}, 12 -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen ; GCN-NOT: s32 @@ -34,14 +34,14 @@ ; GCN-LABEL: {{^}}needs_align16_stack_align4: ; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0x3c0{{$}} -; GCN: s_and_b32 s5, s6, 0xfffffc00 +; GCN: s_and_b32 s5, [[SCRATCH_REG]], 0xfffffc00 ; GCN: s_add_u32 s32, s32, 0x2800{{$}} -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen ; GCN: v_or_b32_e32 v{{[0-9]+}}, 12 -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen ; GCN: s_sub_u32 s32, s32, 0x2800 @@ -55,14 +55,14 @@ ; GCN-LABEL: {{^}}needs_align32: ; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0x7c0{{$}} -; GCN: s_and_b32 s5, s6, 0xfffff800 +; GCN: s_and_b32 s5, [[SCRATCH_REG]], 0xfffff800 ; GCN: s_add_u32 s32, s32, 0x3000{{$}} -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen ; GCN: v_or_b32_e32 v{{[0-9]+}}, 12 -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen ; GCN: s_sub_u32 s32, s32, 0x3000 @@ -76,10 +76,10 @@ ; GCN-LABEL: {{^}}force_realign4: ; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0xc0{{$}} -; GCN: s_and_b32 s5, s6, 0xffffff00 +; GCN: s_and_b32 s5, [[SCRATCH_REG]], 0xffffff00 ; GCN: s_add_u32 s32, s32, 0xd00{{$}} -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen ; GCN: s_sub_u32 s32, s32, 0xd00 ; GCN: ; ScratchSize: 52 Index: test/CodeGen/AMDGPU/store-hi16.ll =================================================================== --- test/CodeGen/AMDGPU/store-hi16.ll +++ test/CodeGen/AMDGPU/store-hi16.ll @@ -389,10 +389,10 @@ ; GCN-LABEL: {{^}}store_private_hi_v2i16: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], s4 offen{{$}} +; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], s33 offen{{$}} ; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1 -; NO-D16-HI: buffer_store_short v1, v0, s[0:3], s4 offen{{$}} +; NO-D16-HI: buffer_store_short v1, v0, s[0:3], s33 offen{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -408,10 +408,10 @@ ; GCN-LABEL: {{^}}store_private_hi_v2f16: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], s4 offen{{$}} +; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], s33 offen{{$}} ; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1 -; NO-D16-HI: buffer_store_short v1, v0, s[0:3], s4 offen{{$}} +; NO-D16-HI: buffer_store_short v1, v0, s[0:3], s33 offen{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -427,10 +427,10 @@ ; GCN-LABEL: {{^}}store_private_hi_i32_shift: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], s4 offen{{$}} +; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], s33 offen{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; NO-D16-HI-NEXT: buffer_store_short v1, v0, s[0:3], s4 offen{{$}} +; NO-D16-HI-NEXT: buffer_store_short v1, v0, s[0:3], s33 offen{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -445,10 +445,10 @@ ; GCN-LABEL: {{^}}store_private_hi_v2i16_i8: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], s4 offen{{$}} +; GFX900-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], s33 offen{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], s4 offen{{$}} +; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], s33 offen{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -464,10 +464,10 @@ ; GCN-LABEL: {{^}}store_private_hi_i8_shift: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], s4 offen{{$}} +; GFX900-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], s33 offen{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], s4 offen{{$}} +; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], s33 offen{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -502,10 +502,10 @@ ; GCN-LABEL: {{^}}store_private_hi_v2i16_nooff: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s4{{$}} +; GFX900-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s33{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; NO-D16-HI-NEXT: buffer_store_short v0, off, s[0:3], s4{{$}} +; NO-D16-HI-NEXT: buffer_store_short v0, off, s[0:3], s33{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -522,10 +522,10 @@ ; GCN-LABEL: {{^}}store_private_hi_v2i16_i8_nooff: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], s4{{$}} +; GFX900-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], s33{{$}} ; NO-D16-HI: v_lshrrev_b32_e32 v0, 16, v0 -; NO-D16-HI: buffer_store_byte v0, off, s[0:3], s4{{$}} +; NO-D16-HI: buffer_store_byte v0, off, s[0:3], s33{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64