diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -355,14 +355,23 @@ auto const &ST = MF.getSubtarget(); - unsigned ReturnOpc = - IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::S_SETPC_B64_return; + unsigned ReturnOpc = 0; + if (IsShader) + ReturnOpc = AMDGPU::SI_RETURN_TO_EPILOG; + else if (CC == CallingConv::AMDGPU_Gfx) + ReturnOpc = AMDGPU::S_SETPC_B64_return_gfx; + else + ReturnOpc = AMDGPU::S_SETPC_B64_return; auto Ret = B.buildInstrNoInsert(ReturnOpc); Register ReturnAddrVReg; if (ReturnOpc == AMDGPU::S_SETPC_B64_return) { ReturnAddrVReg = MRI.createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass); Ret.addUse(ReturnAddrVReg); + } else if (ReturnOpc == AMDGPU::S_SETPC_B64_return_gfx) { + ReturnAddrVReg = + MRI.createVirtualRegister(&AMDGPU::Gfx_CCR_SGPR_64RegClass); + Ret.addUse(ReturnAddrVReg); } if (!FLI.CanLowerReturn) @@ -370,7 +379,8 @@ else if (!lowerReturnVal(B, Val, VRegs, Ret)) return false; - if (ReturnOpc == AMDGPU::S_SETPC_B64_return) { + if (ReturnOpc == AMDGPU::S_SETPC_B64_return || + ReturnOpc == AMDGPU::S_SETPC_B64_return_gfx) { const SIRegisterInfo *TRI = ST.getRegisterInfo(); Register LiveInReturn = MF.addLiveIn(TRI->getReturnAddressReg(MF), &AMDGPU::SGPR_64RegClass); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td --- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -20,6 +20,8 @@ // 0-3 are reserved for the stack buffer descriptor // 30-31 are reserved for the return address // 32 is reserved for the stack pointer + // 33 is reserved for the frame pointer + // 34 is reserved for the base pointer CCIfInReg>, CCIfType<[i1, i16], CCIfExtend>>, - // 0-3 are reserved for the stack buffer descriptor - // 32 is reserved for the stack pointer - CCIfInReg>>, - CCIfNotInReg; +def 
CSR_AMDGPU_SI_Gfx_SGPRs_4_29 : CalleeSavedRegs< + (sequence "SGPR%u", 4, 29) +>; + +def CSR_AMDGPU_SI_Gfx_SGPRs_64_105 : CalleeSavedRegs< + (sequence "SGPR%u", 64, 105) +>; + // Just to get the regmask, not for calling convention purposes. def CSR_AMDGPU_AllVGPRs : CalleeSavedRegs< (sequence "VGPR%u", 0, 255) @@ -198,6 +197,14 @@ (add CSR_AMDGPU_HighRegs, CSR_AMDGPU_AGPRs_32_255) >; +def CSR_AMDGPU_SI_Gfx : CalleeSavedRegs< + (add CSR_AMDGPU_VGPRs, CSR_AMDGPU_SI_Gfx_SGPRs_4_29, CSR_AMDGPU_SI_Gfx_SGPRs_64_105) +>; + +def CSR_AMDGPU_SI_Gfx_With_AGPRs : CalleeSavedRegs< + (add CSR_AMDGPU_SI_Gfx, CSR_AMDGPU_AGPRs_32_255) +>; + def CSR_AMDGPU_NoRegs : CalleeSavedRegs<(add)>; // Calling convention for leaf functions diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -337,7 +337,7 @@ enum NodeType : unsigned { // AMDIL ISD Opcodes FIRST_NUMBER = ISD::BUILTIN_OP_END, - UMUL, // 32bit unsigned multiplication + UMUL, // 32bit unsigned multiplication BRANCH_COND, // End AMDIL ISD Opcodes @@ -360,6 +360,9 @@ // Return with values from a non-entry function. RET_FLAG, + // Return with values from a non-entry function (AMDGPU_Gfx CC). + RET_GFX_FLAG, + DWORDADDR, FRACT, @@ -416,10 +419,10 @@ DOT4, CARRY, BORROW, - BFE_U32, // Extract range of bits with zero extension to 32-bits. - BFE_I32, // Extract range of bits with sign extension to 32-bits. - BFI, // (src0 & src1) | (~src0 & src2) - BFM, // Insert a range of bits into a 32-bit word. + BFE_U32, // Extract range of bits with zero extension to 32-bits. + BFE_I32, // Extract range of bits with sign extension to 32-bits. + BFI, // (src0 & src1) | (~src0 & src2) + BFM, // Insert a range of bits into a 32-bit word. FFBH_U32, // ctlz with -1 if input is zero. FFBH_I32, FFBL_B32, // cttz with -1 if input is zero. 
@@ -528,7 +531,6 @@ LAST_AMDGPU_ISD_NUMBER }; - } // End namespace AMDGPUISD } // End namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -352,6 +352,10 @@ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic] >; +def AMDGPUret_gfx_flag : SDNode<"AMDGPUISD::RET_GFX_FLAG", SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic] +>; + //===----------------------------------------------------------------------===// // Intrinsic/Custom node compatibility PatFrags diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -120,7 +120,8 @@ // FIXME: Should be able to handle this with emitPseudoExpansionLowering. We // need to select it to the subtarget specific version, and there's no way to // do that with a single pseudo source operation. - if (Opcode == AMDGPU::S_SETPC_B64_return) + if (Opcode == AMDGPU::S_SETPC_B64_return || + Opcode == AMDGPU::S_SETPC_B64_return_gfx) Opcode = AMDGPU::S_SETPC_B64; else if (Opcode == AMDGPU::SI_CALL) { // SI_CALL is just S_SWAPPC_B64 with an additional operand to track the diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2616,9 +2616,12 @@ SDValue ReturnAddrReg = CreateLiveInRegister( DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64); - SDValue ReturnAddrVirtualReg = DAG.getRegister( - MF.getRegInfo().createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass), - MVT::i64); + SDValue ReturnAddrVirtualReg = + DAG.getRegister(MF.getRegInfo().createVirtualRegister( + CallConv != CallingConv::AMDGPU_Gfx + ? 
&AMDGPU::CCR_SGPR_64RegClass + : &AMDGPU::Gfx_CCR_SGPR_64RegClass), + MVT::i64); Chain = DAG.getCopyToReg(Chain, DL, ReturnAddrVirtualReg, ReturnAddrReg, Flag); Flag = Chain.getValue(1); @@ -2681,8 +2684,15 @@ RetOps.push_back(Flag); unsigned Opc = AMDGPUISD::ENDPGM; - if (!IsWaveEnd) - Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG; + if (!IsWaveEnd) { + if (IsShader) + Opc = AMDGPUISD::RETURN_TO_EPILOG; + else if (CallConv == CallingConv::AMDGPU_Gfx) + Opc = AMDGPUISD::RET_GFX_FLAG; + else + Opc = AMDGPUISD::RET_FLAG; + } + return DAG.getNode(Opc, DL, MVT::Other, RetOps); } diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -963,6 +963,7 @@ // with knowledge of the called routines. if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || MI.getOpcode() == AMDGPU::S_SETPC_B64_return || + MI.getOpcode() == AMDGPU::S_SETPC_B64_return_gfx || (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) { Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt())); } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -348,10 +348,13 @@ case CallingConv::C: case CallingConv::Fast: case CallingConv::Cold: - case CallingConv::AMDGPU_Gfx: return MF->getSubtarget().hasGFX90AInsts() ? CSR_AMDGPU_HighRegs_With_AGPRs_SaveList : CSR_AMDGPU_HighRegs_SaveList; + case CallingConv::AMDGPU_Gfx: + return MF->getSubtarget().hasGFX90AInsts() + ? CSR_AMDGPU_SI_Gfx_With_AGPRs_SaveList + : CSR_AMDGPU_SI_Gfx_SaveList; default: { // Dummy to not crash RegisterClassInfo. 
static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister; @@ -371,10 +374,13 @@ case CallingConv::C: case CallingConv::Fast: case CallingConv::Cold: - case CallingConv::AMDGPU_Gfx: return MF.getSubtarget().hasGFX90AInsts() ? CSR_AMDGPU_HighRegs_With_AGPRs_RegMask : CSR_AMDGPU_HighRegs_RegMask; + case CallingConv::AMDGPU_Gfx: + return MF.getSubtarget().hasGFX90AInsts() + ? CSR_AMDGPU_SI_Gfx_With_AGPRs_RegMask + : CSR_AMDGPU_SI_Gfx_RegMask; default: return nullptr; } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -691,6 +691,14 @@ let AllocationPriority = SGPR_64.AllocationPriority; } +// Call clobbered 64-bit SGPRs for AMDGPU_Gfx CC +def Gfx_CCR_SGPR_64 : RegisterClass<"AMDGPU", SGPR_64.RegTypes, 32, + (add (trunc (shl SGPR_64, 15), 1), // s[30:31] + (trunc (shl SGPR_64, 18), 14))> { // s[36:37]-s[62:63] + let CopyCost = SGPR_64.CopyCost; + let AllocationPriority = SGPR_64.AllocationPriority; +} + def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, (add TTMP_64Regs)> { let isAllocatable = 0; diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -265,6 +265,7 @@ let isReturn = 1 in { // Define variant marked as return rather than branch. 
def S_SETPC_B64_return : SOP1_1<"", CCR_SGPR_64, [(AMDGPUret_flag i64:$src0)]>; +def S_SETPC_B64_return_gfx : SOP1_1<"", Gfx_CCR_SGPR_64, [(AMDGPUret_gfx_flag i64:$src0)]>; } } // End isTerminator = 1, isBarrier = 1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll @@ -11,16 +11,17 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_void() #0 { ; CHECK-LABEL: name: test_gfx_call_external_void_func_void ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $sgpr30_sgpr31 - ; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $scc - ; CHECK: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_void_func_void - ; CHECK: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 - ; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>) - ; CHECK: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_void_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $scc - ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] - ; CHECK: S_SETPC_B64_return [[COPY2]] + ; CHECK-NEXT: liveins: $sgpr30_sgpr31 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; CHECK-NEXT: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_void_func_void + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>) + ; CHECK-NEXT: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_void_func_void, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gfx_ccr_sgpr_64 = 
COPY [[COPY]] + ; CHECK-NEXT: S_SETPC_B64_return_gfx [[COPY2]] call amdgpu_gfx void @external_gfx_void_func_void() ret void } @@ -28,19 +29,20 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm(i32) #0 { ; CHECK-LABEL: name: test_gfx_call_external_void_func_i32_imm ; CHECK: bb.1 (%ir-block.1): - ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 - ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 - ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $scc - ; CHECK: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_void_func_i32 - ; CHECK: $vgpr0 = COPY [[C]](s32) - ; CHECK: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 - ; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]](<4 x s32>) - ; CHECK: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_void_func_i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $scc - ; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] - ; CHECK: S_SETPC_B64_return [[COPY3]] + ; CHECK-NEXT: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; CHECK-NEXT: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_void_func_i32 + ; CHECK-NEXT: $vgpr0 = COPY [[C]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]](<4 x s32>) + ; CHECK-NEXT: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_void_func_i32, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK-NEXT: 
S_SETPC_B64_return_gfx [[COPY3]] call amdgpu_gfx void @external_gfx_void_func_i32(i32 42) ret void } @@ -48,19 +50,20 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm_inreg(i32 inreg) #0 { ; CHECK-LABEL: name: test_gfx_call_external_void_func_i32_imm_inreg ; CHECK: bb.1 (%ir-block.1): - ; CHECK: liveins: $sgpr4, $sgpr30_sgpr31 - ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr4 - ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 - ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $scc - ; CHECK: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_void_func_i32_inreg - ; CHECK: $sgpr4 = COPY [[C]](s32) - ; CHECK: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 - ; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]](<4 x s32>) - ; CHECK: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_highregs, implicit $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $scc - ; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] - ; CHECK: S_SETPC_B64_return [[COPY3]] + ; CHECK-NEXT: liveins: $sgpr4, $sgpr30_sgpr31 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; CHECK-NEXT: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_void_func_i32_inreg + ; CHECK-NEXT: $sgpr4 = COPY [[C]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]](<4 x s32>) + ; CHECK-NEXT: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY1]] + ; 
CHECK-NEXT: S_SETPC_B64_return_gfx [[COPY3]] call amdgpu_gfx void @external_gfx_void_func_i32_inreg(i32 inreg 42) ret void } @@ -68,26 +71,27 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32() #0 { ; CHECK-LABEL: name: test_gfx_call_external_void_func_struct_i8_i32 ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $sgpr30_sgpr31 - ; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (load (p1) from `{ i8, i32 } addrspace(1)* addrspace(4)* undef`, addrspace 4) - ; CHECK: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[LOAD]](p1) :: (load (s8) from %ir.ptr0, align 4, addrspace 1) - ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[LOAD]], [[C]](s64) - ; CHECK: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32) from %ir.ptr0 + 4, addrspace 1) - ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $scc - ; CHECK: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_void_func_struct_i8_i32 - ; CHECK: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[LOAD1]](s8) - ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16) - ; CHECK: $vgpr0 = COPY [[ANYEXT1]](s32) - ; CHECK: $vgpr1 = COPY [[LOAD2]](s32) - ; CHECK: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 - ; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>) - ; CHECK: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $scc - ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] - ; CHECK: S_SETPC_B64_return [[COPY2]] + ; CHECK-NEXT: liveins: $sgpr30_sgpr31 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: 
(load (p1) from `{ i8, i32 } addrspace(1)* addrspace(4)* undef`, addrspace 4) + ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[LOAD]](p1) :: (load (s8) from %ir.ptr0, align 4, addrspace 1) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[LOAD]], [[C]](s64) + ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32) from %ir.ptr0 + 4, addrspace 1) + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; CHECK-NEXT: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_void_func_struct_i8_i32 + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[LOAD1]](s8) + ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16) + ; CHECK-NEXT: $vgpr0 = COPY [[ANYEXT1]](s32) + ; CHECK-NEXT: $vgpr1 = COPY [[LOAD2]](s32) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>) + ; CHECK-NEXT: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY]] + ; CHECK-NEXT: S_SETPC_B64_return_gfx [[COPY2]] %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef %val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0 call amdgpu_gfx void @external_gfx_void_func_struct_i8_i32({ i8, i32 } %val) @@ -97,26 +101,27 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32_inreg() #0 { ; CHECK-LABEL: name: test_gfx_call_external_void_func_struct_i8_i32_inreg ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $sgpr30_sgpr31 - ; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF - ; CHECK: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (load (p1) from `{ i8, i32 } addrspace(1)* 
addrspace(4)* undef`, addrspace 4) - ; CHECK: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[LOAD]](p1) :: (load (s8) from %ir.ptr0, align 4, addrspace 1) - ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[LOAD]], [[C]](s64) - ; CHECK: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32) from %ir.ptr0 + 4, addrspace 1) - ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $scc - ; CHECK: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_void_func_struct_i8_i32_inreg - ; CHECK: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[LOAD1]](s8) - ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16) - ; CHECK: $sgpr4 = COPY [[ANYEXT1]](s32) - ; CHECK: $sgpr5 = COPY [[LOAD2]](s32) - ; CHECK: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 - ; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>) - ; CHECK: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_highregs, implicit $sgpr4, implicit $sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $scc - ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] - ; CHECK: S_SETPC_B64_return [[COPY2]] + ; CHECK-NEXT: liveins: $sgpr30_sgpr31 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (load (p1) from `{ i8, i32 } addrspace(1)* addrspace(4)* undef`, addrspace 4) + ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[LOAD]](p1) :: (load (s8) from %ir.ptr0, align 4, addrspace 1) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[LOAD]], [[C]](s64) + ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32) from %ir.ptr0 + 4, addrspace 1) + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; CHECK-NEXT: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE 
@external_gfx_void_func_struct_i8_i32_inreg + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[LOAD1]](s8) + ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16) + ; CHECK-NEXT: $sgpr4 = COPY [[ANYEXT1]](s32) + ; CHECK-NEXT: $sgpr5 = COPY [[LOAD2]](s32) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>) + ; CHECK-NEXT: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY]] + ; CHECK-NEXT: S_SETPC_B64_return_gfx [[COPY2]] %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef %val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0 call amdgpu_gfx void @external_gfx_void_func_struct_i8_i32_inreg({ i8, i32 } inreg %val) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll @@ -141,12 +141,12 @@ ; GCN-NEXT: $vgpr0 = COPY [[C]](s32) ; GCN-NEXT: [[COPY3:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY3]](<4 x s32>) - ; GCN-NEXT: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_i32_func_i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0 + ; GCN-NEXT: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_i32_func_i32, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0 ; GCN-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE 
[[COPY4]](s32), [[MV]](p1) :: (volatile store (s32) into %ir.out, addrspace 1) - ; GCN-NEXT: [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]] - ; GCN-NEXT: S_SETPC_B64_return [[COPY5]] + ; GCN-NEXT: [[COPY5:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY2]] + ; GCN-NEXT: S_SETPC_B64_return_gfx [[COPY5]] %val = call amdgpu_gfx i32 @external_gfx_i32_func_i32(i32 42) store volatile i32 %val, i32 addrspace(1)* %out ret void @@ -219,13 +219,13 @@ ; GCN-NEXT: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_i1_func_void ; GCN-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>) - ; GCN-NEXT: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_i1_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0 + ; GCN-NEXT: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_i1_func_void, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0 ; GCN-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store (s1) into `i1 addrspace(1)* undef`, addrspace 1) - ; GCN-NEXT: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] - ; GCN-NEXT: S_SETPC_B64_return [[COPY3]] + ; GCN-NEXT: [[COPY3:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY]] + ; GCN-NEXT: S_SETPC_B64_return_gfx [[COPY3]] %val = call amdgpu_gfx i1 @external_gfx_i1_func_void() store volatile i1 %val, i1 addrspace(1)* undef ret void @@ -415,14 +415,14 @@ ; GCN-NEXT: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_i8_func_void ; GCN-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>) - ; GCN-NEXT: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_i8_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0 + ; GCN-NEXT: 
$sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_i8_func_void, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0 ; GCN-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) ; GCN-NEXT: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[TRUNC]](s16) ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[TRUNC1]](s8), [[DEF]](p1) :: (volatile store (s8) into `i8 addrspace(1)* undef`, addrspace 1) - ; GCN-NEXT: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] - ; GCN-NEXT: S_SETPC_B64_return [[COPY3]] + ; GCN-NEXT: [[COPY3:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY]] + ; GCN-NEXT: S_SETPC_B64_return_gfx [[COPY3]] %val = call amdgpu_gfx i8 @external_gfx_i8_func_void() store volatile i8 %val, i8 addrspace(1)* undef ret void @@ -784,12 +784,12 @@ ; GCN-NEXT: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_i32_func_void ; GCN-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>) - ; GCN-NEXT: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_i32_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0 + ; GCN-NEXT: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_i32_func_void, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0 ; GCN-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[COPY2]](s32), [[DEF]](p1) :: (volatile store (s32) into `i32 addrspace(1)* undef`, addrspace 1) - ; GCN-NEXT: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] - ; GCN-NEXT: S_SETPC_B64_return [[COPY3]] + ; GCN-NEXT: [[COPY3:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY]] + ; GCN-NEXT: S_SETPC_B64_return_gfx [[COPY3]] %val = call amdgpu_gfx i32 @external_gfx_i32_func_void() store volatile i32 %val, i32 addrspace(1)* undef ret void @@ -2480,7 +2480,7 @@ ; GCN-NEXT: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE 
@external_gfx_i32_i64_func_void ; GCN-NEXT: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]](<4 x s32>) - ; GCN-NEXT: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_i32_i64_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2 + ; GCN-NEXT: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_i32_i64_func_void, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2 ; GCN-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GCN-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GCN-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr2 @@ -2488,8 +2488,8 @@ ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; GCN-NEXT: G_STORE [[COPY3]](s32), [[DEF]](p1) :: (volatile store (s32) into `i32 addrspace(1)* undef`, addrspace 1) ; GCN-NEXT: G_STORE [[MV]](s64), [[COPY1]](p1) :: (volatile store (s64) into `i64 addrspace(1)* undef`, addrspace 1) - ; GCN-NEXT: [[COPY6:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] - ; GCN-NEXT: S_SETPC_B64_return [[COPY6]] + ; GCN-NEXT: [[COPY6:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY]] + ; GCN-NEXT: S_SETPC_B64_return_gfx [[COPY6]] %val = call amdgpu_gfx { i32, i64 } @external_gfx_i32_i64_func_void() %val.0 = extractvalue { i32, i64 } %val, 0 %val.1 = extractvalue { i32, i64 } %val, 1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll @@ -156,10 +156,10 @@ ; CHECK-NEXT: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_void_func_void ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>) - ; CHECK-NEXT: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_void_func_void, 
csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_void_func_void, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] - ; CHECK-NEXT: S_SETPC_B64_return [[COPY2]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY]] + ; CHECK-NEXT: S_SETPC_B64_return_gfx [[COPY2]] call amdgpu_gfx void @external_gfx_void_func_void() ret void } @@ -899,10 +899,10 @@ ; CHECK-NEXT: $vgpr0 = COPY [[C]](s32) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]](<4 x s32>) - ; CHECK-NEXT: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_void_func_i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_void_func_i32, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] - ; CHECK-NEXT: S_SETPC_B64_return [[COPY3]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK-NEXT: S_SETPC_B64_return_gfx [[COPY3]] call amdgpu_gfx void @external_gfx_void_func_i32(i32 42) ret void } @@ -920,10 +920,10 @@ ; CHECK-NEXT: $sgpr4 = COPY [[C]](s32) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]](<4 x s32>) - ; CHECK-NEXT: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_highregs, implicit $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc - ; CHECK-NEXT: 
[[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] - ; CHECK-NEXT: S_SETPC_B64_return [[COPY3]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK-NEXT: S_SETPC_B64_return_gfx [[COPY3]] call amdgpu_gfx void @external_gfx_void_func_i32_inreg(i32 inreg 42) ret void } @@ -3893,10 +3893,10 @@ ; CHECK-NEXT: $vgpr1 = COPY [[LOAD2]](s32) ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>) - ; CHECK-NEXT: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] - ; CHECK-NEXT: S_SETPC_B64_return [[COPY2]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY]] + ; CHECK-NEXT: S_SETPC_B64_return_gfx [[COPY2]] %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef %val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0 call amdgpu_gfx void @external_gfx_void_func_struct_i8_i32({ i8, i32 } %val) @@ -3923,10 +3923,10 @@ ; CHECK-NEXT: $sgpr5 = COPY [[LOAD2]](s32) ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>) - ; CHECK-NEXT: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_highregs, implicit $sgpr4, implicit $sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc - ; 
CHECK-NEXT: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]] - ; CHECK-NEXT: S_SETPC_B64_return [[COPY2]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY]] + ; CHECK-NEXT: S_SETPC_B64_return_gfx [[COPY2]] %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef %val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0 call amdgpu_gfx void @external_gfx_void_func_struct_i8_i32_inreg({ i8, i32 } inreg %val) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll @@ -4,51 +4,52 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr(void()* %fptr) { ; CHECK-LABEL: name: test_indirect_call_sgpr_ptr ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 - ; CHECK: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 - ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 - ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 - ; CHECK: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 - ; CHECK: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) - ; CHECK: [[LOAD:%[0-9]+]]:sreg_64(p0) = G_LOAD [[INT]](p4) :: (dereferenceable invariant load (p0) from %ir.fptr.kernarg.offset.cast, align 16, addrspace 4) - ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $scc - ; CHECK: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; 
CHECK: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) - ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) - ; CHECK: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] - ; CHECK: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] - ; CHECK: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) - ; CHECK: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) - ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 - ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) - ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] - ; CHECK: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) - ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) - ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; CHECK: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) - ; CHECK: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK: $sgpr6_sgpr7 = COPY [[COPY11]](p4) - ; CHECK: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) - ; CHECK: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK: $vgpr31 = COPY [[OR1]](s32) - ; CHECK: $sgpr30_sgpr31 = SI_CALL [[LOAD]](p0), 0, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31 - ; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $scc - ; CHECK: S_ENDPGM 0 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; CHECK-NEXT: 
[[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:sreg_64(p0) = G_LOAD [[INT]](p4) :: (dereferenceable invariant load (p0) from %ir.fptr.kernarg.offset.cast, align 16, addrspace 4) + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; CHECK-NEXT: 
[[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK-NEXT: $vgpr31 = COPY [[OR1]](s32) + ; CHECK-NEXT: $sgpr30_sgpr31 = SI_CALL [[LOAD]](p0), 0, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; CHECK-NEXT: S_ENDPGM 0 call void %fptr() ret void } @@ -56,18 +57,19 @@ define amdgpu_gfx void @test_gfx_indirect_call_sgpr_ptr(void()* %fptr) { ; CHECK-LABEL: name: test_gfx_indirect_call_sgpr_ptr ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31 - ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; CHECK: [[MV:%[0-9]+]]:sreg_64(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) - ; CHECK: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 - ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $scc - ; CHECK: [[COPY3:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 - ; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY3]](<4 x s32>) - ; CHECK: $sgpr30_sgpr31 = SI_CALL [[MV]](p0), 0, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $scc - ; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]] - ; CHECK: S_SETPC_B64_return [[COPY4]] + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) 
= COPY $vgpr1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:sreg_64(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY3]](<4 x s32>) + ; CHECK-NEXT: $sgpr30_sgpr31 = SI_CALL [[MV]](p0), 0, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gfx_ccr_sgpr_64 = COPY [[COPY2]] + ; CHECK-NEXT: S_SETPC_B64_return_gfx [[COPY4]] call amdgpu_gfx void %fptr() ret void } diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll --- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll +++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll @@ -147,20 +147,20 @@ ; GCN: amdpal.pipelines: ; GCN-NEXT: - .registers: -; SDAG-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf01ca{{$}} -; GISEL-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf01ce{{$}} +; SDAG-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf03ca{{$}} +; GISEL-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf03ce{{$}} ; GCN-NEXT: 0x2e13 (COMPUTE_PGM_RSRC2): 0x8001{{$}} ; GCN-NEXT: .shader_functions: ; GCN-NEXT: dynamic_stack: ; GCN-NEXT: .lds_size: 0{{$}} -; GCN-NEXT: .sgpr_count: 0x24{{$}} +; GCN-NEXT: .sgpr_count: 0x28{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}} ; SDAG-NEXT: .vgpr_count: 0x2{{$}} ; GISEL-NEXT: .vgpr_count: 0x3{{$}} ; GCN-NEXT: dynamic_stack_loop: ; GCN-NEXT: .lds_size: 0{{$}} -; SDAG-NEXT: .sgpr_count: 0x22{{$}} -; GISEL-NEXT: .sgpr_count: 0x24{{$}} +; SDAG-NEXT: .sgpr_count: 0x25{{$}} +; GISEL-NEXT: .sgpr_count: 0x26{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}} ; SDAG-NEXT: .vgpr_count: 0x3{{$}} ; GISEL-NEXT: .vgpr_count: 0x4{{$}} @@ -176,26 +176,26 @@ ; GCN-NEXT: .vgpr_count: 0x1{{$}} ; GCN-NEXT: no_stack_call: ; GCN-NEXT: .lds_size: 0{{$}} -; GCN-NEXT: .sgpr_count: 
0x21{{$}} +; GCN-NEXT: .sgpr_count: 0x26{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0{{$}} ; GCN-NEXT: .vgpr_count: 0x2{{$}} ; GCN-NEXT: no_stack_extern_call: ; GCN-NEXT: .lds_size: 0{{$}} -; GFX8-NEXT: .sgpr_count: 0x24{{$}} -; GFX9-NEXT: .sgpr_count: 0x28{{$}} +; GFX8-NEXT: .sgpr_count: 0x44{{$}} +; GFX9-NEXT: .sgpr_count: 0x48{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}} ; GCN-NEXT: .vgpr_count: 0x29{{$}} ; GCN-NEXT: no_stack_extern_call_many_args: ; GCN-NEXT: .lds_size: 0{{$}} -; GFX8-NEXT: .sgpr_count: 0x24{{$}} -; GFX9-NEXT: .sgpr_count: 0x28{{$}} +; GFX8-NEXT: .sgpr_count: 0x44{{$}} +; GFX9-NEXT: .sgpr_count: 0x48{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x90{{$}} ; SDAG-NEXT: .vgpr_count: 0x2a{{$}} ; GISEL-NEXT: .vgpr_count: 0x34{{$}} ; GCN-NEXT: no_stack_indirect_call: ; GCN-NEXT: .lds_size: 0{{$}} -; GFX8-NEXT: .sgpr_count: 0x24{{$}} -; GFX9-NEXT: .sgpr_count: 0x28{{$}} +; GFX8-NEXT: .sgpr_count: 0x44{{$}} +; GFX9-NEXT: .sgpr_count: 0x48{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}} ; SDAG-NEXT: .vgpr_count: 0x2a{{$}} ; GISEL-NEXT: .vgpr_count: 0x34{{$}} @@ -206,7 +206,7 @@ ; GCN-NEXT: .vgpr_count: 0x1{{$}} ; GCN-NEXT: simple_lds_recurse: ; GCN-NEXT: .lds_size: 0x100{{$}} -; GCN-NEXT: .sgpr_count: 0x24{{$}} +; GCN-NEXT: .sgpr_count: 0x44{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}} ; GCN-NEXT: .vgpr_count: 0x29{{$}} ; GCN-NEXT: simple_stack: @@ -216,25 +216,25 @@ ; GCN-NEXT: .vgpr_count: 0x2{{$}} ; GCN-NEXT: simple_stack_call: ; GCN-NEXT: .lds_size: 0{{$}} -; GCN-NEXT: .sgpr_count: 0x22{{$}} +; GCN-NEXT: .sgpr_count: 0x26{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}} ; GCN-NEXT: .vgpr_count: 0x3{{$}} ; GCN-NEXT: simple_stack_extern_call: ; GCN-NEXT: .lds_size: 0{{$}} -; GFX8-NEXT: .sgpr_count: 0x24{{$}} -; GFX9-NEXT: .sgpr_count: 0x28{{$}} +; GFX8-NEXT: .sgpr_count: 0x44{{$}} +; GFX9-NEXT: .sgpr_count: 0x48{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}} ; GCN-NEXT: .vgpr_count: 0x2a{{$}} ; GCN-NEXT: 
simple_stack_indirect_call: ; GCN-NEXT: .lds_size: 0{{$}} -; GFX8-NEXT: .sgpr_count: 0x24{{$}} -; GFX9-NEXT: .sgpr_count: 0x28{{$}} +; GFX8-NEXT: .sgpr_count: 0x44{{$}} +; GFX9-NEXT: .sgpr_count: 0x48{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}} ; SDAG-NEXT: .vgpr_count: 0x2b{{$}} ; GISEL-NEXT: .vgpr_count: 0x34{{$}} ; GCN-NEXT: simple_stack_recurse: ; GCN-NEXT: .lds_size: 0{{$}} -; GCN-NEXT: .sgpr_count: 0x24{{$}} +; GCN-NEXT: .sgpr_count: 0x44{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}} ; GCN-NEXT: .vgpr_count: 0x2a{{$}} ; GCN-NEXT: ... diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -96,59 +96,59 @@ ; GFX9-LABEL: test_call_external_void_func_i1_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_i1@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_i1@rel32@hi+12 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: 
v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_i1_imm: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_i1@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_i1@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: 
s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i1_imm: ; GFX10-SCRATCH: ; %bb.0: @@ -165,20 +165,20 @@ ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_i1(i1 true) ret void } @@ -187,63 +187,63 @@ ; GFX9-LABEL: test_call_external_void_func_i1_signext: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_i1_signext@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_i1_signext@rel32@hi+12 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_i1_signext: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: 
buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_i1_signext@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_i1_signext@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i1_signext: ; GFX10-SCRATCH: ; %bb.0: @@ -261,21 +261,21 @@ ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1_signext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, 
external_void_func_i1_signext@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] %var = load volatile i1, i1 addrspace(1)* undef call amdgpu_gfx void @external_void_func_i1_signext(i1 signext%var) ret void @@ -285,63 +285,63 @@ ; GFX9-LABEL: test_call_external_void_func_i1_zeroext: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, 
external_void_func_i1_zeroext@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_i1_zeroext@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_i1_zeroext@rel32@hi+12 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_i1_zeroext: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 
v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_i1_zeroext@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_i1_zeroext@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i1_zeroext: ; GFX10-SCRATCH: ; %bb.0: @@ -359,21 +359,21 @@ ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1_zeroext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1_zeroext@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, 
v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] %var = load volatile i1, i1 addrspace(1)* undef call amdgpu_gfx void @external_void_func_i1_zeroext(i1 zeroext %var) ret void @@ -383,57 +383,57 @@ ; GFX9-LABEL: test_call_external_void_func_i8_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_i8@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_i8@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: 
v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_i8_imm: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_i8@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_i8@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 
-; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i8_imm: ; GFX10-SCRATCH: ; %bb.0: @@ -450,19 +450,19 @@ ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i8@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_i8(i8 123) ret void } @@ -471,59 +471,59 @@ ; GFX9-LABEL: test_call_external_void_func_i8_signext: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_sbyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; 
GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i8_signext@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_i8_signext@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_i8_signext@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_i8_signext: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_sbyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: 
s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i8_signext@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_i8_signext@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_i8_signext@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i8_signext: ; GFX10-SCRATCH: ; %bb.0: @@ -541,19 +541,19 @@ ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8_signext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i8_signext@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; 
GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] %var = load volatile i8, i8 addrspace(1)* undef call amdgpu_gfx void @external_void_func_i8_signext(i8 signext %var) ret void @@ -563,59 +563,59 @@ ; GFX9-LABEL: test_call_external_void_func_i8_zeroext: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_i8_zeroext@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_i8_zeroext@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: 
v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_i8_zeroext: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_i8_zeroext@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_i8_zeroext@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 
s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i8_zeroext: ; GFX10-SCRATCH: ; %bb.0: @@ -633,19 +633,19 @@ ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8_zeroext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i8_zeroext@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] %var = load volatile i8, i8 addrspace(1)* undef call amdgpu_gfx void @external_void_func_i8_zeroext(i8 zeroext %var) ret void @@ -655,57 +655,57 @@ ; GFX9-LABEL: test_call_external_void_func_i16_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 
s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_i16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_i16@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_i16_imm: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 
0x7b ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_i16@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_i16@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i16_imm: ; GFX10-SCRATCH: ; %bb.0: @@ -722,19 +722,19 @@ ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 
s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_i16(i16 123) ret void } @@ -743,59 +743,59 @@ ; GFX9-LABEL: test_call_external_void_func_i16_signext: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_i16_signext@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_i16_signext@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; 
GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_i16_signext: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_i16_signext@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_i16_signext@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: 
s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i16_signext: ; GFX10-SCRATCH: ; %bb.0: @@ -813,19 +813,19 @@ ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16_signext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i16_signext@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] %var = load volatile i16, i16 addrspace(1)* undef call amdgpu_gfx void @external_void_func_i16_signext(i16 signext %var) ret void @@ -835,59 +835,59 @@ ; GFX9-LABEL: test_call_external_void_func_i16_zeroext: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; 
GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i16_zeroext@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_i16_zeroext@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_i16_zeroext@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_i16_zeroext: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 
+; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i16_zeroext@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_i16_zeroext@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_i16_zeroext@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i16_zeroext: ; GFX10-SCRATCH: ; %bb.0: @@ -905,19 +905,19 @@ ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16_zeroext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i16_zeroext@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-SCRATCH-NEXT: 
v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] %var = load volatile i16, i16 addrspace(1)* undef call amdgpu_gfx void @external_void_func_i16_zeroext(i16 zeroext %var) ret void @@ -927,57 +927,57 @@ ; GFX9-LABEL: test_call_external_void_func_i32_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, 
external_void_func_i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_i32_imm: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 
s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i32_imm: ; GFX10-SCRATCH: ; %bb.0: @@ -994,19 +994,19 @@ ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_i32(i32 42) ret void } @@ -1015,59 +1015,59 @@ ; GFX9-LABEL: test_call_external_void_func_i64_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 
s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_i64@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_i64@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_i64_imm: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 
; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_i64@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_i64@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i64_imm: ; GFX10-SCRATCH: ; %bb.0: @@ -1082,22 +1082,22 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i64@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; 
GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_i64(i64 123) ret void } @@ -1106,61 +1106,61 @@ ; GFX9-LABEL: test_call_external_void_func_v2i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 
s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v2i64@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v2i64@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v2i64@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v2i64@rel32@hi+12 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-NEXT: 
v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i64: ; GFX10-SCRATCH: ; %bb.0: @@ -1179,19 +1179,19 @@ ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64@rel32@hi+12 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] %val = load <2 x i64>, <2 x i64> addrspace(1)* null call amdgpu_gfx void 
@external_void_func_v2i64(<2 x i64> %val) ret void @@ -1201,63 +1201,63 @@ ; GFX9-LABEL: test_call_external_void_func_v2i64_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, 2 ; GFX9-NEXT: v_mov_b32_e32 v2, 3 ; GFX9-NEXT: v_mov_b32_e32 v3, 4 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v2i64@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v2i64@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v2i64_imm: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_mov_b32_e32 v3, 4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v2i64@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v2i64@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i64_imm: ; GFX10-SCRATCH: ; %bb.0: @@ -1272,24 +1272,24 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3 ; 
GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_v2i64(<2 x i64> ) ret void } @@ -1298,41 +1298,41 @@ ; GFX9-LABEL: test_call_external_void_func_v3i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; 
GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 1 ; GFX9-NEXT: v_mov_b32_e32 v5, 2 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i64@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v3i64@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v3i64@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v3i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 @@ -1340,23 +1340,23 @@ ; GFX10-NEXT: v_mov_b32_e32 v5, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: 
v_writelane_b32 v40, s64, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v3i64@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v3i64@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v3i64@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i64: ; GFX10-SCRATCH: ; %bb.0: @@ -1373,23 +1373,23 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i64@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: 
v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] %load = load <2 x i64>, <2 x i64> addrspace(1)* null %val = shufflevector <2 x i64> %load, <2 x i64> , <3 x i32> @@ -1401,43 +1401,43 @@ ; GFX9-LABEL: test_call_external_void_func_v4i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 1 ; GFX9-NEXT: v_mov_b32_e32 v5, 2 ; GFX9-NEXT: v_mov_b32_e32 v6, 3 ; GFX9-NEXT: v_mov_b32_e32 v7, 4 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i64@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: 
v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v4i64@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v4i64@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v4i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 @@ -1445,25 +1445,25 @@ ; GFX10-NEXT: v_mov_b32_e32 v5, 2 ; GFX10-NEXT: v_mov_b32_e32 v6, 3 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-NEXT: v_mov_b32_e32 v7, 4 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v4i64@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: 
s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v4i64@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v4i64@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i64: ; GFX10-SCRATCH: ; %bb.0: @@ -1480,25 +1480,25 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 3 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 4 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i64@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: 
s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] %load = load <2 x i64>, <2 x i64> addrspace(1)* null %val = shufflevector <2 x i64> %load, <2 x i64> , <4 x i32> call amdgpu_gfx void @external_void_func_v4i64(<4 x i64> %val) @@ -1509,57 +1509,57 @@ ; GFX9-LABEL: test_call_external_void_func_f16_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x4400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_f16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_f16@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; 
GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_f16_imm: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x4400 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_f16@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_f16@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; 
GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_f16_imm: ; GFX10-SCRATCH: ; %bb.0: @@ -1576,19 +1576,19 @@ ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_f16(half 4.0) ret void } @@ -1597,57 +1597,57 @@ ; GFX9-LABEL: test_call_external_void_func_f32_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 
0 ; GFX9-NEXT: v_mov_b32_e32 v0, 4.0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_f32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_f32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_f32_imm: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 4.0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 
s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_f32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_f32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_f32_imm: ; GFX10-SCRATCH: ; %bb.0: @@ -1664,19 +1664,19 @@ ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: 
s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_f32(float 4.0) ret void } @@ -1685,59 +1685,59 @@ ; GFX9-LABEL: test_call_external_void_func_v2f32_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v2f32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v2f32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; 
GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v2f32_imm: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v2f32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v2f32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2f32_imm: ; 
GFX10-SCRATCH: ; %bb.0: @@ -1752,22 +1752,22 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_v2f32(<2 x float> ) ret void } @@ -1776,61 +1776,61 @@ ; GFX9-LABEL: test_call_external_void_func_v3f32_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 
v40, s64, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX9-NEXT: v_mov_b32_e32 v2, 4.0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v3f32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v3f32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v3f32_imm: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; 
GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v3f32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v3f32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3f32_imm: ; GFX10-SCRATCH: ; %bb.0: @@ -1845,23 +1845,23 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 
1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_v3f32(<3 x float> ) ret void } @@ -1870,65 +1870,65 @@ ; GFX9-LABEL: test_call_external_void_func_v5f32_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX9-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX9-NEXT: v_mov_b32_e32 v3, -1.0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0.5 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v5f32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v5f32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 
s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v5f32_imm: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX10-NEXT: v_mov_b32_e32 v3, -1.0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0.5 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v5f32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v5f32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; 
GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v5f32_imm: ; GFX10-SCRATCH: ; %bb.0: @@ -1943,25 +1943,25 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, -1.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 0.5 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v5f32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v5f32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 
s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_v5f32(<5 x float> ) ret void } @@ -1970,59 +1970,59 @@ ; GFX9-LABEL: test_call_external_void_func_f64_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40100000 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_f64@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_f64@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_f64_imm: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_f64@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_f64@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_f64_imm: ; GFX10-SCRATCH: ; %bb.0: @@ -2037,22 +2037,22 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: 
v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f64@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_f64(double 4.0) ret void } @@ -2061,63 +2061,63 @@ ; GFX9-LABEL: test_call_external_void_func_v2f64_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x40100000 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; 
GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v2f64@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v2f64@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v2f64_imm: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0x40100000 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, 
external_void_func_v2f64@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v2f64@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v2f64@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2f64_imm: ; GFX10-SCRATCH: ; %bb.0: @@ -2132,24 +2132,24 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 0x40100000 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f64@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_v2f64(<2 x double> ) ret void } @@ -2158,67 +2158,67 @@ ; GFX9-LABEL: test_call_external_void_func_v3f64_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x40100000 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0x40200000 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v3f64@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v3f64@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], 
s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v3f64_imm: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0x40100000 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0x40200000 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v3f64@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v3f64@rel32@hi+12 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: 
v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3f64_imm: ; GFX10-SCRATCH: ; %bb.0: @@ -2233,26 +2233,26 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 0x40100000 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 0x40200000 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f64@rel32@hi+12 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 
exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_v3f64(<3 x double> ) ret void } @@ -2261,57 +2261,57 @@ ; GFX9-LABEL: test_call_external_void_func_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v2i16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v2i16@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v2i16: ; GFX10: ; 
%bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v2i16@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v2i16@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i16: ; GFX10-SCRATCH: ; %bb.0: @@ -2328,19 +2328,19 @@ ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i16@rel32@lo+4 ; 
GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] %val = load <2 x i16>, <2 x i16> addrspace(1)* undef call amdgpu_gfx void @external_void_func_v2i16(<2 x i16> %val) ret void @@ -2350,57 +2350,57 @@ ; GFX9-LABEL: test_call_external_void_func_v3i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 -; 
GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v3i16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v3i16@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v3i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, 
external_void_func_v3i16@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v3i16@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i16: ; GFX10-SCRATCH: ; %bb.0: @@ -2417,19 +2417,19 @@ ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; 
GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] %val = load <3 x i16>, <3 x i16> addrspace(1)* undef call amdgpu_gfx void @external_void_func_v3i16(<3 x i16> %val) ret void @@ -2439,57 +2439,57 @@ ; GFX9-LABEL: test_call_external_void_func_v3f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v3f16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v3f16@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v3f16: ; GFX10: 
; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v3f16@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v3f16@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3f16: ; GFX10-SCRATCH: ; %bb.0: @@ -2506,19 +2506,19 @@ ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, 
external_void_func_v3f16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] %val = load <3 x half>, <3 x half> addrspace(1)* undef call amdgpu_gfx void @external_void_func_v3f16(<3 x half> %val) ret void @@ -2528,59 +2528,59 @@ ; GFX9-LABEL: test_call_external_void_func_v3i16_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX9-NEXT: v_mov_b32_e32 v1, 3 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, 
external_void_func_v3i16@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v3i16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v3i16@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v3i16_imm: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX10-NEXT: v_mov_b32_e32 v1, 3 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, 
v40, 1 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v3i16@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v3i16@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i16_imm: ; GFX10-SCRATCH: ; %bb.0: @@ -2595,22 +2595,22 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 3 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 
4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_v3i16(<3 x i16> ) ret void } @@ -2619,59 +2619,59 @@ ; GFX9-LABEL: test_call_external_void_func_v3f16_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v3f16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v3f16@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 
exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v3f16_imm: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x4400 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v3f16@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v3f16@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; 
GFX10-SCRATCH-LABEL: test_call_external_void_func_v3f16_imm: ; GFX10-SCRATCH: ; %bb.0: @@ -2686,22 +2686,22 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x4400 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_v3f16(<3 x half> ) ret void } @@ -2710,57 +2710,57 @@ ; GFX9-LABEL: test_call_external_void_func_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 
s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v4i16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v4i16@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v4i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, 
external_void_func_v4i16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v4i16@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v4i16@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i16: ; GFX10-SCRATCH: ; %bb.0: @@ -2777,19 +2777,19 @@ ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, 
-1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] %val = load <4 x i16>, <4 x i16> addrspace(1)* undef call amdgpu_gfx void @external_void_func_v4i16(<4 x i16> %val) ret void @@ -2799,59 +2799,59 @@ ; GFX9-LABEL: test_call_external_void_func_v4i16_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40003 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v4i16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v4i16@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: 
s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v4i16_imm: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x40003 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v4i16@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v4i16@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 
exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i16_imm: ; GFX10-SCRATCH: ; %bb.0: @@ -2866,22 +2866,22 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x40003 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_v4i16(<4 x i16> ) ret void } @@ -2890,57 +2890,57 @@ ; GFX9-LABEL: test_call_external_void_func_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] 
+; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v2f16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v2f16@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 
s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v2f16@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v2f16@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2f16: ; GFX10-SCRATCH: ; %bb.0: @@ -2957,19 +2957,19 @@ ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: 
s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] %val = load <2 x half>, <2 x half> addrspace(1)* undef call amdgpu_gfx void @external_void_func_v2f16(<2 x half> %val) ret void @@ -2979,57 +2979,57 @@ ; GFX9-LABEL: test_call_external_void_func_v2i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v2i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v2i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: 
v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v2i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v2i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v2i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; 
GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i32: ; GFX10-SCRATCH: ; %bb.0: @@ -3046,19 +3046,19 @@ ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] %val = load <2 x i32>, <2 x i32> addrspace(1)* undef call amdgpu_gfx void @external_void_func_v2i32(<2 x i32> %val) ret void @@ -3068,59 +3068,59 @@ ; GFX9-LABEL: test_call_external_void_func_v2i32_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: 
s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, 2 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v2i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v2i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v2i32_imm: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 
0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v2i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v2i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i32_imm: ; GFX10-SCRATCH: ; %bb.0: @@ -3135,22 +3135,22 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; 
GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_v2i32(<2 x i32> ) ret void } @@ -3159,61 +3159,61 @@ ; GFX9-LABEL: test_call_external_void_func_v3i32_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 3 ; GFX9-NEXT: v_mov_b32_e32 v1, 4 ; GFX9-NEXT: v_mov_b32_e32 v2, 5 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v3i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v3i32@rel32@hi+12 +; GFX9-NEXT: 
s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v3i32_imm: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-NEXT: v_mov_b32_e32 v2, 5 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v3i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v3i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 
0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i32_imm: ; GFX10-SCRATCH: ; %bb.0: @@ -3228,23 +3228,23 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 5 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_v3i32(<3 x i32> ) ret void } @@ -3253,63 +3253,63 @@ ; GFX9-LABEL: 
test_call_external_void_func_v3i32_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 3 ; GFX9-NEXT: v_mov_b32_e32 v1, 4 ; GFX9-NEXT: v_mov_b32_e32 v2, 5 ; GFX9-NEXT: v_mov_b32_e32 v3, 6 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v3i32_i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v3i32_i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v3i32_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 
+; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-NEXT: v_mov_b32_e32 v2, 5 ; GFX10-NEXT: v_mov_b32_e32 v3, 6 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v3i32_i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v3i32_i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i32_i32: ; GFX10-SCRATCH: ; %bb.0: @@ -3324,24 +3324,24 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 5 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 6 -; GFX10-SCRATCH-NEXT: 
v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_v3i32_i32(<3 x i32> , i32 6) ret void } @@ -3350,57 +3350,57 @@ ; GFX9-LABEL: test_call_external_void_func_v4i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; 
GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v4i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v4i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v4i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 
s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v4i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v4i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i32: ; GFX10-SCRATCH: ; %bb.0: @@ -3417,19 +3417,19 @@ ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: 
s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] %val = load <4 x i32>, <4 x i32> addrspace(1)* undef call amdgpu_gfx void @external_void_func_v4i32(<4 x i32> %val) ret void @@ -3439,63 +3439,63 @@ ; GFX9-LABEL: test_call_external_void_func_v4i32_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, 2 ; GFX9-NEXT: v_mov_b32_e32 v2, 3 ; GFX9-NEXT: v_mov_b32_e32 v3, 4 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v4i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v4i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; 
GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v4i32_imm: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_mov_b32_e32 v3, 4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v4i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v4i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; 
GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i32_imm: ; GFX10-SCRATCH: ; %bb.0: @@ -3510,24 +3510,24 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_v4i32(<4 x i32> ) ret void } @@ -3536,65 +3536,65 @@ ; GFX9-LABEL: test_call_external_void_func_v5i32_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; 
GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, 2 ; GFX9-NEXT: v_mov_b32_e32 v2, 3 ; GFX9-NEXT: v_mov_b32_e32 v3, 4 ; GFX9-NEXT: v_mov_b32_e32 v4, 5 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v5i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v5i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v5i32_imm: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: 
v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_mov_b32_e32 v3, 4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 5 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v5i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v5i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v5i32_imm: ; GFX10-SCRATCH: ; %bb.0: @@ -3609,25 +3609,25 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 5 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: 
s_add_u32 s0, s0, external_void_func_v5i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v5i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_v5i32(<5 x i32> ) ret void } @@ -3636,66 +3636,66 @@ ; GFX9-LABEL: test_call_external_void_func_v8i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[4:5] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[4:5] offset:16 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: 
s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[30:31] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[30:31] offset:16 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v8i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v8i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v8i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[4:5] -; GFX10-NEXT: 
global_load_dwordx4 v[4:7], v8, s[4:5] offset:16 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[30:31] +; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[30:31] offset:16 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v8i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v8i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v8i32: ; GFX10-SCRATCH: ; %bb.0: @@ -3714,22 +3714,22 @@ ; GFX10-SCRATCH-NEXT: s_clause 0x1 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v8, s[0:1] ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v8i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v8i32@rel32@hi+12 -; 
GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] %ptr = load <8 x i32> addrspace(1)*, <8 x i32> addrspace(1)* addrspace(4)* undef %val = load <8 x i32>, <8 x i32> addrspace(1)* %ptr call amdgpu_gfx void @external_void_func_v8i32(<8 x i32> %val) @@ -3740,13 +3740,13 @@ ; GFX9-LABEL: test_call_external_void_func_v8i32_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, 2 ; GFX9-NEXT: v_mov_b32_e32 v2, 3 @@ -3755,56 +3755,56 @@ ; GFX9-NEXT: v_mov_b32_e32 v5, 6 ; GFX9-NEXT: v_mov_b32_e32 v6, 7 ; GFX9-NEXT: v_mov_b32_e32 v7, 8 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, 
external_void_func_v8i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v8i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v8i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v8i32_imm: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_mov_b32_e32 v3, 4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 5 ; GFX10-NEXT: v_mov_b32_e32 v5, 6 ; GFX10-NEXT: v_mov_b32_e32 v6, 7 ; GFX10-NEXT: v_mov_b32_e32 v7, 8 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: 
s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v8i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v8i32@rel32@hi+12 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v8i32_imm: ; GFX10-SCRATCH: ; %bb.0: @@ -3819,28 +3819,28 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 5 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 6 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 7 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 8 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v8i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v8i32@rel32@hi+12 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], 
s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_v8i32(<8 x i32> ) ret void } @@ -3849,70 +3849,70 @@ ; GFX9-LABEL: test_call_external_void_func_v16i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v16, 0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[4:5] offset:16 -; GFX9-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:32 -; GFX9-NEXT: global_load_dwordx4 v[12:15], v16, s[4:5] offset:48 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12 -; GFX9-NEXT: 
s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v16, s[30:31] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[30:31] offset:16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v16, s[30:31] offset:32 +; GFX9-NEXT: global_load_dwordx4 v[12:15], v16, s[30:31] offset:48 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v16i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v16i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v16i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v16, 0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x3 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] -; GFX10-NEXT: 
global_load_dwordx4 v[4:7], v16, s[4:5] offset:16 -; GFX10-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:32 -; GFX10-NEXT: global_load_dwordx4 v[12:15], v16, s[4:5] offset:48 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v16, s[30:31] +; GFX10-NEXT: global_load_dwordx4 v[4:7], v16, s[30:31] offset:16 +; GFX10-NEXT: global_load_dwordx4 v[8:11], v16, s[30:31] offset:32 +; GFX10-NEXT: global_load_dwordx4 v[12:15], v16, s[30:31] offset:48 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v16i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v16i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v16i32: ; GFX10-SCRATCH: ; %bb.0: @@ -3933,22 +3933,22 @@ ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[12:15], v16, 
s[0:1] offset:48 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v16i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v16i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] %ptr = load <16 x i32> addrspace(1)*, <16 x i32> addrspace(1)* addrspace(4)* undef %val = load <16 x i32>, <16 x i32> addrspace(1)* %ptr call amdgpu_gfx void @external_void_func_v16i32(<16 x i32> %val) @@ -3959,79 +3959,79 @@ ; GFX9-LABEL: test_call_external_void_func_v32i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v28, 0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; 
GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v28, s[4:5] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v28, s[4:5] offset:16 -; GFX9-NEXT: global_load_dwordx4 v[8:11], v28, s[4:5] offset:32 -; GFX9-NEXT: global_load_dwordx4 v[12:15], v28, s[4:5] offset:48 -; GFX9-NEXT: global_load_dwordx4 v[16:19], v28, s[4:5] offset:64 -; GFX9-NEXT: global_load_dwordx4 v[20:23], v28, s[4:5] offset:80 -; GFX9-NEXT: global_load_dwordx4 v[24:27], v28, s[4:5] offset:96 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v28, s[30:31] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v28, s[30:31] offset:16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v28, s[30:31] offset:32 +; GFX9-NEXT: global_load_dwordx4 v[12:15], v28, s[30:31] offset:48 +; GFX9-NEXT: global_load_dwordx4 v[16:19], v28, s[30:31] offset:64 +; GFX9-NEXT: global_load_dwordx4 v[20:23], v28, s[30:31] offset:80 +; GFX9-NEXT: global_load_dwordx4 v[24:27], v28, s[30:31] offset:96 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: global_load_dwordx4 v[28:31], v28, s[4:5] offset:112 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v32i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: global_load_dwordx4 v[28:31], v28, s[30:31] offset:112 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v32i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v32i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: 
s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v32i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v32, 0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v32, s[4:5] -; GFX10-NEXT: global_load_dwordx4 v[4:7], v32, s[4:5] offset:16 -; GFX10-NEXT: global_load_dwordx4 v[8:11], v32, s[4:5] offset:32 -; GFX10-NEXT: global_load_dwordx4 v[12:15], v32, s[4:5] offset:48 -; GFX10-NEXT: global_load_dwordx4 v[16:19], v32, s[4:5] offset:64 -; GFX10-NEXT: global_load_dwordx4 v[20:23], v32, s[4:5] offset:80 -; GFX10-NEXT: global_load_dwordx4 v[24:27], v32, s[4:5] offset:96 -; GFX10-NEXT: global_load_dwordx4 v[28:31], v32, s[4:5] offset:112 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v32i32@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; 
GFX10-NEXT: global_load_dwordx4 v[0:3], v32, s[30:31] +; GFX10-NEXT: global_load_dwordx4 v[4:7], v32, s[30:31] offset:16 +; GFX10-NEXT: global_load_dwordx4 v[8:11], v32, s[30:31] offset:32 +; GFX10-NEXT: global_load_dwordx4 v[12:15], v32, s[30:31] offset:48 +; GFX10-NEXT: global_load_dwordx4 v[16:19], v32, s[30:31] offset:64 +; GFX10-NEXT: global_load_dwordx4 v[20:23], v32, s[30:31] offset:80 +; GFX10-NEXT: global_load_dwordx4 v[24:27], v32, s[30:31] offset:96 +; GFX10-NEXT: global_load_dwordx4 v[28:31], v32, s[30:31] offset:112 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v32i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v32i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v32i32: ; GFX10-SCRATCH: ; %bb.0: @@ -4056,22 +4056,22 @@ ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v32i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, 
external_void_func_v32i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] %ptr = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef %val = load <32 x i32>, <32 x i32> addrspace(1)* %ptr call amdgpu_gfx void @external_void_func_v32i32(<32 x i32> %val) @@ -4082,53 +4082,53 @@ ; GFX9-LABEL: test_call_external_void_func_v32i32_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v28, 0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v28, s[4:5] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v28, s[4:5] offset:16 -; GFX9-NEXT: global_load_dwordx4 v[8:11], v28, s[4:5] offset:32 -; GFX9-NEXT: 
global_load_dwordx4 v[12:15], v28, s[4:5] offset:48 -; GFX9-NEXT: global_load_dwordx4 v[16:19], v28, s[4:5] offset:64 -; GFX9-NEXT: global_load_dwordx4 v[20:23], v28, s[4:5] offset:80 -; GFX9-NEXT: global_load_dwordx4 v[24:27], v28, s[4:5] offset:96 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v28, s[30:31] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v28, s[30:31] offset:16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v28, s[30:31] offset:32 +; GFX9-NEXT: global_load_dwordx4 v[12:15], v28, s[30:31] offset:48 +; GFX9-NEXT: global_load_dwordx4 v[16:19], v28, s[30:31] offset:64 +; GFX9-NEXT: global_load_dwordx4 v[20:23], v28, s[30:31] offset:80 +; GFX9-NEXT: global_load_dwordx4 v[24:27], v28, s[30:31] offset:96 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: global_load_dwordx4 v[28:31], v28, s[4:5] offset:112 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 +; GFX9-NEXT: global_load_dwordx4 v[28:31], v28, s[30:31] offset:112 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v32i32_i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v32i32_i32@rel32@hi+12 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: global_load_dword v32, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v32i32_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v32, 0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 @@ -4136,32 +4136,32 @@ ; GFX10-NEXT: global_load_dword v33, v[0:1], off ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v32, s[4:5] -; GFX10-NEXT: global_load_dwordx4 v[4:7], v32, s[4:5] offset:16 -; GFX10-NEXT: global_load_dwordx4 v[8:11], v32, s[4:5] offset:32 -; GFX10-NEXT: global_load_dwordx4 v[12:15], v32, s[4:5] offset:48 -; GFX10-NEXT: global_load_dwordx4 v[16:19], v32, s[4:5] offset:64 -; GFX10-NEXT: global_load_dwordx4 v[20:23], v32, s[4:5] offset:80 -; GFX10-NEXT: global_load_dwordx4 v[24:27], v32, s[4:5] offset:96 -; GFX10-NEXT: global_load_dwordx4 v[28:31], v32, s[4:5] offset:112 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v32, s[30:31] +; GFX10-NEXT: global_load_dwordx4 v[4:7], v32, s[30:31] offset:16 +; GFX10-NEXT: global_load_dwordx4 v[8:11], v32, 
s[30:31] offset:32 +; GFX10-NEXT: global_load_dwordx4 v[12:15], v32, s[30:31] offset:48 +; GFX10-NEXT: global_load_dwordx4 v[16:19], v32, s[30:31] offset:64 +; GFX10-NEXT: global_load_dwordx4 v[20:23], v32, s[30:31] offset:80 +; GFX10-NEXT: global_load_dwordx4 v[24:27], v32, s[30:31] offset:96 +; GFX10-NEXT: global_load_dwordx4 v[28:31], v32, s[30:31] offset:112 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v32i32_i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v32i32_i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-NEXT: s_waitcnt vmcnt(8) ; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v32i32_i32: ; GFX10-SCRATCH: ; %bb.0: @@ -4187,24 +4187,24 @@ ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, 
external_void_func_v32i32_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32_i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(8) ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v33, s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] %ptr0 = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef %val0 = load <32 x i32>, <32 x i32> addrspace(1)* %ptr0 %val1 = load i32, i32 addrspace(1)* undef @@ -4216,74 +4216,74 @@ ; GFX9-LABEL: test_call_external_i32_func_i32_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; 
GFX9-NEXT: v_writelane_b32 v40, s64, 0 ; GFX9-NEXT: v_mov_b32_e32 v41, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 ; GFX9-NEXT: v_mov_b32_e32 v42, v1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_i32_func_i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_i32_func_i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] ; GFX9-NEXT: global_store_dword v[41:42], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_i32_func_i32_imm: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, 
s32 ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: v_mov_b32_e32 v41, v0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_mov_b32_e32 v42, v1 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_i32_func_i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_i32_func_i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] ; GFX10-NEXT: global_store_dword v[41:42], v0, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_i32_func_i32_imm: ; GFX10-SCRATCH: ; %bb.0: @@ -4298,30 +4298,30 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: 
scratch_store_dword off, v42, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, v0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v42, v1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_i32_func_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_i32_func_i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: global_store_dword v[41:42], v0, off ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_clause 0x1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 ; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] %val = call amdgpu_gfx i32 @external_i32_func_i32(i32 42) store volatile i32 %val, i32 addrspace(1)* %out ret void @@ -4331,66 +4331,66 @@ ; GFX9-LABEL: test_call_external_void_func_struct_i8_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; 
GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:4 -; GFX9-NEXT: global_load_ubyte v0, v2, s[4:5] -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: global_load_dword v1, v2, s[30:31] offset:4 +; GFX9-NEXT: global_load_ubyte v0, v2, s[30:31] +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_struct_i8_i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_struct_i8_i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_struct_i8_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_ubyte v0, v2, s[4:5] -; GFX10-NEXT: global_load_dword v1, v2, s[4:5] offset:4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: global_load_ubyte v0, v2, s[30:31] +; GFX10-NEXT: global_load_dword v1, v2, s[30:31] offset:4 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_struct_i8_i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_struct_i8_i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; 
GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_struct_i8_i32: ; GFX10-SCRATCH: ; %bb.0: @@ -4409,22 +4409,22 @@ ; GFX10-SCRATCH-NEXT: s_clause 0x1 ; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v2, s[0:1] ; GFX10-SCRATCH-NEXT: global_load_dword v1, v2, s[0:1] offset:4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_struct_i8_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_struct_i8_i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef %val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0 call amdgpu_gfx void @external_void_func_struct_i8_i32({ i8, i32 } %val) @@ -4435,65 +4435,65 @@ ; GFX9-LABEL: test_call_external_void_func_byval_struct_i8_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: 
s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: v_mov_b32_e32 v0, 3 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s33 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_byval_struct_i8_i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_byval_struct_i8_i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_byval_struct_i8_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; 
GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-NEXT: v_mov_b32_e32 v1, 8 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s33 ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_byval_struct_i8_i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_byval_struct_i8_i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_byval_struct_i8_i32: ; 
GFX10-SCRATCH: ; %bb.0: @@ -4508,25 +4508,25 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 8 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s33 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s33 offset:4 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s33 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_byval_struct_i8_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_byval_struct_i8_i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] %val = alloca { i8, i32 }, align 4, addrspace(5) %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %val, i32 0, i32 0 %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %val, i32 0, i32 1 @@ -4540,9 +4540,9 @@ ; GFX9-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], 
-1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: v_mov_b32_e32 v0, 3 @@ -4551,18 +4551,18 @@ ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 ; GFX9-NEXT: s_addk_i32 s32, 0x800 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 ; GFX9-NEXT: v_add_u32_e32 v0, 8, v0 ; GFX9-NEXT: v_lshrrev_b32_e64 v1, 6, s33 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] ; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], s33 offset:8 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:12 -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -4570,20 +4570,20 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v[0:1], v1, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload 
-; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-NEXT: v_mov_b32_e32 v1, 8 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 @@ -4592,19 +4592,19 @@ ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s33 ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-NEXT: v_lshrrev_b32_e64 v1, 5, s33 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 8, v0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: buffer_load_ubyte v0, off, s[0:3], s33 offset:8 ; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:12 -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: 
v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfc00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -4612,12 +4612,12 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_store_dword v[0:1], v1, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32: ; GFX10-SCRATCH: ; %bb.0: @@ -4635,19 +4635,19 @@ ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: s_add_i32 vcc_lo, s33, 8 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s64, s33, 8 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s33 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s33 offset:4 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, vcc_lo +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s64 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, s33 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: s_clause 0x1 ; GFX10-SCRATCH-NEXT: scratch_load_ubyte v0, off, s33 offset:8 ; GFX10-SCRATCH-NEXT: scratch_load_dword v1, off, s33 offset:12 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; 
GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -4655,12 +4655,12 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: global_store_dword v[0:1], v1, off ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:16 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] %in.val = alloca { i8, i32 }, align 4, addrspace(5) %out.val = alloca { i8, i32 }, align 4, addrspace(5) %in.gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %in.val, i32 0, i32 0 @@ -4682,21 +4682,21 @@ ; GFX9-LABEL: test_call_external_void_func_v16i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[4:5] -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, 
external_void_func_v16i8@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[30:31] +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v16i8@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v16i8@rel32@hi+12 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v0 @@ -4716,38 +4716,38 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, v16 ; GFX9-NEXT: v_mov_b32_e32 v2, v17 ; GFX9-NEXT: v_mov_b32_e32 v3, v18 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v16i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: 
s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[4:5] +; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[30:31] ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v16i8@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v16i8@rel32@hi+12 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v16, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v0 @@ -4767,17 +4767,17 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, v16 ; GFX10-NEXT: v_mov_b32_e32 v2, v17 ; GFX10-NEXT: v_mov_b32_e32 v3, v18 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v16i8: ; GFX10-SCRATCH: ; %bb.0: @@ -4792,8 +4792,8 @@ ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; 
GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1] ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -4820,16 +4820,16 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, v17 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, v18 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] %ptr = load <16 x i8> addrspace(1)*, <16 x i8> addrspace(1)* addrspace(4)* undef %val = load <16 x i8>, <16 x i8> addrspace(1)* %ptr call amdgpu_gfx void @external_void_func_v16i8(<16 x i8> %val) @@ -4840,44 +4840,269 @@ ; GFX9-LABEL: tail_call_byval_align16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: buffer_store_dword v33, off, 
s[0:3], s32 +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_writelane_b32 v40, s33, 30 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:8 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:12 +; GFX9-NEXT: v_writelane_b32 v40, s34, 0 +; GFX9-NEXT: v_writelane_b32 v40, s35, 1 +; GFX9-NEXT: v_writelane_b32 v40, s36, 2 +; GFX9-NEXT: v_writelane_b32 v40, s37, 3 +; GFX9-NEXT: v_writelane_b32 v40, s38, 4 +; GFX9-NEXT: v_writelane_b32 v40, s39, 5 +; GFX9-NEXT: v_writelane_b32 v40, s40, 6 +; GFX9-NEXT: v_writelane_b32 v40, s41, 7 +; GFX9-NEXT: v_writelane_b32 v40, s42, 8 +; GFX9-NEXT: v_writelane_b32 v40, s43, 9 +; GFX9-NEXT: v_writelane_b32 v40, s44, 10 +; GFX9-NEXT: v_writelane_b32 v40, s45, 11 +; GFX9-NEXT: v_writelane_b32 v40, s46, 12 +; GFX9-NEXT: v_writelane_b32 v40, s47, 13 +; GFX9-NEXT: v_writelane_b32 v40, s48, 14 +; GFX9-NEXT: v_writelane_b32 v40, s49, 15 +; GFX9-NEXT: v_writelane_b32 v40, s50, 16 +; GFX9-NEXT: v_writelane_b32 v40, s51, 17 +; GFX9-NEXT: v_writelane_b32 v40, s52, 18 +; GFX9-NEXT: v_writelane_b32 v40, s53, 19 +; GFX9-NEXT: v_writelane_b32 v40, s54, 20 +; GFX9-NEXT: v_writelane_b32 v40, s55, 21 +; GFX9-NEXT: v_writelane_b32 v40, s56, 22 +; GFX9-NEXT: v_writelane_b32 v40, s57, 23 +; GFX9-NEXT: v_writelane_b32 v40, s58, 24 +; GFX9-NEXT: v_writelane_b32 v40, s59, 25 +; GFX9-NEXT: v_writelane_b32 v40, s60, 26 +; GFX9-NEXT: v_writelane_b32 v40, s61, 27 +; GFX9-NEXT: s_addk_i32 s32, 0x800 +; GFX9-NEXT: v_writelane_b32 v40, s62, 28 +; GFX9-NEXT: v_writelane_b32 v40, s63, 29 +; GFX9-NEXT: s_mov_b64 s[4:5], s[30:31] +; GFX9-NEXT: s_getpc_b64 s[6:7] +; GFX9-NEXT: s_add_u32 s6, s6, byval_align16_f64_arg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s7, s7, byval_align16_f64_arg@rel32@hi+12 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 
offset:4 +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: v_readlane_b32 s63, v40, 29 +; GFX9-NEXT: v_readlane_b32 s62, v40, 28 +; GFX9-NEXT: v_readlane_b32 s61, v40, 27 +; GFX9-NEXT: v_readlane_b32 s60, v40, 26 +; GFX9-NEXT: v_readlane_b32 s59, v40, 25 +; GFX9-NEXT: v_readlane_b32 s58, v40, 24 +; GFX9-NEXT: v_readlane_b32 s57, v40, 23 +; GFX9-NEXT: v_readlane_b32 s56, v40, 22 +; GFX9-NEXT: v_readlane_b32 s55, v40, 21 +; GFX9-NEXT: v_readlane_b32 s54, v40, 20 +; GFX9-NEXT: v_readlane_b32 s53, v40, 19 +; GFX9-NEXT: v_readlane_b32 s52, v40, 18 +; GFX9-NEXT: v_readlane_b32 s51, v40, 17 +; GFX9-NEXT: v_readlane_b32 s50, v40, 16 +; GFX9-NEXT: v_readlane_b32 s49, v40, 15 +; GFX9-NEXT: v_readlane_b32 s48, v40, 14 +; GFX9-NEXT: v_readlane_b32 s47, v40, 13 +; GFX9-NEXT: v_readlane_b32 s46, v40, 12 +; GFX9-NEXT: v_readlane_b32 s45, v40, 11 +; GFX9-NEXT: v_readlane_b32 s44, v40, 10 +; GFX9-NEXT: v_readlane_b32 s43, v40, 9 +; GFX9-NEXT: v_readlane_b32 s42, v40, 8 +; GFX9-NEXT: v_readlane_b32 s41, v40, 7 +; GFX9-NEXT: v_readlane_b32 s40, v40, 6 +; GFX9-NEXT: v_readlane_b32 s39, v40, 5 +; GFX9-NEXT: v_readlane_b32 s38, v40, 4 +; GFX9-NEXT: v_readlane_b32 s37, v40, 3 +; GFX9-NEXT: v_readlane_b32 s36, v40, 2 +; GFX9-NEXT: v_readlane_b32 s35, v40, 1 +; GFX9-NEXT: v_readlane_b32 s34, v40, 0 +; GFX9-NEXT: s_addk_i32 s32, 0xf800 +; GFX9-NEXT: v_readlane_b32 s33, v40, 30 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[4:5] ; ; GFX10-LABEL: tail_call_byval_align16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 
+; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: v_writelane_b32 v40, s33, 30 +; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 -; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12 +; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:12 +; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:8 +; GFX10-NEXT: s_addk_i32 s32, 0x400 +; GFX10-NEXT: v_writelane_b32 v40, s34, 0 +; GFX10-NEXT: s_mov_b64 s[4:5], s[30:31] +; GFX10-NEXT: s_getpc_b64 s[6:7] +; GFX10-NEXT: s_add_u32 s6, s6, byval_align16_f64_arg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s7, s7, byval_align16_f64_arg@rel32@hi+12 ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32 +; GFX10-NEXT: v_writelane_b32 v40, s35, 1 +; GFX10-NEXT: v_writelane_b32 v40, s36, 2 +; GFX10-NEXT: v_writelane_b32 v40, s37, 3 +; GFX10-NEXT: v_writelane_b32 v40, s38, 4 +; GFX10-NEXT: v_writelane_b32 v40, s39, 5 +; GFX10-NEXT: v_writelane_b32 v40, s40, 6 +; GFX10-NEXT: v_writelane_b32 v40, s41, 7 +; GFX10-NEXT: v_writelane_b32 v40, s42, 8 +; GFX10-NEXT: v_writelane_b32 v40, s43, 9 +; GFX10-NEXT: v_writelane_b32 v40, s44, 10 +; GFX10-NEXT: v_writelane_b32 v40, s45, 11 +; GFX10-NEXT: v_writelane_b32 v40, s46, 12 +; GFX10-NEXT: v_writelane_b32 v40, s47, 13 +; GFX10-NEXT: v_writelane_b32 v40, s48, 14 +; GFX10-NEXT: v_writelane_b32 v40, s49, 15 +; GFX10-NEXT: v_writelane_b32 v40, s50, 16 +; GFX10-NEXT: v_writelane_b32 v40, s51, 17 +; GFX10-NEXT: v_writelane_b32 v40, s52, 18 +; GFX10-NEXT: v_writelane_b32 v40, s53, 19 +; GFX10-NEXT: v_writelane_b32 v40, s54, 20 +; GFX10-NEXT: v_writelane_b32 v40, s55, 21 +; GFX10-NEXT: v_writelane_b32 v40, s56, 
22 +; GFX10-NEXT: v_writelane_b32 v40, s57, 23 +; GFX10-NEXT: v_writelane_b32 v40, s58, 24 +; GFX10-NEXT: v_writelane_b32 v40, s59, 25 +; GFX10-NEXT: v_writelane_b32 v40, s60, 26 +; GFX10-NEXT: v_writelane_b32 v40, s61, 27 +; GFX10-NEXT: v_writelane_b32 v40, s62, 28 +; GFX10-NEXT: v_writelane_b32 v40, s63, 29 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX10-NEXT: v_readlane_b32 s63, v40, 29 +; GFX10-NEXT: v_readlane_b32 s62, v40, 28 +; GFX10-NEXT: v_readlane_b32 s61, v40, 27 +; GFX10-NEXT: v_readlane_b32 s60, v40, 26 +; GFX10-NEXT: v_readlane_b32 s59, v40, 25 +; GFX10-NEXT: v_readlane_b32 s58, v40, 24 +; GFX10-NEXT: v_readlane_b32 s57, v40, 23 +; GFX10-NEXT: v_readlane_b32 s56, v40, 22 +; GFX10-NEXT: v_readlane_b32 s55, v40, 21 +; GFX10-NEXT: v_readlane_b32 s54, v40, 20 +; GFX10-NEXT: v_readlane_b32 s53, v40, 19 +; GFX10-NEXT: v_readlane_b32 s52, v40, 18 +; GFX10-NEXT: v_readlane_b32 s51, v40, 17 +; GFX10-NEXT: v_readlane_b32 s50, v40, 16 +; GFX10-NEXT: v_readlane_b32 s49, v40, 15 +; GFX10-NEXT: v_readlane_b32 s48, v40, 14 +; GFX10-NEXT: v_readlane_b32 s47, v40, 13 +; GFX10-NEXT: v_readlane_b32 s46, v40, 12 +; GFX10-NEXT: v_readlane_b32 s45, v40, 11 +; GFX10-NEXT: v_readlane_b32 s44, v40, 10 +; GFX10-NEXT: v_readlane_b32 s43, v40, 9 +; GFX10-NEXT: v_readlane_b32 s42, v40, 8 +; GFX10-NEXT: v_readlane_b32 s41, v40, 7 +; GFX10-NEXT: v_readlane_b32 s40, v40, 6 +; GFX10-NEXT: v_readlane_b32 s39, v40, 5 +; GFX10-NEXT: v_readlane_b32 s38, v40, 4 +; GFX10-NEXT: v_readlane_b32 s37, v40, 3 +; GFX10-NEXT: v_readlane_b32 s36, v40, 2 +; GFX10-NEXT: v_readlane_b32 s35, v40, 1 +; GFX10-NEXT: v_readlane_b32 s34, v40, 0 +; GFX10-NEXT: s_addk_i32 s32, 0xfc00 +; GFX10-NEXT: v_readlane_b32 s33, v40, 30 +; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[4:5] ; ; 
GFX10-SCRATCH-LABEL: tail_call_byval_align16: ; GFX10-SCRATCH: ; %bb.0: ; %entry ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: scratch_load_dwordx2 v[32:33], off, s32 offset:8 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 offset:16 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 30 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 +; GFX10-SCRATCH-NEXT: scratch_load_dwordx2 v[32:33], off, s33 offset:8 +; GFX10-SCRATCH-NEXT: s_mov_b64 s[4:5], s[30:31] +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s34, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, byval_align16_f64_arg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, byval_align16_f64_arg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s35, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s36, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s37, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s38, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s39, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s40, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s41, 7 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s42, 8 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s43, 9 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s44, 10 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s45, 11 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s46, 12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s47, 13 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s48, 14 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s49, 15 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s50, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s51, 17 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s52, 18 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s53, 19 +; GFX10-SCRATCH-NEXT: 
v_writelane_b32 v40, s54, 20 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s55, 21 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s56, 22 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s57, 23 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s58, 24 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s59, 25 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s60, 26 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s61, 27 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s62, 28 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s63, 29 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[32:33], s32 -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s63, v40, 29 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s62, v40, 28 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s61, v40, 27 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s60, v40, 26 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s59, v40, 25 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s58, v40, 24 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s57, v40, 23 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s56, v40, 22 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s55, v40, 21 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s54, v40, 20 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s53, v40, 19 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s52, v40, 18 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s51, v40, 17 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s50, v40, 16 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s49, v40, 15 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s48, v40, 14 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s47, v40, 13 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s46, v40, 12 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s45, v40, 11 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s44, v40, 10 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s43, v40, 9 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s42, v40, 8 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s41, v40, 7 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s40, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s39, v40, 5 +; GFX10-SCRATCH-NEXT: 
v_readlane_b32 s38, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s37, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s36, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s35, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s34, v40, 0 +; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 30 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:16 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[4:5] entry: %alloca = alloca double, align 8, addrspace(5) tail call amdgpu_gfx void @byval_align16_f64_arg(<32 x i32> %val, double addrspace(5)* byval(double) align 16 %alloca) @@ -4889,59 +5114,59 @@ ; GFX9-LABEL: test_call_external_void_func_i1_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1_inreg@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_i1_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_i1_inreg@rel32@hi+12 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, 
v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_i1_imm_inreg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i1_inreg@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i1_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_i1_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_i1_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; 
GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i1_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: @@ -4958,20 +5183,20 @@ ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_i1_inreg(i1 inreg true) ret void } @@ -4980,57 +5205,61 @@ ; GFX9-LABEL: 
test_call_external_void_func_i8_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: v_writelane_b32 v40, s33, 3 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7b -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[6:7] -; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_i8_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_i8_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: v_writelane_b32 v40, s65, 2 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_i8_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_i8_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 2 +; GFX9-NEXT: v_readlane_b32 s64, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: v_readlane_b32 s33, v40, 3 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_i8_imm_inreg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; 
GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-NEXT: s_movk_i32 s4, 0x7b +; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: v_writelane_b32 v40, s33, 3 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[6:7] -; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_i8_inreg@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_i8_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_i8_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_i8_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_movk_i32 s4, 0x7b +; GFX10-NEXT: v_writelane_b32 v40, s64, 1 +; GFX10-NEXT: v_writelane_b32 v40, s65, 2 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 2 +; GFX10-NEXT: v_readlane_b32 s64, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i8_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: @@ -5040,26 +5269,28 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 
; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 3 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i8_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_i8_inreg(i8 inreg 123) ret void } @@ -5068,57 +5299,61 @@ ; GFX9-LABEL: test_call_external_void_func_i16_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; 
GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: v_writelane_b32 v40, s33, 3 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7b -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[6:7] -; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_i16_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_i16_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: v_writelane_b32 v40, s65, 2 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_i16_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_i16_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 2 +; GFX9-NEXT: v_readlane_b32 s64, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: v_readlane_b32 s33, v40, 3 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_i16_imm_inreg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; 
GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-NEXT: s_movk_i32 s4, 0x7b +; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: v_writelane_b32 v40, s33, 3 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[6:7] -; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_i16_inreg@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_i16_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_i16_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_i16_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_movk_i32 s4, 0x7b +; GFX10-NEXT: v_writelane_b32 v40, s64, 1 +; GFX10-NEXT: v_writelane_b32 v40, s65, 2 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 2 +; GFX10-NEXT: v_readlane_b32 s64, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i16_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: @@ -5128,26 +5363,28 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 
v40, s33, 2 -; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 3 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i16_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_i16_inreg(i16 inreg 123) ret void } @@ -5156,57 +5393,61 @@ ; GFX9-LABEL: test_call_external_void_func_i32_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, 
s[4:5] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: v_writelane_b32 v40, s33, 3 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 1 ; GFX9-NEXT: s_mov_b32 s4, 42 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[6:7] -; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_i32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_i32_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: v_writelane_b32 v40, s65, 2 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_i32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_i32_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 2 +; GFX9-NEXT: v_readlane_b32 s64, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: v_readlane_b32 s33, v40, 3 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_i32_imm_inreg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-NEXT: s_mov_b32 s4, 42 +; 
GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: v_writelane_b32 v40, s33, 3 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[6:7] -; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_i32_inreg@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_i32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_i32_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_i32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 42 +; GFX10-NEXT: v_writelane_b32 v40, s64, 1 +; GFX10-NEXT: v_writelane_b32 v40, s65, 2 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 2 +; GFX10-NEXT: v_readlane_b32 s64, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i32_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: @@ -5216,26 +5457,28 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 42 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 3 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; 
GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 42 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_i32_inreg(i32 inreg 42) ret void } @@ -5244,59 +5487,67 @@ ; GFX9-LABEL: test_call_external_void_func_i64_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: v_writelane_b32 v40, s33, 4 +; GFX9-NEXT: 
v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 2 ; GFX9-NEXT: s_movk_i32 s4, 0x7b ; GFX9-NEXT: s_mov_b32 s5, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[6:7] -; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_i64_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_i64_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 3 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_i64_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_i64_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 3 +; GFX9-NEXT: v_readlane_b32 s64, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: v_readlane_b32 s33, v40, 4 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_i64_imm_inreg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-NEXT: s_movk_i32 s4, 0x7b -; GFX10-NEXT: s_mov_b32 s5, 0 +; 
GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: v_writelane_b32 v40, s33, 4 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_getpc_b64 s[6:7] -; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_i64_inreg@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_i64_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_i64_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_i64_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_movk_i32 s4, 0x7b +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: v_writelane_b32 v40, s64, 2 +; GFX10-NEXT: v_writelane_b32 v40, s65, 3 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 3 +; GFX10-NEXT: v_readlane_b32 s64, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i64_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: @@ -5306,27 +5557,31 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-SCRATCH-NEXT: 
s_movk_i32 s4, 0x7b -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i64_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i64_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_i64_inreg(i64 inreg 123) ret void } @@ -5335,59 +5590,75 @@ ; GFX9-LABEL: test_call_external_void_func_v2i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: 
s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: v_writelane_b32 v40, s33, 6 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: s_mov_b64 s[30:31], 0 +; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[30:31], 0x0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[8:9] -; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v2i64_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v2i64_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 4 +; GFX9-NEXT: v_writelane_b32 v40, s65, 5 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v2i64_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v2i64_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 5 +; GFX9-NEXT: v_readlane_b32 s64, v40, 4 +; GFX9-NEXT: v_readlane_b32 s7, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: v_readlane_b32 s33, v40, 6 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 
s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v2i64_inreg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_mov_b64 s[4:5], 0 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: v_writelane_b32 v40, s33, 6 +; GFX10-NEXT: s_mov_b64 s[30:31], 0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[8:9] -; GFX10-NEXT: s_add_u32 s8, s8, external_void_func_v2i64_inreg@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s9, s9, external_void_func_v2i64_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[8:9] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[30:31], 0x0 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v2i64_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v2i64_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s64, 4 +; GFX10-NEXT: v_writelane_b32 v40, s65, 5 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 5 +; GFX10-NEXT: v_readlane_b32 s64, v40, 4 +; GFX10-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; 
GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: v_readlane_b32 s33, v40, 6 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i64_inreg: ; GFX10-SCRATCH: ; %bb.0: @@ -5397,27 +5668,35 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 6 ; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i64_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 5 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, 
v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 6 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] %val = load <2 x i64>, <2 x i64> addrspace(4)* null call amdgpu_gfx void @external_void_func_v2i64_inreg(<2 x i64> inreg %val) ret void @@ -5427,63 +5706,79 @@ ; GFX9-LABEL: test_call_external_void_func_v2i64_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: v_writelane_b32 v40, s33, 6 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 4 ; GFX9-NEXT: s_mov_b32 s4, 1 ; GFX9-NEXT: s_mov_b32 s5, 2 ; GFX9-NEXT: s_mov_b32 s6, 3 ; GFX9-NEXT: s_mov_b32 s7, 4 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[8:9] -; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v2i64_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v2i64_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], 
s[8:9] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 5 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v2i64_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v2i64_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 5 +; GFX9-NEXT: v_readlane_b32 s64, v40, 4 +; GFX9-NEXT: v_readlane_b32 s7, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: v_readlane_b32 s33, v40, 6 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v2i64_imm_inreg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: v_writelane_b32 v40, s33, 6 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v2i64_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v2i64_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 1 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 2 +; GFX10-NEXT: v_writelane_b32 v40, 
s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 3 +; GFX10-NEXT: v_writelane_b32 v40, s7, 3 ; GFX10-NEXT: s_mov_b32 s7, 4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[8:9] -; GFX10-NEXT: s_add_u32 s8, s8, external_void_func_v2i64_inreg@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s9, s9, external_void_func_v2i64_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[8:9] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_writelane_b32 v40, s64, 4 +; GFX10-NEXT: v_writelane_b32 v40, s65, 5 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 5 +; GFX10-NEXT: v_readlane_b32 s64, v40, 4 +; GFX10-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: v_readlane_b32 s33, v40, 6 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i64_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: @@ -5493,29 +5788,37 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 
v40, s33, 6 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i64_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 5 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 6 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_v2i64_inreg(<2 x i64> inreg ) ret void } @@ -5524,63 +5827,87 @@ ; GFX9-LABEL: 
test_call_external_void_func_v3i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: v_writelane_b32 v40, s33, 8 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: s_mov_b64 s[30:31], 0 +; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[30:31], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s8, 4 +; GFX9-NEXT: v_writelane_b32 v40, s9, 5 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 6 ; GFX9-NEXT: s_mov_b32 s8, 1 ; GFX9-NEXT: s_mov_b32 s9, 2 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[10:11] -; GFX9-NEXT: s_add_u32 s10, s10, external_void_func_v3i64_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s11, s11, external_void_func_v3i64_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[10:11] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 7 +; GFX9-NEXT: s_getpc_b64 s[34:35] +; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3i64_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s35, s35, external_void_func_v3i64_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: v_readlane_b32 s65, v40, 7 +; GFX9-NEXT: v_readlane_b32 s64, v40, 6 +; GFX9-NEXT: v_readlane_b32 s9, v40, 5 +; GFX9-NEXT: v_readlane_b32 s8, v40, 4 +; GFX9-NEXT: v_readlane_b32 s7, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 
s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: v_readlane_b32 s33, v40, 8 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v3i64_inreg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_mov_b64 s[4:5], 0 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX10-NEXT: s_mov_b32 s8, 1 -; GFX10-NEXT: s_mov_b32 s9, 2 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: v_writelane_b32 v40, s33, 8 +; GFX10-NEXT: s_mov_b64 s[30:31], 0 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[10:11] -; GFX10-NEXT: s_add_u32 s10, s10, external_void_func_v3i64_inreg@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s11, s11, external_void_func_v3i64_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[10:11] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[30:31], 0x0 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v3i64_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v3i64_inreg@rel32@hi+12 +; 
GFX10-NEXT: v_writelane_b32 v40, s8, 4 +; GFX10-NEXT: s_mov_b32 s8, 1 +; GFX10-NEXT: v_writelane_b32 v40, s9, 5 +; GFX10-NEXT: s_mov_b32 s9, 2 +; GFX10-NEXT: v_writelane_b32 v40, s64, 6 +; GFX10-NEXT: v_writelane_b32 v40, s65, 7 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 7 +; GFX10-NEXT: v_readlane_b32 s64, v40, 6 +; GFX10-NEXT: v_readlane_b32 s9, v40, 5 +; GFX10-NEXT: v_readlane_b32 s8, v40, 4 +; GFX10-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: v_readlane_b32 s33, v40, 8 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i64_inreg: ; GFX10-SCRATCH: ; %bb.0: @@ -5590,29 +5917,41 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 8 ; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s9, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 
3 +; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i64_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i64_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4 +; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s9, 5 +; GFX10-SCRATCH-NEXT: s_mov_b32 s9, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 7 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 7 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 8 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] %load = load <2 x i64>, <2 x i64> addrspace(4)* null %val = shufflevector <2 x i64> %load, <2 x i64> , <3 x i32> @@ -5624,67 +5963,99 @@ ; GFX9-LABEL: test_call_external_void_func_v4i64_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: v_writelane_b32 v40, s33, 10 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: s_mov_b64 s[30:31], 0 +; GFX9-NEXT: v_writelane_b32 v40, s8, 4 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[30:31], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s9, 5 +; GFX9-NEXT: v_writelane_b32 v40, s10, 6 +; GFX9-NEXT: v_writelane_b32 v40, s11, 7 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 8 ; GFX9-NEXT: s_mov_b32 s8, 1 ; GFX9-NEXT: s_mov_b32 s9, 2 ; GFX9-NEXT: s_mov_b32 s10, 3 ; GFX9-NEXT: s_mov_b32 s11, 4 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[12:13] -; GFX9-NEXT: s_add_u32 s12, s12, external_void_func_v4i64_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s13, s13, external_void_func_v4i64_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[12:13] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 9 +; GFX9-NEXT: s_getpc_b64 s[34:35] +; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v4i64_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s35, s35, external_void_func_v4i64_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: v_readlane_b32 s65, v40, 9 +; GFX9-NEXT: v_readlane_b32 s64, v40, 8 +; GFX9-NEXT: v_readlane_b32 s11, v40, 7 +; GFX9-NEXT: v_readlane_b32 s10, v40, 6 +; GFX9-NEXT: v_readlane_b32 s9, v40, 5 +; GFX9-NEXT: v_readlane_b32 s8, v40, 4 +; GFX9-NEXT: v_readlane_b32 s7, v40, 3 +; GFX9-NEXT: v_readlane_b32 
s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: v_readlane_b32 s33, v40, 10 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v4i64_inreg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_mov_b64 s[4:5], 0 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: v_writelane_b32 v40, s33, 10 +; GFX10-NEXT: s_mov_b64 s[30:31], 0 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[30:31], 0x0 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v4i64_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v4i64_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s8, 4 ; GFX10-NEXT: s_mov_b32 s8, 1 +; GFX10-NEXT: v_writelane_b32 v40, s9, 5 ; GFX10-NEXT: s_mov_b32 s9, 2 +; GFX10-NEXT: v_writelane_b32 v40, s10, 6 ; GFX10-NEXT: s_mov_b32 s10, 3 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s11, 7 ; 
GFX10-NEXT: s_mov_b32 s11, 4 -; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[12:13] -; GFX10-NEXT: s_add_u32 s12, s12, external_void_func_v4i64_inreg@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s13, s13, external_void_func_v4i64_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[12:13] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_writelane_b32 v40, s64, 8 +; GFX10-NEXT: v_writelane_b32 v40, s65, 9 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 9 +; GFX10-NEXT: v_readlane_b32 s64, v40, 8 +; GFX10-NEXT: v_readlane_b32 s11, v40, 7 +; GFX10-NEXT: v_readlane_b32 s10, v40, 6 +; GFX10-NEXT: v_readlane_b32 s9, v40, 5 +; GFX10-NEXT: v_readlane_b32 s8, v40, 4 +; GFX10-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: v_readlane_b32 s33, v40, 10 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i64_inreg: ; GFX10-SCRATCH: ; %bb.0: @@ -5694,31 +6065,47 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 10 ; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 1 -; 
GFX10-SCRATCH-NEXT: s_mov_b32 s9, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s10, 3 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s11, 4 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i64_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i64_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4 +; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s9, 5 +; GFX10-SCRATCH-NEXT: s_mov_b32 s9, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s10, 6 +; GFX10-SCRATCH-NEXT: s_mov_b32 s10, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s11, 7 +; GFX10-SCRATCH-NEXT: s_mov_b32 s11, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 8 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 9 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 9 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 8 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s11, v40, 7 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s10, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: 
s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 10 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] %load = load <2 x i64>, <2 x i64> addrspace(4)* null %val = shufflevector <2 x i64> %load, <2 x i64> , <4 x i32> call amdgpu_gfx void @external_void_func_v4i64_inreg(<4 x i64> inreg %val) @@ -5729,57 +6116,61 @@ ; GFX9-LABEL: test_call_external_void_func_f16_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: v_writelane_b32 v40, s33, 3 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x4400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[6:7] -; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_f16_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_f16_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: v_writelane_b32 v40, s65, 2 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_f16_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_f16_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 2 +; GFX9-NEXT: v_readlane_b32 s64, v40, 1 ; GFX9-NEXT: 
v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: v_readlane_b32 s33, v40, 3 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_f16_imm_inreg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-NEXT: s_movk_i32 s4, 0x4400 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: v_writelane_b32 v40, s33, 3 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[6:7] -; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_f16_inreg@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_f16_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_f16_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_f16_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_movk_i32 s4, 0x4400 +; GFX10-NEXT: v_writelane_b32 v40, s64, 1 +; GFX10-NEXT: v_writelane_b32 v40, s65, 2 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 2 +; GFX10-NEXT: v_readlane_b32 s64, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; 
GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_f16_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: @@ -5789,26 +6180,28 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x4400 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 3 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f16_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x4400 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_f16_inreg(half inreg 4.0) ret void } @@ -5817,57 +6210,61 @@ ; GFX9-LABEL: test_call_external_void_func_f32_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: v_writelane_b32 v40, s33, 3 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 1 ; GFX9-NEXT: s_mov_b32 s4, 4.0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[6:7] -; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_f32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_f32_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: v_writelane_b32 v40, s65, 2 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_f32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_f32_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 2 +; GFX9-NEXT: v_readlane_b32 s64, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; 
GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: v_readlane_b32 s33, v40, 3 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_f32_imm_inreg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-NEXT: s_mov_b32 s4, 4.0 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: v_writelane_b32 v40, s33, 3 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[6:7] -; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_f32_inreg@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_f32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_f32_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_f32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 4.0 +; GFX10-NEXT: v_writelane_b32 v40, s64, 1 +; GFX10-NEXT: v_writelane_b32 v40, s65, 2 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 2 +; GFX10-NEXT: v_readlane_b32 s64, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, 
-1 +; GFX10-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_f32_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: @@ -5877,26 +6274,28 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 4.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 3 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 4.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte 
Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_f32_inreg(float inreg 4.0) ret void } @@ -5905,59 +6304,67 @@ ; GFX9-LABEL: test_call_external_void_func_v2f32_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: v_writelane_b32 v40, s33, 4 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 2 ; GFX9-NEXT: s_mov_b32 s4, 1.0 ; GFX9-NEXT: s_mov_b32 s5, 2.0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[6:7] -; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v2f32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v2f32_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 3 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v2f32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v2f32_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 3 +; GFX9-NEXT: v_readlane_b32 s64, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; 
GFX9-NEXT: v_readlane_b32 s33, v40, 4 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v2f32_imm_inreg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-NEXT: s_mov_b32 s4, 1.0 -; GFX10-NEXT: s_mov_b32 s5, 2.0 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: v_writelane_b32 v40, s33, 4 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_getpc_b64 s[6:7] -; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v2f32_inreg@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v2f32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v2f32_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v2f32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1.0 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2.0 +; GFX10-NEXT: v_writelane_b32 v40, s64, 2 +; GFX10-NEXT: v_writelane_b32 v40, s65, 3 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 3 +; GFX10-NEXT: v_readlane_b32 s64, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 
; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2f32_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: @@ -5967,27 +6374,31 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; 
GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_v2f32_inreg(<2 x float> inreg ) ret void } @@ -5996,61 +6407,73 @@ ; GFX9-LABEL: test_call_external_void_func_v3f32_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: v_writelane_b32 v40, s33, 5 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 3 ; GFX9-NEXT: s_mov_b32 s4, 1.0 ; GFX9-NEXT: s_mov_b32 s5, 2.0 ; GFX9-NEXT: s_mov_b32 s6, 4.0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[8:9] -; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v3f32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v3f32_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 4 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v3f32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 
s31, s31, external_void_func_v3f32_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 4 +; GFX9-NEXT: v_readlane_b32 s64, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: v_readlane_b32 s33, v40, 5 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v3f32_imm_inreg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: v_writelane_b32 v40, s33, 5 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v3f32_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v3f32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 1.0 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 2.0 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 4.0 -; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[8:9] -; GFX10-NEXT: s_add_u32 s8, s8, external_void_func_v3f32_inreg@rel32@lo+4 
-; GFX10-NEXT: s_addc_u32 s9, s9, external_void_func_v3f32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[8:9] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_writelane_b32 v40, s64, 3 +; GFX10-NEXT: v_writelane_b32 v40, s65, 4 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 4 +; GFX10-NEXT: v_readlane_b32 s64, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: v_readlane_b32 s33, v40, 5 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3f32_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: @@ -6060,28 +6483,34 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 4.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 5 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 4.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 4 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 5 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_v3f32_inreg(<3 x float> inreg ) ret void } @@ -6090,65 +6519,85 @@ ; GFX9-LABEL: test_call_external_void_func_v5f32_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: v_writelane_b32 v40, s33, 7 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; 
GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: v_writelane_b32 v40, s8, 4 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 5 ; GFX9-NEXT: s_mov_b32 s4, 1.0 ; GFX9-NEXT: s_mov_b32 s5, 2.0 ; GFX9-NEXT: s_mov_b32 s6, 4.0 ; GFX9-NEXT: s_mov_b32 s7, -1.0 ; GFX9-NEXT: s_mov_b32 s8, 0.5 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[10:11] -; GFX9-NEXT: s_add_u32 s10, s10, external_void_func_v5f32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s11, s11, external_void_func_v5f32_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[10:11] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 6 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v5f32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v5f32_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 6 +; GFX9-NEXT: v_readlane_b32 s64, v40, 5 +; GFX9-NEXT: v_readlane_b32 s8, v40, 4 +; GFX9-NEXT: v_readlane_b32 s7, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: v_readlane_b32 s33, v40, 7 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v5f32_imm_inreg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: 
buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: v_writelane_b32 v40, s33, 7 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v5f32_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v5f32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 1.0 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 2.0 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 4.0 +; GFX10-NEXT: v_writelane_b32 v40, s7, 3 ; GFX10-NEXT: s_mov_b32 s7, -1.0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s8, 4 ; GFX10-NEXT: s_mov_b32 s8, 0.5 -; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[10:11] -; GFX10-NEXT: s_add_u32 s10, s10, external_void_func_v5f32_inreg@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s11, s11, external_void_func_v5f32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[10:11] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_writelane_b32 v40, s64, 5 +; GFX10-NEXT: v_writelane_b32 v40, s65, 6 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 6 +; GFX10-NEXT: v_readlane_b32 s64, v40, 5 +; GFX10-NEXT: v_readlane_b32 s8, v40, 4 +; GFX10-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: v_readlane_b32 s33, v40, 7 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, 
s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v5f32_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: @@ -6158,30 +6607,40 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 4.0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s7, -1.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 0.5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 7 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v5f32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v5f32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 4.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-SCRATCH-NEXT: s_mov_b32 s7, -1.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4 +; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 0.5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 6 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 
6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 7 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_v5f32_inreg(<5 x float> inreg ) ret void } @@ -6190,59 +6649,67 @@ ; GFX9-LABEL: test_call_external_void_func_f64_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: v_writelane_b32 v40, s33, 4 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 2 ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: s_mov_b32 s5, 0x40100000 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[6:7] -; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_f64_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_f64_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], 
s[6:7] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 3 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_f64_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_f64_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 3 +; GFX9-NEXT: v_readlane_b32 s64, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: v_readlane_b32 s33, v40, 4 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_f64_imm_inreg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: s_mov_b32 s5, 0x40100000 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: v_writelane_b32 v40, s33, 4 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_getpc_b64 s[6:7] -; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_f64_inreg@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_f64_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_getpc_b64 s[30:31] 
+; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_f64_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_f64_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 0x40100000 +; GFX10-NEXT: v_writelane_b32 v40, s64, 2 +; GFX10-NEXT: v_writelane_b32 v40, s65, 3 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 3 +; GFX10-NEXT: v_readlane_b32 s64, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_f64_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: @@ -6252,27 +6719,31 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40100000 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f64_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f64_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: 
v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40100000 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_f64_inreg(double inreg 4.0) ret void } @@ -6281,63 +6752,79 @@ ; GFX9-LABEL: test_call_external_void_func_v2f64_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: v_writelane_b32 v40, s33, 6 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: 
v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 4 ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: s_mov_b32 s5, 2.0 ; GFX9-NEXT: s_mov_b32 s6, 0 ; GFX9-NEXT: s_mov_b32 s7, 0x40100000 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[8:9] -; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v2f64_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v2f64_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 5 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v2f64_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v2f64_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 5 +; GFX9-NEXT: v_readlane_b32 s64, v40, 4 +; GFX9-NEXT: v_readlane_b32 s7, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: v_readlane_b32 s33, v40, 6 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v2f64_imm_inreg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: 
v_writelane_b32 v40, s33, 6 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v2f64_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v2f64_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 2.0 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: v_writelane_b32 v40, s7, 3 ; GFX10-NEXT: s_mov_b32 s7, 0x40100000 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[8:9] -; GFX10-NEXT: s_add_u32 s8, s8, external_void_func_v2f64_inreg@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s9, s9, external_void_func_v2f64_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[8:9] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_writelane_b32 v40, s64, 4 +; GFX10-NEXT: v_writelane_b32 v40, s65, 5 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 5 +; GFX10-NEXT: v_readlane_b32 s64, v40, 4 +; GFX10-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: v_readlane_b32 s33, v40, 6 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2f64_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: @@ 
-6347,29 +6834,37 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 0x40100000 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 6 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f64_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f64_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 0x40100000 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 5 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 6 +; GFX10-SCRATCH-NEXT: 
s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_v2f64_inreg(<2 x double> inreg ) ret void } @@ -6378,67 +6873,91 @@ ; GFX9-LABEL: test_call_external_void_func_v3f64_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: v_writelane_b32 v40, s33, 8 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: v_writelane_b32 v40, s8, 4 +; GFX9-NEXT: v_writelane_b32 v40, s9, 5 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 6 ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: s_mov_b32 s5, 2.0 ; GFX9-NEXT: s_mov_b32 s6, 0 ; GFX9-NEXT: s_mov_b32 s7, 0x40100000 ; GFX9-NEXT: s_mov_b32 s8, 0 ; GFX9-NEXT: s_mov_b32 s9, 0x40200000 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[10:11] -; GFX9-NEXT: s_add_u32 s10, s10, external_void_func_v3f64_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s11, s11, external_void_func_v3f64_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[10:11] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 7 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v3f64_inreg@rel32@lo+4 +; 
GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v3f64_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 7 +; GFX9-NEXT: v_readlane_b32 s64, v40, 6 +; GFX9-NEXT: v_readlane_b32 s9, v40, 5 +; GFX9-NEXT: v_readlane_b32 s8, v40, 4 +; GFX9-NEXT: v_readlane_b32 s7, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: v_readlane_b32 s33, v40, 8 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v3f64_imm_inreg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: v_writelane_b32 v40, s33, 8 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v3f64_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v3f64_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 2.0 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: v_writelane_b32 v40, s7, 3 ; GFX10-NEXT: s_mov_b32 s7, 0x40100000 -; 
GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s8, 4 ; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: v_writelane_b32 v40, s9, 5 ; GFX10-NEXT: s_mov_b32 s9, 0x40200000 -; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_getpc_b64 s[10:11] -; GFX10-NEXT: s_add_u32 s10, s10, external_void_func_v3f64_inreg@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s11, s11, external_void_func_v3f64_inreg@rel32@hi+12 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[10:11] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_writelane_b32 v40, s64, 6 +; GFX10-NEXT: v_writelane_b32 v40, s65, 7 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 7 +; GFX10-NEXT: v_readlane_b32 s64, v40, 6 +; GFX10-NEXT: v_readlane_b32 s9, v40, 5 +; GFX10-NEXT: v_readlane_b32 s8, v40, 4 +; GFX10-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: v_readlane_b32 s33, v40, 8 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3f64_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: @@ -6448,31 +6967,43 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 8 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: 
s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f64_inreg@rel32@lo+4 +; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f64_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 ; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 0x40100000 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4 ; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s9, 5 ; GFX10-SCRATCH-NEXT: s_mov_b32 s9, 0x40200000 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] -; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f64_inreg@rel32@lo+4 -; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f64_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 7 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 7 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: 
v_readlane_b32 s33, v40, 8 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_v3f64_inreg(<3 x double> inreg ) ret void } @@ -6481,57 +7012,61 @@ ; GFX9-LABEL: test_call_external_void_func_v2i16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: v_writelane_b32 v40, s33, 3 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: s_load_dword s4, s[30:31], 0x0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[6:7] -; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v2i16_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v2i16_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: v_writelane_b32 v40, s64, 1 +; GFX9-NEXT: v_writelane_b32 v40, s65, 2 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v2i16_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v2i16_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 2 +; GFX9-NEXT: v_readlane_b32 s64, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 
-; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: v_readlane_b32 s33, v40, 3 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v2i16_inreg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: v_writelane_b32 v40, s33, 3 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[6:7] -; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v2i16_inreg@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v2i16_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_load_dword s4, s[30:31], 0x0 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v2i16_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v2i16_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s64, 1 +; GFX10-NEXT: v_writelane_b32 v40, s65, 2 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 2 +; GFX10-NEXT: v_readlane_b32 s64, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; 
GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i16_inreg: ; GFX10-SCRATCH: ; %bb.0: @@ -6541,26 +7076,28 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 3 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i16_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 3 +; 
GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] %val = load <2 x i16>, <2 x i16> addrspace(4)* undef call amdgpu_gfx void @external_void_func_v2i16_inreg(<2 x i16> inreg %val) ret void @@ -6570,57 +7107,65 @@ ; GFX9-LABEL: test_call_external_void_func_v3i16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: v_writelane_b32 v40, s33, 4 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[30:31], 0x0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[6:7] -; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v3i16_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v3i16_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 2 +; GFX9-NEXT: v_writelane_b32 v40, s65, 3 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v3i16_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v3i16_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 3 +; GFX9-NEXT: v_readlane_b32 s64, v40, 2 ; 
GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: v_readlane_b32 s33, v40, 4 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v3i16_inreg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: v_writelane_b32 v40, s33, 4 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[6:7] -; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v3i16_inreg@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v3i16_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[30:31], 0x0 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v3i16_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v3i16_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s64, 2 +; GFX10-NEXT: v_writelane_b32 v40, s65, 3 +; GFX10-NEXT: s_swappc_b64 s[30:31], 
s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 3 +; GFX10-NEXT: v_readlane_b32 s64, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i16_inreg: ; GFX10-SCRATCH: ; %bb.0: @@ -6630,26 +7175,30 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 3 +; GFX10-SCRATCH-NEXT: 
v_readlane_b32 s64, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] %val = load <3 x i16>, <3 x i16> addrspace(4)* undef call amdgpu_gfx void @external_void_func_v3i16_inreg(<3 x i16> inreg %val) ret void @@ -6659,57 +7208,65 @@ ; GFX9-LABEL: test_call_external_void_func_v3f16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: v_writelane_b32 v40, s33, 4 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[30:31], 0x0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[6:7] -; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v3f16_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v3f16_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 2 +; GFX9-NEXT: v_writelane_b32 v40, 
s65, 3 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v3f16_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v3f16_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 3 +; GFX9-NEXT: v_readlane_b32 s64, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: v_readlane_b32 s33, v40, 4 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v3f16_inreg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: v_writelane_b32 v40, s33, 4 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[6:7] -; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v3f16_inreg@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v3f16_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[30:31], 
0x0 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v3f16_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v3f16_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s64, 2 +; GFX10-NEXT: v_writelane_b32 v40, s65, 3 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 3 +; GFX10-NEXT: v_readlane_b32 s64, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3f16_inreg: ; GFX10-SCRATCH: ; %bb.0: @@ -6719,26 +7276,30 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; 
GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] %val = load <3 x half>, <3 x half> addrspace(4)* undef call amdgpu_gfx void @external_void_func_v3f16_inreg(<3 x half> inreg %val) ret void @@ -6748,59 +7309,67 @@ ; GFX9-LABEL: test_call_external_void_func_v3i16_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: v_writelane_b32 v40, s33, 4 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 2 ; GFX9-NEXT: s_mov_b32 s4, 0x20001 ; GFX9-NEXT: s_mov_b32 s5, 3 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; 
GFX9-NEXT: s_getpc_b64 s[6:7] -; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v3i16_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v3i16_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 3 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v3i16_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v3i16_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 3 +; GFX9-NEXT: v_readlane_b32 s64, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: v_readlane_b32 s33, v40, 4 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v3i16_imm_inreg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-NEXT: s_mov_b32 s4, 0x20001 -; GFX10-NEXT: s_mov_b32 s5, 3 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: v_writelane_b32 v40, s33, 4 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_getpc_b64 s[6:7] -; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v3i16_inreg@rel32@lo+4 -; GFX10-NEXT: 
s_addc_u32 s7, s7, external_void_func_v3i16_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v3i16_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v3i16_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 0x20001 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 3 +; GFX10-NEXT: v_writelane_b32 v40, s64, 2 +; GFX10-NEXT: v_writelane_b32 v40, s65, 3 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 3 +; GFX10-NEXT: v_readlane_b32 s64, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i16_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: @@ -6810,27 +7379,31 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; 
GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_v3i16_inreg(<3 x i16> inreg ) ret void } @@ -6839,59 +7412,67 @@ ; GFX9-LABEL: test_call_external_void_func_v3f16_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: v_writelane_b32 v40, s33, 4 +; 
GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 2 ; GFX9-NEXT: s_mov_b32 s4, 0x40003c00 ; GFX9-NEXT: s_movk_i32 s5, 0x4400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[6:7] -; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v3f16_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v3f16_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 3 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v3f16_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v3f16_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 3 +; GFX9-NEXT: v_readlane_b32 s64, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: v_readlane_b32 s33, v40, 4 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v3f16_imm_inreg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-NEXT: s_mov_b32 s4, 0x40003c00 
-; GFX10-NEXT: s_movk_i32 s5, 0x4400 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: v_writelane_b32 v40, s33, 4 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_getpc_b64 s[6:7] -; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v3f16_inreg@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v3f16_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v3f16_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v3f16_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 0x40003c00 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_movk_i32 s5, 0x4400 +; GFX10-NEXT: v_writelane_b32 v40, s64, 2 +; GFX10-NEXT: v_writelane_b32 v40, s65, 3 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 3 +; GFX10-NEXT: v_readlane_b32 s64, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3f16_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: @@ -6901,27 +7482,31 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; 
GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x40003c00 -; GFX10-SCRATCH-NEXT: s_movk_i32 s5, 0x4400 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x40003c00 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_movk_i32 s5, 0x4400 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_v3f16_inreg(<3 x half> inreg ) ret void } @@ -6930,57 +7515,65 @@ ; GFX9-LABEL: test_call_external_void_func_v4i16_inreg: ; GFX9: ; %bb.0: ; 
GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: v_writelane_b32 v40, s33, 4 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[30:31], 0x0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[6:7] -; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v4i16_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v4i16_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 2 +; GFX9-NEXT: v_writelane_b32 v40, s65, 3 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v4i16_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v4i16_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 3 +; GFX9-NEXT: v_readlane_b32 s64, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: v_readlane_b32 s33, v40, 4 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v4i16_inreg: ; GFX10: ; %bb.0: ; 
GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: v_writelane_b32 v40, s33, 4 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[6:7] -; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v4i16_inreg@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v4i16_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[30:31], 0x0 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v4i16_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v4i16_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s64, 2 +; GFX10-NEXT: v_writelane_b32 v40, s65, 3 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 3 +; GFX10-NEXT: v_readlane_b32 s64, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: 
s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i16_inreg: ; GFX10-SCRATCH: ; %bb.0: @@ -6990,26 +7583,30 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) 
-; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] %val = load <4 x i16>, <4 x i16> addrspace(4)* undef call amdgpu_gfx void @external_void_func_v4i16_inreg(<4 x i16> inreg %val) ret void @@ -7019,59 +7616,67 @@ ; GFX9-LABEL: test_call_external_void_func_v4i16_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: v_writelane_b32 v40, s33, 4 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 2 ; GFX9-NEXT: s_mov_b32 s4, 0x20001 ; GFX9-NEXT: s_mov_b32 s5, 0x40003 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[6:7] -; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v4i16_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v4i16_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 3 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v4i16_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v4i16_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 3 +; GFX9-NEXT: v_readlane_b32 s64, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: v_readlane_b32 s33, v40, 4 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, 
s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v4i16_imm_inreg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-NEXT: s_mov_b32 s4, 0x20001 -; GFX10-NEXT: s_mov_b32 s5, 0x40003 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: v_writelane_b32 v40, s33, 4 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_getpc_b64 s[6:7] -; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v4i16_inreg@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v4i16_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v4i16_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v4i16_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 0x20001 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 0x40003 +; GFX10-NEXT: v_writelane_b32 v40, s64, 2 +; GFX10-NEXT: v_writelane_b32 v40, s65, 3 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 3 +; GFX10-NEXT: v_readlane_b32 s64, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: 
s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i16_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: @@ -7081,27 +7686,31 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40003 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40003 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: 
v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_v4i16_inreg(<4 x i16> inreg ) ret void } @@ -7110,57 +7719,61 @@ ; GFX9-LABEL: test_call_external_void_func_v2f16_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: v_writelane_b32 v40, s33, 3 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: s_load_dword s4, s[30:31], 0x0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[6:7] -; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v2f16_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v2f16_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: v_writelane_b32 v40, s64, 1 +; GFX9-NEXT: v_writelane_b32 v40, s65, 2 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v2f16_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v2f16_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 2 +; GFX9-NEXT: v_readlane_b32 s64, v40, 1 ; GFX9-NEXT: 
v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: v_readlane_b32 s33, v40, 3 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v2f16_inreg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: v_writelane_b32 v40, s33, 3 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[6:7] -; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v2f16_inreg@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v2f16_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_load_dword s4, s[30:31], 0x0 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v2f16_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v2f16_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s64, 1 +; GFX10-NEXT: v_writelane_b32 v40, s65, 2 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 2 +; GFX10-NEXT: v_readlane_b32 s64, v40, 1 ; GFX10-NEXT: 
v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2f16_inreg: ; GFX10-SCRATCH: ; %bb.0: @@ -7170,26 +7783,28 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 3 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f16_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; 
GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] %val = load <2 x half>, <2 x half> addrspace(4)* undef call amdgpu_gfx void @external_void_func_v2f16_inreg(<2 x half> inreg %val) ret void @@ -7199,57 +7814,65 @@ ; GFX9-LABEL: test_call_external_void_func_v2i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: v_writelane_b32 v40, s33, 4 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[30:31], 0x0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[6:7] -; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v2i32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v2i32_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 2 +; GFX9-NEXT: v_writelane_b32 v40, s65, 3 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v2i32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v2i32_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 
s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 3 +; GFX9-NEXT: v_readlane_b32 s64, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: v_readlane_b32 s33, v40, 4 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v2i32_inreg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: v_writelane_b32 v40, s33, 4 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[6:7] -; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v2i32_inreg@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v2i32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[30:31], 0x0 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v2i32_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v2i32_inreg@rel32@hi+12 +; GFX10-NEXT: 
v_writelane_b32 v40, s64, 2 +; GFX10-NEXT: v_writelane_b32 v40, s65, 3 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 3 +; GFX10-NEXT: v_readlane_b32 s64, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i32_inreg: ; GFX10-SCRATCH: ; %bb.0: @@ -7259,26 +7882,30 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; 
GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] %val = load <2 x i32>, <2 x i32> addrspace(4)* undef call amdgpu_gfx void @external_void_func_v2i32_inreg(<2 x i32> inreg %val) ret void @@ -7288,59 +7915,67 @@ ; GFX9-LABEL: test_call_external_void_func_v2i32_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: v_writelane_b32 v40, s33, 4 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 2 ; GFX9-NEXT: s_mov_b32 s4, 1 ; GFX9-NEXT: s_mov_b32 s5, 2 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[6:7] -; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v2i32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v2i32_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], 
s[6:7] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 3 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v2i32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v2i32_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 3 +; GFX9-NEXT: v_readlane_b32 s64, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: v_readlane_b32 s33, v40, 4 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v2i32_imm_inreg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-NEXT: s_mov_b32 s4, 1 -; GFX10-NEXT: s_mov_b32 s5, 2 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: v_writelane_b32 v40, s33, 4 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_getpc_b64 s[6:7] -; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v2i32_inreg@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v2i32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_getpc_b64 s[30:31] 
+; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v2i32_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v2i32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2 +; GFX10-NEXT: v_writelane_b32 v40, s64, 2 +; GFX10-NEXT: v_writelane_b32 v40, s65, 3 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 3 +; GFX10-NEXT: v_readlane_b32 s64, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i32_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: @@ -7350,27 +7985,31 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 4 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 
v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 4 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_v2i32_inreg(<2 x i32> inreg ) ret void } @@ -7379,61 +8018,73 @@ ; GFX9-LABEL: test_call_external_void_func_v3i32_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: v_writelane_b32 v40, s33, 5 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, 
s64, 3 ; GFX9-NEXT: s_mov_b32 s4, 3 ; GFX9-NEXT: s_mov_b32 s5, 4 ; GFX9-NEXT: s_mov_b32 s6, 5 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[8:9] -; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v3i32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v3i32_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 4 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v3i32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v3i32_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 4 +; GFX9-NEXT: v_readlane_b32 s64, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: v_readlane_b32 s33, v40, 5 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v3i32_imm_inreg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: v_writelane_b32 v40, s33, 5 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: 
s_add_u32 s30, s30, external_void_func_v3i32_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v3i32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 3 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 4 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 5 -; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[8:9] -; GFX10-NEXT: s_add_u32 s8, s8, external_void_func_v3i32_inreg@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s9, s9, external_void_func_v3i32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[8:9] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_writelane_b32 v40, s64, 3 +; GFX10-NEXT: v_writelane_b32 v40, s65, 4 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 4 +; GFX10-NEXT: v_readlane_b32 s64, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: v_readlane_b32 s33, v40, 5 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i32_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: @@ -7443,28 +8094,34 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3 
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4 -; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 5 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 4 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 5 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_v3i32_inreg(<3 x i32> inreg ) ret void } @@ -7473,63 +8130,79 @@ ; GFX9-LABEL: 
test_call_external_void_func_v3i32_i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: v_writelane_b32 v40, s33, 6 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 4 ; GFX9-NEXT: s_mov_b32 s4, 3 ; GFX9-NEXT: s_mov_b32 s5, 4 ; GFX9-NEXT: s_mov_b32 s6, 5 ; GFX9-NEXT: s_mov_b32 s7, 6 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[8:9] -; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v3i32_i32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v3i32_i32_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 5 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v3i32_i32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v3i32_i32_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 5 +; GFX9-NEXT: v_readlane_b32 s64, v40, 4 +; GFX9-NEXT: v_readlane_b32 s7, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: v_readlane_b32 s33, v40, 6 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; 
GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v3i32_i32_inreg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: v_writelane_b32 v40, s33, 6 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v3i32_i32_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v3i32_i32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 3 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 4 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 5 +; GFX10-NEXT: v_writelane_b32 v40, s7, 3 ; GFX10-NEXT: s_mov_b32 s7, 6 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[8:9] -; GFX10-NEXT: s_add_u32 s8, s8, external_void_func_v3i32_i32_inreg@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s9, s9, external_void_func_v3i32_i32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[8:9] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_writelane_b32 v40, s64, 4 +; GFX10-NEXT: v_writelane_b32 v40, s65, 5 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 5 +; GFX10-NEXT: v_readlane_b32 s64, v40, 4 +; GFX10-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 
; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: v_readlane_b32 s33, v40, 6 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i32_i32_inreg: ; GFX10-SCRATCH: ; %bb.0: @@ -7539,29 +8212,37 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4 -; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 5 -; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 6 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 6 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_i32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 5 ; 
GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 6 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_v3i32_i32_inreg(<3 x i32> inreg , i32 inreg 6) ret void } @@ -7570,57 +8251,73 @@ ; GFX9-LABEL: test_call_external_void_func_v4i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: v_writelane_b32 v40, s33, 6 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[30:31], 0x0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: 
v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[8:9] -; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v4i32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v4i32_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 4 +; GFX9-NEXT: v_writelane_b32 v40, s65, 5 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v4i32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v4i32_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 5 +; GFX9-NEXT: v_readlane_b32 s64, v40, 4 +; GFX9-NEXT: v_readlane_b32 s7, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: v_readlane_b32 s33, v40, 6 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v4i32_inreg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: v_writelane_b32 v40, s33, 6 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[8:9] -; 
GFX10-NEXT: s_add_u32 s8, s8, external_void_func_v4i32_inreg@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s9, s9, external_void_func_v4i32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[8:9] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[30:31], 0x0 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v4i32_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v4i32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s64, 4 +; GFX10-NEXT: v_writelane_b32 v40, s65, 5 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 5 +; GFX10-NEXT: v_readlane_b32 s64, v40, 4 +; GFX10-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: v_readlane_b32 s33, v40, 6 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i32_inreg: ; GFX10-SCRATCH: ; %bb.0: @@ -7630,26 +8327,34 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 
2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 6 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 5 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 6 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] %val = load <4 x i32>, <4 x i32> addrspace(4)* undef call amdgpu_gfx void @external_void_func_v4i32_inreg(<4 x i32> inreg %val) ret void @@ -7659,63 +8364,79 @@ ; 
GFX9-LABEL: test_call_external_void_func_v4i32_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: v_writelane_b32 v40, s33, 6 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 4 ; GFX9-NEXT: s_mov_b32 s4, 1 ; GFX9-NEXT: s_mov_b32 s5, 2 ; GFX9-NEXT: s_mov_b32 s6, 3 ; GFX9-NEXT: s_mov_b32 s7, 4 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[8:9] -; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v4i32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v4i32_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 5 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v4i32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v4i32_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 5 +; GFX9-NEXT: v_readlane_b32 s64, v40, 4 +; GFX9-NEXT: v_readlane_b32 s7, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: v_readlane_b32 s33, v40, 6 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; 
GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v4i32_imm_inreg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: v_writelane_b32 v40, s33, 6 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v4i32_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v4i32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 1 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 2 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 3 +; GFX10-NEXT: v_writelane_b32 v40, s7, 3 ; GFX10-NEXT: s_mov_b32 s7, 4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[8:9] -; GFX10-NEXT: s_add_u32 s8, s8, external_void_func_v4i32_inreg@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s9, s9, external_void_func_v4i32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[8:9] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_writelane_b32 v40, s64, 4 +; GFX10-NEXT: v_writelane_b32 v40, s65, 5 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 5 +; GFX10-NEXT: v_readlane_b32 s64, v40, 4 +; GFX10-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: 
v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: v_readlane_b32 s33, v40, 6 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i32_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: @@ -7725,29 +8446,37 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 6 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 5 ; GFX10-SCRATCH-NEXT: s_swappc_b64 
s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 6 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_v4i32_inreg(<4 x i32> inreg ) ret void } @@ -7756,65 +8485,85 @@ ; GFX9-LABEL: test_call_external_void_func_v5i32_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: v_writelane_b32 v40, s33, 7 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: v_writelane_b32 v40, s8, 4 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 5 ; GFX9-NEXT: s_mov_b32 s4, 1 ; GFX9-NEXT: s_mov_b32 s5, 2 ; GFX9-NEXT: s_mov_b32 s6, 3 ; 
GFX9-NEXT: s_mov_b32 s7, 4 ; GFX9-NEXT: s_mov_b32 s8, 5 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[10:11] -; GFX9-NEXT: s_add_u32 s10, s10, external_void_func_v5i32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s11, s11, external_void_func_v5i32_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[10:11] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 6 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v5i32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v5i32_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 6 +; GFX9-NEXT: v_readlane_b32 s64, v40, 5 +; GFX9-NEXT: v_readlane_b32 s8, v40, 4 +; GFX9-NEXT: v_readlane_b32 s7, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: v_readlane_b32 s33, v40, 7 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v5i32_imm_inreg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: v_writelane_b32 v40, s33, 7 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; 
GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v5i32_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v5i32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 1 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 2 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 3 +; GFX10-NEXT: v_writelane_b32 v40, s7, 3 ; GFX10-NEXT: s_mov_b32 s7, 4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s8, 4 ; GFX10-NEXT: s_mov_b32 s8, 5 -; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[10:11] -; GFX10-NEXT: s_add_u32 s10, s10, external_void_func_v5i32_inreg@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s11, s11, external_void_func_v5i32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[10:11] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_writelane_b32 v40, s64, 5 +; GFX10-NEXT: v_writelane_b32 v40, s65, 6 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 6 +; GFX10-NEXT: v_readlane_b32 s64, v40, 5 +; GFX10-NEXT: v_readlane_b32 s8, v40, 4 +; GFX10-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: v_readlane_b32 s33, v40, 7 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v5i32_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: 
@@ -7824,30 +8573,40 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 7 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v5i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v5i32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4 +; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 6 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: 
v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 7 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_v5i32_inreg(<5 x i32> inreg ) ret void } @@ -7856,61 +8615,93 @@ ; GFX9-LABEL: test_call_external_void_func_v8i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: v_writelane_b32 v40, s33, 10 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: v_writelane_b32 v40, s8, 4 +; GFX9-NEXT: v_writelane_b32 v40, s9, 5 +; GFX9-NEXT: v_writelane_b32 v40, s10, 6 +; GFX9-NEXT: v_writelane_b32 v40, s11, 7 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[30:31], 0x0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[12:13] -; GFX9-NEXT: s_add_u32 s12, s12, external_void_func_v8i32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s13, s13, 
external_void_func_v8i32_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[12:13] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 8 +; GFX9-NEXT: v_writelane_b32 v40, s65, 9 +; GFX9-NEXT: s_getpc_b64 s[34:35] +; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v8i32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s35, s35, external_void_func_v8i32_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: v_readlane_b32 s65, v40, 9 +; GFX9-NEXT: v_readlane_b32 s64, v40, 8 +; GFX9-NEXT: v_readlane_b32 s11, v40, 7 +; GFX9-NEXT: v_readlane_b32 s10, v40, 6 +; GFX9-NEXT: v_readlane_b32 s9, v40, 5 +; GFX9-NEXT: v_readlane_b32 s8, v40, 4 +; GFX9-NEXT: v_readlane_b32 s7, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: v_readlane_b32 s33, v40, 10 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v8i32_inreg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: v_writelane_b32 v40, s33, 10 +; GFX10-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: 
s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[12:13] -; GFX10-NEXT: s_add_u32 s12, s12, external_void_func_v8i32_inreg@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s13, s13, external_void_func_v8i32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-NEXT: v_writelane_b32 v40, s8, 4 +; GFX10-NEXT: v_writelane_b32 v40, s9, 5 +; GFX10-NEXT: v_writelane_b32 v40, s10, 6 +; GFX10-NEXT: v_writelane_b32 v40, s11, 7 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[12:13] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[30:31], 0x0 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v8i32_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v8i32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s64, 8 +; GFX10-NEXT: v_writelane_b32 v40, s65, 9 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 9 +; GFX10-NEXT: v_readlane_b32 s64, v40, 8 +; GFX10-NEXT: v_readlane_b32 s11, v40, 7 +; GFX10-NEXT: v_readlane_b32 s10, v40, 6 +; GFX10-NEXT: v_readlane_b32 s9, v40, 5 +; GFX10-NEXT: v_readlane_b32 s8, v40, 4 +; GFX10-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: v_readlane_b32 s33, v40, 10 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; 
GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v8i32_inreg: ; GFX10-SCRATCH: ; %bb.0: @@ -7920,28 +8711,44 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 10 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s9, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s10, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s11, 7 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v8i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v8i32_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 8 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 9 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 9 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 8 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s11, v40, 7 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s10, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4 +; 
GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 10 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] %ptr = load <8 x i32> addrspace(4)*, <8 x i32> addrspace(4)* addrspace(4)* undef %val = load <8 x i32>, <8 x i32> addrspace(4)* %ptr call amdgpu_gfx void @external_void_func_v8i32_inreg(<8 x i32> inreg %val) @@ -7952,13 +8759,21 @@ ; GFX9-LABEL: test_call_external_void_func_v8i32_imm_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: v_writelane_b32 v40, s33, 10 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: v_writelane_b32 v40, s8, 4 +; GFX9-NEXT: v_writelane_b32 v40, s9, 5 +; GFX9-NEXT: v_writelane_b32 v40, s10, 6 +; GFX9-NEXT: v_writelane_b32 v40, s11, 7 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 8 ; GFX9-NEXT: s_mov_b32 s4, 1 ; GFX9-NEXT: s_mov_b32 
s5, 2 ; GFX9-NEXT: s_mov_b32 s6, 3 @@ -7967,56 +8782,80 @@ ; GFX9-NEXT: s_mov_b32 s9, 6 ; GFX9-NEXT: s_mov_b32 s10, 7 ; GFX9-NEXT: s_mov_b32 s11, 8 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[12:13] -; GFX9-NEXT: s_add_u32 s12, s12, external_void_func_v8i32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s13, s13, external_void_func_v8i32_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[12:13] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 9 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v8i32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v8i32_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 9 +; GFX9-NEXT: v_readlane_b32 s64, v40, 8 +; GFX9-NEXT: v_readlane_b32 s11, v40, 7 +; GFX9-NEXT: v_readlane_b32 s10, v40, 6 +; GFX9-NEXT: v_readlane_b32 s9, v40, 5 +; GFX9-NEXT: v_readlane_b32 s8, v40, 4 +; GFX9-NEXT: v_readlane_b32 s7, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: v_readlane_b32 s33, v40, 10 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v8i32_imm_inreg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 
exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: v_writelane_b32 v40, s33, 10 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v8i32_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v8i32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 1 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 2 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: s_mov_b32 s6, 3 +; GFX10-NEXT: v_writelane_b32 v40, s7, 3 ; GFX10-NEXT: s_mov_b32 s7, 4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s8, 4 ; GFX10-NEXT: s_mov_b32 s8, 5 +; GFX10-NEXT: v_writelane_b32 v40, s9, 5 ; GFX10-NEXT: s_mov_b32 s9, 6 +; GFX10-NEXT: v_writelane_b32 v40, s10, 6 ; GFX10-NEXT: s_mov_b32 s10, 7 +; GFX10-NEXT: v_writelane_b32 v40, s11, 7 ; GFX10-NEXT: s_mov_b32 s11, 8 -; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_getpc_b64 s[12:13] -; GFX10-NEXT: s_add_u32 s12, s12, external_void_func_v8i32_inreg@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s13, s13, external_void_func_v8i32_inreg@rel32@hi+12 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[12:13] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_writelane_b32 v40, s64, 8 +; GFX10-NEXT: v_writelane_b32 v40, s65, 9 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 9 +; GFX10-NEXT: v_readlane_b32 s64, v40, 8 +; GFX10-NEXT: v_readlane_b32 s11, v40, 7 +; GFX10-NEXT: v_readlane_b32 s10, v40, 6 +; GFX10-NEXT: v_readlane_b32 s9, v40, 5 +; GFX10-NEXT: v_readlane_b32 s8, v40, 4 +; GFX10-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 
-; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: v_readlane_b32 s33, v40, 10 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v8i32_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: @@ -8026,33 +8865,49 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 10 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v8i32_inreg@rel32@lo+4 +; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v8i32_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 ; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4 ; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s9, 5 ; GFX10-SCRATCH-NEXT: s_mov_b32 s9, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s10, 6 ; GFX10-SCRATCH-NEXT: s_mov_b32 s10, 7 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s11, 7 ; GFX10-SCRATCH-NEXT: s_mov_b32 s11, 8 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; 
GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] -; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v8i32_inreg@rel32@lo+4 -; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v8i32_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 8 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 9 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 9 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 8 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s11, v40, 7 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s10, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 10 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_v8i32_inreg(<8 x i32> inreg ) ret void } @@ -8061,61 +8916,125 @@ ; GFX9-LABEL: test_call_external_void_func_v16i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: 
s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: v_writelane_b32 v40, s33, 18 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: v_writelane_b32 v40, s8, 4 +; GFX9-NEXT: v_writelane_b32 v40, s9, 5 +; GFX9-NEXT: v_writelane_b32 v40, s10, 6 +; GFX9-NEXT: v_writelane_b32 v40, s11, 7 +; GFX9-NEXT: v_writelane_b32 v40, s12, 8 +; GFX9-NEXT: v_writelane_b32 v40, s13, 9 +; GFX9-NEXT: v_writelane_b32 v40, s14, 10 +; GFX9-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s15, 11 +; GFX9-NEXT: v_writelane_b32 v40, s16, 12 +; GFX9-NEXT: v_writelane_b32 v40, s17, 13 +; GFX9-NEXT: v_writelane_b32 v40, s18, 14 +; GFX9-NEXT: v_writelane_b32 v40, s19, 15 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx16 s[4:19], s[30:31], 0x0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx16 s[4:19], s[4:5], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[20:21] -; GFX9-NEXT: s_add_u32 s20, s20, external_void_func_v16i32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_v16i32_inreg@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 16 +; GFX9-NEXT: v_writelane_b32 v40, s65, 17 +; GFX9-NEXT: s_getpc_b64 s[34:35] +; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v16i32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s35, s35, external_void_func_v16i32_inreg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: v_readlane_b32 s65, v40, 17 +; GFX9-NEXT: v_readlane_b32 s64, v40, 16 +; GFX9-NEXT: v_readlane_b32 s19, v40, 15 +; GFX9-NEXT: 
v_readlane_b32 s18, v40, 14 +; GFX9-NEXT: v_readlane_b32 s17, v40, 13 +; GFX9-NEXT: v_readlane_b32 s16, v40, 12 +; GFX9-NEXT: v_readlane_b32 s15, v40, 11 +; GFX9-NEXT: v_readlane_b32 s14, v40, 10 +; GFX9-NEXT: v_readlane_b32 s13, v40, 9 +; GFX9-NEXT: v_readlane_b32 s12, v40, 8 +; GFX9-NEXT: v_readlane_b32 s11, v40, 7 +; GFX9-NEXT: v_readlane_b32 s10, v40, 6 +; GFX9-NEXT: v_readlane_b32 s9, v40, 5 +; GFX9-NEXT: v_readlane_b32 s8, v40, 4 +; GFX9-NEXT: v_readlane_b32 s7, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: v_readlane_b32 s33, v40, 18 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v16i32_inreg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: v_writelane_b32 v40, s33, 18 +; GFX10-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[20:21] -; GFX10-NEXT: s_add_u32 s20, s20, external_void_func_v16i32_inreg@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s21, s21, external_void_func_v16i32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-NEXT: v_writelane_b32 v40, s8, 4 +; GFX10-NEXT: v_writelane_b32 v40, s9, 5 +; GFX10-NEXT: v_writelane_b32 v40, s10, 6 +; GFX10-NEXT: v_writelane_b32 v40, s11, 7 +; GFX10-NEXT: v_writelane_b32 v40, s12, 8 +; GFX10-NEXT: v_writelane_b32 v40, s13, 9 +; GFX10-NEXT: v_writelane_b32 v40, s14, 10 +; GFX10-NEXT: v_writelane_b32 v40, s15, 11 +; GFX10-NEXT: v_writelane_b32 v40, s16, 12 +; GFX10-NEXT: v_writelane_b32 v40, s17, 13 +; GFX10-NEXT: v_writelane_b32 v40, s18, 14 +; GFX10-NEXT: v_writelane_b32 v40, s19, 15 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx16 s[4:19], s[4:5], 0x0 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[20:21] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_load_dwordx16 s[4:19], s[30:31], 0x0 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v16i32_inreg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v16i32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s64, 16 +; GFX10-NEXT: v_writelane_b32 v40, s65, 17 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 17 +; GFX10-NEXT: v_readlane_b32 s64, v40, 16 +; GFX10-NEXT: v_readlane_b32 s19, v40, 15 +; GFX10-NEXT: v_readlane_b32 s18, v40, 14 +; GFX10-NEXT: v_readlane_b32 s17, v40, 13 +; GFX10-NEXT: v_readlane_b32 s16, v40, 12 +; GFX10-NEXT: v_readlane_b32 s15, v40, 11 +; GFX10-NEXT: v_readlane_b32 s14, v40, 10 +; GFX10-NEXT: v_readlane_b32 s13, v40, 9 +; GFX10-NEXT: v_readlane_b32 s12, v40, 8 +; GFX10-NEXT: v_readlane_b32 s11, v40, 7 +; GFX10-NEXT: v_readlane_b32 s10, v40, 6 +; GFX10-NEXT: v_readlane_b32 s9, v40, 5 +; GFX10-NEXT: v_readlane_b32 s8, v40, 4 +; GFX10-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, 
v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: v_readlane_b32 s33, v40, 18 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v16i32_inreg: ; GFX10-SCRATCH: ; %bb.0: @@ -8125,28 +9044,60 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 18 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s9, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s10, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s11, 7 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s12, 8 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s13, 9 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s14, 10 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s15, 11 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s16, 12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s17, 13 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s18, 14 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s19, 15 ; 
GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v16i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v16i32_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 17 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 17 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 16 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s19, v40, 15 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s18, v40, 14 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s17, v40, 13 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s16, v40, 12 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s15, v40, 11 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s14, v40, 10 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s13, v40, 9 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s12, v40, 8 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s11, v40, 7 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s10, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 18 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 
s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] %ptr = load <16 x i32> addrspace(4)*, <16 x i32> addrspace(4)* addrspace(4)* undef %val = load <16 x i32>, <16 x i32> addrspace(4)* %ptr call amdgpu_gfx void @external_void_func_v16i32_inreg(<16 x i32> inreg %val) @@ -8157,36 +9108,44 @@ ; GFX9-LABEL: test_call_external_void_func_v32i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v40, s33, 18 -; GFX9-NEXT: v_writelane_b32 v40, s36, 0 -; GFX9-NEXT: v_writelane_b32 v40, s37, 1 -; GFX9-NEXT: v_writelane_b32 v40, s38, 2 -; GFX9-NEXT: v_writelane_b32 v40, s39, 3 -; GFX9-NEXT: v_writelane_b32 v40, s40, 4 -; GFX9-NEXT: v_writelane_b32 v40, s41, 5 -; GFX9-NEXT: v_writelane_b32 v40, s42, 6 -; GFX9-NEXT: v_writelane_b32 v40, s43, 7 -; GFX9-NEXT: v_writelane_b32 v40, s44, 8 -; GFX9-NEXT: v_writelane_b32 v40, s45, 9 -; GFX9-NEXT: v_writelane_b32 v40, s46, 10 -; GFX9-NEXT: s_load_dwordx2 s[20:21], s[4:5], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s47, 11 -; GFX9-NEXT: v_writelane_b32 v40, s48, 12 -; GFX9-NEXT: v_writelane_b32 v40, s49, 13 -; GFX9-NEXT: v_writelane_b32 v40, s50, 14 -; GFX9-NEXT: v_writelane_b32 v40, s51, 15 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: v_writelane_b32 v40, s33, 28 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: v_writelane_b32 v40, s8, 4 +; GFX9-NEXT: v_writelane_b32 v40, s9, 5 +; GFX9-NEXT: v_writelane_b32 v40, s10, 6 +; GFX9-NEXT: v_writelane_b32 v40, s11, 7 +; GFX9-NEXT: v_writelane_b32 v40, s12, 8 +; GFX9-NEXT: v_writelane_b32 v40, s13, 9 +; GFX9-NEXT: v_writelane_b32 v40, s14, 10 +; GFX9-NEXT: v_writelane_b32 v40, s15, 11 +; GFX9-NEXT: 
v_writelane_b32 v40, s16, 12 +; GFX9-NEXT: v_writelane_b32 v40, s17, 13 +; GFX9-NEXT: v_writelane_b32 v40, s18, 14 +; GFX9-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s19, 15 +; GFX9-NEXT: v_writelane_b32 v40, s20, 16 +; GFX9-NEXT: v_writelane_b32 v40, s21, 17 +; GFX9-NEXT: v_writelane_b32 v40, s22, 18 +; GFX9-NEXT: v_writelane_b32 v40, s23, 19 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx16 s[4:19], s[20:21], 0x0 -; GFX9-NEXT: s_load_dwordx16 s[36:51], s[20:21], 0x40 +; GFX9-NEXT: s_load_dwordx16 s[4:19], s[30:31], 0x0 +; GFX9-NEXT: s_load_dwordx16 s[36:51], s[30:31], 0x40 +; GFX9-NEXT: v_writelane_b32 v40, s24, 20 +; GFX9-NEXT: v_writelane_b32 v40, s25, 21 +; GFX9-NEXT: v_writelane_b32 v40, s26, 22 +; GFX9-NEXT: v_writelane_b32 v40, s27, 23 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 17 +; GFX9-NEXT: v_writelane_b32 v40, s28, 24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s46 +; GFX9-NEXT: v_writelane_b32 v40, s29, 25 ; GFX9-NEXT: v_mov_b32_e32 v1, s47 ; GFX9-NEXT: v_mov_b32_e32 v2, s48 ; GFX9-NEXT: v_mov_b32_e32 v3, s49 @@ -8195,6 +9154,7 @@ ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; GFX9-NEXT: v_mov_b32_e32 v0, s50 +; GFX9-NEXT: v_writelane_b32 v40, s64, 26 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 ; GFX9-NEXT: v_mov_b32_e32 v0, s51 ; GFX9-NEXT: s_mov_b32 s20, s36 @@ -8207,89 +9167,100 @@ ; GFX9-NEXT: s_mov_b32 s27, s43 ; GFX9-NEXT: s_mov_b32 s28, s44 ; GFX9-NEXT: s_mov_b32 s29, s45 +; GFX9-NEXT: v_writelane_b32 v40, s65, 27 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 ; GFX9-NEXT: s_getpc_b64 s[30:31] ; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v32i32_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v32i32_inreg@rel32@hi+12 ; 
GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] -; GFX9-NEXT: v_readlane_b32 s4, v40, 16 -; GFX9-NEXT: v_readlane_b32 s5, v40, 17 -; GFX9-NEXT: v_readlane_b32 s51, v40, 15 -; GFX9-NEXT: v_readlane_b32 s50, v40, 14 -; GFX9-NEXT: v_readlane_b32 s49, v40, 13 -; GFX9-NEXT: v_readlane_b32 s48, v40, 12 -; GFX9-NEXT: v_readlane_b32 s47, v40, 11 -; GFX9-NEXT: v_readlane_b32 s46, v40, 10 -; GFX9-NEXT: v_readlane_b32 s45, v40, 9 -; GFX9-NEXT: v_readlane_b32 s44, v40, 8 -; GFX9-NEXT: v_readlane_b32 s43, v40, 7 -; GFX9-NEXT: v_readlane_b32 s42, v40, 6 -; GFX9-NEXT: v_readlane_b32 s41, v40, 5 -; GFX9-NEXT: v_readlane_b32 s40, v40, 4 -; GFX9-NEXT: v_readlane_b32 s39, v40, 3 -; GFX9-NEXT: v_readlane_b32 s38, v40, 2 -; GFX9-NEXT: v_readlane_b32 s37, v40, 1 -; GFX9-NEXT: v_readlane_b32 s36, v40, 0 +; GFX9-NEXT: v_readlane_b32 s65, v40, 27 +; GFX9-NEXT: v_readlane_b32 s64, v40, 26 +; GFX9-NEXT: v_readlane_b32 s29, v40, 25 +; GFX9-NEXT: v_readlane_b32 s28, v40, 24 +; GFX9-NEXT: v_readlane_b32 s27, v40, 23 +; GFX9-NEXT: v_readlane_b32 s26, v40, 22 +; GFX9-NEXT: v_readlane_b32 s25, v40, 21 +; GFX9-NEXT: v_readlane_b32 s24, v40, 20 +; GFX9-NEXT: v_readlane_b32 s23, v40, 19 +; GFX9-NEXT: v_readlane_b32 s22, v40, 18 +; GFX9-NEXT: v_readlane_b32 s21, v40, 17 +; GFX9-NEXT: v_readlane_b32 s20, v40, 16 +; GFX9-NEXT: v_readlane_b32 s19, v40, 15 +; GFX9-NEXT: v_readlane_b32 s18, v40, 14 +; GFX9-NEXT: v_readlane_b32 s17, v40, 13 +; GFX9-NEXT: v_readlane_b32 s16, v40, 12 +; GFX9-NEXT: v_readlane_b32 s15, v40, 11 +; GFX9-NEXT: v_readlane_b32 s14, v40, 10 +; GFX9-NEXT: v_readlane_b32 s13, v40, 9 +; GFX9-NEXT: v_readlane_b32 s12, v40, 8 +; GFX9-NEXT: v_readlane_b32 s11, v40, 7 +; GFX9-NEXT: v_readlane_b32 s10, v40, 6 +; GFX9-NEXT: v_readlane_b32 s9, v40, 5 +; GFX9-NEXT: v_readlane_b32 s8, v40, 4 +; GFX9-NEXT: v_readlane_b32 s7, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: 
v_readlane_b32 s33, v40, 18 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: v_readlane_b32 s33, v40, 28 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v32i32_inreg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v40, s33, 18 -; GFX10-NEXT: s_load_dwordx2 s[20:21], s[4:5], 0x0 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: v_writelane_b32 v40, s33, 28 +; GFX10-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s36, 0 -; GFX10-NEXT: v_writelane_b32 v40, s37, 1 -; GFX10-NEXT: v_writelane_b32 v40, s38, 2 -; GFX10-NEXT: v_writelane_b32 v40, s39, 3 -; GFX10-NEXT: v_writelane_b32 v40, s40, 4 -; GFX10-NEXT: v_writelane_b32 v40, s41, 5 -; GFX10-NEXT: v_writelane_b32 v40, s42, 6 -; GFX10-NEXT: v_writelane_b32 v40, s43, 7 -; GFX10-NEXT: v_writelane_b32 v40, s44, 8 -; GFX10-NEXT: v_writelane_b32 v40, s45, 9 -; GFX10-NEXT: v_writelane_b32 v40, s46, 10 -; GFX10-NEXT: v_writelane_b32 v40, s47, 11 -; GFX10-NEXT: v_writelane_b32 v40, s48, 12 -; GFX10-NEXT: v_writelane_b32 v40, s49, 13 -; GFX10-NEXT: v_writelane_b32 v40, s50, 14 -; GFX10-NEXT: v_writelane_b32 v40, s51, 15 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-NEXT: 
v_writelane_b32 v40, s8, 4 +; GFX10-NEXT: v_writelane_b32 v40, s9, 5 +; GFX10-NEXT: v_writelane_b32 v40, s10, 6 +; GFX10-NEXT: v_writelane_b32 v40, s11, 7 +; GFX10-NEXT: v_writelane_b32 v40, s12, 8 +; GFX10-NEXT: v_writelane_b32 v40, s13, 9 +; GFX10-NEXT: v_writelane_b32 v40, s14, 10 +; GFX10-NEXT: v_writelane_b32 v40, s15, 11 +; GFX10-NEXT: v_writelane_b32 v40, s16, 12 +; GFX10-NEXT: v_writelane_b32 v40, s17, 13 +; GFX10-NEXT: v_writelane_b32 v40, s18, 14 +; GFX10-NEXT: v_writelane_b32 v40, s19, 15 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx16 s[36:51], s[20:21], 0x40 -; GFX10-NEXT: s_load_dwordx16 s[4:19], s[20:21], 0x0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 16 -; GFX10-NEXT: v_writelane_b32 v40, s31, 17 +; GFX10-NEXT: s_load_dwordx16 s[36:51], s[30:31], 0x40 +; GFX10-NEXT: s_load_dwordx16 s[4:19], s[30:31], 0x0 ; GFX10-NEXT: s_getpc_b64 s[30:31] ; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v32i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v32i32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s20, 16 +; GFX10-NEXT: v_writelane_b32 v40, s21, 17 +; GFX10-NEXT: v_writelane_b32 v40, s22, 18 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s46 +; GFX10-NEXT: v_writelane_b32 v40, s23, 19 ; GFX10-NEXT: v_mov_b32_e32 v1, s47 ; GFX10-NEXT: v_mov_b32_e32 v2, s48 ; GFX10-NEXT: v_mov_b32_e32 v3, s49 ; GFX10-NEXT: s_mov_b32 s20, s36 +; GFX10-NEXT: v_writelane_b32 v40, s24, 20 ; GFX10-NEXT: s_mov_b32 s21, s37 ; GFX10-NEXT: s_mov_b32 s22, s38 ; GFX10-NEXT: s_mov_b32 s23, s39 ; GFX10-NEXT: s_mov_b32 s24, s40 +; GFX10-NEXT: v_writelane_b32 v40, s25, 21 ; GFX10-NEXT: s_mov_b32 s25, s41 -; GFX10-NEXT: s_mov_b32 s26, s42 -; GFX10-NEXT: s_mov_b32 s27, s43 -; GFX10-NEXT: s_mov_b32 s28, s44 -; GFX10-NEXT: s_mov_b32 s29, s45 ; GFX10-NEXT: v_mov_b32_e32 v4, s50 ; GFX10-NEXT: v_mov_b32_e32 v5, s51 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 @@ -8298,33 +9269,53 @@ ; 
GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; GFX10-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 +; GFX10-NEXT: v_writelane_b32 v40, s26, 22 +; GFX10-NEXT: s_mov_b32 s26, s42 +; GFX10-NEXT: v_writelane_b32 v40, s27, 23 +; GFX10-NEXT: s_mov_b32 s27, s43 +; GFX10-NEXT: v_writelane_b32 v40, s28, 24 +; GFX10-NEXT: s_mov_b32 s28, s44 +; GFX10-NEXT: v_writelane_b32 v40, s29, 25 +; GFX10-NEXT: s_mov_b32 s29, s45 +; GFX10-NEXT: v_writelane_b32 v40, s64, 26 +; GFX10-NEXT: v_writelane_b32 v40, s65, 27 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] -; GFX10-NEXT: v_readlane_b32 s4, v40, 16 -; GFX10-NEXT: v_readlane_b32 s5, v40, 17 -; GFX10-NEXT: v_readlane_b32 s51, v40, 15 -; GFX10-NEXT: v_readlane_b32 s50, v40, 14 -; GFX10-NEXT: v_readlane_b32 s49, v40, 13 -; GFX10-NEXT: v_readlane_b32 s48, v40, 12 -; GFX10-NEXT: v_readlane_b32 s47, v40, 11 -; GFX10-NEXT: v_readlane_b32 s46, v40, 10 -; GFX10-NEXT: v_readlane_b32 s45, v40, 9 -; GFX10-NEXT: v_readlane_b32 s44, v40, 8 -; GFX10-NEXT: v_readlane_b32 s43, v40, 7 -; GFX10-NEXT: v_readlane_b32 s42, v40, 6 -; GFX10-NEXT: v_readlane_b32 s41, v40, 5 -; GFX10-NEXT: v_readlane_b32 s40, v40, 4 -; GFX10-NEXT: v_readlane_b32 s39, v40, 3 -; GFX10-NEXT: v_readlane_b32 s38, v40, 2 -; GFX10-NEXT: v_readlane_b32 s37, v40, 1 -; GFX10-NEXT: v_readlane_b32 s36, v40, 0 +; GFX10-NEXT: v_readlane_b32 s65, v40, 27 +; GFX10-NEXT: v_readlane_b32 s64, v40, 26 +; GFX10-NEXT: v_readlane_b32 s29, v40, 25 +; GFX10-NEXT: v_readlane_b32 s28, v40, 24 +; GFX10-NEXT: v_readlane_b32 s27, v40, 23 +; GFX10-NEXT: v_readlane_b32 s26, v40, 22 +; GFX10-NEXT: v_readlane_b32 s25, v40, 21 +; GFX10-NEXT: v_readlane_b32 s24, v40, 20 +; GFX10-NEXT: v_readlane_b32 s23, v40, 19 +; GFX10-NEXT: v_readlane_b32 s22, v40, 18 +; GFX10-NEXT: v_readlane_b32 s21, v40, 17 +; GFX10-NEXT: v_readlane_b32 s20, v40, 16 +; GFX10-NEXT: v_readlane_b32 s19, v40, 15 +; GFX10-NEXT: v_readlane_b32 s18, v40, 14 
+; GFX10-NEXT: v_readlane_b32 s17, v40, 13 +; GFX10-NEXT: v_readlane_b32 s16, v40, 12 +; GFX10-NEXT: v_readlane_b32 s15, v40, 11 +; GFX10-NEXT: v_readlane_b32 s14, v40, 10 +; GFX10-NEXT: v_readlane_b32 s13, v40, 9 +; GFX10-NEXT: v_readlane_b32 s12, v40, 8 +; GFX10-NEXT: v_readlane_b32 s11, v40, 7 +; GFX10-NEXT: v_readlane_b32 s10, v40, 6 +; GFX10-NEXT: v_readlane_b32 s9, v40, 5 +; GFX10-NEXT: v_readlane_b32 s8, v40, 4 +; GFX10-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 18 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: v_readlane_b32 s33, v40, 28 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v32i32_inreg: ; GFX10-SCRATCH: ; %bb.0: @@ -8334,26 +9325,26 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 18 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 28 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s36, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s37, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s38, 2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s39, 3 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s40, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s41, 5 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s42, 6 -; GFX10-SCRATCH-NEXT: 
v_writelane_b32 v40, s43, 7 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s44, 8 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s45, 9 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s46, 10 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s47, 11 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s48, 12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s49, 13 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s50, 14 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s51, 15 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s9, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s10, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s11, 7 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s12, 8 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s13, 9 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s14, 10 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s15, 11 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s16, 12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s17, 13 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s18, 14 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s19, 15 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_clause 0x1 ; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x40 @@ -8361,54 +9352,74 @@ ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v32i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 17 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s20, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s21, 17 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s22, 18 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, s50 +; GFX10-SCRATCH-NEXT: 
v_writelane_b32 v40, s23, 19 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, s51 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s46 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, s47 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, s48 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s24, 20 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, s49 ; GFX10-SCRATCH-NEXT: s_mov_b32 s20, s36 ; GFX10-SCRATCH-NEXT: s_mov_b32 s21, s37 ; GFX10-SCRATCH-NEXT: s_mov_b32 s22, s38 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s25, 21 ; GFX10-SCRATCH-NEXT: s_mov_b32 s23, s39 ; GFX10-SCRATCH-NEXT: s_mov_b32 s24, s40 ; GFX10-SCRATCH-NEXT: s_mov_b32 s25, s41 +; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[4:5], s32 offset:16 +; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s26, 22 ; GFX10-SCRATCH-NEXT: s_mov_b32 s26, s42 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s27, 23 ; GFX10-SCRATCH-NEXT: s_mov_b32 s27, s43 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s28, 24 ; GFX10-SCRATCH-NEXT: s_mov_b32 s28, s44 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s29, 25 ; GFX10-SCRATCH-NEXT: s_mov_b32 s29, s45 -; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[4:5], s32 offset:16 -; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 26 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 27 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 17 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s51, v40, 15 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s50, v40, 14 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s49, v40, 13 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s48, v40, 12 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s47, v40, 11 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s46, v40, 10 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s45, v40, 9 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s44, v40, 8 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s43, v40, 7 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s42, v40, 
6 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s41, v40, 5 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s40, v40, 4 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s39, v40, 3 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s38, v40, 2 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s37, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s36, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 27 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 26 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s29, v40, 25 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s28, v40, 24 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s27, v40, 23 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s26, v40, 22 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s25, v40, 21 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s24, v40, 20 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s23, v40, 19 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s22, v40, 18 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s21, v40, 17 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s20, v40, 16 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s19, v40, 15 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s18, v40, 14 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s17, v40, 13 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s16, v40, 12 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s15, v40, 11 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s14, v40, 10 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s13, v40, 9 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s12, v40, 8 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s11, v40, 7 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s10, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 18 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 28 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword 
v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] %ptr = load <32 x i32> addrspace(4)*, <32 x i32> addrspace(4)* addrspace(4)* undef %val = load <32 x i32>, <32 x i32> addrspace(4)* %ptr call amdgpu_gfx void @external_void_func_v32i32_inreg(<32 x i32> inreg %val) @@ -8419,46 +9430,58 @@ ; GFX9-LABEL: test_call_external_void_func_v32i32_i32_inreg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v40, s33, 18 -; GFX9-NEXT: v_writelane_b32 v40, s36, 0 -; GFX9-NEXT: v_writelane_b32 v40, s37, 1 -; GFX9-NEXT: v_writelane_b32 v40, s38, 2 -; GFX9-NEXT: v_writelane_b32 v40, s39, 3 -; GFX9-NEXT: v_writelane_b32 v40, s40, 4 -; GFX9-NEXT: v_writelane_b32 v40, s41, 5 -; GFX9-NEXT: v_writelane_b32 v40, s42, 6 -; GFX9-NEXT: v_writelane_b32 v40, s43, 7 -; GFX9-NEXT: v_writelane_b32 v40, s44, 8 -; GFX9-NEXT: v_writelane_b32 v40, s45, 9 -; GFX9-NEXT: v_writelane_b32 v40, s46, 10 -; GFX9-NEXT: v_writelane_b32 v40, s47, 11 -; GFX9-NEXT: s_load_dwordx2 s[20:21], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s22, s[4:5], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s48, 12 -; GFX9-NEXT: v_writelane_b32 v40, s49, 13 -; GFX9-NEXT: v_writelane_b32 v40, s50, 14 -; GFX9-NEXT: v_writelane_b32 v40, s51, 15 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: v_writelane_b32 v40, s33, 28 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: v_writelane_b32 v40, s8, 4 +; GFX9-NEXT: 
v_writelane_b32 v40, s9, 5 +; GFX9-NEXT: v_writelane_b32 v40, s10, 6 +; GFX9-NEXT: v_writelane_b32 v40, s11, 7 +; GFX9-NEXT: v_writelane_b32 v40, s12, 8 +; GFX9-NEXT: v_writelane_b32 v40, s13, 9 +; GFX9-NEXT: v_writelane_b32 v40, s14, 10 +; GFX9-NEXT: v_writelane_b32 v40, s15, 11 +; GFX9-NEXT: v_writelane_b32 v40, s16, 12 +; GFX9-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s17, 13 +; GFX9-NEXT: v_writelane_b32 v40, s18, 14 +; GFX9-NEXT: v_writelane_b32 v40, s19, 15 +; GFX9-NEXT: v_writelane_b32 v40, s20, 16 +; GFX9-NEXT: v_writelane_b32 v40, s21, 17 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx16 s[4:19], s[20:21], 0x0 -; GFX9-NEXT: s_load_dwordx16 s[36:51], s[20:21], 0x40 +; GFX9-NEXT: s_load_dword s34, s[30:31], 0x0 +; GFX9-NEXT: ; kill: killed $sgpr30_sgpr31 +; GFX9-NEXT: ; kill: killed $sgpr30_sgpr31 +; GFX9-NEXT: s_load_dwordx16 s[4:19], s[30:31], 0x0 +; GFX9-NEXT: s_load_dwordx16 s[36:51], s[30:31], 0x40 +; GFX9-NEXT: v_writelane_b32 v40, s22, 18 +; GFX9-NEXT: v_writelane_b32 v40, s23, 19 +; GFX9-NEXT: v_writelane_b32 v40, s24, 20 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_mov_b32_e32 v0, s22 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 +; GFX9-NEXT: v_writelane_b32 v40, s25, 21 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-NEXT: v_writelane_b32 v40, s26, 22 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; GFX9-NEXT: v_mov_b32_e32 v0, s46 +; GFX9-NEXT: v_writelane_b32 v40, s27, 23 ; GFX9-NEXT: v_mov_b32_e32 v1, s47 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, s48 +; GFX9-NEXT: v_writelane_b32 v40, s28, 24 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, s49 +; GFX9-NEXT: v_writelane_b32 v40, s29, 25 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:12 ; GFX9-NEXT: v_mov_b32_e32 v0, s50 -; GFX9-NEXT: v_writelane_b32 v40, s30, 16 +; GFX9-NEXT: v_writelane_b32 v40, s64, 26 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 ; GFX9-NEXT: v_mov_b32_e32 v0, s51 ; GFX9-NEXT: s_mov_b32 s20, s36 @@ -8471,129 +9494,160 @@ ; GFX9-NEXT: s_mov_b32 s27, s43 ; GFX9-NEXT: s_mov_b32 s28, s44 ; GFX9-NEXT: s_mov_b32 s29, s45 -; GFX9-NEXT: v_writelane_b32 v40, s31, 17 +; GFX9-NEXT: v_writelane_b32 v40, s65, 27 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 ; GFX9-NEXT: s_getpc_b64 s[30:31] ; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v32i32_i32_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v32i32_i32_inreg@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] -; GFX9-NEXT: v_readlane_b32 s4, v40, 16 -; GFX9-NEXT: v_readlane_b32 s5, v40, 17 -; GFX9-NEXT: v_readlane_b32 s51, v40, 15 -; GFX9-NEXT: v_readlane_b32 s50, v40, 14 -; GFX9-NEXT: v_readlane_b32 s49, v40, 13 -; GFX9-NEXT: v_readlane_b32 s48, v40, 12 -; GFX9-NEXT: v_readlane_b32 s47, v40, 11 -; GFX9-NEXT: v_readlane_b32 s46, v40, 10 -; GFX9-NEXT: v_readlane_b32 s45, v40, 9 -; GFX9-NEXT: v_readlane_b32 s44, v40, 8 -; GFX9-NEXT: v_readlane_b32 s43, v40, 7 -; GFX9-NEXT: v_readlane_b32 s42, v40, 6 -; GFX9-NEXT: v_readlane_b32 s41, v40, 5 -; GFX9-NEXT: v_readlane_b32 s40, v40, 4 -; GFX9-NEXT: v_readlane_b32 s39, v40, 3 -; GFX9-NEXT: v_readlane_b32 s38, v40, 2 -; GFX9-NEXT: v_readlane_b32 s37, v40, 1 -; GFX9-NEXT: v_readlane_b32 s36, v40, 0 +; GFX9-NEXT: v_readlane_b32 s65, v40, 27 +; GFX9-NEXT: v_readlane_b32 s64, v40, 26 +; GFX9-NEXT: v_readlane_b32 s29, v40, 25 +; GFX9-NEXT: v_readlane_b32 s28, v40, 24 +; GFX9-NEXT: v_readlane_b32 s27, v40, 23 +; GFX9-NEXT: v_readlane_b32 s26, v40, 22 +; GFX9-NEXT: v_readlane_b32 s25, v40, 21 +; GFX9-NEXT: v_readlane_b32 s24, v40, 20 +; GFX9-NEXT: v_readlane_b32 s23, v40, 19 +; GFX9-NEXT: v_readlane_b32 s22, v40, 18 +; GFX9-NEXT: v_readlane_b32 s21, v40, 17 +; GFX9-NEXT: 
v_readlane_b32 s20, v40, 16 +; GFX9-NEXT: v_readlane_b32 s19, v40, 15 +; GFX9-NEXT: v_readlane_b32 s18, v40, 14 +; GFX9-NEXT: v_readlane_b32 s17, v40, 13 +; GFX9-NEXT: v_readlane_b32 s16, v40, 12 +; GFX9-NEXT: v_readlane_b32 s15, v40, 11 +; GFX9-NEXT: v_readlane_b32 s14, v40, 10 +; GFX9-NEXT: v_readlane_b32 s13, v40, 9 +; GFX9-NEXT: v_readlane_b32 s12, v40, 8 +; GFX9-NEXT: v_readlane_b32 s11, v40, 7 +; GFX9-NEXT: v_readlane_b32 s10, v40, 6 +; GFX9-NEXT: v_readlane_b32 s9, v40, 5 +; GFX9-NEXT: v_readlane_b32 s8, v40, 4 +; GFX9-NEXT: v_readlane_b32 s7, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 18 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: v_readlane_b32 s33, v40, 28 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_v32i32_i32_inreg: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v40, s33, 18 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[20:21], s[4:5], 0x0 -; GFX10-NEXT: s_load_dword s22, s[4:5], 0x0 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: v_writelane_b32 v40, s33, 28 +; GFX10-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s36, 0 -; GFX10-NEXT: 
v_writelane_b32 v40, s37, 1 -; GFX10-NEXT: v_writelane_b32 v40, s38, 2 -; GFX10-NEXT: v_writelane_b32 v40, s39, 3 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-NEXT: v_writelane_b32 v40, s8, 4 +; GFX10-NEXT: v_writelane_b32 v40, s9, 5 +; GFX10-NEXT: v_writelane_b32 v40, s10, 6 +; GFX10-NEXT: v_writelane_b32 v40, s11, 7 +; GFX10-NEXT: v_writelane_b32 v40, s12, 8 +; GFX10-NEXT: v_writelane_b32 v40, s13, 9 +; GFX10-NEXT: v_writelane_b32 v40, s14, 10 +; GFX10-NEXT: v_writelane_b32 v40, s15, 11 +; GFX10-NEXT: v_writelane_b32 v40, s16, 12 +; GFX10-NEXT: v_writelane_b32 v40, s17, 13 +; GFX10-NEXT: v_writelane_b32 v40, s18, 14 +; GFX10-NEXT: v_writelane_b32 v40, s19, 15 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s22 -; GFX10-NEXT: v_writelane_b32 v40, s40, 4 -; GFX10-NEXT: v_writelane_b32 v40, s41, 5 -; GFX10-NEXT: v_writelane_b32 v40, s42, 6 -; GFX10-NEXT: v_writelane_b32 v40, s43, 7 -; GFX10-NEXT: v_writelane_b32 v40, s44, 8 -; GFX10-NEXT: v_writelane_b32 v40, s45, 9 -; GFX10-NEXT: v_writelane_b32 v40, s46, 10 -; GFX10-NEXT: v_writelane_b32 v40, s47, 11 -; GFX10-NEXT: v_writelane_b32 v40, s48, 12 -; GFX10-NEXT: v_writelane_b32 v40, s49, 13 -; GFX10-NEXT: v_writelane_b32 v40, s50, 14 -; GFX10-NEXT: v_writelane_b32 v40, s51, 15 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx16 s[36:51], s[20:21], 0x40 -; GFX10-NEXT: s_load_dwordx16 s[4:19], s[20:21], 0x0 -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 -; GFX10-NEXT: v_writelane_b32 v40, s30, 16 -; GFX10-NEXT: v_writelane_b32 v40, s31, 17 +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: s_load_dword s34, s[30:31], 0x0 +; GFX10-NEXT: ; meta instruction +; GFX10-NEXT: ; meta instruction +; GFX10-NEXT: s_load_dwordx16 s[36:51], s[30:31], 0x40 +; GFX10-NEXT: s_load_dwordx16 s[4:19], s[30:31], 0x0 ; GFX10-NEXT: s_getpc_b64 s[30:31] ; GFX10-NEXT: s_add_u32 
s30, s30, external_void_func_v32i32_i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v32i32_i32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s20, 16 +; GFX10-NEXT: v_writelane_b32 v40, s21, 17 +; GFX10-NEXT: v_writelane_b32 v40, s22, 18 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s46 +; GFX10-NEXT: v_mov_b32_e32 v0, s34 ; GFX10-NEXT: v_mov_b32_e32 v1, s47 +; GFX10-NEXT: v_writelane_b32 v40, s23, 19 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 +; GFX10-NEXT: v_mov_b32_e32 v0, s46 ; GFX10-NEXT: v_mov_b32_e32 v2, s48 ; GFX10-NEXT: v_mov_b32_e32 v3, s49 +; GFX10-NEXT: v_writelane_b32 v40, s24, 20 ; GFX10-NEXT: s_mov_b32 s20, s36 ; GFX10-NEXT: s_mov_b32 s21, s37 ; GFX10-NEXT: s_mov_b32 s22, s38 ; GFX10-NEXT: s_mov_b32 s23, s39 +; GFX10-NEXT: v_writelane_b32 v40, s25, 21 ; GFX10-NEXT: s_mov_b32 s24, s40 ; GFX10-NEXT: s_mov_b32 s25, s41 -; GFX10-NEXT: s_mov_b32 s26, s42 -; GFX10-NEXT: s_mov_b32 s27, s43 -; GFX10-NEXT: s_mov_b32 s28, s44 -; GFX10-NEXT: s_mov_b32 s29, s45 ; GFX10-NEXT: v_mov_b32_e32 v4, s50 ; GFX10-NEXT: v_mov_b32_e32 v5, s51 +; GFX10-NEXT: v_writelane_b32 v40, s26, 22 +; GFX10-NEXT: s_mov_b32 s26, s42 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; GFX10-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 +; GFX10-NEXT: v_writelane_b32 v40, s27, 23 +; GFX10-NEXT: s_mov_b32 s27, s43 +; GFX10-NEXT: v_writelane_b32 v40, s28, 24 +; GFX10-NEXT: s_mov_b32 s28, s44 +; GFX10-NEXT: v_writelane_b32 v40, s29, 25 +; GFX10-NEXT: s_mov_b32 s29, s45 +; GFX10-NEXT: v_writelane_b32 v40, s64, 26 +; GFX10-NEXT: v_writelane_b32 v40, s65, 27 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] -; GFX10-NEXT: v_readlane_b32 s4, v40, 16 -; 
GFX10-NEXT: v_readlane_b32 s5, v40, 17 -; GFX10-NEXT: v_readlane_b32 s51, v40, 15 -; GFX10-NEXT: v_readlane_b32 s50, v40, 14 -; GFX10-NEXT: v_readlane_b32 s49, v40, 13 -; GFX10-NEXT: v_readlane_b32 s48, v40, 12 -; GFX10-NEXT: v_readlane_b32 s47, v40, 11 -; GFX10-NEXT: v_readlane_b32 s46, v40, 10 -; GFX10-NEXT: v_readlane_b32 s45, v40, 9 -; GFX10-NEXT: v_readlane_b32 s44, v40, 8 -; GFX10-NEXT: v_readlane_b32 s43, v40, 7 -; GFX10-NEXT: v_readlane_b32 s42, v40, 6 -; GFX10-NEXT: v_readlane_b32 s41, v40, 5 -; GFX10-NEXT: v_readlane_b32 s40, v40, 4 -; GFX10-NEXT: v_readlane_b32 s39, v40, 3 -; GFX10-NEXT: v_readlane_b32 s38, v40, 2 -; GFX10-NEXT: v_readlane_b32 s37, v40, 1 -; GFX10-NEXT: v_readlane_b32 s36, v40, 0 +; GFX10-NEXT: v_readlane_b32 s65, v40, 27 +; GFX10-NEXT: v_readlane_b32 s64, v40, 26 +; GFX10-NEXT: v_readlane_b32 s29, v40, 25 +; GFX10-NEXT: v_readlane_b32 s28, v40, 24 +; GFX10-NEXT: v_readlane_b32 s27, v40, 23 +; GFX10-NEXT: v_readlane_b32 s26, v40, 22 +; GFX10-NEXT: v_readlane_b32 s25, v40, 21 +; GFX10-NEXT: v_readlane_b32 s24, v40, 20 +; GFX10-NEXT: v_readlane_b32 s23, v40, 19 +; GFX10-NEXT: v_readlane_b32 s22, v40, 18 +; GFX10-NEXT: v_readlane_b32 s21, v40, 17 +; GFX10-NEXT: v_readlane_b32 s20, v40, 16 +; GFX10-NEXT: v_readlane_b32 s19, v40, 15 +; GFX10-NEXT: v_readlane_b32 s18, v40, 14 +; GFX10-NEXT: v_readlane_b32 s17, v40, 13 +; GFX10-NEXT: v_readlane_b32 s16, v40, 12 +; GFX10-NEXT: v_readlane_b32 s15, v40, 11 +; GFX10-NEXT: v_readlane_b32 s14, v40, 10 +; GFX10-NEXT: v_readlane_b32 s13, v40, 9 +; GFX10-NEXT: v_readlane_b32 s12, v40, 8 +; GFX10-NEXT: v_readlane_b32 s11, v40, 7 +; GFX10-NEXT: v_readlane_b32 s10, v40, 6 +; GFX10-NEXT: v_readlane_b32 s9, v40, 5 +; GFX10-NEXT: v_readlane_b32 s8, v40, 4 +; GFX10-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 18 -; GFX10-NEXT: 
s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: v_readlane_b32 s33, v40, 28 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v32i32_i32_inreg: ; GFX10-SCRATCH: ; %bb.0: @@ -8603,26 +9657,26 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 18 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 28 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s36, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s37, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s38, 2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s39, 3 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s40, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s41, 5 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s42, 6 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s43, 7 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s44, 8 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s45, 9 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s46, 10 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s47, 11 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s48, 12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s49, 13 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s50, 14 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s51, 15 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 
v40, s8, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s9, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s10, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s11, 7 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s12, 8 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s13, 9 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s14, 10 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s15, 11 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s16, 12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s17, 13 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s18, 14 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s19, 15 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_clause 0x2 ; GFX10-SCRATCH-NEXT: s_load_dword s2, s[0:1], 0x0 @@ -8633,56 +9687,76 @@ ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v32i32_i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32_i32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 17 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s20, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s21, 17 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s22, 18 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, s2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, s50 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s23, 19 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, s51 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s46 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, s47 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, s48 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s24, 20 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, s49 ; GFX10-SCRATCH-NEXT: s_mov_b32 s20, s36 ; GFX10-SCRATCH-NEXT: s_mov_b32 s21, s37 ; GFX10-SCRATCH-NEXT: s_mov_b32 s22, s38 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s25, 21 ; GFX10-SCRATCH-NEXT: s_mov_b32 s23, s39 ; GFX10-SCRATCH-NEXT: s_mov_b32 s24, s40 ; GFX10-SCRATCH-NEXT: s_mov_b32 s25, s41 +; GFX10-SCRATCH-NEXT: 
scratch_store_dword off, v6, s32 offset:24 +; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[4:5], s32 offset:16 +; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s26, 22 ; GFX10-SCRATCH-NEXT: s_mov_b32 s26, s42 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s27, 23 ; GFX10-SCRATCH-NEXT: s_mov_b32 s27, s43 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s28, 24 ; GFX10-SCRATCH-NEXT: s_mov_b32 s28, s44 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s29, 25 ; GFX10-SCRATCH-NEXT: s_mov_b32 s29, s45 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v6, s32 offset:24 -; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[4:5], s32 offset:16 -; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 26 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 27 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 17 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s51, v40, 15 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s50, v40, 14 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s49, v40, 13 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s48, v40, 12 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s47, v40, 11 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s46, v40, 10 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s45, v40, 9 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s44, v40, 8 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s43, v40, 7 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s42, v40, 6 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s41, v40, 5 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s40, v40, 4 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s39, v40, 3 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s38, v40, 2 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s37, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s36, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 27 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 26 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s29, v40, 25 +; GFX10-SCRATCH-NEXT: 
v_readlane_b32 s28, v40, 24 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s27, v40, 23 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s26, v40, 22 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s25, v40, 21 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s24, v40, 20 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s23, v40, 19 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s22, v40, 18 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s21, v40, 17 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s20, v40, 16 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s19, v40, 15 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s18, v40, 14 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s17, v40, 13 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s16, v40, 12 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s15, v40, 11 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s14, v40, 10 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s13, v40, 9 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s12, v40, 8 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s11, v40, 7 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s10, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 18 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 28 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] %ptr0 = load <32 x i32> addrspace(4)*, <32 x i32> addrspace(4)* addrspace(4)* undef %val0 = load <32 x i32>, <32 x i32> addrspace(4)* %ptr0 %val1 = load i32, i32 addrspace(4)* 
undef @@ -8694,68 +9768,68 @@ ; GFX9-LABEL: stack_passed_arg_alignment_v32i32_f64: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:4 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, stack_passed_f64_arg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, stack_passed_f64_arg@rel32@hi+12 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; 
GFX10-LABEL: stack_passed_arg_alignment_v32i32_f64: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 ; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, stack_passed_f64_arg@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, stack_passed_f64_arg@rel32@hi+12 ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: 
s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: stack_passed_arg_alignment_v32i32_f64: ; GFX10-SCRATCH: ; %bb.0: ; %entry @@ -8772,21 +9846,21 @@ ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, stack_passed_f64_arg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, stack_passed_f64_arg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[32:33], s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] entry: call amdgpu_gfx void @stack_passed_f64_arg(<32 x i32> %val, double %tmp) ret void @@ -8796,9 +9870,9 @@ ; GFX9-LABEL: stack_12xv3i32: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; 
GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -8809,7 +9883,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 14 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -8843,29 +9917,29 @@ ; GFX9-NEXT: v_mov_b32_e32 v29, 9 ; GFX9-NEXT: v_mov_b32_e32 v30, 10 ; GFX9-NEXT: v_mov_b32_e32 v31, 11 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3i32@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_12xv3i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_12xv3i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: stack_12xv3i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], 
s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 12 ; GFX10-NEXT: v_mov_b32_e32 v1, 13 @@ -8873,7 +9947,7 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 @@ -8910,21 +9984,21 @@ ; GFX10-NEXT: v_mov_b32_e32 v29, 9 ; GFX10-NEXT: v_mov_b32_e32 v30, 10 ; GFX10-NEXT: v_mov_b32_e32 v31, 11 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3i32@rel32@hi+12 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_12xv3i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_12xv3i32@rel32@hi+12 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: stack_12xv3i32: ; GFX10-SCRATCH: ; 
%bb.0: ; %entry @@ -8941,7 +10015,7 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 15 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1 ; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 @@ -8975,21 +10049,21 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v29, 9 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v30, 10 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v31, 11 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_12xv3i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_12xv3i32@rel32@hi+12 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] entry: call amdgpu_gfx void @external_void_func_12xv3i32( <3 x i32>, @@ -9011,9 +10085,9 @@ ; GFX9-LABEL: stack_8xv5i32: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 
4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -9032,7 +10106,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 14 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -9066,29 +10140,29 @@ ; GFX9-NEXT: v_mov_b32_e32 v29, 5 ; GFX9-NEXT: v_mov_b32_e32 v30, 6 ; GFX9-NEXT: v_mov_b32_e32 v31, 7 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5i32@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_8xv5i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_8xv5i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: stack_8xv5i32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 
+; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_mov_b32_e32 v0, 8 ; GFX10-NEXT: v_mov_b32_e32 v1, 9 ; GFX10-NEXT: v_mov_b32_e32 v2, 10 @@ -9103,7 +10177,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, 13 ; GFX10-NEXT: v_mov_b32_e32 v3, 14 ; GFX10-NEXT: v_mov_b32_e32 v4, 15 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:16 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 @@ -9141,21 +10215,21 @@ ; GFX10-NEXT: v_mov_b32_e32 v29, 5 ; GFX10-NEXT: v_mov_b32_e32 v30, 6 ; GFX10-NEXT: v_mov_b32_e32 v31, 7 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5i32@rel32@hi+12 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_8xv5i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_8xv5i32@rel32@hi+12 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; 
GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: stack_8xv5i32: ; GFX10-SCRATCH: ; %bb.0: ; %entry @@ -9176,7 +10250,7 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 11 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 ; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[4:7], s32 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 @@ -9211,21 +10285,21 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v29, 5 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v30, 6 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v31, 7 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_8xv5i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_8xv5i32@rel32@hi+12 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] entry: call amdgpu_gfx void @external_void_func_8xv5i32( <5 x i32>, @@ -9243,9 +10317,9 @@ ; GFX9-LABEL: stack_8xv5f32: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: 
s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -9264,7 +10338,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41600000 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41700000 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -9298,29 +10372,29 @@ ; GFX9-NEXT: v_mov_b32_e32 v29, 0x40a00000 ; GFX9-NEXT: v_mov_b32_e32 v30, 0x40c00000 ; GFX9-NEXT: v_mov_b32_e32 v31, 0x40e00000 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5f32@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_8xv5f32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_8xv5f32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: 
stack_8xv5f32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x41000000 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x41100000 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x41200000 @@ -9335,7 +10409,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, 0x41500000 ; GFX10-NEXT: v_mov_b32_e32 v3, 0x41600000 ; GFX10-NEXT: v_mov_b32_e32 v4, 0x41700000 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:16 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 @@ -9373,21 +10447,21 @@ ; GFX10-NEXT: v_mov_b32_e32 v29, 0x40a00000 ; GFX10-NEXT: v_mov_b32_e32 v30, 0x40c00000 ; GFX10-NEXT: v_mov_b32_e32 v31, 0x40e00000 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5f32@rel32@hi+12 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_8xv5f32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_8xv5f32@rel32@hi+12 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; 
GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-SCRATCH-LABEL: stack_8xv5f32: ; GFX10-SCRATCH: ; %bb.0: ; %entry @@ -9408,7 +10482,7 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 0x41300000 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 ; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[4:7], s32 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 @@ -9443,21 +10517,21 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v29, 0x40a00000 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v30, 0x40c00000 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v31, 0x40e00000 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s65, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_8xv5f32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_8xv5f32@rel32@hi+12 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; 
GFX10-SCRATCH-NEXT: s_setpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_setpc_b64 s[64:65] entry: call amdgpu_gfx void @external_void_func_8xv5f32( <5 x float>, diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll @@ -8,69 +8,69 @@ ; GFX9-LABEL: test_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v40, s33, 4 -; GFX9-NEXT: v_writelane_b32 v40, s34, 0 -; GFX9-NEXT: v_writelane_b32 v40, s35, 1 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 2 -; GFX9-NEXT: v_writelane_b32 v40, s31, 3 -; GFX9-NEXT: s_getpc_b64 s[34:35] -; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: v_writelane_b32 v40, s64, 2 +; GFX9-NEXT: v_writelane_b32 v40, s65, 3 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 -; GFX9-NEXT: v_readlane_b32 s5, v40, 3 -; GFX9-NEXT: v_readlane_b32 s35, v40, 1 -; GFX9-NEXT: v_readlane_b32 s34, v40, 0 +; GFX9-NEXT: 
s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s65, v40, 3 +; GFX9-NEXT: v_readlane_b32 s64, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 4 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_writelane_b32 v40, s33, 4 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s34, 0 -; GFX10-NEXT: v_writelane_b32 v40, s35, 1 -; GFX10-NEXT: s_getpc_b64 s[34:35] -; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 2 -; GFX10-NEXT: v_writelane_b32 v40, s31, 3 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s64, 2 +; GFX10-NEXT: v_writelane_b32 v40, s65, 3 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: 
;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s4, v40, 2 -; GFX10-NEXT: v_readlane_b32 s5, v40, 3 -; GFX10-NEXT: v_readlane_b32 s35, v40, 1 -; GFX10-NEXT: v_readlane_b32 s34, v40, 0 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_readlane_b32 s65, v40, 3 +; GFX10-NEXT: v_readlane_b32 s64, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 4 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @external_void_func_void() call void asm sideeffect "", ""() #0 call amdgpu_gfx void @external_void_func_void() @@ -81,21 +81,21 @@ ; GFX9-LABEL: void_func_void_clobber_s30_s31: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b64 s[4:5], s[30:31] +; GFX9-NEXT: s_mov_b64 s[36:37], s[30:31] ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; clobber ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[36:37] ; ; GFX10-LABEL: void_func_void_clobber_s30_s31: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_mov_b64 s[4:5], s[30:31] +; GFX10-NEXT: s_mov_b64 s[36:37], s[30:31] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; clobber ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[36:37] call void asm sideeffect "; clobber", "~{s[30:31]}"() #0 ret void } @@ -104,75 +104,75 @@ ; GFX9-LABEL: test_call_void_func_void_mayclobber_s31: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v40, s33, 3 -; GFX9-NEXT: v_writelane_b32 v40, s34, 0 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 1 -; GFX9-NEXT: v_writelane_b32 v40, s31, 2 +; GFX9-NEXT: v_writelane_b32 v40, s64, 1 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s31 ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s34, s31 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 1 -; GFX9-NEXT: s_mov_b32 s31, s34 +; GFX9-NEXT: v_writelane_b32 v40, s65, 2 +; GFX9-NEXT: s_mov_b32 s4, s31 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_void@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_void@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: s_mov_b32 s31, s4 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s31 ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: v_readlane_b32 s5, v40, 2 -; GFX9-NEXT: v_readlane_b32 s34, v40, 0 +; GFX9-NEXT: v_readlane_b32 s65, v40, 2 +; GFX9-NEXT: v_readlane_b32 s64, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 3 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: 
test_call_void_func_void_mayclobber_s31: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_writelane_b32 v40, s33, 3 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s34, 0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 -; GFX10-NEXT: v_writelane_b32 v40, s31, 2 +; GFX10-NEXT: s_getpc_b64 s[34:35] +; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s31 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_mov_b32 s34, s31 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 1 -; GFX10-NEXT: s_mov_b32 s31, s34 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, s31 +; GFX10-NEXT: v_writelane_b32 v40, s64, 1 +; GFX10-NEXT: v_writelane_b32 v40, s65, 2 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-NEXT: s_mov_b32 s31, s4 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use s31 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_readlane_b32 s5, v40, 2 -; GFX10-NEXT: v_readlane_b32 s34, v40, 0 +; GFX10-NEXT: v_readlane_b32 s65, v40, 2 +; GFX10-NEXT: v_readlane_b32 s64, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 3 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded 
Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] %s31 = call i32 asm sideeffect "; def $0", "={s31}"() call amdgpu_gfx void @external_void_func_void() call void asm sideeffect "; use $0", "{s31}"(i32 %s31) @@ -183,46 +183,46 @@ ; GFX9-LABEL: test_call_void_func_void_mayclobber_v31: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def v31 ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: v_mov_b32_e32 v41, v31 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_void@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_void@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] ; GFX9-NEXT: v_mov_b32_e32 v31, v41 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v31 ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: 
v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_void_func_void_mayclobber_v31: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 @@ -230,28 +230,28 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v31 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 ; GFX10-NEXT: v_mov_b32_e32 v41, v31 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_void@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_void@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] ; GFX10-NEXT: v_mov_b32_e32 v31, v41 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v31 ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; 
GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] %v31 = call i32 asm sideeffect "; def $0", "={v31}"() call amdgpu_gfx void @external_void_func_void() call void asm sideeffect "; use $0", "{v31}"(i32 %v31) @@ -263,67 +263,75 @@ ; GFX9-LABEL: test_call_void_func_void_preserves_s33: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v40, s33, 2 +; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: v_writelane_b32 v40, s33, 3 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s64, 1 +; GFX9-NEXT: v_writelane_b32 v40, s65, 2 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s33 ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s4, s33 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_void@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, 
external_void_func_void@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s33 ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s65, v40, 2 +; GFX9-NEXT: v_readlane_b32 s64, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: v_readlane_b32 s33, v40, 3 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_void_func_void_preserves_s33: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v40, s33, 2 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: v_writelane_b32 v40, s33, 3 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_void@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_void@rel32@hi+12 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s33 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; 
GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, s33 +; GFX10-NEXT: v_writelane_b32 v40, s64, 1 +; GFX10-NEXT: v_writelane_b32 v40, s65, 2 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: s_mov_b32 s33, s4 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use s33 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s65, v40, 2 +; GFX10-NEXT: v_readlane_b32 s64, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: v_readlane_b32 s33, v40, 3 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] %s33 = call i32 asm sideeffect "; def $0", "={s33}"() call amdgpu_gfx void @external_void_func_void() call void asm sideeffect "; use $0", "{s33}"(i32 %s33) @@ -334,71 +342,75 @@ ; GFX9-LABEL: test_call_void_func_void_preserves_s34: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v40, s33, 3 -; GFX9-NEXT: v_writelane_b32 v40, s34, 0 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 1 -; GFX9-NEXT: v_writelane_b32 v40, s31, 2 +; GFX9-NEXT: v_writelane_b32 v40, s64, 1 +; GFX9-NEXT: v_writelane_b32 v40, s65, 2 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s34 ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: 
s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 1 +; GFX9-NEXT: s_mov_b32 s4, s34 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_void@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_void@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: s_mov_b32 s34, s4 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s34 ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: v_readlane_b32 s5, v40, 2 -; GFX9-NEXT: v_readlane_b32 s34, v40, 0 +; GFX9-NEXT: v_readlane_b32 s65, v40, 2 +; GFX9-NEXT: v_readlane_b32 s64, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 3 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_void_func_void_preserves_s34: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_writelane_b32 v40, s33, 3 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s34, 0 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: 
s_add_u32 s30, s30, external_void_func_void@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_void@rel32@hi+12 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s34 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 -; GFX10-NEXT: v_writelane_b32 v40, s31, 2 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 1 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, s34 +; GFX10-NEXT: v_writelane_b32 v40, s64, 1 +; GFX10-NEXT: v_writelane_b32 v40, s65, 2 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: s_mov_b32 s34, s4 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use s34 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_readlane_b32 s5, v40, 2 -; GFX10-NEXT: v_readlane_b32 s34, v40, 0 +; GFX10-NEXT: v_readlane_b32 s65, v40, 2 +; GFX10-NEXT: v_readlane_b32 s64, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 3 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] %s34 = call i32 asm sideeffect "; def $0", "={s34}"() call amdgpu_gfx void @external_void_func_void() call void asm sideeffect "; use $0", "{s34}"(i32 %s34) @@ -409,44 +421,44 @@ ; GFX9-LABEL: test_call_void_func_void_preserves_v40: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v41, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 
s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v41, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s64, 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: v_writelane_b32 v41, s31, 1 +; GFX9-NEXT: v_writelane_b32 v41, s65, 1 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def v40 ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_void@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_void@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v40 ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s4, v41, 0 -; GFX9-NEXT: v_readlane_b32 s5, v41, 1 +; GFX9-NEXT: v_readlane_b32 s65, v41, 1 +; GFX9-NEXT: v_readlane_b32 s64, v41, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v41, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_void_func_void_preserves_v40: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_writelane_b32 v41, s33, 2 ; GFX10-NEXT: 
s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 @@ -454,26 +466,26 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v40 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_writelane_b32 v41, s30, 0 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v41, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: v_writelane_b32 v41, s64, 0 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_void@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_void@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v41, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v40 ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX10-NEXT: v_readlane_b32 s4, v41, 0 -; GFX10-NEXT: v_readlane_b32 s5, v41, 1 +; GFX10-NEXT: v_readlane_b32 s65, v41, 1 +; GFX10-NEXT: v_readlane_b32 s64, v41, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v41, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] %v40 = call i32 asm sideeffect "; def $0", "={v40}"() call amdgpu_gfx void @external_void_func_void() call void asm sideeffect "; use $0", "{v40}"(i32 %v40) @@ -568,55 +580,55 @@ ; GFX9-LABEL: test_call_void_func_void_clobber_s33: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte 
Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, void_func_void_clobber_s33@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, void_func_void_clobber_s33@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, void_func_void_clobber_s33@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, void_func_void_clobber_s33@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_void_func_void_clobber_s33: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 
s4, s4, void_func_void_clobber_s33@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, void_func_void_clobber_s33@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, void_func_void_clobber_s33@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, void_func_void_clobber_s33@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @void_func_void_clobber_s33() ret void } @@ -625,55 +637,55 @@ ; GFX9-LABEL: test_call_void_func_void_clobber_s34: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, void_func_void_clobber_s34@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, void_func_void_clobber_s34@rel32@hi+12 
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_writelane_b32 v40, s64, 0 +; GFX9-NEXT: v_writelane_b32 v40, s65, 1 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, void_func_void_clobber_s34@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, void_func_void_clobber_s34@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: v_readlane_b32 s65, v40, 1 +; GFX9-NEXT: v_readlane_b32 s64, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: test_call_void_func_void_clobber_s34: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, void_func_void_clobber_s34@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, void_func_void_clobber_s34@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, void_func_void_clobber_s34@rel32@lo+4 +; GFX10-NEXT: 
s_addc_u32 s31, s31, void_func_void_clobber_s34@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s64, 0 +; GFX10-NEXT: v_writelane_b32 v40, s65, 1 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: v_readlane_b32 s65, v40, 1 +; GFX10-NEXT: v_readlane_b32 s64, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] call amdgpu_gfx void @void_func_void_clobber_s34() ret void } @@ -682,71 +694,73 @@ ; GFX9-LABEL: callee_saved_sgpr_kernel: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v40, s33, 3 -; GFX9-NEXT: v_writelane_b32 v40, s40, 0 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 1 -; GFX9-NEXT: v_writelane_b32 v40, s31, 2 +; GFX9-NEXT: v_writelane_b32 v40, s64, 1 +; GFX9-NEXT: v_writelane_b32 v40, s65, 2 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s40 ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s4, v40, 1 +; GFX9-NEXT: s_mov_b32 s4, s40 +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_void@rel32@lo+4 +; GFX9-NEXT: 
s_addc_u32 s31, s31, external_void_func_void@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s40 +; GFX9-NEXT: ; use s4 ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: v_readlane_b32 s5, v40, 2 -; GFX9-NEXT: v_readlane_b32 s40, v40, 0 +; GFX9-NEXT: v_readlane_b32 s65, v40, 2 +; GFX9-NEXT: v_readlane_b32 s64, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 3 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: callee_saved_sgpr_kernel: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_writelane_b32 v40, s33, 3 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s40, 0 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_void@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_void@rel32@hi+12 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s40 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 -; GFX10-NEXT: v_writelane_b32 v40, s31, 2 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s4, v40, 1 +; 
GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, s40 +; GFX10-NEXT: v_writelane_b32 v40, s64, 1 +; GFX10-NEXT: v_writelane_b32 v40, s65, 2 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] ; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use s40 +; GFX10-NEXT: ; use s4 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_readlane_b32 s5, v40, 2 -; GFX10-NEXT: v_readlane_b32 s40, v40, 0 +; GFX10-NEXT: v_readlane_b32 s65, v40, 2 +; GFX10-NEXT: v_readlane_b32 s64, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 3 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 s[64:65] %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0 call amdgpu_gfx void @external_void_func_void() call void asm sideeffect "; use $0", "s"(i32 %s40) #0 @@ -757,89 +771,91 @@ ; GFX9-LABEL: callee_saved_sgpr_vgpr_kernel: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v40, s33, 3 -; GFX9-NEXT: v_writelane_b32 v40, s40, 0 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 1 +; GFX9-NEXT: v_writelane_b32 v40, s64, 1 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: v_writelane_b32 v40, s31, 2 +; GFX9-NEXT: v_writelane_b32 v40, s65, 2 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s40 ; 
GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s4, s40 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def v32 ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: v_mov_b32_e32 v41, v32 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_void@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_void@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s40 +; GFX9-NEXT: ; use s4 ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v41 ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s4, v40, 1 -; GFX9-NEXT: v_readlane_b32 s5, v40, 2 -; GFX9-NEXT: v_readlane_b32 s40, v40, 0 +; GFX9-NEXT: v_readlane_b32 s65, v40, 2 +; GFX9-NEXT: v_readlane_b32 s64, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: v_readlane_b32 s33, v40, 3 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_setpc_b64 s[64:65] ; ; GFX10-LABEL: callee_saved_sgpr_vgpr_kernel: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: s_or_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_writelane_b32 v40, s33, 3 ; GFX10-NEXT: 
s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s40, 0 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s40 ; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, s40 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v32 ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: v_mov_b32_e32 v41, v32 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 -; GFX10-NEXT: v_writelane_b32 v40, s31, 2 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_void@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_void@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s64, 1 +; GFX10-NEXT: v_writelane_b32 v40, s65, 2 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] ; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use s40 +; GFX10-NEXT: ; use s4 ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v41 ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX10-NEXT: v_readlane_b32 s4, v40, 1 -; GFX10-NEXT: v_readlane_b32 s5, v40, 2 -; GFX10-NEXT: v_readlane_b32 s40, v40, 0 +; GFX10-NEXT: v_readlane_b32 s65, v40, 2 +; GFX10-NEXT: v_readlane_b32 s64, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: v_readlane_b32 s33, v40, 3 -; GFX10-NEXT: s_or_saveexec_b32 s6, -1 +; GFX10-NEXT: s_or_saveexec_b32 s30, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s6 +; GFX10-NEXT: s_mov_b32 exec_lo, s30 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_setpc_b64 
s[64:65] %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0 %v32 = call i32 asm sideeffect "; def v32", "={v32}"() #0 call amdgpu_gfx void @external_void_func_void() diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -23,27 +23,27 @@ ; GFX9-LABEL: call_i1: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_getpc_b64 s[6:7] -; GFX9-NEXT: s_add_u32 s6, s6, return_i1@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s7, s7, return_i1@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[30:31] +; GFX9-NEXT: s_mov_b64 s[36:37], s[30:31] +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, return_i1@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, return_i1@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: s_setpc_b64 s[36:37] ; ; GFX10-LABEL: call_i1: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_getpc_b64 s[6:7] -; GFX10-NEXT: s_add_u32 s6, s6, return_i1@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s7, s7, return_i1@gotpcrel32@hi+12 -; GFX10-NEXT: s_mov_b64 s[4:5], s[30:31] -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; GFX10-NEXT: s_mov_b64 s[36:37], s[30:31] +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, return_i1@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, return_i1@gotpcrel32@hi+12 +; GFX10-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX10-NEXT: s_setpc_b64 s[4:5] +; 
GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: s_setpc_b64 s[36:37] entry: call amdgpu_gfx i1 @return_i1() ret void @@ -70,27 +70,27 @@ ; GFX9-LABEL: call_i16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_getpc_b64 s[6:7] -; GFX9-NEXT: s_add_u32 s6, s6, return_i16@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s7, s7, return_i16@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[30:31] +; GFX9-NEXT: s_mov_b64 s[36:37], s[30:31] +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, return_i16@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, return_i16@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: s_setpc_b64 s[36:37] ; ; GFX10-LABEL: call_i16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_getpc_b64 s[6:7] -; GFX10-NEXT: s_add_u32 s6, s6, return_i16@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s7, s7, return_i16@gotpcrel32@hi+12 -; GFX10-NEXT: s_mov_b64 s[4:5], s[30:31] -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; GFX10-NEXT: s_mov_b64 s[36:37], s[30:31] +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, return_i16@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, return_i16@gotpcrel32@hi+12 +; GFX10-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: s_setpc_b64 s[36:37] entry: call amdgpu_gfx i16 @return_i16() ret void @@ -117,27 +117,27 @@ ; GFX9-LABEL: call_2xi16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_getpc_b64 
s[6:7] -; GFX9-NEXT: s_add_u32 s6, s6, return_2xi16@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s7, s7, return_2xi16@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[30:31] +; GFX9-NEXT: s_mov_b64 s[36:37], s[30:31] +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, return_2xi16@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, return_2xi16@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: s_setpc_b64 s[36:37] ; ; GFX10-LABEL: call_2xi16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_getpc_b64 s[6:7] -; GFX10-NEXT: s_add_u32 s6, s6, return_2xi16@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s7, s7, return_2xi16@gotpcrel32@hi+12 -; GFX10-NEXT: s_mov_b64 s[4:5], s[30:31] -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; GFX10-NEXT: s_mov_b64 s[36:37], s[30:31] +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, return_2xi16@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, return_2xi16@gotpcrel32@hi+12 +; GFX10-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: s_setpc_b64 s[36:37] entry: call amdgpu_gfx <2 x i16> @return_2xi16() ret void @@ -166,27 +166,27 @@ ; GFX9-LABEL: call_3xi16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_getpc_b64 s[6:7] -; GFX9-NEXT: s_add_u32 s6, s6, return_3xi16@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s7, s7, return_3xi16@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; GFX9-NEXT: s_mov_b64 s[4:5], s[30:31] +; GFX9-NEXT: s_mov_b64 
s[36:37], s[30:31] +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, return_3xi16@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, return_3xi16@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-NEXT: s_setpc_b64 s[36:37] ; ; GFX10-LABEL: call_3xi16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_getpc_b64 s[6:7] -; GFX10-NEXT: s_add_u32 s6, s6, return_3xi16@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s7, s7, return_3xi16@gotpcrel32@hi+12 -; GFX10-NEXT: s_mov_b64 s[4:5], s[30:31] -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; GFX10-NEXT: s_mov_b64 s[36:37], s[30:31] +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, return_3xi16@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, return_3xi16@gotpcrel32@hi+12 +; GFX10-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX10-NEXT: s_setpc_b64 s[36:37] entry: call amdgpu_gfx <3 x i16> @return_3xi16() ret void @@ -1241,41 +1241,41 @@ ; GFX9-LABEL: call_512xi32: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s33 +; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_add_i32 s33, s32, 0x1ffc0 ; GFX9-NEXT: s_and_b32 s33, s33, 0xfffe0000 ; GFX9-NEXT: s_add_i32 s32, s32, 0x60000 -; GFX9-NEXT: s_getpc_b64 s[6:7] -; GFX9-NEXT: s_add_u32 s6, s6, return_512xi32@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s7, s7, return_512xi32@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; GFX9-NEXT: s_mov_b64 s[36:37], s[30:31] +; GFX9-NEXT: s_getpc_b64 s[30:31] +; GFX9-NEXT: s_add_u32 s30, s30, 
return_512xi32@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s31, s31, return_512xi32@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0 ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 -; GFX9-NEXT: s_mov_b64 s[4:5], s[30:31] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] ; GFX9-NEXT: s_add_i32 s32, s32, 0xfffa0000 -; GFX9-NEXT: s_mov_b32 s33, s8 -; GFX9-NEXT: s_setpc_b64 s[4:5] +; GFX9-NEXT: s_mov_b32 s33, s34 +; GFX9-NEXT: s_setpc_b64 s[36:37] ; ; GFX10-LABEL: call_512xi32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_mov_b32 s8, s33 +; GFX10-NEXT: s_mov_b32 s34, s33 ; GFX10-NEXT: s_add_i32 s33, s32, 0xffe0 -; GFX10-NEXT: s_mov_b64 s[4:5], s[30:31] +; GFX10-NEXT: s_mov_b64 s[36:37], s[30:31] ; GFX10-NEXT: s_and_b32 s33, s33, 0xffff0000 ; GFX10-NEXT: s_add_i32 s32, s32, 0x30000 -; GFX10-NEXT: s_getpc_b64 s[6:7] -; GFX10-NEXT: s_add_u32 s6, s6, return_512xi32@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s7, s7, return_512xi32@gotpcrel32@hi+12 +; GFX10-NEXT: s_getpc_b64 s[30:31] +; GFX10-NEXT: s_add_u32 s30, s30, return_512xi32@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s31, s31, return_512xi32@gotpcrel32@hi+12 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 -; GFX10-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] ; GFX10-NEXT: s_add_i32 s32, s32, 0xfffd0000 -; GFX10-NEXT: s_mov_b32 s33, s8 -; GFX10-NEXT: s_setpc_b64 s[4:5] +; GFX10-NEXT: s_mov_b32 s33, s34 +; GFX10-NEXT: s_setpc_b64 s[36:37] entry: call amdgpu_gfx <512 x i32> @return_512xi32() ret void diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ 
b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -560,41 +560,90 @@ ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_writelane_b32 v40, s33, 6 +; GCN-NEXT: v_writelane_b32 v40, s33, 30 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s34, 0 ; GCN-NEXT: v_writelane_b32 v40, s35, 1 ; GCN-NEXT: v_writelane_b32 v40, s36, 2 ; GCN-NEXT: v_writelane_b32 v40, s37, 3 -; GCN-NEXT: v_writelane_b32 v40, s30, 4 -; GCN-NEXT: v_writelane_b32 v40, s31, 5 -; GCN-NEXT: s_mov_b64 s[34:35], exec -; GCN-NEXT: BB6_1: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s6, v0 -; GCN-NEXT: v_readfirstlane_b32 s7, v1 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[0:1] -; GCN-NEXT: s_and_saveexec_b64 s[36:37], vcc +; GCN-NEXT: v_writelane_b32 v40, s38, 4 +; GCN-NEXT: v_writelane_b32 v40, s39, 5 +; GCN-NEXT: v_writelane_b32 v40, s40, 6 +; GCN-NEXT: v_writelane_b32 v40, s41, 7 +; GCN-NEXT: v_writelane_b32 v40, s42, 8 +; GCN-NEXT: v_writelane_b32 v40, s43, 9 +; GCN-NEXT: v_writelane_b32 v40, s44, 10 +; GCN-NEXT: v_writelane_b32 v40, s45, 11 +; GCN-NEXT: v_writelane_b32 v40, s46, 12 +; GCN-NEXT: v_writelane_b32 v40, s47, 13 +; GCN-NEXT: v_writelane_b32 v40, s48, 14 +; GCN-NEXT: v_writelane_b32 v40, s49, 15 +; GCN-NEXT: v_writelane_b32 v40, s50, 16 +; GCN-NEXT: v_writelane_b32 v40, s51, 17 +; GCN-NEXT: v_writelane_b32 v40, s52, 18 +; GCN-NEXT: v_writelane_b32 v40, s53, 19 +; GCN-NEXT: v_writelane_b32 v40, s54, 20 +; GCN-NEXT: v_writelane_b32 v40, s55, 21 +; GCN-NEXT: v_writelane_b32 v40, s56, 22 +; GCN-NEXT: v_writelane_b32 v40, s57, 23 +; GCN-NEXT: v_writelane_b32 v40, s58, 24 +; GCN-NEXT: v_writelane_b32 v40, s59, 25 +; GCN-NEXT: v_writelane_b32 v40, s60, 26 +; GCN-NEXT: v_writelane_b32 v40, s61, 27 +; GCN-NEXT: v_writelane_b32 v40, s62, 28 +; GCN-NEXT: v_writelane_b32 v40, s63, 29 +; GCN-NEXT: s_mov_b64 s[6:7], 
s[30:31] +; GCN-NEXT: s_mov_b64 s[8:9], exec ; GCN-NEXT: s_movk_i32 s4, 0x7b -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: BB6_1: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: v_readfirstlane_b32 s12, v0 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[12:13], v[0:1] +; GCN-NEXT: s_and_saveexec_b64 s[10:11], vcc +; GCN-NEXT: s_swappc_b64 s[30:31], s[12:13] ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_xor_b64 exec, exec, s[36:37] +; GCN-NEXT: s_xor_b64 exec, exec, s[10:11] ; GCN-NEXT: s_cbranch_execnz BB6_1 ; GCN-NEXT: ; %bb.2: -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: v_readlane_b32 s4, v40, 4 -; GCN-NEXT: v_readlane_b32 s5, v40, 5 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_readlane_b32 s63, v40, 29 +; GCN-NEXT: v_readlane_b32 s62, v40, 28 +; GCN-NEXT: v_readlane_b32 s61, v40, 27 +; GCN-NEXT: v_readlane_b32 s60, v40, 26 +; GCN-NEXT: v_readlane_b32 s59, v40, 25 +; GCN-NEXT: v_readlane_b32 s58, v40, 24 +; GCN-NEXT: v_readlane_b32 s57, v40, 23 +; GCN-NEXT: v_readlane_b32 s56, v40, 22 +; GCN-NEXT: v_readlane_b32 s55, v40, 21 +; GCN-NEXT: v_readlane_b32 s54, v40, 20 +; GCN-NEXT: v_readlane_b32 s53, v40, 19 +; GCN-NEXT: v_readlane_b32 s52, v40, 18 +; GCN-NEXT: v_readlane_b32 s51, v40, 17 +; GCN-NEXT: v_readlane_b32 s50, v40, 16 +; GCN-NEXT: v_readlane_b32 s49, v40, 15 +; GCN-NEXT: v_readlane_b32 s48, v40, 14 +; GCN-NEXT: v_readlane_b32 s47, v40, 13 +; GCN-NEXT: v_readlane_b32 s46, v40, 12 +; GCN-NEXT: v_readlane_b32 s45, v40, 11 +; GCN-NEXT: v_readlane_b32 s44, v40, 10 +; GCN-NEXT: v_readlane_b32 s43, v40, 9 +; GCN-NEXT: v_readlane_b32 s42, v40, 8 +; GCN-NEXT: v_readlane_b32 s41, v40, 7 +; GCN-NEXT: v_readlane_b32 s40, v40, 6 +; GCN-NEXT: v_readlane_b32 s39, v40, 5 +; GCN-NEXT: v_readlane_b32 s38, v40, 4 ; GCN-NEXT: v_readlane_b32 s37, v40, 3 ; GCN-NEXT: v_readlane_b32 s36, v40, 2 ; GCN-NEXT: v_readlane_b32 s35, v40, 1 ; GCN-NEXT: v_readlane_b32 s34, v40, 0 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; 
GCN-NEXT: v_readlane_b32 s33, v40, 6 -; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: v_readlane_b32 s33, v40, 30 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[4:5] +; GCN-NEXT: s_setpc_b64 s[6:7] call amdgpu_gfx void %fptr(i32 inreg 123) ret void } @@ -606,7 +655,7 @@ ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_writelane_b32 v40, s33, 6 +; GCN-NEXT: v_writelane_b32 v40, s33, 30 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill @@ -614,32 +663,81 @@ ; GCN-NEXT: v_writelane_b32 v40, s35, 1 ; GCN-NEXT: v_writelane_b32 v40, s36, 2 ; GCN-NEXT: v_writelane_b32 v40, s37, 3 -; GCN-NEXT: v_writelane_b32 v40, s30, 4 -; GCN-NEXT: v_writelane_b32 v40, s31, 5 +; GCN-NEXT: v_writelane_b32 v40, s38, 4 +; GCN-NEXT: v_writelane_b32 v40, s39, 5 +; GCN-NEXT: v_writelane_b32 v40, s40, 6 +; GCN-NEXT: v_writelane_b32 v40, s41, 7 +; GCN-NEXT: v_writelane_b32 v40, s42, 8 +; GCN-NEXT: v_writelane_b32 v40, s43, 9 +; GCN-NEXT: v_writelane_b32 v40, s44, 10 +; GCN-NEXT: v_writelane_b32 v40, s45, 11 +; GCN-NEXT: v_writelane_b32 v40, s46, 12 +; GCN-NEXT: v_writelane_b32 v40, s47, 13 +; GCN-NEXT: v_writelane_b32 v40, s48, 14 +; GCN-NEXT: v_writelane_b32 v40, s49, 15 +; GCN-NEXT: v_writelane_b32 v40, s50, 16 +; GCN-NEXT: v_writelane_b32 v40, s51, 17 +; GCN-NEXT: v_writelane_b32 v40, s52, 18 +; GCN-NEXT: v_writelane_b32 v40, s53, 19 +; GCN-NEXT: v_writelane_b32 v40, s54, 20 +; GCN-NEXT: v_writelane_b32 v40, s55, 21 +; GCN-NEXT: v_writelane_b32 v40, s56, 22 +; GCN-NEXT: v_writelane_b32 v40, s57, 23 +; GCN-NEXT: v_writelane_b32 v40, s58, 24 +; GCN-NEXT: v_writelane_b32 v40, s59, 
25 +; GCN-NEXT: v_writelane_b32 v40, s60, 26 +; GCN-NEXT: v_writelane_b32 v40, s61, 27 +; GCN-NEXT: v_writelane_b32 v40, s62, 28 +; GCN-NEXT: v_writelane_b32 v40, s63, 29 +; GCN-NEXT: s_mov_b64 s[4:5], s[30:31] ; GCN-NEXT: v_mov_b32_e32 v41, v0 -; GCN-NEXT: s_mov_b64 s[34:35], exec +; GCN-NEXT: s_mov_b64 s[6:7], exec ; GCN-NEXT: BB7_1: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s4, v1 -; GCN-NEXT: v_readfirstlane_b32 s5, v2 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[1:2] -; GCN-NEXT: s_and_saveexec_b64 s[36:37], vcc +; GCN-NEXT: v_readfirstlane_b32 s10, v1 +; GCN-NEXT: v_readfirstlane_b32 s11, v2 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[1:2] +; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GCN-NEXT: v_mov_b32_e32 v0, v41 -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_swappc_b64 s[30:31], s[10:11] ; GCN-NEXT: ; implicit-def: $vgpr1_vgpr2 -; GCN-NEXT: s_xor_b64 exec, exec, s[36:37] +; GCN-NEXT: s_xor_b64 exec, exec, s[8:9] ; GCN-NEXT: s_cbranch_execnz BB7_1 ; GCN-NEXT: ; %bb.2: -; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: v_mov_b32_e32 v0, v41 -; GCN-NEXT: v_readlane_b32 s4, v40, 4 -; GCN-NEXT: v_readlane_b32 s5, v40, 5 +; GCN-NEXT: v_readlane_b32 s63, v40, 29 +; GCN-NEXT: v_readlane_b32 s62, v40, 28 +; GCN-NEXT: v_readlane_b32 s61, v40, 27 +; GCN-NEXT: v_readlane_b32 s60, v40, 26 +; GCN-NEXT: v_readlane_b32 s59, v40, 25 +; GCN-NEXT: v_readlane_b32 s58, v40, 24 +; GCN-NEXT: v_readlane_b32 s57, v40, 23 +; GCN-NEXT: v_readlane_b32 s56, v40, 22 +; GCN-NEXT: v_readlane_b32 s55, v40, 21 +; GCN-NEXT: v_readlane_b32 s54, v40, 20 +; GCN-NEXT: v_readlane_b32 s53, v40, 19 +; GCN-NEXT: v_readlane_b32 s52, v40, 18 +; GCN-NEXT: v_readlane_b32 s51, v40, 17 +; GCN-NEXT: v_readlane_b32 s50, v40, 16 +; GCN-NEXT: v_readlane_b32 s49, v40, 15 +; GCN-NEXT: v_readlane_b32 s48, v40, 14 +; GCN-NEXT: v_readlane_b32 s47, v40, 13 +; GCN-NEXT: v_readlane_b32 s46, v40, 12 +; GCN-NEXT: v_readlane_b32 s45, v40, 11 
+; GCN-NEXT: v_readlane_b32 s44, v40, 10 +; GCN-NEXT: v_readlane_b32 s43, v40, 9 +; GCN-NEXT: v_readlane_b32 s42, v40, 8 +; GCN-NEXT: v_readlane_b32 s41, v40, 7 +; GCN-NEXT: v_readlane_b32 s40, v40, 6 +; GCN-NEXT: v_readlane_b32 s39, v40, 5 +; GCN-NEXT: v_readlane_b32 s38, v40, 4 ; GCN-NEXT: v_readlane_b32 s37, v40, 3 ; GCN-NEXT: v_readlane_b32 s36, v40, 2 ; GCN-NEXT: v_readlane_b32 s35, v40, 1 ; GCN-NEXT: v_readlane_b32 s34, v40, 0 ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v40, 6 +; GCN-NEXT: v_readlane_b32 s33, v40, 30 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] @@ -660,38 +758,87 @@ ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_writelane_b32 v40, s33, 6 +; GCN-NEXT: v_writelane_b32 v40, s33, 30 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s34, 0 ; GCN-NEXT: v_writelane_b32 v40, s35, 1 ; GCN-NEXT: v_writelane_b32 v40, s36, 2 ; GCN-NEXT: v_writelane_b32 v40, s37, 3 -; GCN-NEXT: v_writelane_b32 v40, s30, 4 -; GCN-NEXT: v_writelane_b32 v40, s31, 5 -; GCN-NEXT: s_mov_b64 s[34:35], exec +; GCN-NEXT: v_writelane_b32 v40, s38, 4 +; GCN-NEXT: v_writelane_b32 v40, s39, 5 +; GCN-NEXT: v_writelane_b32 v40, s40, 6 +; GCN-NEXT: v_writelane_b32 v40, s41, 7 +; GCN-NEXT: v_writelane_b32 v40, s42, 8 +; GCN-NEXT: v_writelane_b32 v40, s43, 9 +; GCN-NEXT: v_writelane_b32 v40, s44, 10 +; GCN-NEXT: v_writelane_b32 v40, s45, 11 +; GCN-NEXT: v_writelane_b32 v40, s46, 12 +; GCN-NEXT: v_writelane_b32 v40, s47, 13 +; GCN-NEXT: v_writelane_b32 v40, s48, 14 +; GCN-NEXT: v_writelane_b32 v40, s49, 15 +; GCN-NEXT: v_writelane_b32 v40, s50, 16 +; GCN-NEXT: v_writelane_b32 v40, s51, 17 +; GCN-NEXT: v_writelane_b32 
v40, s52, 18 +; GCN-NEXT: v_writelane_b32 v40, s53, 19 +; GCN-NEXT: v_writelane_b32 v40, s54, 20 +; GCN-NEXT: v_writelane_b32 v40, s55, 21 +; GCN-NEXT: v_writelane_b32 v40, s56, 22 +; GCN-NEXT: v_writelane_b32 v40, s57, 23 +; GCN-NEXT: v_writelane_b32 v40, s58, 24 +; GCN-NEXT: v_writelane_b32 v40, s59, 25 +; GCN-NEXT: v_writelane_b32 v40, s60, 26 +; GCN-NEXT: v_writelane_b32 v40, s61, 27 +; GCN-NEXT: v_writelane_b32 v40, s62, 28 +; GCN-NEXT: v_writelane_b32 v40, s63, 29 +; GCN-NEXT: s_mov_b64 s[4:5], s[30:31] +; GCN-NEXT: s_mov_b64 s[6:7], exec ; GCN-NEXT: BB8_1: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s4, v1 -; GCN-NEXT: v_readfirstlane_b32 s5, v2 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[1:2] -; GCN-NEXT: s_and_saveexec_b64 s[36:37], vcc -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_readfirstlane_b32 s10, v1 +; GCN-NEXT: v_readfirstlane_b32 s11, v2 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[1:2] +; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GCN-NEXT: s_swappc_b64 s[30:31], s[10:11] ; GCN-NEXT: v_mov_b32_e32 v3, v0 ; GCN-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: s_xor_b64 exec, exec, s[36:37] +; GCN-NEXT: s_xor_b64 exec, exec, s[8:9] ; GCN-NEXT: s_cbranch_execnz BB8_1 ; GCN-NEXT: ; %bb.2: -; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: v_mov_b32_e32 v0, v3 -; GCN-NEXT: v_readlane_b32 s4, v40, 4 -; GCN-NEXT: v_readlane_b32 s5, v40, 5 +; GCN-NEXT: v_readlane_b32 s63, v40, 29 +; GCN-NEXT: v_readlane_b32 s62, v40, 28 +; GCN-NEXT: v_readlane_b32 s61, v40, 27 +; GCN-NEXT: v_readlane_b32 s60, v40, 26 +; GCN-NEXT: v_readlane_b32 s59, v40, 25 +; GCN-NEXT: v_readlane_b32 s58, v40, 24 +; GCN-NEXT: v_readlane_b32 s57, v40, 23 +; GCN-NEXT: v_readlane_b32 s56, v40, 22 +; GCN-NEXT: v_readlane_b32 s55, v40, 21 +; GCN-NEXT: v_readlane_b32 s54, v40, 20 +; GCN-NEXT: v_readlane_b32 s53, v40, 19 +; GCN-NEXT: v_readlane_b32 s52, v40, 18 +; GCN-NEXT: 
v_readlane_b32 s51, v40, 17 +; GCN-NEXT: v_readlane_b32 s50, v40, 16 +; GCN-NEXT: v_readlane_b32 s49, v40, 15 +; GCN-NEXT: v_readlane_b32 s48, v40, 14 +; GCN-NEXT: v_readlane_b32 s47, v40, 13 +; GCN-NEXT: v_readlane_b32 s46, v40, 12 +; GCN-NEXT: v_readlane_b32 s45, v40, 11 +; GCN-NEXT: v_readlane_b32 s44, v40, 10 +; GCN-NEXT: v_readlane_b32 s43, v40, 9 +; GCN-NEXT: v_readlane_b32 s42, v40, 8 +; GCN-NEXT: v_readlane_b32 s41, v40, 7 +; GCN-NEXT: v_readlane_b32 s40, v40, 6 +; GCN-NEXT: v_readlane_b32 s39, v40, 5 +; GCN-NEXT: v_readlane_b32 s38, v40, 4 ; GCN-NEXT: v_readlane_b32 s37, v40, 3 ; GCN-NEXT: v_readlane_b32 s36, v40, 2 ; GCN-NEXT: v_readlane_b32 s35, v40, 1 ; GCN-NEXT: v_readlane_b32 s34, v40, 0 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v40, 6 +; GCN-NEXT: v_readlane_b32 s33, v40, 30 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll --- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll @@ -8,16 +8,16 @@ define amdgpu_kernel void @s_input_output_i128() { ; GFX908-LABEL: name: s_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5111818 /* regdef:SGPR_128 */, def %4 - ; GFX908: [[COPY:%[0-9]+]]:sgpr_128 = COPY %4 - ; GFX908: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5111817 /* reguse:SGPR_128 */, [[COPY]] - ; GFX908: S_ENDPGM 0 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5439498 /* regdef:SGPR_128 */, def %4 + ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %4 + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5439497 /* reguse:SGPR_128 */, [[COPY]] + ; GFX908-NEXT: S_ENDPGM 0 ; GFX90A-LABEL: name: s_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A: INLINEASM 
&"; def $0", 1 /* sideeffect attdialect */, 5111818 /* regdef:SGPR_128 */, def %4 - ; GFX90A: [[COPY:%[0-9]+]]:sgpr_128 = COPY %4 - ; GFX90A: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5111817 /* reguse:SGPR_128 */, [[COPY]] - ; GFX90A: S_ENDPGM 0 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5439498 /* regdef:SGPR_128 */, def %4 + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %4 + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5439497 /* reguse:SGPR_128 */, [[COPY]] + ; GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=s"() call void asm sideeffect "; use $0", "s"(i128 %val) ret void @@ -26,16 +26,16 @@ define amdgpu_kernel void @v_input_output_i128() { ; GFX908-LABEL: name: v_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4718602 /* regdef:VReg_128 */, def %4 - ; GFX908: [[COPY:%[0-9]+]]:vreg_128 = COPY %4 - ; GFX908: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4718601 /* reguse:VReg_128 */, [[COPY]] - ; GFX908: S_ENDPGM 0 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5046282 /* regdef:VReg_128 */, def %4 + ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %4 + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5046281 /* reguse:VReg_128 */, [[COPY]] + ; GFX908-NEXT: S_ENDPGM 0 ; GFX90A-LABEL: name: v_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4849674 /* regdef:VReg_128_Align2 */, def %4 - ; GFX90A: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %4 - ; GFX90A: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4849673 /* reguse:VReg_128_Align2 */, [[COPY]] - ; GFX90A: S_ENDPGM 0 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5177354 /* regdef:VReg_128_Align2 */, def %4 + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %4 + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* 
sideeffect attdialect */, 5177353 /* reguse:VReg_128_Align2 */, [[COPY]] + ; GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=v"() call void asm sideeffect "; use $0", "v"(i128 %val) ret void @@ -44,16 +44,16 @@ define amdgpu_kernel void @a_input_output_i128() { ; GFX908-LABEL: name: a_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4653066 /* regdef:AReg_128 */, def %4 - ; GFX908: [[COPY:%[0-9]+]]:areg_128 = COPY %4 - ; GFX908: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4653065 /* reguse:AReg_128 */, [[COPY]] - ; GFX908: S_ENDPGM 0 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4980746 /* regdef:AReg_128 */, def %4 + ; GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY %4 + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4980745 /* reguse:AReg_128 */, [[COPY]] + ; GFX908-NEXT: S_ENDPGM 0 ; GFX90A-LABEL: name: a_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4784138 /* regdef:AReg_128_Align2 */, def %4 - ; GFX90A: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %4 - ; GFX90A: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4784137 /* reguse:AReg_128_Align2 */, [[COPY]] - ; GFX90A: S_ENDPGM 0 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5111818 /* regdef:AReg_128_Align2 */, def %4 + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %4 + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5111817 /* reguse:AReg_128_Align2 */, [[COPY]] + ; GFX90A-NEXT: S_ENDPGM 0 %val = call i128 asm sideeffect "; def $0", "=a"() call void asm sideeffect "; use $0", "a"(i128 %val) ret void diff --git a/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll --- a/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll @@ -28,31 
+28,31 @@ ; Attributor adds work-group-size attribute. This should be ok. ; GFX9-LABEL: test_simple_indirect_call: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_getpc_b64 s[36:37] -; GFX9-NEXT: s_mov_b32 s36, s0 -; GFX9-NEXT: s_load_dwordx4 s[36:39], s[36:37], 0x10 +; GFX9-NEXT: s_getpc_b64 s[8:9] +; GFX9-NEXT: s_mov_b32 s8, s0 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s36, s36, s0 -; GFX9-NEXT: s_addc_u32 s37, s37, 0 -; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_add_u32 s8, s8, s0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[8:9] +; GFX9-NEXT: s_mov_b64 s[2:3], s[10:11] ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; GFX10-LABEL: test_simple_indirect_call: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_getpc_b64 s[36:37] -; GFX10-NEXT: s_mov_b32 s36, s0 +; GFX10-NEXT: s_getpc_b64 s[8:9] +; GFX10-NEXT: s_mov_b32 s8, s0 ; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_load_dwordx4 s[36:39], s[36:37], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10 ; GFX10-NEXT: s_mov_b32 s32, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bitset0_b32 s39, 21 -; GFX10-NEXT: s_add_u32 s36, s36, s0 -; GFX10-NEXT: s_addc_u32 s37, s37, 0 -; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX10-NEXT: s_bitset0_b32 s11, 21 +; GFX10-NEXT: s_add_u32 s8, s8, s0 +; GFX10-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-NEXT: s_mov_b64 s[0:1], s[8:9] +; GFX10-NEXT: s_mov_b64 s[2:3], s[10:11] ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll b/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll --- a/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll +++ b/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll @@ -17,12 +17,28 @@ ; GCN-LABEL: caller: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: v_writelane_b32 v1, s33, 1 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v1, s4, 0 ; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_mov_b32 s4, 2.0 -; GCN-NEXT: s_getpc_b64 s[6:7] -; GCN-NEXT: s_add_u32 s6, s6, callee@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s7, s7, callee@rel32@hi+12 -; GCN-NEXT: s_setpc_b64 s[6:7] +; GCN-NEXT: s_mov_b64 s[36:37], s[30:31] +; GCN-NEXT: s_getpc_b64 s[30:31] +; GCN-NEXT: s_add_u32 s30, s30, callee@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s31, s31, callee@rel32@hi+12 +; GCN-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GCN-NEXT: v_readlane_b32 s4, v1, 0 +; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: v_readlane_b32 s33, v1, 1 +; GCN-NEXT: s_or_saveexec_b64 s[30:31], -1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[30:31] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[36:37] %add = fadd float %arg0, 1.0 %call = tail call amdgpu_gfx float @callee(float %add, float inreg 2.0) ret float %call diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll --- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll @@ -5,32 +5,40 @@ define amdgpu_ps float @else1(i32 %z, float %v) #0 { ; SI-LABEL: name: else1 ; SI: bb.0.main_body: - ; SI: successors: %bb.3(0x40000000), %bb.1(0x40000000) - ; SI: liveins: $vgpr0, $vgpr1 - ; SI: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr1 - ; SI: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 - ; SI: [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GT_I32_e64 6, killed [[COPY1]], implicit $exec - ; SI: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_GT_I32_e64_]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, 
implicit $exec - ; SI: S_BRANCH %bb.3 - ; SI: bb.1.Flow: - ; SI: successors: %bb.2(0x40000000), %bb.4(0x40000000) - ; SI: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %13:vgpr_32, %bb.0, %4, %bb.3 - ; SI: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.0, undef %15:vgpr_32, %bb.3 - ; SI: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; SI: S_BRANCH %bb.2 - ; SI: bb.2.if: - ; SI: successors: %bb.4(0x80000000) - ; SI: %3:vgpr_32 = nofpexcept V_ADD_F32_e32 killed [[PHI1]], [[PHI1]], implicit $mode, implicit $exec - ; SI: S_BRANCH %bb.4 - ; SI: bb.3.else: - ; SI: successors: %bb.1(0x80000000) - ; SI: %4:vgpr_32 = nofpexcept V_MUL_F32_e32 1077936128, killed [[COPY]], implicit $mode, implicit $exec - ; SI: S_BRANCH %bb.1 - ; SI: bb.4.end: - ; SI: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[PHI]], %bb.1, %3, %bb.2 - ; SI: SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; SI: $vgpr0 = COPY killed [[PHI2]] - ; SI: SI_RETURN_TO_EPILOG killed $vgpr0 + ; SI-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000) + ; SI-NEXT: liveins: $vgpr0, $vgpr1 + ; SI-NEXT: {{ $}} + ; SI-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr1 + ; SI-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 + ; SI-NEXT: [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GT_I32_e64 6, killed [[COPY1]], implicit $exec + ; SI-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_GT_I32_e64_]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; SI-NEXT: S_BRANCH %bb.3 + ; SI-NEXT: {{ $}} + ; SI-NEXT: bb.1.Flow: + ; SI-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; SI-NEXT: {{ $}} + ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %13:vgpr_32, %bb.0, %4, %bb.3 + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.0, undef %15:vgpr_32, %bb.3 + ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, 
implicit $exec + ; SI-NEXT: S_BRANCH %bb.2 + ; SI-NEXT: {{ $}} + ; SI-NEXT: bb.2.if: + ; SI-NEXT: successors: %bb.4(0x80000000) + ; SI-NEXT: {{ $}} + ; SI-NEXT: %3:vgpr_32 = nofpexcept V_ADD_F32_e32 killed [[PHI1]], [[PHI1]], implicit $mode, implicit $exec + ; SI-NEXT: S_BRANCH %bb.4 + ; SI-NEXT: {{ $}} + ; SI-NEXT: bb.3.else: + ; SI-NEXT: successors: %bb.1(0x80000000) + ; SI-NEXT: {{ $}} + ; SI-NEXT: %4:vgpr_32 = nofpexcept V_MUL_F32_e32 1077936128, killed [[COPY]], implicit $mode, implicit $exec + ; SI-NEXT: S_BRANCH %bb.1 + ; SI-NEXT: {{ $}} + ; SI-NEXT: bb.4.end: + ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[PHI]], %bb.1, %3, %bb.2 + ; SI-NEXT: SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; SI-NEXT: $vgpr0 = COPY killed [[PHI2]] + ; SI-NEXT: SI_RETURN_TO_EPILOG killed $vgpr0 main_body: %cc = icmp sgt i32 %z, 5 br i1 %cc, label %if, label %else @@ -53,33 +61,41 @@ define amdgpu_ps float @else2(i32 %z, float %v) #0 { ; SI-LABEL: name: else2 ; SI: bb.0.main_body: - ; SI: successors: %bb.3(0x40000000), %bb.1(0x40000000) - ; SI: liveins: $vgpr0, $vgpr1 - ; SI: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr1 - ; SI: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 - ; SI: [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GT_I32_e64 6, killed [[COPY1]], implicit $exec - ; SI: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_GT_I32_e64_]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; SI: S_BRANCH %bb.3 - ; SI: bb.1.Flow: - ; SI: successors: %bb.2(0x40000000), %bb.4(0x40000000) - ; SI: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %15:vgpr_32, %bb.0, %4, %bb.3 - ; SI: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; SI: S_BRANCH %bb.2 - ; SI: bb.2.if: - ; SI: successors: %bb.4(0x80000000) - ; SI: %3:vgpr_32 = nofpexcept V_ADD_F32_e32 killed [[COPY]], [[COPY]], implicit $mode, implicit $exec - ; SI: S_BRANCH %bb.4 - ; SI: 
bb.3.else: - ; SI: successors: %bb.1(0x80000000) - ; SI: %4:vgpr_32 = nofpexcept V_MUL_F32_e32 1077936128, [[COPY]], implicit $mode, implicit $exec - ; SI: S_BRANCH %bb.1 - ; SI: bb.4.end: - ; SI: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.1, %3, %bb.2 - ; SI: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[PHI]], %bb.1, %3, %bb.2 - ; SI: SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; SI: %14:vgpr_32 = nofpexcept V_ADD_F32_e32 killed [[PHI1]], killed [[PHI2]], implicit $mode, implicit $exec - ; SI: $vgpr0 = COPY killed %14 - ; SI: SI_RETURN_TO_EPILOG killed $vgpr0 + ; SI-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000) + ; SI-NEXT: liveins: $vgpr0, $vgpr1 + ; SI-NEXT: {{ $}} + ; SI-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr1 + ; SI-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 + ; SI-NEXT: [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GT_I32_e64 6, killed [[COPY1]], implicit $exec + ; SI-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_GT_I32_e64_]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; SI-NEXT: S_BRANCH %bb.3 + ; SI-NEXT: {{ $}} + ; SI-NEXT: bb.1.Flow: + ; SI-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; SI-NEXT: {{ $}} + ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %15:vgpr_32, %bb.0, %4, %bb.3 + ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; SI-NEXT: S_BRANCH %bb.2 + ; SI-NEXT: {{ $}} + ; SI-NEXT: bb.2.if: + ; SI-NEXT: successors: %bb.4(0x80000000) + ; SI-NEXT: {{ $}} + ; SI-NEXT: %3:vgpr_32 = nofpexcept V_ADD_F32_e32 killed [[COPY]], [[COPY]], implicit $mode, implicit $exec + ; SI-NEXT: S_BRANCH %bb.4 + ; SI-NEXT: {{ $}} + ; SI-NEXT: bb.3.else: + ; SI-NEXT: successors: %bb.1(0x80000000) + ; SI-NEXT: {{ $}} + ; SI-NEXT: %4:vgpr_32 = nofpexcept V_MUL_F32_e32 1077936128, [[COPY]], implicit $mode, implicit $exec + ; SI-NEXT: S_BRANCH %bb.1 + ; SI-NEXT: {{ 
$}} + ; SI-NEXT: bb.4.end: + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.1, %3, %bb.2 + ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[PHI]], %bb.1, %3, %bb.2 + ; SI-NEXT: SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; SI-NEXT: %14:vgpr_32 = nofpexcept V_ADD_F32_e32 killed [[PHI1]], killed [[PHI2]], implicit $mode, implicit $exec + ; SI-NEXT: $vgpr0 = COPY killed %14 + ; SI-NEXT: SI_RETURN_TO_EPILOG killed $vgpr0 main_body: %cc = icmp sgt i32 %z, 5 br i1 %cc, label %if, label %else @@ -103,51 +119,63 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 { ; SI-LABEL: name: else3 ; SI: bb.0.entry: - ; SI: successors: %bb.1(0x80000000) - ; SI: liveins: $vgpr0, $vgpr1, $sgpr0, $vgpr2 - ; SI: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr2 - ; SI: [[COPY1:%[0-9]+]]:sgpr_32 = COPY killed $sgpr0 - ; SI: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed $vgpr1 - ; SI: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 - ; SI: [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GT_I32_e64 6, killed [[COPY3]], implicit $exec - ; SI: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; SI: bb.1.for.body: - ; SI: successors: %bb.4(0x40000000), %bb.2(0x40000000) - ; SI: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %14, %bb.5 - ; SI: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.0, %13, %bb.5 - ; SI: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF [[V_CMP_GT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; SI: S_BRANCH %bb.4 - ; SI: bb.2.Flow: - ; SI: successors: %bb.3(0x40000000), %bb.5(0x40000000) - ; SI: [[PHI2:%[0-9]+]]:vgpr_32 = PHI undef %32:vgpr_32, %bb.1, %10, %bb.4 - ; SI: [[PHI3:%[0-9]+]]:vgpr_32 = PHI undef %33:vgpr_32, %bb.1, %9, %bb.4 - ; SI: [[PHI4:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, undef %35:vgpr_32, %bb.4 - ; SI: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; SI: S_BRANCH %bb.3 - ; 
SI: bb.3.if: - ; SI: successors: %bb.5(0x80000000) - ; SI: %7:vgpr_32 = nofpexcept V_MUL_F32_e32 [[PHI]], [[COPY2]], implicit $mode, implicit $exec - ; SI: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 1, killed [[PHI4]], implicit $exec - ; SI: S_BRANCH %bb.5 - ; SI: bb.4.else: - ; SI: successors: %bb.2(0x80000000) - ; SI: %9:vgpr_32 = nofpexcept V_MUL_F32_e32 [[COPY2]], [[PHI1]], implicit $mode, implicit $exec - ; SI: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 killed [[PHI1]], 3, implicit $exec - ; SI: S_BRANCH %bb.2 - ; SI: bb.5.if.end: - ; SI: successors: %bb.6(0x04000000), %bb.1(0x7c000000) - ; SI: [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[PHI3]], %bb.2, %7, %bb.3 - ; SI: [[PHI6:%[0-9]+]]:vgpr_32 = PHI [[PHI2]], %bb.2, [[V_ADD_U32_e32_]], %bb.3 - ; SI: SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; SI: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 1, [[PHI6]], implicit $exec - ; SI: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[PHI]], 1, implicit-def dead $scc - ; SI: S_CMP_LT_I32 [[S_ADD_I32_]], [[COPY1]], implicit-def $scc - ; SI: S_CBRANCH_SCC1 %bb.1, implicit killed $scc - ; SI: S_BRANCH %bb.6 - ; SI: bb.6.for.end: - ; SI: %31:vgpr_32 = nofpexcept V_ADD_F32_e32 killed [[PHI6]], killed [[PHI5]], implicit $mode, implicit $exec - ; SI: $vgpr0 = COPY killed %31 - ; SI: SI_RETURN_TO_EPILOG killed $vgpr0 + ; SI-NEXT: successors: %bb.1(0x80000000) + ; SI-NEXT: liveins: $vgpr0, $vgpr1, $sgpr0, $vgpr2 + ; SI-NEXT: {{ $}} + ; SI-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr2 + ; SI-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY killed $sgpr0 + ; SI-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed $vgpr1 + ; SI-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 + ; SI-NEXT: [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GT_I32_e64 6, killed [[COPY3]], implicit $exec + ; SI-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; SI-NEXT: {{ $}} + ; SI-NEXT: bb.1.for.body: + ; SI-NEXT: successors: 
%bb.4(0x40000000), %bb.2(0x40000000) + ; SI-NEXT: {{ $}} + ; SI-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %14, %bb.5 + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.0, %13, %bb.5 + ; SI-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF [[V_CMP_GT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; SI-NEXT: S_BRANCH %bb.4 + ; SI-NEXT: {{ $}} + ; SI-NEXT: bb.2.Flow: + ; SI-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) + ; SI-NEXT: {{ $}} + ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI undef %32:vgpr_32, %bb.1, %10, %bb.4 + ; SI-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI undef %33:vgpr_32, %bb.1, %9, %bb.4 + ; SI-NEXT: [[PHI4:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, undef %35:vgpr_32, %bb.4 + ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; SI-NEXT: S_BRANCH %bb.3 + ; SI-NEXT: {{ $}} + ; SI-NEXT: bb.3.if: + ; SI-NEXT: successors: %bb.5(0x80000000) + ; SI-NEXT: {{ $}} + ; SI-NEXT: %7:vgpr_32 = nofpexcept V_MUL_F32_e32 [[PHI]], [[COPY2]], implicit $mode, implicit $exec + ; SI-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 1, killed [[PHI4]], implicit $exec + ; SI-NEXT: S_BRANCH %bb.5 + ; SI-NEXT: {{ $}} + ; SI-NEXT: bb.4.else: + ; SI-NEXT: successors: %bb.2(0x80000000) + ; SI-NEXT: {{ $}} + ; SI-NEXT: %9:vgpr_32 = nofpexcept V_MUL_F32_e32 [[COPY2]], [[PHI1]], implicit $mode, implicit $exec + ; SI-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 killed [[PHI1]], 3, implicit $exec + ; SI-NEXT: S_BRANCH %bb.2 + ; SI-NEXT: {{ $}} + ; SI-NEXT: bb.5.if.end: + ; SI-NEXT: successors: %bb.6(0x04000000), %bb.1(0x7c000000) + ; SI-NEXT: {{ $}} + ; SI-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[PHI3]], %bb.2, %7, %bb.3 + ; SI-NEXT: [[PHI6:%[0-9]+]]:vgpr_32 = PHI [[PHI2]], %bb.2, [[V_ADD_U32_e32_]], %bb.3 + ; SI-NEXT: SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; SI-NEXT: 
[[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 1, [[PHI6]], implicit $exec + ; SI-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[PHI]], 1, implicit-def dead $scc + ; SI-NEXT: S_CMP_LT_I32 [[S_ADD_I32_]], [[COPY1]], implicit-def $scc + ; SI-NEXT: S_CBRANCH_SCC1 %bb.1, implicit killed $scc + ; SI-NEXT: S_BRANCH %bb.6 + ; SI-NEXT: {{ $}} + ; SI-NEXT: bb.6.for.end: + ; SI-NEXT: %31:vgpr_32 = nofpexcept V_ADD_F32_e32 killed [[PHI6]], killed [[PHI5]], implicit $mode, implicit $exec + ; SI-NEXT: $vgpr0 = COPY killed %31 + ; SI-NEXT: SI_RETURN_TO_EPILOG killed $vgpr0 entry: ; %break = icmp sgt i32 %bound, 0 ; br i1 %break, label %for.body, label %for.end @@ -190,84 +218,100 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, float(float)* %extern_func, float(float)* %extern_func2) #0 { ; SI-LABEL: name: loop ; SI: bb.0.main_body: - ; SI: successors: %bb.5(0x40000000), %bb.1(0x40000000) - ; SI: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; SI: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr5 - ; SI: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed $vgpr4 - ; SI: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed $vgpr3 - ; SI: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed $vgpr2 - ; SI: [[COPY4:%[0-9]+]]:vgpr_32 = COPY killed $vgpr1 - ; SI: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 - ; SI: [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GT_I32_e64 6, killed [[COPY5]], implicit $exec - ; SI: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_GT_I32_e64_]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; SI: S_BRANCH %bb.5 - ; SI: bb.1.Flow: - ; SI: successors: %bb.2(0x40000000), %bb.8(0x40000000) - ; SI: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %29:vgpr_32, %bb.0, %4, %bb.7 - ; SI: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY4]], %bb.0, undef %45:vgpr_32, %bb.7 - ; SI: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %47:vgpr_32, %bb.7 - ; SI: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %49:vgpr_32, %bb.7 - ; SI: 
[[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; SI: S_BRANCH %bb.2 - ; SI: bb.2.if: - ; SI: successors: %bb.3(0x80000000) - ; SI: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[PHI2]], %subreg.sub0, killed [[PHI3]], %subreg.sub1 - ; SI: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo - ; SI: bb.3: - ; SI: successors: %bb.3(0x40000000), %bb.4(0x40000000) - ; SI: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %51:vreg_64, %bb.3, [[REG_SEQUENCE]], %bb.2 - ; SI: [[PHI5:%[0-9]+]]:vgpr_32 = PHI undef %53:vgpr_32, %bb.3, [[PHI1]], %bb.2 - ; SI: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub0, implicit $exec - ; SI: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub1, implicit $exec - ; SI: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1 - ; SI: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], killed [[PHI4]], implicit $exec - ; SI: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[V_CMP_EQ_U64_e64_]], implicit-def $exec, implicit-def dead $scc, implicit $exec - ; SI: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 - ; SI: [[COPY6:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 - ; SI: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY killed [[COPY6]] - ; SI: $vgpr0 = COPY killed [[PHI5]] - ; SI: dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE1]], 0, csr_amdgpu_highregs, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0 - ; SI: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 - ; SI: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 - ; SI: $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_]], implicit-def dead $scc - ; SI: SI_WATERFALL_LOOP %bb.3, 
implicit $exec - ; SI: bb.4: - ; SI: successors: %bb.8(0x80000000) - ; SI: $exec_lo = S_MOV_B32 killed [[S_MOV_B32_]] - ; SI: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY7]] - ; SI: S_BRANCH %bb.8 - ; SI: bb.5.else: - ; SI: successors: %bb.6(0x80000000) - ; SI: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[COPY1]], %subreg.sub0, killed [[COPY]], %subreg.sub1 - ; SI: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo - ; SI: bb.6: - ; SI: successors: %bb.6(0x40000000), %bb.7(0x40000000) - ; SI: [[PHI6:%[0-9]+]]:vreg_64 = PHI undef %55:vreg_64, %bb.6, [[REG_SEQUENCE2]], %bb.5 - ; SI: [[PHI7:%[0-9]+]]:vgpr_32 = PHI undef %57:vgpr_32, %bb.6, [[COPY4]], %bb.5 - ; SI: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI6]].sub0, implicit $exec - ; SI: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI6]].sub1, implicit $exec - ; SI: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_2]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub1 - ; SI: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], killed [[PHI6]], implicit $exec - ; SI: [[S_AND_SAVEEXEC_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[V_CMP_EQ_U64_e64_1]], implicit-def $exec, implicit-def dead $scc, implicit $exec - ; SI: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 - ; SI: [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 - ; SI: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY killed [[COPY9]] - ; SI: $vgpr0 = COPY killed [[PHI7]] - ; SI: dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE3]], 0, csr_amdgpu_highregs, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0 - ; SI: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 - ; SI: [[COPY10:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 - ; SI: $exec_lo = S_XOR_B32_term $exec_lo, killed 
[[S_AND_SAVEEXEC_B32_1]], implicit-def dead $scc - ; SI: SI_WATERFALL_LOOP %bb.6, implicit $exec - ; SI: bb.7: - ; SI: successors: %bb.1(0x80000000) - ; SI: $exec_lo = S_MOV_B32 killed [[S_MOV_B32_1]] - ; SI: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY10]] - ; SI: S_BRANCH %bb.1 - ; SI: bb.8.end: - ; SI: [[PHI8:%[0-9]+]]:vgpr_32 = PHI [[PHI]], %bb.1, [[COPY8]], %bb.4 - ; SI: SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; SI: $vgpr0 = COPY killed [[PHI8]] - ; SI: SI_RETURN_TO_EPILOG killed $vgpr0 + ; SI-NEXT: successors: %bb.5(0x40000000), %bb.1(0x40000000) + ; SI-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; SI-NEXT: {{ $}} + ; SI-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr5 + ; SI-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed $vgpr4 + ; SI-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed $vgpr3 + ; SI-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed $vgpr2 + ; SI-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY killed $vgpr1 + ; SI-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 + ; SI-NEXT: [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GT_I32_e64 6, killed [[COPY5]], implicit $exec + ; SI-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_GT_I32_e64_]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; SI-NEXT: S_BRANCH %bb.5 + ; SI-NEXT: {{ $}} + ; SI-NEXT: bb.1.Flow: + ; SI-NEXT: successors: %bb.2(0x40000000), %bb.8(0x40000000) + ; SI-NEXT: {{ $}} + ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %29:vgpr_32, %bb.0, %4, %bb.7 + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY4]], %bb.0, undef %45:vgpr_32, %bb.7 + ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %47:vgpr_32, %bb.7 + ; SI-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %49:vgpr_32, %bb.7 + ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; SI-NEXT: S_BRANCH %bb.2 + ; SI-NEXT: {{ 
$}} + ; SI-NEXT: bb.2.if: + ; SI-NEXT: successors: %bb.3(0x80000000) + ; SI-NEXT: {{ $}} + ; SI-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[PHI2]], %subreg.sub0, killed [[PHI3]], %subreg.sub1 + ; SI-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; SI-NEXT: {{ $}} + ; SI-NEXT: bb.3: + ; SI-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; SI-NEXT: {{ $}} + ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %51:vreg_64, %bb.3, [[REG_SEQUENCE]], %bb.2 + ; SI-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI undef %53:vgpr_32, %bb.3, [[PHI1]], %bb.2 + ; SI-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub0, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub1, implicit $exec + ; SI-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; SI-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], killed [[PHI4]], implicit $exec + ; SI-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[V_CMP_EQ_U64_e64_]], implicit-def $exec, implicit-def dead $scc, implicit $exec + ; SI-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 + ; SI-NEXT: [[COPY6:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 + ; SI-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY killed [[COPY6]] + ; SI-NEXT: $vgpr0 = COPY killed [[PHI5]] + ; SI-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE1]], 0, csr_amdgpu_si_gfx, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0 + ; SI-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 + ; SI-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 + ; SI-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_]], implicit-def dead $scc + ; SI-NEXT: 
SI_WATERFALL_LOOP %bb.3, implicit $exec + ; SI-NEXT: {{ $}} + ; SI-NEXT: bb.4: + ; SI-NEXT: successors: %bb.8(0x80000000) + ; SI-NEXT: {{ $}} + ; SI-NEXT: $exec_lo = S_MOV_B32 killed [[S_MOV_B32_]] + ; SI-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY7]] + ; SI-NEXT: S_BRANCH %bb.8 + ; SI-NEXT: {{ $}} + ; SI-NEXT: bb.5.else: + ; SI-NEXT: successors: %bb.6(0x80000000) + ; SI-NEXT: {{ $}} + ; SI-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[COPY1]], %subreg.sub0, killed [[COPY]], %subreg.sub1 + ; SI-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; SI-NEXT: {{ $}} + ; SI-NEXT: bb.6: + ; SI-NEXT: successors: %bb.6(0x40000000), %bb.7(0x40000000) + ; SI-NEXT: {{ $}} + ; SI-NEXT: [[PHI6:%[0-9]+]]:vreg_64 = PHI undef %55:vreg_64, %bb.6, [[REG_SEQUENCE2]], %bb.5 + ; SI-NEXT: [[PHI7:%[0-9]+]]:vgpr_32 = PHI undef %57:vgpr_32, %bb.6, [[COPY4]], %bb.5 + ; SI-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI6]].sub0, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI6]].sub1, implicit $exec + ; SI-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_2]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; SI-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], killed [[PHI6]], implicit $exec + ; SI-NEXT: [[S_AND_SAVEEXEC_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[V_CMP_EQ_U64_e64_1]], implicit-def $exec, implicit-def dead $scc, implicit $exec + ; SI-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 + ; SI-NEXT: [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 + ; SI-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY killed [[COPY9]] + ; SI-NEXT: $vgpr0 = COPY killed [[PHI7]] + ; SI-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE3]], 0, csr_amdgpu_si_gfx, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, 
implicit killed $vgpr0, implicit-def $vgpr0 + ; SI-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 + ; SI-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 + ; SI-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_1]], implicit-def dead $scc + ; SI-NEXT: SI_WATERFALL_LOOP %bb.6, implicit $exec + ; SI-NEXT: {{ $}} + ; SI-NEXT: bb.7: + ; SI-NEXT: successors: %bb.1(0x80000000) + ; SI-NEXT: {{ $}} + ; SI-NEXT: $exec_lo = S_MOV_B32 killed [[S_MOV_B32_1]] + ; SI-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY10]] + ; SI-NEXT: S_BRANCH %bb.1 + ; SI-NEXT: {{ $}} + ; SI-NEXT: bb.8.end: + ; SI-NEXT: [[PHI8:%[0-9]+]]:vgpr_32 = PHI [[PHI]], %bb.1, [[COPY8]], %bb.4 + ; SI-NEXT: SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; SI-NEXT: $vgpr0 = COPY killed [[PHI8]] + ; SI-NEXT: SI_RETURN_TO_EPILOG killed $vgpr0 main_body: %cc = icmp sgt i32 %z, 5 br i1 %cc, label %if, label %else @@ -289,82 +333,98 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, float(float)* %extern_func, float(float)* %extern_func2) #0 { ; SI-LABEL: name: loop_with_use ; SI: bb.0.main_body: - ; SI: successors: %bb.5(0x40000000), %bb.1(0x40000000) - ; SI: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; SI: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr5 - ; SI: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed $vgpr4 - ; SI: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed $vgpr3 - ; SI: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed $vgpr2 - ; SI: [[COPY4:%[0-9]+]]:vgpr_32 = COPY killed $vgpr1 - ; SI: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 - ; SI: [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GT_I32_e64 6, killed [[COPY5]], implicit $exec - ; SI: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_GT_I32_e64_]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; SI: S_BRANCH %bb.5 - ; SI: bb.1.Flow: - ; SI: successors: %bb.2(0x40000000), 
%bb.8(0x40000000) - ; SI: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %30:vgpr_32, %bb.0, %4, %bb.7 - ; SI: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %46:vgpr_32, %bb.7 - ; SI: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %48:vgpr_32, %bb.7 - ; SI: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; SI: S_BRANCH %bb.2 - ; SI: bb.2.if: - ; SI: successors: %bb.3(0x80000000) - ; SI: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[PHI1]], %subreg.sub0, killed [[PHI2]], %subreg.sub1 - ; SI: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo - ; SI: bb.3: - ; SI: successors: %bb.3(0x40000000), %bb.4(0x40000000) - ; SI: [[PHI3:%[0-9]+]]:vreg_64 = PHI undef %50:vreg_64, %bb.3, [[REG_SEQUENCE]], %bb.2 - ; SI: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI3]].sub0, implicit $exec - ; SI: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI3]].sub1, implicit $exec - ; SI: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1 - ; SI: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], killed [[PHI3]], implicit $exec - ; SI: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[V_CMP_EQ_U64_e64_]], implicit-def $exec, implicit-def dead $scc, implicit $exec - ; SI: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 - ; SI: [[COPY6:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 - ; SI: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY killed [[COPY6]] - ; SI: $vgpr0 = COPY [[COPY4]] - ; SI: dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE1]], 0, csr_amdgpu_highregs, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0 - ; SI: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 - ; SI: 
[[COPY7:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 - ; SI: $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_]], implicit-def dead $scc - ; SI: SI_WATERFALL_LOOP %bb.3, implicit $exec - ; SI: bb.4: - ; SI: successors: %bb.8(0x80000000) - ; SI: $exec_lo = S_MOV_B32 killed [[S_MOV_B32_]] - ; SI: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY7]] - ; SI: S_BRANCH %bb.8 - ; SI: bb.5.else: - ; SI: successors: %bb.6(0x80000000) - ; SI: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[COPY1]], %subreg.sub0, killed [[COPY]], %subreg.sub1 - ; SI: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo - ; SI: bb.6: - ; SI: successors: %bb.6(0x40000000), %bb.7(0x40000000) - ; SI: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %52:vreg_64, %bb.6, [[REG_SEQUENCE2]], %bb.5 - ; SI: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub0, implicit $exec - ; SI: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub1, implicit $exec - ; SI: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_2]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub1 - ; SI: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], killed [[PHI4]], implicit $exec - ; SI: [[S_AND_SAVEEXEC_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[V_CMP_EQ_U64_e64_1]], implicit-def $exec, implicit-def dead $scc, implicit $exec - ; SI: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 - ; SI: [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 - ; SI: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY killed [[COPY9]] - ; SI: $vgpr0 = COPY [[COPY4]] - ; SI: dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE3]], 0, csr_amdgpu_highregs, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0 - ; SI: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 - ; SI: [[COPY10:%[0-9]+]]:vgpr_32 = 
COPY killed $vgpr0 - ; SI: $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_1]], implicit-def dead $scc - ; SI: SI_WATERFALL_LOOP %bb.6, implicit $exec - ; SI: bb.7: - ; SI: successors: %bb.1(0x80000000) - ; SI: $exec_lo = S_MOV_B32 killed [[S_MOV_B32_1]] - ; SI: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY10]] - ; SI: S_BRANCH %bb.1 - ; SI: bb.8.end: - ; SI: [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[PHI]], %bb.1, [[COPY8]], %bb.4 - ; SI: SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; SI: %27:vgpr_32 = nofpexcept V_ADD_F32_e32 killed [[PHI5]], killed [[COPY4]], implicit $mode, implicit $exec - ; SI: $vgpr0 = COPY killed %27 - ; SI: SI_RETURN_TO_EPILOG killed $vgpr0 + ; SI-NEXT: successors: %bb.5(0x40000000), %bb.1(0x40000000) + ; SI-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; SI-NEXT: {{ $}} + ; SI-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr5 + ; SI-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed $vgpr4 + ; SI-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed $vgpr3 + ; SI-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed $vgpr2 + ; SI-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY killed $vgpr1 + ; SI-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 + ; SI-NEXT: [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GT_I32_e64 6, killed [[COPY5]], implicit $exec + ; SI-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_GT_I32_e64_]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; SI-NEXT: S_BRANCH %bb.5 + ; SI-NEXT: {{ $}} + ; SI-NEXT: bb.1.Flow: + ; SI-NEXT: successors: %bb.2(0x40000000), %bb.8(0x40000000) + ; SI-NEXT: {{ $}} + ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %30:vgpr_32, %bb.0, %4, %bb.7 + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %46:vgpr_32, %bb.7 + ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %48:vgpr_32, %bb.7 + ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.8, implicit-def dead 
$exec, implicit-def dead $scc, implicit $exec + ; SI-NEXT: S_BRANCH %bb.2 + ; SI-NEXT: {{ $}} + ; SI-NEXT: bb.2.if: + ; SI-NEXT: successors: %bb.3(0x80000000) + ; SI-NEXT: {{ $}} + ; SI-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[PHI1]], %subreg.sub0, killed [[PHI2]], %subreg.sub1 + ; SI-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; SI-NEXT: {{ $}} + ; SI-NEXT: bb.3: + ; SI-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; SI-NEXT: {{ $}} + ; SI-NEXT: [[PHI3:%[0-9]+]]:vreg_64 = PHI undef %50:vreg_64, %bb.3, [[REG_SEQUENCE]], %bb.2 + ; SI-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI3]].sub0, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI3]].sub1, implicit $exec + ; SI-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; SI-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], killed [[PHI3]], implicit $exec + ; SI-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[V_CMP_EQ_U64_e64_]], implicit-def $exec, implicit-def dead $scc, implicit $exec + ; SI-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 + ; SI-NEXT: [[COPY6:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 + ; SI-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY killed [[COPY6]] + ; SI-NEXT: $vgpr0 = COPY [[COPY4]] + ; SI-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE1]], 0, csr_amdgpu_si_gfx, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0 + ; SI-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 + ; SI-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 + ; SI-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_]], implicit-def dead $scc + ; SI-NEXT: 
SI_WATERFALL_LOOP %bb.3, implicit $exec + ; SI-NEXT: {{ $}} + ; SI-NEXT: bb.4: + ; SI-NEXT: successors: %bb.8(0x80000000) + ; SI-NEXT: {{ $}} + ; SI-NEXT: $exec_lo = S_MOV_B32 killed [[S_MOV_B32_]] + ; SI-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY7]] + ; SI-NEXT: S_BRANCH %bb.8 + ; SI-NEXT: {{ $}} + ; SI-NEXT: bb.5.else: + ; SI-NEXT: successors: %bb.6(0x80000000) + ; SI-NEXT: {{ $}} + ; SI-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[COPY1]], %subreg.sub0, killed [[COPY]], %subreg.sub1 + ; SI-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; SI-NEXT: {{ $}} + ; SI-NEXT: bb.6: + ; SI-NEXT: successors: %bb.6(0x40000000), %bb.7(0x40000000) + ; SI-NEXT: {{ $}} + ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %52:vreg_64, %bb.6, [[REG_SEQUENCE2]], %bb.5 + ; SI-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub0, implicit $exec + ; SI-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub1, implicit $exec + ; SI-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_2]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; SI-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], killed [[PHI4]], implicit $exec + ; SI-NEXT: [[S_AND_SAVEEXEC_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[V_CMP_EQ_U64_e64_1]], implicit-def $exec, implicit-def dead $scc, implicit $exec + ; SI-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 + ; SI-NEXT: [[COPY9:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 + ; SI-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY killed [[COPY9]] + ; SI-NEXT: $vgpr0 = COPY [[COPY4]] + ; SI-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE3]], 0, csr_amdgpu_si_gfx, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0, implicit-def $vgpr0 + ; SI-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead 
$scc, implicit-def $sgpr32, implicit $sgpr32 + ; SI-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0 + ; SI-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_1]], implicit-def dead $scc + ; SI-NEXT: SI_WATERFALL_LOOP %bb.6, implicit $exec + ; SI-NEXT: {{ $}} + ; SI-NEXT: bb.7: + ; SI-NEXT: successors: %bb.1(0x80000000) + ; SI-NEXT: {{ $}} + ; SI-NEXT: $exec_lo = S_MOV_B32 killed [[S_MOV_B32_1]] + ; SI-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY10]] + ; SI-NEXT: S_BRANCH %bb.1 + ; SI-NEXT: {{ $}} + ; SI-NEXT: bb.8.end: + ; SI-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[PHI]], %bb.1, [[COPY8]], %bb.4 + ; SI-NEXT: SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; SI-NEXT: %27:vgpr_32 = nofpexcept V_ADD_F32_e32 killed [[PHI5]], killed [[COPY4]], implicit $mode, implicit $exec + ; SI-NEXT: $vgpr0 = COPY killed %27 + ; SI-NEXT: SI_RETURN_TO_EPILOG killed $vgpr0 main_body: %cc = icmp sgt i32 %z, 5 br i1 %cc, label %if, label %else diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll --- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll @@ -158,61 +158,61 @@ ; SI-LABEL: loop: ; SI: ; %bb.0: ; %main_body ; SI-NEXT: v_mov_b32_e32 v6, v0 -; SI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; SI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; SI-NEXT: s_mov_b32 s38, -1 +; SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; SI-NEXT: s_mov_b32 s14, -1 ; SI-NEXT: v_mov_b32_e32 v0, v1 ; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v6 -; SI-NEXT: s_mov_b32 s39, 0x31c16000 -; SI-NEXT: s_add_u32 s36, s36, s1 -; SI-NEXT: s_addc_u32 s37, s37, 0 +; SI-NEXT: s_mov_b32 s15, 0x31c16000 +; SI-NEXT: s_add_u32 s12, s12, s1 +; SI-NEXT: s_addc_u32 s13, s13, 0 ; SI-NEXT: s_mov_b32 s32, 0 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_and_saveexec_b32 s0, vcc_lo -; SI-NEXT: s_xor_b32 s33, exec_lo, s0 +; 
SI-NEXT: s_xor_b32 s4, exec_lo, s0 ; SI-NEXT: s_cbranch_execz BB3_4 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_mov_b32 s34, exec_lo +; SI-NEXT: s_mov_b32 s5, exec_lo ; SI-NEXT: BB3_2: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_readfirstlane_b32 s4, v4 -; SI-NEXT: v_readfirstlane_b32 s5, v5 -; SI-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] -; SI-NEXT: s_and_saveexec_b32 s35, vcc_lo -; SI-NEXT: s_mov_b64 s[0:1], s[36:37] -; SI-NEXT: s_mov_b64 s[2:3], s[38:39] -; SI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SI-NEXT: v_readfirstlane_b32 s6, v4 +; SI-NEXT: v_readfirstlane_b32 s7, v5 +; SI-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[6:7], v[4:5] +; SI-NEXT: s_and_saveexec_b32 s8, vcc_lo +; SI-NEXT: s_mov_b64 s[0:1], s[12:13] +; SI-NEXT: s_mov_b64 s[2:3], s[14:15] +; SI-NEXT: s_swappc_b64 s[30:31], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, v0 ; SI-NEXT: ; implicit-def: $vgpr4_vgpr5 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s35 +; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s8 ; SI-NEXT: s_cbranch_execnz BB3_2 ; SI-NEXT: ; %bb.3: -; SI-NEXT: s_mov_b32 exec_lo, s34 +; SI-NEXT: s_mov_b32 exec_lo, s5 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: BB3_4: ; %Flow -; SI-NEXT: s_or_saveexec_b32 s33, s33 -; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s33 +; SI-NEXT: s_or_saveexec_b32 s4, s4 +; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; SI-NEXT: s_cbranch_execz BB3_8 ; SI-NEXT: ; %bb.5: ; %if -; SI-NEXT: s_mov_b32 s34, exec_lo +; SI-NEXT: s_mov_b32 s5, exec_lo ; SI-NEXT: BB3_6: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: v_readfirstlane_b32 s5, v3 -; SI-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[2:3] -; SI-NEXT: s_and_saveexec_b32 s35, vcc_lo -; SI-NEXT: s_mov_b64 s[0:1], s[36:37] -; SI-NEXT: s_mov_b64 s[2:3], s[38:39] -; SI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SI-NEXT: v_readfirstlane_b32 s6, v2 +; SI-NEXT: v_readfirstlane_b32 s7, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[6:7], v[2:3] +; SI-NEXT: 
s_and_saveexec_b32 s8, vcc_lo +; SI-NEXT: s_mov_b64 s[0:1], s[12:13] +; SI-NEXT: s_mov_b64 s[2:3], s[14:15] +; SI-NEXT: s_swappc_b64 s[30:31], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, v0 ; SI-NEXT: ; implicit-def: $vgpr2_vgpr3 ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s35 +; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s8 ; SI-NEXT: s_cbranch_execnz BB3_6 ; SI-NEXT: ; %bb.7: -; SI-NEXT: s_mov_b32 exec_lo, s34 +; SI-NEXT: s_mov_b32 exec_lo, s5 ; SI-NEXT: BB3_8: ; %end -; SI-NEXT: s_or_b32 exec_lo, exec_lo, s33 +; SI-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; SI-NEXT: v_mov_b32_e32 v0, v1 ; SI-NEXT: ; return to shader part epilog main_body: @@ -236,58 +236,58 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, float(float)* %extern_func, float(float)* %extern_func2) #0 { ; SI-LABEL: loop_with_use: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; SI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; SI-NEXT: s_mov_b32 s38, -1 +; SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; SI-NEXT: s_mov_b32 s14, -1 ; SI-NEXT: v_mov_b32_e32 v40, v1 ; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v0 -; SI-NEXT: s_mov_b32 s39, 0x31c16000 -; SI-NEXT: s_add_u32 s36, s36, s1 -; SI-NEXT: s_addc_u32 s37, s37, 0 +; SI-NEXT: s_mov_b32 s15, 0x31c16000 +; SI-NEXT: s_add_u32 s12, s12, s1 +; SI-NEXT: s_addc_u32 s13, s13, 0 ; SI-NEXT: s_mov_b32 s32, 0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_and_saveexec_b32 s0, vcc_lo -; SI-NEXT: s_xor_b32 s33, exec_lo, s0 +; SI-NEXT: s_xor_b32 s4, exec_lo, s0 ; SI-NEXT: s_cbranch_execz BB4_4 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_mov_b32 s34, exec_lo +; SI-NEXT: s_mov_b32 s5, exec_lo ; SI-NEXT: BB4_2: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_readfirstlane_b32 s4, v4 -; SI-NEXT: v_readfirstlane_b32 s5, v5 -; SI-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] -; SI-NEXT: s_and_saveexec_b32 s35, vcc_lo +; SI-NEXT: v_readfirstlane_b32 s6, v4 +; 
SI-NEXT: v_readfirstlane_b32 s7, v5 +; SI-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[6:7], v[4:5] +; SI-NEXT: s_and_saveexec_b32 s8, vcc_lo ; SI-NEXT: v_mov_b32_e32 v0, v40 -; SI-NEXT: s_mov_b64 s[0:1], s[36:37] -; SI-NEXT: s_mov_b64 s[2:3], s[38:39] -; SI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SI-NEXT: s_mov_b64 s[0:1], s[12:13] +; SI-NEXT: s_mov_b64 s[2:3], s[14:15] +; SI-NEXT: s_swappc_b64 s[30:31], s[6:7] ; SI-NEXT: ; implicit-def: $vgpr4_vgpr5 -; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s35 +; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s8 ; SI-NEXT: s_cbranch_execnz BB4_2 ; SI-NEXT: ; %bb.3: -; SI-NEXT: s_mov_b32 exec_lo, s34 +; SI-NEXT: s_mov_b32 exec_lo, s5 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: BB4_4: ; %Flow -; SI-NEXT: s_or_saveexec_b32 s33, s33 -; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s33 +; SI-NEXT: s_or_saveexec_b32 s4, s4 +; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; SI-NEXT: s_cbranch_execz BB4_8 ; SI-NEXT: ; %bb.5: ; %if -; SI-NEXT: s_mov_b32 s34, exec_lo +; SI-NEXT: s_mov_b32 s5, exec_lo ; SI-NEXT: BB4_6: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: v_readfirstlane_b32 s5, v3 -; SI-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[2:3] -; SI-NEXT: s_and_saveexec_b32 s35, vcc_lo +; SI-NEXT: v_readfirstlane_b32 s6, v2 +; SI-NEXT: v_readfirstlane_b32 s7, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[6:7], v[2:3] +; SI-NEXT: s_and_saveexec_b32 s8, vcc_lo ; SI-NEXT: v_mov_b32_e32 v0, v40 -; SI-NEXT: s_mov_b64 s[0:1], s[36:37] -; SI-NEXT: s_mov_b64 s[2:3], s[38:39] -; SI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SI-NEXT: s_mov_b64 s[0:1], s[12:13] +; SI-NEXT: s_mov_b64 s[2:3], s[14:15] +; SI-NEXT: s_swappc_b64 s[30:31], s[6:7] ; SI-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s35 +; SI-NEXT: s_xor_b32 exec_lo, exec_lo, s8 ; SI-NEXT: s_cbranch_execnz BB4_6 ; SI-NEXT: ; %bb.7: -; SI-NEXT: s_mov_b32 exec_lo, s34 +; SI-NEXT: s_mov_b32 exec_lo, s5 ; SI-NEXT: BB4_8: ; %end -; SI-NEXT: s_or_b32 exec_lo, exec_lo, s33 +; 
SI-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; SI-NEXT: v_add_f32_e32 v0, v0, v40 ; SI-NEXT: ; return to shader part epilog main_body: diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -6,77 +6,75 @@ ; GFX9-O0-LABEL: strict_wwm_no_cfg: ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[8:9] -; GFX9-O0-NEXT: s_mov_b32 s8, s7 -; GFX9-O0-NEXT: s_mov_b32 s9, s6 -; GFX9-O0-NEXT: s_mov_b32 s10, s5 -; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 -; GFX9-O0-NEXT: s_mov_b32 s5, s10 -; GFX9-O0-NEXT: s_mov_b32 s6, s9 -; GFX9-O0-NEXT: s_mov_b32 s7, s8 -; GFX9-O0-NEXT: ; kill: def $sgpr8_sgpr9_sgpr10_sgpr11 killed $sgpr4_sgpr5_sgpr6_sgpr7 -; GFX9-O0-NEXT: s_mov_b32 s8, 0 -; GFX9-O0-NEXT: buffer_load_dwordx2 v[5:6], off, s[4:7], s8 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: s_mov_b32 s36, s4 +; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39 +; GFX9-O0-NEXT: s_mov_b32 s37, s5 +; GFX9-O0-NEXT: s_mov_b32 s38, s6 +; GFX9-O0-NEXT: s_mov_b32 s39, s7 +; GFX9-O0-NEXT: ; kill: def $sgpr40_sgpr41_sgpr42_sgpr43 killed $sgpr36_sgpr37_sgpr38_sgpr39 +; GFX9-O0-NEXT: s_mov_b32 s34, 0 +; GFX9-O0-NEXT: buffer_load_dwordx2 v[5:6], off, s[36:39], s34 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 
v0, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s34 ; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s34 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s34 ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v0 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O0-NEXT: v_add_u32_e64 v0, v0, v2 -; GFX9-O0-NEXT: s_mov_b64 exec, s[10:11] +; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s34 ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v0, v1 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O0-NEXT: v_add_u32_e64 v0, v1, v0 -; GFX9-O0-NEXT: s_mov_b64 exec, s[10:11] +; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[10:11], v3, v4 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[10:11] -; GFX9-O0-NEXT: s_mov_b32 s9, 1 -; GFX9-O0-NEXT: v_lshlrev_b32_e64 v3, s9, v3 -; GFX9-O0-NEXT: s_mov_b32 s9, 2 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[40:41], v3, v4 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[40:41] +; GFX9-O0-NEXT: s_mov_b32 s35, 1 +; GFX9-O0-NEXT: v_lshlrev_b32_e64 v3, s35, v3 +; GFX9-O0-NEXT: s_mov_b32 s35, 2 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s35 ; GFX9-O0-NEXT: v_and_b32_e32 v3, v3, v4 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[4:7], s8 offset:4 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[36:39], s34 offset:4 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded 
Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-O3-LABEL: strict_wwm_no_cfg: ; GFX9-O3: ; %bb.0: ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-O3-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-O3-NEXT: s_mov_b64 exec, s[8:9] +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 -; GFX9-O3-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[8:9] +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 @@ -84,12 +82,12 @@ ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_mov_b32_dpp v0, v3 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-O3-NEXT: v_add_u32_e32 v0, v3, v0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[8:9] +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1 ; 
GFX9-O3-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 @@ -97,14 +95,14 @@ ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 1, v4 ; GFX9-O3-NEXT: v_and_b32_e32 v4, 2, v4 ; GFX9-O3-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:4 -; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O3-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_nop 0 ; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_nop 0 ; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: s_setpc_b64 s[30:31] %tmp100 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %tmp14, i32 0, i32 0, i32 0) @@ -136,52 +134,51 @@ ; GFX9-O0-LABEL: strict_wwm_cfg: ; GFX9-O0: ; %bb.0: ; %entry ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[8:9] +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: v_writelane_b32 v5, s30, 0 ; GFX9-O0-NEXT: v_writelane_b32 v5, s31, 1 -; GFX9-O0-NEXT: s_mov_b32 s8, s4 -; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 -; GFX9-O0-NEXT: s_mov_b32 s9, s5 -; GFX9-O0-NEXT: s_mov_b32 s10, s6 -; GFX9-O0-NEXT: s_mov_b32 s11, s7 -; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[8:9] -; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[10:11] -; GFX9-O0-NEXT: v_writelane_b32 v5, s4, 2 -; 
GFX9-O0-NEXT: v_writelane_b32 v5, s5, 3 -; GFX9-O0-NEXT: v_writelane_b32 v5, s6, 4 -; GFX9-O0-NEXT: v_writelane_b32 v5, s7, 5 -; GFX9-O0-NEXT: s_mov_b32 s4, 0 -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], off, s[8:11], s4 +; GFX9-O0-NEXT: s_mov_b32 s36, s4 +; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39 +; GFX9-O0-NEXT: s_mov_b32 s37, s5 +; GFX9-O0-NEXT: s_mov_b32 s38, s6 +; GFX9-O0-NEXT: s_mov_b32 s39, s7 +; GFX9-O0-NEXT: s_mov_b64 s[42:43], s[38:39] +; GFX9-O0-NEXT: s_mov_b64 s[40:41], s[36:37] +; GFX9-O0-NEXT: v_writelane_b32 v5, s40, 2 +; GFX9-O0-NEXT: v_writelane_b32 v5, s41, 3 +; GFX9-O0-NEXT: v_writelane_b32 v5, s42, 4 +; GFX9-O0-NEXT: v_writelane_b32 v5, s43, 5 +; GFX9-O0-NEXT: s_mov_b32 s30, 0 +; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], off, s[36:39], s30 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-O0-NEXT: ; implicit-def: $sgpr6_sgpr7 +; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s30 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s30 ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2 -; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s4 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-O0-NEXT: 
v_cmp_eq_u32_e64 s[34:35], v0, s30 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s30 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec -; GFX9-O0-NEXT: v_writelane_b32 v5, s4, 6 -; GFX9-O0-NEXT: v_writelane_b32 v5, s5, 7 -; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_mov_b64 s[30:31], exec +; GFX9-O0-NEXT: v_writelane_b32 v5, s30, 6 +; GFX9-O0-NEXT: v_writelane_b32 v5, s31, 7 +; GFX9-O0-NEXT: s_and_b64 s[30:31], s[30:31], s[34:35] +; GFX9-O0-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-O0-NEXT: s_cbranch_execz BB1_2 ; GFX9-O0-NEXT: ; %bb.1: ; %if ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload @@ -189,100 +186,100 @@ ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-O0-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O0-NEXT: v_add_u32_e64 v1, v2, v1 -; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: BB1_2: ; %merge -; GFX9-O0-NEXT: v_readlane_b32 s6, v5, 6 -; GFX9-O0-NEXT: v_readlane_b32 s7, v5, 7 -; GFX9-O0-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX9-O0-NEXT: v_readlane_b32 s4, v5, 0 -; GFX9-O0-NEXT: v_readlane_b32 s5, v5, 1 -; GFX9-O0-NEXT: 
v_readlane_b32 s8, v5, 2 -; GFX9-O0-NEXT: v_readlane_b32 s9, v5, 3 -; GFX9-O0-NEXT: v_readlane_b32 s10, v5, 4 -; GFX9-O0-NEXT: v_readlane_b32 s11, v5, 5 +; GFX9-O0-NEXT: v_readlane_b32 s34, v5, 6 +; GFX9-O0-NEXT: v_readlane_b32 s35, v5, 7 +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-O0-NEXT: v_readlane_b32 s30, v5, 0 +; GFX9-O0-NEXT: v_readlane_b32 s31, v5, 1 +; GFX9-O0-NEXT: v_readlane_b32 s36, v5, 2 +; GFX9-O0-NEXT: v_readlane_b32 s37, v5, 3 +; GFX9-O0-NEXT: v_readlane_b32 s38, v5, 4 +; GFX9-O0-NEXT: v_readlane_b32 s39, v5, 5 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, v3 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7] -; GFX9-O0-NEXT: s_mov_b32 s6, 1 -; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s6, v0 -; GFX9-O0-NEXT: s_mov_b32 s6, 2 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[34:35], v0, v3 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35] +; GFX9-O0-NEXT: s_mov_b32 s34, 1 +; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s34, v0 +; GFX9-O0-NEXT: s_mov_b32 s34, 2 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s34 ; GFX9-O0-NEXT: v_and_b32_e32 v0, v0, v3 -; GFX9-O0-NEXT: s_mov_b32 s6, 0 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[8:11], s6 offset:4 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O0-NEXT: s_mov_b32 s34, 0 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[36:39], s34 offset:4 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: s_setpc_b64 
s[4:5] +; GFX9-O0-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-O3-LABEL: strict_wwm_cfg: ; GFX9-O3: ; %bb.0: ; %entry ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-O3-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-O3-NEXT: s_mov_b64 exec, s[8:9] +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; GFX9-O3-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-O3-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[8:9] +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-O3-NEXT: s_mov_b64 exec, s[8:9] +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-O3-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX9-O3-NEXT: s_and_saveexec_b64 s[34:35], vcc ; GFX9-O3-NEXT: ; %bb.1: ; %if -; GFX9-O3-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[10:11] +; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX9-O3-NEXT: s_or_saveexec_b64 
s[36:37], -1 ; GFX9-O3-NEXT: v_mov_b32_dpp v1, v3 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O3-NEXT: v_add_u32_e32 v1, v3, v1 -; GFX9-O3-NEXT: s_mov_b64 exec, s[10:11] +; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-O3-NEXT: v_mov_b32_e32 v5, v1 ; GFX9-O3-NEXT: ; %bb.2: ; %merge -; GFX9-O3-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-O3-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 ; GFX9-O3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-O3-NEXT: v_and_b32_e32 v0, 2, v0 ; GFX9-O3-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 -; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_nop 0 ; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_nop 0 ; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: s_setpc_b64 s[30:31] entry: @@ -343,92 +340,89 @@ ; GFX9-O0-LABEL: strict_wwm_call: ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[10:11] +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: v_writelane_b32 v3, s33, 2 ; GFX9-O0-NEXT: s_mov_b32 s33, s32 ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x400 ; GFX9-O0-NEXT: v_writelane_b32 v3, s30, 0 ; GFX9-O0-NEXT: v_writelane_b32 v3, s31, 1 -; GFX9-O0-NEXT: s_mov_b32 s9, s8 -; GFX9-O0-NEXT: 
s_mov_b32 s8, s7 -; GFX9-O0-NEXT: s_mov_b32 s10, s6 -; GFX9-O0-NEXT: s_mov_b32 s11, s5 -; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 -; GFX9-O0-NEXT: s_mov_b32 s5, s11 -; GFX9-O0-NEXT: s_mov_b32 s6, s10 -; GFX9-O0-NEXT: s_mov_b32 s7, s8 -; GFX9-O0-NEXT: ; kill: def $sgpr12_sgpr13_sgpr14_sgpr15 killed $sgpr4_sgpr5_sgpr6_sgpr7 -; GFX9-O0-NEXT: s_mov_b32 s8, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-O0-NEXT: s_mov_b32 s36, s4 +; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39 +; GFX9-O0-NEXT: s_mov_b32 s37, s5 +; GFX9-O0-NEXT: s_mov_b32 s38, s6 +; GFX9-O0-NEXT: s_mov_b32 s39, s7 +; GFX9-O0-NEXT: ; kill: def $sgpr40_sgpr41_sgpr42_sgpr43 killed $sgpr36_sgpr37_sgpr38_sgpr39 +; GFX9-O0-NEXT: s_mov_b32 s34, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s34 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 -; GFX9-O0-NEXT: s_getpc_b64 s[12:13] -; GFX9-O0-NEXT: s_add_u32 s12, s12, strict_wwm_called@rel32@lo+4 -; GFX9-O0-NEXT: s_addc_u32 s13, s13, strict_wwm_called@rel32@hi+12 -; GFX9-O0-NEXT: s_mov_b64 s[18:19], s[2:3] -; GFX9-O0-NEXT: s_mov_b64 s[16:17], s[0:1] -; GFX9-O0-NEXT: s_mov_b64 s[0:1], s[16:17] -; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[18:19] +; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 +; GFX9-O0-NEXT: s_getpc_b64 s[30:31] +; GFX9-O0-NEXT: s_add_u32 s30, s30, strict_wwm_called@rel32@lo+4 +; GFX9-O0-NEXT: s_addc_u32 s31, s31, strict_wwm_called@rel32@hi+12 +; GFX9-O0-NEXT: s_mov_b64 s[46:47], s[2:3] +; GFX9-O0-NEXT: s_mov_b64 s[44:45], s[0:1] +; GFX9-O0-NEXT: s_mov_b64 s[0:1], s[44:45] +; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[46:47] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[12:13] +; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[30:31] ; GFX9-O0-NEXT: v_readlane_b32 s30, v3, 0 ; GFX9-O0-NEXT: 
v_readlane_b32 s31, v3, 1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2 -; GFX9-O0-NEXT: s_mov_b64 exec, s[10:11] +; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[4:7], s8 offset:4 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[36:39], s34 offset:4 ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffffc00 ; GFX9-O0-NEXT: v_readlane_b32 s33, v3, 2 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-O3-LABEL: strict_wwm_call: ; GFX9-O3: ; %bb.0: ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-O3-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-O3-NEXT: s_mov_b64 exec, s[10:11] -; GFX9-O3-NEXT: s_mov_b32 s14, s33 +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O3-NEXT: s_mov_b32 s38, s33 ; GFX9-O3-NEXT: s_mov_b32 s33, s32 ; GFX9-O3-NEXT: s_addk_i32 s32, 0x400 -; GFX9-O3-NEXT: s_mov_b64 s[10:11], s[30:31] +; GFX9-O3-NEXT: s_mov_b64 s[36:37], s[30:31] ; GFX9-O3-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-O3-NEXT: s_getpc_b64 s[12:13] -; GFX9-O3-NEXT: s_add_u32 s12, s12, 
strict_wwm_called@rel32@lo+4 -; GFX9-O3-NEXT: s_addc_u32 s13, s13, strict_wwm_called@rel32@hi+12 -; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[12:13] +; GFX9-O3-NEXT: s_getpc_b64 s[30:31] +; GFX9-O3-NEXT: s_add_u32 s30, s30, strict_wwm_called@rel32@lo+4 +; GFX9-O3-NEXT: s_addc_u32 s31, s31, strict_wwm_called@rel32@hi+12 +; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[30:31] ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-O3-NEXT: v_add_u32_e32 v1, v1, v2 -; GFX9-O3-NEXT: s_mov_b64 exec, s[8:9] +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-O3-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 ; GFX9-O3-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-O3-NEXT: s_mov_b32 s33, s14 -; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-O3-NEXT: s_mov_b32 s33, s38 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O3-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O3-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: s_setpc_b64 s[10:11] +; GFX9-O3-NEXT: s_setpc_b64 s[36:37] %tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %arg, i32 0) %tmp134 = call amdgpu_gfx i32 @strict_wwm_called(i32 %tmp107) %tmp136 = add i32 %tmp134, %tmp107 @@ -449,32 +443,32 @@ ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-O0-NEXT: v_add_co_u32_e64 v4, s[4:5], v2, v3 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v1, s[4:5] +; GFX9-O0-NEXT: v_add_co_u32_e64 v4, s[34:35], v2, v3 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v0, s[34:35], v0, v1, s[34:35] ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-O0-NEXT: s_mov_b32 s4, 32 +; GFX9-O0-NEXT: s_mov_b32 s34, 32 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 -; 
GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s34, v[0:1] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-O0-NEXT: v_mul_lo_u32 v2, v0, v1 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mul_hi_u32 v1, v0, v6 -; GFX9-O0-NEXT: v_lshrrev_b64 v[7:8], s4, v[4:5] +; GFX9-O0-NEXT: v_lshrrev_b64 v[7:8], s34, v[4:5] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v7 ; GFX9-O0-NEXT: v_mul_lo_u32 v3, v3, v6 ; GFX9-O0-NEXT: v_add3_u32 v1, v1, v2, v3 -; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; implicit-def: $sgpr6 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr35 +; GFX9-O0-NEXT: ; implicit-def: $sgpr36 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s35 ; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O0-NEXT: v_lshlrev_b64 v[1:2], s4, v[1:2] +; GFX9-O0-NEXT: v_lshlrev_b64 v[1:2], s34, v[1:2] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-O0-NEXT: v_mul_lo_u32 v6, v0, v6 -; GFX9-O0-NEXT: s_mov_b32 s5, 0 +; GFX9-O0-NEXT: s_mov_b32 s35, 0 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v0 @@ -489,12 +483,12 @@ ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-O0-NEXT: v_sub_co_u32_e64 v1, s[6:7], v1, v3 -; GFX9-O0-NEXT: v_subb_co_u32_e64 v0, s[6:7], v0, v2, s[6:7] +; GFX9-O0-NEXT: v_sub_co_u32_e64 v1, s[36:37], v1, v3 +; GFX9-O0-NEXT: v_subb_co_u32_e64 v0, s[36:37], v0, v2, s[36:37] ; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-O0-NEXT: v_lshrrev_b64 v[1:2], s4, v[1:2] +; GFX9-O0-NEXT: v_lshrrev_b64 v[1:2], s34, v[1:2] ; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 
killed $exec ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] ; @@ -521,7 +515,7 @@ ; GFX9-O0-LABEL: strict_wwm_call_i64: ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-O0-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) @@ -536,78 +530,75 @@ ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[10:11] -; GFX9-O0-NEXT: v_writelane_b32 v11, s33, 9 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: v_writelane_b32 v11, s33, 8 ; GFX9-O0-NEXT: s_mov_b32 s33, s32 ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xc00 ; GFX9-O0-NEXT: v_writelane_b32 v11, s30, 0 ; GFX9-O0-NEXT: v_writelane_b32 v11, s31, 1 -; GFX9-O0-NEXT: v_writelane_b32 v11, s9, 2 -; GFX9-O0-NEXT: v_writelane_b32 v11, s8, 3 -; GFX9-O0-NEXT: s_mov_b32 s8, s6 -; GFX9-O0-NEXT: v_readlane_b32 s6, v11, 3 -; GFX9-O0-NEXT: v_writelane_b32 v11, s8, 4 -; GFX9-O0-NEXT: s_mov_b32 s12, s5 -; GFX9-O0-NEXT: v_readlane_b32 s5, v11, 4 -; GFX9-O0-NEXT: s_mov_b32 s8, s4 -; GFX9-O0-NEXT: v_readlane_b32 s4, v11, 2 -; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 -; GFX9-O0-NEXT: s_mov_b32 s9, s12 -; GFX9-O0-NEXT: s_mov_b32 s10, s5 -; GFX9-O0-NEXT: s_mov_b32 s11, s7 -; GFX9-O0-NEXT: v_writelane_b32 v11, s8, 5 -; GFX9-O0-NEXT: v_writelane_b32 v11, s9, 6 -; GFX9-O0-NEXT: v_writelane_b32 v11, s10, 7 -; GFX9-O0-NEXT: v_writelane_b32 v11, s11, 8 -; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 -; GFX9-O0-NEXT: s_mov_b32 s7, s4 -; GFX9-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr6_sgpr7 
-; GFX9-O0-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-O0-NEXT: s_mov_b32 s34, s8 +; GFX9-O0-NEXT: s_mov_b32 s36, s4 +; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39 +; GFX9-O0-NEXT: s_mov_b32 s37, s5 +; GFX9-O0-NEXT: s_mov_b32 s38, s6 +; GFX9-O0-NEXT: s_mov_b32 s39, s7 +; GFX9-O0-NEXT: v_writelane_b32 v11, s36, 2 +; GFX9-O0-NEXT: v_writelane_b32 v11, s37, 3 +; GFX9-O0-NEXT: v_writelane_b32 v11, s38, 4 +; GFX9-O0-NEXT: v_writelane_b32 v11, s39, 5 +; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35 +; GFX9-O0-NEXT: s_mov_b32 s35, s9 +; GFX9-O0-NEXT: ; kill: def $sgpr30_sgpr31 killed $sgpr34_sgpr35 +; GFX9-O0-NEXT: s_mov_b64 s[30:31], 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s35 ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v0 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s4 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, s5 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s30 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, s31 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[30:31], -1 +; GFX9-O0-NEXT: v_writelane_b32 v11, s30, 6 +; GFX9-O0-NEXT: v_writelane_b32 v11, s31, 7 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 -; GFX9-O0-NEXT: s_mov_b32 s4, 32 -; GFX9-O0-NEXT: ; implicit-def: $sgpr6_sgpr7 -; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s4, v[9:10] -; GFX9-O0-NEXT: s_getpc_b64 s[4:5] -; GFX9-O0-NEXT: s_add_u32 s4, s4, strict_wwm_called_i64@gotpcrel32@lo+4 -; GFX9-O0-NEXT: s_addc_u32 s5, s5, strict_wwm_called_i64@gotpcrel32@hi+12 -; GFX9-O0-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[2:3] -; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[0:1] -; GFX9-O0-NEXT: s_mov_b64 s[0:1], s[12:13] -; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[14:15] +; GFX9-O0-NEXT: s_mov_b32 s30, 32 +; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35 +; 
GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s30, v[9:10] +; GFX9-O0-NEXT: s_getpc_b64 s[30:31] +; GFX9-O0-NEXT: s_add_u32 s30, s30, strict_wwm_called_i64@gotpcrel32@lo+4 +; GFX9-O0-NEXT: s_addc_u32 s31, s31, strict_wwm_called_i64@gotpcrel32@hi+12 +; GFX9-O0-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0 +; GFX9-O0-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX9-O0-NEXT: s_mov_b64 s[36:37], s[0:1] +; GFX9-O0-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-O0-NEXT: v_readlane_b32 s4, v11, 5 -; GFX9-O0-NEXT: v_readlane_b32 s5, v11, 6 -; GFX9-O0-NEXT: v_readlane_b32 s6, v11, 7 -; GFX9-O0-NEXT: v_readlane_b32 s7, v11, 8 +; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[30:31] +; GFX9-O0-NEXT: v_readlane_b32 s34, v11, 6 +; GFX9-O0-NEXT: v_readlane_b32 s35, v11, 7 +; GFX9-O0-NEXT: v_readlane_b32 s36, v11, 2 +; GFX9-O0-NEXT: v_readlane_b32 s37, v11, 3 +; GFX9-O0-NEXT: v_readlane_b32 s38, v11, 4 +; GFX9-O0-NEXT: v_readlane_b32 s39, v11, 5 ; GFX9-O0-NEXT: v_readlane_b32 s30, v11, 0 ; GFX9-O0-NEXT: v_readlane_b32 s31, v11, 1 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v9 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 -; GFX9-O0-NEXT: v_add_co_u32_e64 v2, s[10:11], v2, v4 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v5, s[10:11] -; GFX9-O0-NEXT: s_mov_b64 exec, s[8:9] +; GFX9-O0-NEXT: v_add_co_u32_e64 v2, s[40:41], v2, v4 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v3, s[40:41], v3, v5, s[40:41] +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O0-NEXT: s_mov_b32 s8, 0 -; GFX9-O0-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], s8 offset:4 +; GFX9-O0-NEXT: s_mov_b32 s34, 0 +; GFX9-O0-NEXT: buffer_store_dwordx2 v[0:1], off, s[36:39], s34 offset:4 ; GFX9-O0-NEXT: s_add_i32 s32, s32, 
0xfffff400 -; GFX9-O0-NEXT: v_readlane_b32 s33, v11, 9 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-O0-NEXT: v_readlane_b32 s33, v11, 8 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload @@ -628,14 +619,14 @@ ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-O3-LABEL: strict_wwm_call_i64: ; GFX9-O3: ; %bb.0: ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-O3-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill @@ -644,37 +635,37 @@ ; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-O3-NEXT: s_mov_b64 exec, s[10:11] -; GFX9-O3-NEXT: s_mov_b32 s14, s33 +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O3-NEXT: s_mov_b32 s38, s33 ; GFX9-O3-NEXT: s_mov_b32 s33, s32 ; GFX9-O3-NEXT: s_addk_i32 s32, 0x800 -; GFX9-O3-NEXT: s_mov_b64 s[10:11], s[30:31] +; GFX9-O3-NEXT: s_mov_b64 s[36:37], s[30:31] ; GFX9-O3-NEXT: v_mov_b32_e32 v6, s8 ; GFX9-O3-NEXT: v_mov_b32_e32 v7, s9 ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: s_or_saveexec_b64 s[8:9], -1 -; 
GFX9-O3-NEXT: s_getpc_b64 s[12:13] -; GFX9-O3-NEXT: s_add_u32 s12, s12, strict_wwm_called_i64@gotpcrel32@lo+4 -; GFX9-O3-NEXT: s_addc_u32 s13, s13, strict_wwm_called_i64@gotpcrel32@hi+12 -; GFX9-O3-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O3-NEXT: s_getpc_b64 s[30:31] +; GFX9-O3-NEXT: s_add_u32 s30, s30, strict_wwm_called_i64@gotpcrel32@lo+4 +; GFX9-O3-NEXT: s_addc_u32 s31, s31, strict_wwm_called_i64@gotpcrel32@hi+12 +; GFX9-O3-NEXT: s_load_dwordx2 s[30:31], s[30:31], 0x0 ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7 ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[12:13] +; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[30:31] ; GFX9-O3-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O3-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 ; GFX9-O3-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc -; GFX9-O3-NEXT: s_mov_b64 exec, s[8:9] +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-O3-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:4 ; GFX9-O3-NEXT: s_addk_i32 s32, 0xf800 -; GFX9-O3-NEXT: s_mov_b32 s33, s14 -; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-O3-NEXT: s_mov_b32 s33, s38 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[30:31], -1 ; GFX9-O3-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_nop 0 ; GFX9-O3-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -685,9 +676,9 @@ ; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_nop 0 ; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O3-NEXT: s_mov_b64 exec, s[30:31] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: s_setpc_b64 s[10:11] +; GFX9-O3-NEXT: s_setpc_b64 s[36:37] %tmp107 = tail call i64 
@llvm.amdgcn.set.inactive.i64(i64 %arg, i64 0) %tmp134 = call amdgpu_gfx i64 @strict_wwm_called_i64(i64 %tmp107) %tmp136 = add i64 %tmp134, %tmp107 @@ -701,38 +692,36 @@ ; GFX9-O0-LABEL: strict_wwm_amdgpu_cs_main: ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[8:9] -; GFX9-O0-NEXT: s_mov_b32 s8, s7 -; GFX9-O0-NEXT: s_mov_b32 s9, s6 -; GFX9-O0-NEXT: s_mov_b32 s10, s5 -; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 -; GFX9-O0-NEXT: s_mov_b32 s5, s10 -; GFX9-O0-NEXT: s_mov_b32 s6, s9 -; GFX9-O0-NEXT: s_mov_b32 s7, s8 -; GFX9-O0-NEXT: ; kill: def $sgpr8_sgpr9_sgpr10_sgpr11 killed $sgpr4_sgpr5_sgpr6_sgpr7 -; GFX9-O0-NEXT: s_mov_b32 s8, 5 -; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s8, v0 -; GFX9-O0-NEXT: s_mov_b32 s8, 0 -; GFX9-O0-NEXT: buffer_load_dwordx4 v[10:13], v0, s[4:7], s8 offen -; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], v0, s[4:7], s8 offen offset:16 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: s_mov_b32 s36, s4 +; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39 +; GFX9-O0-NEXT: s_mov_b32 s37, s5 +; GFX9-O0-NEXT: s_mov_b32 s38, s6 +; GFX9-O0-NEXT: s_mov_b32 s39, s7 +; GFX9-O0-NEXT: ; kill: def $sgpr40_sgpr41_sgpr42_sgpr43 killed $sgpr36_sgpr37_sgpr38_sgpr39 +; GFX9-O0-NEXT: s_mov_b32 s34, 5 +; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s34, v0 +; GFX9-O0-NEXT: s_mov_b32 s34, 0 +; GFX9-O0-NEXT: buffer_load_dwordx4 v[10:13], v0, s[36:39], s34 offen +; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], v0, s[36:39], s34 offen offset:16 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 ; 
GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 -; GFX9-O0-NEXT: s_mov_b32 s9, 0x7fffffff -; GFX9-O0-NEXT: s_mov_b32 s10, -1 -; GFX9-O0-NEXT: ; kill: def $sgpr10 killed $sgpr10 def $sgpr10_sgpr11 -; GFX9-O0-NEXT: s_mov_b32 s11, s9 +; GFX9-O0-NEXT: s_mov_b32 s35, 0x7fffffff +; GFX9-O0-NEXT: s_mov_b32 s40, -1 +; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41 +; GFX9-O0-NEXT: s_mov_b32 s41, s35 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s11 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s40 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s41 ; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 @@ -743,8 +732,8 @@ ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s11 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s40 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s41 ; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 @@ -752,8 +741,8 @@ ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s11 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s40 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s41 ; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 @@ -765,20 +754,20 @@ ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v9 -; GFX9-O0-NEXT: buffer_store_dwordx4 v[5:8], v0, s[4:7], s8 offen -; GFX9-O0-NEXT: buffer_store_dwordx2 v[3:4], v0, s[4:7], s8 offen offset:16 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-O0-NEXT: 
buffer_store_dwordx4 v[5:8], v0, s[36:39], s34 offen +; GFX9-O0-NEXT: buffer_store_dwordx2 v[3:4], v0, s[36:39], s34 offen offset:16 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-O3-LABEL: strict_wwm_amdgpu_cs_main: ; GFX9-O3: ; %bb.0: ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-O3-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill @@ -787,25 +776,25 @@ ; GFX9-O3-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-O3-NEXT: s_mov_b64 exec, s[8:9] +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; GFX9-O3-NEXT: buffer_load_dwordx4 v[1:4], v0, s[4:7], 0 offen ; GFX9-O3-NEXT: buffer_load_dwordx2 v[5:6], v0, s[4:7], 0 offen offset:16 -; GFX9-O3-NEXT: s_mov_b32 s8, -1 -; GFX9-O3-NEXT: s_brev_b32 s9, -2 +; GFX9-O3-NEXT: s_mov_b32 s34, -1 +; GFX9-O3-NEXT: s_brev_b32 s35, -2 ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: s_waitcnt vmcnt(1) -; GFX9-O3-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-O3-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-O3-NEXT: v_mov_b32_e32 v1, s34 +; GFX9-O3-NEXT: v_mov_b32_e32 v2, s35 ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v3, s8 -; GFX9-O3-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-O3-NEXT: v_mov_b32_e32 v3, s34 +; 
GFX9-O3-NEXT: v_mov_b32_e32 v4, s35 ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v5, s8 -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s9 +; GFX9-O3-NEXT: v_mov_b32_e32 v5, s34 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, s35 ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3 @@ -815,7 +804,7 @@ ; GFX9-O3-NEXT: v_mov_b32_e32 v12, v6 ; GFX9-O3-NEXT: buffer_store_dwordx4 v[7:10], v0, s[4:7], 0 offen ; GFX9-O3-NEXT: buffer_store_dwordx2 v[11:12], v0, s[4:7], 0 offen offset:16 -; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_nop 0 ; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -827,7 +816,7 @@ ; GFX9-O3-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_nop 0 ; GFX9-O3-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: s_setpc_b64 s[30:31] %tmp17 = shl i32 %index, 5