Index: llvm/lib/Target/AMDGPU/AMDGPUGISel.td =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -74,6 +74,9 @@ def gi_flat_offset_signed : GIComplexOperandMatcher, GIComplexPatternEquiv; +def gi_global_saddr : + GIComplexOperandMatcher, + GIComplexPatternEquiv; def gi_mubuf_scratch_offset : GIComplexOperandMatcher, Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -185,6 +185,9 @@ InstructionSelector::ComplexRendererFns selectFlatOffsetSigned(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectGlobalSAddr(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns selectMUBUFScratchOffen(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -3254,6 +3254,79 @@ return selectFlatOffsetImpl(Root); } +/// Match a zero extend from a 32-bit value to 64-bits. +static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) { + Register ZExtSrc; + if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc)))) + return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register(); + + // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0) + const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI); + if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES) + return false; + + int64_t MergeRHS; + if (mi_match(Def->getOperand(2).getReg(), MRI, m_ICst(MergeRHS)) && + MergeRHS == 0) { + return Def->getOperand(1).getReg(); + } + + return Register(); +} + +// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset) +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const { + Register PtrBase; + int64_t ImmOffset; + + // Match the immediate offset first, which canonically is moved as low as + // possible. + std::tie(PtrBase, ImmOffset) = getPtrBaseWithConstantOffset(Root.getReg(), + *MRI); + + // TODO: Could split larger constant into VGPR offset. + if (ImmOffset != 0 && + !TII.isLegalFLATOffset(ImmOffset, AMDGPUAS::GLOBAL_ADDRESS, true)) { + PtrBase = Root.getReg(); + ImmOffset = 0; + } + + // Match the variable offset. + const MachineInstr *PtrBaseDef = getDefIgnoringCopies(PtrBase, *MRI); + if (PtrBaseDef->getOpcode() != AMDGPU::G_PTR_ADD) + return None; + + // Look through the SGPR->VGPR copy. + Register PtrBaseSrc = + getSrcRegIgnoringCopies(PtrBaseDef->getOperand(1).getReg(), *MRI); + if (!PtrBaseSrc) + return None; + + const RegisterBank *BaseRB = RBI.getRegBank(PtrBaseSrc, *MRI, TRI); + if (BaseRB->getID() != AMDGPU::SGPRRegBankID) + return None; + + Register SAddr = PtrBaseSrc; + Register PtrBaseOffset = PtrBaseDef->getOperand(2).getReg(); + + // It's possible voffset is an SGPR here, but the copy to VGPR will be + // inserted later. + Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset); + if (!VOffset) + return None; + + return {{[=](MachineInstrBuilder &MIB) { // saddr + MIB.addReg(SAddr); + }, + [=](MachineInstrBuilder &MIB) { // voffset + MIB.addReg(VOffset); + }, + [=](MachineInstrBuilder &MIB) { // offset + MIB.addImm(ImmOffset); + }}}; +} + static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) { auto PSV = PtrInfo.V.dyn_cast(); return PSV && PSV->isStack(); Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir @@ -0,0 +1,275 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s + +# TODO: Better to initialize 0 vgpr and use sgpr base +--- + +name: load_global_s32_from_sgpr +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + + ; GFX9-LABEL: name: load_global_s32_from_sgpr + ; GFX9: liveins: $sgpr0_sgpr1 + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[COPY]] + ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY1]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; GFX10-LABEL: name: load_global_s32_from_sgpr + ; GFX10: liveins: $sgpr0_sgpr1 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[COPY]] + ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY1]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + %0:sgpr(p1) = COPY $sgpr0_sgpr1 + %1:vgpr(p1) = COPY %0 + %2:vgpr(s32) = G_LOAD %1 :: (load 4, align 4, addrspace 1) + $vgpr0 = COPY %2 + +... + +# FIXME: This zext wouldn't select on its own. +--- + +name: load_global_s32_from_sgpr_zext_vgpr +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1, $vgpr0 + + ; GFX9-LABEL: name: load_global_s32_from_sgpr_zext_vgpr + ; GFX9: liveins: $sgpr0_sgpr1, $vgpr0 + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] + ; GFX10-LABEL: name: load_global_s32_from_sgpr_zext_vgpr + ; GFX10: liveins: $sgpr0_sgpr1, $vgpr0 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] + %0:sgpr(p1) = COPY $sgpr0_sgpr1 + %1:vgpr(s32) = COPY $vgpr0 + %2:vgpr(p1) = COPY %0 + %3:vgpr(s64) = G_ZEXT %1 + %4:vgpr(p1) = G_PTR_ADD %2, %3 + %5:vgpr(s32) = G_LOAD %4 :: (load 4, align 4, addrspace 1) + $vgpr0 = COPY %5 + +... + +# Test with zext lowered to G_MERGE_VALUES +--- + +name: load_global_s32_from_sgpr_merge_zext_vgpr +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1, $vgpr0 + + ; GFX9-LABEL: name: load_global_s32_from_sgpr_merge_zext_vgpr + ; GFX9: liveins: $sgpr0_sgpr1, $vgpr0 + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] + ; GFX10-LABEL: name: load_global_s32_from_sgpr_merge_zext_vgpr + ; GFX10: liveins: $sgpr0_sgpr1, $vgpr0 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] + %0:sgpr(p1) = COPY $sgpr0_sgpr1 + %1:vgpr(s32) = COPY $vgpr0 + %2:vgpr(p1) = COPY %0 + %zero:vgpr(s32) = G_CONSTANT i32 0 + %3:vgpr(s64) = G_MERGE_VALUES %1, %zero + %4:vgpr(p1) = G_PTR_ADD %2, %3 + %5:vgpr(s32) = G_LOAD %4 :: (load 4, align 4, addrspace 1) + $vgpr0 = COPY %5 + +... + +--- + +name: load_global_s32_from_sgpr_merge_not_0_vgpr +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1, $vgpr0 + + ; GFX9-LABEL: name: load_global_s32_from_sgpr_merge_not_0_vgpr + ; GFX9: liveins: $sgpr0_sgpr1, $vgpr0 + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[COPY]] + ; GFX9: %notzero:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, %notzero, %subreg.sub1 + ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY2]].sub0 + ; GFX9: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX9: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY2]].sub1 + ; GFX9: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec + ; GFX9: %12:vgpr_32, dead %14:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1 + ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; GFX10-LABEL: name: load_global_s32_from_sgpr_merge_not_0_vgpr + ; GFX10: liveins: $sgpr0_sgpr1, $vgpr0 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[COPY]] + ; GFX10: %notzero:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, %notzero, %subreg.sub1 + ; GFX10: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY2]].sub0 + ; GFX10: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY2]].sub1 + ; GFX10: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec + ; GFX10: %12:vgpr_32, dead %14:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1 + ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + %0:sgpr(p1) = COPY $sgpr0_sgpr1 + %1:vgpr(s32) = COPY $vgpr0 + %2:vgpr(p1) = COPY %0 + %notzero:vgpr(s32) = G_CONSTANT i32 1 + %3:vgpr(s64) = G_MERGE_VALUES %1, %notzero + %4:vgpr(p1) = G_PTR_ADD %2, %3 + %5:vgpr(s32) = G_LOAD %4 :: (load 4, align 4, addrspace 1) + $vgpr0 = COPY %5 + +... + +--- + +name: load_global_s32_from_sgpr_zext_vgpr_offset4095 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1, $vgpr0 + + ; GFX9-LABEL: name: load_global_s32_from_sgpr_zext_vgpr_offset4095 + ; GFX9: liveins: $sgpr0_sgpr1, $vgpr0 + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], 4095, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] + ; GFX10-LABEL: name: load_global_s32_from_sgpr_zext_vgpr_offset4095 + ; GFX10: liveins: $sgpr0_sgpr1, $vgpr0 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[COPY]] + ; GFX10: %zero:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX10: %zext:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, %zero, %subreg.sub1 + ; GFX10: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY2]].sub0 + ; GFX10: [[COPY4:%[0-9]+]]:vgpr_32 = COPY %zext.sub0 + ; GFX10: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY2]].sub1 + ; GFX10: [[COPY6:%[0-9]+]]:vgpr_32 = COPY %zext.sub1 + ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec + ; GFX10: %24:vgpr_32, dead %26:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %24, %subreg.sub1 + ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec + ; GFX10: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX10: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX10: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY7]], [[COPY8]], 0, implicit $exec + ; GFX10: %14:vgpr_32, dead %16:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY9]], [[COPY10]], killed [[V_ADD_CO_U32_e64_3]], 0, implicit $exec + ; GFX10: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_2]], %subreg.sub0, %14, %subreg.sub1 + ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + %0:sgpr(p1) = COPY $sgpr0_sgpr1 + %1:vgpr(s32) = COPY $vgpr0 + %2:vgpr(p1) = COPY %0 + %zero:vgpr(s32) = G_CONSTANT i32 0 + %zext:vgpr(s64) = G_MERGE_VALUES %1, %zero + %4:vgpr(p1) = G_PTR_ADD %2, %zext + %5:vgpr(s64) = G_CONSTANT i64 4095 + %6:vgpr(p1) = G_PTR_ADD %4, %5 + %7:vgpr(s32) = G_LOAD %6 :: (load 4, align 4, addrspace 1) + $vgpr0 = COPY %7 + +... + +--- + +name: load_global_s32_from_sgpr_zext_vgpr_offset_neg4096 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1, $vgpr0 + + ; GFX9-LABEL: name: load_global_s32_from_sgpr_zext_vgpr_offset_neg4096 + ; GFX9: liveins: $sgpr0_sgpr1, $vgpr0 + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], -4096, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] + ; GFX10-LABEL: name: load_global_s32_from_sgpr_zext_vgpr_offset_neg4096 + ; GFX10: liveins: $sgpr0_sgpr1, $vgpr0 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[COPY]] + ; GFX10: %zero:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX10: %zext:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, %zero, %subreg.sub1 + ; GFX10: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY2]].sub0 + ; GFX10: [[COPY4:%[0-9]+]]:vgpr_32 = COPY %zext.sub0 + ; GFX10: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY2]].sub1 + ; GFX10: [[COPY6:%[0-9]+]]:vgpr_32 = COPY %zext.sub1 + ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec + ; GFX10: %24:vgpr_32, dead %26:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %24, %subreg.sub1 + ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec + ; GFX10: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec + ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; GFX10: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GFX10: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GFX10: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; GFX10: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 + ; GFX10: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY7]], [[COPY8]], 0, implicit $exec + ; GFX10: %14:vgpr_32, dead %16:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY9]], [[COPY10]], killed [[V_ADD_CO_U32_e64_3]], 0, implicit $exec + ; GFX10: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_2]], %subreg.sub0, %14, %subreg.sub1 + ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + %0:sgpr(p1) = COPY $sgpr0_sgpr1 + %1:vgpr(s32) = COPY $vgpr0 + %2:vgpr(p1) = COPY %0 + %zero:vgpr(s32) = G_CONSTANT i32 0 + %zext:vgpr(s64) = G_MERGE_VALUES %1, %zero + %4:vgpr(p1) = G_PTR_ADD %2, %zext + %5:vgpr(s64) = G_CONSTANT i64 -4096 + %6:vgpr(p1) = G_PTR_ADD %4, %5 + %7:vgpr(s32) = G_LOAD %6 :: (load 4, align 4, addrspace 1) + $vgpr0 = COPY %7 + +... Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll @@ -407,22 +407,12 @@ ; GFX9-LABEL: global_atomic_inc_ret_i32_offset_addr64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 20, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: v_mov_b32_e32 v4, 42 -; GFX9-NEXT: global_atomic_inc v2, v[2:3], v4, off glc +; GFX9-NEXT: global_atomic_inc v1, v0, v1, s[2:3] offset:20 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id @@ -467,16 +457,10 @@ ; GFX9-LABEL: global_atomic_inc_noret_i32_offset_addr64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 20, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, 42 -; GFX9-NEXT: global_atomic_inc v0, v[0:1], v2, off glc +; GFX9-NEXT: global_atomic_inc v0, v0, v1, s[0:1] offset:20 glc ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id @@ -964,22 +948,12 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 40, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: v_mov_b32_e32 v4, 42 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: global_atomic_inc_x2 v[2:3], v[2:3], v[4:5], off glc +; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] offset:40 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id @@ -1027,16 +1001,10 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 40, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[0:1] offset:40 glc ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll @@ -897,32 +897,25 @@ ; ; GFX10_W32-LABEL: test_div_fmas_f32_logical_cond_to_vcc: ; GFX10_W32: ; %bb.0: -; GFX10_W32-NEXT: s_clause 0x1 ; GFX10_W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10_W32-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0x54 -; GFX10_W32-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GFX10_W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10_W32-NEXT: ; implicit-def: $vcc_hi ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s6 -; GFX10_W32-NEXT: v_mov_b32_e32 v2, s7 +; GFX10_W32-NEXT: s_clause 0x2 +; GFX10_W32-NEXT: global_load_dword v2, v1, s[6:7] +; GFX10_W32-NEXT: global_load_dword v3, v1, s[6:7] offset:4 +; GFX10_W32-NEXT: global_load_dword v1, v1, s[6:7] offset:8 ; GFX10_W32-NEXT: s_add_u32 s0, s4, 8 ; GFX10_W32-NEXT: s_addc_u32 s1, s5, 0 ; GFX10_W32-NEXT: s_cmp_lg_u32 s2, 0 -; GFX10_W32-NEXT: v_add_co_u32_e64 v1, vcc_lo, v1, v3 ; GFX10_W32-NEXT: s_cselect_b32 s2, 1, 0 -; GFX10_W32-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo ; GFX10_W32-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W32-NEXT: v_add_co_u32_e64 v3, vcc_lo, v1, 8 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 s2, 0, s2 -; GFX10_W32-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v2, vcc_lo -; GFX10_W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10_W32-NEXT: s_clause 0x2 -; GFX10_W32-NEXT: global_load_dword v1, v[1:2], off -; GFX10_W32-NEXT: global_load_dword v2, v[3:4], off offset:-4 -; GFX10_W32-NEXT: global_load_dword v3, v[3:4], off ; GFX10_W32-NEXT: s_and_b32 vcc_lo, vcc_lo, s2 ; GFX10_W32-NEXT: s_waitcnt vmcnt(0) -; GFX10_W32-NEXT: v_div_fmas_f32 v2, v1, v2, v3 +; GFX10_W32-NEXT: v_div_fmas_f32 v2, v2, v3, v1 ; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1 ; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off @@ -930,31 +923,24 @@ ; ; GFX10_W64-LABEL: test_div_fmas_f32_logical_cond_to_vcc: ; GFX10_W64: ; %bb.0: -; GFX10_W64-NEXT: s_clause 0x1 ; GFX10_W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10_W64-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0x54 -; GFX10_W64-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GFX10_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: v_mov_b32_e32 v1, s6 -; GFX10_W64-NEXT: v_mov_b32_e32 v2, s7 +; GFX10_W64-NEXT: s_clause 0x2 +; GFX10_W64-NEXT: global_load_dword v2, v1, s[6:7] +; GFX10_W64-NEXT: global_load_dword v3, v1, s[6:7] offset:4 +; GFX10_W64-NEXT: global_load_dword v1, v1, s[6:7] offset:8 ; GFX10_W64-NEXT: s_add_u32 s0, s4, 8 ; GFX10_W64-NEXT: s_addc_u32 s1, s5, 0 ; GFX10_W64-NEXT: s_cmp_lg_u32 s2, 0 -; GFX10_W64-NEXT: v_add_co_u32_e64 v1, vcc, v1, v3 ; GFX10_W64-NEXT: s_cselect_b32 s2, 1, 0 -; GFX10_W64-NEXT: v_add_co_ci_u32_e32 v2, vcc, 0, v2, vcc ; GFX10_W64-NEXT: s_and_b32 s2, 1, s2 -; GFX10_W64-NEXT: v_add_co_u32_e64 v3, vcc, v1, 8 ; GFX10_W64-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, s2 -; GFX10_W64-NEXT: v_add_co_ci_u32_e32 v4, vcc, 0, v2, vcc -; GFX10_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10_W64-NEXT: s_clause 0x2 -; GFX10_W64-NEXT: global_load_dword v1, v[1:2], off -; GFX10_W64-NEXT: global_load_dword v2, v[3:4], off offset:-4 -; GFX10_W64-NEXT: global_load_dword v3, v[3:4], off ; GFX10_W64-NEXT: s_and_b64 vcc, vcc, s[2:3] ; GFX10_W64-NEXT: s_waitcnt vmcnt(0) -; GFX10_W64-NEXT: v_div_fmas_f32 v2, v1, v2, v3 +; GFX10_W64-NEXT: v_div_fmas_f32 v2, v2, v3, v1 ; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1 ; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off @@ -1051,17 +1037,14 @@ ; GFX10_W32-LABEL: test_div_fmas_f32_i1_phi_vcc: ; GFX10_W32: ; %bb.0: ; %entry ; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; GFX10_W32-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GFX10_W32-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX10_W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10_W32-NEXT: s_mov_b32 s4, 0 ; GFX10_W32-NEXT: ; implicit-def: $vcc_hi ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s2 -; GFX10_W32-NEXT: v_mov_b32_e32 v2, s3 +; GFX10_W32-NEXT: global_load_dwordx3 v[1:3], v1, s[2:3] +; GFX10_W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10_W32-NEXT: v_add_co_u32_e64 v1, vcc_lo, v1, v3 -; GFX10_W32-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo -; GFX10_W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10_W32-NEXT: global_load_dwordx3 v[1:3], v[1:2], off ; GFX10_W32-NEXT: s_and_saveexec_b32 s5, vcc_lo ; GFX10_W32-NEXT: s_cbranch_execz BB13_2 ; GFX10_W32-NEXT: ; %bb.1: ; %bb @@ -1072,7 +1055,6 @@ ; GFX10_W32-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10_W32-NEXT: s_cselect_b32 s4, 1, 0 ; GFX10_W32-NEXT: BB13_2: ; %exit -; GFX10_W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10_W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10_W32-NEXT: s_and_b32 s0, 1, s4 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 @@ -1089,16 +1071,13 @@ ; GFX10_W64-LABEL: test_div_fmas_f32_i1_phi_vcc: ; GFX10_W64: ; %bb.0: ; %entry ; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; GFX10_W64-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GFX10_W64-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX10_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10_W64-NEXT: s_mov_b32 s6, 0 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: v_mov_b32_e32 v1, s2 -; GFX10_W64-NEXT: v_mov_b32_e32 v2, s3 +; GFX10_W64-NEXT: global_load_dwordx3 v[1:3], v1, s[2:3] +; GFX10_W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10_W64-NEXT: v_add_co_u32_e64 v1, vcc, v1, v3 -; GFX10_W64-NEXT: v_add_co_ci_u32_e32 v2, vcc, 0, v2, vcc -; GFX10_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10_W64-NEXT: global_load_dwordx3 v[1:3], v[1:2], off ; GFX10_W64-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX10_W64-NEXT: s_cbranch_execz BB13_2 ; GFX10_W64-NEXT: ; %bb.1: ; %bb @@ -1109,7 +1088,6 @@ ; GFX10_W64-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10_W64-NEXT: s_cselect_b32 s6, 1, 0 ; GFX10_W64-NEXT: BB13_2: ; %exit -; GFX10_W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10_W64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX10_W64-NEXT: s_and_b32 s0, 1, s6 ; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll @@ -46,20 +46,14 @@ ; GFX10-LABEL: test_div_scale_f32_1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 4 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: global_load_dword v1, v[2:3], off +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s2, v1, v1, v0 +; GFX10-NEXT: v_div_scale_f32 v2, s2, v0, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: global_store_dword v[0:1], v2, off @@ -119,20 +113,14 @@ ; GFX10-LABEL: test_div_scale_f32_2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 4 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: global_load_dword v1, v[2:3], off +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s2, v0, v1, v0 +; GFX10-NEXT: v_div_scale_f32 v2, s2, v1, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: global_store_dword v[0:1], v2, off @@ -200,15 +188,9 @@ ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 8 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: global_load_dwordx2 v[2:3], v[2:3], off +; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v2, s[2:3] offset:8 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[2:3], v[2:3], v[0:1] ; GFX10-NEXT: v_mov_b32_e32 v3, s1 @@ -278,15 +260,9 @@ ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 8 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: global_load_dwordx2 v[2:3], v[2:3], off +; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v2, s[2:3] offset:8 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[0:1], v[2:3], v[0:1] ; GFX10-NEXT: v_mov_b32_e32 v3, s1 @@ -345,17 +321,12 @@ ; ; GFX10-LABEL: test_div_scale_f32_scalar_num_1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x54 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v2, s0, v0, v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 @@ -412,17 +383,12 @@ ; ; GFX10-LABEL: test_div_scale_f32_scalar_num_2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v2, s0, s0, v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 @@ -479,17 +445,12 @@ ; ; GFX10-LABEL: test_div_scale_f32_scalar_den_1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v2, s0, s0, s0, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 @@ -546,17 +507,12 @@ ; ; GFX10-LABEL: test_div_scale_f32_scalar_den_2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v2, s0, v0, s0, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 @@ -613,19 +569,14 @@ ; ; GFX10-LABEL: test_div_scale_f64_scalar_num_1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] ; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], v[0:1], s[0:1] ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off @@ -680,19 +631,14 @@ ; ; GFX10-LABEL: test_div_scale_f64_scalar_num_2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] ; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[0:1], v[0:1], s[0:1] ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off @@ -747,19 +693,14 @@ ; ; GFX10-LABEL: test_div_scale_f64_scalar_den_1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] ; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[0:1], s[0:1], v[0:1] ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off @@ -814,19 +755,14 @@ ; ; GFX10-LABEL: test_div_scale_f64_scalar_den_2: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] ; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], s[0:1], v[0:1] ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off @@ -1068,14 +1004,10 @@ ; GFX10-LABEL: test_div_scale_f32_inline_imm_num: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v2, s2, v0, v0, 1.0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -1130,14 +1062,10 @@ ; GFX10-LABEL: test_div_scale_f32_inline_imm_den: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v2, s2, 2.0, 2.0, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -1200,22 +1128,16 @@ ; GFX10-LABEL: test_div_scale_f32_fabs_num: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 4 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: global_load_dword v1, v[2:3], off +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s2, v1, v1, v0 +; GFX10-NEXT: v_div_scale_f32 v2, s2, v0, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: global_store_dword v[0:1], v2, off @@ -1279,21 +1201,15 @@ ; GFX10-LABEL: test_div_scale_f32_fabs_den: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 4 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: global_load_dword v1, v[2:3], off +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; GFX10-NEXT: v_div_scale_f32 v2, s2, v1, v1, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX10-NEXT: v_div_scale_f32 v2, s2, v0, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: global_store_dword v[0:1], v2, off Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll @@ -25,13 +25,9 @@ ; GFX9-LABEL: is_private_vgpr: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] ; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll @@ -25,13 +25,9 @@ ; GFX9-LABEL: is_local_vgpr: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] ; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll @@ -58,20 +58,16 @@ ; GFX10-LABEL: update_dpp64_test: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_mov_b32_e32 v5, s3 -; GFX10-NEXT: v_mov_b32_e32 v4, s2 -; GFX10-NEXT: v_add_co_u32_e64 v6, vcc_lo, v0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: global_load_dwordx2 v[2:3], v[6:7], off +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 -; GFX10-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 -; GFX10-NEXT: global_store_dwordx2 v[6:7], v[4:5], off +; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX10-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %id Index: llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll @@ -247,24 +247,16 @@ ; GFX9-LABEL: muli24_shl64: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 2, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc -; GFX9-NEXT: global_load_dword v4, v[1:2], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v1, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_e32 v0, 0xff800000, v4 -; GFX9-NEXT: v_mul_i32_i24_e32 v0, -7, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] -; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: v_or_b32_e32 v1, 0xff800000, v1 +; GFX9-NEXT: v_mul_i32_i24_e32 v1, -7, v1 +; GFX9-NEXT: v_lshlrev_b64 v[1:2], 3, v[1:2] +; GFX9-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1] ; GFX9-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()