Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -539,10 +539,12 @@
   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);

   MachineInstrBuilder MIB =
     BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
+  LaneBitmask Covered = LaneBitmask::getNone();
   for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
     MachineOperand &Src = MI.getOperand(I + 1);
     MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
     MIB.addImm(SubRegs[I]);
+    Covered |= TRI.getSubRegIndexLaneMask(SubRegs[I]);
     const TargetRegisterClass *SrcRC
       = TRI.getConstrainedRegClassForOperand(Src, *MRI);
@@ -550,6 +552,32 @@
       return false;
   }

+  // There can be a remainder if DstRC is wider than the sources.
+  LaneBitmask Remainder = DstRC->getLaneMask() & ~Covered;
+  if (Remainder.any()) {
+    const TargetRegisterClass *PartRC =
+        TRI.getRegClassForSizeOnBank(32, *DstBank, *MRI);
+
+    for (unsigned Idx = 1, E = TRI.getNumSubRegIndices() - 1; Idx < E; ++Idx) {
+      if (TRI.getSubRegIdxSize(Idx) != 32)
+        continue;
+      LaneBitmask SubMask = TRI.getSubRegIndexLaneMask(Idx);
+      if ((SubMask & Remainder).none())
+        continue;
+
+      Register UndefReg = MRI->createVirtualRegister(PartRC);
+      BuildMI(*BB, &*MIB, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
+      MIB.addReg(UndefReg, RegState::Undef);
+      MIB.addImm(Idx);
+
+      Remainder &= ~SubMask;
+      if (Remainder.none())
+        break;
+    }
+
+    assert(Remainder.none());
+  }
+
   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
     return false;

Index: llvm/lib/Target/AMDGPU/SIRegisterInfo.h
===================================================================
--- llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -33,6 +33,12 @@
   bool isWave32;
   BitVector RegPressureIgnoredUnits;

+  /// Sub reg indexes for getRegSplitParts.
+  /// First index represents subreg size.
+  /// The inner vector is sorted by bit offset.
+  /// All elements of the inner vector combined give a full lane mask.
+  std::vector<std::vector<int16_t>> RegSplitParts;
+
   void reserveRegisterTuples(BitVector &, MCRegister Reg) const;

 public:
@@ -118,9 +124,12 @@
     return getEncodingValue(Reg) & 0xff;
   }

-  static const TargetRegisterClass *getVGPRClassForBitWidth(unsigned BitWidth);
-  static const TargetRegisterClass *getAGPRClassForBitWidth(unsigned BitWidth);
-  static const TargetRegisterClass *getSGPRClassForBitWidth(unsigned BitWidth);
+  static const TargetRegisterClass *getVGPRClassForBitWidth(unsigned BitWidth,
+                                                             bool Exact = true);
+  static const TargetRegisterClass *getAGPRClassForBitWidth(unsigned BitWidth,
+                                                             bool Exact = true);
+  static const TargetRegisterClass *getSGPRClassForBitWidth(unsigned BitWidth,
+                                                             bool Exact = true);

   /// Return the 'base' register class for this register.
   /// e.g. SGPR0 => SReg_32, VGPR => VGPR_32 SGPR0_SGPR1 -> SReg_32, etc.
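Note on the RegSplitParts table declared above: the scheme is easiest to see with a small standalone model. The C++ sketch below is illustrative only and is not part of the patch; the numeric index values are made-up stand-ins for the generated AMDGPU sub-register indexes (sub0..sub31, sub0_sub1, ...), but the indexing scheme — outer index is the subreg size in DWORDs minus one, inner vector indexed by size-aligned offset — follows the header comment and the lookup that getRegSplitParts performs in the SIRegisterInfo.cpp change that follows.

// Standalone sketch of the RegSplitParts indexing scheme (not part of the
// patch). The values 100+ and 200+ are hypothetical stand-ins for the real
// generated sub-register indexes.
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  // Outer index: subreg size in DWORDs minus 1 (sizes of 1..16 DWORDs).
  // Inner vector: one subreg index per size-aligned position, so that all
  // entries of an inner vector together cover a full 1024-bit register.
  std::vector<std::vector<int16_t>> RegSplitParts(16);

  // Fill the 1-DWORD row (stand-ins for sub0..sub31) and the 2-DWORD row
  // (stand-ins for sub0_sub1, sub2_sub3, ...), mirroring the constructor
  // loop added to SIRegisterInfo.cpp below.
  for (unsigned SizeDW : {1u, 2u}) {
    std::vector<int16_t> &Vec = RegSplitParts[SizeDW - 1];
    Vec.resize(32 / SizeDW); // 1024 bits / (SizeDW * 32 bits) positions
    for (unsigned Pos = 0; Pos < Vec.size(); ++Pos)
      Vec[Pos] = static_cast<int16_t>(SizeDW * 100 + Pos);
  }

  // Lookup mirroring getRegSplitParts: splitting a 256-bit register
  // (8 DWORDs) into 64-bit (2-DWORD) elements selects the first 4 entries
  // of the 2-DWORD row (the stand-ins for sub0_sub1 .. sub6_sub7).
  const unsigned RegDWORDs = 8, EltDWORDs = 2;
  const std::vector<int16_t> &Parts = RegSplitParts[EltDWORDs - 1];
  const unsigned NumParts = RegDWORDs / EltDWORDs;
  assert(NumParts <= Parts.size());
  assert(Parts[0] == 200 && Parts[NumParts - 1] == 203);
  return 0;
}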
Index: llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -26,6 +26,7 @@
 #include "llvm/CodeGen/SlotIndexes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/LLVMContext.h"
+#include <vector>

 using namespace llvm;

@@ -53,6 +54,23 @@
   RegPressureIgnoredUnits.set(*MCRegUnitIterator(AMDGPU::M0, this));
   for (auto Reg : AMDGPU::VGPR_HI16RegClass)
     RegPressureIgnoredUnits.set(*MCRegUnitIterator(Reg, this));
+
+  RegSplitParts.resize(16); // SubReg sizes are from 1 to 16 DWORDs.
+  for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) {
+    unsigned Size = getSubRegIdxSize(Idx);
+    if (Size & 31)
+      continue;
+    std::vector<int16_t> &Vec = RegSplitParts[Size / 32 - 1];
+    unsigned Pos = getSubRegIdxOffset(Idx);
+    if (Pos % Size)
+      continue;
+    Pos /= Size;
+    if (Vec.empty()) {
+      unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits.
+      Vec.resize(MaxNumParts);
+    }
+    Vec[Pos] = Idx;
+  }
 }

 void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved,
@@ -1311,7 +1329,7 @@
 }

 const TargetRegisterClass *
-SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) {
+SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth, bool Exact) {
   switch (BitWidth) {
   case 1:
     return &AMDGPU::VReg_1RegClass;
@@ -1336,12 +1354,36 @@
   case 1024:
     return &AMDGPU::VReg_1024RegClass;
   default:
-    return nullptr;
+    if (Exact)
+      return nullptr;
   }
+
+  if (BitWidth <= 16)
+    return &AMDGPU::VGPR_LO16RegClass;
+  if (BitWidth <= 32)
+    return &AMDGPU::VGPR_32RegClass;
+  if (BitWidth <= 64)
+    return &AMDGPU::VReg_64RegClass;
+  if (BitWidth <= 96)
+    return &AMDGPU::VReg_96RegClass;
+  if (BitWidth <= 128)
+    return &AMDGPU::VReg_128RegClass;
+  if (BitWidth <= 160)
+    return &AMDGPU::VReg_160RegClass;
+  if (BitWidth <= 192)
+    return &AMDGPU::VReg_192RegClass;
+  if (BitWidth <= 256)
+    return &AMDGPU::VReg_256RegClass;
+  if (BitWidth <= 512)
+    return &AMDGPU::VReg_512RegClass;
+  if (BitWidth <= 1024)
+    return &AMDGPU::VReg_1024RegClass;
+
+  return nullptr;
 }

 const TargetRegisterClass *
-SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) {
+SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth, bool Exact) {
   switch (BitWidth) {
   case 16:
     return &AMDGPU::AGPR_LO16RegClass;
@@ -1364,12 +1406,36 @@
   case 1024:
     return &AMDGPU::AReg_1024RegClass;
   default:
-    return nullptr;
+    if (Exact)
+      return nullptr;
   }
+
+  if (BitWidth <= 16)
+    return &AMDGPU::AGPR_LO16RegClass;
+  if (BitWidth <= 32)
+    return &AMDGPU::AGPR_32RegClass;
+  if (BitWidth <= 64)
+    return &AMDGPU::AReg_64RegClass;
+  if (BitWidth <= 96)
+    return &AMDGPU::AReg_96RegClass;
+  if (BitWidth <= 128)
+    return &AMDGPU::AReg_128RegClass;
+  if (BitWidth <= 160)
+    return &AMDGPU::AReg_160RegClass;
+  if (BitWidth <= 192)
+    return &AMDGPU::AReg_192RegClass;
+  if (BitWidth <= 256)
+    return &AMDGPU::AReg_256RegClass;
+  if (BitWidth <= 512)
+    return &AMDGPU::AReg_512RegClass;
+  if (BitWidth <= 1024)
+    return &AMDGPU::AReg_1024RegClass;
+
+  return nullptr;
 }

 const TargetRegisterClass *
-SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
+SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth, bool Exact) {
   switch (BitWidth) {
   case 16:
     return &AMDGPU::SGPR_LO16RegClass;
@@ -1392,8 +1458,32 @@
   case 1024:
     return &AMDGPU::SGPR_1024RegClass;
   default:
-    return nullptr;
+    if (Exact)
+      return nullptr;
   }
+
+  if (BitWidth <= 16)
+    return &AMDGPU::SGPR_LO16RegClass;
+  if (BitWidth <= 32)
+    return &AMDGPU::SReg_32RegClass;
+  if (BitWidth <= 64)
+    return &AMDGPU::SReg_64RegClass;
+  if (BitWidth <= 96)
+    return &AMDGPU::SGPR_96RegClass;
+  if (BitWidth <= 128)
+    return &AMDGPU::SGPR_128RegClass;
+  if (BitWidth <= 160)
+    return &AMDGPU::SGPR_160RegClass;
+  if (BitWidth <= 192)
+    return &AMDGPU::SGPR_192RegClass;
+  if (BitWidth <= 256)
+    return &AMDGPU::SGPR_256RegClass;
+  if (BitWidth <= 512)
+    return &AMDGPU::SGPR_512RegClass;
+  if (BitWidth <= 1024)
+    return &AMDGPU::SGPR_1024RegClass;
+
+  return nullptr;
 }

 // FIXME: This is very slow. It might be worth creating a map from physreg to
@@ -1578,65 +1668,14 @@
   const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC->MC);
   assert(RegBitWidth >= 32 && RegBitWidth <= 1024);
-  const unsigned EltBitWidth = EltSize * 8;
-  assert(EltBitWidth >= 32 && EltBitWidth < 1024 && isPowerOf2_32(EltBitWidth));
-  const unsigned LogEltBitWidth = Log2_32(EltBitWidth);
-
-  assert(RegBitWidth % EltBitWidth == 0);
-
-  if (RegBitWidth == EltBitWidth)
-    return {};
-
-  static const int16_t Sub_32[] = {
-    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
-    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
-    AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
-    AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
-    AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19,
-    AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23,
-    AMDGPU::sub24, AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27,
-    AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31
-  };
-
-  static const int16_t Sub_64[] = {
-    AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
-    AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
-    AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
-    AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
-    AMDGPU::sub16_sub17, AMDGPU::sub18_sub19,
-    AMDGPU::sub20_sub21, AMDGPU::sub22_sub23,
-    AMDGPU::sub24_sub25, AMDGPU::sub26_sub27,
-    AMDGPU::sub28_sub29, AMDGPU::sub30_sub31
-  };
+  const unsigned RegDWORDs = RegBitWidth / 32;
+  const unsigned EltDWORDs = EltSize / 4;
+  assert(RegSplitParts.size() + 1 >= EltDWORDs);
-  static const int16_t Sub_128[] = {
-    AMDGPU::sub0_sub1_sub2_sub3,
-    AMDGPU::sub4_sub5_sub6_sub7,
-    AMDGPU::sub8_sub9_sub10_sub11,
-    AMDGPU::sub12_sub13_sub14_sub15,
-    AMDGPU::sub16_sub17_sub18_sub19,
-    AMDGPU::sub20_sub21_sub22_sub23,
-    AMDGPU::sub24_sub25_sub26_sub27,
-    AMDGPU::sub28_sub29_sub30_sub31
-  };
-
-  static const int16_t Sub_256[] = {
-    AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7,
-    AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15,
-    AMDGPU::sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23,
-    AMDGPU::sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31
-  };
-
-  static const int16_t Sub_512[] = {
-    AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15,
-    AMDGPU::sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31
-  };
-
-  static const int16_t *const Subs[] = {
-    Sub_32, Sub_64, Sub_128, Sub_256, Sub_512
-  };
+  const std::vector<int16_t> &Parts = RegSplitParts[EltDWORDs - 1];
+  const unsigned NumParts = RegDWORDs / EltDWORDs;
-  return makeArrayRef(Subs[LogEltBitWidth - 5], RegBitWidth >> LogEltBitWidth);
+  return makeArrayRef(Parts.data(), NumParts);
 }

 const TargetRegisterClass*
@@ -1733,15 +1772,15 @@
                                          const MachineRegisterInfo &MRI) const {
   switch (RB.getID()) {
   case AMDGPU::VGPRRegBankID:
-    return getVGPRClassForBitWidth(std::max(32u, Size));
+    return getVGPRClassForBitWidth(std::max(32u, Size), false);
   case AMDGPU::VCCRegBankID:
     assert(Size == 1);
     return isWave32 ?
&AMDGPU::SReg_32_XM0_XEXECRegClass : &AMDGPU::SReg_64_XEXECRegClass; case AMDGPU::SGPRRegBankID: - return getSGPRClassForBitWidth(std::max(32u, Size)); + return getSGPRClassForBitWidth(std::max(32u, Size), false); case AMDGPU::AGPRRegBankID: - return getAGPRClassForBitWidth(std::max(32u, Size)); + return getAGPRClassForBitWidth(std::max(32u, Size), false); default: llvm_unreachable("unknown register bank"); } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -1517,3 +1517,639 @@ %ext = extractelement <16 x double> , i32 %sel ret double %ext } + +define amdgpu_ps float @dyn_extract_v6f32_s_v(<6 x float> inreg %vec, i32 %sel) { +; GPRIDX-LABEL: dyn_extract_v6f32_s_v: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 s1, s3 +; GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s3, s5 +; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s5, s7 +; GPRIDX-NEXT: s_mov_b64 s[6:7], exec +; GPRIDX-NEXT: BB33_1: ; =>This Inner Loop Header: Depth=1 +; GPRIDX-NEXT: v_readfirstlane_b32 s8, v0 +; GPRIDX-NEXT: s_mov_b32 m0, s8 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s8, v0 +; GPRIDX-NEXT: s_movrels_b32 s8, s0 +; GPRIDX-NEXT: v_mov_b32_e32 v1, s8 +; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc +; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc +; GPRIDX-NEXT: s_cbranch_execnz BB33_1 +; GPRIDX-NEXT: ; %bb.2: +; GPRIDX-NEXT: s_mov_b64 exec, s[6:7] +; GPRIDX-NEXT: v_mov_b32_e32 v0, v1 +; GPRIDX-NEXT: ; return to shader part epilog +; +; MOVREL-LABEL: dyn_extract_v6f32_s_v: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_mov_b32 s0, s2 +; MOVREL-NEXT: s_mov_b32 s1, s3 +; MOVREL-NEXT: s_mov_b32 s2, s4 +; MOVREL-NEXT: s_mov_b32 s3, s5 +; MOVREL-NEXT: s_mov_b32 s4, s6 +; MOVREL-NEXT: s_mov_b32 s5, s7 +; MOVREL-NEXT: s_mov_b64 s[6:7], exec +; MOVREL-NEXT: BB33_1: ; =>This Inner Loop Header: Depth=1 +; MOVREL-NEXT: v_readfirstlane_b32 s8, v0 +; MOVREL-NEXT: s_mov_b32 m0, s8 +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s8, v0 +; MOVREL-NEXT: s_movrels_b32 s8, s0 +; MOVREL-NEXT: v_mov_b32_e32 v1, s8 +; MOVREL-NEXT: s_and_saveexec_b64 vcc, vcc +; MOVREL-NEXT: s_xor_b64 exec, exec, vcc +; MOVREL-NEXT: s_cbranch_execnz BB33_1 +; MOVREL-NEXT: ; %bb.2: +; MOVREL-NEXT: s_mov_b64 exec, s[6:7] +; MOVREL-NEXT: v_mov_b32_e32 v0, v1 +; MOVREL-NEXT: ; return to shader part epilog +entry: + %ext = extractelement <6 x float> %vec, i32 %sel + ret float %ext +} + +define float @dyn_extract_v6f32_v_v(<6 x float> %vec, i32 %sel) { +; GPRIDX-LABEL: dyn_extract_v6f32_v_v: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GPRIDX-NEXT: s_mov_b64 s[4:5], exec +; GPRIDX-NEXT: BB34_1: ; =>This Inner Loop Header: Depth=1 +; GPRIDX-NEXT: v_readfirstlane_b32 s6, v6 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s6, v6 +; GPRIDX-NEXT: s_set_gpr_idx_on s6, gpr_idx(SRC0) +; GPRIDX-NEXT: v_mov_b32_e32 v7, v0 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc +; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc +; GPRIDX-NEXT: s_cbranch_execnz BB34_1 +; GPRIDX-NEXT: ; %bb.2: +; GPRIDX-NEXT: s_mov_b64 exec, s[4:5] +; GPRIDX-NEXT: v_mov_b32_e32 v0, v7 +; GPRIDX-NEXT: s_setpc_b64 s[30:31] +; +; MOVREL-LABEL: dyn_extract_v6f32_v_v: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MOVREL-NEXT: s_mov_b64 s[4:5], exec +; MOVREL-NEXT: BB34_1: ; =>This 
Inner Loop Header: Depth=1 +; MOVREL-NEXT: v_readfirstlane_b32 s6, v6 +; MOVREL-NEXT: s_mov_b32 m0, s6 +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s6, v6 +; MOVREL-NEXT: v_movrels_b32_e32 v7, v0 +; MOVREL-NEXT: s_and_saveexec_b64 vcc, vcc +; MOVREL-NEXT: s_xor_b64 exec, exec, vcc +; MOVREL-NEXT: s_cbranch_execnz BB34_1 +; MOVREL-NEXT: ; %bb.2: +; MOVREL-NEXT: s_mov_b64 exec, s[4:5] +; MOVREL-NEXT: v_mov_b32_e32 v0, v7 +; MOVREL-NEXT: s_setpc_b64 s[30:31] +entry: + %ext = extractelement <6 x float> %vec, i32 %sel + ret float %ext +} + +define amdgpu_ps float @dyn_extract_v6f32_v_s(<6 x float> %vec, i32 inreg %sel) { +; GPRIDX-LABEL: dyn_extract_v6f32_v_s: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0) +; GPRIDX-NEXT: v_mov_b32_e32 v0, v0 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: ; return to shader part epilog +; +; MOVREL-LABEL: dyn_extract_v6f32_v_s: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_mov_b32 m0, s2 +; MOVREL-NEXT: v_movrels_b32_e32 v0, v0 +; MOVREL-NEXT: ; return to shader part epilog +entry: + %ext = extractelement <6 x float> %vec, i32 %sel + ret float %ext +} + +define amdgpu_ps float @dyn_extract_v6f32_s_s(<6 x float> inreg %vec, i32 inreg %sel) { +; GPRIDX-LABEL: dyn_extract_v6f32_s_s: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 m0, s8 +; GPRIDX-NEXT: s_mov_b32 s1, s3 +; GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s3, s5 +; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s5, s7 +; GPRIDX-NEXT: s_movrels_b32 s0, s0 +; GPRIDX-NEXT: v_mov_b32_e32 v0, s0 +; GPRIDX-NEXT: ; return to shader part epilog +; +; MOVREL-LABEL: dyn_extract_v6f32_s_s: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_mov_b32 s0, s2 +; MOVREL-NEXT: s_mov_b32 m0, s8 +; MOVREL-NEXT: s_mov_b32 s1, s3 +; MOVREL-NEXT: s_mov_b32 s2, s4 +; MOVREL-NEXT: s_mov_b32 s3, s5 +; MOVREL-NEXT: s_mov_b32 s4, s6 +; MOVREL-NEXT: s_mov_b32 s5, s7 +; MOVREL-NEXT: s_movrels_b32 s0, s0 +; MOVREL-NEXT: v_mov_b32_e32 v0, s0 +; MOVREL-NEXT: ; return to shader part epilog +entry: + %ext = extractelement <6 x float> %vec, i32 %sel + ret float %ext +} + +define amdgpu_ps float @dyn_extract_v7f32_s_v(<7 x float> inreg %vec, i32 %sel) { +; GPRIDX-LABEL: dyn_extract_v7f32_s_v: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 s1, s3 +; GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s3, s5 +; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_mov_b32 s5, s7 +; GPRIDX-NEXT: s_mov_b64 s[8:9], exec +; GPRIDX-NEXT: BB37_1: ; =>This Inner Loop Header: Depth=1 +; GPRIDX-NEXT: v_readfirstlane_b32 s7, v0 +; GPRIDX-NEXT: s_mov_b32 m0, s7 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s7, v0 +; GPRIDX-NEXT: s_movrels_b32 s7, s0 +; GPRIDX-NEXT: v_mov_b32_e32 v1, s7 +; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc +; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc +; GPRIDX-NEXT: s_cbranch_execnz BB37_1 +; GPRIDX-NEXT: ; %bb.2: +; GPRIDX-NEXT: s_mov_b64 exec, s[8:9] +; GPRIDX-NEXT: v_mov_b32_e32 v0, v1 +; GPRIDX-NEXT: ; return to shader part epilog +; +; MOVREL-LABEL: dyn_extract_v7f32_s_v: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_mov_b32 s0, s2 +; MOVREL-NEXT: s_mov_b32 s1, s3 +; MOVREL-NEXT: s_mov_b32 s2, s4 +; MOVREL-NEXT: s_mov_b32 s3, s5 +; MOVREL-NEXT: s_mov_b32 s4, s6 +; MOVREL-NEXT: s_mov_b32 s6, s8 +; MOVREL-NEXT: s_mov_b32 s5, s7 +; MOVREL-NEXT: s_mov_b64 s[8:9], exec +; MOVREL-NEXT: BB37_1: ; =>This Inner Loop Header: Depth=1 +; MOVREL-NEXT: v_readfirstlane_b32 s7, v0 +; 
MOVREL-NEXT: s_mov_b32 m0, s7 +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s7, v0 +; MOVREL-NEXT: s_movrels_b32 s7, s0 +; MOVREL-NEXT: v_mov_b32_e32 v1, s7 +; MOVREL-NEXT: s_and_saveexec_b64 vcc, vcc +; MOVREL-NEXT: s_xor_b64 exec, exec, vcc +; MOVREL-NEXT: s_cbranch_execnz BB37_1 +; MOVREL-NEXT: ; %bb.2: +; MOVREL-NEXT: s_mov_b64 exec, s[8:9] +; MOVREL-NEXT: v_mov_b32_e32 v0, v1 +; MOVREL-NEXT: ; return to shader part epilog +entry: + %ext = extractelement <7 x float> %vec, i32 %sel + ret float %ext +} + +define float @dyn_extract_v7f32_v_v(<7 x float> %vec, i32 %sel) { +; GPRIDX-LABEL: dyn_extract_v7f32_v_v: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GPRIDX-NEXT: s_mov_b64 s[4:5], exec +; GPRIDX-NEXT: BB38_1: ; =>This Inner Loop Header: Depth=1 +; GPRIDX-NEXT: v_readfirstlane_b32 s6, v7 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7 +; GPRIDX-NEXT: s_set_gpr_idx_on s6, gpr_idx(SRC0) +; GPRIDX-NEXT: v_mov_b32_e32 v8, v0 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc +; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc +; GPRIDX-NEXT: s_cbranch_execnz BB38_1 +; GPRIDX-NEXT: ; %bb.2: +; GPRIDX-NEXT: s_mov_b64 exec, s[4:5] +; GPRIDX-NEXT: v_mov_b32_e32 v0, v8 +; GPRIDX-NEXT: s_setpc_b64 s[30:31] +; +; MOVREL-LABEL: dyn_extract_v7f32_v_v: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MOVREL-NEXT: s_mov_b64 s[4:5], exec +; MOVREL-NEXT: BB38_1: ; =>This Inner Loop Header: Depth=1 +; MOVREL-NEXT: v_readfirstlane_b32 s6, v7 +; MOVREL-NEXT: s_mov_b32 m0, s6 +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7 +; MOVREL-NEXT: v_movrels_b32_e32 v8, v0 +; MOVREL-NEXT: s_and_saveexec_b64 vcc, vcc +; MOVREL-NEXT: s_xor_b64 exec, exec, vcc +; MOVREL-NEXT: s_cbranch_execnz BB38_1 +; MOVREL-NEXT: ; %bb.2: +; MOVREL-NEXT: s_mov_b64 exec, s[4:5] +; MOVREL-NEXT: v_mov_b32_e32 v0, v8 +; MOVREL-NEXT: s_setpc_b64 s[30:31] +entry: + %ext = extractelement <7 x float> %vec, i32 %sel + ret float %ext +} + +define amdgpu_ps float @dyn_extract_v7f32_v_s(<7 x float> %vec, i32 inreg %sel) { +; GPRIDX-LABEL: dyn_extract_v7f32_v_s: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0) +; GPRIDX-NEXT: v_mov_b32_e32 v0, v0 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: ; return to shader part epilog +; +; MOVREL-LABEL: dyn_extract_v7f32_v_s: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_mov_b32 m0, s2 +; MOVREL-NEXT: v_movrels_b32_e32 v0, v0 +; MOVREL-NEXT: ; return to shader part epilog +entry: + %ext = extractelement <7 x float> %vec, i32 %sel + ret float %ext +} + +define amdgpu_ps float @dyn_extract_v7f32_s_s(<7 x float> inreg %vec, i32 inreg %sel) { +; GPRIDX-LABEL: dyn_extract_v7f32_s_s: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 m0, s9 +; GPRIDX-NEXT: s_mov_b32 s1, s3 +; GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s3, s5 +; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s5, s7 +; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_movrels_b32 s0, s0 +; GPRIDX-NEXT: v_mov_b32_e32 v0, s0 +; GPRIDX-NEXT: ; return to shader part epilog +; +; MOVREL-LABEL: dyn_extract_v7f32_s_s: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_mov_b32 s0, s2 +; MOVREL-NEXT: s_mov_b32 m0, s9 +; MOVREL-NEXT: s_mov_b32 s1, s3 +; MOVREL-NEXT: s_mov_b32 s2, s4 +; MOVREL-NEXT: s_mov_b32 s3, s5 +; MOVREL-NEXT: s_mov_b32 s4, s6 +; MOVREL-NEXT: s_mov_b32 s5, s7 +; MOVREL-NEXT: s_mov_b32 s6, s8 +; MOVREL-NEXT: s_movrels_b32 s0, s0 +; MOVREL-NEXT: v_mov_b32_e32 v0, s0 +; 
MOVREL-NEXT: ; return to shader part epilog +entry: + %ext = extractelement <7 x float> %vec, i32 %sel + ret float %ext +} + +define amdgpu_ps double @dyn_extract_v6f64_s_v(<6 x double> inreg %vec, i32 %sel) { +; GPRIDX-LABEL: dyn_extract_v6f64_s_v: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_mov_b32 s16, s2 +; GPRIDX-NEXT: s_mov_b32 s17, s3 +; GPRIDX-NEXT: s_mov_b32 s18, s4 +; GPRIDX-NEXT: s_mov_b32 s19, s5 +; GPRIDX-NEXT: s_mov_b32 s20, s6 +; GPRIDX-NEXT: s_mov_b32 s21, s7 +; GPRIDX-NEXT: s_mov_b32 s22, s8 +; GPRIDX-NEXT: s_mov_b32 s23, s9 +; GPRIDX-NEXT: s_mov_b32 s24, s10 +; GPRIDX-NEXT: s_mov_b32 s25, s11 +; GPRIDX-NEXT: s_mov_b32 s26, s12 +; GPRIDX-NEXT: s_mov_b32 s27, s13 +; GPRIDX-NEXT: s_mov_b64 s[2:3], exec +; GPRIDX-NEXT: BB41_1: ; =>This Inner Loop Header: Depth=1 +; GPRIDX-NEXT: v_readfirstlane_b32 s0, v0 +; GPRIDX-NEXT: s_lshl_b32 m0, s0, 1 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 +; GPRIDX-NEXT: s_movrels_b32 s0, s16 +; GPRIDX-NEXT: s_movrels_b32 s1, s17 +; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc +; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc +; GPRIDX-NEXT: s_cbranch_execnz BB41_1 +; GPRIDX-NEXT: ; %bb.2: +; GPRIDX-NEXT: s_mov_b64 exec, s[2:3] +; GPRIDX-NEXT: ; return to shader part epilog +; +; MOVREL-LABEL: dyn_extract_v6f64_s_v: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_mov_b32 s16, s2 +; MOVREL-NEXT: s_mov_b32 s17, s3 +; MOVREL-NEXT: s_mov_b32 s18, s4 +; MOVREL-NEXT: s_mov_b32 s19, s5 +; MOVREL-NEXT: s_mov_b32 s20, s6 +; MOVREL-NEXT: s_mov_b32 s21, s7 +; MOVREL-NEXT: s_mov_b32 s22, s8 +; MOVREL-NEXT: s_mov_b32 s23, s9 +; MOVREL-NEXT: s_mov_b32 s24, s10 +; MOVREL-NEXT: s_mov_b32 s25, s11 +; MOVREL-NEXT: s_mov_b32 s26, s12 +; MOVREL-NEXT: s_mov_b32 s27, s13 +; MOVREL-NEXT: s_mov_b64 s[2:3], exec +; MOVREL-NEXT: BB41_1: ; =>This Inner Loop Header: Depth=1 +; MOVREL-NEXT: v_readfirstlane_b32 s0, v0 +; MOVREL-NEXT: s_lshl_b32 m0, s0, 1 +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 +; MOVREL-NEXT: s_movrels_b32 s0, s16 +; MOVREL-NEXT: s_movrels_b32 s1, s17 +; MOVREL-NEXT: s_and_saveexec_b64 vcc, vcc +; MOVREL-NEXT: s_xor_b64 exec, exec, vcc +; MOVREL-NEXT: s_cbranch_execnz BB41_1 +; MOVREL-NEXT: ; %bb.2: +; MOVREL-NEXT: s_mov_b64 exec, s[2:3] +; MOVREL-NEXT: ; return to shader part epilog +entry: + %ext = extractelement <6 x double> %vec, i32 %sel + ret double %ext +} + +define double @dyn_extract_v6f64_v_v(<6 x double> %vec, i32 %sel) { +; GPRIDX-LABEL: dyn_extract_v6f64_v_v: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GPRIDX-NEXT: s_mov_b64 s[4:5], exec +; GPRIDX-NEXT: BB42_1: ; =>This Inner Loop Header: Depth=1 +; GPRIDX-NEXT: v_readfirstlane_b32 s6, v12 +; GPRIDX-NEXT: s_lshl_b32 s7, s6, 1 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s6, v12 +; GPRIDX-NEXT: s_set_gpr_idx_on s7, gpr_idx(SRC0) +; GPRIDX-NEXT: v_mov_b32_e32 v13, v0 +; GPRIDX-NEXT: v_mov_b32_e32 v14, v1 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc +; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc +; GPRIDX-NEXT: s_cbranch_execnz BB42_1 +; GPRIDX-NEXT: ; %bb.2: +; GPRIDX-NEXT: s_mov_b64 exec, s[4:5] +; GPRIDX-NEXT: v_mov_b32_e32 v0, v13 +; GPRIDX-NEXT: v_mov_b32_e32 v1, v14 +; GPRIDX-NEXT: s_setpc_b64 s[30:31] +; +; MOVREL-LABEL: dyn_extract_v6f64_v_v: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MOVREL-NEXT: s_mov_b64 s[4:5], exec +; MOVREL-NEXT: BB42_1: ; =>This Inner Loop Header: Depth=1 +; MOVREL-NEXT: v_readfirstlane_b32 s6, v12 +; MOVREL-NEXT: s_lshl_b32 m0, s6, 1 +; MOVREL-NEXT: 
v_movrels_b32_e32 v13, v0 +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s6, v12 +; MOVREL-NEXT: v_movrels_b32_e32 v14, v1 +; MOVREL-NEXT: s_and_saveexec_b64 vcc, vcc +; MOVREL-NEXT: s_xor_b64 exec, exec, vcc +; MOVREL-NEXT: s_cbranch_execnz BB42_1 +; MOVREL-NEXT: ; %bb.2: +; MOVREL-NEXT: s_mov_b64 exec, s[4:5] +; MOVREL-NEXT: v_mov_b32_e32 v0, v13 +; MOVREL-NEXT: v_mov_b32_e32 v1, v14 +; MOVREL-NEXT: s_setpc_b64 s[30:31] +entry: + %ext = extractelement <6 x double> %vec, i32 %sel + ret double %ext +} + +define amdgpu_ps double @dyn_extract_v6f64_v_s(<6 x double> %vec, i32 inreg %sel) { +; GPRIDX-LABEL: dyn_extract_v6f64_v_s: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_lshl_b32 s0, s2, 1 +; GPRIDX-NEXT: s_set_gpr_idx_on s0, gpr_idx(SRC0) +; GPRIDX-NEXT: v_mov_b32_e32 v12, v0 +; GPRIDX-NEXT: v_mov_b32_e32 v0, v1 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: v_readfirstlane_b32 s0, v12 +; GPRIDX-NEXT: v_readfirstlane_b32 s1, v0 +; GPRIDX-NEXT: ; return to shader part epilog +; +; MOVREL-LABEL: dyn_extract_v6f64_v_s: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_lshl_b32 m0, s2, 1 +; MOVREL-NEXT: v_movrels_b32_e32 v12, v0 +; MOVREL-NEXT: v_movrels_b32_e32 v0, v1 +; MOVREL-NEXT: v_readfirstlane_b32 s0, v12 +; MOVREL-NEXT: v_readfirstlane_b32 s1, v0 +; MOVREL-NEXT: ; return to shader part epilog +entry: + %ext = extractelement <6 x double> %vec, i32 %sel + ret double %ext +} + +define amdgpu_ps double @dyn_extract_v6f64_s_s(<6 x double> inreg %vec, i32 inreg %sel) { +; GPRIDX-LABEL: dyn_extract_v6f64_s_s: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 s1, s3 +; GPRIDX-NEXT: s_mov_b32 m0, s14 +; GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s3, s5 +; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s5, s7 +; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_mov_b32 s7, s9 +; GPRIDX-NEXT: s_mov_b32 s8, s10 +; GPRIDX-NEXT: s_mov_b32 s9, s11 +; GPRIDX-NEXT: s_mov_b32 s10, s12 +; GPRIDX-NEXT: s_mov_b32 s11, s13 +; GPRIDX-NEXT: s_movrels_b64 s[0:1], s[0:1] +; GPRIDX-NEXT: ; return to shader part epilog +; +; MOVREL-LABEL: dyn_extract_v6f64_s_s: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_mov_b32 s0, s2 +; MOVREL-NEXT: s_mov_b32 s1, s3 +; MOVREL-NEXT: s_mov_b32 m0, s14 +; MOVREL-NEXT: s_mov_b32 s2, s4 +; MOVREL-NEXT: s_mov_b32 s3, s5 +; MOVREL-NEXT: s_mov_b32 s4, s6 +; MOVREL-NEXT: s_mov_b32 s5, s7 +; MOVREL-NEXT: s_mov_b32 s6, s8 +; MOVREL-NEXT: s_mov_b32 s7, s9 +; MOVREL-NEXT: s_mov_b32 s8, s10 +; MOVREL-NEXT: s_mov_b32 s9, s11 +; MOVREL-NEXT: s_mov_b32 s10, s12 +; MOVREL-NEXT: s_mov_b32 s11, s13 +; MOVREL-NEXT: s_movrels_b64 s[0:1], s[0:1] +; MOVREL-NEXT: ; return to shader part epilog +entry: + %ext = extractelement <6 x double> %vec, i32 %sel + ret double %ext +} + +define amdgpu_ps double @dyn_extract_v7f64_s_v(<7 x double> inreg %vec, i32 %sel) { +; GPRIDX-LABEL: dyn_extract_v7f64_s_v: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_mov_b32 s16, s2 +; GPRIDX-NEXT: s_mov_b32 s17, s3 +; GPRIDX-NEXT: s_mov_b32 s18, s4 +; GPRIDX-NEXT: s_mov_b32 s19, s5 +; GPRIDX-NEXT: s_mov_b32 s20, s6 +; GPRIDX-NEXT: s_mov_b32 s21, s7 +; GPRIDX-NEXT: s_mov_b32 s22, s8 +; GPRIDX-NEXT: s_mov_b32 s23, s9 +; GPRIDX-NEXT: s_mov_b32 s24, s10 +; GPRIDX-NEXT: s_mov_b32 s25, s11 +; GPRIDX-NEXT: s_mov_b32 s26, s12 +; GPRIDX-NEXT: s_mov_b32 s27, s13 +; GPRIDX-NEXT: s_mov_b32 s28, s14 +; GPRIDX-NEXT: s_mov_b32 s29, s15 +; GPRIDX-NEXT: s_mov_b64 s[2:3], exec +; GPRIDX-NEXT: BB45_1: ; =>This Inner Loop Header: Depth=1 +; GPRIDX-NEXT: v_readfirstlane_b32 s0, v0 +; 
GPRIDX-NEXT: s_lshl_b32 m0, s0, 1 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 +; GPRIDX-NEXT: s_movrels_b32 s0, s16 +; GPRIDX-NEXT: s_movrels_b32 s1, s17 +; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc +; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc +; GPRIDX-NEXT: s_cbranch_execnz BB45_1 +; GPRIDX-NEXT: ; %bb.2: +; GPRIDX-NEXT: s_mov_b64 exec, s[2:3] +; GPRIDX-NEXT: ; return to shader part epilog +; +; MOVREL-LABEL: dyn_extract_v7f64_s_v: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_mov_b32 s16, s2 +; MOVREL-NEXT: s_mov_b32 s17, s3 +; MOVREL-NEXT: s_mov_b32 s18, s4 +; MOVREL-NEXT: s_mov_b32 s19, s5 +; MOVREL-NEXT: s_mov_b32 s20, s6 +; MOVREL-NEXT: s_mov_b32 s21, s7 +; MOVREL-NEXT: s_mov_b32 s22, s8 +; MOVREL-NEXT: s_mov_b32 s23, s9 +; MOVREL-NEXT: s_mov_b32 s24, s10 +; MOVREL-NEXT: s_mov_b32 s25, s11 +; MOVREL-NEXT: s_mov_b32 s26, s12 +; MOVREL-NEXT: s_mov_b32 s27, s13 +; MOVREL-NEXT: s_mov_b32 s28, s14 +; MOVREL-NEXT: s_mov_b32 s29, s15 +; MOVREL-NEXT: s_mov_b64 s[2:3], exec +; MOVREL-NEXT: BB45_1: ; =>This Inner Loop Header: Depth=1 +; MOVREL-NEXT: v_readfirstlane_b32 s0, v0 +; MOVREL-NEXT: s_lshl_b32 m0, s0, 1 +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 +; MOVREL-NEXT: s_movrels_b32 s0, s16 +; MOVREL-NEXT: s_movrels_b32 s1, s17 +; MOVREL-NEXT: s_and_saveexec_b64 vcc, vcc +; MOVREL-NEXT: s_xor_b64 exec, exec, vcc +; MOVREL-NEXT: s_cbranch_execnz BB45_1 +; MOVREL-NEXT: ; %bb.2: +; MOVREL-NEXT: s_mov_b64 exec, s[2:3] +; MOVREL-NEXT: ; return to shader part epilog +entry: + %ext = extractelement <7 x double> %vec, i32 %sel + ret double %ext +} + +define double @dyn_extract_v7f64_v_v(<7 x double> %vec, i32 %sel) { +; GPRIDX-LABEL: dyn_extract_v7f64_v_v: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GPRIDX-NEXT: s_mov_b64 s[4:5], exec +; GPRIDX-NEXT: BB46_1: ; =>This Inner Loop Header: Depth=1 +; GPRIDX-NEXT: v_readfirstlane_b32 s6, v14 +; GPRIDX-NEXT: s_lshl_b32 s7, s6, 1 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s6, v14 +; GPRIDX-NEXT: s_set_gpr_idx_on s7, gpr_idx(SRC0) +; GPRIDX-NEXT: v_mov_b32_e32 v15, v0 +; GPRIDX-NEXT: v_mov_b32_e32 v16, v1 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc +; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc +; GPRIDX-NEXT: s_cbranch_execnz BB46_1 +; GPRIDX-NEXT: ; %bb.2: +; GPRIDX-NEXT: s_mov_b64 exec, s[4:5] +; GPRIDX-NEXT: v_mov_b32_e32 v0, v15 +; GPRIDX-NEXT: v_mov_b32_e32 v1, v16 +; GPRIDX-NEXT: s_setpc_b64 s[30:31] +; +; MOVREL-LABEL: dyn_extract_v7f64_v_v: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MOVREL-NEXT: s_mov_b64 s[4:5], exec +; MOVREL-NEXT: BB46_1: ; =>This Inner Loop Header: Depth=1 +; MOVREL-NEXT: v_readfirstlane_b32 s6, v14 +; MOVREL-NEXT: s_lshl_b32 m0, s6, 1 +; MOVREL-NEXT: v_movrels_b32_e32 v15, v0 +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s6, v14 +; MOVREL-NEXT: v_movrels_b32_e32 v16, v1 +; MOVREL-NEXT: s_and_saveexec_b64 vcc, vcc +; MOVREL-NEXT: s_xor_b64 exec, exec, vcc +; MOVREL-NEXT: s_cbranch_execnz BB46_1 +; MOVREL-NEXT: ; %bb.2: +; MOVREL-NEXT: s_mov_b64 exec, s[4:5] +; MOVREL-NEXT: v_mov_b32_e32 v0, v15 +; MOVREL-NEXT: v_mov_b32_e32 v1, v16 +; MOVREL-NEXT: s_setpc_b64 s[30:31] +entry: + %ext = extractelement <7 x double> %vec, i32 %sel + ret double %ext +} + +define amdgpu_ps double @dyn_extract_v7f64_v_s(<7 x double> %vec, i32 inreg %sel) { +; GPRIDX-LABEL: dyn_extract_v7f64_v_s: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_lshl_b32 s0, s2, 1 +; GPRIDX-NEXT: s_set_gpr_idx_on s0, gpr_idx(SRC0) +; GPRIDX-NEXT: v_mov_b32_e32 v14, v0 
+; GPRIDX-NEXT: v_mov_b32_e32 v0, v1 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: v_readfirstlane_b32 s0, v14 +; GPRIDX-NEXT: v_readfirstlane_b32 s1, v0 +; GPRIDX-NEXT: ; return to shader part epilog +; +; MOVREL-LABEL: dyn_extract_v7f64_v_s: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_lshl_b32 m0, s2, 1 +; MOVREL-NEXT: v_movrels_b32_e32 v14, v0 +; MOVREL-NEXT: v_movrels_b32_e32 v0, v1 +; MOVREL-NEXT: v_readfirstlane_b32 s0, v14 +; MOVREL-NEXT: v_readfirstlane_b32 s1, v0 +; MOVREL-NEXT: ; return to shader part epilog +entry: + %ext = extractelement <7 x double> %vec, i32 %sel + ret double %ext +} + +define amdgpu_ps double @dyn_extract_v7f64_s_s(<7 x double> inreg %vec, i32 inreg %sel) { +; GPRIDX-LABEL: dyn_extract_v7f64_s_s: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 s1, s3 +; GPRIDX-NEXT: s_mov_b32 m0, s16 +; GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s3, s5 +; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s5, s7 +; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_mov_b32 s7, s9 +; GPRIDX-NEXT: s_mov_b32 s8, s10 +; GPRIDX-NEXT: s_mov_b32 s9, s11 +; GPRIDX-NEXT: s_mov_b32 s10, s12 +; GPRIDX-NEXT: s_mov_b32 s11, s13 +; GPRIDX-NEXT: s_mov_b32 s12, s14 +; GPRIDX-NEXT: s_mov_b32 s13, s15 +; GPRIDX-NEXT: s_movrels_b64 s[0:1], s[0:1] +; GPRIDX-NEXT: ; return to shader part epilog +; +; MOVREL-LABEL: dyn_extract_v7f64_s_s: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_mov_b32 s0, s2 +; MOVREL-NEXT: s_mov_b32 s1, s3 +; MOVREL-NEXT: s_mov_b32 m0, s16 +; MOVREL-NEXT: s_mov_b32 s2, s4 +; MOVREL-NEXT: s_mov_b32 s3, s5 +; MOVREL-NEXT: s_mov_b32 s4, s6 +; MOVREL-NEXT: s_mov_b32 s5, s7 +; MOVREL-NEXT: s_mov_b32 s6, s8 +; MOVREL-NEXT: s_mov_b32 s7, s9 +; MOVREL-NEXT: s_mov_b32 s8, s10 +; MOVREL-NEXT: s_mov_b32 s9, s11 +; MOVREL-NEXT: s_mov_b32 s10, s12 +; MOVREL-NEXT: s_mov_b32 s11, s13 +; MOVREL-NEXT: s_mov_b32 s12, s14 +; MOVREL-NEXT: s_mov_b32 s13, s15 +; MOVREL-NEXT: s_movrels_b64 s[0:1], s[0:1] +; MOVREL-NEXT: ; return to shader part epilog +entry: + %ext = extractelement <7 x double> %vec, i32 %sel + ret double %ext +} Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-concat-vectors.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-concat-vectors.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-concat-vectors.mir @@ -1,10 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=instruction-select -global-isel-abort=2 -pass-remarks-missed='gisel*' -o - %s 2> %t | FileCheck -check-prefix=GCN %s -# RUN: FileCheck -check-prefix=ERR %s < %t +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=instruction-select -global-isel-abort=2 -pass-remarks-missed='gisel*' -o - %s | FileCheck -check-prefix=GCN %s -# ERR-NOT: remark: -# ERR: remark: :0:0: cannot select: %2:sgpr(<6 x s64>) = G_CONCAT_VECTORS %0:sgpr(<3 x s64>), %1:sgpr(<3 x s64>) (in function: test_concat_vectors_s_v6s64_s_v3s64_s_v3s64) -# ERR-NOT: remark: --- name: test_concat_vectors_v_v4s16_v_v2s16_v_v2s16 @@ -634,10 +630,14 @@ liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GCN-LABEL: name: test_concat_vectors_s_v6s64_s_v3s64_s_v3s64 - ; GCN: [[DEF:%[0-9]+]]:sgpr(<3 x s64>) = G_IMPLICIT_DEF - ; GCN: [[DEF1:%[0-9]+]]:sgpr(<3 x s64>) = G_IMPLICIT_DEF - ; GCN: [[CONCAT_VECTORS:%[0-9]+]]:sgpr(<6 x s64>) = G_CONCAT_VECTORS 
[[DEF]](<3 x s64>), [[DEF1]](<3 x s64>) - ; GCN: S_ENDPGM 0, implicit [[CONCAT_VECTORS]](<6 x s64>) + ; GCN: [[DEF:%[0-9]+]]:sgpr_192 = IMPLICIT_DEF + ; GCN: [[DEF1:%[0-9]+]]:sgpr_192 = IMPLICIT_DEF + ; GCN: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GCN: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GCN: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GCN: [[DEF5:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GCN: [[REG_SEQUENCE:%[0-9]+]]:sgpr_512 = REG_SEQUENCE [[DEF]], %subreg.sub0_sub1_sub2_sub3_sub4_sub5, [[DEF1]], %subreg.sub6_sub7_sub8_sub9_sub10_sub11, undef [[DEF2]], %subreg.sub12, undef [[DEF3]], %subreg.sub13, undef [[DEF4]], %subreg.sub14, undef [[DEF5]], %subreg.sub15 + ; GCN: S_ENDPGM 0, implicit [[REG_SEQUENCE]] %0:sgpr(<3 x s64>) = G_IMPLICIT_DEF %1:sgpr(<3 x s64>) = G_IMPLICIT_DEF %2:sgpr(<6 x s64>) = G_CONCAT_VECTORS %0, %1
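A short note on the updated .mir expectation above, with the arithmetic spelled out. The sketch is illustrative only and not part of the patch: it just restates why a <6 x s64> concat of two <3 x s64> operands becomes an sgpr_512 REG_SEQUENCE padded with four undef 32-bit parts (sub12..sub15).

// Illustrative arithmetic behind the new REG_SEQUENCE expectation (not part
// of the patch).
#include <cassert>

int main() {
  const unsigned SrcBits = 2 * 3 * 64; // two <3 x s64> operands = 384 bits
  // With Exact == false, getSGPRClassForBitWidth rounds 384 up to the next
  // available tuple size; there is no 384-bit class, so it picks 512 bits.
  const unsigned DstBits = 512;
  const unsigned UncoveredBits = DstBits - SrcBits;
  // The selector fills the uncovered lanes with 32-bit IMPLICIT_DEFs,
  // which is where the four undef sub12..sub15 operands come from.
  assert(UncoveredBits / 32 == 4);
  return 0;
}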