Index: llvm/lib/Target/AMDGPU/AMDGPUCombine.td
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -37,6 +37,12 @@
     [{ return matchCvtF32UByteN(*${cvt_f32_ubyteN}, MRI, *MF, ${matchinfo}); }]),
   (apply [{ applyCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }])>;
 
+def extract_elt_to_cmp_select : GICombineRule<
+  (defs root:$extract_elt),
+  (match (wip_match_opcode G_EXTRACT_VECTOR_ELT):$extract_elt,
+    [{ return matchExtractElt(*${extract_elt}, MRI, *MF); }]),
+  (apply [{ applyExtractEltToCmpSelect(*${extract_elt}, MRI); }])>;
+
 // Combines which should only apply on SI/VI
 def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
 
@@ -52,8 +58,8 @@
 // aren't re-legalized.
 // FIXME: Is there a way to remove a single item from all_combines?
 def all_combines_minus_extload : GICombineGroup<[trivial_combines,
-  ptr_add_immed_chain, combine_indexed_load_store, undef_combines,
-  identity_combines]
+  ptr_add_immed_chain, combine_indexed_load_store, extract_elt_to_cmp_select,
+  undef_combines, identity_combines]
 >;
 
 def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
Index: llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -219,6 +219,51 @@
   MI.eraseFromParent();
 }
 
+static bool matchExtractElt(MachineInstr &MI, MachineRegisterInfo &MRI,
+                            MachineFunction &MF) {
+  Register VecReg = MI.getOperand(1).getReg();
+  Register Idx = MI.getOperand(2).getReg();
+
+  if (getOpcodeDef(TargetOpcode::G_CONSTANT, Idx, MRI))
+    return false;
+
+  // FIXME: We need another combiner post RegBankSelect. Then move this combine
+  // there and actually query RegBank of the Idx.
+  bool IsDivergent = false;
+
+  LLT VecTy = MRI.getType(VecReg);
+  unsigned EltSize = VecTy.getScalarSizeInBits();
+  unsigned NumElem = VecTy.getNumElements();
+
+  return SIInstrInfo::shouldExpandVectorDynExt(EltSize, NumElem, IsDivergent);
+}
+
+static void applyExtractEltToCmpSelect(MachineInstr &MI,
+                                       MachineRegisterInfo &MRI) {
+  MachineIRBuilder B(MI);
+
+  Register VecReg = MI.getOperand(1).getReg();
+  Register Idx = MI.getOperand(2).getReg();
+
+  LLT VecTy = MRI.getType(VecReg);
+  LLT EltTy = VecTy.getScalarType();
+  unsigned NumElem = VecTy.getNumElements();
+
+  auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
+  Register Res = UnmergeToEltTy.getReg(0);
+
+  for (unsigned I = 1; I < NumElem; ++I) {
+    auto IC = B.buildConstant(LLT::scalar(32), I);
+    auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1), Idx, IC);
+    Register Sel = (I == NumElem - 1) ? MI.getOperand(0).getReg()
+                                      : MRI.createGenericVirtualRegister(EltTy);
+    B.buildSelect(Sel, Cmp, UnmergeToEltTy.getReg(I), Res);
+    Res = Sel;
+  }
+
+  MI.eraseFromParent();
+}
+
 #define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
 #include "AMDGPUGenPostLegalizeGICombiner.inc"
 #undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
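The chain built by applyExtractEltToCmpSelect above behaves like the following scalar sketch (illustrative only, not part of the patch; the function and parameter names are placeholders): result 0 of the G_UNMERGE_VALUES seeds the chain, and each later element overrides it when the index matches, so an out-of-range index falls back to element 0.

  // Scalar analogue of the emitted G_ICMP/G_SELECT chain (hypothetical helper).
  float extractDynamic(const float *Vec, unsigned N, unsigned Idx) {
    float Res = Vec[0];                // unmerge result 0 is the default
    for (unsigned I = 1; I < N; ++I)   // one compare + one select per element
      Res = (Idx == I) ? Vec[I] : Res;
    return Res;
  }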
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -99,12 +99,6 @@
   "amdgpu-reserve-vgpr-for-sgpr-spill",
   cl::desc("Allocates one VGPR for future SGPR Spill"), cl::init(true));
 
-static cl::opt<bool> UseDivergentRegisterIndexing(
-  "amdgpu-use-divergent-register-indexing",
-  cl::Hidden,
-  cl::desc("Use indirect register addressing for divergent indexes"),
-  cl::init(false));
-
 static bool hasFP32Denormals(const MachineFunction &MF) {
   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
   return Info->getMode().allFP32Denormals();
@@ -9545,33 +9539,17 @@
 // expanded into a set of cmp/select instructions.
 static bool shouldExpandVectorDynExt(SDNode *N) {
   SDValue Idx = N->getOperand(N->getNumOperands() - 1);
-  if (UseDivergentRegisterIndexing || isa<ConstantSDNode>(Idx))
+  if (isa<ConstantSDNode>(Idx))
     return false;
 
   SDValue Vec = N->getOperand(0);
   EVT VecVT = Vec.getValueType();
   EVT EltVT = VecVT.getVectorElementType();
-  unsigned VecSize = VecVT.getSizeInBits();
   unsigned EltSize = EltVT.getSizeInBits();
   unsigned NumElem = VecVT.getVectorNumElements();
 
-  // Sub-dword vectors of size 2 dword or less have better implementation.
-  if (VecSize <= 64 && EltSize < 32)
-    return false;
-
-  // Always expand the rest of sub-dword instructions, otherwise it will be
-  // lowered via memory.
-  if (EltSize < 32)
-    return true;
-
-  // Always do this if var-idx is divergent, otherwise it will become a loop.
-  if (Idx->isDivergent())
-    return true;
-
-  // Large vectors would yield too many compares and v_cndmask_b32 instructions.
-  unsigned NumInsts = NumElem /* Number of compares */ +
-                      ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
-  return NumInsts <= 16;
+  return SIInstrInfo::shouldExpandVectorDynExt(EltSize, NumElem,
+                                               Idx->isDivergent());
 }
 
 SDValue SITargetLowering::performExtractVectorEltCombine(
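The shared helper introduced below keeps the cost model that used to be inlined here: expansion is forced for divergent indexes (which would otherwise become a waterfall loop) and for sub-dword elements (which would otherwise be lowered via memory), and is otherwise allowed only while the compare/cndmask sequence fits a 16-instruction budget. A worked example of that budget, as a minimal sketch (the numInsts helper is ours, not the patch's):

  // numInsts(32, 8) == 8 + 8  == 16 -> a <8 x float> extract expands.
  // numInsts(64, 8) == 8 + 16 == 24 -> a <8 x double> extract does not.
  unsigned numInsts(unsigned EltSize, unsigned NumElem) {
    return NumElem /* compares */ + ((EltSize + 31) / 32) * NumElem /* cndmasks */;
  }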
Index: llvm/lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1047,6 +1047,11 @@
   unsigned getInstrLatency(const InstrItineraryData *ItinData,
                            const MachineInstr &MI,
                            unsigned *PredCost = nullptr) const override;
+
+  // Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
+  // expanded into a set of cmp/select instructions.
+  static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem,
+                                       bool IsDivergentIdx);
 };
 
 /// \brief Returns true if a reg:subreg pair P has a TRC class
Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -89,6 +89,12 @@
   cl::init(true),
   cl::ReallyHidden);
 
+static cl::opt<bool> UseDivergentRegisterIndexing(
+  "amdgpu-use-divergent-register-indexing",
+  cl::Hidden,
+  cl::desc("Use indirect register addressing for divergent indexes"),
+  cl::init(false));
+
 SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
   : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
     RI(ST), ST(ST) {
@@ -6957,3 +6963,29 @@
 
   return SchedModel.computeInstrLatency(&MI);
 }
+
+bool SIInstrInfo::shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem,
+                                           bool IsDivergentIdx) {
+  if (UseDivergentRegisterIndexing)
+    return false;
+
+  unsigned VecSize = EltSize * NumElem;
+
+  // Sub-dword vectors of size 2 dword or less have better implementation.
+  if (VecSize <= 64 && EltSize < 32)
+    return false;
+
+  // Always expand the rest of sub-dword instructions, otherwise it will be
+  // lowered via memory.
+  if (EltSize < 32)
+    return true;
+
+  // Always do this if var-idx is divergent, otherwise it will become a loop.
+  if (IsDivergentIdx)
+    return true;
+
+  // Large vectors would yield too many compares and v_cndmask_b32 instructions.
+  unsigned NumInsts = NumElem /* Number of compares */ +
+                      ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
+  return NumInsts <= 16;
+}
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -6,27 +6,25 @@
 ; GCN-LABEL: dyn_extract_v8f32_const_s_v:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s11, 0x41000000
-; GCN-NEXT:    s_mov_b32 s10, 0x40e00000
-; GCN-NEXT:    s_mov_b32 s9, 0x40c00000
-; GCN-NEXT:    s_mov_b32 s8, 0x40a00000
-; GCN-NEXT:    s_mov_b32 s7, 4.0
-; GCN-NEXT:    s_mov_b32 s6, 0x40400000
-; GCN-NEXT:    s_mov_b32 s5, 2.0
-; GCN-NEXT:    s_mov_b32 s4, 1.0
-; GCN-NEXT:    s_mov_b64 s[12:13], exec
-; GCN-NEXT:  BB0_1: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    v_readfirstlane_b32 s14, v0
-; GCN-NEXT:    s_mov_b32 m0, s14
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s14, v0
-; GCN-NEXT:    s_movrels_b32 s14, s4
-; GCN-NEXT:    v_mov_b32_e32 v1, s14
-; GCN-NEXT:    s_and_saveexec_b64 vcc, vcc
-; GCN-NEXT:    s_xor_b64 exec, exec, vcc
-; GCN-NEXT:    s_cbranch_execnz BB0_1
-; GCN-NEXT:  ; %bb.2:
-; GCN-NEXT:    s_mov_b64 exec, s[12:13]
-; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v1, 1.0, 2.0, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x40400000
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, 4.0, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x40a00000
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x40c00000
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x40e00000
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x41000000
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ext = extractelement <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, i32 %sel
@@ -36,16 +34,41 @@
 define amdgpu_ps float @dyn_extract_v8f32_const_s_s(i32 inreg %sel) {
 ; GCN-LABEL: dyn_extract_v8f32_const_s_s:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_mov_b32 s4, 1.0
-; GCN-NEXT:    s_mov_b32 m0, s2
-; GCN-NEXT:    s_mov_b32 s11, 0x41000000
-; GCN-NEXT:    s_mov_b32 s10, 0x40e00000
-; GCN-NEXT:    s_mov_b32 s9, 0x40c00000
-; GCN-NEXT:    s_mov_b32 s8, 0x40a00000
-; GCN-NEXT:    s_mov_b32 s7, 4.0
-; GCN-NEXT:    s_mov_b32 s6, 0x40400000
-; GCN-NEXT:    s_mov_b32 s5, 2.0
-; GCN-NEXT:    s_movrels_b32 s0, s4
+; GCN-NEXT:    s_cmp_eq_u32 s2, 1
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    s_and_b32 s0, s0, 1
+; GCN-NEXT:    s_cmp_lg_u32 s0, 0
+; GCN-NEXT:    s_cselect_b32 s0, 2.0, 1.0
+; GCN-NEXT:    s_cmp_eq_u32 s2, 2
+; GCN-NEXT:    s_cselect_b32 s1, 1, 0
+; GCN-NEXT:    s_and_b32 s1, s1, 1
+; GCN-NEXT:    s_cmp_lg_u32 s1, 0
+; GCN-NEXT:    s_cselect_b32 s0, 0x40400000, s0
+; GCN-NEXT:    s_cmp_eq_u32 s2, 3
+; GCN-NEXT:    s_cselect_b32 s1, 1, 0
+; GCN-NEXT:    s_and_b32 s1, s1, 1
+; GCN-NEXT:    s_cmp_lg_u32 s1, 0
+; GCN-NEXT:    s_cselect_b32 s0, 4.0, s0
+; GCN-NEXT:    s_cmp_eq_u32 s2, 4
+; GCN-NEXT:    s_cselect_b32 s1, 1, 0
+; GCN-NEXT:    s_and_b32 s1, s1, 1
+; GCN-NEXT:    s_cmp_lg_u32 s1, 0
+; GCN-NEXT:    s_cselect_b32 s0, 0x40a00000, s0
+; GCN-NEXT:    s_cmp_eq_u32 s2, 5
+; GCN-NEXT:    s_cselect_b32 s1, 1, 0
+; GCN-NEXT:    s_and_b32 s1, s1, 1
+; GCN-NEXT:    s_cmp_lg_u32 s1, 0
+; GCN-NEXT:    s_cselect_b32 s0, 0x40c00000, s0
+; GCN-NEXT:    s_cmp_eq_u32 s2, 6
+; GCN-NEXT:    s_cselect_b32 s1, 1, 0
+; GCN-NEXT:    s_and_b32 s1, s1, 1
+; GCN-NEXT:    s_cmp_lg_u32 s1, 0
+; GCN-NEXT:    s_cselect_b32 s0, 0x40e00000, s0
+; GCN-NEXT:    s_cmp_eq_u32 s2, 7
+; GCN-NEXT:    s_cselect_b32 s1, 1, 0
+; GCN-NEXT:    s_and_b32 s1, s1, 1
+; GCN-NEXT:    s_cmp_lg_u32 s1, 0
+; GCN-NEXT:    s_cselect_b32 s0, 0x41000000, s0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    ; return to shader part epilog
 entry:
@@ -56,27 +79,28 @@
 define amdgpu_ps float @dyn_extract_v8f32_s_v(<8 x float> inreg %vec, i32 %sel) {
 ; GCN-LABEL: dyn_extract_v8f32_s_v:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_mov_b32 s0, s2
-; GCN-NEXT:    s_mov_b32 s1, s3
-; GCN-NEXT:    s_mov_b32 s2, s4
-; GCN-NEXT:    s_mov_b32 s3, s5
-; GCN-NEXT:    s_mov_b32 s4, s6
-; GCN-NEXT:    s_mov_b32 s5, s7
-; GCN-NEXT:    s_mov_b32 s6, s8
-; GCN-NEXT:    s_mov_b32 s7, s9
-; GCN-NEXT:    s_mov_b64 s[8:9], exec
-; GCN-NEXT:  BB2_1: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    v_readfirstlane_b32 s10, v0
-; GCN-NEXT:    s_mov_b32 m0, s10
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s10, v0
-; GCN-NEXT:    s_movrels_b32 s10, s0
-; GCN-NEXT:    v_mov_b32_e32 v1, s10
-; GCN-NEXT:    s_and_saveexec_b64 vcc, vcc
-; GCN-NEXT:    s_xor_b64 exec, exec, vcc
-; GCN-NEXT:    s_cbranch_execnz BB2_1
-; GCN-NEXT:  ; %bb.2:
-; GCN-NEXT:    s_mov_b64 exec, s[8:9]
-; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, s5
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, s6
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, s7
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, s8
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, s9
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GCN-NEXT:    ; return to shader part epilog
 entry:
   %ext = extractelement <8 x float> %vec, i32 %sel
@@ -84,58 +108,68 @@
 }
 
 define float @dyn_extract_v8f32_v_v(<8 x float> %vec, i32 %sel) {
-; GPRIDX-LABEL: dyn_extract_v8f32_v_v:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GPRIDX-NEXT:    s_mov_b64 s[4:5], exec
-; GPRIDX-NEXT:  BB3_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT:    v_readfirstlane_b32 s6, v8
-; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v8
-; GPRIDX-NEXT:    s_set_gpr_idx_on s6, gpr_idx(SRC0)
-; GPRIDX-NEXT:    v_mov_b32_e32 v9, v0
-; GPRIDX-NEXT:    s_set_gpr_idx_off
-; GPRIDX-NEXT:    s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT:    s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT:    s_cbranch_execnz BB3_1
-; GPRIDX-NEXT:  ; %bb.2:
-; GPRIDX-NEXT:    s_mov_b64 exec, s[4:5]
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, v9
-; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
-;
-; MOVREL-LABEL: dyn_extract_v8f32_v_v:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MOVREL-NEXT:    s_mov_b64 s[4:5], exec
-; MOVREL-NEXT:  BB3_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT:    v_readfirstlane_b32 s6, v8
-; MOVREL-NEXT:    s_mov_b32 m0, s6
-; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v8
-; MOVREL-NEXT:    v_movrels_b32_e32 v9, v0
-; MOVREL-NEXT:    s_and_saveexec_b64 vcc, vcc
-; MOVREL-NEXT:    s_xor_b64 exec, exec, vcc
-; MOVREL-NEXT:    s_cbranch_execnz BB3_1
-; MOVREL-NEXT:  ; %bb.2:
-; MOVREL-NEXT:    s_mov_b64 exec, s[4:5]
-; MOVREL-NEXT:    v_mov_b32_e32 v0, v9
-; MOVREL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: dyn_extract_v8f32_v_v:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v8
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v8
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v8
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v8
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v8
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v8
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ext = extractelement <8 x float> %vec, i32 %sel
   ret float %ext
 }
 
 define amdgpu_ps float @dyn_extract_v8f32_v_s(<8 x float> %vec, i32 inreg %sel) {
-; GPRIDX-LABEL: dyn_extract_v8f32_v_s:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_set_gpr_idx_on s2, gpr_idx(SRC0)
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, v0
-; GPRIDX-NEXT:    s_set_gpr_idx_off
-; GPRIDX-NEXT:    ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v8f32_v_s:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_mov_b32 m0, s2
-; MOVREL-NEXT:    v_movrels_b32_e32 v0, v0
-; MOVREL-NEXT:    ; return to shader part epilog
+; GCN-LABEL: dyn_extract_v8f32_v_s:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_cmp_eq_u32 s2, 1
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    s_and_b32 s0, 1, s0
+; GCN-NEXT:    s_cmp_eq_u32 s2, 2
+; GCN-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    s_and_b32 s0, 1, s0
+; GCN-NEXT:    s_cmp_eq_u32 s2, 3
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    s_and_b32 s0, 1, s0
+; GCN-NEXT:    s_cmp_eq_u32 s2, 4
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    s_and_b32 s0, 1, s0
+; GCN-NEXT:    s_cmp_eq_u32 s2, 5
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    s_and_b32 s0, 1, s0
+; GCN-NEXT:    s_cmp_eq_u32 s2, 6
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    s_and_b32 s0, 1, s0
+; GCN-NEXT:    s_cmp_eq_u32 s2, 7
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    s_and_b32 s0, 1, s0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
+; GCN-NEXT:    ; return to shader part epilog
 entry:
   %ext = extractelement <8 x float> %vec, i32 %sel
   ret float %ext
@@ -144,16 +178,41 @@
 define amdgpu_ps float @dyn_extract_v8f32_s_s(<8 x float> inreg %vec, i32 inreg %sel) {
 ; GCN-LABEL: dyn_extract_v8f32_s_s:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_mov_b32 s0, s2
-; GCN-NEXT:    s_mov_b32 m0, s10
-; GCN-NEXT:    s_mov_b32 s1, s3
-; GCN-NEXT:    s_mov_b32 s2, s4
-; GCN-NEXT:    s_mov_b32 s3, s5
-; GCN-NEXT:    s_mov_b32 s4, s6
-; GCN-NEXT:    s_mov_b32 s5, s7
-; GCN-NEXT:    s_mov_b32 s6, s8
-; GCN-NEXT:    s_mov_b32 s7, s9
-; GCN-NEXT:    s_movrels_b32 s0, s0
+; GCN-NEXT:    s_cmp_eq_u32 s10, 1
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    s_and_b32 s0, s0, 1
+; GCN-NEXT:    s_cmp_lg_u32 s0, 0
+; GCN-NEXT:    s_cselect_b32 s0, s3, s2
+; GCN-NEXT:    s_cmp_eq_u32 s10, 2
+; GCN-NEXT:    s_cselect_b32 s1, 1, 0
+; GCN-NEXT:    s_and_b32 s1, s1, 1
+; GCN-NEXT:    s_cmp_lg_u32 s1, 0
+; GCN-NEXT:    s_cselect_b32 s0, s4, s0
+; GCN-NEXT:    s_cmp_eq_u32 s10, 3
+; GCN-NEXT:    s_cselect_b32 s1, 1, 0
+; GCN-NEXT:    s_and_b32 s1, s1, 1
+; GCN-NEXT:    s_cmp_lg_u32 s1, 0
+; GCN-NEXT:    s_cselect_b32 s0, s5, s0
+; GCN-NEXT:    s_cmp_eq_u32 s10, 4
+; GCN-NEXT:    s_cselect_b32 s1, 1, 0
+; GCN-NEXT:    s_and_b32 s1, s1, 1
+; GCN-NEXT:    s_cmp_lg_u32 s1, 0
+; GCN-NEXT:    s_cselect_b32 s0, s6, s0
+; GCN-NEXT:    s_cmp_eq_u32 s10, 5
+; GCN-NEXT:    s_cselect_b32 s1, 1, 0
+; GCN-NEXT:    s_and_b32 s1, s1, 1
+; GCN-NEXT:    s_cmp_lg_u32 s1, 0
+; GCN-NEXT:    s_cselect_b32 s0, s7, s0
+; GCN-NEXT:    s_cmp_eq_u32 s10, 6
+; GCN-NEXT:    s_cselect_b32 s1, 1, 0
+; GCN-NEXT:    s_and_b32 s1, s1, 1
+; GCN-NEXT:    s_cmp_lg_u32 s1, 0
+; GCN-NEXT:    s_cselect_b32 s0, s8, s0
+; GCN-NEXT:    s_cmp_eq_u32 s10, 7
+; GCN-NEXT:    s_cselect_b32 s1, 1, 0
+; GCN-NEXT:    s_and_b32 s1, s1, 1
+; GCN-NEXT:    s_cmp_lg_u32 s1, 0
+; GCN-NEXT:    s_cselect_b32 s0, s9, s0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    ; return to shader part epilog
 entry:
@@ -437,16 +496,42 @@
 define amdgpu_ps float @dyn_extract_v8f32_s_s_offset3(<8 x float> inreg %vec, i32 inreg %sel) {
 ; GCN-LABEL: dyn_extract_v8f32_s_s_offset3:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_mov_b32 s0, s2
-; GCN-NEXT:    s_mov_b32 s1, s3
-; GCN-NEXT:    s_mov_b32 s3, s5
-; GCN-NEXT:    s_mov_b32 m0, s10
-; GCN-NEXT:    s_mov_b32 s2, s4
-; GCN-NEXT:    s_mov_b32 s4, s6
-; GCN-NEXT:    s_mov_b32 s5, s7
-; GCN-NEXT:    s_mov_b32 s6, s8
-; GCN-NEXT:    s_mov_b32 s7, s9
-; GCN-NEXT:    s_movrels_b32 s0, s3
+; GCN-NEXT:    s_add_i32 s10, s10, 3
+; GCN-NEXT:    s_cmp_eq_u32 s10, 1
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    s_and_b32 s0, s0, 1
+; GCN-NEXT:    s_cmp_lg_u32 s0, 0
+; GCN-NEXT:    s_cselect_b32 s0, s3, s2
+; GCN-NEXT:    s_cmp_eq_u32 s10, 2
+; GCN-NEXT:    s_cselect_b32 s1, 1, 0
+; GCN-NEXT:    s_and_b32 s1, s1, 1
+; GCN-NEXT:    s_cmp_lg_u32 s1, 0
+; GCN-NEXT:    s_cselect_b32 s0, s4, s0
+; GCN-NEXT:    s_cmp_eq_u32 s10, 3
+; GCN-NEXT:    s_cselect_b32 s1, 1, 0
+; GCN-NEXT:    s_and_b32 s1, s1, 1
+; GCN-NEXT:    s_cmp_lg_u32 s1, 0
+; GCN-NEXT:    s_cselect_b32 s0, s5, s0
+; GCN-NEXT:    s_cmp_eq_u32 s10, 4
+; GCN-NEXT:    s_cselect_b32 s1, 1, 0
+; GCN-NEXT:    s_and_b32 s1, s1, 1
+; GCN-NEXT:    s_cmp_lg_u32 s1, 0
+; GCN-NEXT:    s_cselect_b32 s0, s6, s0
+; GCN-NEXT:    s_cmp_eq_u32 s10, 5
+; GCN-NEXT:    s_cselect_b32 s1, 1, 0
+; GCN-NEXT:    s_and_b32 s1, s1, 1
+; GCN-NEXT:    s_cmp_lg_u32 s1, 0
+; GCN-NEXT:    s_cselect_b32 s0, s7, s0
+; GCN-NEXT:    s_cmp_eq_u32 s10, 6
+; GCN-NEXT:    s_cselect_b32 s1, 1, 0
+; GCN-NEXT:    s_and_b32 s1, s1, 1
+; GCN-NEXT:    s_cmp_lg_u32 s1, 0
+; GCN-NEXT:    s_cselect_b32 s0, s8, s0
+; GCN-NEXT:    s_cmp_eq_u32 s10, 7
+; GCN-NEXT:    s_cselect_b32 s1, 1, 0
+; GCN-NEXT:    s_and_b32 s1, s1, 1
+; GCN-NEXT:    s_cmp_lg_u32 s1, 0
+; GCN-NEXT:    s_cselect_b32 s0, s9, s0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    ; return to shader part epilog
 entry:
   %add = add i32 %sel, 3
@@ -459,36 +544,41 @@
 ; GPRIDX-LABEL: dyn_extract_v8f32_v_v_offset3:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GPRIDX-NEXT:    s_mov_b64 s[4:5], exec
-; GPRIDX-NEXT:  BB13_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT:    v_readfirstlane_b32 s6, v8
-; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v8
-; GPRIDX-NEXT:    s_set_gpr_idx_on s6, gpr_idx(SRC0)
-; GPRIDX-NEXT:    v_mov_b32_e32 v9, v3
-; GPRIDX-NEXT:    s_set_gpr_idx_off
-; GPRIDX-NEXT:    s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT:    s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT:    s_cbranch_execnz BB13_1
-; GPRIDX-NEXT:  ; %bb.2:
-; GPRIDX-NEXT:    s_mov_b64 exec, s[4:5]
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, v9
+; GPRIDX-NEXT:    v_add_u32_e32 v8, 3, v8
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v8
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v8
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v8
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v8
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v8
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v8
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
 ; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; MOVREL-LABEL: dyn_extract_v8f32_v_v_offset3:
 ; MOVREL:       ; %bb.0: ; %entry
 ; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MOVREL-NEXT:    s_mov_b64 s[4:5], exec
-; MOVREL-NEXT:  BB13_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT:    v_readfirstlane_b32 s6, v8
-; MOVREL-NEXT:    s_mov_b32 m0, s6
-; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v8
-; MOVREL-NEXT:    v_movrels_b32_e32 v9, v3
-; MOVREL-NEXT:    s_and_saveexec_b64 vcc, vcc
-; MOVREL-NEXT:    s_xor_b64 exec, exec, vcc
-; MOVREL-NEXT:    s_cbranch_execnz BB13_1
-; MOVREL-NEXT:  ; %bb.2:
-; MOVREL-NEXT:    s_mov_b64 exec, s[4:5]
-; MOVREL-NEXT:    v_mov_b32_e32 v0, v9
+; MOVREL-NEXT:    v_add_u32_e32 v8, vcc, 3, v8
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
+; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v8
+; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v8
+; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v8
+; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v8
+; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v8
+; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v8
+; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
 ; MOVREL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %add = add i32 %sel, 3
@@ -792,40 +882,24 @@
 }
 
 define i8 addrspace(3)* @dyn_extract_v8p3_v_v(<8 x i8 addrspace(3)*> %vec, i32 %idx) {
-; GPRIDX-LABEL: dyn_extract_v8p3_v_v:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GPRIDX-NEXT:    s_mov_b64 s[4:5], exec
-; GPRIDX-NEXT:  BB23_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT:    v_readfirstlane_b32 s6, v8
-; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v8
-; GPRIDX-NEXT:    s_set_gpr_idx_on s6, gpr_idx(SRC0)
-; GPRIDX-NEXT:    v_mov_b32_e32 v9, v0
-; GPRIDX-NEXT:    s_set_gpr_idx_off
-; GPRIDX-NEXT:    s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT:    s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT:    s_cbranch_execnz BB23_1
-; GPRIDX-NEXT:  ; %bb.2:
-; GPRIDX-NEXT:    s_mov_b64 exec, s[4:5]
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, v9
-; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
-;
-; MOVREL-LABEL: dyn_extract_v8p3_v_v:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MOVREL-NEXT:    s_mov_b64 s[4:5], exec
-; MOVREL-NEXT:  BB23_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT:    v_readfirstlane_b32 s6, v8
-; MOVREL-NEXT:    s_mov_b32 m0, s6
-; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v8
-; MOVREL-NEXT:    v_movrels_b32_e32 v9, v0
-; MOVREL-NEXT:    s_and_saveexec_b64 vcc, vcc
-; MOVREL-NEXT:    s_xor_b64 exec, exec, vcc
-; MOVREL-NEXT:    s_cbranch_execnz BB23_1
-; MOVREL-NEXT:  ; %bb.2:
-; MOVREL-NEXT:    s_mov_b64 exec, s[4:5]
-; MOVREL-NEXT:    v_mov_b32_e32 v0, v9
-; MOVREL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: dyn_extract_v8p3_v_v:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v8
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v8
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v8
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v8
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v8
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v8
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ext = extractelement <8 x i8 addrspace(3)*> %vec, i32 %idx
   ret i8 addrspace(3)* %ext
@@ -834,32 +908,82 @@
 define amdgpu_ps void @dyn_extract_v8p3_s_s(<8 x i8 addrspace(3)*> inreg %vec, i32 inreg %idx) {
 ; GPRIDX-LABEL: dyn_extract_v8p3_s_s:
 ; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_mov_b32 s0, s2
-; GPRIDX-NEXT:    s_mov_b32 m0, s10
-; GPRIDX-NEXT:    s_mov_b32 s1, s3
-; GPRIDX-NEXT:    s_mov_b32 s2, s4
-; GPRIDX-NEXT:    s_mov_b32 s3, s5
-; GPRIDX-NEXT:    s_mov_b32 s4, s6
-; GPRIDX-NEXT:    s_mov_b32 s5, s7
-; GPRIDX-NEXT:    s_mov_b32 s6, s8
-; GPRIDX-NEXT:    s_mov_b32 s7, s9
-; GPRIDX-NEXT:    s_movrels_b32 s0, s0
+; GPRIDX-NEXT:    s_cmp_eq_u32 s10, 1
+; GPRIDX-NEXT:    s_cselect_b32 s0, 1, 0
+; GPRIDX-NEXT:    s_and_b32 s0, s0, 1
+; GPRIDX-NEXT:    s_cmp_lg_u32 s0, 0
+; GPRIDX-NEXT:    s_cselect_b32 s0, s3, s2
+; GPRIDX-NEXT:    s_cmp_eq_u32 s10, 2
+; GPRIDX-NEXT:    s_cselect_b32 s1, 1, 0
+; GPRIDX-NEXT:    s_and_b32 s1, s1, 1
+; GPRIDX-NEXT:    s_cmp_lg_u32 s1, 0
+; GPRIDX-NEXT:    s_cselect_b32 s0, s4, s0
+; GPRIDX-NEXT:    s_cmp_eq_u32 s10, 3
+; GPRIDX-NEXT:    s_cselect_b32 s1, 1, 0
+; GPRIDX-NEXT:    s_and_b32 s1, s1, 1
+; GPRIDX-NEXT:    s_cmp_lg_u32 s1, 0
+; GPRIDX-NEXT:    s_cselect_b32 s0, s5, s0
+; GPRIDX-NEXT:    s_cmp_eq_u32 s10, 4
+; GPRIDX-NEXT:    s_cselect_b32 s1, 1, 0
+; GPRIDX-NEXT:    s_and_b32 s1, s1, 1
+; GPRIDX-NEXT:    s_cmp_lg_u32 s1, 0
+; GPRIDX-NEXT:    s_cselect_b32 s0, s6, s0
+; GPRIDX-NEXT:    s_cmp_eq_u32 s10, 5
+; GPRIDX-NEXT:    s_cselect_b32 s1, 1, 0
+; GPRIDX-NEXT:    s_and_b32 s1, s1, 1
+; GPRIDX-NEXT:    s_cmp_lg_u32 s1, 0
+; GPRIDX-NEXT:    s_cselect_b32 s0, s7, s0
+; GPRIDX-NEXT:    s_cmp_eq_u32 s10, 6
+; GPRIDX-NEXT:    s_cselect_b32 s1, 1, 0
+; GPRIDX-NEXT:    s_and_b32 s1, s1, 1
+; GPRIDX-NEXT:    s_cmp_lg_u32 s1, 0
+; GPRIDX-NEXT:    s_cselect_b32 s0, s8, s0
+; GPRIDX-NEXT:    s_cmp_eq_u32 s10, 7
+; GPRIDX-NEXT:    s_cselect_b32 s1, 1, 0
+; GPRIDX-NEXT:    s_and_b32 s1, s1, 1
+; GPRIDX-NEXT:    s_cmp_lg_u32 s1, 0
+; GPRIDX-NEXT:    s_cselect_b32 s0, s9, s0
 ; GPRIDX-NEXT:    v_mov_b32_e32 v0, s0
 ; GPRIDX-NEXT:    ds_write_b32 v0, v0
 ; GPRIDX-NEXT:    s_endpgm
 ;
 ; MOVREL-LABEL: dyn_extract_v8p3_s_s:
 ; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_mov_b32 s0, s2
-; MOVREL-NEXT:    s_mov_b32 m0, s10
-; MOVREL-NEXT:    s_mov_b32 s1, s3
-; MOVREL-NEXT:    s_mov_b32 s2, s4
-; MOVREL-NEXT:    s_mov_b32 s3, s5
-; MOVREL-NEXT:    s_mov_b32 s4, s6
-; MOVREL-NEXT:    s_mov_b32 s5, s7
-; MOVREL-NEXT:    s_mov_b32 s6, s8
-; MOVREL-NEXT:    s_mov_b32 s7, s9
-; MOVREL-NEXT:    s_movrels_b32 s0, s0
+; MOVREL-NEXT:    s_cmp_eq_u32 s10, 1
+; MOVREL-NEXT:    s_cselect_b32 s0, 1, 0
+; MOVREL-NEXT:    s_and_b32 s0, s0, 1
+; MOVREL-NEXT:    s_cmp_lg_u32 s0, 0
+; MOVREL-NEXT:    s_cselect_b32 s0, s3, s2
+; MOVREL-NEXT:    s_cmp_eq_u32 s10, 2
+; MOVREL-NEXT:    s_cselect_b32 s1, 1, 0
+; MOVREL-NEXT:    s_and_b32 s1, s1, 1
+; MOVREL-NEXT:    s_cmp_lg_u32 s1, 0
+; MOVREL-NEXT:    s_cselect_b32 s0, s4, s0
+; MOVREL-NEXT:    s_cmp_eq_u32 s10, 3
+; MOVREL-NEXT:    s_cselect_b32 s1, 1, 0
+; MOVREL-NEXT:    s_and_b32 s1, s1, 1
+; MOVREL-NEXT:    s_cmp_lg_u32 s1, 0
+; MOVREL-NEXT:    s_cselect_b32 s0, s5, s0
+; MOVREL-NEXT:    s_cmp_eq_u32 s10, 4
+; MOVREL-NEXT:    s_cselect_b32 s1, 1, 0
+; MOVREL-NEXT:    s_and_b32 s1, s1, 1
+; MOVREL-NEXT:    s_cmp_lg_u32 s1, 0
+; MOVREL-NEXT:    s_cselect_b32 s0, s6, s0
+; MOVREL-NEXT:    s_cmp_eq_u32 s10, 5
+; MOVREL-NEXT:    s_cselect_b32 s1, 1, 0
+; MOVREL-NEXT:    s_and_b32 s1, s1, 1
+; MOVREL-NEXT:    s_cmp_lg_u32 s1, 0
+; MOVREL-NEXT:    s_cselect_b32 s0, s7, s0
+; MOVREL-NEXT:    s_cmp_eq_u32 s10, 6
+; MOVREL-NEXT:    s_cselect_b32 s1, 1, 0
+; MOVREL-NEXT:    s_and_b32 s1, s1, 1
+; MOVREL-NEXT:    s_cmp_lg_u32 s1, 0
+; MOVREL-NEXT:    s_cselect_b32 s0, s8, s0
+; MOVREL-NEXT:    s_cmp_eq_u32 s10, 7
+; MOVREL-NEXT:    s_cselect_b32 s1, 1, 0
+; MOVREL-NEXT:    s_and_b32 s1, s1, 1
+; MOVREL-NEXT:    s_cmp_lg_u32 s1, 0
+; MOVREL-NEXT:    s_cselect_b32 s0, s9, s0
 ; MOVREL-NEXT:    v_mov_b32_e32 v0, s0
 ; MOVREL-NEXT:    s_mov_b32 m0, -1
 ; MOVREL-NEXT:    ds_write_b32 v0, v0
@@ -1147,25 +1271,22 @@
 define amdgpu_ps float @dyn_extract_v6f32_s_v(<6 x float> inreg %vec, i32 %sel) {
 ; GCN-LABEL: dyn_extract_v6f32_s_v:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_mov_b32 s0, s2
-; GCN-NEXT:    s_mov_b32 s1, s3
-; GCN-NEXT:    s_mov_b32 s2, s4
-; GCN-NEXT:    s_mov_b32 s3, s5
-; GCN-NEXT:    s_mov_b32 s4, s6
-; GCN-NEXT:    s_mov_b32 s5, s7
-; GCN-NEXT:    s_mov_b64 s[6:7], exec
-; GCN-NEXT:  BB33_1: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    v_readfirstlane_b32 s8, v0
-; GCN-NEXT:    s_mov_b32 m0, s8
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v0
-; GCN-NEXT:    s_movrels_b32 s8, s0
-; GCN-NEXT:    v_mov_b32_e32 v1, s8
-; GCN-NEXT:    s_and_saveexec_b64 vcc, vcc
-; GCN-NEXT:    s_xor_b64 exec, exec, vcc
-; GCN-NEXT:    s_cbranch_execnz BB33_1
-; GCN-NEXT:  ; %bb.2:
-; GCN-NEXT:    s_mov_b64 exec, s[6:7]
-; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, s5
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, s6
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, s7
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GCN-NEXT:    ; return to shader part epilog
 entry:
   %ext = extractelement <6 x float> %vec, i32 %sel
@@ -1173,58 +1294,54 @@
 }
 
 define float @dyn_extract_v6f32_v_v(<6 x float> %vec, i32 %sel) {
-; GPRIDX-LABEL: dyn_extract_v6f32_v_v:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GPRIDX-NEXT:    s_mov_b64 s[4:5], exec
-; GPRIDX-NEXT:  BB34_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT:    v_readfirstlane_b32 s6, v6
-; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v6
-; GPRIDX-NEXT:    s_set_gpr_idx_on s6, gpr_idx(SRC0)
-; GPRIDX-NEXT:    v_mov_b32_e32 v7, v0
-; GPRIDX-NEXT:    s_set_gpr_idx_off
-; GPRIDX-NEXT:    s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT:    s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT:    s_cbranch_execnz BB34_1
-; GPRIDX-NEXT:  ; %bb.2:
-; GPRIDX-NEXT:    s_mov_b64 exec, s[4:5]
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, v7
-; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
-;
-; MOVREL-LABEL: dyn_extract_v6f32_v_v:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MOVREL-NEXT:    s_mov_b64 s[4:5], exec
-; MOVREL-NEXT:  BB34_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT:    v_readfirstlane_b32 s6, v6
-; MOVREL-NEXT:    s_mov_b32 m0, s6
-; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v6
-; MOVREL-NEXT:    v_movrels_b32_e32 v7, v0
-; MOVREL-NEXT:    s_and_saveexec_b64 vcc, vcc
-; MOVREL-NEXT:    s_xor_b64 exec, exec, vcc
-; MOVREL-NEXT:    s_cbranch_execnz BB34_1
-; MOVREL-NEXT:  ; %bb.2:
-; MOVREL-NEXT:    s_mov_b64 exec, s[4:5]
-; MOVREL-NEXT:    v_mov_b32_e32 v0, v7
-; MOVREL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: dyn_extract_v6f32_v_v:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v6
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v6
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v6
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v6
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ext = extractelement <6 x float> %vec, i32 %sel
   ret float %ext
 }
 
 define amdgpu_ps float @dyn_extract_v6f32_v_s(<6 x float> %vec, i32 inreg %sel) {
-; GPRIDX-LABEL: dyn_extract_v6f32_v_s:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_set_gpr_idx_on s2, gpr_idx(SRC0)
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, v0
-; GPRIDX-NEXT:    s_set_gpr_idx_off
-; GPRIDX-NEXT:    ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v6f32_v_s:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_mov_b32 m0, s2
-; MOVREL-NEXT:    v_movrels_b32_e32 v0, v0
-; MOVREL-NEXT:    ; return to shader part epilog
+; GCN-LABEL: dyn_extract_v6f32_v_s:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_cmp_eq_u32 s2, 1
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    s_and_b32 s0, 1, s0
+; GCN-NEXT:    s_cmp_eq_u32 s2, 2
+; GCN-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    s_and_b32 s0, 1, s0
+; GCN-NEXT:    s_cmp_eq_u32 s2, 3
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    s_and_b32 s0, 1, s0
+; GCN-NEXT:    s_cmp_eq_u32 s2, 4
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    s_and_b32 s0, 1, s0
+; GCN-NEXT:    s_cmp_eq_u32 s2, 5
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    s_and_b32 s0, 1, s0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GCN-NEXT:    ; return to shader part epilog
 entry:
   %ext = extractelement <6 x float> %vec, i32 %sel
   ret float %ext
@@ -1233,14 +1350,31 @@
 define amdgpu_ps float @dyn_extract_v6f32_s_s(<6 x float> inreg %vec, i32 inreg %sel) {
 ; GCN-LABEL: dyn_extract_v6f32_s_s:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_mov_b32 s0, s2
-; GCN-NEXT:    s_mov_b32 m0, s8
-; GCN-NEXT:    s_mov_b32 s1, s3
-; GCN-NEXT:    s_mov_b32 s2, s4
-; GCN-NEXT:    s_mov_b32 s3, s5
-; GCN-NEXT:    s_mov_b32 s4, s6
-; GCN-NEXT:    s_mov_b32 s5, s7
-; GCN-NEXT:    s_movrels_b32 s0, s0
+; GCN-NEXT:    s_cmp_eq_u32 s8, 1
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    s_and_b32 s0, s0, 1
+; GCN-NEXT:    s_cmp_lg_u32 s0, 0
+; GCN-NEXT:    s_cselect_b32 s0, s3, s2
+; GCN-NEXT:    s_cmp_eq_u32 s8, 2
+; GCN-NEXT:    s_cselect_b32 s1, 1, 0
+; GCN-NEXT:    s_and_b32 s1, s1, 1
+; GCN-NEXT:    s_cmp_lg_u32 s1, 0
+; GCN-NEXT:    s_cselect_b32 s0, s4, s0
+; GCN-NEXT:    s_cmp_eq_u32 s8, 3
+; GCN-NEXT:    s_cselect_b32 s1, 1, 0
+; GCN-NEXT:    s_and_b32 s1, s1, 1
+; GCN-NEXT:    s_cmp_lg_u32 s1, 0
+; GCN-NEXT:    s_cselect_b32 s0, s5, s0
+; GCN-NEXT:    s_cmp_eq_u32 s8, 4
+; GCN-NEXT:    s_cselect_b32 s1, 1, 0
+; GCN-NEXT:    s_and_b32 s1, s1, 1
+; GCN-NEXT:    s_cmp_lg_u32 s1, 0
+; GCN-NEXT:    s_cselect_b32 s0, s6, s0
+; GCN-NEXT:    s_cmp_eq_u32 s8, 5
+; GCN-NEXT:    s_cselect_b32 s1, 1, 0
+; GCN-NEXT:    s_and_b32 s1, s1, 1
+; GCN-NEXT:    s_cmp_lg_u32 s1, 0
+; GCN-NEXT:    s_cselect_b32 s0, s7, s0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    ; return to shader part epilog
 entry:
@@ -1251,26 +1385,25 @@
 define amdgpu_ps float @dyn_extract_v7f32_s_v(<7 x float> inreg %vec, i32 %sel) {
 ; GCN-LABEL: dyn_extract_v7f32_s_v:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_mov_b32 s0, s2
-; GCN-NEXT:    s_mov_b32 s1, s3
-; GCN-NEXT:    s_mov_b32 s2, s4
-; GCN-NEXT:    s_mov_b32 s3, s5
-; GCN-NEXT:    s_mov_b32 s4, s6
-; GCN-NEXT:    s_mov_b32 s6, s8
-; GCN-NEXT:    s_mov_b32 s5, s7
-; GCN-NEXT:    s_mov_b64 s[8:9], exec
-; GCN-NEXT:  BB37_1: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    v_readfirstlane_b32 s7, v0
-; GCN-NEXT:    s_mov_b32 m0, s7
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s7, v0
-; GCN-NEXT:    s_movrels_b32 s7, s0
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
-; GCN-NEXT:    s_and_saveexec_b64 vcc, vcc
-; GCN-NEXT:    s_xor_b64 exec, exec, vcc
-; GCN-NEXT:    s_cbranch_execnz BB37_1
-; GCN-NEXT:  ; %bb.2:
-; GCN-NEXT:    s_mov_b64 exec, s[8:9]
-; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, s5
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, s6
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, s7
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, s8
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GCN-NEXT:    ; return to shader part epilog
 entry:
   %ext = extractelement <7 x float> %vec, i32 %sel
@@ -1278,58 +1411,61 @@
 }
 
 define float @dyn_extract_v7f32_v_v(<7 x float> %vec, i32 %sel) {
-; GPRIDX-LABEL: dyn_extract_v7f32_v_v:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GPRIDX-NEXT:    s_mov_b64 s[4:5], exec
-; GPRIDX-NEXT:  BB38_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT:    v_readfirstlane_b32 s6, v7
-; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v7
-; GPRIDX-NEXT:    s_set_gpr_idx_on s6, gpr_idx(SRC0)
-; GPRIDX-NEXT:    v_mov_b32_e32 v8, v0
-; GPRIDX-NEXT:    s_set_gpr_idx_off
-; GPRIDX-NEXT:    s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT:    s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT:    s_cbranch_execnz BB38_1
-; GPRIDX-NEXT:  ; %bb.2:
-; GPRIDX-NEXT:    s_mov_b64 exec, s[4:5]
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, v8
-; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
-;
-; MOVREL-LABEL: dyn_extract_v7f32_v_v:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MOVREL-NEXT:    s_mov_b64 s[4:5], exec
-; MOVREL-NEXT:  BB38_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT:    v_readfirstlane_b32 s6, v7
-; MOVREL-NEXT:    s_mov_b32 m0, s6
-; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v7
-; MOVREL-NEXT:    v_movrels_b32_e32 v8, v0
-; MOVREL-NEXT:    s_and_saveexec_b64 vcc, vcc
-; MOVREL-NEXT:    s_xor_b64 exec, exec, vcc
-; MOVREL-NEXT:    s_cbranch_execnz BB38_1
-; MOVREL-NEXT:  ; %bb.2:
-; MOVREL-NEXT:    s_mov_b64 exec, s[4:5]
-; MOVREL-NEXT:    v_mov_b32_e32 v0, v8
-; MOVREL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: dyn_extract_v7f32_v_v:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v7
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v7
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v7
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v7
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v7
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ext = extractelement <7 x float> %vec, i32 %sel
   ret float %ext
 }
 
 define amdgpu_ps float @dyn_extract_v7f32_v_s(<7 x float> %vec, i32 inreg %sel) {
-; GPRIDX-LABEL: dyn_extract_v7f32_v_s:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_set_gpr_idx_on s2, gpr_idx(SRC0)
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, v0
-; GPRIDX-NEXT:    s_set_gpr_idx_off
-; GPRIDX-NEXT:    ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v7f32_v_s:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_mov_b32 m0, s2
-; MOVREL-NEXT:    v_movrels_b32_e32 v0, v0
-; MOVREL-NEXT:    ; return to shader part epilog
+; GCN-LABEL: dyn_extract_v7f32_v_s:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_cmp_eq_u32 s2, 1
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    s_and_b32 s0, 1, s0
+; GCN-NEXT:    s_cmp_eq_u32 s2, 2
+; GCN-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    s_and_b32 s0, 1, s0
+; GCN-NEXT:    s_cmp_eq_u32 s2, 3
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    s_and_b32 s0, 1, s0
+; GCN-NEXT:    s_cmp_eq_u32 s2, 4
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    s_and_b32 s0, 1, s0
+; GCN-NEXT:    s_cmp_eq_u32 s2, 5
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    s_and_b32 s0, 1, s0
+; GCN-NEXT:    s_cmp_eq_u32 s2, 6
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    s_and_b32 s0, 1, s0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; GCN-NEXT:    ; return to shader part epilog
 entry:
   %ext = extractelement <7 x float> %vec, i32 %sel
   ret float %ext
@@ -1338,15 +1474,36 @@
 define amdgpu_ps float @dyn_extract_v7f32_s_s(<7 x float> inreg %vec, i32 inreg %sel) {
 ; GCN-LABEL: dyn_extract_v7f32_s_s:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_mov_b32 s0, s2
-; GCN-NEXT:    s_mov_b32 m0, s9
-; GCN-NEXT:    s_mov_b32 s1, s3
-; GCN-NEXT:    s_mov_b32 s2, s4
-; GCN-NEXT:    s_mov_b32 s3, s5
-; GCN-NEXT:    s_mov_b32 s4, s6
-; GCN-NEXT:    s_mov_b32 s5, s7
-; GCN-NEXT:    s_mov_b32 s6, s8
-; GCN-NEXT:    s_movrels_b32 s0, s0
+; GCN-NEXT:    s_cmp_eq_u32 s9, 1
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    s_and_b32 s0, s0, 1
+; GCN-NEXT:    s_cmp_lg_u32 s0, 0
+; GCN-NEXT:    s_cselect_b32 s0, s3, s2
+; GCN-NEXT:    s_cmp_eq_u32 s9, 2
+; GCN-NEXT:    s_cselect_b32 s1, 1, 0
+; GCN-NEXT:    s_and_b32 s1, s1, 1
+; GCN-NEXT:    s_cmp_lg_u32 s1, 0
+; GCN-NEXT:    s_cselect_b32 s0, s4, s0
+; GCN-NEXT:    s_cmp_eq_u32 s9, 3
+; GCN-NEXT:    s_cselect_b32 s1, 1, 0
+; GCN-NEXT:    s_and_b32 s1, s1, 1
+; GCN-NEXT:    s_cmp_lg_u32 s1, 0
+; GCN-NEXT:    s_cselect_b32 s0, s5, s0
+; GCN-NEXT:    s_cmp_eq_u32 s9, 4
+; GCN-NEXT:    s_cselect_b32 s1, 1, 0
+; GCN-NEXT:    s_and_b32 s1, s1, 1
+; GCN-NEXT:    s_cmp_lg_u32 s1, 0
+; GCN-NEXT:    s_cselect_b32 s0, s6, s0
+; GCN-NEXT:    s_cmp_eq_u32 s9, 5
+; GCN-NEXT:    s_cselect_b32 s1, 1, 0
+; GCN-NEXT:    s_and_b32 s1, s1, 1
+; GCN-NEXT:    s_cmp_lg_u32 s1, 0
+; GCN-NEXT:    s_cselect_b32 s0, s7, s0
+; GCN-NEXT:    s_cmp_eq_u32 s9, 6
+; GCN-NEXT:    s_cselect_b32 s1, 1, 0
+; GCN-NEXT:    s_and_b32 s1, s1, 1
+; GCN-NEXT:    s_cmp_lg_u32 s1, 0
+; GCN-NEXT:    s_cselect_b32 s0, s8, s0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    ; return to shader part epilog
 entry:
@@ -1622,7 +1779,7 @@
 ; GPRIDX-NEXT:     kernel_code_entry_byte_offset = 256
 ; GPRIDX-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GPRIDX-NEXT:     granulated_workitem_vgpr_count = 0
-; GPRIDX-NEXT:     granulated_wavefront_sgpr_count = 2
+; GPRIDX-NEXT:     granulated_wavefront_sgpr_count = 1
 ; GPRIDX-NEXT:     priority = 0
 ; GPRIDX-NEXT:     float_mode = 240
 ; GPRIDX-NEXT:     priv = 0
@@ -1665,7 +1822,7 @@
 ; GPRIDX-NEXT:     gds_segment_byte_size = 0
 ; GPRIDX-NEXT:     kernarg_segment_byte_size = 28
 ; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
-; GPRIDX-NEXT:     wavefront_sgpr_count = 24
+; GPRIDX-NEXT:     wavefront_sgpr_count = 10
 ; GPRIDX-NEXT:     workitem_vgpr_count = 4
 ; GPRIDX-NEXT:     reserved_vgpr_first = 0
 ; GPRIDX-NEXT:     reserved_vgpr_count = 0
@@ -1681,22 +1838,37 @@
 ; GPRIDX-NEXT:     runtime_loader_kernel_symbol = 0
 ; GPRIDX-NEXT:    .end_amd_kernel_code_t
 ; GPRIDX-NEXT:  ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GPRIDX-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GPRIDX-NEXT:    s_mov_b32 s16, 0
-; GPRIDX-NEXT:    s_mov_b64 s[8:9], 1.0
-; GPRIDX-NEXT:    s_mov_b32 s17, 0x40140000
-; GPRIDX-NEXT:    s_mov_b64 s[14:15], 4.0
+; GPRIDX-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GPRIDX-NEXT:    s_load_dword s8, s[4:5], 0x8
+; GPRIDX-NEXT:    s_mov_b32 s0, 0
+; GPRIDX-NEXT:    s_mov_b32 s3, 0x40080000
+; GPRIDX-NEXT:    s_mov_b32 s2, s0
+; GPRIDX-NEXT:    s_mov_b32 s1, 0x40140000
 ; GPRIDX-NEXT:    s_waitcnt lgkmcnt(0)
-; GPRIDX-NEXT:    s_mov_b32 m0, s2
-; GPRIDX-NEXT:    s_mov_b32 s13, 0x40080000
-; GPRIDX-NEXT:    s_mov_b32 s12, s16
-; GPRIDX-NEXT:    s_mov_b64 s[10:11], 2.0
-; GPRIDX-NEXT:    s_movrels_b64 s[2:3], s[8:9]
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, s2
-; GPRIDX-NEXT:    v_mov_b32_e32 v3, s1
-; GPRIDX-NEXT:    v_mov_b32_e32 v1, s3
-; GPRIDX-NEXT:    v_mov_b32_e32 v2, s0
+; GPRIDX-NEXT:    s_cmp_eq_u32 s8, 1
+; GPRIDX-NEXT:    s_cselect_b32 s4, 1, 0
+; GPRIDX-NEXT:    s_and_b32 s4, s4, 1
+; GPRIDX-NEXT:    s_cmp_lg_u32 s4, 0
+; GPRIDX-NEXT:    s_cselect_b64 s[4:5], 2.0, 1.0
+; GPRIDX-NEXT:    s_cmp_eq_u32 s8, 2
+; GPRIDX-NEXT:    s_cselect_b32 s9, 1, 0
+; GPRIDX-NEXT:    s_and_b32 s9, s9, 1
+; GPRIDX-NEXT:    s_cmp_lg_u32 s9, 0
+; GPRIDX-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GPRIDX-NEXT:    s_cmp_eq_u32 s8, 3
+; GPRIDX-NEXT:    s_cselect_b32 s4, 1, 0
+; GPRIDX-NEXT:    s_and_b32 s4, s4, 1
+; GPRIDX-NEXT:    s_cmp_lg_u32 s4, 0
+; GPRIDX-NEXT:    s_cselect_b64 s[2:3], 4.0, s[2:3]
+; GPRIDX-NEXT:    s_cmp_eq_u32 s8, 4
+; GPRIDX-NEXT:    s_cselect_b32 s4, 1, 0
+; GPRIDX-NEXT:    s_and_b32 s4, s4, 1
+; GPRIDX-NEXT:    s_cmp_lg_u32 s4, 0
+; GPRIDX-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, s0
+; GPRIDX-NEXT:    v_mov_b32_e32 v2, s6
+; GPRIDX-NEXT:    v_mov_b32_e32 v1, s1
+; GPRIDX-NEXT:    v_mov_b32_e32 v3, s7
 ; GPRIDX-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GPRIDX-NEXT:    s_endpgm
 ;
@@ -1711,7 +1883,7 @@
 ; MOVREL-NEXT:     kernel_code_entry_byte_offset = 256
 ; MOVREL-NEXT:     kernel_code_prefetch_byte_size = 0
 ; MOVREL-NEXT:     granulated_workitem_vgpr_count = 0
-; MOVREL-NEXT:     granulated_wavefront_sgpr_count = 2
+; MOVREL-NEXT:     granulated_wavefront_sgpr_count = 1
 ; MOVREL-NEXT:     priority = 0
 ; MOVREL-NEXT:     float_mode = 240
 ; MOVREL-NEXT:     priv = 0
@@ -1754,7 +1926,7 @@
 ; MOVREL-NEXT:     gds_segment_byte_size = 0
 ; MOVREL-NEXT:     kernarg_segment_byte_size = 28
 ; MOVREL-NEXT:     workgroup_fbarrier_count = 0
-; MOVREL-NEXT:     wavefront_sgpr_count = 24
+; MOVREL-NEXT:     wavefront_sgpr_count = 10
 ; MOVREL-NEXT:     workitem_vgpr_count = 4
 ; MOVREL-NEXT:     reserved_vgpr_first = 0
 ; MOVREL-NEXT:     reserved_vgpr_count = 0
@@ -1770,22 +1942,37 @@
 ; MOVREL-NEXT:     runtime_loader_kernel_symbol = 0
 ; MOVREL-NEXT:    .end_amd_kernel_code_t
 ; MOVREL-NEXT:  ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; MOVREL-NEXT:    s_load_dword s2, s[4:5], 0x8
-; MOVREL-NEXT:    s_mov_b32 s16, 0
-; MOVREL-NEXT:    s_mov_b64 s[8:9], 1.0
-; MOVREL-NEXT:    s_mov_b32 s17, 0x40140000
-; MOVREL-NEXT:    s_mov_b64 s[14:15], 4.0
+; MOVREL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; MOVREL-NEXT:    s_load_dword s8, s[4:5], 0x8
+; MOVREL-NEXT:    s_mov_b32 s0, 0
+; MOVREL-NEXT:    s_mov_b32 s3, 0x40080000
+; MOVREL-NEXT:    s_mov_b32 s2, s0
+; MOVREL-NEXT:    s_mov_b32 s1, 0x40140000
 ; MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
-; MOVREL-NEXT:    s_mov_b32 m0, s2
-; MOVREL-NEXT:    s_mov_b32 s13, 0x40080000
-; MOVREL-NEXT:    s_mov_b32 s12, s16
-; MOVREL-NEXT:    s_mov_b64 s[10:11], 2.0
-; MOVREL-NEXT:    s_movrels_b64 s[2:3], s[8:9]
-; MOVREL-NEXT:    v_mov_b32_e32 v0, s2
-; MOVREL-NEXT:    v_mov_b32_e32 v3, s1
-; MOVREL-NEXT:    v_mov_b32_e32 v1, s3
-; MOVREL-NEXT:    v_mov_b32_e32 v2, s0
+; MOVREL-NEXT:    s_cmp_eq_u32 s8, 1
+; MOVREL-NEXT:    s_cselect_b32 s4, 1, 0
+; MOVREL-NEXT:    s_and_b32 s4, s4, 1
+; MOVREL-NEXT:    s_cmp_lg_u32 s4, 0
+; MOVREL-NEXT:    s_cselect_b64 s[4:5], 2.0, 1.0
+; MOVREL-NEXT:    s_cmp_eq_u32 s8, 2
+; MOVREL-NEXT:    s_cselect_b32 s9, 1, 0
+; MOVREL-NEXT:    s_and_b32 s9, s9, 1
+; MOVREL-NEXT:    s_cmp_lg_u32 s9, 0
+; MOVREL-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; MOVREL-NEXT:    s_cmp_eq_u32 s8, 3
+; MOVREL-NEXT:    s_cselect_b32 s4, 1, 0
+; MOVREL-NEXT:    s_and_b32 s4, s4, 1
+; MOVREL-NEXT:    s_cmp_lg_u32 s4, 0
+; MOVREL-NEXT:    s_cselect_b64 s[2:3], 4.0, s[2:3]
+; MOVREL-NEXT:    s_cmp_eq_u32 s8, 4
+; MOVREL-NEXT:    s_cselect_b32 s4, 1, 0
+; MOVREL-NEXT:    s_and_b32 s4, s4, 1
+; MOVREL-NEXT:    s_cmp_lg_u32 s4, 0
+; MOVREL-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
+; MOVREL-NEXT:    v_mov_b32_e32 v0, s0
+; MOVREL-NEXT:    v_mov_b32_e32 v2, s6
+; MOVREL-NEXT:    v_mov_b32_e32 v1, s1
+; MOVREL-NEXT:    v_mov_b32_e32 v3, s7
 ; MOVREL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; MOVREL-NEXT:    s_endpgm
 entry: