Index: llvm/lib/Target/AMDGPU/AMDGPUCombine.td
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -37,6 +37,12 @@
     [{ return matchCvtF32UByteN(*${cvt_f32_ubyteN}, MRI, *MF, ${matchinfo}); }]),
   (apply [{ applyCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }])>;
 
+def extract_elt_to_cmp_select : GICombineRule<
+  (defs root:$extract_elt),
+  (match (wip_match_opcode G_EXTRACT_VECTOR_ELT):$extract_elt,
+    [{ return matchExtractElt(*${extract_elt}, MRI, *MF); }]),
+  (apply [{ applyExtractEltToCmpSelect(*${extract_elt}, MRI); }])>;
+
 // Combines which should only apply on SI/VI
 def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
 
@@ -52,8 +58,8 @@
 // aren't re-legalized.
 // FIXME: Is there a way to remove a single item from all_combines?
 def all_combines_minus_extload : GICombineGroup<[trivial_combines,
-  ptr_add_immed_chain, combine_indexed_load_store, undef_combines,
-  identity_combines]
+  ptr_add_immed_chain, combine_indexed_load_store, extract_elt_to_cmp_select,
+  undef_combines, identity_combines]
 >;
 
 def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
Index: llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -219,6 +219,61 @@
   MI.eraseFromParent();
 }
 
+static bool matchExtractElt(MachineInstr &MI, MachineRegisterInfo &MRI,
+                            MachineFunction &MF) {
+  Register VecReg = MI.getOperand(1).getReg();
+  Register Idx = MI.getOperand(2).getReg();
+
+  if (getOpcodeDef(TargetOpcode::G_CONSTANT, Idx, MRI))
+    return false;
+
+  // FIXME: We need another combiner post RegBankSelect. Then move this combine
+  // there and actually query RegBank of the Idx.
+  bool IsDivergent = false;
+
+  LLT VecTy = MRI.getType(VecReg);
+  unsigned EltSize = VecTy.getScalarSizeInBits();
+  unsigned NumElem = VecTy.getNumElements();
+
+  return SIInstrInfo::shouldExpandVectorDynExt(EltSize, NumElem, IsDivergent);
+}
+
+static void applyExtractEltToCmpSelect(MachineInstr &MI,
+                                       MachineRegisterInfo &MRI) {
+  MachineIRBuilder B(MI);
+
+  Register VecReg = MI.getOperand(1).getReg();
+  Register Idx = MI.getOperand(2).getReg();
+
+  LLT VecTy = MRI.getType(VecReg);
+  LLT EltTy = VecTy.getScalarType();
+  unsigned EltSize = EltTy.getSizeInBits();
+  unsigned NumElem = VecTy.getNumElements();
+  MachineOperand &Vec = MI.getOperand(1);
+  Register Res;
+
+  for (unsigned I = 0; I < NumElem; ++I) {
+    // FIXME: After RegBankSelect we should know and set register bank.
+    Register Elt = MRI.createGenericVirtualRegister(EltTy);
+    B.buildExtract(Elt, Vec, I * EltSize);
+    if (I == 0) {
+      Res = Elt;
+      continue;
+    }
+
+    Register Cmp = MRI.createGenericVirtualRegister(LLT::scalar(1));
+    Register IC = MRI.createGenericVirtualRegister(LLT::scalar(32));
+    B.buildConstant(IC, I);
+    B.buildICmp(CmpInst::ICMP_EQ, Cmp, Idx, IC);
+    Register Sel = (I == NumElem - 1) ? MI.getOperand(0).getReg()
+                                      : MRI.createGenericVirtualRegister(EltTy);
+    B.buildSelect(Sel, Cmp, Elt, Res);
+    Res = Sel;
+  }
+
+  MI.eraseFromParent();
+}
+
 #define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
 #include "AMDGPUGenPostLegalizeGICombiner.inc"
 #undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -99,12 +99,6 @@
     "amdgpu-reserve-vgpr-for-sgpr-spill",
     cl::desc("Allocates one VGPR for future SGPR Spill"), cl::init(true));
 
-static cl::opt<bool> UseDivergentRegisterIndexing(
-    "amdgpu-use-divergent-register-indexing",
-    cl::Hidden,
-    cl::desc("Use indirect register addressing for divergent indexes"),
-    cl::init(false));
-
 static bool hasFP32Denormals(const MachineFunction &MF) {
   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
   return Info->getMode().allFP32Denormals();
@@ -9545,33 +9539,17 @@
 // expanded into a set of cmp/select instructions.
 static bool shouldExpandVectorDynExt(SDNode *N) {
   SDValue Idx = N->getOperand(N->getNumOperands() - 1);
-  if (UseDivergentRegisterIndexing || isa<ConstantSDNode>(Idx))
+  if (isa<ConstantSDNode>(Idx))
     return false;
 
   SDValue Vec = N->getOperand(0);
   EVT VecVT = Vec.getValueType();
   EVT EltVT = VecVT.getVectorElementType();
-  unsigned VecSize = VecVT.getSizeInBits();
   unsigned EltSize = EltVT.getSizeInBits();
   unsigned NumElem = VecVT.getVectorNumElements();
 
-  // Sub-dword vectors of size 2 dword or less have better implementation.
-  if (VecSize <= 64 && EltSize < 32)
-    return false;
-
-  // Always expand the rest of sub-dword instructions, otherwise it will be
-  // lowered via memory.
-  if (EltSize < 32)
-    return true;
-
-  // Always do this if var-idx is divergent, otherwise it will become a loop.
-  if (Idx->isDivergent())
-    return true;
-
-  // Large vectors would yield too many compares and v_cndmask_b32 instructions.
-  unsigned NumInsts = NumElem /* Number of compares */ +
-                      ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
-  return NumInsts <= 16;
+  return SIInstrInfo::shouldExpandVectorDynExt(EltSize, NumElem,
+                                               Idx->isDivergent());
 }
 
 SDValue SITargetLowering::performExtractVectorEltCombine(
Index: llvm/lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1047,6 +1047,11 @@
   unsigned getInstrLatency(const InstrItineraryData *ItinData,
                            const MachineInstr &MI,
                            unsigned *PredCost = nullptr) const override;
+
+  // Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
+  // expanded into a set of cmp/select instructions.
+  static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem,
+                                       bool IsDivergentIdx);
 };
 
 /// \brief Returns true if a reg:subreg pair P has a TRC class
Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -89,6 +89,12 @@
     cl::init(true),
     cl::ReallyHidden);
 
+static cl::opt<bool> UseDivergentRegisterIndexing(
+    "amdgpu-use-divergent-register-indexing",
+    cl::Hidden,
+    cl::desc("Use indirect register addressing for divergent indexes"),
+    cl::init(false));
+
 SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
     : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
       RI(ST), ST(ST) {
@@ -6955,3 +6961,29 @@
 
   return SchedModel.computeInstrLatency(&MI);
 }
+
+bool SIInstrInfo::shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem,
+                                           bool IsDivergentIdx) {
+  if (UseDivergentRegisterIndexing)
+    return false;
+
+  unsigned VecSize = EltSize * NumElem;
+
+  // Sub-dword vectors of size 2 dword or less have better implementation.
+  if (VecSize <= 64 && EltSize < 32)
+    return false;
+
+  // Always expand the rest of sub-dword instructions, otherwise it will be
+  // lowered via memory.
+  if (EltSize < 32)
+    return true;
+
+  // Always do this if var-idx is divergent, otherwise it will become a loop.
+  if (IsDivergentIdx)
+    return true;
+
+  // Large vectors would yield too many compares and v_cndmask_b32 instructions.
+  unsigned NumInsts = NumElem /* Number of compares */ +
+                      ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
+  return NumInsts <= 16;
+}
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -1,301 +1,252 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GPRIDX %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=MOVREL %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GPRIDX %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL %s
 
 define float @dyn_extract_v8f32_const_s_v(i32 %sel) {
-; GPRIDX-LABEL: dyn_extract_v8f32_const_s_v:
-; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GPRIDX-NEXT: s_mov_b32 s11, 0x41000000
-; GPRIDX-NEXT: s_mov_b32 s10, 0x40e00000
-; GPRIDX-NEXT: s_mov_b32 s9, 0x40c00000
-; GPRIDX-NEXT: s_mov_b32 s8, 0x40a00000
-; GPRIDX-NEXT: s_mov_b32 s7, 4.0
-; GPRIDX-NEXT: s_mov_b32 s6, 0x40400000
-; GPRIDX-NEXT: s_mov_b32 s5, 2.0
-; GPRIDX-NEXT: s_mov_b32 s4, 1.0
-; GPRIDX-NEXT: s_mov_b64 s[12:13], exec
-; GPRIDX-NEXT: BB0_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT: v_readfirstlane_b32 s14, v0
-; GPRIDX-NEXT: s_mov_b32 m0, s14
-; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s14, v0
-; GPRIDX-NEXT: s_movrels_b32 s14, s4
-; GPRIDX-NEXT: v_mov_b32_e32 v1, s14
-; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT: s_cbranch_execnz BB0_1
-; GPRIDX-NEXT: ; %bb.2:
-; GPRIDX-NEXT: s_mov_b64 exec, s[12:13]
-; GPRIDX-NEXT: v_mov_b32_e32 v0, v1
-; GPRIDX-NEXT: s_setpc_b64 s[30:31]
-;
-; MOVREL-LABEL: dyn_extract_v8f32_const_s_v:
-; MOVREL: ; %bb.0: ; %entry
-; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MOVREL-NEXT: s_mov_b32 s11, 0x41000000
-; MOVREL-NEXT: s_mov_b32 s10, 0x40e00000
-; MOVREL-NEXT: s_mov_b32 s9, 0x40c00000
-; MOVREL-NEXT: s_mov_b32 s8, 0x40a00000
-; MOVREL-NEXT: s_mov_b32 s7, 4.0
-; MOVREL-NEXT: s_mov_b32 s6, 0x40400000
-; MOVREL-NEXT: s_mov_b32 s5, 2.0
-; MOVREL-NEXT: s_mov_b32 s4, 1.0
-; MOVREL-NEXT: s_mov_b64 s[12:13], exec
-; MOVREL-NEXT: BB0_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT: v_readfirstlane_b32 s14, v0
-; MOVREL-NEXT: s_mov_b32 m0, s14
-; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s14, v0
-; MOVREL-NEXT: s_movrels_b32 s14, s4
-; MOVREL-NEXT: v_mov_b32_e32 v1, s14
-; MOVREL-NEXT: s_and_saveexec_b64 vcc, vcc
-; MOVREL-NEXT: s_xor_b64 exec, exec, vcc
-; MOVREL-NEXT: s_cbranch_execnz BB0_1
-; MOVREL-NEXT: ; %bb.2:
-; MOVREL-NEXT: s_mov_b64 exec, s[12:13]
-; MOVREL-NEXT: v_mov_b32_e32 v0, v1
-; MOVREL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: dyn_extract_v8f32_const_s_v:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc
+; GCN-NEXT: v_mov_b32_e32 v2, 0x40400000
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
+; GCN-NEXT: v_cndmask_b32_e64 v1, v1, 4.0, vcc
+; GCN-NEXT: v_mov_b32_e32 v2, 0x40a00000
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT: v_mov_b32_e32 v2, 0x40c00000
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT: v_mov_b32_e32 v2, 0x40e00000
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT: v_mov_b32_e32 v2, 0x41000000
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 7, v0
+; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
 entry:
   %ext = extractelement <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, i32 %sel
   ret float %ext
 }
 
 define amdgpu_ps float @dyn_extract_v8f32_const_s_s(i32 inreg %sel) {
-; GPRIDX-LABEL: dyn_extract_v8f32_const_s_s:
-; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_mov_b32 s4, 1.0
-; GPRIDX-NEXT: s_mov_b32 m0, s2
-; GPRIDX-NEXT: s_mov_b32 s11, 0x41000000
-; GPRIDX-NEXT: s_mov_b32 s10, 0x40e00000
-; GPRIDX-NEXT: s_mov_b32 s9, 0x40c00000
-; GPRIDX-NEXT: s_mov_b32 s8, 0x40a00000
-; GPRIDX-NEXT: s_mov_b32 s7, 4.0
-; GPRIDX-NEXT: s_mov_b32 s6, 0x40400000
-; GPRIDX-NEXT: s_mov_b32 s5, 2.0
-; GPRIDX-NEXT: s_movrels_b32 s0, s4
-; GPRIDX-NEXT: v_mov_b32_e32 v0, s0
-; GPRIDX-NEXT: ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v8f32_const_s_s:
-; MOVREL: ; %bb.0: ; %entry
-; MOVREL-NEXT: s_mov_b32 s4, 1.0
-; MOVREL-NEXT: s_mov_b32 m0, s2
-; MOVREL-NEXT: s_mov_b32 s11, 0x41000000
-; MOVREL-NEXT: s_mov_b32 s10, 0x40e00000
-; MOVREL-NEXT: s_mov_b32 s9, 0x40c00000
-; MOVREL-NEXT: s_mov_b32 s8, 0x40a00000
-; MOVREL-NEXT: s_mov_b32 s7, 4.0
-; MOVREL-NEXT: s_mov_b32 s6, 0x40400000
-; MOVREL-NEXT: s_mov_b32 s5, 2.0
-; MOVREL-NEXT: s_movrels_b32 s0, s4
-; MOVREL-NEXT: v_mov_b32_e32 v0, s0
-; MOVREL-NEXT: ; return to shader part epilog
+; GCN-LABEL: dyn_extract_v8f32_const_s_s:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_cmp_eq_u32 s2, 1
+; GCN-NEXT: s_cselect_b32 s0, 1, 0
+; GCN-NEXT: s_and_b32 s0, s0, 1
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
+; GCN-NEXT: s_cselect_b32 s0, 2.0, 1.0
+; GCN-NEXT: s_cmp_eq_u32 s2, 2
+; GCN-NEXT: s_cselect_b32 s1, 1, 0
+; GCN-NEXT: s_and_b32 s1, s1, 1
+; GCN-NEXT: s_cmp_lg_u32 s1, 0
+; GCN-NEXT: s_cselect_b32 s0, 0x40400000, s0
+; GCN-NEXT: s_cmp_eq_u32 s2, 3
+; GCN-NEXT: s_cselect_b32 s1, 1, 0
+; GCN-NEXT: s_and_b32 s1, s1, 1
+; GCN-NEXT: s_cmp_lg_u32 s1, 0
+; GCN-NEXT: s_cselect_b32 s0, 4.0, s0
+; GCN-NEXT: s_cmp_eq_u32 s2, 4
+; GCN-NEXT: s_cselect_b32 s1, 1, 0
+; GCN-NEXT: s_and_b32 s1, s1, 1
+; GCN-NEXT: s_cmp_lg_u32 s1, 0
+; GCN-NEXT: s_cselect_b32 s0, 0x40a00000, s0
+; GCN-NEXT: s_cmp_eq_u32 s2, 5
+; GCN-NEXT: s_cselect_b32 s1, 1, 0
+; GCN-NEXT: s_and_b32 s1, s1, 1
+; GCN-NEXT: s_cmp_lg_u32 s1, 0
+; GCN-NEXT: s_cselect_b32 s0, 0x40c00000, s0
+; GCN-NEXT: s_cmp_eq_u32 s2, 6
+; GCN-NEXT: s_cselect_b32 s1, 1, 0
+; GCN-NEXT: s_and_b32 s1, s1, 1
+; GCN-NEXT: s_cmp_lg_u32 s1, 0
+; GCN-NEXT: s_cselect_b32 s0, 0x40e00000, s0
+; GCN-NEXT: s_cmp_eq_u32 s2, 7
+; GCN-NEXT: s_cselect_b32 s1, 1, 0
+; GCN-NEXT: s_and_b32 s1, s1, 1
+; GCN-NEXT: s_cmp_lg_u32 s1, 0
+; GCN-NEXT: s_cselect_b32 s0, 0x41000000, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
 entry:
   %ext = extractelement <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, i32 %sel
   ret float %ext
 }
 
 define amdgpu_ps float @dyn_extract_v8f32_s_v(<8 x float> inreg %vec, i32 %sel) {
-; GPRIDX-LABEL: dyn_extract_v8f32_s_v:
-; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_mov_b32 s0, s2
-; GPRIDX-NEXT: s_mov_b32 s1, s3
-; GPRIDX-NEXT: s_mov_b32 s2, s4
-; GPRIDX-NEXT: s_mov_b32 s3, s5
-; GPRIDX-NEXT: s_mov_b32 s4, s6
-; GPRIDX-NEXT: s_mov_b32 s5, s7
-; GPRIDX-NEXT: s_mov_b32 s6, s8
-; GPRIDX-NEXT: s_mov_b32 s7, s9
-; GPRIDX-NEXT: s_mov_b64 s[8:9], exec
-; GPRIDX-NEXT: BB2_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT: v_readfirstlane_b32 s10, v0
-; GPRIDX-NEXT: s_mov_b32 m0, s10
-; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s10, v0
-; GPRIDX-NEXT: s_movrels_b32 s10, s0
-; GPRIDX-NEXT: v_mov_b32_e32 v1, s10
-; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT: s_cbranch_execnz BB2_1
-; GPRIDX-NEXT: ; %bb.2:
-; GPRIDX-NEXT: s_mov_b64 exec, s[8:9]
-; GPRIDX-NEXT: v_mov_b32_e32 v0, v1
-; GPRIDX-NEXT: ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v8f32_s_v:
-; MOVREL: ; %bb.0: ; %entry
-; MOVREL-NEXT: s_mov_b32 s0, s2
-; MOVREL-NEXT: s_mov_b32 s1, s3
-; MOVREL-NEXT: s_mov_b32 s2, s4
-; MOVREL-NEXT: s_mov_b32 s3, s5
-; MOVREL-NEXT: s_mov_b32 s4, s6
-; MOVREL-NEXT: s_mov_b32 s5, s7
-; MOVREL-NEXT: s_mov_b32 s6, s8
-; MOVREL-NEXT: s_mov_b32 s7, s9
-; MOVREL-NEXT: s_mov_b64 s[8:9], exec
-; MOVREL-NEXT: BB2_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT: v_readfirstlane_b32 s10, v0
-; MOVREL-NEXT: s_mov_b32 m0, s10
-; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s10, v0
-; MOVREL-NEXT: s_movrels_b32 s10, s0
-; MOVREL-NEXT: v_mov_b32_e32 v1, s10
-; MOVREL-NEXT: s_and_saveexec_b64 vcc, vcc
-; MOVREL-NEXT: s_xor_b64 exec, exec, vcc
-; MOVREL-NEXT: s_cbranch_execnz BB2_1
-; MOVREL-NEXT: ; %bb.2:
-; MOVREL-NEXT: s_mov_b64 exec, s[8:9]
-; MOVREL-NEXT: v_mov_b32_e32 v0, v1
-; MOVREL-NEXT: ; return to shader part epilog
+; GCN-LABEL: dyn_extract_v8f32_s_v:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT: v_mov_b32_e32 v2, s5
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT: v_mov_b32_e32 v2, s6
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT: v_mov_b32_e32 v2, s7
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT: v_mov_b32_e32 v2, s8
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT: v_mov_b32_e32 v2, s9
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 7, v0
+; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT: ; return to shader part epilog
 entry:
   %ext = extractelement <8 x float> %vec, i32 %sel
   ret float %ext
 }
 
 define float @dyn_extract_v8f32_v_v(<8 x float> %vec, i32 %sel) {
-; GPRIDX-LABEL: dyn_extract_v8f32_v_v:
-; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GPRIDX-NEXT: s_mov_b64 s[4:5], exec
-; GPRIDX-NEXT: BB3_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT: v_readfirstlane_b32 s6, v8
-; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s6, v8
-; GPRIDX-NEXT: s_set_gpr_idx_on s6, gpr_idx(SRC0)
-; GPRIDX-NEXT: v_mov_b32_e32 v9, v0
-; GPRIDX-NEXT: s_set_gpr_idx_off
-; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT: s_cbranch_execnz BB3_1
-; GPRIDX-NEXT: ; %bb.2:
-; GPRIDX-NEXT: s_mov_b64 exec, s[4:5]
-; GPRIDX-NEXT: v_mov_b32_e32 v0, v9
-; GPRIDX-NEXT: s_setpc_b64 s[30:31]
-;
-; MOVREL-LABEL: dyn_extract_v8f32_v_v:
-; MOVREL: ; %bb.0: ; %entry
-; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MOVREL-NEXT: s_mov_b64 s[4:5], exec
-; MOVREL-NEXT: BB3_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT: v_readfirstlane_b32 s6, v8
-; MOVREL-NEXT: s_mov_b32 m0, s6
-; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s6, v8
-; MOVREL-NEXT: v_movrels_b32_e32 v9, v0
-; MOVREL-NEXT: s_and_saveexec_b64 vcc, vcc
-; MOVREL-NEXT: s_xor_b64 exec, exec, vcc
-; MOVREL-NEXT: s_cbranch_execnz BB3_1
-; MOVREL-NEXT: ; %bb.2:
-; MOVREL-NEXT: s_mov_b64 exec, s[4:5]
-; MOVREL-NEXT: v_mov_b32_e32 v0, v9
-; MOVREL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: dyn_extract_v8f32_v_v:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v8
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v8
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v8
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v8
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 6, v8
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 7, v8
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
 entry:
   %ext = extractelement <8 x float> %vec, i32 %sel
   ret float %ext
 }
 
 define amdgpu_ps float @dyn_extract_v8f32_v_s(<8 x float> %vec, i32 inreg %sel) {
-; GPRIDX-LABEL: dyn_extract_v8f32_v_s:
-; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0)
-; GPRIDX-NEXT: v_mov_b32_e32 v0, v0
-; GPRIDX-NEXT: s_set_gpr_idx_off
-; GPRIDX-NEXT: ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v8f32_v_s:
-; MOVREL: ; %bb.0: ; %entry
-; MOVREL-NEXT: s_mov_b32 m0, s2
-; MOVREL-NEXT: v_movrels_b32_e32 v0, v0
-; MOVREL-NEXT: ; return to shader part epilog
+; GCN-LABEL: dyn_extract_v8f32_v_s:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_cmp_eq_u32 s2, 1
+; GCN-NEXT: s_cselect_b32 s0, 1, 0
+; GCN-NEXT: s_and_b32 s0, 1, s0
+; GCN-NEXT: s_cmp_eq_u32 s2, 2
+; GCN-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
+; GCN-NEXT: s_cselect_b32 s0, 1, 0
+; GCN-NEXT: s_and_b32 s0, 1, s0
+; GCN-NEXT: s_cmp_eq_u32 s2, 3
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
+; GCN-NEXT: s_cselect_b32 s0, 1, 0
+; GCN-NEXT: s_and_b32 s0, 1, s0
+; GCN-NEXT: s_cmp_eq_u32 s2, 4
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
+; GCN-NEXT: s_cselect_b32 s0, 1, 0
+; GCN-NEXT: s_and_b32 s0, 1, s0
+; GCN-NEXT: s_cmp_eq_u32 s2, 5
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GCN-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
+; GCN-NEXT: s_cselect_b32 s0, 1, 0
+; GCN-NEXT: s_and_b32 s0, 1, s0
+; GCN-NEXT: s_cmp_eq_u32 s2, 6
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GCN-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
+; GCN-NEXT: s_cselect_b32 s0, 1, 0
+; GCN-NEXT: s_and_b32 s0, 1, s0
+; GCN-NEXT: s_cmp_eq_u32 s2, 7
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; GCN-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
+; GCN-NEXT: s_cselect_b32 s0, 1, 0
+; GCN-NEXT: s_and_b32 s0, 1, s0
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
+; GCN-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GCN-NEXT: ; return to shader part epilog
 entry:
   %ext = extractelement <8 x float> %vec, i32 %sel
   ret float %ext
 }
 
 define amdgpu_ps float @dyn_extract_v8f32_s_s(<8 x float> inreg %vec, i32 inreg %sel) {
-; GPRIDX-LABEL: dyn_extract_v8f32_s_s:
-; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_mov_b32 s0, s2
-; GPRIDX-NEXT: s_mov_b32 m0, s10
-; GPRIDX-NEXT: s_mov_b32 s1, s3
-; GPRIDX-NEXT: s_mov_b32 s2, s4
-; GPRIDX-NEXT: s_mov_b32 s3, s5
-; GPRIDX-NEXT: s_mov_b32 s4, s6
-; GPRIDX-NEXT: s_mov_b32 s5, s7
-; GPRIDX-NEXT: s_mov_b32 s6, s8
-; GPRIDX-NEXT: s_mov_b32 s7, s9
-; GPRIDX-NEXT: s_movrels_b32 s0, s0
-; GPRIDX-NEXT: v_mov_b32_e32 v0, s0
-; GPRIDX-NEXT: ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v8f32_s_s:
-; MOVREL: ; %bb.0: ; %entry
-; MOVREL-NEXT: s_mov_b32 s0, s2
-; MOVREL-NEXT: s_mov_b32 m0, s10
-; MOVREL-NEXT: s_mov_b32 s1, s3
-; MOVREL-NEXT: s_mov_b32 s2, s4
-; MOVREL-NEXT: s_mov_b32 s3, s5
-; MOVREL-NEXT: s_mov_b32 s4, s6
-; MOVREL-NEXT: s_mov_b32 s5, s7
-; MOVREL-NEXT: s_mov_b32 s6, s8
-; MOVREL-NEXT: s_mov_b32 s7, s9
-; MOVREL-NEXT: s_movrels_b32 s0, s0
-; MOVREL-NEXT: v_mov_b32_e32 v0, s0
-; MOVREL-NEXT: ; return to shader part epilog
+; GCN-LABEL: dyn_extract_v8f32_s_s:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_cmp_eq_u32 s10, 1
+; GCN-NEXT: s_cselect_b32 s0, 1, 0
+; GCN-NEXT: s_and_b32 s0, s0, 1
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
+; GCN-NEXT: s_cselect_b32 s0, s3, s2
+; GCN-NEXT: s_cmp_eq_u32 s10, 2
+; GCN-NEXT: s_cselect_b32 s1, 1, 0
+; GCN-NEXT: s_and_b32 s1, s1, 1
+; GCN-NEXT: s_cmp_lg_u32 s1, 0
+; GCN-NEXT: s_cselect_b32 s0, s4, s0
+; GCN-NEXT: s_cmp_eq_u32 s10, 3
+; GCN-NEXT: s_cselect_b32 s1, 1, 0
+; GCN-NEXT: s_and_b32 s1, s1, 1
+; GCN-NEXT: s_cmp_lg_u32 s1, 0
+; GCN-NEXT: s_cselect_b32 s0, s5, s0
+; GCN-NEXT: s_cmp_eq_u32 s10, 4
+; GCN-NEXT: s_cselect_b32 s1, 1, 0
+; GCN-NEXT: s_and_b32 s1, s1, 1
+; GCN-NEXT: s_cmp_lg_u32 s1, 0
+; GCN-NEXT: s_cselect_b32 s0, s6, s0
+; GCN-NEXT: s_cmp_eq_u32 s10, 5
+; GCN-NEXT: s_cselect_b32 s1, 1, 0
+; GCN-NEXT: s_and_b32 s1, s1, 1
+; GCN-NEXT: s_cmp_lg_u32 s1, 0
+; GCN-NEXT: s_cselect_b32 s0, s7, s0
+; GCN-NEXT: s_cmp_eq_u32 s10, 6
+; GCN-NEXT: s_cselect_b32 s1, 1, 0
+; GCN-NEXT: s_and_b32 s1, s1, 1
+; GCN-NEXT: s_cmp_lg_u32 s1, 0
+; GCN-NEXT: s_cselect_b32 s0, s8, s0
+; GCN-NEXT: s_cmp_eq_u32 s10, 7
+; GCN-NEXT: s_cselect_b32 s1, 1, 0
+; GCN-NEXT: s_and_b32 s1, s1, 1
+; GCN-NEXT: s_cmp_lg_u32 s1, 0
+; GCN-NEXT: s_cselect_b32 s0, s9, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
 entry:
   %ext = extractelement <8 x float> %vec, i32 %sel
   ret float %ext
 }
 
 define i64 @dyn_extract_v8i64_const_s_v(i32 %sel) {
-; GPRIDX-LABEL: dyn_extract_v8i64_const_s_v:
-; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GPRIDX-NEXT: s_mov_b64 s[18:19], 8
-; GPRIDX-NEXT: s_mov_b64 s[16:17], 7
-; GPRIDX-NEXT: s_mov_b64 s[14:15], 6
-; GPRIDX-NEXT: s_mov_b64 s[12:13], 5
-; GPRIDX-NEXT: s_mov_b64 s[10:11], 4
-; GPRIDX-NEXT: s_mov_b64 s[8:9], 3
-; GPRIDX-NEXT: s_mov_b64 s[6:7], 2
-; GPRIDX-NEXT: s_mov_b64 s[4:5], 1
-; GPRIDX-NEXT: s_mov_b64 s[20:21], exec
-; GPRIDX-NEXT: BB6_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT: v_readfirstlane_b32 s22, v0
-; GPRIDX-NEXT: s_lshl_b32 m0, s22, 1
-; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s22, v0
-; GPRIDX-NEXT: s_movrels_b32 s22, s4
-; GPRIDX-NEXT: s_movrels_b32 s23, s5
-; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT: s_cbranch_execnz BB6_1
-; GPRIDX-NEXT: ; %bb.2:
-; GPRIDX-NEXT: s_mov_b64 exec, s[20:21]
-; GPRIDX-NEXT: v_mov_b32_e32 v0, s22
-; GPRIDX-NEXT: v_mov_b32_e32 v1, s23
-; GPRIDX-NEXT: s_setpc_b64 s[30:31]
-;
-; MOVREL-LABEL: dyn_extract_v8i64_const_s_v:
-; MOVREL: ; %bb.0: ; %entry
-; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MOVREL-NEXT: s_mov_b64 s[18:19], 8
-; MOVREL-NEXT: s_mov_b64 s[16:17], 7
-; MOVREL-NEXT: s_mov_b64 s[14:15], 6
-; MOVREL-NEXT: s_mov_b64 s[12:13], 5
-; MOVREL-NEXT: s_mov_b64 s[10:11], 4
-; MOVREL-NEXT: s_mov_b64 s[8:9], 3
-; MOVREL-NEXT: s_mov_b64 s[6:7], 2
-; MOVREL-NEXT: s_mov_b64 s[4:5], 1
-; MOVREL-NEXT: s_mov_b64 s[20:21], exec
-; MOVREL-NEXT: BB6_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT: v_readfirstlane_b32 s22, v0
-; MOVREL-NEXT: s_lshl_b32 m0, s22, 1
-; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s22, v0
-; MOVREL-NEXT: s_movrels_b32 s22, s4
-; MOVREL-NEXT: s_movrels_b32 s23, s5
-; MOVREL-NEXT: s_and_saveexec_b64 vcc, vcc
-; MOVREL-NEXT: s_xor_b64 exec, exec, vcc
-; MOVREL-NEXT: s_cbranch_execnz BB6_1
-; MOVREL-NEXT: ; %bb.2:
-; MOVREL-NEXT: s_mov_b64 exec, s[20:21]
-; MOVREL-NEXT: v_mov_b32_e32 v0, s22
-; MOVREL-NEXT: v_mov_b32_e32 v1, s23
-; MOVREL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: dyn_extract_v8i64_const_s_v:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[18:19], 8
+; GCN-NEXT: s_mov_b64 s[16:17], 7
+; GCN-NEXT: s_mov_b64 s[14:15], 6
+; GCN-NEXT: s_mov_b64 s[12:13], 5
+; GCN-NEXT: s_mov_b64 s[10:11], 4
+; GCN-NEXT: s_mov_b64 s[8:9], 3
+; GCN-NEXT: s_mov_b64 s[6:7], 2
+; GCN-NEXT: s_mov_b64 s[4:5], 1
+; GCN-NEXT: s_mov_b64 s[20:21], exec
+; GCN-NEXT: BB6_1: ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT: v_readfirstlane_b32 s22, v0
+; GCN-NEXT: s_lshl_b32 m0, s22, 1
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s22, v0
+; GCN-NEXT: s_movrels_b32 s22, s4
+; GCN-NEXT: s_movrels_b32 s23, s5
+; GCN-NEXT: s_and_saveexec_b64 vcc, vcc
+; GCN-NEXT: s_xor_b64 exec, exec, vcc
+; GCN-NEXT: s_cbranch_execnz BB6_1
+; GCN-NEXT: ; %bb.2:
+; GCN-NEXT: s_mov_b64 exec, s[20:21]
+; GCN-NEXT: v_mov_b32_e32 v0, s22
+; GCN-NEXT: v_mov_b32_e32 v1, s23
+; GCN-NEXT: s_setpc_b64 s[30:31]
 entry:
   %ext = extractelement <8 x i64> <i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8>, i32 %sel
   ret i64 %ext
 }
@@ -543,35 +494,46 @@
 }
 
 define amdgpu_ps float @dyn_extract_v8f32_s_s_offset3(<8 x float> inreg %vec, i32 inreg %sel) {
-; GPRIDX-LABEL: dyn_extract_v8f32_s_s_offset3:
-; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_mov_b32 s0, s2
-; GPRIDX-NEXT: s_mov_b32 s1, s3
-; GPRIDX-NEXT: s_mov_b32 s3, s5
-; GPRIDX-NEXT: s_mov_b32 m0, s10
-; GPRIDX-NEXT: s_mov_b32 s2, s4
-; GPRIDX-NEXT: s_mov_b32 s4, s6
-; GPRIDX-NEXT: s_mov_b32 s5, s7
-; GPRIDX-NEXT: s_mov_b32 s6, s8
-; GPRIDX-NEXT: s_mov_b32 s7, s9
-; GPRIDX-NEXT: s_movrels_b32 s0, s3
-; GPRIDX-NEXT: v_mov_b32_e32 v0, s0
-; GPRIDX-NEXT: ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v8f32_s_s_offset3:
-; MOVREL: ; %bb.0: ; %entry
-; MOVREL-NEXT: s_mov_b32 s0, s2
-; MOVREL-NEXT: s_mov_b32 s1, s3
-; MOVREL-NEXT: s_mov_b32 s3, s5
-; MOVREL-NEXT: s_mov_b32 m0, s10
-; MOVREL-NEXT: s_mov_b32 s2, s4
-; MOVREL-NEXT: s_mov_b32 s4, s6
-; MOVREL-NEXT: s_mov_b32 s5, s7
-; MOVREL-NEXT: s_mov_b32 s6, s8
-; MOVREL-NEXT: s_mov_b32 s7, s9
-; MOVREL-NEXT: s_movrels_b32 s0, s3
-; MOVREL-NEXT: v_mov_b32_e32 v0, s0
-; MOVREL-NEXT: ; return to shader part epilog
+; GCN-LABEL: dyn_extract_v8f32_s_s_offset3:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_add_i32 s10, s10, 3
+; GCN-NEXT: s_cmp_eq_u32 s10, 1
+; GCN-NEXT: s_cselect_b32 s0, 1, 0
+; GCN-NEXT: s_and_b32 s0, s0, 1
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
+; GCN-NEXT: s_cselect_b32 s0, s3, s2
+; GCN-NEXT: s_cmp_eq_u32 s10, 2
+; GCN-NEXT: s_cselect_b32 s1, 1, 0
+; GCN-NEXT: s_and_b32 s1, s1, 1
+; GCN-NEXT: s_cmp_lg_u32 s1, 0
+; GCN-NEXT: s_cselect_b32 s0, s4, s0
+; GCN-NEXT: s_cmp_eq_u32 s10, 3
+; GCN-NEXT: s_cselect_b32 s1, 1, 0
+; GCN-NEXT: s_and_b32 s1, s1, 1
+; GCN-NEXT: s_cmp_lg_u32 s1, 0
+; GCN-NEXT: s_cselect_b32 s0, s5, s0
+; GCN-NEXT: s_cmp_eq_u32 s10, 4
+; GCN-NEXT: s_cselect_b32 s1, 1, 0
+; GCN-NEXT: s_and_b32 s1, s1, 1
+; GCN-NEXT: s_cmp_lg_u32 s1, 0
+; GCN-NEXT: s_cselect_b32 s0, s6, s0
+; GCN-NEXT: s_cmp_eq_u32 s10, 5
+; GCN-NEXT: s_cselect_b32 s1, 1, 0
+; GCN-NEXT: s_and_b32 s1, s1, 1
+; GCN-NEXT: s_cmp_lg_u32 s1, 0
+; GCN-NEXT: s_cselect_b32 s0, s7, s0
+; GCN-NEXT: s_cmp_eq_u32 s10, 6
+; GCN-NEXT: s_cselect_b32 s1, 1, 0
+; GCN-NEXT: s_and_b32 s1, s1, 1
+; GCN-NEXT: s_cmp_lg_u32 s1, 0
+; GCN-NEXT: s_cselect_b32 s0, s8, s0
+; GCN-NEXT: s_cmp_eq_u32 s10, 7
+; GCN-NEXT: s_cselect_b32 s1, 1, 0
+; GCN-NEXT: s_and_b32 s1, s1, 1
+; GCN-NEXT: s_cmp_lg_u32 s1, 0
+; GCN-NEXT: s_cselect_b32 s0, s9, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
 entry:
   %add = add i32 %sel, 3
   %ext = extractelement <8 x float> %vec, i32 %add
   ret float %ext
 }
@@ -582,36 +544,41 @@
 ; GPRIDX-LABEL: dyn_extract_v8f32_v_v_offset3:
 ; GPRIDX: ; %bb.0: ; %entry
 ; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GPRIDX-NEXT: s_mov_b64 s[4:5], exec
-; GPRIDX-NEXT: BB13_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT: v_readfirstlane_b32 s6, v8
-; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s6, v8
-; GPRIDX-NEXT: s_set_gpr_idx_on s6, gpr_idx(SRC0)
-; GPRIDX-NEXT: v_mov_b32_e32 v9, v3
-; GPRIDX-NEXT: s_set_gpr_idx_off
-; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT: s_cbranch_execnz BB13_1
-; GPRIDX-NEXT: ; %bb.2:
-; GPRIDX-NEXT: s_mov_b64 exec, s[4:5]
-; GPRIDX-NEXT: v_mov_b32_e32 v0, v9
+; GPRIDX-NEXT: v_add_u32_e32 v8, 3, v8
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
+; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v8
+; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v8
+; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v8
+; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v8
+; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v8
+; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 7, v8
+; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
 ; GPRIDX-NEXT: s_setpc_b64 s[30:31]
 ;
 ; MOVREL-LABEL: dyn_extract_v8f32_v_v_offset3:
 ; MOVREL: ; %bb.0: ; %entry
 ; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MOVREL-NEXT: s_mov_b64 s[4:5], exec
-; MOVREL-NEXT: BB13_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT: v_readfirstlane_b32 s6, v8
-; MOVREL-NEXT: s_mov_b32 m0, s6
-; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s6, v8
-; MOVREL-NEXT: v_movrels_b32_e32 v9, v3
-; MOVREL-NEXT: s_and_saveexec_b64 vcc, vcc
-; MOVREL-NEXT: s_xor_b64 exec, exec, vcc
-; MOVREL-NEXT: s_cbranch_execnz BB13_1
-; MOVREL-NEXT: ; %bb.2:
-; MOVREL-NEXT: s_mov_b64 exec, s[4:5]
-; MOVREL-NEXT: v_mov_b32_e32 v0, v9
+; MOVREL-NEXT: v_add_u32_e32 v8, vcc, 3, v8
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
+; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 2, v8
+; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 3, v8
+; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 4, v8
+; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 5, v8
+; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 6, v8
+; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 7, v8
+; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
 ; MOVREL-NEXT: s_setpc_b64 s[30:31]
 entry:
   %add = add i32 %sel, 3
@@ -620,49 +587,27 @@
 }
 
 define amdgpu_ps double @dyn_extract_v8f64_s_s_offset1(<8 x double> inreg %vec, i32 inreg %sel) {
-; GPRIDX-LABEL: dyn_extract_v8f64_s_s_offset1:
-; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_mov_b32 s0, s2
-; GPRIDX-NEXT: s_mov_b32 s1, s3
-; GPRIDX-NEXT: s_mov_b32 s2, s4
-; GPRIDX-NEXT: s_mov_b32 s3, s5
-; GPRIDX-NEXT: s_mov_b32 m0, s18
-; GPRIDX-NEXT: s_mov_b32 s4, s6
-; GPRIDX-NEXT: s_mov_b32 s5, s7
-; GPRIDX-NEXT: s_mov_b32 s6, s8
-; GPRIDX-NEXT: s_mov_b32 s7, s9
-; GPRIDX-NEXT: s_mov_b32 s8, s10
-; GPRIDX-NEXT: s_mov_b32 s9, s11
-; GPRIDX-NEXT: s_mov_b32 s10, s12
-; GPRIDX-NEXT: s_mov_b32 s11, s13
-; GPRIDX-NEXT: s_mov_b32 s12, s14
-; GPRIDX-NEXT: s_mov_b32 s13, s15
-; GPRIDX-NEXT: s_mov_b32 s14, s16
-; GPRIDX-NEXT: s_mov_b32 s15, s17
-; GPRIDX-NEXT: s_movrels_b64 s[0:1], s[2:3]
-; GPRIDX-NEXT: ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v8f64_s_s_offset1:
-; MOVREL: ; %bb.0: ; %entry
-; MOVREL-NEXT: s_mov_b32 s0, s2
-; MOVREL-NEXT: s_mov_b32 s1, s3
-; MOVREL-NEXT: s_mov_b32 s2, s4
-; MOVREL-NEXT: s_mov_b32 s3, s5
-; MOVREL-NEXT: s_mov_b32 m0, s18
-; MOVREL-NEXT: s_mov_b32 s4, s6
-; MOVREL-NEXT: s_mov_b32 s5, s7
-; MOVREL-NEXT: s_mov_b32 s6, s8
-; MOVREL-NEXT: s_mov_b32 s7, s9
-; MOVREL-NEXT: s_mov_b32 s8, s10
-; MOVREL-NEXT: s_mov_b32 s9, s11
-; MOVREL-NEXT: s_mov_b32 s10, s12
-; MOVREL-NEXT: s_mov_b32 s11, s13
-; MOVREL-NEXT: s_mov_b32 s12, s14
-; MOVREL-NEXT: s_mov_b32 s13, s15
-; MOVREL-NEXT: s_mov_b32 s14, s16
-; MOVREL-NEXT: s_mov_b32 s15, s17
-; MOVREL-NEXT: s_movrels_b64 s[0:1], s[2:3]
-; MOVREL-NEXT: ; return to shader part epilog
+; GCN-LABEL: dyn_extract_v8f64_s_s_offset1:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_mov_b32 s0, s2
+; GCN-NEXT: s_mov_b32 s1, s3
+; GCN-NEXT: s_mov_b32 s2, s4
+; GCN-NEXT: s_mov_b32 s3, s5
+; GCN-NEXT: s_mov_b32 m0, s18
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s7
+; GCN-NEXT: s_mov_b32 s6, s8
+; GCN-NEXT: s_mov_b32 s7, s9
+; GCN-NEXT: s_mov_b32 s8, s10
+; GCN-NEXT: s_mov_b32 s9, s11
+; GCN-NEXT: s_mov_b32 s10, s12
+; GCN-NEXT: s_mov_b32 s11, s13
+; GCN-NEXT: s_mov_b32 s12, s14
+; GCN-NEXT: s_mov_b32 s13, s15
+; GCN-NEXT: s_mov_b32 s14, s16
+; GCN-NEXT: s_mov_b32 s15, s17
+; GCN-NEXT: s_movrels_b64 s[0:1], s[2:3]
+; GCN-NEXT: ; return to shader part epilog
 entry:
   %add = add i32 %sel, 1
   %ext = extractelement <8 x double> %vec, i32 %add
   ret double %ext
 }
@@ -670,7 +615,147 @@
 define amdgpu_ps double @dyn_extract_v8f64_s_s_offset2(<8 x double> inreg %vec, i32 inreg %sel) {
-; GPRIDX-LABEL: dyn_extract_v8f64_s_s_offset2:
+; GCN-LABEL: dyn_extract_v8f64_s_s_offset2:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_mov_b32 s0, s2
+; GCN-NEXT: s_mov_b32 s1, s3
+; GCN-NEXT: s_mov_b32 s2, s4
+; GCN-NEXT: s_mov_b32 s3, s5
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s7
+; GCN-NEXT: s_mov_b32 m0, s18
+; GCN-NEXT: s_mov_b32 s6, s8
+; GCN-NEXT: s_mov_b32 s7, s9
+; GCN-NEXT: s_mov_b32 s8, s10
+; GCN-NEXT: s_mov_b32 s9, s11
+; GCN-NEXT: s_mov_b32 s10, s12
+; GCN-NEXT: s_mov_b32 s11, s13
+; GCN-NEXT: s_mov_b32 s12, s14
+; GCN-NEXT: s_mov_b32 s13, s15
+; GCN-NEXT: s_mov_b32 s14, s16
+; GCN-NEXT: s_mov_b32 s15, s17
+; GCN-NEXT: s_movrels_b64 s[0:1], s[4:5]
+; GCN-NEXT: ; return to shader part epilog
+entry:
+  %add = add i32 %sel, 2
+  %ext = extractelement <8 x double> %vec, i32 %add
+  ret double %ext
+}
+
+define amdgpu_ps double @dyn_extract_v8f64_s_s_offset3(<8 x double> inreg %vec, i32 inreg %sel) {
+; GCN-LABEL: dyn_extract_v8f64_s_s_offset3:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_mov_b32 s0, s2
+; GCN-NEXT: s_mov_b32 s1, s3
+; GCN-NEXT: s_mov_b32 s2, s4
+; GCN-NEXT: s_mov_b32 s3, s5
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s7
+; GCN-NEXT: s_mov_b32 s6, s8
+; GCN-NEXT: s_mov_b32 s7, s9
+; GCN-NEXT: s_mov_b32 m0, s18
+; GCN-NEXT: s_mov_b32 s8, s10
+; GCN-NEXT: s_mov_b32 s9, s11
+; GCN-NEXT: s_mov_b32 s10, s12
+; GCN-NEXT: s_mov_b32 s11, s13
+; GCN-NEXT: s_mov_b32 s12, s14
+; GCN-NEXT: s_mov_b32 s13, s15
+; GCN-NEXT: s_mov_b32 s14, s16
+; GCN-NEXT: s_mov_b32 s15, s17
+; GCN-NEXT: s_movrels_b64 s[0:1], s[6:7]
+; GCN-NEXT: ; return to shader part epilog
+entry:
+  %add = add i32 %sel, 3
+  %ext = extractelement <8 x double> %vec, i32 %add
+  ret double %ext
+}
+
+define amdgpu_ps double @dyn_extract_v8f64_s_s_offset4(<8 x double> inreg %vec, i32 inreg %sel) {
+; GCN-LABEL: dyn_extract_v8f64_s_s_offset4:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_mov_b32 s0, s2
+; GCN-NEXT: s_mov_b32 s1, s3
+; GCN-NEXT: s_mov_b32 s2, s4
+; GCN-NEXT: s_mov_b32 s3, s5
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s7
+; GCN-NEXT: s_mov_b32 s6, s8
+; GCN-NEXT: s_mov_b32 s7, s9
+; GCN-NEXT: s_mov_b32 s8, s10
+; GCN-NEXT: s_mov_b32 s9, s11
+; GCN-NEXT: s_mov_b32 m0, s18
+; GCN-NEXT: s_mov_b32 s10, s12
+; GCN-NEXT: s_mov_b32 s11, s13
+; GCN-NEXT: s_mov_b32 s12, s14
+; GCN-NEXT: s_mov_b32 s13, s15
+; GCN-NEXT: s_mov_b32 s14, s16
+; GCN-NEXT: s_mov_b32 s15, s17
+; GCN-NEXT: s_movrels_b64 s[0:1], s[8:9]
+; GCN-NEXT: ; return to shader part epilog
+entry:
+  %add = add i32 %sel, 4
+  %ext = extractelement <8 x double> %vec, i32 %add
+  ret double %ext
+}
+
+define amdgpu_ps double @dyn_extract_v8f64_s_s_offset5(<8 x double> inreg %vec, i32 inreg %sel) {
+; GCN-LABEL: dyn_extract_v8f64_s_s_offset5:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_mov_b32 s0, s2
+; GCN-NEXT: s_mov_b32 s1, s3
+; GCN-NEXT: s_mov_b32 s2, s4
+; GCN-NEXT: s_mov_b32 s3, s5
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s7
+; GCN-NEXT: s_mov_b32 s6, s8
+; GCN-NEXT: s_mov_b32 s7, s9
+; GCN-NEXT: s_mov_b32 s8, s10
+; GCN-NEXT: s_mov_b32 s9, s11
+; GCN-NEXT: s_mov_b32 s10, s12
+; GCN-NEXT: s_mov_b32 s11, s13
+; GCN-NEXT: s_mov_b32 m0, s18
+; GCN-NEXT: s_mov_b32 s12, s14
+; GCN-NEXT: s_mov_b32 s13, s15
+; GCN-NEXT: s_mov_b32 s14, s16
+; GCN-NEXT: s_mov_b32 s15, s17
+; GCN-NEXT: s_movrels_b64 s[0:1], s[10:11]
+; GCN-NEXT: ; return to shader part epilog
+entry:
+  %add = add i32 %sel, 5
+  %ext = extractelement <8 x double> %vec, i32 %add
+  ret double %ext
+}
+
+define amdgpu_ps double @dyn_extract_v8f64_s_s_offset6(<8 x double> inreg %vec, i32 inreg %sel) {
+; GCN-LABEL: dyn_extract_v8f64_s_s_offset6:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_mov_b32 s0, s2
+; GCN-NEXT: s_mov_b32 s1, s3
+; GCN-NEXT: s_mov_b32 s2, s4
+; GCN-NEXT: s_mov_b32 s3, s5
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s7
+; GCN-NEXT: s_mov_b32 s6, s8
+; GCN-NEXT: s_mov_b32 s7, s9
+; GCN-NEXT: s_mov_b32 s8, s10
+; GCN-NEXT: s_mov_b32 s9, s11
+; GCN-NEXT: s_mov_b32 s10, s12
+; GCN-NEXT: s_mov_b32 s11, s13
+; GCN-NEXT: s_mov_b32 s12, s14
+; GCN-NEXT: s_mov_b32 s13, s15
+; GCN-NEXT: s_mov_b32 m0, s18
+; GCN-NEXT: s_mov_b32 s14, s16
+; GCN-NEXT: s_mov_b32 s15, s17
+; GCN-NEXT: s_movrels_b64 s[0:1], s[12:13]
+; GCN-NEXT: ; return to shader part epilog
+entry:
+  %add = add i32 %sel, 6
+  %ext = extractelement <8 x double> %vec, i32 %add
+  ret double %ext
+}
+
+define amdgpu_ps double @dyn_extract_v8f64_s_s_offset7(<8 x double> inreg %vec, i32 inreg %sel) {
 ; GPRIDX-LABEL: dyn_extract_v8f64_s_s_offset7:
 ; GPRIDX: ; %bb.0: ; %entry
 ; GPRIDX-NEXT: s_mov_b32 s0, s2
 ; GPRIDX-NEXT: s_mov_b32 s1, s3
@@ -678,7 +763,6 @@
 ; GPRIDX-NEXT: s_mov_b32 s3, s5
 ; GPRIDX-NEXT: s_mov_b32 s4, s6
 ; GPRIDX-NEXT: s_mov_b32 s5, s7
-; GPRIDX-NEXT: s_mov_b32 m0, s18
 ; GPRIDX-NEXT: s_mov_b32 s6, s8
 ; GPRIDX-NEXT: s_mov_b32 s7, s9
 ; GPRIDX-NEXT: s_mov_b32 s8, s10
@@ -689,10 +773,12 @@
 ; GPRIDX-NEXT: s_mov_b32 s13, s15
 ; GPRIDX-NEXT: s_mov_b32 s14, s16
 ; GPRIDX-NEXT: s_mov_b32 s15, s17
-; GPRIDX-NEXT: s_movrels_b64 s[0:1], s[4:5]
+; GPRIDX-NEXT: s_mov_b32 m0, s18
+; GPRIDX-NEXT: s_nop 0
+; GPRIDX-NEXT: s_movrels_b64 s[0:1], s[14:15]
 ; GPRIDX-NEXT: ; return to shader part epilog
 ;
-; MOVREL-LABEL: dyn_extract_v8f64_s_s_offset2:
+; MOVREL-LABEL: dyn_extract_v8f64_s_s_offset7:
 ; MOVREL: ; %bb.0: ; %entry
 ; MOVREL-NEXT: s_mov_b32 s0, s2
 ; MOVREL-NEXT: s_mov_b32 s1, s3
@@ -700,7 +786,6 @@
 ; MOVREL-NEXT: s_mov_b32 s3, s5
 ; MOVREL-NEXT: s_mov_b32 s4, s6
 ; MOVREL-NEXT: s_mov_b32 s5, s7
-; MOVREL-NEXT: s_mov_b32 m0, s18
 ; MOVREL-NEXT: s_mov_b32 s6, s8
 ; MOVREL-NEXT: s_mov_b32 s7, s9
 ; MOVREL-NEXT: s_mov_b32 s8, s10
@@ -711,309 +796,37 @@
 ; MOVREL-NEXT: s_mov_b32 s13, s15
 ; MOVREL-NEXT: s_mov_b32 s14, s16
 ; MOVREL-NEXT: s_mov_b32 s15, s17
-; MOVREL-NEXT: s_movrels_b64 s[0:1], s[4:5]
+; MOVREL-NEXT: s_mov_b32 m0, s18
+; MOVREL-NEXT: s_movrels_b64 s[0:1], s[14:15]
 ; MOVREL-NEXT: ; return to shader part epilog
 entry:
-  %add = add i32 %sel, 2
-  %ext = extractelement <8 x double> %vec, i32 %add
-  ret double %ext
-}
-
-define amdgpu_ps double @dyn_extract_v8f64_s_s_offset3(<8 x double> inreg %vec, i32 inreg %sel) {
-; GPRIDX-LABEL: dyn_extract_v8f64_s_s_offset3:
-; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_mov_b32 s0, s2
-; GPRIDX-NEXT: s_mov_b32 s1, s3
-; GPRIDX-NEXT: s_mov_b32 s2, s4
-; GPRIDX-NEXT: s_mov_b32 s3, s5
-; GPRIDX-NEXT: s_mov_b32 s4, s6
-; GPRIDX-NEXT: s_mov_b32 s5, s7
-; GPRIDX-NEXT: s_mov_b32 s6, s8
-; GPRIDX-NEXT: s_mov_b32 s7, s9
-; GPRIDX-NEXT: s_mov_b32 m0, s18
-; GPRIDX-NEXT: s_mov_b32 s8, s10
-; GPRIDX-NEXT: s_mov_b32 s9, s11
-; GPRIDX-NEXT: s_mov_b32 s10, s12
-; GPRIDX-NEXT: s_mov_b32 s11, s13
-; GPRIDX-NEXT: s_mov_b32 s12, s14
-; GPRIDX-NEXT: s_mov_b32 s13, s15
-; GPRIDX-NEXT: s_mov_b32 s14, s16
-; GPRIDX-NEXT: s_mov_b32 s15, s17
-; GPRIDX-NEXT: s_movrels_b64 s[0:1], s[6:7]
-; GPRIDX-NEXT: ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v8f64_s_s_offset3:
-; MOVREL: ; %bb.0: ; %entry
-; MOVREL-NEXT: s_mov_b32 s0, s2
-; MOVREL-NEXT: s_mov_b32 s1, s3
-; MOVREL-NEXT: s_mov_b32 s2, s4
-; MOVREL-NEXT: s_mov_b32 s3, s5
-; MOVREL-NEXT: s_mov_b32 s4, s6
-; MOVREL-NEXT: s_mov_b32 s5, s7
-; MOVREL-NEXT: s_mov_b32 s6, s8
-; MOVREL-NEXT: s_mov_b32 s7, s9
-; MOVREL-NEXT: s_mov_b32 m0, s18
-; MOVREL-NEXT: s_mov_b32 s8, s10
-; MOVREL-NEXT: s_mov_b32 s9, s11
-; MOVREL-NEXT: s_mov_b32 s10, s12
-; MOVREL-NEXT: s_mov_b32 s11, s13
-; MOVREL-NEXT: s_mov_b32 s12, s14
-; MOVREL-NEXT: s_mov_b32 s13, s15
-; MOVREL-NEXT: s_mov_b32 s14, s16
-; MOVREL-NEXT: s_mov_b32 s15, s17
-; MOVREL-NEXT: s_movrels_b64 s[0:1], s[6:7]
-; MOVREL-NEXT: ; return to shader part epilog
-entry:
-  %add = add i32 %sel, 3
-  %ext = extractelement <8 x double> %vec, i32 %add
-  ret double %ext
-}
-
-define amdgpu_ps double @dyn_extract_v8f64_s_s_offset4(<8 x double> inreg %vec, i32 inreg %sel) {
-; GPRIDX-LABEL: dyn_extract_v8f64_s_s_offset4:
-; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_mov_b32 s0, s2
-; GPRIDX-NEXT: s_mov_b32 s1, s3
-; GPRIDX-NEXT: s_mov_b32 s2, s4
-; GPRIDX-NEXT: s_mov_b32 s3, s5
-; GPRIDX-NEXT: s_mov_b32 s4, s6
-; GPRIDX-NEXT: s_mov_b32 s5, s7
-; GPRIDX-NEXT: s_mov_b32 s6, s8
-; GPRIDX-NEXT: s_mov_b32 s7, s9
-; GPRIDX-NEXT: s_mov_b32 s8, s10
-; GPRIDX-NEXT: s_mov_b32 s9, s11
-; GPRIDX-NEXT: s_mov_b32 m0, s18
-; GPRIDX-NEXT: s_mov_b32 s10, s12
-; GPRIDX-NEXT: s_mov_b32 s11, s13
-; GPRIDX-NEXT: s_mov_b32 s12, s14
-; GPRIDX-NEXT: s_mov_b32 s13, s15
-; GPRIDX-NEXT: s_mov_b32 s14, s16
-; GPRIDX-NEXT: s_mov_b32 s15, s17
-; GPRIDX-NEXT: s_movrels_b64 s[0:1], s[8:9]
-; GPRIDX-NEXT: ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v8f64_s_s_offset4:
-; MOVREL: ; %bb.0: ; %entry
-; MOVREL-NEXT: s_mov_b32 s0, s2
-; MOVREL-NEXT: s_mov_b32 s1, s3
-; MOVREL-NEXT: s_mov_b32 s2, s4
-; MOVREL-NEXT: s_mov_b32 s3, s5
-; MOVREL-NEXT: s_mov_b32 s4, s6
-; MOVREL-NEXT: s_mov_b32 s5, s7
-; MOVREL-NEXT: s_mov_b32 s6, s8
-; MOVREL-NEXT: s_mov_b32 s7, s9
-; MOVREL-NEXT: s_mov_b32 s8, s10
-; MOVREL-NEXT: s_mov_b32 s9, s11
-; MOVREL-NEXT: s_mov_b32 m0, s18
-; MOVREL-NEXT: s_mov_b32 s10, s12
-; MOVREL-NEXT: s_mov_b32 s11, s13
-; MOVREL-NEXT: s_mov_b32 s12, s14
-; MOVREL-NEXT: s_mov_b32 s13, s15
-; MOVREL-NEXT: s_mov_b32 s14, s16
-; MOVREL-NEXT: s_mov_b32 s15, s17
-; MOVREL-NEXT: s_movrels_b64 s[0:1], s[8:9]
-; MOVREL-NEXT: ; return to shader part epilog
-entry:
-  %add = add i32 %sel, 4
-  %ext = extractelement <8 x double> %vec, i32 %add
-  ret double %ext
-}
-
-define amdgpu_ps double @dyn_extract_v8f64_s_s_offset5(<8 x double> inreg %vec, i32 inreg %sel) {
-; GPRIDX-LABEL: dyn_extract_v8f64_s_s_offset5:
-; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_mov_b32 s0, s2
-; GPRIDX-NEXT: s_mov_b32 s1, s3
-; GPRIDX-NEXT: s_mov_b32 s2, s4
-; GPRIDX-NEXT: s_mov_b32 s3, s5
-; GPRIDX-NEXT: s_mov_b32 s4, s6
-; GPRIDX-NEXT: s_mov_b32 s5, s7
-; GPRIDX-NEXT: s_mov_b32 s6, s8
-; GPRIDX-NEXT: s_mov_b32 s7, s9
-; GPRIDX-NEXT: s_mov_b32 s8, s10
-; GPRIDX-NEXT: s_mov_b32 s9, s11
-; GPRIDX-NEXT: s_mov_b32 s10, s12
-; GPRIDX-NEXT: s_mov_b32 s11, s13
-; GPRIDX-NEXT: s_mov_b32 m0, s18
-; GPRIDX-NEXT: s_mov_b32 s12, s14
-; GPRIDX-NEXT: s_mov_b32 s13, s15
-; GPRIDX-NEXT: s_mov_b32 s14, s16
-; GPRIDX-NEXT: s_mov_b32 s15, s17
-; GPRIDX-NEXT: s_movrels_b64 s[0:1], s[10:11]
-; GPRIDX-NEXT: ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v8f64_s_s_offset5:
-; MOVREL: ; %bb.0: ; %entry
-; MOVREL-NEXT: s_mov_b32 s0, s2
-; MOVREL-NEXT: s_mov_b32 s1, s3
-; MOVREL-NEXT: s_mov_b32 s2, s4
-; MOVREL-NEXT: s_mov_b32 s3, s5
-; MOVREL-NEXT: s_mov_b32 s4, s6
-; MOVREL-NEXT: s_mov_b32 s5, s7
-; MOVREL-NEXT: s_mov_b32 s6, s8
-; MOVREL-NEXT: s_mov_b32 s7, s9
-; MOVREL-NEXT: s_mov_b32 s8, s10
-; MOVREL-NEXT: s_mov_b32 s9, s11
-; MOVREL-NEXT: s_mov_b32 s10, s12
-; MOVREL-NEXT: s_mov_b32 s11, s13
-; MOVREL-NEXT: s_mov_b32 m0, s18
-; MOVREL-NEXT: s_mov_b32 s12, s14
-; MOVREL-NEXT: s_mov_b32 s13, s15
-; MOVREL-NEXT: s_mov_b32 s14, s16
-; MOVREL-NEXT: s_mov_b32 s15, s17
-; MOVREL-NEXT: s_movrels_b64 s[0:1], s[10:11]
-; MOVREL-NEXT: ; return to shader part epilog
-entry:
-  %add = add i32 %sel, 5
-  %ext = extractelement <8 x double> %vec, i32 %add
-  ret double %ext
-}
-
-define amdgpu_ps double @dyn_extract_v8f64_s_s_offset6(<8 x double> inreg %vec, i32 inreg %sel) {
-; GPRIDX-LABEL: dyn_extract_v8f64_s_s_offset6:
-; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_mov_b32 s0, s2
-; GPRIDX-NEXT: s_mov_b32 s1, s3
-; GPRIDX-NEXT: s_mov_b32 s2, s4
-; GPRIDX-NEXT: s_mov_b32 s3, s5
-; GPRIDX-NEXT: s_mov_b32 s4, s6
-; GPRIDX-NEXT: s_mov_b32 s5, s7
-; GPRIDX-NEXT: s_mov_b32 s6, s8
-; GPRIDX-NEXT: s_mov_b32 s7, s9
-; GPRIDX-NEXT: s_mov_b32 s8, s10
-; GPRIDX-NEXT: s_mov_b32 s9, s11
-; GPRIDX-NEXT: s_mov_b32 s10, s12
-; GPRIDX-NEXT: s_mov_b32 s11, s13
-; GPRIDX-NEXT: s_mov_b32 s12, s14
-; GPRIDX-NEXT: s_mov_b32 s13, s15
-; GPRIDX-NEXT: s_mov_b32 m0, s18
-; GPRIDX-NEXT: s_mov_b32 s14, s16
-; GPRIDX-NEXT: s_mov_b32 s15, s17
-; GPRIDX-NEXT: s_movrels_b64 s[0:1], s[12:13]
-; GPRIDX-NEXT: ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v8f64_s_s_offset6:
-; MOVREL: ; %bb.0: ; %entry
-; MOVREL-NEXT: s_mov_b32 s0, s2
-; MOVREL-NEXT: s_mov_b32 s1, s3
-; MOVREL-NEXT: s_mov_b32 s2, s4
-; MOVREL-NEXT: s_mov_b32 s3, s5
-; MOVREL-NEXT: s_mov_b32 s4, s6
-; MOVREL-NEXT: s_mov_b32 s5, s7
-; MOVREL-NEXT: s_mov_b32 s6, s8
-; MOVREL-NEXT: s_mov_b32 s7, s9
-; MOVREL-NEXT: s_mov_b32 s8, s10
-; MOVREL-NEXT: s_mov_b32 s9, s11
-; MOVREL-NEXT: s_mov_b32 s10, s12
-; MOVREL-NEXT: s_mov_b32 s11, s13
-; MOVREL-NEXT: s_mov_b32 s12, s14
-; MOVREL-NEXT: s_mov_b32 s13, s15
-; MOVREL-NEXT: s_mov_b32 m0, s18
-; MOVREL-NEXT: s_mov_b32 s14, s16
-; MOVREL-NEXT: s_mov_b32 s15, s17
-; MOVREL-NEXT: s_movrels_b64 s[0:1], s[12:13]
-; MOVREL-NEXT: ; return to shader part epilog
-entry:
-  %add = add i32 %sel, 6
-  %ext = extractelement <8 x double> %vec, i32 %add
-  ret double %ext
-}
-
-define amdgpu_ps double @dyn_extract_v8f64_s_s_offset7(<8 x double> inreg %vec, i32 inreg %sel) {
-; GPRIDX-LABEL: dyn_extract_v8f64_s_s_offset7:
-; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_mov_b32 s0, s2
-; GPRIDX-NEXT: s_mov_b32 s1, s3
-; GPRIDX-NEXT: s_mov_b32 s2, s4
-; GPRIDX-NEXT: s_mov_b32 s3, s5
-; GPRIDX-NEXT: s_mov_b32 s4, s6
-; GPRIDX-NEXT: s_mov_b32 s5, s7
-; GPRIDX-NEXT: s_mov_b32 s6, s8
-; GPRIDX-NEXT: s_mov_b32 s7, s9
-; GPRIDX-NEXT: s_mov_b32 s8, s10
-; GPRIDX-NEXT: s_mov_b32 s9, s11
-; GPRIDX-NEXT: s_mov_b32 s10, s12
-; GPRIDX-NEXT: s_mov_b32 s11, s13
-; GPRIDX-NEXT: s_mov_b32 s12, s14
-; GPRIDX-NEXT: s_mov_b32 s13, s15
-; GPRIDX-NEXT: s_mov_b32 s14, s16
-; GPRIDX-NEXT: s_mov_b32 s15, s17
-; GPRIDX-NEXT: s_mov_b32 m0, s18
-; GPRIDX-NEXT: s_nop 0
-; GPRIDX-NEXT: s_movrels_b64 s[0:1], s[14:15]
-; GPRIDX-NEXT: ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v8f64_s_s_offset7:
-; MOVREL: ; %bb.0: ; %entry
-; MOVREL-NEXT: s_mov_b32 s0, s2
-; MOVREL-NEXT: s_mov_b32 s1, s3
-; MOVREL-NEXT: s_mov_b32 s2, s4
-; MOVREL-NEXT: s_mov_b32 s3, s5
-; MOVREL-NEXT: s_mov_b32 s4, s6
-; MOVREL-NEXT: s_mov_b32 s5, s7
-; MOVREL-NEXT: s_mov_b32 s6, s8
-; MOVREL-NEXT: s_mov_b32 s7, s9
-; MOVREL-NEXT: s_mov_b32 s8, s10
-; MOVREL-NEXT: s_mov_b32 s9, s11
-; MOVREL-NEXT: s_mov_b32 s10, s12
-; MOVREL-NEXT: s_mov_b32 s11, s13
-; MOVREL-NEXT: s_mov_b32 s12, s14
-; MOVREL-NEXT: s_mov_b32 s13, s15
-; MOVREL-NEXT: s_mov_b32 s14, s16
-; MOVREL-NEXT: s_mov_b32 s15, s17
-; MOVREL-NEXT: s_mov_b32 m0, s18
-; MOVREL-NEXT: s_movrels_b64 s[0:1], s[14:15]
-; MOVREL-NEXT: ; return to shader part epilog
 entry:
-  %add = add i32 %sel, 7
+  %add = add i32 %sel, 7
   %ext = extractelement <8 x double> %vec, i32 %add
   ret double %ext
 }
 
 define amdgpu_ps double @dyn_extract_v8f64_s_s_offsetm1(<8 x double> inreg %vec, i32 inreg %sel) {
-; GPRIDX-LABEL: dyn_extract_v8f64_s_s_offsetm1:
-; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_mov_b32 s0, s2
-; GPRIDX-NEXT: s_mov_b32 s1, s3
-; GPRIDX-NEXT: s_add_i32 m0, s18, -1
-; GPRIDX-NEXT: s_mov_b32 s2, s4
-; GPRIDX-NEXT: s_mov_b32 s3, s5
-; GPRIDX-NEXT: s_mov_b32 s4, s6
-; GPRIDX-NEXT: s_mov_b32 s5, s7
-; GPRIDX-NEXT: s_mov_b32 s6, s8
-; GPRIDX-NEXT: s_mov_b32 s7, s9
-; GPRIDX-NEXT: s_mov_b32 s8, s10
-; GPRIDX-NEXT: s_mov_b32 s9, s11
-; GPRIDX-NEXT: s_mov_b32 s10, s12
-; GPRIDX-NEXT: s_mov_b32 s11, s13
-; GPRIDX-NEXT: s_mov_b32 s12, s14
-; GPRIDX-NEXT: s_mov_b32 s13, s15
-; GPRIDX-NEXT: s_mov_b32 s14, s16
-; GPRIDX-NEXT: s_mov_b32 s15, s17
-; GPRIDX-NEXT: s_movrels_b64 s[0:1], s[0:1]
-; GPRIDX-NEXT: ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v8f64_s_s_offsetm1:
-; MOVREL: ; %bb.0: ; %entry
-; MOVREL-NEXT: s_mov_b32 s0, s2
-; MOVREL-NEXT: s_mov_b32 s1, s3
-; MOVREL-NEXT: s_add_i32 m0, s18, -1
-; MOVREL-NEXT: s_mov_b32 s2, s4
-; MOVREL-NEXT: s_mov_b32 s3, s5
-; MOVREL-NEXT: s_mov_b32 s4, s6
-; MOVREL-NEXT: s_mov_b32 s5, s7
-; MOVREL-NEXT: s_mov_b32 s6, s8
-; MOVREL-NEXT: s_mov_b32 s7, s9
-; MOVREL-NEXT: s_mov_b32 s8, s10
-; MOVREL-NEXT: s_mov_b32 s9, s11
-; MOVREL-NEXT: s_mov_b32 s10, s12
-; MOVREL-NEXT: s_mov_b32 s11, s13
-; MOVREL-NEXT: s_mov_b32 s12, s14
-; MOVREL-NEXT: s_mov_b32 s13, s15
-; MOVREL-NEXT: s_mov_b32 s14, s16
-; MOVREL-NEXT: s_mov_b32 s15, s17
-; MOVREL-NEXT: s_movrels_b64 s[0:1], s[0:1]
-; MOVREL-NEXT: ; return to shader part epilog
+; GCN-LABEL: dyn_extract_v8f64_s_s_offsetm1:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_mov_b32 s0, s2
+; GCN-NEXT: s_mov_b32 s1, s3
+; GCN-NEXT: s_add_i32 m0, s18, -1
+; GCN-NEXT: s_mov_b32 s2, s4
+; GCN-NEXT: s_mov_b32 s3, s5
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s7
+; GCN-NEXT: s_mov_b32 s6, s8
+; GCN-NEXT: s_mov_b32 s7, s9
+; GCN-NEXT: s_mov_b32 s8, s10
+; GCN-NEXT: s_mov_b32 s9, s11
+; GCN-NEXT: s_mov_b32 s10, s12
+; GCN-NEXT: s_mov_b32 s11, s13
+; GCN-NEXT: s_mov_b32 s12, s14
+; GCN-NEXT: s_mov_b32 s13, s15
+; GCN-NEXT: s_mov_b32 s14, s16
+; GCN-NEXT: s_mov_b32 s15, s17
+; GCN-NEXT: s_movrels_b64 s[0:1], s[0:1]
+; GCN-NEXT: ; return to shader part epilog
 entry:
   %add = add i32 %sel, -1
   %ext = extractelement <8 x double> %vec, i32 %add
@@ -1069,40 +882,24 @@
 }
 
 define i8 addrspace(3)* @dyn_extract_v8p3_v_v(<8 x i8 addrspace(3)*> %vec, i32 %idx) {
-; GPRIDX-LABEL: dyn_extract_v8p3_v_v:
-; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GPRIDX-NEXT: s_mov_b64 s[4:5], exec
-; GPRIDX-NEXT: BB23_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT: v_readfirstlane_b32 s6, v8
-; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s6, v8
-; GPRIDX-NEXT: s_set_gpr_idx_on s6, gpr_idx(SRC0)
-; GPRIDX-NEXT: v_mov_b32_e32 v9, v0
-; GPRIDX-NEXT: s_set_gpr_idx_off
-; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT: s_cbranch_execnz BB23_1
-; GPRIDX-NEXT: ; %bb.2:
-; GPRIDX-NEXT: s_mov_b64 exec, s[4:5]
-; GPRIDX-NEXT: v_mov_b32_e32 v0, v9
-; GPRIDX-NEXT: s_setpc_b64 s[30:31]
-;
-; MOVREL-LABEL: dyn_extract_v8p3_v_v:
-; MOVREL: ; %bb.0: ; %entry
-; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MOVREL-NEXT: s_mov_b64 s[4:5], exec
-; MOVREL-NEXT: BB23_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT: v_readfirstlane_b32 s6, v8
-; MOVREL-NEXT: s_mov_b32 m0, s6
-; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s6, v8
-; MOVREL-NEXT: v_movrels_b32_e32 v9, v0
-; MOVREL-NEXT: s_and_saveexec_b64 vcc, vcc
-; MOVREL-NEXT: s_xor_b64 exec, exec, vcc
-; MOVREL-NEXT: s_cbranch_execnz BB23_1
-; MOVREL-NEXT: ; %bb.2:
-; MOVREL-NEXT: s_mov_b64 exec, s[4:5]
-; MOVREL-NEXT: v_mov_b32_e32 v0, v9
-; MOVREL-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: dyn_extract_v8p3_v_v:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v8
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v8
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v8
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v8
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 6, v8
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 7, v8
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
 entry:
   %ext = extractelement <8 x i8 addrspace(3)*> %vec, i32 %idx
   ret i8 addrspace(3)* %ext
 }
@@ -1111,32 +908,82 @@
 define amdgpu_ps void @dyn_extract_v8p3_s_s(<8 x i8 addrspace(3)*> inreg %vec, i32 inreg %idx) {
 ; GPRIDX-LABEL: dyn_extract_v8p3_s_s:
 ; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_mov_b32 s0, s2
-; GPRIDX-NEXT: s_mov_b32 m0, s10
-; GPRIDX-NEXT: s_mov_b32 s1, s3
-; GPRIDX-NEXT: s_mov_b32 s2, s4
-; GPRIDX-NEXT: s_mov_b32 s3, s5
-; GPRIDX-NEXT: s_mov_b32 s4, s6
-; GPRIDX-NEXT: s_mov_b32 s5, s7
-; GPRIDX-NEXT: s_mov_b32 s6, s8
-; GPRIDX-NEXT: s_mov_b32 s7, s9
-; GPRIDX-NEXT: s_movrels_b32 s0, s0
+; GPRIDX-NEXT: s_cmp_eq_u32 s10, 1
+; GPRIDX-NEXT: s_cselect_b32 s0, 1, 0
+; GPRIDX-NEXT: s_and_b32 s0, s0, 1
+; GPRIDX-NEXT: s_cmp_lg_u32 s0, 0
+; GPRIDX-NEXT: s_cselect_b32 s0, s3, s2
+; GPRIDX-NEXT: s_cmp_eq_u32 s10, 2
+; GPRIDX-NEXT: s_cselect_b32 s1, 1, 0
+; GPRIDX-NEXT: s_and_b32 s1, s1, 1
+; GPRIDX-NEXT: s_cmp_lg_u32 s1, 0
+; GPRIDX-NEXT: s_cselect_b32 s0, s4, s0
+; GPRIDX-NEXT: s_cmp_eq_u32 s10, 3
+; GPRIDX-NEXT: s_cselect_b32 s1, 1, 0
+; GPRIDX-NEXT: s_and_b32 s1, s1, 1
+; GPRIDX-NEXT: s_cmp_lg_u32 s1, 0
+; GPRIDX-NEXT: s_cselect_b32 s0, s5, s0
+; GPRIDX-NEXT: s_cmp_eq_u32 s10, 4
+; GPRIDX-NEXT: s_cselect_b32 s1, 1, 0
+; GPRIDX-NEXT: s_and_b32 s1, s1, 1
+; GPRIDX-NEXT: s_cmp_lg_u32 s1, 0
+; GPRIDX-NEXT: s_cselect_b32 s0, s6, s0
+; GPRIDX-NEXT: s_cmp_eq_u32 s10, 5
+; GPRIDX-NEXT: s_cselect_b32 s1, 1, 0
+; GPRIDX-NEXT: s_and_b32 s1, s1, 1
+; GPRIDX-NEXT: s_cmp_lg_u32 s1, 0
+; GPRIDX-NEXT: s_cselect_b32 s0, s7, s0
+; GPRIDX-NEXT: s_cmp_eq_u32 s10, 6
+; GPRIDX-NEXT: s_cselect_b32 s1, 1, 0
+; GPRIDX-NEXT: s_and_b32 s1, s1, 1
+; GPRIDX-NEXT: s_cmp_lg_u32 s1, 0
+; GPRIDX-NEXT: s_cselect_b32 s0, s8, s0
+; GPRIDX-NEXT: s_cmp_eq_u32 s10, 7
+; GPRIDX-NEXT: s_cselect_b32 s1, 1, 0
+; GPRIDX-NEXT: s_and_b32 s1, s1, 1
+; GPRIDX-NEXT: s_cmp_lg_u32 s1, 0
+; GPRIDX-NEXT: s_cselect_b32 s0, s9, s0
 ; GPRIDX-NEXT: v_mov_b32_e32 v0, s0
 ; GPRIDX-NEXT: ds_write_b32 v0, v0
 ; GPRIDX-NEXT: s_endpgm
 ;
 ; MOVREL-LABEL: dyn_extract_v8p3_s_s:
 ; MOVREL: ; %bb.0: ; %entry
-; MOVREL-NEXT: s_mov_b32 s0, s2
-; MOVREL-NEXT: s_mov_b32 m0, s10
-; MOVREL-NEXT: s_mov_b32 s1, s3
-; MOVREL-NEXT: s_mov_b32 s2, s4
-; MOVREL-NEXT: s_mov_b32 s3, s5
-; MOVREL-NEXT: s_mov_b32 s4, s6
-; MOVREL-NEXT: s_mov_b32 s5, s7
-; MOVREL-NEXT: s_mov_b32 s6, s8
-; MOVREL-NEXT: s_mov_b32 s7, s9
-; MOVREL-NEXT: s_movrels_b32 s0, s0
+; MOVREL-NEXT: s_cmp_eq_u32 s10, 1
+; MOVREL-NEXT: s_cselect_b32 s0, 1, 0
+; MOVREL-NEXT: s_and_b32 s0, s0, 1
+; MOVREL-NEXT: s_cmp_lg_u32 s0, 0
+; MOVREL-NEXT: s_cselect_b32 s0, s3, s2
+; MOVREL-NEXT: s_cmp_eq_u32 s10, 2
+; MOVREL-NEXT: s_cselect_b32 s1, 1, 0
+; MOVREL-NEXT: s_and_b32 s1, s1, 1
+; MOVREL-NEXT: s_cmp_lg_u32 s1, 0
+; MOVREL-NEXT: s_cselect_b32 s0, s4, s0
+; MOVREL-NEXT: s_cmp_eq_u32 s10, 3
+; MOVREL-NEXT: s_cselect_b32 s1, 1, 0
+; MOVREL-NEXT: s_and_b32 s1, s1, 1
+; MOVREL-NEXT: s_cmp_lg_u32 s1, 0
+; MOVREL-NEXT: s_cselect_b32 s0, s5, s0
+; MOVREL-NEXT: s_cmp_eq_u32 s10, 4
+; MOVREL-NEXT: s_cselect_b32 s1, 1, 0
+; MOVREL-NEXT: s_and_b32 s1, s1, 1
+; MOVREL-NEXT: s_cmp_lg_u32 s1, 0
+; MOVREL-NEXT: s_cselect_b32 s0, s6, s0
+; MOVREL-NEXT: s_cmp_eq_u32 s10, 5
+; MOVREL-NEXT: s_cselect_b32 s1, 1, 0
+; MOVREL-NEXT: s_and_b32 s1, s1, 1
+; MOVREL-NEXT: s_cmp_lg_u32 s1, 0
+; MOVREL-NEXT: s_cselect_b32 s0, s7, s0
+; MOVREL-NEXT: s_cmp_eq_u32 s10, 6
+; MOVREL-NEXT: s_cselect_b32 s1, 1, 0
+; MOVREL-NEXT: s_and_b32 s1, s1, 1
+; MOVREL-NEXT: s_cmp_lg_u32 s1, 0
+; MOVREL-NEXT: s_cselect_b32 s0, s8, s0
+; MOVREL-NEXT: s_cmp_eq_u32 s10, 7
+; MOVREL-NEXT: s_cselect_b32 s1, 1, 0
+; MOVREL-NEXT: s_and_b32 s1, s1, 1
+; MOVREL-NEXT: s_cmp_lg_u32 s1, 0
+; MOVREL-NEXT: s_cselect_b32 s0, s9, s0
 ; MOVREL-NEXT: v_mov_b32_e32 v0, s0
 ; MOVREL-NEXT: s_mov_b32 m0, -1
 ; MOVREL-NEXT: ds_write_b32 v0, v0
@@ -1310,554 +1157,388 @@
 }
 
 define amdgpu_ps float @dyn_extract_v16f32_s_s(i32 inreg %sel) {
-; GPRIDX-LABEL: dyn_extract_v16f32_s_s:
-; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_mov_b32 s4, 1.0
-; GPRIDX-NEXT: s_mov_b32 m0, s2
-; GPRIDX-NEXT: s_mov_b32 s19, 0x41800000
-; GPRIDX-NEXT: s_mov_b32 s18, 0x41700000
-; GPRIDX-NEXT: s_mov_b32 s17, 0x41600000
-; GPRIDX-NEXT: s_mov_b32 s16, 0x41500000
-; GPRIDX-NEXT: s_mov_b32 s15, 0x41400000
-; GPRIDX-NEXT: s_mov_b32 s14, 0x41300000
-; GPRIDX-NEXT: s_mov_b32 s13, 0x41200000
-; GPRIDX-NEXT: s_mov_b32 s12, 0x41100000
-; GPRIDX-NEXT: s_mov_b32 s11, 0x41000000
-; GPRIDX-NEXT: s_mov_b32 s10, 0x40e00000
-; GPRIDX-NEXT: s_mov_b32 s9, 0x40c00000
-; GPRIDX-NEXT: s_mov_b32 s8, 0x40a00000
-; GPRIDX-NEXT: s_mov_b32 s7, 4.0
-; GPRIDX-NEXT: s_mov_b32 s6, 0x40400000
-; GPRIDX-NEXT: s_mov_b32 s5, 2.0
-; GPRIDX-NEXT: s_movrels_b32 s0, s4
-; GPRIDX-NEXT: v_mov_b32_e32 v0, s0
-; GPRIDX-NEXT: ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v16f32_s_s:
-; MOVREL: ; %bb.0: ; %entry
-; MOVREL-NEXT: s_mov_b32 s4, 1.0
-; MOVREL-NEXT: s_mov_b32 m0, s2
-; MOVREL-NEXT: s_mov_b32 s19, 0x41800000
-; MOVREL-NEXT: s_mov_b32 s18, 0x41700000
-; MOVREL-NEXT: s_mov_b32 s17, 0x41600000
-; MOVREL-NEXT: s_mov_b32 s16, 0x41500000
s_mov_b32 s16, 0x41500000
-; MOVREL-NEXT: s_mov_b32 s15, 0x41400000
-; MOVREL-NEXT: s_mov_b32 s14, 0x41300000
-; MOVREL-NEXT: s_mov_b32 s13, 0x41200000
-; MOVREL-NEXT: s_mov_b32 s12, 0x41100000
-; MOVREL-NEXT: s_mov_b32 s11, 0x41000000
-; MOVREL-NEXT: s_mov_b32 s10, 0x40e00000
-; MOVREL-NEXT: s_mov_b32 s9, 0x40c00000
-; MOVREL-NEXT: s_mov_b32 s8, 0x40a00000
-; MOVREL-NEXT: s_mov_b32 s7, 4.0
-; MOVREL-NEXT: s_mov_b32 s6, 0x40400000
-; MOVREL-NEXT: s_mov_b32 s5, 2.0
-; MOVREL-NEXT: s_movrels_b32 s0, s4
-; MOVREL-NEXT: v_mov_b32_e32 v0, s0
-; MOVREL-NEXT: ; return to shader part epilog
+; GCN-LABEL: dyn_extract_v16f32_s_s:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_mov_b32 s4, 1.0
+; GCN-NEXT: s_mov_b32 m0, s2
+; GCN-NEXT: s_mov_b32 s19, 0x41800000
+; GCN-NEXT: s_mov_b32 s18, 0x41700000
+; GCN-NEXT: s_mov_b32 s17, 0x41600000
+; GCN-NEXT: s_mov_b32 s16, 0x41500000
+; GCN-NEXT: s_mov_b32 s15, 0x41400000
+; GCN-NEXT: s_mov_b32 s14, 0x41300000
+; GCN-NEXT: s_mov_b32 s13, 0x41200000
+; GCN-NEXT: s_mov_b32 s12, 0x41100000
+; GCN-NEXT: s_mov_b32 s11, 0x41000000
+; GCN-NEXT: s_mov_b32 s10, 0x40e00000
+; GCN-NEXT: s_mov_b32 s9, 0x40c00000
+; GCN-NEXT: s_mov_b32 s8, 0x40a00000
+; GCN-NEXT: s_mov_b32 s7, 4.0
+; GCN-NEXT: s_mov_b32 s6, 0x40400000
+; GCN-NEXT: s_mov_b32 s5, 2.0
+; GCN-NEXT: s_movrels_b32 s0, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
entry:
  %ext = extractelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, i32 %sel
  ret float %ext
}

define amdgpu_ps float @dyn_extract_v32f32_s_s(i32 inreg %sel) {
-; GPRIDX-LABEL: dyn_extract_v32f32_s_s:
-; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_mov_b32 s36, 1.0
-; GPRIDX-NEXT: s_mov_b32 m0, s2
-; GPRIDX-NEXT: s_mov_b32 s67, 0x42000000
-; GPRIDX-NEXT: s_mov_b32 s66, 0x41f80000
-; GPRIDX-NEXT: s_mov_b32 s65, 0x41f00000
-; GPRIDX-NEXT: s_mov_b32 s64, 0x41e80000
-; GPRIDX-NEXT: s_mov_b32 s63, 0x41e00000
-; GPRIDX-NEXT: s_mov_b32 s62, 0x41d80000
-; GPRIDX-NEXT: s_mov_b32 s61, 0x41d00000
-; GPRIDX-NEXT: s_mov_b32 s60, 0x41c80000
-; GPRIDX-NEXT: s_mov_b32 s59, 0x41c00000
-; GPRIDX-NEXT: s_mov_b32 s58, 0x41b80000
-; GPRIDX-NEXT: s_mov_b32 s57, 0x41b00000
-; GPRIDX-NEXT: s_mov_b32 s56, 0x41a80000
-; GPRIDX-NEXT: s_mov_b32 s55, 0x41a00000
-; GPRIDX-NEXT: s_mov_b32 s54, 0x41980000
-; GPRIDX-NEXT: s_mov_b32 s53, 0x41900000
-; GPRIDX-NEXT: s_mov_b32 s52, 0x41880000
-; GPRIDX-NEXT: s_mov_b32 s51, 0x41800000
-; GPRIDX-NEXT: s_mov_b32 s50, 0x41700000
-; GPRIDX-NEXT: s_mov_b32 s49, 0x41600000
-; GPRIDX-NEXT: s_mov_b32 s48, 0x41500000
-; GPRIDX-NEXT: s_mov_b32 s47, 0x41400000
-; GPRIDX-NEXT: s_mov_b32 s46, 0x41300000
-; GPRIDX-NEXT: s_mov_b32 s45, 0x41200000
-; GPRIDX-NEXT: s_mov_b32 s44, 0x41100000
-; GPRIDX-NEXT: s_mov_b32 s43, 0x41000000
-; GPRIDX-NEXT: s_mov_b32 s42, 0x40e00000
-; GPRIDX-NEXT: s_mov_b32 s41, 0x40c00000
-; GPRIDX-NEXT: s_mov_b32 s40, 0x40a00000
-; GPRIDX-NEXT: s_mov_b32 s39, 4.0
-; GPRIDX-NEXT: s_mov_b32 s38, 0x40400000
-; GPRIDX-NEXT: s_mov_b32 s37, 2.0
-; GPRIDX-NEXT: s_movrels_b32 s0, s36
-; GPRIDX-NEXT: v_mov_b32_e32 v0, s0
-; GPRIDX-NEXT: ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v32f32_s_s:
-; MOVREL: ; %bb.0: ; %entry
-; MOVREL-NEXT: s_mov_b32 s36, 1.0
-; MOVREL-NEXT: s_mov_b32 m0, s2
-; MOVREL-NEXT: s_mov_b32 s67, 0x42000000
-; MOVREL-NEXT: s_mov_b32 s66, 0x41f80000
-; MOVREL-NEXT: s_mov_b32 s65, 0x41f00000
-; MOVREL-NEXT: s_mov_b32 s64, 0x41e80000
-; MOVREL-NEXT: s_mov_b32 s63, 0x41e00000
-; MOVREL-NEXT: s_mov_b32 s62, 0x41d80000
-; MOVREL-NEXT: s_mov_b32 s61, 0x41d00000
-; MOVREL-NEXT: s_mov_b32 s60, 0x41c80000
-; MOVREL-NEXT: s_mov_b32 s59, 0x41c00000
-; MOVREL-NEXT: s_mov_b32 s58, 0x41b80000
-; MOVREL-NEXT: s_mov_b32 s57, 0x41b00000
-; MOVREL-NEXT: s_mov_b32 s56, 0x41a80000
-; MOVREL-NEXT: s_mov_b32 s55, 0x41a00000
-; MOVREL-NEXT: s_mov_b32 s54, 0x41980000
-; MOVREL-NEXT: s_mov_b32 s53, 0x41900000
-; MOVREL-NEXT: s_mov_b32 s52, 0x41880000
-; MOVREL-NEXT: s_mov_b32 s51, 0x41800000
-; MOVREL-NEXT: s_mov_b32 s50, 0x41700000
-; MOVREL-NEXT: s_mov_b32 s49, 0x41600000
-; MOVREL-NEXT: s_mov_b32 s48, 0x41500000
-; MOVREL-NEXT: s_mov_b32 s47, 0x41400000
-; MOVREL-NEXT: s_mov_b32 s46, 0x41300000
-; MOVREL-NEXT: s_mov_b32 s45, 0x41200000
-; MOVREL-NEXT: s_mov_b32 s44, 0x41100000
-; MOVREL-NEXT: s_mov_b32 s43, 0x41000000
-; MOVREL-NEXT: s_mov_b32 s42, 0x40e00000
-; MOVREL-NEXT: s_mov_b32 s41, 0x40c00000
-; MOVREL-NEXT: s_mov_b32 s40, 0x40a00000
-; MOVREL-NEXT: s_mov_b32 s39, 4.0
-; MOVREL-NEXT: s_mov_b32 s38, 0x40400000
-; MOVREL-NEXT: s_mov_b32 s37, 2.0
-; MOVREL-NEXT: s_movrels_b32 s0, s36
-; MOVREL-NEXT: v_mov_b32_e32 v0, s0
-; MOVREL-NEXT: ; return to shader part epilog
+; GCN-LABEL: dyn_extract_v32f32_s_s:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_mov_b32 s36, 1.0
+; GCN-NEXT: s_mov_b32 m0, s2
+; GCN-NEXT: s_mov_b32 s67, 0x42000000
+; GCN-NEXT: s_mov_b32 s66, 0x41f80000
+; GCN-NEXT: s_mov_b32 s65, 0x41f00000
+; GCN-NEXT: s_mov_b32 s64, 0x41e80000
+; GCN-NEXT: s_mov_b32 s63, 0x41e00000
+; GCN-NEXT: s_mov_b32 s62, 0x41d80000
+; GCN-NEXT: s_mov_b32 s61, 0x41d00000
+; GCN-NEXT: s_mov_b32 s60, 0x41c80000
+; GCN-NEXT: s_mov_b32 s59, 0x41c00000
+; GCN-NEXT: s_mov_b32 s58, 0x41b80000
+; GCN-NEXT: s_mov_b32 s57, 0x41b00000
+; GCN-NEXT: s_mov_b32 s56, 0x41a80000
+; GCN-NEXT: s_mov_b32 s55, 0x41a00000
+; GCN-NEXT: s_mov_b32 s54, 0x41980000
+; GCN-NEXT: s_mov_b32 s53, 0x41900000
+; GCN-NEXT: s_mov_b32 s52, 0x41880000
+; GCN-NEXT: s_mov_b32 s51, 0x41800000
+; GCN-NEXT: s_mov_b32 s50, 0x41700000
+; GCN-NEXT: s_mov_b32 s49, 0x41600000
+; GCN-NEXT: s_mov_b32 s48, 0x41500000
+; GCN-NEXT: s_mov_b32 s47, 0x41400000
+; GCN-NEXT: s_mov_b32 s46, 0x41300000
+; GCN-NEXT: s_mov_b32 s45, 0x41200000
+; GCN-NEXT: s_mov_b32 s44, 0x41100000
+; GCN-NEXT: s_mov_b32 s43, 0x41000000
+; GCN-NEXT: s_mov_b32 s42, 0x40e00000
+; GCN-NEXT: s_mov_b32 s41, 0x40c00000
+; GCN-NEXT: s_mov_b32 s40, 0x40a00000
+; GCN-NEXT: s_mov_b32 s39, 4.0
+; GCN-NEXT: s_mov_b32 s38, 0x40400000
+; GCN-NEXT: s_mov_b32 s37, 2.0
+; GCN-NEXT: s_movrels_b32 s0, s36
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
entry:
  %ext = extractelement <32 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0, float 17.0, float 18.0, float 19.0, float 20.0, float 21.0, float 22.0, float 23.0, float 24.0, float 25.0, float 26.0, float 27.0, float 28.0, float 29.0, float 30.0, float 31.0, float 32.0>, i32 %sel
  ret float %ext
}

define amdgpu_ps double @dyn_extract_v16f64_s_s(i32 inreg %sel) {
-; GPRIDX-LABEL: dyn_extract_v16f64_s_s:
-; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_mov_b32 s66, 0
-; GPRIDX-NEXT: s_mov_b64 s[36:37], 1.0
-; GPRIDX-NEXT: s_mov_b32 m0, s2
-; GPRIDX-NEXT: s_mov_b32 s67, 0x40300000
-; GPRIDX-NEXT: s_mov_b32 s65, 0x402e0000
-; GPRIDX-NEXT: s_mov_b32 s64, s66
-; GPRIDX-NEXT: s_mov_b32 s63, 0x402c0000
-; GPRIDX-NEXT: s_mov_b32 s62, s66
-; GPRIDX-NEXT: s_mov_b32 s61, 0x402a0000
-; GPRIDX-NEXT: s_mov_b32 s60, s66
-; GPRIDX-NEXT: s_mov_b32 s59, 0x40280000
-; GPRIDX-NEXT: s_mov_b32 s58, s66
-; GPRIDX-NEXT: s_mov_b32 s57, 0x40260000
-; GPRIDX-NEXT: s_mov_b32 s56, s66
-; GPRIDX-NEXT: s_mov_b32 s55, 0x40240000
-; GPRIDX-NEXT: s_mov_b32 s54, s66
-; GPRIDX-NEXT: s_mov_b32 s53, 0x40220000
-; GPRIDX-NEXT: s_mov_b32 s52, s66
-; GPRIDX-NEXT: s_mov_b32 s51, 0x40200000
-; GPRIDX-NEXT: s_mov_b32 s50, s66
-; GPRIDX-NEXT: s_mov_b32 s49, 0x401c0000
-; GPRIDX-NEXT: s_mov_b32 s48, s66
-; GPRIDX-NEXT: s_mov_b32 s47, 0x40180000
-; GPRIDX-NEXT: s_mov_b32 s46, s66
-; GPRIDX-NEXT: s_mov_b32 s45, 0x40140000
-; GPRIDX-NEXT: s_mov_b32 s44, s66
-; GPRIDX-NEXT: s_mov_b64 s[42:43], 4.0
-; GPRIDX-NEXT: s_mov_b32 s41, 0x40080000
-; GPRIDX-NEXT: s_mov_b32 s40, s66
-; GPRIDX-NEXT: s_mov_b64 s[38:39], 2.0
-; GPRIDX-NEXT: s_movrels_b64 s[0:1], s[36:37]
-; GPRIDX-NEXT: ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v16f64_s_s:
-; MOVREL: ; %bb.0: ; %entry
-; MOVREL-NEXT: s_mov_b32 s66, 0
-; MOVREL-NEXT: s_mov_b64 s[36:37], 1.0
-; MOVREL-NEXT: s_mov_b32 m0, s2
-; MOVREL-NEXT: s_mov_b32 s67, 0x40300000
-; MOVREL-NEXT: s_mov_b32 s65, 0x402e0000
-; MOVREL-NEXT: s_mov_b32 s64, s66
-; MOVREL-NEXT: s_mov_b32 s63, 0x402c0000
-; MOVREL-NEXT: s_mov_b32 s62, s66
-; MOVREL-NEXT: s_mov_b32 s61, 0x402a0000
-; MOVREL-NEXT: s_mov_b32 s60, s66
-; MOVREL-NEXT: s_mov_b32 s59, 0x40280000
-; MOVREL-NEXT: s_mov_b32 s58, s66
-; MOVREL-NEXT: s_mov_b32 s57, 0x40260000
-; MOVREL-NEXT: s_mov_b32 s56, s66
-; MOVREL-NEXT: s_mov_b32 s55, 0x40240000
-; MOVREL-NEXT: s_mov_b32 s54, s66
-; MOVREL-NEXT: s_mov_b32 s53, 0x40220000
-; MOVREL-NEXT: s_mov_b32 s52, s66
-; MOVREL-NEXT: s_mov_b32 s51, 0x40200000
-; MOVREL-NEXT: s_mov_b32 s50, s66
-; MOVREL-NEXT: s_mov_b32 s49, 0x401c0000
-; MOVREL-NEXT: s_mov_b32 s48, s66
-; MOVREL-NEXT: s_mov_b32 s47, 0x40180000
-; MOVREL-NEXT: s_mov_b32 s46, s66
-; MOVREL-NEXT: s_mov_b32 s45, 0x40140000
-; MOVREL-NEXT: s_mov_b32 s44, s66
-; MOVREL-NEXT: s_mov_b64 s[42:43], 4.0
-; MOVREL-NEXT: s_mov_b32 s41, 0x40080000
-; MOVREL-NEXT: s_mov_b32 s40, s66
-; MOVREL-NEXT: s_mov_b64 s[38:39], 2.0
-; MOVREL-NEXT: s_movrels_b64 s[0:1], s[36:37]
-; MOVREL-NEXT: ; return to shader part epilog
+; GCN-LABEL: dyn_extract_v16f64_s_s:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_mov_b32 s66, 0
+; GCN-NEXT: s_mov_b64 s[36:37], 1.0
+; GCN-NEXT: s_mov_b32 m0, s2
+; GCN-NEXT: s_mov_b32 s67, 0x40300000
+; GCN-NEXT: s_mov_b32 s65, 0x402e0000
+; GCN-NEXT: s_mov_b32 s64, s66
+; GCN-NEXT: s_mov_b32 s63, 0x402c0000
+; GCN-NEXT: s_mov_b32 s62, s66
+; GCN-NEXT: s_mov_b32 s61, 0x402a0000
+; GCN-NEXT: s_mov_b32 s60, s66
+; GCN-NEXT: s_mov_b32 s59, 0x40280000
+; GCN-NEXT: s_mov_b32 s58, s66
+; GCN-NEXT: s_mov_b32 s57, 0x40260000
+; GCN-NEXT: s_mov_b32 s56, s66
+; GCN-NEXT: s_mov_b32 s55, 0x40240000
+; GCN-NEXT: s_mov_b32 s54, s66
+; GCN-NEXT: s_mov_b32 s53, 0x40220000
+; GCN-NEXT: s_mov_b32 s52, s66
+; GCN-NEXT: s_mov_b32 s51, 0x40200000
+; GCN-NEXT: s_mov_b32 s50, s66
+; GCN-NEXT: s_mov_b32 s49, 0x401c0000
+; GCN-NEXT: s_mov_b32 s48, s66
+; GCN-NEXT: s_mov_b32 s47, 0x40180000
+; GCN-NEXT: s_mov_b32 s46, s66
+; GCN-NEXT: s_mov_b32 s45, 0x40140000
+; GCN-NEXT: s_mov_b32 s44, s66
+; GCN-NEXT: s_mov_b64 s[42:43], 4.0
+; GCN-NEXT: s_mov_b32 s41, 0x40080000
+; GCN-NEXT: s_mov_b32 s40, s66
+; GCN-NEXT: s_mov_b64 s[38:39], 2.0
+; GCN-NEXT: s_movrels_b64 s[0:1], s[36:37]
+; GCN-NEXT: ; return to shader part epilog
entry:
  %ext = extractelement <16 x double> <double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, double 7.0, double 8.0, double 9.0, double 10.0, double 11.0, double 12.0, double 13.0, double 14.0, double 15.0, double 16.0>, i32 %sel
  ret double %ext
}

define amdgpu_ps float @dyn_extract_v6f32_s_v(<6 x float> inreg %vec, i32 %sel) {
-; GPRIDX-LABEL: dyn_extract_v6f32_s_v:
-; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_mov_b32 s0, s2
-; GPRIDX-NEXT: s_mov_b32 s1, s3
-; GPRIDX-NEXT: s_mov_b32 s2, s4
-; GPRIDX-NEXT: s_mov_b32 s3, s5
-; GPRIDX-NEXT: s_mov_b32 s4, s6
-; GPRIDX-NEXT: s_mov_b32 s5, s7
-; GPRIDX-NEXT: s_mov_b64 s[6:7], exec
-; GPRIDX-NEXT: BB33_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT: v_readfirstlane_b32 s8, v0
-; GPRIDX-NEXT: s_mov_b32 m0, s8
-; GPRIDX-NEXT:
v_cmp_eq_u32_e32 vcc, s8, v0 -; GPRIDX-NEXT: s_movrels_b32 s8, s0 -; GPRIDX-NEXT: v_mov_b32_e32 v1, s8 -; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc -; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc -; GPRIDX-NEXT: s_cbranch_execnz BB33_1 -; GPRIDX-NEXT: ; %bb.2: -; GPRIDX-NEXT: s_mov_b64 exec, s[6:7] -; GPRIDX-NEXT: v_mov_b32_e32 v0, v1 -; GPRIDX-NEXT: ; return to shader part epilog -; -; MOVREL-LABEL: dyn_extract_v6f32_s_v: -; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: s_mov_b32 s0, s2 -; MOVREL-NEXT: s_mov_b32 s1, s3 -; MOVREL-NEXT: s_mov_b32 s2, s4 -; MOVREL-NEXT: s_mov_b32 s3, s5 -; MOVREL-NEXT: s_mov_b32 s4, s6 -; MOVREL-NEXT: s_mov_b32 s5, s7 -; MOVREL-NEXT: s_mov_b64 s[6:7], exec -; MOVREL-NEXT: BB33_1: ; =>This Inner Loop Header: Depth=1 -; MOVREL-NEXT: v_readfirstlane_b32 s8, v0 -; MOVREL-NEXT: s_mov_b32 m0, s8 -; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s8, v0 -; MOVREL-NEXT: s_movrels_b32 s8, s0 -; MOVREL-NEXT: v_mov_b32_e32 v1, s8 -; MOVREL-NEXT: s_and_saveexec_b64 vcc, vcc -; MOVREL-NEXT: s_xor_b64 exec, exec, vcc -; MOVREL-NEXT: s_cbranch_execnz BB33_1 -; MOVREL-NEXT: ; %bb.2: -; MOVREL-NEXT: s_mov_b64 exec, s[6:7] -; MOVREL-NEXT: v_mov_b32_e32 v0, v1 -; MOVREL-NEXT: ; return to shader part epilog +; GCN-LABEL: dyn_extract_v6f32_s_v: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GCN-NEXT: ; return to shader part epilog entry: %ext = extractelement <6 x float> %vec, i32 %sel ret float %ext } define float @dyn_extract_v6f32_v_v(<6 x float> %vec, i32 %sel) { -; GPRIDX-LABEL: dyn_extract_v6f32_v_v: -; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GPRIDX-NEXT: s_mov_b64 s[4:5], exec -; GPRIDX-NEXT: BB34_1: ; =>This Inner Loop Header: Depth=1 -; GPRIDX-NEXT: v_readfirstlane_b32 s6, v6 -; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s6, v6 -; GPRIDX-NEXT: s_set_gpr_idx_on s6, gpr_idx(SRC0) -; GPRIDX-NEXT: v_mov_b32_e32 v7, v0 -; GPRIDX-NEXT: s_set_gpr_idx_off -; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc -; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc -; GPRIDX-NEXT: s_cbranch_execnz BB34_1 -; GPRIDX-NEXT: ; %bb.2: -; GPRIDX-NEXT: s_mov_b64 exec, s[4:5] -; GPRIDX-NEXT: v_mov_b32_e32 v0, v7 -; GPRIDX-NEXT: s_setpc_b64 s[30:31] -; -; MOVREL-LABEL: dyn_extract_v6f32_v_v: -; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; MOVREL-NEXT: s_mov_b64 s[4:5], exec -; MOVREL-NEXT: BB34_1: ; =>This Inner Loop Header: Depth=1 -; MOVREL-NEXT: v_readfirstlane_b32 s6, v6 -; MOVREL-NEXT: s_mov_b32 m0, s6 -; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s6, v6 -; MOVREL-NEXT: v_movrels_b32_e32 v7, v0 -; MOVREL-NEXT: s_and_saveexec_b64 vcc, vcc -; MOVREL-NEXT: s_xor_b64 exec, exec, vcc -; MOVREL-NEXT: s_cbranch_execnz BB34_1 -; MOVREL-NEXT: ; %bb.2: -; MOVREL-NEXT: s_mov_b64 exec, s[4:5] -; MOVREL-NEXT: v_mov_b32_e32 v0, v7 -; MOVREL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: dyn_extract_v6f32_v_v: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v6 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v6 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v6 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v6 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] entry: %ext = extractelement <6 x float> %vec, i32 %sel ret float %ext } define amdgpu_ps float @dyn_extract_v6f32_v_s(<6 x float> %vec, i32 inreg %sel) { -; GPRIDX-LABEL: dyn_extract_v6f32_v_s: -; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0) -; GPRIDX-NEXT: v_mov_b32_e32 v0, v0 -; GPRIDX-NEXT: s_set_gpr_idx_off -; GPRIDX-NEXT: ; return to shader part epilog -; -; MOVREL-LABEL: dyn_extract_v6f32_v_s: -; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: s_mov_b32 m0, s2 -; MOVREL-NEXT: v_movrels_b32_e32 v0, v0 -; MOVREL-NEXT: ; return to shader part epilog +; GCN-LABEL: dyn_extract_v6f32_v_s: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_cmp_eq_u32 s2, 1 +; GCN-NEXT: s_cselect_b32 s0, 1, 0 +; GCN-NEXT: s_and_b32 s0, 1, s0 +; GCN-NEXT: s_cmp_eq_u32 s2, 2 +; GCN-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GCN-NEXT: s_cselect_b32 s0, 1, 0 +; GCN-NEXT: s_and_b32 s0, 1, s0 +; GCN-NEXT: s_cmp_eq_u32 s2, 3 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GCN-NEXT: s_cselect_b32 s0, 1, 0 +; GCN-NEXT: s_and_b32 s0, 1, s0 +; GCN-NEXT: s_cmp_eq_u32 s2, 4 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GCN-NEXT: s_cselect_b32 s0, 1, 0 +; GCN-NEXT: s_and_b32 s0, 1, s0 +; GCN-NEXT: s_cmp_eq_u32 s2, 5 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GCN-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GCN-NEXT: s_cselect_b32 s0, 1, 0 +; GCN-NEXT: s_and_b32 s0, 1, s0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GCN-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN-NEXT: ; return to shader part epilog entry: %ext = extractelement <6 x float> %vec, i32 %sel ret float %ext } define amdgpu_ps float @dyn_extract_v6f32_s_s(<6 x float> inreg %vec, i32 inreg %sel) { -; GPRIDX-LABEL: dyn_extract_v6f32_s_s: -; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_mov_b32 s0, s2 -; GPRIDX-NEXT: s_mov_b32 m0, s8 -; GPRIDX-NEXT: s_mov_b32 s1, s3 -; GPRIDX-NEXT: s_mov_b32 s2, s4 -; GPRIDX-NEXT: s_mov_b32 s3, s5 -; GPRIDX-NEXT: s_mov_b32 s4, s6 -; GPRIDX-NEXT: s_mov_b32 s5, s7 -; GPRIDX-NEXT: s_movrels_b32 s0, s0 -; GPRIDX-NEXT: v_mov_b32_e32 v0, s0 -; GPRIDX-NEXT: ; return to shader part epilog -; -; MOVREL-LABEL: dyn_extract_v6f32_s_s: -; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: s_mov_b32 s0, s2 -; MOVREL-NEXT: s_mov_b32 m0, s8 -; MOVREL-NEXT: s_mov_b32 s1, s3 -; MOVREL-NEXT: s_mov_b32 s2, s4 -; MOVREL-NEXT: s_mov_b32 s3, s5 -; MOVREL-NEXT: s_mov_b32 s4, s6 -; MOVREL-NEXT: s_mov_b32 s5, s7 -; MOVREL-NEXT: s_movrels_b32 s0, s0 -; MOVREL-NEXT: v_mov_b32_e32 v0, s0 -; MOVREL-NEXT: ; return to shader part epilog +; GCN-LABEL: dyn_extract_v6f32_s_s: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_cmp_eq_u32 s8, 1 +; GCN-NEXT: s_cselect_b32 s0, 1, 0 +; GCN-NEXT: s_and_b32 s0, s0, 1 +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_cselect_b32 s0, s3, s2 +; GCN-NEXT: s_cmp_eq_u32 s8, 2 +; GCN-NEXT: s_cselect_b32 s1, 1, 0 +; GCN-NEXT: s_and_b32 s1, s1, 1 +; GCN-NEXT: s_cmp_lg_u32 s1, 0 +; GCN-NEXT: s_cselect_b32 s0, s4, s0 +; 
GCN-NEXT: s_cmp_eq_u32 s8, 3 +; GCN-NEXT: s_cselect_b32 s1, 1, 0 +; GCN-NEXT: s_and_b32 s1, s1, 1 +; GCN-NEXT: s_cmp_lg_u32 s1, 0 +; GCN-NEXT: s_cselect_b32 s0, s5, s0 +; GCN-NEXT: s_cmp_eq_u32 s8, 4 +; GCN-NEXT: s_cselect_b32 s1, 1, 0 +; GCN-NEXT: s_and_b32 s1, s1, 1 +; GCN-NEXT: s_cmp_lg_u32 s1, 0 +; GCN-NEXT: s_cselect_b32 s0, s6, s0 +; GCN-NEXT: s_cmp_eq_u32 s8, 5 +; GCN-NEXT: s_cselect_b32 s1, 1, 0 +; GCN-NEXT: s_and_b32 s1, s1, 1 +; GCN-NEXT: s_cmp_lg_u32 s1, 0 +; GCN-NEXT: s_cselect_b32 s0, s7, s0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: ; return to shader part epilog entry: %ext = extractelement <6 x float> %vec, i32 %sel ret float %ext } define amdgpu_ps float @dyn_extract_v7f32_s_v(<7 x float> inreg %vec, i32 %sel) { -; GPRIDX-LABEL: dyn_extract_v7f32_s_v: -; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_mov_b32 s0, s2 -; GPRIDX-NEXT: s_mov_b32 s1, s3 -; GPRIDX-NEXT: s_mov_b32 s2, s4 -; GPRIDX-NEXT: s_mov_b32 s3, s5 -; GPRIDX-NEXT: s_mov_b32 s4, s6 -; GPRIDX-NEXT: s_mov_b32 s6, s8 -; GPRIDX-NEXT: s_mov_b32 s5, s7 -; GPRIDX-NEXT: s_mov_b64 s[8:9], exec -; GPRIDX-NEXT: BB37_1: ; =>This Inner Loop Header: Depth=1 -; GPRIDX-NEXT: v_readfirstlane_b32 s7, v0 -; GPRIDX-NEXT: s_mov_b32 m0, s7 -; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s7, v0 -; GPRIDX-NEXT: s_movrels_b32 s7, s0 -; GPRIDX-NEXT: v_mov_b32_e32 v1, s7 -; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc -; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc -; GPRIDX-NEXT: s_cbranch_execnz BB37_1 -; GPRIDX-NEXT: ; %bb.2: -; GPRIDX-NEXT: s_mov_b64 exec, s[8:9] -; GPRIDX-NEXT: v_mov_b32_e32 v0, v1 -; GPRIDX-NEXT: ; return to shader part epilog -; -; MOVREL-LABEL: dyn_extract_v7f32_s_v: -; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: s_mov_b32 s0, s2 -; MOVREL-NEXT: s_mov_b32 s1, s3 -; MOVREL-NEXT: s_mov_b32 s2, s4 -; MOVREL-NEXT: s_mov_b32 s3, s5 -; MOVREL-NEXT: s_mov_b32 s4, s6 -; MOVREL-NEXT: s_mov_b32 s6, s8 -; MOVREL-NEXT: s_mov_b32 s5, s7 -; MOVREL-NEXT: s_mov_b64 s[8:9], exec -; MOVREL-NEXT: BB37_1: ; =>This Inner Loop Header: Depth=1 -; MOVREL-NEXT: v_readfirstlane_b32 s7, v0 -; MOVREL-NEXT: s_mov_b32 m0, s7 -; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s7, v0 -; MOVREL-NEXT: s_movrels_b32 s7, s0 -; MOVREL-NEXT: v_mov_b32_e32 v1, s7 -; MOVREL-NEXT: s_and_saveexec_b64 vcc, vcc -; MOVREL-NEXT: s_xor_b64 exec, exec, vcc -; MOVREL-NEXT: s_cbranch_execnz BB37_1 -; MOVREL-NEXT: ; %bb.2: -; MOVREL-NEXT: s_mov_b64 exec, s[8:9] -; MOVREL-NEXT: v_mov_b32_e32 v0, v1 -; MOVREL-NEXT: ; return to shader part epilog +; GCN-LABEL: dyn_extract_v7f32_s_v: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GCN-NEXT: ; return to shader part epilog entry: %ext = extractelement <7 x float> %vec, i32 %sel ret float %ext } define float @dyn_extract_v7f32_v_v(<7 x float> %vec, i32 %sel) { -; GPRIDX-LABEL: dyn_extract_v7f32_v_v: -; GPRIDX: ; %bb.0: 
; %entry -; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GPRIDX-NEXT: s_mov_b64 s[4:5], exec -; GPRIDX-NEXT: BB38_1: ; =>This Inner Loop Header: Depth=1 -; GPRIDX-NEXT: v_readfirstlane_b32 s6, v7 -; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7 -; GPRIDX-NEXT: s_set_gpr_idx_on s6, gpr_idx(SRC0) -; GPRIDX-NEXT: v_mov_b32_e32 v8, v0 -; GPRIDX-NEXT: s_set_gpr_idx_off -; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc -; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc -; GPRIDX-NEXT: s_cbranch_execnz BB38_1 -; GPRIDX-NEXT: ; %bb.2: -; GPRIDX-NEXT: s_mov_b64 exec, s[4:5] -; GPRIDX-NEXT: v_mov_b32_e32 v0, v8 -; GPRIDX-NEXT: s_setpc_b64 s[30:31] -; -; MOVREL-LABEL: dyn_extract_v7f32_v_v: -; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; MOVREL-NEXT: s_mov_b64 s[4:5], exec -; MOVREL-NEXT: BB38_1: ; =>This Inner Loop Header: Depth=1 -; MOVREL-NEXT: v_readfirstlane_b32 s6, v7 -; MOVREL-NEXT: s_mov_b32 m0, s6 -; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7 -; MOVREL-NEXT: v_movrels_b32_e32 v8, v0 -; MOVREL-NEXT: s_and_saveexec_b64 vcc, vcc -; MOVREL-NEXT: s_xor_b64 exec, exec, vcc -; MOVREL-NEXT: s_cbranch_execnz BB38_1 -; MOVREL-NEXT: ; %bb.2: -; MOVREL-NEXT: s_mov_b64 exec, s[4:5] -; MOVREL-NEXT: v_mov_b32_e32 v0, v8 -; MOVREL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: dyn_extract_v7f32_v_v: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v7 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v7 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v7 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 6, v7 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] entry: %ext = extractelement <7 x float> %vec, i32 %sel ret float %ext } define amdgpu_ps float @dyn_extract_v7f32_v_s(<7 x float> %vec, i32 inreg %sel) { -; GPRIDX-LABEL: dyn_extract_v7f32_v_s: -; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0) -; GPRIDX-NEXT: v_mov_b32_e32 v0, v0 -; GPRIDX-NEXT: s_set_gpr_idx_off -; GPRIDX-NEXT: ; return to shader part epilog -; -; MOVREL-LABEL: dyn_extract_v7f32_v_s: -; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: s_mov_b32 m0, s2 -; MOVREL-NEXT: v_movrels_b32_e32 v0, v0 -; MOVREL-NEXT: ; return to shader part epilog +; GCN-LABEL: dyn_extract_v7f32_v_s: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_cmp_eq_u32 s2, 1 +; GCN-NEXT: s_cselect_b32 s0, 1, 0 +; GCN-NEXT: s_and_b32 s0, 1, s0 +; GCN-NEXT: s_cmp_eq_u32 s2, 2 +; GCN-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GCN-NEXT: s_cselect_b32 s0, 1, 0 +; GCN-NEXT: s_and_b32 s0, 1, s0 +; GCN-NEXT: s_cmp_eq_u32 s2, 3 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GCN-NEXT: s_cselect_b32 s0, 1, 0 +; GCN-NEXT: s_and_b32 s0, 1, s0 +; GCN-NEXT: s_cmp_eq_u32 s2, 4 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GCN-NEXT: s_cselect_b32 s0, 1, 0 +; GCN-NEXT: s_and_b32 s0, 1, s0 +; GCN-NEXT: s_cmp_eq_u32 s2, 5 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GCN-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GCN-NEXT: s_cselect_b32 s0, 1, 0 +; GCN-NEXT: s_and_b32 s0, 1, s0 +; GCN-NEXT: s_cmp_eq_u32 s2, 6 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GCN-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; 
GCN-NEXT: s_cselect_b32 s0, 1, 0 +; GCN-NEXT: s_and_b32 s0, 1, s0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GCN-NEXT: ; return to shader part epilog entry: %ext = extractelement <7 x float> %vec, i32 %sel ret float %ext } define amdgpu_ps float @dyn_extract_v7f32_s_s(<7 x float> inreg %vec, i32 inreg %sel) { -; GPRIDX-LABEL: dyn_extract_v7f32_s_s: -; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_mov_b32 s0, s2 -; GPRIDX-NEXT: s_mov_b32 m0, s9 -; GPRIDX-NEXT: s_mov_b32 s1, s3 -; GPRIDX-NEXT: s_mov_b32 s2, s4 -; GPRIDX-NEXT: s_mov_b32 s3, s5 -; GPRIDX-NEXT: s_mov_b32 s4, s6 -; GPRIDX-NEXT: s_mov_b32 s5, s7 -; GPRIDX-NEXT: s_mov_b32 s6, s8 -; GPRIDX-NEXT: s_movrels_b32 s0, s0 -; GPRIDX-NEXT: v_mov_b32_e32 v0, s0 -; GPRIDX-NEXT: ; return to shader part epilog -; -; MOVREL-LABEL: dyn_extract_v7f32_s_s: -; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: s_mov_b32 s0, s2 -; MOVREL-NEXT: s_mov_b32 m0, s9 -; MOVREL-NEXT: s_mov_b32 s1, s3 -; MOVREL-NEXT: s_mov_b32 s2, s4 -; MOVREL-NEXT: s_mov_b32 s3, s5 -; MOVREL-NEXT: s_mov_b32 s4, s6 -; MOVREL-NEXT: s_mov_b32 s5, s7 -; MOVREL-NEXT: s_mov_b32 s6, s8 -; MOVREL-NEXT: s_movrels_b32 s0, s0 -; MOVREL-NEXT: v_mov_b32_e32 v0, s0 -; MOVREL-NEXT: ; return to shader part epilog +; GCN-LABEL: dyn_extract_v7f32_s_s: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_cmp_eq_u32 s9, 1 +; GCN-NEXT: s_cselect_b32 s0, 1, 0 +; GCN-NEXT: s_and_b32 s0, s0, 1 +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_cselect_b32 s0, s3, s2 +; GCN-NEXT: s_cmp_eq_u32 s9, 2 +; GCN-NEXT: s_cselect_b32 s1, 1, 0 +; GCN-NEXT: s_and_b32 s1, s1, 1 +; GCN-NEXT: s_cmp_lg_u32 s1, 0 +; GCN-NEXT: s_cselect_b32 s0, s4, s0 +; GCN-NEXT: s_cmp_eq_u32 s9, 3 +; GCN-NEXT: s_cselect_b32 s1, 1, 0 +; GCN-NEXT: s_and_b32 s1, s1, 1 +; GCN-NEXT: s_cmp_lg_u32 s1, 0 +; GCN-NEXT: s_cselect_b32 s0, s5, s0 +; GCN-NEXT: s_cmp_eq_u32 s9, 4 +; GCN-NEXT: s_cselect_b32 s1, 1, 0 +; GCN-NEXT: s_and_b32 s1, s1, 1 +; GCN-NEXT: s_cmp_lg_u32 s1, 0 +; GCN-NEXT: s_cselect_b32 s0, s6, s0 +; GCN-NEXT: s_cmp_eq_u32 s9, 5 +; GCN-NEXT: s_cselect_b32 s1, 1, 0 +; GCN-NEXT: s_and_b32 s1, s1, 1 +; GCN-NEXT: s_cmp_lg_u32 s1, 0 +; GCN-NEXT: s_cselect_b32 s0, s7, s0 +; GCN-NEXT: s_cmp_eq_u32 s9, 6 +; GCN-NEXT: s_cselect_b32 s1, 1, 0 +; GCN-NEXT: s_and_b32 s1, s1, 1 +; GCN-NEXT: s_cmp_lg_u32 s1, 0 +; GCN-NEXT: s_cselect_b32 s0, s8, s0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: ; return to shader part epilog entry: %ext = extractelement <7 x float> %vec, i32 %sel ret float %ext } define amdgpu_ps double @dyn_extract_v6f64_s_v(<6 x double> inreg %vec, i32 %sel) { -; GPRIDX-LABEL: dyn_extract_v6f64_s_v: -; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_mov_b32 s16, s2 -; GPRIDX-NEXT: s_mov_b32 s17, s3 -; GPRIDX-NEXT: s_mov_b32 s18, s4 -; GPRIDX-NEXT: s_mov_b32 s19, s5 -; GPRIDX-NEXT: s_mov_b32 s20, s6 -; GPRIDX-NEXT: s_mov_b32 s21, s7 -; GPRIDX-NEXT: s_mov_b32 s22, s8 -; GPRIDX-NEXT: s_mov_b32 s23, s9 -; GPRIDX-NEXT: s_mov_b32 s24, s10 -; GPRIDX-NEXT: s_mov_b32 s25, s11 -; GPRIDX-NEXT: s_mov_b32 s26, s12 -; GPRIDX-NEXT: s_mov_b32 s27, s13 -; GPRIDX-NEXT: s_mov_b64 s[2:3], exec -; GPRIDX-NEXT: BB41_1: ; =>This Inner Loop Header: Depth=1 -; GPRIDX-NEXT: v_readfirstlane_b32 s0, v0 -; GPRIDX-NEXT: s_lshl_b32 m0, s0, 1 -; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 -; GPRIDX-NEXT: s_movrels_b32 s0, s16 -; GPRIDX-NEXT: s_movrels_b32 s1, s17 -; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc -; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc -; GPRIDX-NEXT: s_cbranch_execnz BB41_1 -; 
GPRIDX-NEXT: ; %bb.2: -; GPRIDX-NEXT: s_mov_b64 exec, s[2:3] -; GPRIDX-NEXT: ; return to shader part epilog -; -; MOVREL-LABEL: dyn_extract_v6f64_s_v: -; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: s_mov_b32 s16, s2 -; MOVREL-NEXT: s_mov_b32 s17, s3 -; MOVREL-NEXT: s_mov_b32 s18, s4 -; MOVREL-NEXT: s_mov_b32 s19, s5 -; MOVREL-NEXT: s_mov_b32 s20, s6 -; MOVREL-NEXT: s_mov_b32 s21, s7 -; MOVREL-NEXT: s_mov_b32 s22, s8 -; MOVREL-NEXT: s_mov_b32 s23, s9 -; MOVREL-NEXT: s_mov_b32 s24, s10 -; MOVREL-NEXT: s_mov_b32 s25, s11 -; MOVREL-NEXT: s_mov_b32 s26, s12 -; MOVREL-NEXT: s_mov_b32 s27, s13 -; MOVREL-NEXT: s_mov_b64 s[2:3], exec -; MOVREL-NEXT: BB41_1: ; =>This Inner Loop Header: Depth=1 -; MOVREL-NEXT: v_readfirstlane_b32 s0, v0 -; MOVREL-NEXT: s_lshl_b32 m0, s0, 1 -; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 -; MOVREL-NEXT: s_movrels_b32 s0, s16 -; MOVREL-NEXT: s_movrels_b32 s1, s17 -; MOVREL-NEXT: s_and_saveexec_b64 vcc, vcc -; MOVREL-NEXT: s_xor_b64 exec, exec, vcc -; MOVREL-NEXT: s_cbranch_execnz BB41_1 -; MOVREL-NEXT: ; %bb.2: -; MOVREL-NEXT: s_mov_b64 exec, s[2:3] -; MOVREL-NEXT: ; return to shader part epilog +; GCN-LABEL: dyn_extract_v6f64_s_v: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_mov_b32 s16, s2 +; GCN-NEXT: s_mov_b32 s17, s3 +; GCN-NEXT: s_mov_b32 s18, s4 +; GCN-NEXT: s_mov_b32 s19, s5 +; GCN-NEXT: s_mov_b32 s20, s6 +; GCN-NEXT: s_mov_b32 s21, s7 +; GCN-NEXT: s_mov_b32 s22, s8 +; GCN-NEXT: s_mov_b32 s23, s9 +; GCN-NEXT: s_mov_b32 s24, s10 +; GCN-NEXT: s_mov_b32 s25, s11 +; GCN-NEXT: s_mov_b32 s26, s12 +; GCN-NEXT: s_mov_b32 s27, s13 +; GCN-NEXT: s_mov_b64 s[2:3], exec +; GCN-NEXT: BB41_1: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: s_lshl_b32 m0, s0, 1 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 +; GCN-NEXT: s_movrels_b32 s0, s16 +; GCN-NEXT: s_movrels_b32 s1, s17 +; GCN-NEXT: s_and_saveexec_b64 vcc, vcc +; GCN-NEXT: s_xor_b64 exec, exec, vcc +; GCN-NEXT: s_cbranch_execnz BB41_1 +; GCN-NEXT: ; %bb.2: +; GCN-NEXT: s_mov_b64 exec, s[2:3] +; GCN-NEXT: ; return to shader part epilog entry: %ext = extractelement <6 x double> %vec, i32 %sel ret double %ext @@ -1934,106 +1615,58 @@ } define amdgpu_ps double @dyn_extract_v6f64_s_s(<6 x double> inreg %vec, i32 inreg %sel) { -; GPRIDX-LABEL: dyn_extract_v6f64_s_s: -; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_mov_b32 s0, s2 -; GPRIDX-NEXT: s_mov_b32 s1, s3 -; GPRIDX-NEXT: s_mov_b32 m0, s14 -; GPRIDX-NEXT: s_mov_b32 s2, s4 -; GPRIDX-NEXT: s_mov_b32 s3, s5 -; GPRIDX-NEXT: s_mov_b32 s4, s6 -; GPRIDX-NEXT: s_mov_b32 s5, s7 -; GPRIDX-NEXT: s_mov_b32 s6, s8 -; GPRIDX-NEXT: s_mov_b32 s7, s9 -; GPRIDX-NEXT: s_mov_b32 s8, s10 -; GPRIDX-NEXT: s_mov_b32 s9, s11 -; GPRIDX-NEXT: s_mov_b32 s10, s12 -; GPRIDX-NEXT: s_mov_b32 s11, s13 -; GPRIDX-NEXT: s_movrels_b64 s[0:1], s[0:1] -; GPRIDX-NEXT: ; return to shader part epilog -; -; MOVREL-LABEL: dyn_extract_v6f64_s_s: -; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: s_mov_b32 s0, s2 -; MOVREL-NEXT: s_mov_b32 s1, s3 -; MOVREL-NEXT: s_mov_b32 m0, s14 -; MOVREL-NEXT: s_mov_b32 s2, s4 -; MOVREL-NEXT: s_mov_b32 s3, s5 -; MOVREL-NEXT: s_mov_b32 s4, s6 -; MOVREL-NEXT: s_mov_b32 s5, s7 -; MOVREL-NEXT: s_mov_b32 s6, s8 -; MOVREL-NEXT: s_mov_b32 s7, s9 -; MOVREL-NEXT: s_mov_b32 s8, s10 -; MOVREL-NEXT: s_mov_b32 s9, s11 -; MOVREL-NEXT: s_mov_b32 s10, s12 -; MOVREL-NEXT: s_mov_b32 s11, s13 -; MOVREL-NEXT: s_movrels_b64 s[0:1], s[0:1] -; MOVREL-NEXT: ; return to shader part epilog +; GCN-LABEL: dyn_extract_v6f64_s_s: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_mov_b32 s0, s2 +; 
GCN-NEXT: s_mov_b32 s1, s3 +; GCN-NEXT: s_mov_b32 m0, s14 +; GCN-NEXT: s_mov_b32 s2, s4 +; GCN-NEXT: s_mov_b32 s3, s5 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s7 +; GCN-NEXT: s_mov_b32 s6, s8 +; GCN-NEXT: s_mov_b32 s7, s9 +; GCN-NEXT: s_mov_b32 s8, s10 +; GCN-NEXT: s_mov_b32 s9, s11 +; GCN-NEXT: s_mov_b32 s10, s12 +; GCN-NEXT: s_mov_b32 s11, s13 +; GCN-NEXT: s_movrels_b64 s[0:1], s[0:1] +; GCN-NEXT: ; return to shader part epilog entry: %ext = extractelement <6 x double> %vec, i32 %sel ret double %ext } define amdgpu_ps double @dyn_extract_v7f64_s_v(<7 x double> inreg %vec, i32 %sel) { -; GPRIDX-LABEL: dyn_extract_v7f64_s_v: -; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_mov_b32 s16, s2 -; GPRIDX-NEXT: s_mov_b32 s17, s3 -; GPRIDX-NEXT: s_mov_b32 s18, s4 -; GPRIDX-NEXT: s_mov_b32 s19, s5 -; GPRIDX-NEXT: s_mov_b32 s20, s6 -; GPRIDX-NEXT: s_mov_b32 s21, s7 -; GPRIDX-NEXT: s_mov_b32 s22, s8 -; GPRIDX-NEXT: s_mov_b32 s23, s9 -; GPRIDX-NEXT: s_mov_b32 s24, s10 -; GPRIDX-NEXT: s_mov_b32 s25, s11 -; GPRIDX-NEXT: s_mov_b32 s26, s12 -; GPRIDX-NEXT: s_mov_b32 s27, s13 -; GPRIDX-NEXT: s_mov_b32 s28, s14 -; GPRIDX-NEXT: s_mov_b32 s29, s15 -; GPRIDX-NEXT: s_mov_b64 s[2:3], exec -; GPRIDX-NEXT: BB45_1: ; =>This Inner Loop Header: Depth=1 -; GPRIDX-NEXT: v_readfirstlane_b32 s0, v0 -; GPRIDX-NEXT: s_lshl_b32 m0, s0, 1 -; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 -; GPRIDX-NEXT: s_movrels_b32 s0, s16 -; GPRIDX-NEXT: s_movrels_b32 s1, s17 -; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc -; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc -; GPRIDX-NEXT: s_cbranch_execnz BB45_1 -; GPRIDX-NEXT: ; %bb.2: -; GPRIDX-NEXT: s_mov_b64 exec, s[2:3] -; GPRIDX-NEXT: ; return to shader part epilog -; -; MOVREL-LABEL: dyn_extract_v7f64_s_v: -; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: s_mov_b32 s16, s2 -; MOVREL-NEXT: s_mov_b32 s17, s3 -; MOVREL-NEXT: s_mov_b32 s18, s4 -; MOVREL-NEXT: s_mov_b32 s19, s5 -; MOVREL-NEXT: s_mov_b32 s20, s6 -; MOVREL-NEXT: s_mov_b32 s21, s7 -; MOVREL-NEXT: s_mov_b32 s22, s8 -; MOVREL-NEXT: s_mov_b32 s23, s9 -; MOVREL-NEXT: s_mov_b32 s24, s10 -; MOVREL-NEXT: s_mov_b32 s25, s11 -; MOVREL-NEXT: s_mov_b32 s26, s12 -; MOVREL-NEXT: s_mov_b32 s27, s13 -; MOVREL-NEXT: s_mov_b32 s28, s14 -; MOVREL-NEXT: s_mov_b32 s29, s15 -; MOVREL-NEXT: s_mov_b64 s[2:3], exec -; MOVREL-NEXT: BB45_1: ; =>This Inner Loop Header: Depth=1 -; MOVREL-NEXT: v_readfirstlane_b32 s0, v0 -; MOVREL-NEXT: s_lshl_b32 m0, s0, 1 -; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 -; MOVREL-NEXT: s_movrels_b32 s0, s16 -; MOVREL-NEXT: s_movrels_b32 s1, s17 -; MOVREL-NEXT: s_and_saveexec_b64 vcc, vcc -; MOVREL-NEXT: s_xor_b64 exec, exec, vcc -; MOVREL-NEXT: s_cbranch_execnz BB45_1 -; MOVREL-NEXT: ; %bb.2: -; MOVREL-NEXT: s_mov_b64 exec, s[2:3] -; MOVREL-NEXT: ; return to shader part epilog +; GCN-LABEL: dyn_extract_v7f64_s_v: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_mov_b32 s16, s2 +; GCN-NEXT: s_mov_b32 s17, s3 +; GCN-NEXT: s_mov_b32 s18, s4 +; GCN-NEXT: s_mov_b32 s19, s5 +; GCN-NEXT: s_mov_b32 s20, s6 +; GCN-NEXT: s_mov_b32 s21, s7 +; GCN-NEXT: s_mov_b32 s22, s8 +; GCN-NEXT: s_mov_b32 s23, s9 +; GCN-NEXT: s_mov_b32 s24, s10 +; GCN-NEXT: s_mov_b32 s25, s11 +; GCN-NEXT: s_mov_b32 s26, s12 +; GCN-NEXT: s_mov_b32 s27, s13 +; GCN-NEXT: s_mov_b32 s28, s14 +; GCN-NEXT: s_mov_b32 s29, s15 +; GCN-NEXT: s_mov_b64 s[2:3], exec +; GCN-NEXT: BB45_1: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: s_lshl_b32 m0, s0, 1 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 +; GCN-NEXT: s_movrels_b32 s0, s16 +; GCN-NEXT: 
s_movrels_b32 s1, s17 +; GCN-NEXT: s_and_saveexec_b64 vcc, vcc +; GCN-NEXT: s_xor_b64 exec, exec, vcc +; GCN-NEXT: s_cbranch_execnz BB45_1 +; GCN-NEXT: ; %bb.2: +; GCN-NEXT: s_mov_b64 exec, s[2:3] +; GCN-NEXT: ; return to shader part epilog entry: %ext = extractelement <7 x double> %vec, i32 %sel ret double %ext @@ -2110,46 +1743,496 @@ } define amdgpu_ps double @dyn_extract_v7f64_s_s(<7 x double> inreg %vec, i32 inreg %sel) { -; GPRIDX-LABEL: dyn_extract_v7f64_s_s: +; GCN-LABEL: dyn_extract_v7f64_s_s: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_mov_b32 s0, s2 +; GCN-NEXT: s_mov_b32 s1, s3 +; GCN-NEXT: s_mov_b32 m0, s16 +; GCN-NEXT: s_mov_b32 s2, s4 +; GCN-NEXT: s_mov_b32 s3, s5 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s7 +; GCN-NEXT: s_mov_b32 s6, s8 +; GCN-NEXT: s_mov_b32 s7, s9 +; GCN-NEXT: s_mov_b32 s8, s10 +; GCN-NEXT: s_mov_b32 s9, s11 +; GCN-NEXT: s_mov_b32 s10, s12 +; GCN-NEXT: s_mov_b32 s11, s13 +; GCN-NEXT: s_mov_b32 s12, s14 +; GCN-NEXT: s_mov_b32 s13, s15 +; GCN-NEXT: s_movrels_b64 s[0:1], s[0:1] +; GCN-NEXT: ; return to shader part epilog +entry: + %ext = extractelement <7 x double> %vec, i32 %sel + ret double %ext +} + +define amdgpu_kernel void @dyn_extract_v5f64_s_s(double addrspace(1)* %out, i32 %sel) { +; GPRIDX-LABEL: dyn_extract_v5f64_s_s: +; GPRIDX: .amd_kernel_code_t +; GPRIDX-NEXT: amd_code_version_major = 1 +; GPRIDX-NEXT: amd_code_version_minor = 2 +; GPRIDX-NEXT: amd_machine_kind = 1 +; GPRIDX-NEXT: amd_machine_version_major = 9 +; GPRIDX-NEXT: amd_machine_version_minor = 0 +; GPRIDX-NEXT: amd_machine_version_stepping = 0 +; GPRIDX-NEXT: kernel_code_entry_byte_offset = 256 +; GPRIDX-NEXT: kernel_code_prefetch_byte_size = 0 +; GPRIDX-NEXT: granulated_workitem_vgpr_count = 0 +; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 1 +; GPRIDX-NEXT: priority = 0 +; GPRIDX-NEXT: float_mode = 240 +; GPRIDX-NEXT: priv = 0 +; GPRIDX-NEXT: enable_dx10_clamp = 1 +; GPRIDX-NEXT: debug_mode = 0 +; GPRIDX-NEXT: enable_ieee_mode = 1 +; GPRIDX-NEXT: enable_wgp_mode = 0 +; GPRIDX-NEXT: enable_mem_ordered = 0 +; GPRIDX-NEXT: enable_fwd_progress = 0 +; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; GPRIDX-NEXT: user_sgpr_count = 6 +; GPRIDX-NEXT: enable_trap_handler = 0 +; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1 +; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 0 +; GPRIDX-NEXT: enable_sgpr_workgroup_id_z = 0 +; GPRIDX-NEXT: enable_sgpr_workgroup_info = 0 +; GPRIDX-NEXT: enable_vgpr_workitem_id = 0 +; GPRIDX-NEXT: enable_exception_msb = 0 +; GPRIDX-NEXT: granulated_lds_size = 0 +; GPRIDX-NEXT: enable_exception = 0 +; GPRIDX-NEXT: enable_sgpr_private_segment_buffer = 1 +; GPRIDX-NEXT: enable_sgpr_dispatch_ptr = 0 +; GPRIDX-NEXT: enable_sgpr_queue_ptr = 0 +; GPRIDX-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; GPRIDX-NEXT: enable_sgpr_dispatch_id = 0 +; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 0 +; GPRIDX-NEXT: enable_sgpr_private_segment_size = 0 +; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; GPRIDX-NEXT: enable_wavefront_size32 = 0 +; GPRIDX-NEXT: enable_ordered_append_gds = 0 +; GPRIDX-NEXT: private_element_size = 1 +; GPRIDX-NEXT: is_ptr64 = 1 +; GPRIDX-NEXT: is_dynamic_callstack = 0 +; GPRIDX-NEXT: is_debug_enabled = 0 +; GPRIDX-NEXT: is_xnack_enabled = 0 +; GPRIDX-NEXT: workitem_private_segment_byte_size = 0 +; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0 +; GPRIDX-NEXT: gds_segment_byte_size = 0 +; GPRIDX-NEXT: 
kernarg_segment_byte_size = 28 +; GPRIDX-NEXT: workgroup_fbarrier_count = 0 +; GPRIDX-NEXT: wavefront_sgpr_count = 10 +; GPRIDX-NEXT: workitem_vgpr_count = 4 +; GPRIDX-NEXT: reserved_vgpr_first = 0 +; GPRIDX-NEXT: reserved_vgpr_count = 0 +; GPRIDX-NEXT: reserved_sgpr_first = 0 +; GPRIDX-NEXT: reserved_sgpr_count = 0 +; GPRIDX-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; GPRIDX-NEXT: debug_private_segment_buffer_sgpr = 0 +; GPRIDX-NEXT: kernarg_segment_alignment = 4 +; GPRIDX-NEXT: group_segment_alignment = 4 +; GPRIDX-NEXT: private_segment_alignment = 4 +; GPRIDX-NEXT: wavefront_size = 6 +; GPRIDX-NEXT: call_convention = -1 +; GPRIDX-NEXT: runtime_loader_kernel_symbol = 0 +; GPRIDX-NEXT: .end_amd_kernel_code_t +; GPRIDX-NEXT: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GPRIDX-NEXT: s_load_dword s8, s[4:5], 0x8 +; GPRIDX-NEXT: s_mov_b32 s0, 0 +; GPRIDX-NEXT: s_mov_b32 s3, 0x40080000 +; GPRIDX-NEXT: s_mov_b32 s2, s0 +; GPRIDX-NEXT: s_mov_b32 s1, 0x40140000 +; GPRIDX-NEXT: s_waitcnt lgkmcnt(0) +; GPRIDX-NEXT: s_cmp_eq_u32 s8, 1 +; GPRIDX-NEXT: s_cselect_b32 s4, 1, 0 +; GPRIDX-NEXT: s_and_b32 s4, s4, 1 +; GPRIDX-NEXT: s_cmp_lg_u32 s4, 0 +; GPRIDX-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0 +; GPRIDX-NEXT: s_cmp_eq_u32 s8, 2 +; GPRIDX-NEXT: s_cselect_b32 s9, 1, 0 +; GPRIDX-NEXT: s_and_b32 s9, s9, 1 +; GPRIDX-NEXT: s_cmp_lg_u32 s9, 0 +; GPRIDX-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] +; GPRIDX-NEXT: s_cmp_eq_u32 s8, 3 +; GPRIDX-NEXT: s_cselect_b32 s4, 1, 0 +; GPRIDX-NEXT: s_and_b32 s4, s4, 1 +; GPRIDX-NEXT: s_cmp_lg_u32 s4, 0 +; GPRIDX-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3] +; GPRIDX-NEXT: s_cmp_eq_u32 s8, 4 +; GPRIDX-NEXT: s_cselect_b32 s4, 1, 0 +; GPRIDX-NEXT: s_and_b32 s4, s4, 1 +; GPRIDX-NEXT: s_cmp_lg_u32 s4, 0 +; GPRIDX-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] +; GPRIDX-NEXT: v_mov_b32_e32 v0, s0 +; GPRIDX-NEXT: v_mov_b32_e32 v2, s6 +; GPRIDX-NEXT: v_mov_b32_e32 v1, s1 +; GPRIDX-NEXT: v_mov_b32_e32 v3, s7 +; GPRIDX-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GPRIDX-NEXT: s_endpgm +; +; MOVREL-LABEL: dyn_extract_v5f64_s_s: +; MOVREL: .amd_kernel_code_t +; MOVREL-NEXT: amd_code_version_major = 1 +; MOVREL-NEXT: amd_code_version_minor = 2 +; MOVREL-NEXT: amd_machine_kind = 1 +; MOVREL-NEXT: amd_machine_version_major = 8 +; MOVREL-NEXT: amd_machine_version_minor = 0 +; MOVREL-NEXT: amd_machine_version_stepping = 3 +; MOVREL-NEXT: kernel_code_entry_byte_offset = 256 +; MOVREL-NEXT: kernel_code_prefetch_byte_size = 0 +; MOVREL-NEXT: granulated_workitem_vgpr_count = 0 +; MOVREL-NEXT: granulated_wavefront_sgpr_count = 1 +; MOVREL-NEXT: priority = 0 +; MOVREL-NEXT: float_mode = 240 +; MOVREL-NEXT: priv = 0 +; MOVREL-NEXT: enable_dx10_clamp = 1 +; MOVREL-NEXT: debug_mode = 0 +; MOVREL-NEXT: enable_ieee_mode = 1 +; MOVREL-NEXT: enable_wgp_mode = 0 +; MOVREL-NEXT: enable_mem_ordered = 0 +; MOVREL-NEXT: enable_fwd_progress = 0 +; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MOVREL-NEXT: user_sgpr_count = 6 +; MOVREL-NEXT: enable_trap_handler = 0 +; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1 +; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 0 +; MOVREL-NEXT: enable_sgpr_workgroup_id_z = 0 +; MOVREL-NEXT: enable_sgpr_workgroup_info = 0 +; MOVREL-NEXT: enable_vgpr_workitem_id = 0 +; MOVREL-NEXT: enable_exception_msb = 0 +; MOVREL-NEXT: granulated_lds_size = 0 +; MOVREL-NEXT: enable_exception = 0 +; MOVREL-NEXT: enable_sgpr_private_segment_buffer = 1 +; MOVREL-NEXT: enable_sgpr_dispatch_ptr = 0 +; MOVREL-NEXT: enable_sgpr_queue_ptr = 0 +; 
MOVREL-NEXT: enable_sgpr_kernarg_segment_ptr = 1
+; MOVREL-NEXT: enable_sgpr_dispatch_id = 0
+; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 0
+; MOVREL-NEXT: enable_sgpr_private_segment_size = 0
+; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_x = 0
+; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_y = 0
+; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_z = 0
+; MOVREL-NEXT: enable_wavefront_size32 = 0
+; MOVREL-NEXT: enable_ordered_append_gds = 0
+; MOVREL-NEXT: private_element_size = 1
+; MOVREL-NEXT: is_ptr64 = 1
+; MOVREL-NEXT: is_dynamic_callstack = 0
+; MOVREL-NEXT: is_debug_enabled = 0
+; MOVREL-NEXT: is_xnack_enabled = 0
+; MOVREL-NEXT: workitem_private_segment_byte_size = 0
+; MOVREL-NEXT: workgroup_group_segment_byte_size = 0
+; MOVREL-NEXT: gds_segment_byte_size = 0
+; MOVREL-NEXT: kernarg_segment_byte_size = 28
+; MOVREL-NEXT: workgroup_fbarrier_count = 0
+; MOVREL-NEXT: wavefront_sgpr_count = 10
+; MOVREL-NEXT: workitem_vgpr_count = 4
+; MOVREL-NEXT: reserved_vgpr_first = 0
+; MOVREL-NEXT: reserved_vgpr_count = 0
+; MOVREL-NEXT: reserved_sgpr_first = 0
+; MOVREL-NEXT: reserved_sgpr_count = 0
+; MOVREL-NEXT: debug_wavefront_private_segment_offset_sgpr = 0
+; MOVREL-NEXT: debug_private_segment_buffer_sgpr = 0
+; MOVREL-NEXT: kernarg_segment_alignment = 4
+; MOVREL-NEXT: group_segment_alignment = 4
+; MOVREL-NEXT: private_segment_alignment = 4
+; MOVREL-NEXT: wavefront_size = 6
+; MOVREL-NEXT: call_convention = -1
+; MOVREL-NEXT: runtime_loader_kernel_symbol = 0
+; MOVREL-NEXT: .end_amd_kernel_code_t
+; MOVREL-NEXT: ; %bb.0: ; %entry
+; MOVREL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; MOVREL-NEXT: s_load_dword s8, s[4:5], 0x8
+; MOVREL-NEXT: s_mov_b32 s0, 0
+; MOVREL-NEXT: s_mov_b32 s3, 0x40080000
+; MOVREL-NEXT: s_mov_b32 s2, s0
+; MOVREL-NEXT: s_mov_b32 s1, 0x40140000
+; MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; MOVREL-NEXT: s_cmp_eq_u32 s8, 1
+; MOVREL-NEXT: s_cselect_b32 s4, 1, 0
+; MOVREL-NEXT: s_and_b32 s4, s4, 1
+; MOVREL-NEXT: s_cmp_lg_u32 s4, 0
+; MOVREL-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0
+; MOVREL-NEXT: s_cmp_eq_u32 s8, 2
+; MOVREL-NEXT: s_cselect_b32 s9, 1, 0
+; MOVREL-NEXT: s_and_b32 s9, s9, 1
+; MOVREL-NEXT: s_cmp_lg_u32 s9, 0
+; MOVREL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; MOVREL-NEXT: s_cmp_eq_u32 s8, 3
+; MOVREL-NEXT: s_cselect_b32 s4, 1, 0
+; MOVREL-NEXT: s_and_b32 s4, s4, 1
+; MOVREL-NEXT: s_cmp_lg_u32 s4, 0
+; MOVREL-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3]
+; MOVREL-NEXT: s_cmp_eq_u32 s8, 4
+; MOVREL-NEXT: s_cselect_b32 s4, 1, 0
+; MOVREL-NEXT: s_and_b32 s4, s4, 1
+; MOVREL-NEXT: s_cmp_lg_u32 s4, 0
+; MOVREL-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
+; MOVREL-NEXT: v_mov_b32_e32 v0, s0
+; MOVREL-NEXT: v_mov_b32_e32 v2, s6
+; MOVREL-NEXT: v_mov_b32_e32 v1, s1
+; MOVREL-NEXT: v_mov_b32_e32 v3, s7
+; MOVREL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; MOVREL-NEXT: s_endpgm
+entry:
+  %ext = extractelement <5 x double> <double 1.0, double 2.0, double 3.0, double 4.0, double 5.0>, i32 %sel
+  store double %ext, double addrspace(1)* %out
+  ret void
+}
+
+define float @dyn_extract_v15f32_const_s_v(i32 %sel) {
+; GCN-LABEL: dyn_extract_v15f32_const_s_v:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s18, 0x41700000
+; GCN-NEXT: s_mov_b32 s17, 0x41600000
+; GCN-NEXT: s_mov_b32 s16, 0x41500000
+; GCN-NEXT: s_mov_b32 s15, 0x41400000
+; GCN-NEXT: s_mov_b32 s14, 0x41300000
+; GCN-NEXT: s_mov_b32 s13, 0x41200000
+; GCN-NEXT: s_mov_b32 s12, 0x41100000
+; GCN-NEXT: s_mov_b32 s11, 0x41000000
+; GCN-NEXT: s_mov_b32 s10, 0x40e00000
+; GCN-NEXT: s_mov_b32 s9, 0x40c00000
+; GCN-NEXT:
s_mov_b32 s8, 0x40a00000 +; GCN-NEXT: s_mov_b32 s7, 4.0 +; GCN-NEXT: s_mov_b32 s6, 0x40400000 +; GCN-NEXT: s_mov_b32 s5, 2.0 +; GCN-NEXT: s_mov_b32 s4, 1.0 +; GCN-NEXT: s_mov_b64 s[20:21], exec +; GCN-NEXT: BB50_1: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: v_readfirstlane_b32 s19, v0 +; GCN-NEXT: s_mov_b32 m0, s19 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s19, v0 +; GCN-NEXT: s_movrels_b32 s19, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s19 +; GCN-NEXT: s_and_saveexec_b64 vcc, vcc +; GCN-NEXT: s_xor_b64 exec, exec, vcc +; GCN-NEXT: s_cbranch_execnz BB50_1 +; GCN-NEXT: ; %bb.2: +; GCN-NEXT: s_mov_b64 exec, s[20:21] +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] +entry: + %ext = extractelement <15 x float> , i32 %sel + ret float %ext +} + +define amdgpu_ps float @dyn_extract_v15f32_const_s_s(i32 inreg %sel) { +; GCN-LABEL: dyn_extract_v15f32_const_s_s: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_mov_b32 s4, 1.0 +; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: s_mov_b32 s18, 0x41700000 +; GCN-NEXT: s_mov_b32 s17, 0x41600000 +; GCN-NEXT: s_mov_b32 s16, 0x41500000 +; GCN-NEXT: s_mov_b32 s15, 0x41400000 +; GCN-NEXT: s_mov_b32 s14, 0x41300000 +; GCN-NEXT: s_mov_b32 s13, 0x41200000 +; GCN-NEXT: s_mov_b32 s12, 0x41100000 +; GCN-NEXT: s_mov_b32 s11, 0x41000000 +; GCN-NEXT: s_mov_b32 s10, 0x40e00000 +; GCN-NEXT: s_mov_b32 s9, 0x40c00000 +; GCN-NEXT: s_mov_b32 s8, 0x40a00000 +; GCN-NEXT: s_mov_b32 s7, 4.0 +; GCN-NEXT: s_mov_b32 s6, 0x40400000 +; GCN-NEXT: s_mov_b32 s5, 2.0 +; GCN-NEXT: s_movrels_b32 s0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: ; return to shader part epilog +entry: + %ext = extractelement <15 x float> , i32 %sel + ret float %ext +} + +define amdgpu_ps float @dyn_extract_v15f32_s_v(<15 x float> inreg %vec, i32 %sel) { +; GCN-LABEL: dyn_extract_v15f32_s_v: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_mov_b32 s0, s2 +; GCN-NEXT: s_mov_b32 s1, s3 +; GCN-NEXT: s_mov_b32 s2, s4 +; GCN-NEXT: s_mov_b32 s3, s5 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s7 +; GCN-NEXT: s_mov_b32 s6, s8 +; GCN-NEXT: s_mov_b32 s7, s9 +; GCN-NEXT: s_mov_b32 s8, s10 +; GCN-NEXT: s_mov_b32 s9, s11 +; GCN-NEXT: s_mov_b32 s10, s12 +; GCN-NEXT: s_mov_b32 s11, s13 +; GCN-NEXT: s_mov_b32 s12, s14 +; GCN-NEXT: s_mov_b32 s14, s16 +; GCN-NEXT: s_mov_b32 s13, s15 +; GCN-NEXT: s_mov_b64 s[16:17], exec +; GCN-NEXT: BB52_1: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: v_readfirstlane_b32 s15, v0 +; GCN-NEXT: s_mov_b32 m0, s15 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s15, v0 +; GCN-NEXT: s_movrels_b32 s15, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s15 +; GCN-NEXT: s_and_saveexec_b64 vcc, vcc +; GCN-NEXT: s_xor_b64 exec, exec, vcc +; GCN-NEXT: s_cbranch_execnz BB52_1 +; GCN-NEXT: ; %bb.2: +; GCN-NEXT: s_mov_b64 exec, s[16:17] +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: ; return to shader part epilog +entry: + %ext = extractelement <15 x float> %vec, i32 %sel + ret float %ext +} + +define float @dyn_extract_v15f32_v_v(<15 x float> %vec, i32 %sel) { +; GPRIDX-LABEL: dyn_extract_v15f32_v_v: ; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_mov_b32 s0, s2 -; GPRIDX-NEXT: s_mov_b32 s1, s3 -; GPRIDX-NEXT: s_mov_b32 m0, s16 -; GPRIDX-NEXT: s_mov_b32 s2, s4 -; GPRIDX-NEXT: s_mov_b32 s3, s5 -; GPRIDX-NEXT: s_mov_b32 s4, s6 -; GPRIDX-NEXT: s_mov_b32 s5, s7 -; GPRIDX-NEXT: s_mov_b32 s6, s8 -; GPRIDX-NEXT: s_mov_b32 s7, s9 -; GPRIDX-NEXT: s_mov_b32 s8, s10 -; GPRIDX-NEXT: s_mov_b32 s9, s11 -; GPRIDX-NEXT: s_mov_b32 s10, s12 -; GPRIDX-NEXT: s_mov_b32 s11, s13 -; GPRIDX-NEXT: s_mov_b32 s12, s14 -; GPRIDX-NEXT: s_mov_b32 s13, s15 -; 
GPRIDX-NEXT: s_movrels_b64 s[0:1], s[0:1] +; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GPRIDX-NEXT: s_mov_b64 s[4:5], exec +; GPRIDX-NEXT: BB53_1: ; =>This Inner Loop Header: Depth=1 +; GPRIDX-NEXT: v_readfirstlane_b32 s6, v15 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s6, v15 +; GPRIDX-NEXT: s_set_gpr_idx_on s6, gpr_idx(SRC0) +; GPRIDX-NEXT: v_mov_b32_e32 v16, v0 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc +; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc +; GPRIDX-NEXT: s_cbranch_execnz BB53_1 +; GPRIDX-NEXT: ; %bb.2: +; GPRIDX-NEXT: s_mov_b64 exec, s[4:5] +; GPRIDX-NEXT: v_mov_b32_e32 v0, v16 +; GPRIDX-NEXT: s_setpc_b64 s[30:31] +; +; MOVREL-LABEL: dyn_extract_v15f32_v_v: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MOVREL-NEXT: s_mov_b64 s[4:5], exec +; MOVREL-NEXT: BB53_1: ; =>This Inner Loop Header: Depth=1 +; MOVREL-NEXT: v_readfirstlane_b32 s6, v15 +; MOVREL-NEXT: s_mov_b32 m0, s6 +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s6, v15 +; MOVREL-NEXT: v_movrels_b32_e32 v16, v0 +; MOVREL-NEXT: s_and_saveexec_b64 vcc, vcc +; MOVREL-NEXT: s_xor_b64 exec, exec, vcc +; MOVREL-NEXT: s_cbranch_execnz BB53_1 +; MOVREL-NEXT: ; %bb.2: +; MOVREL-NEXT: s_mov_b64 exec, s[4:5] +; MOVREL-NEXT: v_mov_b32_e32 v0, v16 +; MOVREL-NEXT: s_setpc_b64 s[30:31] +entry: + %ext = extractelement <15 x float> %vec, i32 %sel + ret float %ext +} + +define amdgpu_ps float @dyn_extract_v15f32_v_s(<15 x float> %vec, i32 inreg %sel) { +; GPRIDX-LABEL: dyn_extract_v15f32_v_s: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0) +; GPRIDX-NEXT: v_mov_b32_e32 v0, v0 +; GPRIDX-NEXT: s_set_gpr_idx_off ; GPRIDX-NEXT: ; return to shader part epilog ; -; MOVREL-LABEL: dyn_extract_v7f64_s_s: +; MOVREL-LABEL: dyn_extract_v15f32_v_s: ; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: s_mov_b32 s0, s2 -; MOVREL-NEXT: s_mov_b32 s1, s3 -; MOVREL-NEXT: s_mov_b32 m0, s16 -; MOVREL-NEXT: s_mov_b32 s2, s4 -; MOVREL-NEXT: s_mov_b32 s3, s5 -; MOVREL-NEXT: s_mov_b32 s4, s6 -; MOVREL-NEXT: s_mov_b32 s5, s7 -; MOVREL-NEXT: s_mov_b32 s6, s8 -; MOVREL-NEXT: s_mov_b32 s7, s9 -; MOVREL-NEXT: s_mov_b32 s8, s10 -; MOVREL-NEXT: s_mov_b32 s9, s11 -; MOVREL-NEXT: s_mov_b32 s10, s12 -; MOVREL-NEXT: s_mov_b32 s11, s13 -; MOVREL-NEXT: s_mov_b32 s12, s14 -; MOVREL-NEXT: s_mov_b32 s13, s15 -; MOVREL-NEXT: s_movrels_b64 s[0:1], s[0:1] +; MOVREL-NEXT: s_mov_b32 m0, s2 +; MOVREL-NEXT: v_movrels_b32_e32 v0, v0 ; MOVREL-NEXT: ; return to shader part epilog entry: - %ext = extractelement <7 x double> %vec, i32 %sel - ret double %ext + %ext = extractelement <15 x float> %vec, i32 %sel + ret float %ext +} + +define amdgpu_ps float @dyn_extract_v15f32_s_s(<15 x float> inreg %vec, i32 inreg %sel) { +; GCN-LABEL: dyn_extract_v15f32_s_s: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_mov_b32 s0, s2 +; GCN-NEXT: s_mov_b32 m0, s17 +; GCN-NEXT: s_mov_b32 s1, s3 +; GCN-NEXT: s_mov_b32 s2, s4 +; GCN-NEXT: s_mov_b32 s3, s5 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s7 +; GCN-NEXT: s_mov_b32 s6, s8 +; GCN-NEXT: s_mov_b32 s7, s9 +; GCN-NEXT: s_mov_b32 s8, s10 +; GCN-NEXT: s_mov_b32 s9, s11 +; GCN-NEXT: s_mov_b32 s10, s12 +; GCN-NEXT: s_mov_b32 s11, s13 +; GCN-NEXT: s_mov_b32 s12, s14 +; GCN-NEXT: s_mov_b32 s13, s15 +; GCN-NEXT: s_mov_b32 s14, s16 +; GCN-NEXT: s_movrels_b32 s0, s0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: ; return to shader part epilog +entry: + %ext = extractelement <15 x float> %vec, i32 %sel + ret float %ext +} + +define amdgpu_ps float 
@dyn_extract_v15f32_s_s_offset3(<15 x float> inreg %vec, i32 inreg %sel) { +; GCN-LABEL: dyn_extract_v15f32_s_s_offset3: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_mov_b32 s0, s2 +; GCN-NEXT: s_mov_b32 s1, s3 +; GCN-NEXT: s_mov_b32 s3, s5 +; GCN-NEXT: s_mov_b32 m0, s17 +; GCN-NEXT: s_mov_b32 s2, s4 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s7 +; GCN-NEXT: s_mov_b32 s6, s8 +; GCN-NEXT: s_mov_b32 s7, s9 +; GCN-NEXT: s_mov_b32 s8, s10 +; GCN-NEXT: s_mov_b32 s9, s11 +; GCN-NEXT: s_mov_b32 s10, s12 +; GCN-NEXT: s_mov_b32 s11, s13 +; GCN-NEXT: s_mov_b32 s12, s14 +; GCN-NEXT: s_mov_b32 s13, s15 +; GCN-NEXT: s_mov_b32 s14, s16 +; GCN-NEXT: s_movrels_b32 s0, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: ; return to shader part epilog +entry: + %add = add i32 %sel, 3 + %ext = extractelement <15 x float> %vec, i32 %add + ret float %ext +} + +define float @dyn_extract_v15f32_v_v_offset3(<15 x float> %vec, i32 %sel) { +; GPRIDX-LABEL: dyn_extract_v15f32_v_v_offset3: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GPRIDX-NEXT: s_mov_b64 s[4:5], exec +; GPRIDX-NEXT: BB57_1: ; =>This Inner Loop Header: Depth=1 +; GPRIDX-NEXT: v_readfirstlane_b32 s6, v15 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s6, v15 +; GPRIDX-NEXT: s_set_gpr_idx_on s6, gpr_idx(SRC0) +; GPRIDX-NEXT: v_mov_b32_e32 v16, v3 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc +; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc +; GPRIDX-NEXT: s_cbranch_execnz BB57_1 +; GPRIDX-NEXT: ; %bb.2: +; GPRIDX-NEXT: s_mov_b64 exec, s[4:5] +; GPRIDX-NEXT: v_mov_b32_e32 v0, v16 +; GPRIDX-NEXT: s_setpc_b64 s[30:31] +; +; MOVREL-LABEL: dyn_extract_v15f32_v_v_offset3: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MOVREL-NEXT: s_mov_b64 s[4:5], exec +; MOVREL-NEXT: BB57_1: ; =>This Inner Loop Header: Depth=1 +; MOVREL-NEXT: v_readfirstlane_b32 s6, v15 +; MOVREL-NEXT: s_mov_b32 m0, s6 +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s6, v15 +; MOVREL-NEXT: v_movrels_b32_e32 v16, v3 +; MOVREL-NEXT: s_and_saveexec_b64 vcc, vcc +; MOVREL-NEXT: s_xor_b64 exec, exec, vcc +; MOVREL-NEXT: s_cbranch_execnz BB57_1 +; MOVREL-NEXT: ; %bb.2: +; MOVREL-NEXT: s_mov_b64 exec, s[4:5] +; MOVREL-NEXT: v_mov_b32_e32 v0, v16 +; MOVREL-NEXT: s_setpc_b64 s[30:31] +entry: + %add = add i32 %sel, 3 + %ext = extractelement <15 x float> %vec, i32 %add + ret float %ext }
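; Editorial note, not part of the patch: the new GCN check lines above pin down
; the shape of the expansion for variable, non-constant indices. The chain is
; seeded with element 0, and each v_cmp_eq_u32/v_cndmask_b32 pair (or
; s_cmp_eq_u32/s_cselect_b32 pair on the scalar path) folds in the element
; whose position matches the index. A hypothetical IR-level sketch of the same
; computation follows; the function name, value names, and the <4 x float>
; type are illustrative only and do not appear in this patch.
;
; define float @sketch_dyn_extract_v4(<4 x float> %vec, i32 %idx) {
; entry:
;   %e0 = extractelement <4 x float> %vec, i32 0   ; element 0 seeds the chain
;   %e1 = extractelement <4 x float> %vec, i32 1
;   %e2 = extractelement <4 x float> %vec, i32 2
;   %e3 = extractelement <4 x float> %vec, i32 3
;   %c1 = icmp eq i32 %idx, 1                      ; -> v_cmp_eq_u32 vcc, 1, vN
;   %s1 = select i1 %c1, float %e1, float %e0      ; -> v_cndmask_b32
;   %c2 = icmp eq i32 %idx, 2
;   %s2 = select i1 %c2, float %e2, float %s1
;   %c3 = icmp eq i32 %idx, 3
;   %s3 = select i1 %c3, float %e3, float %s2
;   ret float %s3
; }
;
; Out-of-range indices fall through to element 0, which is an acceptable
; result given that extractelement with an out-of-bounds index yields poison.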