diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -4835,6 +4835,7 @@ Observer.changedInstr(MI); return Legalized; } + case TargetOpcode::G_EXTRACT_VECTOR_ELT: case TargetOpcode::G_EXTRACT: if (TypeIdx != 1) return UnableToLegalize; @@ -4843,6 +4844,7 @@ Observer.changedInstr(MI); return Legalized; case TargetOpcode::G_INSERT: + case TargetOpcode::G_INSERT_VECTOR_ELT: case TargetOpcode::G_FREEZE: case TargetOpcode::G_FNEG: case TargetOpcode::G_FABS: diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -131,6 +131,28 @@ }; } +// Increase the number of vector elements to reach the next legal RegClass. +static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) { + return [=](const LegalityQuery &Query) { + const LLT Ty = Query.Types[TypeIdx]; + const unsigned NumElts = Ty.getNumElements(); + const unsigned EltSize = Ty.getElementType().getSizeInBits(); + const unsigned MaxNumElts = MaxRegisterSize / EltSize; + + assert(EltSize == 32 || EltSize == 64); + assert(Ty.getSizeInBits() < MaxRegisterSize); + + unsigned NewNumElts; + // Round the element count up to the nearest count whose total width has a + // matching SGPR RegClass, falling back to MaxNumElts. + for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) { + if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize)) + break; + } + + return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltSize)); + }; +} + static LLT getBitcastRegisterType(const LLT Ty) { const unsigned Size = Ty.getSizeInBits(); @@ -215,6 +237,15 @@ }; } +// Matches a legal register type whose total width has no corresponding +// SGPR RegClass. +static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) { + return [=](const LegalityQuery &Query) { + LLT Ty = Query.Types[TypeIdx]; + return isRegisterType(Ty) && + !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits()); + }; +} + static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { return [=](const LegalityQuery &Query) { const LLT QueryTy = Query.Types[TypeIdx]; @@ -1455,10 +1486,13 @@ const LLT VecTy = Query.Types[VecTypeIdx]; const LLT IdxTy = Query.Types[IdxTypeIdx]; const unsigned EltSize = EltTy.getSizeInBits(); + const bool isLegalVecType = + !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits()); return (EltSize == 32 || EltSize == 64) && VecTy.getSizeInBits() % 32 == 0 && VecTy.getSizeInBits() <= MaxRegisterSize && - IdxTy.getSizeInBits() == 32; + IdxTy.getSizeInBits() == 32 && + isLegalVecType; }) .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), bitcastToVectorElement32(VecTypeIdx)) @@ -1484,6 +1518,9 @@ .clampScalar(IdxTypeIdx, S32, S32) .clampMaxNumElements(VecTypeIdx, S32, 32) // TODO: Clamp elements for 64-bit vectors? + .moreElementsIf( + isIllegalRegisterType(VecTypeIdx), + moreElementsToNextExistingRegClass(VecTypeIdx)) // It should only be necessary with variable indexes. 
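// For example (illustrative): a variable-index extract/insert on <7 x s64> (448 bits, no SGPR RegClass) is first widened to <8 x s64> (512 bits), and one on <15 x s32> (480 bits) to <16 x s32>; that widening is what adds the extra index-7 and index-15 compare/select steps in the updated tests below.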
// As a last resort, lower to the stack .lower(); @@ -1538,7 +1575,10 @@ .legalForCartesianProduct(AllS64Vectors, {S64}) .clampNumElements(0, V16S32, V32S32) .clampNumElements(0, V2S64, V16S64) - .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); + .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)) + .moreElementsIf( + isIllegalRegisterType(0), + moreElementsToNextExistingRegClass(0)); if (ST.hasScalarPackInsts()) { BuildVector diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -2501,31 +2501,31 @@ static const TargetRegisterClass * getAnyVGPRClassForBitWidth(unsigned BitWidth) { - if (BitWidth <= 64) + if (BitWidth == 64) return &AMDGPU::VReg_64RegClass; - if (BitWidth <= 96) + if (BitWidth == 96) return &AMDGPU::VReg_96RegClass; - if (BitWidth <= 128) + if (BitWidth == 128) return &AMDGPU::VReg_128RegClass; - if (BitWidth <= 160) + if (BitWidth == 160) return &AMDGPU::VReg_160RegClass; - if (BitWidth <= 192) + if (BitWidth == 192) return &AMDGPU::VReg_192RegClass; - if (BitWidth <= 224) + if (BitWidth == 224) return &AMDGPU::VReg_224RegClass; - if (BitWidth <= 256) + if (BitWidth == 256) return &AMDGPU::VReg_256RegClass; - if (BitWidth <= 288) + if (BitWidth == 288) return &AMDGPU::VReg_288RegClass; - if (BitWidth <= 320) + if (BitWidth == 320) return &AMDGPU::VReg_320RegClass; - if (BitWidth <= 352) + if (BitWidth == 352) return &AMDGPU::VReg_352RegClass; - if (BitWidth <= 384) + if (BitWidth == 384) return &AMDGPU::VReg_384RegClass; - if (BitWidth <= 512) + if (BitWidth == 512) return &AMDGPU::VReg_512RegClass; - if (BitWidth <= 1024) + if (BitWidth == 1024) return &AMDGPU::VReg_1024RegClass; return nullptr; @@ -2533,31 +2533,31 @@ static const TargetRegisterClass * getAlignedVGPRClassForBitWidth(unsigned BitWidth) { - if (BitWidth <= 64) + if (BitWidth == 64) return &AMDGPU::VReg_64_Align2RegClass; - if (BitWidth <= 96) + if (BitWidth == 96) return &AMDGPU::VReg_96_Align2RegClass; - if (BitWidth <= 128) + if (BitWidth == 128) return &AMDGPU::VReg_128_Align2RegClass; - if (BitWidth <= 160) + if (BitWidth == 160) return &AMDGPU::VReg_160_Align2RegClass; - if (BitWidth <= 192) + if (BitWidth == 192) return &AMDGPU::VReg_192_Align2RegClass; - if (BitWidth <= 224) + if (BitWidth == 224) return &AMDGPU::VReg_224_Align2RegClass; - if (BitWidth <= 256) + if (BitWidth == 256) return &AMDGPU::VReg_256_Align2RegClass; - if (BitWidth <= 288) + if (BitWidth == 288) return &AMDGPU::VReg_288_Align2RegClass; - if (BitWidth <= 320) + if (BitWidth == 320) return &AMDGPU::VReg_320_Align2RegClass; - if (BitWidth <= 352) + if (BitWidth == 352) return &AMDGPU::VReg_352_Align2RegClass; - if (BitWidth <= 384) + if (BitWidth == 384) return &AMDGPU::VReg_384_Align2RegClass; - if (BitWidth <= 512) + if (BitWidth == 512) return &AMDGPU::VReg_512_Align2RegClass; - if (BitWidth <= 1024) + if (BitWidth == 1024) return &AMDGPU::VReg_1024_Align2RegClass; return nullptr; @@ -2567,9 +2567,9 @@ SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const { if (BitWidth == 1) return &AMDGPU::VReg_1RegClass; - if (BitWidth <= 16) + if (BitWidth == 16) return &AMDGPU::VGPR_LO16RegClass; - if (BitWidth <= 32) + if (BitWidth == 32) return &AMDGPU::VGPR_32RegClass; return ST.needsAlignedVGPRs() ? 
getAlignedVGPRClassForBitWidth(BitWidth) : getAnyVGPRClassForBitWidth(BitWidth); @@ -2577,31 +2577,31 @@ static const TargetRegisterClass * getAnyAGPRClassForBitWidth(unsigned BitWidth) { - if (BitWidth <= 64) + if (BitWidth == 64) return &AMDGPU::AReg_64RegClass; - if (BitWidth <= 96) + if (BitWidth == 96) return &AMDGPU::AReg_96RegClass; - if (BitWidth <= 128) + if (BitWidth == 128) return &AMDGPU::AReg_128RegClass; - if (BitWidth <= 160) + if (BitWidth == 160) return &AMDGPU::AReg_160RegClass; - if (BitWidth <= 192) + if (BitWidth == 192) return &AMDGPU::AReg_192RegClass; - if (BitWidth <= 224) + if (BitWidth == 224) return &AMDGPU::AReg_224RegClass; - if (BitWidth <= 256) + if (BitWidth == 256) return &AMDGPU::AReg_256RegClass; - if (BitWidth <= 288) + if (BitWidth == 288) return &AMDGPU::AReg_288RegClass; - if (BitWidth <= 320) + if (BitWidth == 320) return &AMDGPU::AReg_320RegClass; - if (BitWidth <= 352) + if (BitWidth == 352) return &AMDGPU::AReg_352RegClass; - if (BitWidth <= 384) + if (BitWidth == 384) return &AMDGPU::AReg_384RegClass; - if (BitWidth <= 512) + if (BitWidth == 512) return &AMDGPU::AReg_512RegClass; - if (BitWidth <= 1024) + if (BitWidth == 1024) return &AMDGPU::AReg_1024RegClass; return nullptr; @@ -2609,31 +2609,31 @@ static const TargetRegisterClass * getAlignedAGPRClassForBitWidth(unsigned BitWidth) { - if (BitWidth <= 64) + if (BitWidth == 64) return &AMDGPU::AReg_64_Align2RegClass; - if (BitWidth <= 96) + if (BitWidth == 96) return &AMDGPU::AReg_96_Align2RegClass; - if (BitWidth <= 128) + if (BitWidth == 128) return &AMDGPU::AReg_128_Align2RegClass; - if (BitWidth <= 160) + if (BitWidth == 160) return &AMDGPU::AReg_160_Align2RegClass; - if (BitWidth <= 192) + if (BitWidth == 192) return &AMDGPU::AReg_192_Align2RegClass; - if (BitWidth <= 224) + if (BitWidth == 224) return &AMDGPU::AReg_224_Align2RegClass; - if (BitWidth <= 256) + if (BitWidth == 256) return &AMDGPU::AReg_256_Align2RegClass; - if (BitWidth <= 288) + if (BitWidth == 288) return &AMDGPU::AReg_288_Align2RegClass; - if (BitWidth <= 320) + if (BitWidth == 320) return &AMDGPU::AReg_320_Align2RegClass; - if (BitWidth <= 352) + if (BitWidth == 352) return &AMDGPU::AReg_352_Align2RegClass; - if (BitWidth <= 384) + if (BitWidth == 384) return &AMDGPU::AReg_384_Align2RegClass; - if (BitWidth <= 512) + if (BitWidth == 512) return &AMDGPU::AReg_512_Align2RegClass; - if (BitWidth <= 1024) + if (BitWidth == 1024) return &AMDGPU::AReg_1024_Align2RegClass; return nullptr; @@ -2641,9 +2641,9 @@ const TargetRegisterClass * SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const { - if (BitWidth <= 16) + if (BitWidth == 16) return &AMDGPU::AGPR_LO16RegClass; - if (BitWidth <= 32) + if (BitWidth == 32) return &AMDGPU::AGPR_32RegClass; return ST.needsAlignedVGPRs() ? 
getAlignedAGPRClassForBitWidth(BitWidth) : getAnyAGPRClassForBitWidth(BitWidth); @@ -2651,31 +2651,31 @@ static const TargetRegisterClass * getAnyVectorSuperClassForBitWidth(unsigned BitWidth) { - if (BitWidth <= 64) + if (BitWidth == 64) return &AMDGPU::AV_64RegClass; - if (BitWidth <= 96) + if (BitWidth == 96) return &AMDGPU::AV_96RegClass; - if (BitWidth <= 128) + if (BitWidth == 128) return &AMDGPU::AV_128RegClass; - if (BitWidth <= 160) + if (BitWidth == 160) return &AMDGPU::AV_160RegClass; - if (BitWidth <= 192) + if (BitWidth == 192) return &AMDGPU::AV_192RegClass; - if (BitWidth <= 224) + if (BitWidth == 224) return &AMDGPU::AV_224RegClass; - if (BitWidth <= 256) + if (BitWidth == 256) return &AMDGPU::AV_256RegClass; - if (BitWidth <= 288) + if (BitWidth == 288) return &AMDGPU::AV_288RegClass; - if (BitWidth <= 320) + if (BitWidth == 320) return &AMDGPU::AV_320RegClass; - if (BitWidth <= 352) + if (BitWidth == 352) return &AMDGPU::AV_352RegClass; - if (BitWidth <= 384) + if (BitWidth == 384) return &AMDGPU::AV_384RegClass; - if (BitWidth <= 512) + if (BitWidth == 512) return &AMDGPU::AV_512RegClass; - if (BitWidth <= 1024) + if (BitWidth == 1024) return &AMDGPU::AV_1024RegClass; return nullptr; @@ -2683,31 +2683,31 @@ static const TargetRegisterClass * getAlignedVectorSuperClassForBitWidth(unsigned BitWidth) { - if (BitWidth <= 64) + if (BitWidth == 64) return &AMDGPU::AV_64_Align2RegClass; - if (BitWidth <= 96) + if (BitWidth == 96) return &AMDGPU::AV_96_Align2RegClass; - if (BitWidth <= 128) + if (BitWidth == 128) return &AMDGPU::AV_128_Align2RegClass; - if (BitWidth <= 160) + if (BitWidth == 160) return &AMDGPU::AV_160_Align2RegClass; - if (BitWidth <= 192) + if (BitWidth == 192) return &AMDGPU::AV_192_Align2RegClass; - if (BitWidth <= 224) + if (BitWidth == 224) return &AMDGPU::AV_224_Align2RegClass; - if (BitWidth <= 256) + if (BitWidth == 256) return &AMDGPU::AV_256_Align2RegClass; - if (BitWidth <= 288) + if (BitWidth == 288) return &AMDGPU::AV_288_Align2RegClass; - if (BitWidth <= 320) + if (BitWidth == 320) return &AMDGPU::AV_320_Align2RegClass; - if (BitWidth <= 352) + if (BitWidth == 352) return &AMDGPU::AV_352_Align2RegClass; - if (BitWidth <= 384) + if (BitWidth == 384) return &AMDGPU::AV_384_Align2RegClass; - if (BitWidth <= 512) + if (BitWidth == 512) return &AMDGPU::AV_512_Align2RegClass; - if (BitWidth <= 1024) + if (BitWidth == 1024) return &AMDGPU::AV_1024_Align2RegClass; return nullptr; @@ -2715,9 +2715,9 @@ const TargetRegisterClass * SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const { - if (BitWidth <= 16) + if (BitWidth == 16) return &AMDGPU::VGPR_LO16RegClass; - if (BitWidth <= 32) + if (BitWidth == 32) return &AMDGPU::AV_32RegClass; return ST.needsAlignedVGPRs() ? 
getAlignedVectorSuperClassForBitWidth(BitWidth) @@ -2726,35 +2726,35 @@ const TargetRegisterClass * SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) { - if (BitWidth <= 16) + if (BitWidth == 16) return &AMDGPU::SGPR_LO16RegClass; - if (BitWidth <= 32) + if (BitWidth == 32) return &AMDGPU::SReg_32RegClass; - if (BitWidth <= 64) + if (BitWidth == 64) return &AMDGPU::SReg_64RegClass; - if (BitWidth <= 96) + if (BitWidth == 96) return &AMDGPU::SGPR_96RegClass; - if (BitWidth <= 128) + if (BitWidth == 128) return &AMDGPU::SGPR_128RegClass; - if (BitWidth <= 160) + if (BitWidth == 160) return &AMDGPU::SGPR_160RegClass; - if (BitWidth <= 192) + if (BitWidth == 192) return &AMDGPU::SGPR_192RegClass; - if (BitWidth <= 224) + if (BitWidth == 224) return &AMDGPU::SGPR_224RegClass; - if (BitWidth <= 256) + if (BitWidth == 256) return &AMDGPU::SGPR_256RegClass; - if (BitWidth <= 288) + if (BitWidth == 288) return &AMDGPU::SGPR_288RegClass; - if (BitWidth <= 320) + if (BitWidth == 320) return &AMDGPU::SGPR_320RegClass; - if (BitWidth <= 352) + if (BitWidth == 352) return &AMDGPU::SGPR_352RegClass; - if (BitWidth <= 384) + if (BitWidth == 384) return &AMDGPU::SGPR_384RegClass; - if (BitWidth <= 512) + if (BitWidth == 512) return &AMDGPU::SGPR_512RegClass; - if (BitWidth <= 1024) + if (BitWidth == 1024) return &AMDGPU::SGPR_1024RegClass; return nullptr; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -2768,8 +2768,13 @@ ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc ; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v13, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v14, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v14, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 7, v0 +; GCN-NEXT: ; kill: def $vgpr15 killed $sgpr14 killed $exec +; GCN-NEXT: ; kill: def $vgpr16 killed $sgpr15 killed $exec +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v15, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc ; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: v_readfirstlane_b32 s1, v1 ; GCN-NEXT: ; return to shader part epilog @@ -2808,8 +2813,11 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s10, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s12, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s13, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s13, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s14, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s15, vcc_lo ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: ; return to shader part epilog @@ -2847,8 +2855,11 @@ ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s10, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v1, s12, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, s13, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s12, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s13, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v1, s14, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, s15, vcc_lo ; GFX11-NEXT: 
v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: v_readfirstlane_b32 s1, v1 ; GFX11-NEXT: ; return to shader part epilog @@ -2879,6 +2890,9 @@ ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 6, v14 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 7, v14 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: dyn_extract_v7f64_v_v: @@ -2903,6 +2917,9 @@ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v14 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v14 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: dyn_extract_v7f64_v_v: @@ -2921,6 +2938,8 @@ ; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v10 :: v_dual_cndmask_b32 v1, v1, v11 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v14 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v12 :: v_dual_cndmask_b32 v1, v1, v13 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v14 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v14 :: v_dual_cndmask_b32 v1, v1, v15 ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %ext = extractelement <7 x double> %vec, i32 %sel @@ -3422,42 +3441,82 @@ ; GCN-NEXT: v_mov_b32_e32 v12, 0x41700000 ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 14, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v12, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 15, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: dyn_extract_v15f32_const_s_v: -; GFX10PLUS: ; %bb.0: ; %entry -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0x40400000, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 4.0, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0x40a00000, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0x40c00000, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0x40e00000, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0x41000000, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0x41100000, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0x41200000, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0x41300000, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0x41400000, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 12, v0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0x41500000, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0x41600000, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 14, v0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v1, 0x41700000, vcc_lo -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: 
dyn_extract_v15f32_const_s_v: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x40400000, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 4.0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x40a00000, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x40c00000, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x40e00000, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x41000000, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x41100000, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x41200000, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x41300000, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x41400000, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 12, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x41500000, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x41600000, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 14, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x41700000, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 15, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s4, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: dyn_extract_v15f32_const_s_v: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x40400000, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 4.0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x40a00000, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x40c00000, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x40e00000, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x41000000, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x41100000, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x41200000, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x41300000, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x41400000, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 12, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x41500000, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x41600000, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 14, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x41700000, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 15, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] 
entry: %ext = extractelement <15 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0>, i32 %sel ret float %ext @@ -3557,7 +3616,9 @@ ; GCN-NEXT: v_mov_b32_e32 v15, s16 ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v14, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 14, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v15, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 15, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: dyn_extract_v15f32_s_v: @@ -3590,7 +3651,9 @@ ; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v0 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, s15, vcc_lo ; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 14, v0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v1, s16, vcc_lo +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, s16, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 15, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo ; GFX10PLUS-NEXT: ; return to shader part epilog entry: %ext = extractelement <15 x float> %vec, i32 %sel @@ -3629,41 +3692,81 @@ ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 14, v15 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 15, v15 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: dyn_extract_v15f32_v_v: -; GFX10PLUS: ; %bb.0: ; %entry -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v15 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v15 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v15 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v15 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v15 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v15 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v15 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v15 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v15 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v15 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 12, v15 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v15 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 14, v15 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: dyn_extract_v15f32_v_v: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, 
v4, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 12, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 14, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 15, v15 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: dyn_extract_v15f32_v_v: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 12, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 14, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 15, v15 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %ext = extractelement <15 x float> %vec, i32 %sel ret float %ext @@ -3825,6 +3928,8 @@ ; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 14, v15 ; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 15, v15 +; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc ; GPRIDX-NEXT: s_setpc_b64 s[30:31] ; ; MOVREL-LABEL: dyn_extract_v15f32_v_v_offset3: @@ -3859,42 +3964,83 @@ ; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 14, v15 ; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 15, v15 +; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc ; MOVREL-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: dyn_extract_v15f32_v_v_offset3: -; GFX10PLUS: ; %bb.0: ; %entry -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10PLUS-NEXT: v_add_nc_u32_e32 v15, 3, v15 -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v15 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v15 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v15 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v15 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v15 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v15 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v15 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v15 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v15 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v15 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 12, v15 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v15 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 14, v15 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: dyn_extract_v15f32_v_v_offset3: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 12, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 14, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 15, v15 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX11-LABEL: dyn_extract_v15f32_v_v_offset3: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 12, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 14, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 15, v15 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %add = add i32 %sel, 3 %ext = extractelement <15 x float> %vec, i32 %add diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll @@ -5670,6 +5670,10 @@ ; GPRIDX-NEXT: s_mov_b32 s12, s14 ; GPRIDX-NEXT: s_mov_b32 s13, s15 ; GPRIDX-NEXT: v_mov_b32_e32 v18, s15 +; GPRIDX-NEXT: v_mov_b32_e32 v3, s0 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GPRIDX-NEXT: v_mov_b32_e32 v4, s1 +; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc ; GPRIDX-NEXT: v_mov_b32_e32 v17, s14 ; GPRIDX-NEXT: v_mov_b32_e32 v16, s13 ; GPRIDX-NEXT: v_mov_b32_e32 v15, s12 @@ -5683,43 +5687,39 @@ ; GPRIDX-NEXT: v_mov_b32_e32 v7, s4 ; GPRIDX-NEXT: v_mov_b32_e32 v6, s3 ; GPRIDX-NEXT: v_mov_b32_e32 v5, s2 -; GPRIDX-NEXT: v_mov_b32_e32 v4, s1 -; GPRIDX-NEXT: v_mov_b32_e32 v3, s0 -; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v2 -; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc -; GPRIDX-NEXT: v_cndmask_b32_e64 v2, v5, v0, s[10:11] -; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v7, v0, s[0:1] -; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v9, v0, s[2:3] -; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v11, v0, s[4:5] -; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v13, v0, s[6:7] -; GPRIDX-NEXT: v_cndmask_b32_e64 v0, v15, v0, s[8:9] -; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc -; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v6, v1, s[10:11] -; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[0:1] -; 
GPRIDX-NEXT: v_cndmask_b32_e64 v10, v10, v1, s[2:3] -; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v12, v1, s[4:5] -; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v14, v1, s[6:7] -; GPRIDX-NEXT: v_cndmask_b32_e64 v1, v16, v1, s[8:9] ; GPRIDX-NEXT: v_readfirstlane_b32 s0, v3 -; GPRIDX-NEXT: v_readfirstlane_b32 s1, v4 -; GPRIDX-NEXT: v_readfirstlane_b32 s2, v2 -; GPRIDX-NEXT: v_readfirstlane_b32 s3, v6 -; GPRIDX-NEXT: v_readfirstlane_b32 s4, v5 -; GPRIDX-NEXT: v_readfirstlane_b32 s5, v8 -; GPRIDX-NEXT: v_readfirstlane_b32 s6, v7 -; GPRIDX-NEXT: v_readfirstlane_b32 s7, v10 -; GPRIDX-NEXT: v_readfirstlane_b32 s8, v9 -; GPRIDX-NEXT: v_readfirstlane_b32 s9, v12 -; GPRIDX-NEXT: v_readfirstlane_b32 s10, v11 -; GPRIDX-NEXT: v_readfirstlane_b32 s11, v13 -; GPRIDX-NEXT: v_readfirstlane_b32 s12, v0 -; GPRIDX-NEXT: v_readfirstlane_b32 s13, v1 +; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GPRIDX-NEXT: v_readfirstlane_b32 s1, v3 +; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v5, v0, vcc +; GPRIDX-NEXT: v_readfirstlane_b32 s2, v3 +; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v6, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 +; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v7, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v8, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 +; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v9, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v10, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 +; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v11, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v9, v12, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 +; GPRIDX-NEXT: v_cndmask_b32_e32 v10, v13, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v11, v14, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 +; GPRIDX-NEXT: v_cndmask_b32_e32 v12, v15, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v13, v16, v1, vcc +; GPRIDX-NEXT: v_readfirstlane_b32 s3, v3 +; GPRIDX-NEXT: v_readfirstlane_b32 s4, v4 +; GPRIDX-NEXT: v_readfirstlane_b32 s5, v5 +; GPRIDX-NEXT: v_readfirstlane_b32 s6, v6 +; GPRIDX-NEXT: v_readfirstlane_b32 s7, v7 +; GPRIDX-NEXT: v_readfirstlane_b32 s8, v8 +; GPRIDX-NEXT: v_readfirstlane_b32 s9, v9 +; GPRIDX-NEXT: v_readfirstlane_b32 s10, v10 +; GPRIDX-NEXT: v_readfirstlane_b32 s11, v11 +; GPRIDX-NEXT: v_readfirstlane_b32 s12, v12 +; GPRIDX-NEXT: v_readfirstlane_b32 s13, v13 ; GPRIDX-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: dyn_insertelement_v7f64_s_v_v: @@ -5739,9 +5739,13 @@ ; GFX10-NEXT: s_mov_b32 s12, s14 ; GFX10-NEXT: s_mov_b32 s13, s15 ; GFX10-NEXT: v_mov_b32_e32 v18, s15 +; GFX10-NEXT: v_mov_b32_e32 v3, s0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, s1 ; GFX10-NEXT: v_mov_b32_e32 v17, s14 ; GFX10-NEXT: v_mov_b32_e32 v16, s13 ; GFX10-NEXT: v_mov_b32_e32 v15, s12 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v14, s11 ; GFX10-NEXT: v_mov_b32_e32 v13, s10 ; GFX10-NEXT: v_mov_b32_e32 v12, s9 @@ -5752,43 +5756,39 @@ ; GFX10-NEXT: v_mov_b32_e32 v7, s4 ; GFX10-NEXT: v_mov_b32_e32 v6, s3 ; GFX10-NEXT: v_mov_b32_e32 v5, s2 -; GFX10-NEXT: v_mov_b32_e32 v4, s1 -; GFX10-NEXT: v_mov_b32_e32 v3, s0 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 6, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc_lo +; GFX10-NEXT: v_readfirstlane_b32 s0, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX10-NEXT: v_readfirstlane_b32 s1, v3 +; 
GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v0, vcc_lo +; GFX10-NEXT: v_readfirstlane_b32 s2, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v0, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 3, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v1, vcc_lo +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v7, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2 +; GFX10-NEXT: v_readfirstlane_b32 s4, v4 +; GFX10-NEXT: v_readfirstlane_b32 s5, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v9, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v7, v10, v1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v0, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v1, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 5, v2 -; GFX10-NEXT: v_readfirstlane_b32 s2, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v11, v11, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, v12, v1, vcc_lo -; GFX10-NEXT: v_readfirstlane_b32 s3, v6 -; GFX10-NEXT: v_cndmask_b32_e64 v12, v13, v0, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v13, v14, v1, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v15, v0, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v16, v1, s1 -; GFX10-NEXT: v_readfirstlane_b32 s0, v3 -; GFX10-NEXT: v_readfirstlane_b32 s1, v4 -; GFX10-NEXT: v_readfirstlane_b32 s4, v7 -; GFX10-NEXT: v_readfirstlane_b32 s5, v8 -; GFX10-NEXT: v_readfirstlane_b32 s6, v9 -; GFX10-NEXT: v_readfirstlane_b32 s7, v10 -; GFX10-NEXT: v_readfirstlane_b32 s8, v11 -; GFX10-NEXT: v_readfirstlane_b32 s9, v2 -; GFX10-NEXT: v_readfirstlane_b32 s10, v12 -; GFX10-NEXT: v_readfirstlane_b32 s11, v13 -; GFX10-NEXT: v_readfirstlane_b32 s12, v0 -; GFX10-NEXT: v_readfirstlane_b32 s13, v1 +; GFX10-NEXT: v_readfirstlane_b32 s6, v6 +; GFX10-NEXT: v_readfirstlane_b32 s7, v7 +; GFX10-NEXT: v_cndmask_b32_e32 v8, v11, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v9, v12, v1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v2 +; GFX10-NEXT: v_readfirstlane_b32 s8, v8 +; GFX10-NEXT: v_readfirstlane_b32 s9, v9 +; GFX10-NEXT: v_cndmask_b32_e32 v10, v13, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v11, v14, v1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v2 +; GFX10-NEXT: v_readfirstlane_b32 s10, v10 +; GFX10-NEXT: v_readfirstlane_b32 s11, v11 +; GFX10-NEXT: v_cndmask_b32_e32 v12, v15, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v13, v16, v1, vcc_lo +; GFX10-NEXT: v_readfirstlane_b32 s12, v12 +; GFX10-NEXT: v_readfirstlane_b32 s13, v13 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: dyn_insertelement_v7f64_s_v_v: @@ -5808,45 +5808,45 @@ ; GFX11-NEXT: s_mov_b32 s12, s14 ; GFX11-NEXT: s_mov_b32 s13, s15 ; GFX11-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v17, s14 +; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX11-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v15, s12 ; GFX11-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v13, s10 ; GFX11-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v11, s8 ; GFX11-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v9, s6 ; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v7, s4 ; GFX11-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v5, s2 -; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 -; GFX11-NEXT: 
v_cmp_eq_u32_e64 s1, 6, v2 -; GFX11-NEXT: v_dual_cndmask_b32 v3, v3, v0 :: v_dual_cndmask_b32 v4, v4, v1 +; GFX11-NEXT: v_dual_cndmask_b32 v18, v3, v0 :: v_dual_cndmask_b32 v17, v4, v1 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v0, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v2 -; GFX11-NEXT: v_dual_cndmask_b32 v7, v7, v0 :: v_dual_cndmask_b32 v8, v8, v1 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v0, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v1, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v2 -; GFX11-NEXT: v_readfirstlane_b32 s2, v5 -; GFX11-NEXT: v_dual_cndmask_b32 v11, v11, v0 :: v_dual_cndmask_b32 v2, v12, v1 -; GFX11-NEXT: v_readfirstlane_b32 s3, v6 -; GFX11-NEXT: v_cndmask_b32_e64 v12, v13, v0, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v13, v14, v1, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v15, v0, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v16, v1, s1 -; GFX11-NEXT: v_readfirstlane_b32 s0, v3 -; GFX11-NEXT: v_readfirstlane_b32 s1, v4 -; GFX11-NEXT: v_readfirstlane_b32 s4, v7 -; GFX11-NEXT: v_readfirstlane_b32 s5, v8 -; GFX11-NEXT: v_readfirstlane_b32 s6, v9 -; GFX11-NEXT: v_readfirstlane_b32 s7, v10 +; GFX11-NEXT: v_cmp_eq_u32_e64 s6, 4, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v1, s0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s9, 6, v2 +; GFX11-NEXT: v_dual_cndmask_b32 v6, v7, v0 :: v_dual_cndmask_b32 v5, v8, v1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, v0, s6 +; GFX11-NEXT: v_readfirstlane_b32 s0, v18 +; GFX11-NEXT: v_readfirstlane_b32 s1, v17 +; GFX11-NEXT: v_readfirstlane_b32 s2, v3 +; GFX11-NEXT: v_dual_cndmask_b32 v7, v9, v0 :: v_dual_cndmask_b32 v8, v10, v1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v9, v12, v1, s6 +; GFX11-NEXT: v_cndmask_b32_e64 v12, v15, v0, s9 +; GFX11-NEXT: v_readfirstlane_b32 s3, v4 +; GFX11-NEXT: v_readfirstlane_b32 s4, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v13, v0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v14, v14, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v13, v16, v1, s9 +; GFX11-NEXT: v_readfirstlane_b32 s5, v5 +; GFX11-NEXT: v_readfirstlane_b32 s6, v7 +; GFX11-NEXT: v_readfirstlane_b32 s7, v8 ; GFX11-NEXT: v_readfirstlane_b32 s8, v11 -; GFX11-NEXT: v_readfirstlane_b32 s9, v2 -; GFX11-NEXT: v_readfirstlane_b32 s10, v12 -; GFX11-NEXT: v_readfirstlane_b32 s11, v13 -; GFX11-NEXT: v_readfirstlane_b32 s12, v0 -; GFX11-NEXT: v_readfirstlane_b32 s13, v1 +; GFX11-NEXT: v_readfirstlane_b32 s9, v9 +; GFX11-NEXT: v_readfirstlane_b32 s10, v10 +; GFX11-NEXT: v_readfirstlane_b32 s11, v14 +; GFX11-NEXT: v_readfirstlane_b32 s12, v12 +; GFX11-NEXT: v_readfirstlane_b32 s13, v13 ; GFX11-NEXT: ; return to shader part epilog entry: %insert = insertelement <7 x double> %vec, double %val, i32 %idx @@ -5908,26 +5908,26 @@ ; GPRIDX-LABEL: dyn_insertelement_v7f64_v_v_v: ; GPRIDX: ; %bb.0: ; %entry ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v16 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[2:3], 2, v16 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v16 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 4, v16 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[8:9], 5, v16 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[10:11], 6, v16 ; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc -; GPRIDX-NEXT: v_cndmask_b32_e64 v2, v2, v14, s[0:1] -; GPRIDX-NEXT: v_cndmask_b32_e64 v4, v4, v14, s[2:3] -; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v6, v14, 
s[4:5] -; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v14, s[6:7] -; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v10, v14, s[8:9] -; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v12, v14, s[10:11] ; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc -; GPRIDX-NEXT: v_cndmask_b32_e64 v3, v3, v15, s[0:1] -; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v5, v15, s[2:3] -; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[4:5] -; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v9, v15, s[6:7] -; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v11, v15, s[8:9] -; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v13, v15, s[10:11] +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16 +; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v2, v14, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v15, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v16 +; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v15, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v16 +; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v16 +; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v14, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v9, v9, v15, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v16 +; GPRIDX-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v16 +; GPRIDX-NEXT: v_cndmask_b32_e32 v12, v12, v14, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v13, v13, v15, vcc ; GPRIDX-NEXT: v_readfirstlane_b32 s0, v0 ; GPRIDX-NEXT: v_readfirstlane_b32 s1, v1 ; GPRIDX-NEXT: v_readfirstlane_b32 s2, v2 @@ -5947,38 +5947,38 @@ ; GFX10-LABEL: dyn_insertelement_v7f64_v_v_v: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v16 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 1, v16 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 2, v16 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 3, v16 -; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 4, v16 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 5, v16 -; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 6, v16 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v14, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v14, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v14, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v14, s3 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v14, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v14, s5 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v15, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v15, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v15, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v15, s3 -; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v15, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v15, s5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v14, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v15, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v16 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v15, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v16 ; GFX10-NEXT: v_readfirstlane_b32 s4, v4 ; GFX10-NEXT: v_readfirstlane_b32 s5, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v16 ; GFX10-NEXT: v_readfirstlane_b32 s6, v6 ; GFX10-NEXT: v_readfirstlane_b32 s7, v7 +; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v14, 
vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v9, v9, v15, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v16 ; GFX10-NEXT: v_readfirstlane_b32 s8, v8 ; GFX10-NEXT: v_readfirstlane_b32 s9, v9 +; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v16 ; GFX10-NEXT: v_readfirstlane_b32 s10, v10 ; GFX10-NEXT: v_readfirstlane_b32 s11, v11 +; GFX10-NEXT: v_cndmask_b32_e32 v12, v12, v14, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v13, v13, v15, vcc_lo ; GFX10-NEXT: v_readfirstlane_b32 s12, v12 ; GFX10-NEXT: v_readfirstlane_b32 s13, v13 ; GFX10-NEXT: ; return to shader part epilog @@ -5986,14 +5986,14 @@ ; GFX11-LABEL: dyn_insertelement_v7f64_v_v_v: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v16 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v16 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 6, v16 +; GFX11-NEXT: v_cmp_eq_u32_e64 s9, 5, v16 +; GFX11-NEXT: v_cmp_eq_u32_e64 s10, 6, v16 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v14 :: v_dual_cndmask_b32 v1, v1, v15 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16 -; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v14, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, v15, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v12, v12, v14, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v13, v13, v15, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v14, s9 +; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, v15, s9 +; GFX11-NEXT: v_cndmask_b32_e64 v12, v12, v14, s10 +; GFX11-NEXT: v_cndmask_b32_e64 v13, v13, v15, s10 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v14 :: v_dual_cndmask_b32 v3, v3, v15 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v16 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0