Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -207,6 +207,12 @@
   return Size % 32 == 0 && Size <= MaxRegisterSize;
 }
 
+// True when getSGPRClassForBitWidth finds a register class for Size bits.
+static bool isRegisterClassSize(unsigned Size) {
+  const auto *RC = SIRegisterInfo::getSGPRClassForBitWidth(Size);
+  return RC != nullptr;
+}
+
 static bool isRegisterVectorElementType(LLT EltTy) {
   const int EltSize = EltTy.getSizeInBits();
   return EltSize == 16 || EltSize % 32 == 0;
@@ -229,6 +235,15 @@
   return true;
 }
 
+// A type qualifies when its total bit size passes isRegisterClassSize and,
+// if it is a vector, it also passes isRegisterVectorType.
+static bool isRegisterClassType(LLT Ty) {
+  if (!isRegisterClassSize(Ty.getSizeInBits()))
+    return false;
+
+  return !Ty.isVector() || isRegisterVectorType(Ty);
+}
+
 // Any combination of 32 or 64-bit elements up the maximum register size, and
 // multiples of v2s16.
 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
@@ -237,6 +252,13 @@
   };
 }
 
+// Predicate form: applies isRegisterClassType to the query type at TypeIdx.
+static LegalityPredicate isRegisterClassType(unsigned TypeIdx) {
+  return [TypeIdx](const LegalityQuery &Q) {
+    return isRegisterClassType(Q.Types[TypeIdx]);
+  };
+}
+
 // RegisterType that doesn't have a corresponding RegClass.
 static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
   return [=](const LegalityQuery &Query) {
@@ -696,8 +718,7 @@
     .scalarize(0);
 
   getActionDefinitionsBuilder(G_BITCAST)
-    // Don't worry about the size constraint.
- .legalIf(all(isRegisterType(0), isRegisterType(1))) + .legalIf(all(isRegisterClassType(0), isRegisterClassType(1))) .lower(); Index: llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -2725,6 +2725,294 @@ ret double %ext } +define amdgpu_ps double @dyn_extract_v7f64_s_v_bitcast(<14 x float> inreg %userData, i32 %sel) { +; GCN-LABEL: dyn_extract_v7f64_s_v_bitcast: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_mov_b32 s0, s2 +; GCN-NEXT: s_mov_b32 s1, s3 +; GCN-NEXT: s_mov_b32 s2, s4 +; GCN-NEXT: s_mov_b32 s3, s5 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s7 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NEXT: v_mov_b32_e32 v3, s2 +; GCN-NEXT: v_mov_b32_e32 v4, s3 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: s_mov_b32 s6, s8 +; GCN-NEXT: s_mov_b32 s7, s9 +; GCN-NEXT: v_mov_b32_e32 v5, s4 +; GCN-NEXT: v_mov_b32_e32 v6, s5 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 +; GCN-NEXT: s_mov_b32 s8, s10 +; GCN-NEXT: s_mov_b32 s9, s11 +; GCN-NEXT: v_mov_b32_e32 v7, s6 +; GCN-NEXT: v_mov_b32_e32 v8, s7 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v9, s8 +; GCN-NEXT: v_mov_b32_e32 v10, s9 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0 +; GCN-NEXT: v_mov_b32_e32 v11, s12 +; GCN-NEXT: v_mov_b32_e32 v12, s13 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0 +; GCN-NEXT: v_mov_b32_e32 v13, s14 +; GCN-NEXT: v_mov_b32_e32 v14, s15 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v11, 
vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v14, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 7, v0 +; GCN-NEXT: ; kill: def $vgpr15 killed $sgpr14 killed $exec +; GCN-NEXT: ; kill: def $vgpr16 killed $sgpr15 killed $exec +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v15, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc +; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: v_readfirstlane_b32 s1, v1 +; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_extract_v7f64_s_v_bitcast: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s19, s5 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, s19 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: v_cndmask_b32_e32 v1, s0, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, s1, v2, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s5, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: s_mov_b32 s12, s14 +; GFX10-NEXT: s_mov_b32 s13, s15 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s7, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s9, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 +; GFX10-NEXT: 
v_cndmask_b32_e64 v1, v1, s12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s13, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s14, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s15, vcc_lo +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: dyn_extract_v7f64_s_v_bitcast: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_mov_b32 s19, s5 +; GFX11-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s19 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s8 +; GFX11-NEXT: v_cndmask_b32_e32 v1, s0, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v2, s1, v2, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 +; GFX11-NEXT: s_mov_b32 s7, s9 +; GFX11-NEXT: s_mov_b32 s8, s10 +; GFX11-NEXT: s_mov_b32 s9, s11 +; GFX11-NEXT: s_mov_b32 s10, s12 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s5, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 +; GFX11-NEXT: s_mov_b32 s11, s13 +; GFX11-NEXT: s_mov_b32 s12, s14 +; GFX11-NEXT: s_mov_b32 s13, s15 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s6, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s7, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s8, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s9, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s12, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s13, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v1, s14, vcc_lo +; GFX11-NEXT: 
v_cndmask_b32_e64 v1, v2, s15, vcc_lo +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: ; return to shader part epilog +entry: + %bc = bitcast <14 x float> %userData to <7 x double> + %ext = extractelement <7 x double> %bc, i32 %sel + ret double %ext +} + +define amdgpu_ps i64 @dyn_extract_v7i64_s_v_bitcast(<14 x i32> inreg %userData, i32 %sel) { +; GCN-LABEL: dyn_extract_v7i64_s_v_bitcast: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_mov_b32 s0, s2 +; GCN-NEXT: s_mov_b32 s1, s3 +; GCN-NEXT: s_mov_b32 s2, s4 +; GCN-NEXT: s_mov_b32 s3, s5 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s7 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NEXT: v_mov_b32_e32 v3, s2 +; GCN-NEXT: v_mov_b32_e32 v4, s3 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: s_mov_b32 s6, s8 +; GCN-NEXT: s_mov_b32 s7, s9 +; GCN-NEXT: v_mov_b32_e32 v5, s4 +; GCN-NEXT: v_mov_b32_e32 v6, s5 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 +; GCN-NEXT: s_mov_b32 s8, s10 +; GCN-NEXT: s_mov_b32 s9, s11 +; GCN-NEXT: v_mov_b32_e32 v7, s6 +; GCN-NEXT: v_mov_b32_e32 v8, s7 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v9, s8 +; GCN-NEXT: v_mov_b32_e32 v10, s9 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0 +; GCN-NEXT: v_mov_b32_e32 v11, s12 +; GCN-NEXT: v_mov_b32_e32 v12, s13 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0 +; GCN-NEXT: v_mov_b32_e32 v13, s14 +; GCN-NEXT: v_mov_b32_e32 v14, s15 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0 +; GCN-NEXT: 
v_cndmask_b32_e32 v1, v1, v13, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v14, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 7, v0 +; GCN-NEXT: ; kill: def $vgpr15 killed $sgpr14 killed $exec +; GCN-NEXT: ; kill: def $vgpr16 killed $sgpr15 killed $exec +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v15, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc +; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: v_readfirstlane_b32 s1, v1 +; GCN-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_extract_v7i64_s_v_bitcast: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s19, s5 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, s19 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: v_cndmask_b32_e32 v1, s0, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, s1, v2, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s5, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: s_mov_b32 s12, s14 +; GFX10-NEXT: s_mov_b32 s13, s15 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s7, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s9, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s13, vcc_lo +; GFX10-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 7, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s14, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s15, vcc_lo +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: dyn_extract_v7i64_s_v_bitcast: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_mov_b32 s19, s5 +; GFX11-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s19 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s8 +; GFX11-NEXT: v_cndmask_b32_e32 v1, s0, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v2, s1, v2, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 +; GFX11-NEXT: s_mov_b32 s7, s9 +; GFX11-NEXT: s_mov_b32 s8, s10 +; GFX11-NEXT: s_mov_b32 s9, s11 +; GFX11-NEXT: s_mov_b32 s10, s12 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s5, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 +; GFX11-NEXT: s_mov_b32 s11, s13 +; GFX11-NEXT: s_mov_b32 s12, s14 +; GFX11-NEXT: s_mov_b32 s13, s15 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s6, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s7, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s8, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s9, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s12, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s13, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v1, s14, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, s15, vcc_lo +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_readfirstlane_b32 
s1, v1 +; GFX11-NEXT: ; return to shader part epilog +entry: + %bc = bitcast <14 x i32> %userData to <7 x i64> + %ext = extractelement <7 x i64> %bc, i32 %sel + ret i64 %ext +} + define amdgpu_ps double @dyn_extract_v7f64_s_v(<7 x double> inreg %vec, i32 %sel) { ; GCN-LABEL: dyn_extract_v7f64_s_v: ; GCN: ; %bb.0: ; %entry