# Header changes for the "widen to next multiple of Size" legalization rule:
#  - declares the sizeNotMultipleOf legality predicate,
#  - declares the widenScalarOrEltToNextMultipleOf legalize mutation,
#  - adds the widenScalarToNextMultipleOf(TypeIdx, Size) rule, which applies
#    LegalizeAction::WidenScalar with that mutation when the predicate holds.
Index: llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
===================================================================
--- llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
+++ llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
@@ -301,6 +301,10 @@
 /// type that's wider than the given size.
 LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size);
 
+/// True iff the specified type index is a scalar whose size is not a multiple
+/// of Size.
+LegalityPredicate sizeNotMultipleOf(unsigned TypeIdx, unsigned Size);
+
 /// True iff the specified type index is a scalar whose size is not a power of
 /// 2.
 LegalityPredicate sizeNotPow2(unsigned TypeIdx);
@@ -356,6 +360,11 @@
 /// next power of 2.
 LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min = 0);
 
+/// Widen the scalar type or vector element type for the given type index to
+/// the next multiple of \p Size.
+LegalizeMutation widenScalarOrEltToNextMultipleOf(unsigned TypeIdx,
+                                                  unsigned Size);
+
 /// Add more elements to the type for the given type index to the next power of
 /// 2.
 LegalizeMutation moreElementsToNextPow2(unsigned TypeIdx, unsigned Min = 0);
@@ -835,6 +844,16 @@
         LegalizeAction::WidenScalar, sizeNotPow2(typeIdx(TypeIdx)),
         LegalizeMutations::widenScalarOrEltToNextPow2(TypeIdx, MinSize));
   }
+
+  /// Widen the scalar to the next multiple of Size. No effect if the
+  /// type is not a scalar or is already a multiple of Size.
+  LegalizeRuleSet &widenScalarToNextMultipleOf(unsigned TypeIdx,
+                                               unsigned Size) {
+    using namespace LegalityPredicates;
+    return actionIf(
+        LegalizeAction::WidenScalar, sizeNotMultipleOf(typeIdx(TypeIdx), Size),
+        LegalizeMutations::widenScalarOrEltToNextMultipleOf(TypeIdx, Size));
+  }
 
   /// Widen the scalar or vector element type to the next power of two that is
   /// at least MinSize. No effect if the scalar size is a power of two.
# Implementation, target rules and tests for widening a scalar to the next
# multiple of Size:
#  - LegalityPredicates.cpp: sizeNotMultipleOf matches scalar types whose bit
#    size is not a multiple of Size. It evaluates `% Size`, so Size must be
#    non-zero (NOTE(review): every caller in this patch passes 32).
#  - LegalizeMutations.cpp: widenScalarOrEltToNextMultipleOf changes the
#    scalar/element size to alignTo(current size, Size).
#  - AMDGPULegalizerInfo.cpp: the three G_ADD/G_SUB/G_MUL rule sets now call
#    widenScalarToNextMultipleOf(0, 32).
#  - legalize-mul.mir: test_mul_s33 replaces its commented-out FIXME form.
#  - mul.ll: adds the s_mul_i33 test.
Index: llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
===================================================================
--- llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
+++ llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
@@ -153,6 +153,14 @@
   };
 }
 
+LegalityPredicate LegalityPredicates::sizeNotMultipleOf(unsigned TypeIdx,
+                                                        unsigned Size) {
+  return [=](const LegalityQuery &Query) {
+    const LLT QueryTy = Query.Types[TypeIdx];
+    return QueryTy.isScalar() && QueryTy.getSizeInBits() % Size != 0;
+  };
+}
+
 LegalityPredicate LegalityPredicates::sizeNotPow2(unsigned TypeIdx) {
   return [=](const LegalityQuery &Query) {
     const LLT QueryTy = Query.Types[TypeIdx];
Index: llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp
===================================================================
--- llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp
+++ llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp
@@ -63,6 +63,16 @@
   };
 }
 
+LegalizeMutation
+LegalizeMutations::widenScalarOrEltToNextMultipleOf(unsigned TypeIdx,
+                                                    unsigned Size) {
+  return [=](const LegalityQuery &Query) {
+    const LLT Ty = Query.Types[TypeIdx];
+    unsigned NewEltSizeInBits = alignTo(Ty.getScalarSizeInBits(), Size);
+    return std::make_pair(TypeIdx, Ty.changeElementSize(NewEltSizeInBits));
+  };
+}
+
 LegalizeMutation LegalizeMutations::moreElementsToNextPow2(unsigned TypeIdx,
                                                            unsigned Min) {
   return [=](const LegalityQuery &Query) {
Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -532,10 +532,11 @@
     // Full set of gfx9 features.
     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
       .legalFor({S32, S16, V2S16})
-      .clampScalar(0, S16, S32)
+      .minScalar(0, S16)
       .clampMaxNumElements(0, S16, 2)
-      .scalarize(0)
-      .widenScalarToNextPow2(0, 32);
+      .widenScalarToNextMultipleOf(0, 32)
+      .maxScalar(0, S32)
+      .scalarize(0);
 
     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
       .legalFor({S32, S16, V2S16}) // Clamp modifier
@@ -547,9 +548,10 @@
   } else if (ST.has16BitInsts()) {
     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
       .legalFor({S32, S16})
-      .clampScalar(0, S16, S32)
-      .scalarize(0)
-      .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
+      .minScalar(0, S16)
+      .widenScalarToNextMultipleOf(0, 32)
+      .maxScalar(0, S32)
+      .scalarize(0);
 
     // Technically the saturating operations require clamp bit support, but this
     // was introduced at the same time as 16-bit operations.
@@ -569,6 +571,7 @@
   } else {
     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
       .legalFor({S32})
+      .widenScalarToNextMultipleOf(0, 32)
       .clampScalar(0, S32, S32)
       .scalarize(0);
 
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-mul.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-mul.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-mul.mir
@@ -500,21 +500,58 @@
     $vgpr0 = COPY %5
 ...
 
-# FIXME:
-# ---
-# name: test_mul_s33
-# body: |
-#   bb.0:
-#     liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
-
-#     %0:_(s64) = COPY $vgpr0_vgpr1
-#     %1:_(s64) = COPY $vgpr2_vgpr3
-#     %2:_(s33) = G_TRUNC %0
-#     %3:_(s33) = G_TRUNC %1
-#     %4:_(s33) = G_MUL %2, %3
-#     %5:_(s64) = G_ANYEXT %4
-#     $vgpr0_vgpr1 = COPY %5
-# ...
+---
+name: test_mul_s33
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GFX6-LABEL: name: test_mul_s33
+    ; GFX6: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; GFX6: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+    ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+    ; GFX6: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
+    ; GFX6: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV2]]
+    ; GFX6: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV2]]
+    ; GFX6: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV3]]
+    ; GFX6: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[UV]], [[UV2]]
+    ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[MUL1]], [[MUL2]]
+    ; GFX6: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[UMULH]]
+    ; GFX6: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[MUL]](s32), [[ADD1]](s32)
+    ; GFX6: $vgpr0_vgpr1 = COPY [[MV]](s64)
+    ; GFX8-LABEL: name: test_mul_s33
+    ; GFX8: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; GFX8: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+    ; GFX8: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+    ; GFX8: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
+    ; GFX8: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV2]]
+    ; GFX8: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV2]]
+    ; GFX8: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV3]]
+    ; GFX8: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[UV]], [[UV2]]
+    ; GFX8: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[MUL1]], [[MUL2]]
+    ; GFX8: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[UMULH]]
+    ; GFX8: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[MUL]](s32), [[ADD1]](s32)
+    ; GFX8: $vgpr0_vgpr1 = COPY [[MV]](s64)
+    ; GFX9-LABEL: name: test_mul_s33
+    ; GFX9: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; GFX9: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+    ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+    ; GFX9: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
+    ; GFX9: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV2]]
+    ; GFX9: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV2]]
+    ; GFX9: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV3]]
+    ; GFX9: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[UV]], [[UV2]]
+    ; GFX9: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[MUL1]], [[MUL2]]
+    ; GFX9: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[UMULH]]
+    ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[MUL]](s32), [[ADD1]](s32)
+    ; GFX9: $vgpr0_vgpr1 = COPY [[MV]](s64)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s64) = COPY $vgpr2_vgpr3
+    %2:_(s33) = G_TRUNC %0
+    %3:_(s33) = G_TRUNC %1
+    %4:_(s33) = G_MUL %2, %3
+    %5:_(s64) = G_ANYEXT %4
+    $vgpr0_vgpr1 = COPY %5
+...
 
 ---
 name: test_mul_s96
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -276,6 +276,57 @@
   ret <2 x i32> %result
 }
 
+define amdgpu_cs i33 @s_mul_i33(i33 inreg %num, i33 inreg %den) {
+; GFX7-LABEL: s_mul_i33:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    v_mov_b32_e32 v0, s2
+; GFX7-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GFX7-NEXT:    s_mul_i32 s4, s0, s2
+; GFX7-NEXT:    s_mul_i32 s1, s1, s2
+; GFX7-NEXT:    s_mul_i32 s0, s0, s3
+; GFX7-NEXT:    s_add_i32 s1, s1, s0
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s1, v0
+; GFX7-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX7-NEXT:    s_mov_b32 s0, s4
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_mul_i33:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GFX8-NEXT:    s_mul_i32 s4, s0, s2
+; GFX8-NEXT:    s_mul_i32 s1, s1, s2
+; GFX8-NEXT:    s_mul_i32 s0, s0, s3
+; GFX8-NEXT:    s_add_i32 s1, s1, s0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s1, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX8-NEXT:    s_mov_b32 s0, s4
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_mul_i33:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mul_i32 s1, s1, s2
+; GFX9-NEXT:    s_mul_i32 s3, s0, s3
+; GFX9-NEXT:    s_mul_i32 s4, s0, s2
+; GFX9-NEXT:    s_mul_hi_u32 s0, s0, s2
+; GFX9-NEXT:    s_add_i32 s1, s1, s3
+; GFX9-NEXT:    s_add_i32 s1, s1, s0
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: s_mul_i33:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_mul_i32 s1, s1, s2
+; GFX10-NEXT:    s_mul_i32 s3, s0, s3
+; GFX10-NEXT:    s_mul_hi_u32 s4, s0, s2
+; GFX10-NEXT:    s_add_i32 s1, s1, s3
+; GFX10-NEXT:    s_mul_i32 s0, s0, s2
+; GFX10-NEXT:    s_add_i32 s1, s1, s4
+; GFX10-NEXT:    ; return to shader part epilog
+  %result = mul i33 %num, %den
+  ret i33 %result
+}
+
 define amdgpu_ps i64 @s_mul_i64(i64 inreg %num, i64 inreg %den) {
 ; GFX7-LABEL: s_mul_i64:
 ; GFX7:       ; %bb.0: