Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -205,6 +205,37 @@
   };
 }
 
+// Matches scalars wider than N bits whose size is not a multiple of N. Such
+// types have to be widened before using narrowScalar, e.g. s33->s64 for
+// N = 32. Non-scalar types never match.
+static LegalityPredicate isScalarLargerButNotMultipleOf(unsigned TypeIdx,
+                                                        unsigned N) {
+  assert(N != 0 && "multiple must be non-zero");
+  return [=](const LegalityQuery &Query) {
+    const LLT Ty = Query.Types[TypeIdx];
+    if (!Ty.isScalar())
+      return false;
+    const unsigned Size = Ty.getScalarSizeInBits();
+    return Size > N && Size % N != 0;
+  };
+}
+
+// Widens the type at TypeIdx to a scalar whose size is the smallest multiple
+// of N that is not less than the current scalar size (e.g. s33->s64 and
+// s65->s96 for N = 32). A size that is already a multiple of N is kept
+// unchanged rather than being bumped to the following multiple.
+static LegalizeMutation widenScalarToNextMultipleOf(unsigned TypeIdx,
+                                                    unsigned N) {
+  assert(N != 0 && "multiple must be non-zero");
+  return [=](const LegalityQuery &Query) {
+    const unsigned Size = Query.Types[TypeIdx].getScalarSizeInBits();
+    // Ceiling division: a plain Size / N + 1 would overshoot by N bits when
+    // Size is already a multiple of N.
+    const unsigned NumParts = (Size + N - 1) / N;
+    return std::make_pair(TypeIdx, LLT::scalar(N * NumParts));
+  };
+}
+
 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
 // handle some operations by just promoting the register during
 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
@@ -425,6 +445,8 @@ assert(ST.hasIntClamp() && "all targets with VOP3P should support clamp"); getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) .legalFor({S32, S16, V2S16}) + .widenScalarIf(isScalarLargerButNotMultipleOf(0, 32), + widenScalarToNextMultipleOf(0, 32)) .clampScalar(0, S16, S32) .clampMaxNumElements(0, S16, 2) .scalarize(0) @@ -440,6 +462,8 @@ } else if (ST.has16BitInsts()) { getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) .legalFor({S32, S16}) + .widenScalarIf(isScalarLargerButNotMultipleOf(0, 32), + widenScalarToNextMultipleOf(0, 32)) .clampScalar(0, S16, S32) .scalarize(0) .widenScalarToNextPow2(0, 32); // FIXME: min should be 16 @@ -464,6 +488,8 @@ } else { getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) .legalFor({S32}) + .widenScalarIf(isScalarLargerButNotMultipleOf(0, 32), + widenScalarToNextMultipleOf(0, 32)) .clampScalar(0, S32, S32) .scalarize(0); Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-add.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-add.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-add.mir @@ -492,21 +492,56 @@ $vgpr0 = COPY %5 ... -# FIXME -# --- -# name: test_add_s33 -# body: | -# bb.0: -# liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 +--- +name: test_add_s33 +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 -# %0:_(s64) = COPY $vgpr0_vgpr1 -# %1:_(s64) = COPY $vgpr2_vgpr3 -# %2:_(s33) = G_TRUNC %0 -# %3:_(s33) = G_TRUNC %1 -# %4:_(s33) = G_ADD %2, %3 -# %5:_(s64) = G_ANYEXT %4 -# $vgpr0_vgpr1 = COPY %5 -# ... 
+ ; GFX6-LABEL: name: test_add_s33 + ; GFX6: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 + ; GFX6: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY]](s64) + ; GFX6: [[COPY3:%[0-9]+]]:_(s64) = COPY [[COPY1]](s64) + ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](s64) + ; GFX6: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY3]](s64) + ; GFX6: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV]], [[UV2]] + ; GFX6: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV1]], [[UV3]], [[UADDO1]] + ; GFX6: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32) + ; GFX6: [[COPY4:%[0-9]+]]:_(s64) = COPY [[MV]](s64) + ; GFX6: $vgpr0_vgpr1 = COPY [[COPY4]](s64) + ; GFX8-LABEL: name: test_add_s33 + ; GFX8: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; GFX8: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 + ; GFX8: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY]](s64) + ; GFX8: [[COPY3:%[0-9]+]]:_(s64) = COPY [[COPY1]](s64) + ; GFX8: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](s64) + ; GFX8: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY3]](s64) + ; GFX8: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV]], [[UV2]] + ; GFX8: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV1]], [[UV3]], [[UADDO1]] + ; GFX8: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32) + ; GFX8: [[COPY4:%[0-9]+]]:_(s64) = COPY [[MV]](s64) + ; GFX8: $vgpr0_vgpr1 = COPY [[COPY4]](s64) + ; GFX9-LABEL: name: test_add_s33 + ; GFX9: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; GFX9: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 + ; GFX9: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY]](s64) + ; GFX9: [[COPY3:%[0-9]+]]:_(s64) = COPY [[COPY1]](s64) + ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](s64) + ; GFX9: [[UV2:%[0-9]+]]:_(s32), 
[[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY3]](s64) + ; GFX9: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV]], [[UV2]] + ; GFX9: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV1]], [[UV3]], [[UADDO1]] + ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32) + ; GFX9: [[COPY4:%[0-9]+]]:_(s64) = COPY [[MV]](s64) + ; GFX9: $vgpr0_vgpr1 = COPY [[COPY4]](s64) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s64) = COPY $vgpr2_vgpr3 + %2:_(s33) = G_TRUNC %0 + %3:_(s33) = G_TRUNC %1 + %4:_(s33) = G_ADD %2, %3 + %5:_(s64) = G_ANYEXT %4 + $vgpr0_vgpr1 = COPY %5 +... --- name: test_add_s96 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-mul.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-mul.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-mul.mir @@ -546,21 +546,68 @@ $vgpr0 = COPY %5 ... -# FIXME: -# --- -# name: test_mul_s33 -# body: | -# bb.0: -# liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 +--- +name: test_mul_s33 +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 -# %0:_(s64) = COPY $vgpr0_vgpr1 -# %1:_(s64) = COPY $vgpr2_vgpr3 -# %2:_(s33) = G_TRUNC %0 -# %3:_(s33) = G_TRUNC %1 -# %4:_(s33) = G_MUL %2, %3 -# %5:_(s64) = G_ANYEXT %4 -# $vgpr0_vgpr1 = COPY %5 -# ... 
+ ; GFX6-LABEL: name: test_mul_s33 + ; GFX6: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 + ; GFX6: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY]](s64) + ; GFX6: [[COPY3:%[0-9]+]]:_(s64) = COPY [[COPY1]](s64) + ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](s64) + ; GFX6: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY3]](s64) + ; GFX6: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV2]] + ; GFX6: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV2]] + ; GFX6: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV3]] + ; GFX6: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[UV]], [[UV2]] + ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[MUL1]], [[MUL2]] + ; GFX6: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[UMULH]] + ; GFX6: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[MUL]](s32), [[ADD1]](s32) + ; GFX6: [[COPY4:%[0-9]+]]:_(s64) = COPY [[MV]](s64) + ; GFX6: $vgpr0_vgpr1 = COPY [[COPY4]](s64) + ; GFX8-LABEL: name: test_mul_s33 + ; GFX8: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; GFX8: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 + ; GFX8: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY]](s64) + ; GFX8: [[COPY3:%[0-9]+]]:_(s64) = COPY [[COPY1]](s64) + ; GFX8: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](s64) + ; GFX8: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY3]](s64) + ; GFX8: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV2]] + ; GFX8: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV2]] + ; GFX8: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV3]] + ; GFX8: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[UV]], [[UV2]] + ; GFX8: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[MUL1]], [[MUL2]] + ; GFX8: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[UMULH]] + ; GFX8: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[MUL]](s32), [[ADD1]](s32) + ; GFX8: [[COPY4:%[0-9]+]]:_(s64) = COPY [[MV]](s64) + ; GFX8: $vgpr0_vgpr1 = COPY [[COPY4]](s64) + ; GFX9-LABEL: name: test_mul_s33 + ; GFX9: 
[[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; GFX9: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 + ; GFX9: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY]](s64) + ; GFX9: [[COPY3:%[0-9]+]]:_(s64) = COPY [[COPY1]](s64) + ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](s64) + ; GFX9: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY3]](s64) + ; GFX9: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV2]] + ; GFX9: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV2]] + ; GFX9: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV3]] + ; GFX9: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[UV]], [[UV2]] + ; GFX9: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[MUL1]], [[MUL2]] + ; GFX9: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[UMULH]] + ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[MUL]](s32), [[ADD1]](s32) + ; GFX9: [[COPY4:%[0-9]+]]:_(s64) = COPY [[MV]](s64) + ; GFX9: $vgpr0_vgpr1 = COPY [[COPY4]](s64) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s64) = COPY $vgpr2_vgpr3 + %2:_(s33) = G_TRUNC %0 + %3:_(s33) = G_TRUNC %1 + %4:_(s33) = G_MUL %2, %3 + %5:_(s64) = G_ANYEXT %4 + $vgpr0_vgpr1 = COPY %5 +... --- name: test_mul_s96 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sub.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sub.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sub.mir @@ -486,21 +486,56 @@ $vgpr0 = COPY %5 ... -# FIXME -# --- -# name: test_sub_s33 -# body: | -# bb.0: -# liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 +--- + name: test_sub_s33 + body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 -# %0:_(s64) = COPY $vgpr0_vgpr1 -# %1:_(s64) = COPY $vgpr2_vgpr3 -# %2:_(s33) = G_TRUNC %0 -# %3:_(s33) = G_TRUNC %1 -# %4:_(s33) = G_SUB %2, %3 -# %5:_(s64) = G_ANYEXT %4 -# $vgpr0_vgpr1 = COPY %5 -# ... 
+ ; GFX6-LABEL: name: test_sub_s33 + ; GFX6: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 + ; GFX6: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY]](s64) + ; GFX6: [[COPY3:%[0-9]+]]:_(s64) = COPY [[COPY1]](s64) + ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](s64) + ; GFX6: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY3]](s64) + ; GFX6: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s1) = G_USUBO [[UV]], [[UV2]] + ; GFX6: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV1]], [[UV3]], [[USUBO1]] + ; GFX6: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO]](s32), [[USUBE]](s32) + ; GFX6: [[COPY4:%[0-9]+]]:_(s64) = COPY [[MV]](s64) + ; GFX6: $vgpr0_vgpr1 = COPY [[COPY4]](s64) + ; GFX8-LABEL: name: test_sub_s33 + ; GFX8: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; GFX8: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 + ; GFX8: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY]](s64) + ; GFX8: [[COPY3:%[0-9]+]]:_(s64) = COPY [[COPY1]](s64) + ; GFX8: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](s64) + ; GFX8: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY3]](s64) + ; GFX8: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s1) = G_USUBO [[UV]], [[UV2]] + ; GFX8: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV1]], [[UV3]], [[USUBO1]] + ; GFX8: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO]](s32), [[USUBE]](s32) + ; GFX8: [[COPY4:%[0-9]+]]:_(s64) = COPY [[MV]](s64) + ; GFX8: $vgpr0_vgpr1 = COPY [[COPY4]](s64) + ; GFX9-LABEL: name: test_sub_s33 + ; GFX9: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; GFX9: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 + ; GFX9: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY]](s64) + ; GFX9: [[COPY3:%[0-9]+]]:_(s64) = COPY [[COPY1]](s64) + ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](s64) + ; GFX9: [[UV2:%[0-9]+]]:_(s32), 
[[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY3]](s64) + ; GFX9: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s1) = G_USUBO [[UV]], [[UV2]] + ; GFX9: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV1]], [[UV3]], [[USUBO1]] + ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO]](s32), [[USUBE]](s32) + ; GFX9: [[COPY4:%[0-9]+]]:_(s64) = COPY [[MV]](s64) + ; GFX9: $vgpr0_vgpr1 = COPY [[COPY4]](s64) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s64) = COPY $vgpr2_vgpr3 + %2:_(s33) = G_TRUNC %0 + %3:_(s33) = G_TRUNC %1 + %4:_(s33) = G_SUB %2, %3 + %5:_(s64) = G_ANYEXT %4 + $vgpr0_vgpr1 = COPY %5 +... --- name: test_sub_s96