Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -530,13 +530,20 @@ if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { // Full set of gfx9 features. - getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) + getActionDefinitionsBuilder({G_ADD, G_SUB}) .legalFor({S32, S16, V2S16}) .clampScalar(0, S16, S32) .clampMaxNumElements(0, S16, 2) .scalarize(0) .widenScalarToNextPow2(0, 32); + getActionDefinitionsBuilder(G_MUL) + .legalFor({S32, S16, V2S16}) + .widenScalarToNextPow2(0, 32) + .clampScalar(0, S16, S32) + .clampMaxNumElements(0, S16, 2) + .scalarize(0); + getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) .legalFor({S32, S16, V2S16}) // Clamp modifier .minScalarOrElt(0, S16) @@ -545,12 +552,18 @@ .widenScalarToNextPow2(0, 32) .lower(); } else if (ST.has16BitInsts()) { - getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) + getActionDefinitionsBuilder({G_ADD, G_SUB}) .legalFor({S32, S16}) .clampScalar(0, S16, S32) .scalarize(0) .widenScalarToNextPow2(0, 32); // FIXME: min should be 16 + getActionDefinitionsBuilder(G_MUL) + .legalFor({S32, S16}) + .widenScalarToNextPow2(0, 32) + .clampScalar(0, S16, S32) + .scalarize(0); // FIXME: min should be 16 + // Technically the saturating operations require clamp bit support, but this // was introduced at the same time as 16-bit operations. getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) @@ -567,11 +580,17 @@ .scalarize(0) .lower(); } else { - getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) + getActionDefinitionsBuilder({G_ADD, G_SUB}) .legalFor({S32}) .clampScalar(0, S32, S32) .scalarize(0); + getActionDefinitionsBuilder(G_MUL) + .legalFor({S32}) + .widenScalarToNextPow2(0, 32) + .clampScalar(0, S32, S32) + .scalarize(0); + if (ST.hasIntClamp()) { getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) .legalFor({S32}) // Clamp modifier. Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-mul.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-mul.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-mul.mir @@ -500,21 +500,58 @@ $vgpr0 = COPY %5 ... -# FIXME: -# --- -# name: test_mul_s33 -# body: | -# bb.0: -# liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - -# %0:_(s64) = COPY $vgpr0_vgpr1 -# %1:_(s64) = COPY $vgpr2_vgpr3 -# %2:_(s33) = G_TRUNC %0 -# %3:_(s33) = G_TRUNC %1 -# %4:_(s33) = G_MUL %2, %3 -# %5:_(s64) = G_ANYEXT %4 -# $vgpr0_vgpr1 = COPY %5 -# ... +--- +name: test_mul_s33 +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX6-LABEL: name: test_mul_s33 + ; GFX6: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 + ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; GFX6: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) + ; GFX6: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV2]] + ; GFX6: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV2]] + ; GFX6: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV3]] + ; GFX6: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[UV]], [[UV2]] + ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[MUL1]], [[MUL2]] + ; GFX6: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[UMULH]] + ; GFX6: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[MUL]](s32), [[ADD1]](s32) + ; GFX6: $vgpr0_vgpr1 = COPY [[MV]](s64) + ; GFX8-LABEL: name: test_mul_s33 + ; GFX8: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; GFX8: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 + ; GFX8: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; GFX8: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) + ; GFX8: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV2]] + ; GFX8: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV2]] + ; GFX8: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV3]] + ; GFX8: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[UV]], [[UV2]] + ; GFX8: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[MUL1]], [[MUL2]] + ; GFX8: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[UMULH]] + ; GFX8: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[MUL]](s32), [[ADD1]](s32) + ; GFX8: $vgpr0_vgpr1 = COPY [[MV]](s64) + ; GFX9-LABEL: name: test_mul_s33 + ; GFX9: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; GFX9: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 + ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; GFX9: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) + ; GFX9: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV2]] + ; GFX9: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV2]] + ; GFX9: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV3]] + ; GFX9: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[UV]], [[UV2]] + ; GFX9: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[MUL1]], [[MUL2]] + ; GFX9: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[UMULH]] + ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[MUL]](s32), [[ADD1]](s32) + ; GFX9: $vgpr0_vgpr1 = COPY [[MV]](s64) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s64) = COPY $vgpr2_vgpr3 + %2:_(s33) = G_TRUNC %0 + %3:_(s33) = G_TRUNC %1 + %4:_(s33) = G_MUL %2, %3 + %5:_(s64) = G_ANYEXT %4 + $vgpr0_vgpr1 = COPY %5 +... --- name: test_mul_s96 @@ -526,6 +563,7 @@ ; GFX6: [[COPY:%[0-9]+]]:_(s96) = COPY $vgpr0_vgpr1_vgpr2 ; GFX6: [[COPY1:%[0-9]+]]:_(s96) = COPY $vgpr3_vgpr4_vgpr5 ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s96) + ; GFX6: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX6: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s96) ; GFX6: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV3]] ; GFX6: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV3]] @@ -541,17 +579,41 @@ ; GFX6: [[MUL5:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV5]] ; GFX6: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[UV1]], [[UV3]] ; GFX6: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[UV]], [[UV4]] - ; GFX6: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[MUL3]], [[MUL4]] - ; GFX6: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD1]], [[MUL5]] - ; GFX6: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[ADD2]], [[UMULH1]] - ; GFX6: [[ADD4:%[0-9]+]]:_(s32) = G_ADD [[ADD3]], [[UMULH2]] - ; GFX6: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[ADD4]], [[ADD]] - ; GFX6: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MUL]](s32), [[UADDO2]](s32), [[ADD5]](s32) + ; GFX6: [[UADDO4:%[0-9]+]]:_(s32), [[UADDO5:%[0-9]+]]:_(s1) = G_UADDO [[MUL3]], [[MUL4]] + ; GFX6: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO5]](s1) + ; GFX6: [[UADDO6:%[0-9]+]]:_(s32), [[UADDO7:%[0-9]+]]:_(s1) = G_UADDO [[UADDO4]], [[MUL5]] + ; GFX6: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO7]](s1) + ; GFX6: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ZEXT2]], [[ZEXT3]] + ; GFX6: [[UADDO8:%[0-9]+]]:_(s32), [[UADDO9:%[0-9]+]]:_(s1) = G_UADDO [[UADDO6]], [[UMULH1]] + ; GFX6: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO9]](s1) + ; GFX6: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD1]], [[ZEXT4]] + ; GFX6: [[UADDO10:%[0-9]+]]:_(s32), [[UADDO11:%[0-9]+]]:_(s1) = G_UADDO [[UADDO8]], [[UMULH2]] + ; GFX6: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO11]](s1) + ; GFX6: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[ADD2]], [[ZEXT5]] + ; GFX6: [[UADDO12:%[0-9]+]]:_(s32), [[UADDO13:%[0-9]+]]:_(s1) = G_UADDO [[UADDO10]], [[ADD]] + ; GFX6: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO13]](s1) + ; GFX6: [[ADD4:%[0-9]+]]:_(s32) = G_ADD [[ADD3]], [[ZEXT6]] + ; GFX6: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[DEF]], [[UV3]] + ; GFX6: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[UV2]], [[UV4]] + ; GFX6: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV5]] + ; GFX6: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[DEF]] + ; GFX6: [[UMULH3:%[0-9]+]]:_(s32) = G_UMULH [[UV2]], [[UV3]] + ; GFX6: [[UMULH4:%[0-9]+]]:_(s32) = G_UMULH [[UV1]], [[UV4]] + ; GFX6: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[UV]], [[UV5]] + ; GFX6: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[MUL6]], [[MUL7]] + ; GFX6: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[ADD5]], [[MUL8]] + ; GFX6: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[MUL9]] + ; GFX6: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH3]] + ; GFX6: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ADD8]], [[UMULH4]] + ; GFX6: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[UMULH5]] + ; GFX6: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ADD4]] + ; GFX6: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MUL]](s32), [[UADDO2]](s32), [[UADDO12]](s32) ; GFX6: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) ; GFX8-LABEL: name: test_mul_s96 ; GFX8: [[COPY:%[0-9]+]]:_(s96) = COPY $vgpr0_vgpr1_vgpr2 ; GFX8: [[COPY1:%[0-9]+]]:_(s96) = COPY $vgpr3_vgpr4_vgpr5 ; GFX8: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s96) + ; GFX8: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX8: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s96) ; GFX8: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV3]] ; GFX8: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV3]] @@ -567,17 +629,41 @@ ; GFX8: [[MUL5:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV5]] ; GFX8: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[UV1]], [[UV3]] ; GFX8: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[UV]], [[UV4]] - ; GFX8: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[MUL3]], [[MUL4]] - ; GFX8: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD1]], [[MUL5]] - ; GFX8: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[ADD2]], [[UMULH1]] - ; GFX8: [[ADD4:%[0-9]+]]:_(s32) = G_ADD [[ADD3]], [[UMULH2]] - ; GFX8: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[ADD4]], [[ADD]] - ; GFX8: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MUL]](s32), [[UADDO2]](s32), [[ADD5]](s32) + ; GFX8: [[UADDO4:%[0-9]+]]:_(s32), [[UADDO5:%[0-9]+]]:_(s1) = G_UADDO [[MUL3]], [[MUL4]] + ; GFX8: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO5]](s1) + ; GFX8: [[UADDO6:%[0-9]+]]:_(s32), [[UADDO7:%[0-9]+]]:_(s1) = G_UADDO [[UADDO4]], [[MUL5]] + ; GFX8: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO7]](s1) + ; GFX8: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ZEXT2]], [[ZEXT3]] + ; GFX8: [[UADDO8:%[0-9]+]]:_(s32), [[UADDO9:%[0-9]+]]:_(s1) = G_UADDO [[UADDO6]], [[UMULH1]] + ; GFX8: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO9]](s1) + ; GFX8: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD1]], [[ZEXT4]] + ; GFX8: [[UADDO10:%[0-9]+]]:_(s32), [[UADDO11:%[0-9]+]]:_(s1) = G_UADDO [[UADDO8]], [[UMULH2]] + ; GFX8: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO11]](s1) + ; GFX8: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[ADD2]], [[ZEXT5]] + ; GFX8: [[UADDO12:%[0-9]+]]:_(s32), [[UADDO13:%[0-9]+]]:_(s1) = G_UADDO [[UADDO10]], [[ADD]] + ; GFX8: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO13]](s1) + ; GFX8: [[ADD4:%[0-9]+]]:_(s32) = G_ADD [[ADD3]], [[ZEXT6]] + ; GFX8: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[DEF]], [[UV3]] + ; GFX8: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[UV2]], [[UV4]] + ; GFX8: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV5]] + ; GFX8: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[DEF]] + ; GFX8: [[UMULH3:%[0-9]+]]:_(s32) = G_UMULH [[UV2]], [[UV3]] + ; GFX8: [[UMULH4:%[0-9]+]]:_(s32) = G_UMULH [[UV1]], [[UV4]] + ; GFX8: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[UV]], [[UV5]] + ; GFX8: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[MUL6]], [[MUL7]] + ; GFX8: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[ADD5]], [[MUL8]] + ; GFX8: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[MUL9]] + ; GFX8: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH3]] + ; GFX8: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ADD8]], [[UMULH4]] + ; GFX8: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[UMULH5]] + ; GFX8: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ADD4]] + ; GFX8: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MUL]](s32), [[UADDO2]](s32), [[UADDO12]](s32) ; GFX8: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) ; GFX9-LABEL: name: test_mul_s96 ; GFX9: [[COPY:%[0-9]+]]:_(s96) = COPY $vgpr0_vgpr1_vgpr2 ; GFX9: [[COPY1:%[0-9]+]]:_(s96) = COPY $vgpr3_vgpr4_vgpr5 ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s96) + ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX9: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s96) ; GFX9: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV3]] ; GFX9: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV3]] @@ -593,12 +679,35 @@ ; GFX9: [[MUL5:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV5]] ; GFX9: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[UV1]], [[UV3]] ; GFX9: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[UV]], [[UV4]] - ; GFX9: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[MUL3]], [[MUL4]] - ; GFX9: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD1]], [[MUL5]] - ; GFX9: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[ADD2]], [[UMULH1]] - ; GFX9: [[ADD4:%[0-9]+]]:_(s32) = G_ADD [[ADD3]], [[UMULH2]] - ; GFX9: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[ADD4]], [[ADD]] - ; GFX9: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MUL]](s32), [[UADDO2]](s32), [[ADD5]](s32) + ; GFX9: [[UADDO4:%[0-9]+]]:_(s32), [[UADDO5:%[0-9]+]]:_(s1) = G_UADDO [[MUL3]], [[MUL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO5]](s1) + ; GFX9: [[UADDO6:%[0-9]+]]:_(s32), [[UADDO7:%[0-9]+]]:_(s1) = G_UADDO [[UADDO4]], [[MUL5]] + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO7]](s1) + ; GFX9: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ZEXT2]], [[ZEXT3]] + ; GFX9: [[UADDO8:%[0-9]+]]:_(s32), [[UADDO9:%[0-9]+]]:_(s1) = G_UADDO [[UADDO6]], [[UMULH1]] + ; GFX9: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO9]](s1) + ; GFX9: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD1]], [[ZEXT4]] + ; GFX9: [[UADDO10:%[0-9]+]]:_(s32), [[UADDO11:%[0-9]+]]:_(s1) = G_UADDO [[UADDO8]], [[UMULH2]] + ; GFX9: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO11]](s1) + ; GFX9: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[ADD2]], [[ZEXT5]] + ; GFX9: [[UADDO12:%[0-9]+]]:_(s32), [[UADDO13:%[0-9]+]]:_(s1) = G_UADDO [[UADDO10]], [[ADD]] + ; GFX9: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO13]](s1) + ; GFX9: [[ADD4:%[0-9]+]]:_(s32) = G_ADD [[ADD3]], [[ZEXT6]] + ; GFX9: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[DEF]], [[UV3]] + ; GFX9: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[UV2]], [[UV4]] + ; GFX9: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV5]] + ; GFX9: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[DEF]] + ; GFX9: [[UMULH3:%[0-9]+]]:_(s32) = G_UMULH [[UV2]], [[UV3]] + ; GFX9: [[UMULH4:%[0-9]+]]:_(s32) = G_UMULH [[UV1]], [[UV4]] + ; GFX9: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[UV]], [[UV5]] + ; GFX9: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[MUL6]], [[MUL7]] + ; GFX9: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[ADD5]], [[MUL8]] + ; GFX9: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[MUL9]] + ; GFX9: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH3]] + ; GFX9: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ADD8]], [[UMULH4]] + ; GFX9: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[UMULH5]] + ; GFX9: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ADD4]] + ; GFX9: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MUL]](s32), [[UADDO2]](s32), [[UADDO12]](s32) ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) %0:_(s96) = COPY $vgpr0_vgpr1_vgpr2 %1:_(s96) = COPY $vgpr3_vgpr4_vgpr5 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -276,6 +276,58 @@ ret <2 x i32> %result } + +define amdgpu_cs i33 @s_mul_i33(i33 inreg %num, i33 inreg %den) { +; GFX7-LABEL: s_mul_i33: +; GFX7: ; %bb.0: +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0 +; GFX7-NEXT: s_mul_i32 s4, s0, s2 +; GFX7-NEXT: s_mul_i32 s1, s1, s2 +; GFX7-NEXT: s_mul_i32 s0, s0, s3 +; GFX7-NEXT: s_add_i32 s1, s1, s0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s1, v0 +; GFX7-NEXT: v_readfirstlane_b32 s1, v0 +; GFX7-NEXT: s_mov_b32 s0, s4 +; GFX7-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_mul_i33: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 +; GFX8-NEXT: s_mul_i32 s4, s0, s2 +; GFX8-NEXT: s_mul_i32 s1, s1, s2 +; GFX8-NEXT: s_mul_i32 s0, s0, s3 +; GFX8-NEXT: s_add_i32 s1, s1, s0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s1, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v0 +; GFX8-NEXT: s_mov_b32 s0, s4 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_mul_i33: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mul_i32 s1, s1, s2 +; GFX9-NEXT: s_mul_i32 s3, s0, s3 +; GFX9-NEXT: s_mul_i32 s4, s0, s2 +; GFX9-NEXT: s_mul_hi_u32 s0, s0, s2 +; GFX9-NEXT: s_add_i32 s1, s1, s3 +; GFX9-NEXT: s_add_i32 s1, s1, s0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_mul_i33: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mul_i32 s1, s1, s2 +; GFX10-NEXT: s_mul_i32 s3, s0, s3 +; GFX10-NEXT: s_mul_hi_u32 s4, s0, s2 +; GFX10-NEXT: s_add_i32 s1, s1, s3 +; GFX10-NEXT: s_mul_i32 s0, s0, s2 +; GFX10-NEXT: s_add_i32 s1, s1, s4 +; GFX10-NEXT: ; return to shader part epilog + %result = mul i33 %num, %den + ret i33 %result +} + define amdgpu_ps i64 @s_mul_i64(i64 inreg %num, i64 inreg %den) { ; GFX7-LABEL: s_mul_i64: ; GFX7: ; %bb.0: @@ -394,8 +446,8 @@ ; GFX7-NEXT: s_cselect_b32 s8, 1, 0 ; GFX7-NEXT: s_mul_i32 s6, s0, s3 ; GFX7-NEXT: s_mul_i32 s5, s0, s5 -; GFX7-NEXT: s_add_i32 s0, s2, s7 -; GFX7-NEXT: s_add_i32 s0, s0, s5 +; GFX7-NEXT: s_add_u32 s0, s2, s7 +; GFX7-NEXT: s_add_u32 s0, s0, s5 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v2, vcc, s0, v2 ; GFX7-NEXT: s_and_b32 s8, s8, 1 @@ -424,8 +476,8 @@ ; GFX8-NEXT: s_cselect_b32 s8, 1, 0 ; GFX8-NEXT: s_mul_i32 s6, s0, s3 ; GFX8-NEXT: s_mul_i32 s5, s0, s5 -; GFX8-NEXT: s_add_i32 s0, s2, s7 -; GFX8-NEXT: s_add_i32 s0, s0, s5 +; GFX8-NEXT: s_add_u32 s0, s2, s7 +; GFX8-NEXT: s_add_u32 s0, s0, s5 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; GFX8-NEXT: s_and_b32 s8, s8, 1 @@ -452,14 +504,14 @@ ; GFX9-NEXT: s_mul_i32 s9, s1, s4 ; GFX9-NEXT: s_mul_i32 s2, s2, s3 ; GFX9-NEXT: s_mul_i32 s5, s0, s5 -; GFX9-NEXT: s_add_i32 s2, s2, s9 +; GFX9-NEXT: s_add_u32 s2, s2, s9 ; GFX9-NEXT: s_mul_hi_u32 s1, s1, s3 -; GFX9-NEXT: s_add_i32 s2, s2, s5 +; GFX9-NEXT: s_add_u32 s2, s2, s5 ; GFX9-NEXT: s_mul_i32 s6, s0, s3 ; GFX9-NEXT: s_mul_hi_u32 s0, s0, s4 -; GFX9-NEXT: s_add_i32 s1, s2, s1 -; GFX9-NEXT: s_add_i32 s0, s1, s0 -; GFX9-NEXT: s_add_i32 s2, s0, s8 +; GFX9-NEXT: s_add_u32 s1, s2, s1 +; GFX9-NEXT: s_add_u32 s0, s1, s0 +; GFX9-NEXT: s_add_u32 s2, s0, s8 ; GFX9-NEXT: s_mov_b32 s0, s6 ; GFX9-NEXT: s_mov_b32 s1, s7 ; GFX9-NEXT: ; return to shader part epilog @@ -477,16 +529,16 @@ ; GFX10-NEXT: s_add_u32 s6, s6, s8 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 ; GFX10-NEXT: s_mul_i32 s5, s0, s5 -; GFX10-NEXT: s_add_i32 s2, s2, s9 +; GFX10-NEXT: s_add_u32 s2, s2, s9 ; GFX10-NEXT: s_mul_hi_u32 s1, s1, s3 -; GFX10-NEXT: s_add_i32 s2, s2, s5 +; GFX10-NEXT: s_add_u32 s2, s2, s5 ; GFX10-NEXT: s_and_b32 s8, s8, 1 ; GFX10-NEXT: s_mul_hi_u32 s4, s0, s4 -; GFX10-NEXT: s_add_i32 s1, s2, s1 +; GFX10-NEXT: s_add_u32 s1, s2, s1 ; GFX10-NEXT: s_add_i32 s7, s7, s8 -; GFX10-NEXT: s_add_i32 s1, s1, s4 +; GFX10-NEXT: s_add_u32 s1, s1, s4 ; GFX10-NEXT: s_mul_i32 s0, s0, s3 -; GFX10-NEXT: s_add_i32 s2, s1, s7 +; GFX10-NEXT: s_add_u32 s2, s1, s7 ; GFX10-NEXT: s_mov_b32 s1, s6 ; GFX10-NEXT: ; return to shader part epilog %result = mul i96 %num, %den @@ -553,20 +605,22 @@ ; GFX9-NEXT: v_mul_lo_u32 v7, v1, v3 ; GFX9-NEXT: v_mul_lo_u32 v8, v0, v4 ; GFX9-NEXT: v_mul_hi_u32 v9, v0, v3 -; GFX9-NEXT: v_mul_lo_u32 v10, v1, v4 ; GFX9-NEXT: v_mul_lo_u32 v2, v2, v3 ; GFX9-NEXT: v_mul_lo_u32 v5, v0, v5 -; GFX9-NEXT: v_mul_hi_u32 v1, v1, v3 ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 -; GFX9-NEXT: v_mul_lo_u32 v6, v0, v3 -; GFX9-NEXT: v_mul_hi_u32 v0, v0, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX9-NEXT: v_add_u32_e32 v2, v2, v10 -; GFX9-NEXT: v_add_u32_e32 v3, v8, v9 -; GFX9-NEXT: v_add3_u32 v1, v2, v5, v1 -; GFX9-NEXT: v_add3_u32 v2, v1, v0, v3 +; GFX9-NEXT: v_add_u32_e32 v8, v8, v9 +; GFX9-NEXT: v_mul_lo_u32 v9, v1, v4 +; GFX9-NEXT: v_mul_hi_u32 v1, v1, v3 +; GFX9-NEXT: v_mul_lo_u32 v6, v0, v3 +; GFX9-NEXT: v_mul_hi_u32 v0, v0, v4 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v9 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v2, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v1, v0 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v0, v8 ; GFX9-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-NEXT: v_mov_b32_e32 v1, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -575,23 +629,25 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mul_lo_u32 v6, v1, v3 -; GFX10-NEXT: v_mul_lo_u32 v7, v0, v4 -; GFX10-NEXT: v_mul_hi_u32 v8, v0, v3 -; GFX10-NEXT: v_mul_lo_u32 v9, v1, v4 ; GFX10-NEXT: v_mul_lo_u32 v2, v2, v3 +; GFX10-NEXT: v_mul_lo_u32 v6, v1, v4 +; GFX10-NEXT: v_mul_lo_u32 v8, v0, v4 +; GFX10-NEXT: v_mul_lo_u32 v7, v1, v3 ; GFX10-NEXT: v_mul_lo_u32 v5, v0, v5 +; GFX10-NEXT: v_mul_hi_u32 v9, v0, v3 ; GFX10-NEXT: v_mul_hi_u32 v4, v0, v4 ; GFX10-NEXT: v_mul_lo_u32 v0, v0, v3 -; GFX10-NEXT: v_add_co_u32 v6, s4, v6, v7 -; GFX10-NEXT: v_mul_hi_u32 v7, v1, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s4 -; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v9 -; GFX10-NEXT: v_add_co_u32 v1, s4, v6, v8 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s4 -; GFX10-NEXT: v_add3_u32 v2, v2, v5, v7 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v10, v6 -; GFX10-NEXT: v_add3_u32 v2, v2, v4, v3 +; GFX10-NEXT: v_add_co_u32 v2, s4, v2, v6 +; GFX10-NEXT: v_add_co_u32 v6, s4, v7, v8 +; GFX10-NEXT: v_mul_hi_u32 v8, v1, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s4 +; GFX10-NEXT: v_add_co_u32 v2, s4, v2, v5 +; GFX10-NEXT: v_add_co_u32 v1, s4, v6, v9 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4 +; GFX10-NEXT: v_add_co_u32 v2, s4, v2, v8 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v7, v5 +; GFX10-NEXT: v_add_co_u32 v2, s4, v2, v4 +; GFX10-NEXT: v_add_co_u32 v2, s4, v2, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = mul i96 %num, %den ret i96 %result