diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -4462,6 +4462,10 @@
     return DAG.getConstant(0, DL, VT);
   }
 
+  // fold (mulhs c1, c2)
+  if (SDValue C = DAG.FoldConstantArithmetic(ISD::MULHS, DL, VT, {N0, N1}))
+    return C;
+
   // fold (mulhs x, 0) -> 0
   if (isNullConstant(N1))
     return N1;
@@ -4510,6 +4514,10 @@
     return DAG.getConstant(0, DL, VT);
   }
 
+  // fold (mulhu c1, c2)
+  if (SDValue C = DAG.FoldConstantArithmetic(ISD::MULHU, DL, VT, {N0, N1}))
+    return C;
+
   // fold (mulhu x, 0) -> 0
   if (isNullConstant(N1))
     return N1;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5079,6 +5079,18 @@
     if (!C2.getBoolValue())
       break;
     return C1.srem(C2);
+  case ISD::MULHS: {
+    unsigned FullWidth = C1.getBitWidth() * 2;
+    APInt C1Ext = C1.sext(FullWidth);
+    APInt C2Ext = C2.sext(FullWidth);
+    return (C1Ext * C2Ext).extractBits(C1.getBitWidth(), C1.getBitWidth());
+  }
+  case ISD::MULHU: {
+    unsigned FullWidth = C1.getBitWidth() * 2;
+    APInt C1Ext = C1.zext(FullWidth);
+    APInt C2Ext = C2.zext(FullWidth);
+    return (C1Ext * C2Ext).extractBits(C1.getBitWidth(), C1.getBitWidth());
+  }
   }
   return llvm::None;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -203,15 +203,9 @@
 
 define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
 ; GCN-LABEL: v_test_udiv64_mulhi_fold
-; GFX1030: s_mov_b32 [[VAL1:s[0-9]+]], 0xa9000000
-; GFX1030: s_brev_b32 [[VAL2:s[0-9]+]], 6
-; GFX1030: s_movk_i32 [[VAL3:s[0-9]+]], 0x500
-; GFX1030: s_mul_hi_u32 s7, [[VAL1]], [[VAL2]]
-; GFX1030: s_mov_b32 [[VAL4:s[0-9]+]], 0xa7c5
-; GFX1030: s_mul_hi_u32 s8, [[VAL1]], [[VAL3]]
-; GFX1030: s_mul_hi_u32 s5, [[VAL4]], [[VAL2]]
-; GFX1030: s_mul_hi_u32 s6, [[VAL4]], [[VAL3]]
-; GFX1030: v_add_co_u32 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GFX1030: s_add_u32 [[VAL:s[0-9]+]], 0x4237, s{{[0-9]+}}
+; GFX1030-NOT: s_mul_hi_u32
+; GFX1030: v_add_co_u32 v{{[0-9]+}}, [[VAL]], 0xa9000000, [[VAL]]
   %d = udiv i64 %arg, 100000
   ret i64 %d
 }
diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll
--- a/llvm/test/CodeGen/X86/pmulh.ll
+++ b/llvm/test/CodeGen/X86/pmulh.ll
@@ -2190,14 +2190,12 @@
 define <8 x i16> @sse2_pmulh_w_const(<8 x i16> %a0, <8 x i16> %a1) {
 ; SSE-LABEL: sse2_pmulh_w_const:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,65534,65533,65532,65531,65530,65529,0]
-; SSE-NEXT:    pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,0]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: sse2_pmulh_w_const:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [65535,65534,65533,65532,65531,65530,65529,0]
-; AVX-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,0]
 ; AVX-NEXT:    retq
   %res = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> <i16 -1, i16 -2, i16 -3, i16 -4, i16 -5, i16 -6, i16 -7, i16 0>, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
   ret <8 x i16> %res
@@ -2207,14 +2205,12 @@
 define <8 x i16> @sse2_pmulhu_w_const(<8 x i16> %a0, <8 x i16> %a1) {
 ; SSE-LABEL: sse2_pmulhu_w_const:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,65534,65533,65532,65531,65530,65529,0]
-; SSE-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,0,1,2,3,4,5,0]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: sse2_pmulhu_w_const:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [65535,65534,65533,65532,65531,65530,65529,0]
-; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [0,0,1,2,3,4,5,0]
 ; AVX-NEXT:    retq
   %res = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> <i16 -1, i16 -2, i16 -3, i16 -4, i16 -5, i16 -6, i16 -7, i16 0>, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
   ret <8 x i16> %res
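
Note on the fold itself: MULHS/MULHU of two constants is "extend both operands to twice the width, multiply, keep the upper half", which is what the new FoldValue cases implement with APInt. Below is a minimal standalone sketch of that arithmetic at 16 bits (plain C++, no LLVM dependency; the helper names mulhs16 and mulhu16 are illustrative, not part of LLVM), which reproduces the vector constants the updated pmulh.ll CHECK lines expect.

#include <cassert>
#include <cstdint>

// MULHS sketch: sign-extend to 32 bits, multiply, take bits [31:16],
// mirroring sext + mul + extractBits in the MULHS case of FoldValue.
static int16_t mulhs16(int16_t a, int16_t b) {
  int32_t prod = int32_t(a) * int32_t(b);
  return int16_t(uint32_t(prod) >> 16);
}

// MULHU sketch: zero-extend to 32 bits, multiply, take bits [31:16].
static uint16_t mulhu16(uint16_t a, uint16_t b) {
  uint32_t prod = uint32_t(a) * uint32_t(b);
  return uint16_t(prod >> 16);
}

int main() {
  // Signed: -2 * 1 = -2, so the high 16 bits of the 32-bit product are all
  // ones -- the 65535 entries in the [0,65535,...,0] constant that
  // sse2_pmulh_w_const now expects.
  assert(mulhs16(-2, 1) == -1);
  // Unsigned: 65533 * 2 = 131066 = 0x1FFFA, whose high half is 1, matching
  // the [0,0,1,2,3,4,5,0] constant in sse2_pmulhu_w_const.
  assert(mulhu16(65533, 2) == 1);
  return 0;
}

Widening to exactly twice the bit width is lossless: the product of two w-bit values always fits in 2w bits, so extractBits(w, w) recovers precisely the high half a hardware mulh instruction would produce.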