diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -4451,6 +4451,10 @@
       return DAG.getConstant(0, DL, VT);
   }
 
+  // fold (mulhs c1, c2) -> constant, if FoldConstantArithmetic succeeds
+  if (SDValue C = DAG.FoldConstantArithmetic(ISD::MULHS, DL, VT, {N0, N1}))
+    return C;
+
   // fold (mulhs x, 0) -> 0
   if (isNullConstant(N1))
     return N1;
@@ -4499,6 +4503,10 @@
       return DAG.getConstant(0, DL, VT);
   }
 
+  // fold (mulhu c1, c2) -> constant, if FoldConstantArithmetic succeeds
+  if (SDValue C = DAG.FoldConstantArithmetic(ISD::MULHU, DL, VT, {N0, N1}))
+    return C;
+
   // fold (mulhu x, 0) -> 0
   if (isNullConstant(N1))
     return N1;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5081,6 +5081,18 @@
     if (!C2.getBoolValue())
       break;
     return C1.srem(C2);
+  case ISD::MULHS: { // upper half of the sign-extended 2x-width product
+    unsigned FullWidth = C1.getBitWidth() * 2;
+    APInt C1Ext = C1.sext(FullWidth);
+    APInt C2Ext = C2.sext(FullWidth);
+    return (C1Ext * C2Ext).extractBits(C1.getBitWidth(), C1.getBitWidth());
+  }
+  case ISD::MULHU: { // upper half of the zero-extended 2x-width product
+    unsigned FullWidth = C1.getBitWidth() * 2;
+    APInt C1Ext = C1.zext(FullWidth);
+    APInt C2Ext = C2.zext(FullWidth);
+    return (C1Ext * C2Ext).extractBits(C1.getBitWidth(), C1.getBitWidth());
+  }
   }
   return llvm::None;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-mulhs-const.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-mulhs-const.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-mulhs-const.ll
@@ -0,0 +1,31 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs -mcpu=gfx1030 < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}main:
+; The MULHS C1, C2 node is replaced by its folded constant, which shows up as the 0x4237 operand of the following add
+; GCN: s_add_u32 s0, 0x4237, s0
+
+define amdgpu_cs void @main(<4 x i32> %0, <4 x i32> %1) {
+main_body:
+  %2 = call nsz arcp <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %0, i32 0, i32 0, i32 0)
+  %3 = bitcast <2 x float> %2 to <2 x i32>
+  %4 = extractelement <2 x i32> %3, i32 0
+  %5 = insertelement <2 x i32> undef, i32 %4, i32 0
+  %6 = insertelement <2 x i32> %5, i32 undef, i32 1
+  %7 = bitcast <2 x i32> %6 to i64
+  %8 = mul i64 %7, 1000000
+  %9 = udiv i64 %8, 100000
+  %10 = bitcast i64 %9 to <2 x i32>
+  %11 = extractelement <2 x i32> %10, i32 1
+  %12 = select i1 false, i32 undef, i32 %11
+  %.not33 = icmp eq i32 0, 0
+  %13 = select i1 %.not33, i32 %12, i32 0
+  %14 = insertelement <2 x i32> undef, i32 %13, i32 1
+  %15 = bitcast <2 x i32> %14 to <2 x float>
+  call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %15, <4 x i32> %1, i32 0, i32 0, i32 0)
+  ret void
+}
+
+declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32>, i32, i32, i32 immarg) #0
+declare void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32 immarg) #1
+attributes #0 = { nounwind readonly willreturn }
+attributes #1 = { nounwind willreturn writeonly }