Index: llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
===================================================================
--- llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -4061,6 +4061,13 @@
   } else
     return nullptr;
 
+  // Bail out if the type is not desirable. The intrinsic might be lowered as a
+  // function call if the target can't handle the type. We allow the transform
+  // for vectors under the assumption that those will always be expanded inline.
+  Type *Ty = X->getType();
+  if (Ty->isIntegerTy() && !isDesirableIntType(Ty->getPrimitiveSizeInBits()))
+    return nullptr;
+
   BuilderTy::InsertPointGuard Guard(Builder);
   // If the pattern included (x * y), we'll want to insert new instructions
   // right before that original multiplication so that we can replace it.
@@ -4072,7 +4079,7 @@
       Div->getOpcode() == Instruction::UDiv
           ? Intrinsic::umul_with_overflow
           : Intrinsic::smul_with_overflow,
-      X->getType());
+      Ty);
   CallInst *Call = Builder.CreateCall(F, {X, Y}, "mul");
 
   // If the multiplication was used elsewhere, to ensure that we don't leave
Index: llvm/test/Transforms/PhaseOrdering/AArch64/mul-ov.ll
===================================================================
--- llvm/test/Transforms/PhaseOrdering/AArch64/mul-ov.ll
+++ llvm/test/Transforms/PhaseOrdering/AArch64/mul-ov.ll
@@ -1,28 +1,34 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -passes="default" -S < %s | FileCheck %s
 
+; PR56403
+; We do not want to form a mul-with-overflow intrinsic here because
+; that could be lowered to a runtime libcall: __muloti4!
+
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-unknown-linux-unknown"
 
 define i128 @__muloti4(i128 %0, i128 %1, i32* nonnull align 4 %2) {
 ; CHECK-LABEL: @__muloti4(
 ; CHECK-NEXT:  Entry:
-; CHECK-NEXT:    [[DOTFR:%.*]] = freeze i128 [[TMP1:%.*]]
 ; CHECK-NEXT:    store i32 0, i32* [[TMP2:%.*]], align 4
-; CHECK-NEXT:    [[MUL:%.*]] = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 [[TMP0:%.*]], i128 [[DOTFR]])
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp slt i128 [[TMP0]], 0
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i128 [[DOTFR]], -170141183460469231731687303715884105728
-; CHECK-NEXT:    [[TMP5:%.*]] = and i1 [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    br i1 [[TMP5]], label [[THEN7:%.*]], label [[ELSE2:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i128 [[TMP1:%.*]], [[TMP0:%.*]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp slt i128 [[TMP0]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i128 [[TMP1]], -170141183460469231731687303715884105728
+; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP4]], i1 [[TMP5]], i1 false
+; CHECK-NEXT:    br i1 [[TMP6]], label [[THEN7:%.*]], label [[ELSE2:%.*]]
 ; CHECK:       Else2:
-; CHECK-NEXT:    [[MUL_OV:%.*]] = extractvalue { i128, i1 } [[MUL]], 1
-; CHECK-NEXT:    br i1 [[MUL_OV]], label [[THEN7]], label [[BLOCK9:%.*]]
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i128 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[DOTNOT]], label [[BLOCK9:%.*]], label [[THEN3:%.*]]
+; CHECK:       Then3:
+; CHECK-NEXT:    [[TMP7:%.*]] = sdiv i128 [[TMP3]], [[TMP0]]
+; CHECK-NEXT:    [[DOTNOT3:%.*]] = icmp eq i128 [[TMP7]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[DOTNOT3]], label [[BLOCK9]], label [[THEN7]]
 ; CHECK:       Then7:
 ; CHECK-NEXT:    store i32 1, i32* [[TMP2]], align 4
 ; CHECK-NEXT:    br label [[BLOCK9]]
 ; CHECK:       Block9:
-; CHECK-NEXT:    [[MUL_VAL:%.*]] = extractvalue { i128, i1 } [[MUL]], 0
-; CHECK-NEXT:    ret i128 [[MUL_VAL]]
+; CHECK-NEXT:    ret i128 [[TMP3]]
 ;
 Entry:
   %3 = alloca i128, align 16
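For context: PR56403 is about this transform firing inside compiler-rt's own `__muloti4`. Forming `@llvm.smul.with.overflow.i128` there is fatal, because targets like AArch64 lower the i128 intrinsic to a `__muloti4` libcall, so the runtime routine would end up calling itself. Below is a hedged C sketch of the divide-back overflow-check pattern the test exercises; the function name `mulo_sketch`, the `main` driver, and the sample values are illustrative assumptions, not the actual compiler-rt sources.

```c
#include <stdio.h>

/* Sketch of the divide-back overflow check that InstCombine matches.
 * Before this patch, the multiply plus the `res / a != b` comparison could
 * be folded into @llvm.smul.with.overflow.i128, which AArch64 lowers to a
 * __muloti4 libcall -- a self-call if this function *is* __muloti4. */
static __int128 mulo_sketch(__int128 a, __int128 b, int *overflow) {
  /* INT128_MIN; computed via unsigned shift to avoid signed-shift UB. */
  const __int128 min128 = (__int128)((unsigned __int128)1 << 127);
  *overflow = 0;
  /* Multiply through unsigned to sidestep signed-overflow UB at the C
   * level; the (possibly wrapped) product is still the value returned. */
  __int128 res = (__int128)((unsigned __int128)b * (unsigned __int128)a);
  if (a < 0 && b == min128) {
    *overflow = 1; /* always overflows, and res / a could trap (min / -1) */
  } else if (a != 0 && res / a != b) {
    *overflow = 1; /* divide-back check: the product wrapped */
  }
  return res;
}

int main(void) {
  int ov;
  __int128 r = mulo_sketch((__int128)1 << 100, 1 << 30, &ov);
  (void)r;
  printf("overflow = %d\n", ov); /* 2^130 does not fit in i128: prints 1 */
  return 0;
}
```

With the patch applied, `isDesirableIntType` rejects i128 on this target, so `opt -passes=default` keeps the plain `mul`/`sdiv` sequence in the test instead of introducing the intrinsic, exactly as the updated CHECK lines assert.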