diff --git a/llvm/lib/CodeGen/MachineCombiner.cpp b/llvm/lib/CodeGen/MachineCombiner.cpp --- a/llvm/lib/CodeGen/MachineCombiner.cpp +++ b/llvm/lib/CodeGen/MachineCombiner.cpp @@ -198,15 +198,19 @@ DepthOp = InstrDepth[II->second]; int DefIdx = DefInstr->findRegisterDefOperandIdx(MO.getReg()); int UseIdx = InstrPtr->findRegisterUseOperandIdx(MO.getReg()); - LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx, + // Add latency if DefInstr is a real instruction. Transients get latency 0. + if (!DefInstr->isTransient()) + LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx, InstrPtr, UseIdx); } else { MachineInstr *DefInstr = getOperandDef(MO); if (DefInstr) { DepthOp = BlockTrace.getInstrCycles(*DefInstr).Depth; - LatencyOp = TSchedModel.computeOperandLatency( - DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()), - InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg())); + // Add latency if DefInstr is a real instruction. Transients get latency 0. + if (!DefInstr->isTransient()) + LatencyOp = TSchedModel.computeOperandLatency( + DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()), + InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg())); } } IDepth = std::max(IDepth, DepthOp + LatencyOp); diff --git a/llvm/test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir b/llvm/test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir --- a/llvm/test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir +++ b/llvm/test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir @@ -22,10 +22,6 @@ RET_ReallyLR implicit $d0 ... -# UNPROFITABLE-LABEL: name: f1_2s -# UNPROFITABLE: %3:fpr64 = FMULv2f32 %0, %1 -# UNPROFITABLE-NEXT: FSUBv2f32 killed %3, %2 -# # PROFITABLE-LABEL: name: f1_2s # PROFITABLE: [[R1:%[0-9]+]]:fpr64 = FNEGv2f32 %2 # PROFITABLE-NEXT: FMLAv2f32 killed [[R1]], %0, %1 diff --git a/llvm/test/CodeGen/AArch64/neon-mla-mls.ll b/llvm/test/CodeGen/AArch64/neon-mla-mls.ll --- a/llvm/test/CodeGen/AArch64/neon-mla-mls.ll +++ b/llvm/test/CodeGen/AArch64/neon-mla-mls.ll @@ -138,8 +138,9 @@ define <8 x i8> @mls2v8xi8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) { ; CHECK-LABEL: mls2v8xi8: ; CHECK: // %bb.0: -; CHECK-NEXT: mul v0.8b, v0.8b, v1.8b -; CHECK-NEXT: sub v0.8b, v0.8b, v2.8b +; CHECK-NEXT: neg v2.8b, v2.8b +; CHECK-NEXT: mla v2.8b, v0.8b, v1.8b +; CHECK-NEXT: fmov d0, d2 ; CHECK-NEXT: ret %tmp1 = mul <8 x i8> %A, %B; %tmp2 = sub <8 x i8> %tmp1, %C; @@ -149,8 +150,9 @@ define <16 x i8> @mls2v16xi8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) { ; CHECK-LABEL: mls2v16xi8: ; CHECK: // %bb.0: -; CHECK-NEXT: mul v0.16b, v0.16b, v1.16b -; CHECK-NEXT: sub v0.16b, v0.16b, v2.16b +; CHECK-NEXT: neg v2.16b, v2.16b +; CHECK-NEXT: mla v2.16b, v0.16b, v1.16b +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret %tmp1 = mul <16 x i8> %A, %B; %tmp2 = sub <16 x i8> %tmp1, %C; @@ -160,8 +162,9 @@ define <4 x i16> @mls2v4xi16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C) { ; CHECK-LABEL: mls2v4xi16: ; CHECK: // %bb.0: -; CHECK-NEXT: mul v0.4h, v0.4h, v1.4h -; CHECK-NEXT: sub v0.4h, v0.4h, v2.4h +; CHECK-NEXT: neg v2.4h, v2.4h +; CHECK-NEXT: mla v2.4h, v0.4h, v1.4h +; CHECK-NEXT: fmov d0, d2 ; CHECK-NEXT: ret %tmp1 = mul <4 x i16> %A, %B; %tmp2 = sub <4 x i16> %tmp1, %C; @@ -171,8 +174,9 @@ define <8 x i16> @mls2v8xi16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C) { ; CHECK-LABEL: mls2v8xi16: ; CHECK: // %bb.0: -; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h -; CHECK-NEXT: sub v0.8h, v0.8h, v2.8h +; CHECK-NEXT: neg v2.8h, v2.8h +; CHECK-NEXT: mla v2.8h, v0.8h, v1.8h +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret %tmp1 = mul <8 x i16> %A, %B; %tmp2 = sub <8 x i16> %tmp1, %C; @@ -182,8 +186,9 @@ define <2 x i32> @mls2v2xi32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) { ; CHECK-LABEL: mls2v2xi32: ; CHECK: // %bb.0: -; CHECK-NEXT: mul v0.2s, v0.2s, v1.2s -; CHECK-NEXT: sub v0.2s, v0.2s, v2.2s +; CHECK-NEXT: neg v2.2s, v2.2s +; CHECK-NEXT: mla v2.2s, v0.2s, v1.2s +; CHECK-NEXT: fmov d0, d2 ; CHECK-NEXT: ret %tmp1 = mul <2 x i32> %A, %B; %tmp2 = sub <2 x i32> %tmp1, %C; @@ -193,8 +198,9 @@ define <4 x i32> @mls2v4xi32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { ; CHECK-LABEL: mls2v4xi32: ; CHECK: // %bb.0: -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s +; CHECK-NEXT: neg v2.4s, v2.4s +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret %tmp1 = mul <4 x i32> %A, %B; %tmp2 = sub <4 x i32> %tmp1, %C;