Index: llvm/lib/CodeGen/MachineCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/MachineCombiner.cpp
+++ llvm/lib/CodeGen/MachineCombiner.cpp
@@ -71,6 +71,7 @@
   improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
                           MachineTraceMetrics::Trace BlockTrace,
                           SmallVectorImpl<MachineInstr *> &InsInstrs,
+                          SmallVectorImpl<MachineInstr *> &DelInstrs,
                           DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                           MachineCombinerPattern Pattern);
   bool preservesResourceLen(MachineBasicBlock *MBB,
@@ -242,6 +243,7 @@
     MachineBasicBlock *MBB, MachineInstr *Root,
     MachineTraceMetrics::Trace BlockTrace,
     SmallVectorImpl<MachineInstr *> &InsInstrs,
+    SmallVectorImpl<MachineInstr *> &DelInstrs,
     DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
     MachineCombinerPattern Pattern) {
   assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
@@ -269,8 +271,13 @@
   // A more flexible cost calculation for the critical path includes the slack
   // of the original code sequence. This may allow the transform to proceed
   // even if the instruction depths (data dependency cycles) become worse.
+
   unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace);
-  unsigned RootLatency = TSchedModel.computeInstrLatency(Root);
+  unsigned RootLatency = 0;
+
+  for (auto I : DelInstrs)
+    RootLatency += TSchedModel.computeInstrLatency(I);
+
   unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
 
   DEBUG(dbgs() << " NewRootLatency: " << NewRootLatency << "\n";
@@ -421,7 +428,7 @@
     // resource pressure.
     if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount) ||
         (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs,
-                                 InstrIdxForVirtReg, P) &&
+                                 DelInstrs, InstrIdxForVirtReg, P) &&
          preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) {
       for (auto *InstrPtr : InsInstrs)
         MBB->insert((MachineBasicBlock::iterator) &MI, InstrPtr);
Index: llvm/test/CodeGen/AArch64/arm64-fma-combines.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-fma-combines.ll
+++ llvm/test/CodeGen/AArch64/arm64-fma-combines.ll
@@ -2,7 +2,7 @@
 define void @foo_2d(double* %src) {
 ; CHECK-LABEL: %entry
 ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK: fmadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 entry:
   %arrayidx1 = getelementptr inbounds double, double* %src, i64 5
   %arrayidx2 = getelementptr inbounds double, double* %src, i64 11
Index: llvm/test/CodeGen/AArch64/machine-combiner_madd_during_address_computation.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/machine-combiner_madd_during_address_computation.ll
@@ -0,0 +1,66 @@
+; Converted from a machine-reduced C++ test case because hand-written test cases passed even without the compiler improvement.
+
+; test all AArch64 subarches known as of Dec. 7, 2016 to have scheduling models
+; ----------------------------------------------------------------------------
+; RUN: llc -O=3 -mtriple=aarch64-linux-gnu -mcpu=cortex-a57 -o - %s | FileCheck %s
+; RUN: llc -O=3 -mtriple=aarch64-linux-gnu -mcpu=cortex-a72 -o - %s | FileCheck %s
+; RUN: llc -O=3 -mtriple=aarch64-linux-gnu -mcpu=cortex-a73 -o - %s | FileCheck %s
+; RUN: llc -O=3 -mtriple=aarch64-linux-gnu -mcpu=cyclone -o - %s | FileCheck %s
+; RUN: llc -O=3 -mtriple=aarch64-linux-gnu -mcpu=exynos-m1 -o - %s | FileCheck %s
+; RUN: llc -O=3 -mtriple=aarch64-linux-gnu -mcpu=exynos-m2 -o - %s | FileCheck %s
+; RUN: llc -O=3 -mtriple=aarch64-linux-gnu -mcpu=kryo -o - %s | FileCheck %s
+; RUN: llc -O=3 -mtriple=aarch64-linux-gnu -mcpu=vulcan -o - %s | FileCheck %s
+
+; CHECK-NOT: mul
+; CHECK: madd
+; CHECK-NOT: mul
+
+target triple = "aarch64-sarc-linux-gnu"
+
+%class.BtlConfig = type { %class.C }
+%class.C = type { %class.B }
+%class.B = type { %class.D* }
+%class.D = type { %class.basic_string.base, [4 x i8] }
+%class.basic_string.base = type <{ i64, i64, i32 }>
+%class.basic_string = type <{ i64, i64, i32, [4 x i8] }>
+@a = global %class.BtlConfig zeroinitializer, align 8
+@.str = private unnamed_addr constant [1 x i8] zeroinitializer, align 1
+declare i64 @_ZN1CI1D1AIS0_EE5m_fn1Ev(%class.C*) local_unnamed_addr
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1)
+define internal void @_GLOBAL__sub_I_main_adv.ii() section ".text.startup" {
+entry:
+  %tmp.i.i = alloca %class.D, align 8
+  %agg.tmp.i.i = alloca %class.D, align 8
+  %0 = bitcast %class.D* %tmp.i.i to i8*
+  %1 = bitcast %class.D* %agg.tmp.i.i to i8*
+  %call8.i.i = tail call i64 @_ZN1CI1D1AIS0_EE5m_fn1Ev(%class.C* getelementptr inbounds (%class.BtlConfig, %class.BtlConfig* @a, i64 0, i32 0))
+  %cmp9.i.i = icmp sgt i64 %call8.i.i, 0
+  br i1 %cmp9.i.i, label %for.body.lr.ph.i.i, label %__cxx_global_var_init.exit
+for.body.lr.ph.i.i:
+  %2 = bitcast %class.D* %agg.tmp.i.i to %class.basic_string*
+  br label %for.body.i.i
+for.body.i.i:
+  %conv11.i.i = phi i64 [ 0, %for.body.lr.ph.i.i ], [ %conv.i.i, %for.body.i.i ]
+  %i.010.i.i = phi i32 [ undef, %for.body.lr.ph.i.i ], [ %inc.i.i, %for.body.i.i ]
+  %3 = load %class.D*, %class.D** getelementptr inbounds (%class.BtlConfig, %class.BtlConfig* @a, i64 0, i32 0, i32 0, i32 0), align 8, !tbaa !1, !noalias !6
+  %arrayidx.i.i.i = getelementptr inbounds %class.D, %class.D* %3, i64 %conv11.i.i
+  %4 = bitcast %class.D* %arrayidx.i.i.i to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull %0, i8* %4, i64 24, i32 8, i1 false)
+  %inc.i.i = add i32 %i.010.i.i, 1
+  %conv.i.i = zext i32 %inc.i.i to i64
+  %call.i.i = call i64 @_ZN1CI1D1AIS0_EE5m_fn1Ev(%class.C* getelementptr inbounds (%class.BtlConfig, %class.BtlConfig* @a, i64 0, i32 0))
+  %cmp.i.i = icmp slt i64 %conv.i.i, %call.i.i
+  br i1 %cmp.i.i, label %for.body.i.i, label %__cxx_global_var_init.exit.loopexit
+__cxx_global_var_init.exit.loopexit:
+  br label %__cxx_global_var_init.exit
+__cxx_global_var_init.exit:
+  ret void
+}
+!1 = !{!2, !3, i64 0}
+!2 = !{!"foo", !3, i64 0}
+!3 = !{!"bar", !4, i64 0}
+!4 = !{!"baz", !5, i64 0}
+!5 = !{!"boo"}
+!6 = !{!7}
+!7 = distinct !{!7, !8, !"_ZN1CI1D1AIS0_EEixEl: %agg.result"}
+!8 = distinct !{!8, !"_ZN1CI1D1AIS0_EEixEl"}
Index: llvm/test/CodeGen/AArch64/mul-lohi.ll
===================================================================
--- llvm/test/CodeGen/AArch64/mul-lohi.ll
+++ llvm/test/CodeGen/AArch64/mul-lohi.ll
@@ -3,16 +3,18 @@
 
 define i128 @test_128bitmul(i128 %lhs, i128 %rhs) {
 ; CHECK-LABEL: test_128bitmul:
-; CHECK-DAG: mul [[PART1:x[0-9]+]], x0, x3
-; CHECK-DAG: umulh [[CARRY:x[0-9]+]], x0, x2
-; CHECK: mul [[PART2:x[0-9]+]], x1, x2
-; CHECK: mul x0, x0, x2
+; CHECK: umulh [[HI:x[0-9]+]], x0, x2
+; CHECK: madd [[TEMP1:x[0-9]+]], x0, x3, [[HI]]
+; CHECK-DAG: madd x1, x1, x2, [[TEMP1]]
+; CHECK-DAG: mul x0, x0, x2
+; CHECK-NEXT: ret
 
 ; CHECK-BE-LABEL: test_128bitmul:
-; CHECK-BE-DAG: mul [[PART1:x[0-9]+]], x1, x2
-; CHECK-BE-DAG: umulh [[CARRY:x[0-9]+]], x1, x3
-; CHECK-BE: mul [[PART2:x[0-9]+]], x0, x3
-; CHECK-BE: mul x1, x1, x3
+; CHECK-BE: umulh [[HI:x[0-9]+]], x1, x3
+; CHECK-BE: madd [[TEMP1:x[0-9]+]], x1, x2, [[HI]]
+; CHECK-BE-DAG: madd x0, x0, x3, [[TEMP1]]
+; CHECK-BE-DAG: mul x1, x1, x3
+; CHECK-BE-NEXT: ret
 
   %prod = mul i128 %lhs, %rhs
   ret i128 %prod
@@ -25,8 +27,8 @@
 ; CHECK-LABEL: test_128bitmul_optsize:
 ; CHECK: umulh [[HI:x[0-9]+]], x0, x2
 ; CHECK-NEXT: madd [[TEMP1:x[0-9]+]], x0, x3, [[HI]]
-; CHECK-NEXT: madd x1, x1, x2, [[TEMP1]]
-; CHECK-NEXT: mul x0, x0, x2
+; CHECK-DAG: madd x1, x1, x2, [[TEMP1]]
+; CHECK-DAG: mul x0, x0, x2
 ; CHECK-NEXT: ret
 
   %prod = mul i128 %lhs, %rhs
@@ -37,8 +39,8 @@
 ; CHECK-LABEL: test_128bitmul_minsize:
 ; CHECK: umulh [[HI:x[0-9]+]], x0, x2
 ; CHECK-NEXT: madd [[TEMP1:x[0-9]+]], x0, x3, [[HI]]
-; CHECK-NEXT: madd x1, x1, x2, [[TEMP1]]
-; CHECK-NEXT: mul x0, x0, x2
+; CHECK-DAG: madd x1, x1, x2, [[TEMP1]]
+; CHECK-DAG: mul x0, x0, x2
 ; CHECK-NEXT: ret
 
   %prod = mul i128 %lhs, %rhs
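
Note (not part of the diff): the loop added in improvesCriticalPathLen changes
RootLatency from the latency of the root instruction alone to the summed
latency of every instruction the combiner will delete. A condensed sketch of
the resulting profitability test follows; it assumes the locals computed
earlier in that function (NewRootDepth, RootDepth), and it paraphrases the
surrounding LLVM code rather than quoting this patch:

  // Sketch only: how the accumulated RootLatency feeds the final comparison.
  unsigned RootLatency = 0;
  for (MachineInstr *DelMI : DelInstrs)   // every instruction this combine removes
    RootLatency += TSchedModel.computeInstrLatency(DelMI);

  // The combine is considered profitable for the critical path when the new
  // sequence finishes no later than the old one, where the old path is now
  // credited with the latency of *all* deleted instructions plus root slack.
  unsigned NewCycleCount = NewRootDepth + NewRootLatency;
  unsigned OldCycleCount = RootDepth + RootLatency + RootSlack;
  bool Improved = NewCycleCount <= OldCycleCount;

This is why the tests above now expect madd: charging only the root's latency
undercounted the cost of the replaced mul/add pair, so profitable fusions were
previously rejected.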