Index: llvm/trunk/lib/Target/ARM/ARMInstrVFP.td =================================================================== --- llvm/trunk/lib/Target/ARM/ARMInstrVFP.td +++ llvm/trunk/lib/Target/ARM/ARMInstrVFP.td @@ -1958,7 +1958,8 @@ [(set DPR:$Dd, (fadd_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)), (f64 DPR:$Ddin)))]>, RegConstraint<"$Ddin = $Dd">, - Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>; + Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>, + Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>; def VFMSS : ASbIn<0b11101, 0b10, 1, 0, (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), @@ -1966,7 +1967,8 @@ [(set SPR:$Sd, (fadd_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)), SPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]> { + Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>, + Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines. } @@ -1976,7 +1978,8 @@ IIC_fpFMAC16, "vfms", ".f16\t$Sd, $Sn, $Sm", []>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasFullFP16,UseFusedMAC]>; + Requires<[HasFullFP16,UseFusedMAC]>, + Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>; def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))), (VFMSD DPR:$dstin, DPR:$a, DPR:$b)>, Index: llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-vfma.ll =================================================================== --- llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-vfma.ll +++ llvm/trunk/test/CodeGen/ARM/cortex-a57-misched-vfma.ll @@ -1,5 +1,6 @@ ; REQUIRES: asserts -; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s +; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT +; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null -fp-contract=fast | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FAST ; Check latencies of vmul/vfma accumulate chains. define float @Test1(float %f1, float %f2, float %f3, float %f4, float %f5, float %f6) { @@ -14,7 +15,8 @@ ; > VMULS read-advanced latency to VMLAS = 0 ; CHECK-SAME: Latency=0 -; CHECK: VMLAS +; CHECK-DEFAULT: VMLAS +; CHECK-FAST: VFMAS ; > VMLAS common latency = 9 ; CHECK: Latency : 9 ; CHECK: Successors: @@ -22,7 +24,8 @@ ; > VMLAS read-advanced latency to the next VMLAS = 4 ; CHECK-SAME: Latency=4 -; CHECK: VMLAS +; CHECK-DEFAULT: VMLAS +; CHECK-FAST: VFMAS ; CHECK: Latency : 9 ; CHECK: Successors: ; CHECK: data @@ -51,7 +54,8 @@ ; VMULfd read-advanced latency to VMLAfd = 0 ; CHECK-SAME: Latency=0 -; CHECK: VMLAfd +; CHECK-DEFAULT: VMLAfd +; CHECK-FAST: VFMAfd ; > VMLAfd common latency = 9 ; CHECK: Latency : 9 ; CHECK: Successors: @@ -59,7 +63,8 @@ ; > VMLAfd read-advanced latency to the next VMLAfd = 4 ; CHECK-SAME: Latency=4 -; CHECK: VMLAfd +; CHECK-DEFAULT: VMLAfd +; CHECK-FAST: VFMAfd ; CHECK: Latency : 9 ; CHECK: Successors: ; CHECK: data @@ -75,3 +80,79 @@ ret <2 x float> %add2 } +define float @Test3(float %f1, float %f2, float %f3, float %f4, float %f5, float %f6) { +; CHECK: ********** MI Scheduling ********** +; CHECK: Test3:BB#0 + +; CHECK: VMULS +; > VMULS common latency = 5 +; CHECK: Latency : 5 +; CHECK: Successors: +; CHECK: data +; > VMULS read-advanced latency to VMLSS = 0 +; CHECK-SAME: Latency=0 + +; CHECK-DEFAULT: VMLSS +; CHECK-FAST: VFMSS +; > VMLSS common latency = 9 +; CHECK: Latency : 9 +; CHECK: Successors: +; CHECK: data +; > VMLSS read-advanced latency to the next VMLSS = 4 +; CHECK-SAME: Latency=4 + +; CHECK-DEFAULT: VMLSS +; CHECK-FAST: VFMSS +; CHECK: Latency : 9 +; CHECK: Successors: +; CHECK: data +; > VMLSS not-optimized latency to VMOVRS = 9 +; CHECK-SAME: Latency=9 + +; f1 * f2 + f3 * f4 + f5 * f6 ==> VMULS, VMLSS, VMLSS + %mul1 = fmul float %f1, %f2 + %mul2 = fmul float %f3, %f4 + %mul3 = fmul float %f5, %f6 + %sub1 = fsub float %mul1, %mul2 + %sub2 = fsub float %sub1, %mul3 + ret float %sub2 +} + +; ASIMD form +define <2 x float> @Test4(<2 x float> %f1, <2 x float> %f2, <2 x float> %f3, <2 x float> %f4, <2 x float> %f5, <2 x float> %f6) { +; CHECK: ********** MI Scheduling ********** +; CHECK: Test4:BB#0 + +; CHECK: VMULfd +; > VMULfd common latency = 5 +; CHECK: Latency : 5 +; CHECK: Successors: +; CHECK: data +; VMULfd read-advanced latency to VMLSfd = 0 +; CHECK-SAME: Latency=0 + +; CHECK-DEFAULT: VMLSfd +; CHECK-FAST: VFMSfd +; > VMLSfd common latency = 9 +; CHECK: Latency : 9 +; CHECK: Successors: +; CHECK: data +; > VMLSfd read-advanced latency to the next VMLSfd = 4 +; CHECK-SAME: Latency=4 + +; CHECK-DEFAULT: VMLSfd +; CHECK-FAST: VFMSfd +; CHECK: Latency : 9 +; CHECK: Successors: +; CHECK: data +; > VMLSfd not-optimized latency to VMOVRRD = 9 +; CHECK-SAME: Latency=9 + +; f1 * f2 + f3 * f4 + f5 * f6 ==> VMULS, VMLSS, VMLSS + %mul1 = fmul <2 x float> %f1, %f2 + %mul2 = fmul <2 x float> %f3, %f4 + %mul3 = fmul <2 x float> %f5, %f6 + %sub1 = fsub <2 x float> %mul1, %mul2 + %sub2 = fsub <2 x float> %sub1, %mul3 + ret <2 x float> %sub2 +}