Index: lib/Target/ARM64/ARM64.td =================================================================== --- lib/Target/ARM64/ARM64.td +++ lib/Target/ARM64/ARM64.td @@ -21,7 +21,7 @@ // def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", "true", - "Enable ARMv8 FP">; + "Enable ARMv8 FP">; def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true", "Enable Advanced SIMD instructions", [FeatureFPARMv8]>; @@ -29,6 +29,7 @@ def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true", "Enable cryptographic instructions">; + /// Cyclone has register move instructions which are "free". def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true", "Has zereo-cycle register moves">; @@ -56,6 +57,7 @@ //===----------------------------------------------------------------------===// // ARM64 Processors supported. // +include "ARM64SchedA53.td" include "ARM64SchedCyclone.td" def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", @@ -79,9 +81,8 @@ def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8, FeatureNEON]>; -def : ProcessorModel<"cortex-a53", NoSchedModel, [ProcA53]>; +def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>; def : ProcessorModel<"cortex-a57", NoSchedModel, [ProcA57]>; - def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>; //===----------------------------------------------------------------------===// Index: lib/Target/ARM64/ARM64SchedA53.td =================================================================== --- /dev/null +++ lib/Target/ARM64/ARM64SchedA53.td @@ -0,0 +1,129 @@ +//=- ARM64SchedA53.td - ARM Cortex-A53 Scheduling Definitions -*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for the ARM Cortex A53 processors. 
+// +//===----------------------------------------------------------------------===// + +// ===---------------------------------------------------------------------===// +// The following definitions describe the simpler per-operand machine model. +// This works with MachineScheduler. See MCSchedModel.h for details. + +// Cortex-A53 machine model for scheduling and other instruction cost heuristics. +def CortexA53Model : SchedMachineModel { + let MicroOpBufferSize = 0; // Explicitly set to zero since A53 is in-order. + let IssueWidth = 2; // 2 micro-ops are dispatched per cycle. + let MinLatency = 1 ; // OperandCycles are interpreted as MinLatency. + let LoadLatency = 2; // Optimistic load latency assuming bypass. + // This is overridden by OperandCycles if the + // Itineraries are queried instead. + let MispredictPenalty = 9; // Based on "Cortex-A53 Software Optimisation + // Specification - Instruction Timings" + // v 1.0 Spreadsheet +} + + +//===----------------------------------------------------------------------===// +// Define each kind of processor resource and number available. + +// Modeling each pipeline as a ProcResource using the BufferSize = 0 since +// Cortex-A53 is in-order. + +def A53UnitALU : ProcResource<2> { let BufferSize = 0; } // Int ALU +def A53UnitMAC : ProcResource<1> { let BufferSize = 0; } // Int MAC +def A53UnitDiv : ProcResource<1> { let BufferSize = 0; } // Int Division +def A53UnitLdSt : ProcResource<1> { let BufferSize = 0; } // Load/Store +def A53UnitB : ProcResource<1> { let BufferSize = 0; } // Branch +def A53UnitFPALU : ProcResource<1> { let BufferSize = 0; } // FP ALU +def A53UnitFPMDS : ProcResource<1> { let BufferSize = 0; } // FP Mult/Div/Sqrt + + +//===----------------------------------------------------------------------===// +// Subtarget-specific SchedWrite types which both map the ProcResources and +// set the latency. 
+ +let SchedModel = CortexA53Model in { + +// ALU - These are reduced to 1 despite a true latency of 4 in order to easily +// model forwarding logic. Once forwarding is properly modelled, then +// they'll be corrected. +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } + +// MAC +def : WriteRes { let Latency = 4; } +def : WriteRes { let Latency = 4; } + +// Div +def : WriteRes { let Latency = 4; } +def : WriteRes { let Latency = 4; } + +// Load +def : WriteRes { let Latency = 4; } +def : WriteRes { let Latency = 4; } +def : WriteRes { let Latency = 4; } +def : WriteRes { let Latency = 4; } + +// Store +def : WriteRes { let Latency = 4; } +def : WriteRes { let Latency = 4; } +def : WriteRes { let Latency = 4; } +def : WriteRes { let Latency = 4; } +def : WriteRes { let Latency = 4; } + +// Branch +def : WriteRes; +def : WriteRes; +def : WriteRes; +def : WriteRes; +def : WriteRes; + +// FP ALU +def : WriteRes { let Latency = 6; } +def : WriteRes { let Latency = 6; } +def : WriteRes { let Latency = 6; } +def : WriteRes { let Latency = 6; } +def : WriteRes { let Latency = 6; } +def : WriteRes { let Latency = 6; } + +// FP Mul, Div, Sqrt +def : WriteRes { let Latency = 6; } +def : WriteRes { let Latency = 33; + let ResourceCycles = [29]; } +def A53WriteFDiv : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 33; + let ResourceCycles = [29]; } +def A53WriteFSqrt : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 32; + let ResourceCycles = [28]; } + +//===----------------------------------------------------------------------===// +// Subtarget-specific SchedRead types. + +// While there is no forwarding information defined for these SchedRead types, +// they are still used by some instructions via a SchedRW list and so these zero +// SchedReadAdvances are required. 
+ +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; + +//===----------------------------------------------------------------------===// +// Subtarget-specific InstRWs. + +def : InstRW<[WriteI], (instrs COPY)>; +def : InstRW<[WriteLD], (instregex "LD[1-4]")>; +def : InstRW<[WriteST], (instregex "ST[1-4]")>; +def : InstRW<[A53WriteFDiv], (instregex "^FDIV")>; +def : InstRW<[A53WriteFSqrt], (instregex ".*SQRT.*")>; + +} Index: lib/Target/ARM64/ARM64SchedCyclone.td =================================================================== --- lib/Target/ARM64/ARM64SchedCyclone.td +++ lib/Target/ARM64/ARM64SchedCyclone.td @@ -342,7 +342,9 @@ // INS V[x],V[y] is a WriteV. // FMOVWSr,FMOVXDr,FMOVXDHighr -def : SchedAlias; +def : WriteRes { + let Latency = 5; +} // FMOVSWr,FMOVDXr def : InstRW<[WriteLD], (instrs FMOVSWr,FMOVDXr,FMOVDXHighr)>; Index: test/CodeGen/ARM64/misched-basic-A53.ll =================================================================== --- /dev/null +++ test/CodeGen/ARM64/misched-basic-A53.ll @@ -0,0 +1,112 @@ +; REQUIRES: asserts +; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a53 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s +; +; The Cortex-A53 machine model will cause the MADD instruction to be scheduled +; much higher than the ADD instructions in order to hide latency. When not +; specifying a subtarget, the MADD will remain near the end of the block. 
+; +; CHECK: ********** MI Scheduling ********** +; CHECK: main +; CHECK: *** Final schedule for BB#2 *** +; CHECK: SU(13) +; CHECK: MADDWrrr +; CHECK: SU(4) +; CHECK: ADDWri +; CHECK: ********** INTERVALS ********** +@main.x = private unnamed_addr constant [8 x i32] [i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1], align 4 +@main.y = private unnamed_addr constant [8 x i32] [i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2], align 4 + +; Function Attrs: nounwind +define i32 @main() #0 { +entry: + %retval = alloca i32, align 4 + %x = alloca [8 x i32], align 4 + %y = alloca [8 x i32], align 4 + %i = alloca i32, align 4 + %xx = alloca i32, align 4 + %yy = alloca i32, align 4 + store i32 0, i32* %retval + %0 = bitcast [8 x i32]* %x to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast ([8 x i32]* @main.x to i8*), i64 32, i32 4, i1 false) + %1 = bitcast [8 x i32]* %y to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast ([8 x i32]* @main.y to i8*), i64 32, i32 4, i1 false) + store i32 0, i32* %xx, align 4 + store i32 0, i32* %yy, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %2 = load i32* %i, align 4 + %cmp = icmp slt i32 %2, 8 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %3 = load i32* %i, align 4 + %idxprom = sext i32 %3 to i64 + %arrayidx = getelementptr inbounds [8 x i32]* %x, i32 0, i64 %idxprom + %4 = load i32* %arrayidx, align 4 + %add = add nsw i32 %4, 1 + store i32 %add, i32* %xx, align 4 + %5 = load i32* %xx, align 4 + %add1 = add nsw i32 %5, 12 + store i32 %add1, i32* %xx, align 4 + %6 = load i32* %xx, align 4 + %add2 = add nsw i32 %6, 23 + store i32 %add2, i32* %xx, align 4 + %7 = load i32* %xx, align 4 + %add3 = add nsw i32 %7, 34 + store i32 %add3, i32* %xx, align 4 + %8 = load i32* %i, align 4 + %idxprom4 = sext i32 %8 to i64 + %arrayidx5 = getelementptr inbounds [8 x i32]* %y, i32 0, i64 %idxprom4 + %9 = load i32* 
%arrayidx5, align 4 + %10 = load i32* %yy, align 4 + %mul = mul nsw i32 %10, %9 + store i32 %mul, i32* %yy, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %11 = load i32* %i, align 4 + %inc = add nsw i32 %11, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %12 = load i32* %xx, align 4 + %13 = load i32* %yy, align 4 + %add6 = add nsw i32 %12, %13 + ret i32 %add6 +} + + +; The Cortex-A53 machine model will cause the FDIVv4f32 to be raised to +; hide latency. Whereas normally there would only be a single FADDv4f32 +; after it, this test checks to make sure there is more than one. +; +; CHECK: ********** MI Scheduling ********** +; CHECK: neon4xfloat:BB#0 +; CHECK: *** Final schedule for BB#0 *** +; CHECK: FDIVv4f32 +; CHECK: FADDv4f32 +; CHECK: FADDv4f32 +; CHECK: ********** INTERVALS ********** +define <4 x float> @neon4xfloat(<4 x float> %A, <4 x float> %B) { + %tmp1 = fadd <4 x float> %A, %B; + %tmp2 = fadd <4 x float> %A, %tmp1; + %tmp3 = fadd <4 x float> %A, %tmp2; + %tmp4 = fadd <4 x float> %A, %tmp3; + %tmp5 = fadd <4 x float> %A, %tmp4; + %tmp6 = fadd <4 x float> %A, %tmp5; + %tmp7 = fadd <4 x float> %A, %tmp6; + %tmp8 = fadd <4 x float> %A, %tmp7; + %tmp9 = fdiv <4 x float> %A, %B; + %tmp10 = fadd <4 x float> %tmp8, %tmp9; + + ret <4 x float> %tmp10 +} + +; Function Attrs: nounwind +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #1 + +attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind }