Review notes (leading text ahead of the first file header is skipped by patch tools): (1) the new test now starts with "; REQUIRES: asserts" because its RUN line depends on -debug-only output, and the new-file hunk count is 74 to match; (2) the new test file now ends with a newline; (3) the duplicate "^(U|S)SHR" pattern in the basic shift-by-immediate InstRW was dropped because "^(U|S)(SHR|XTL)" already covers it. NOTE(review): the "def : SchedAlias;" and "def : WriteRes { ... }" lines in the first hunk appear to have lost their template arguments before this text was captured; they are reproduced as found - confirm against the target file before applying.
Index: llvm/lib/Target/AArch64/AArch64SchedA57.td =================================================================== --- llvm/lib/Target/AArch64/AArch64SchedA57.td +++ llvm/lib/Target/AArch64/AArch64SchedA57.td @@ -93,7 +93,8 @@ def : SchedAlias; def : SchedAlias; def : SchedAlias; -def : SchedAlias; +// Replacing SchedAlias with WriteRes for advance lookup +def : WriteRes { let Latency = 5;} def : SchedAlias; def : SchedAlias; def : SchedAlias; @@ -350,12 +351,17 @@ // D form - v8i8_v8i16, v4i16_v4i32, v2i32_v2i64 // Q form - v16i8_v8i16, v8i16_v4i32, v4i32_v2i64 +// Cortex A57 Software Optimization Guide Sec 3.14 +// Advance between absolute diff accum, pairwise add and accumulate, shift +// accumulate, and various shift instructions +def A57ReadIVA3 : SchedReadAdvance<3, [A57Write_3cyc_1X_NonMul_Forward, A57Write_4cyc_1X_NonMul_Forward, A57Write_4cyc_2X_NonMul_Forward, A57Write_5cyc_2X_NonMul_Forward]>; + // ASIMD absolute diff accum, D-form -def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]ABA(v8i8|v4i16|v2i32)$")>; +def : InstRW<[A57Write_4cyc_1X_NonMul_Forward, A57ReadIVA3], (instregex "^[SU]ABA(v8i8|v4i16|v2i32)$")>; // ASIMD absolute diff accum, Q-form -def : InstRW<[A57Write_5cyc_2X], (instregex "^[SU]ABA(v16i8|v8i16|v4i32)$")>; +def : InstRW<[A57Write_5cyc_2X_NonMul_Forward, A57ReadIVA3], (instregex "^[SU]ABA(v16i8|v8i16|v4i32)$")>; // ASIMD absolute diff accum long -def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]ABAL")>; +def : InstRW<[A57Write_4cyc_1X_NonMul_Forward, A57ReadIVA3], (instregex "^[SU]ABAL")>; // ASIMD arith, reduce, 4H/4S def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v$")>; @@ -372,46 +378,54 @@ def : InstRW<[A57Write_8cyc_2X], (instregex "^[SU](MIN|MAX)Vv16i8v$")>; // ASIMD multiply, D-form -def : InstRW<[A57Write_5cyc_1W], (instregex "^(P?MUL|SQR?DMULH)(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)(_indexed)?$")>; +def : InstRW<[A57Write_5cyc_1W_Mul_Forward], (instregex 
"^(P?MUL|SQR?DMULH)(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)(_indexed)?$")>; // ASIMD multiply, Q-form -def : InstRW<[A57Write_6cyc_2W], (instregex "^(P?MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>; +def : InstRW<[A57Write_6cyc_2W_Mul_Forward], (instregex "^(P?MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>; + +// Cortex A57 Software Optimization Guide Sec 3.14 +// Forwarding to multiply accumulate from other multiplies and mul-accums on the same pipeline +def A57ReadIVMA4 : SchedReadAdvance<4 , [A57Write_5cyc_1W_Mul_Forward, A57Write_6cyc_2W_Mul_Forward]>; +def A57ReadIVMA3 : SchedReadAdvance<3 , [A57Write_5cyc_1W_Mul_Forward, A57Write_6cyc_2W_Mul_Forward]>; // ASIMD multiply accumulate, D-form -def : InstRW<[A57Write_5cyc_1W], (instregex "^ML[AS](v8i8|v4i16|v2i32)(_indexed)?$")>; +def : InstRW<[A57Write_5cyc_1W_Mul_Forward, A57ReadIVMA4], (instregex "^ML[AS](v8i8|v4i16|v2i32)(_indexed)?$")>; // ASIMD multiply accumulate, Q-form -def : InstRW<[A57Write_6cyc_2W], (instregex "^ML[AS](v16i8|v8i16|v4i32)(_indexed)?$")>; +def : InstRW<[A57Write_6cyc_2W_Mul_Forward, A57ReadIVMA4], (instregex "^ML[AS](v16i8|v8i16|v4i32)(_indexed)?$")>; // ASIMD multiply accumulate long // ASIMD multiply accumulate saturating long -def A57WriteIVMA : SchedWriteRes<[A57UnitW]> { let Latency = 5; } -def A57ReadIVMA4 : SchedReadAdvance<4, [A57WriteIVMA]>; -def : InstRW<[A57WriteIVMA, A57ReadIVMA4], (instregex "^(S|U|SQD)ML[AS]L")>; +def : InstRW<[A57Write_5cyc_1W_Mul_Forward, A57ReadIVMA4], (instregex "^(S|U)ML[AS]L")>; +def : InstRW<[A57Write_5cyc_1W_Mul_Forward, A57ReadIVMA3], (instregex "^SQDML[AS]L")>; // ASIMD multiply long -def : InstRW<[A57Write_5cyc_1W], (instregex "^(S|U|SQD)MULL")>; -def : InstRW<[A57Write_5cyc_1W], (instregex "^PMULL(v8i8|v16i8)")>; +def : InstRW<[A57Write_5cyc_1W_Mul_Forward], (instregex "^(S|U|SQD)MULL")>; +def : InstRW<[A57Write_5cyc_1W_Mul_Forward], (instregex "^PMULL(v8i8|v16i8)")>; def : InstRW<[A57Write_3cyc_1W], (instregex "^PMULL(v1i64|v2i64)")>; // 
ASIMD pairwise add and accumulate // ASIMD shift accumulate -def A57WriteIVA : SchedWriteRes<[A57UnitX]> { let Latency = 4; } -def A57ReadIVA3 : SchedReadAdvance<3, [A57WriteIVA]>; -def : InstRW<[A57WriteIVA, A57ReadIVA3], (instregex "^[SU]ADALP")>; -def : InstRW<[A57WriteIVA, A57ReadIVA3], (instregex "^(S|SR|U|UR)SRA")>; +def : InstRW<[A57Write_4cyc_1X_NonMul_Forward, A57ReadIVA3], (instregex "^[SU]ADALP")>; +def : InstRW<[A57Write_4cyc_1X_NonMul_Forward, A57ReadIVA3], (instregex "^(S|SR|U|UR)SRA")>; + + +// ASIMD shift by immed, basic +def : InstRW<[A57Write_3cyc_1X_NonMul_Forward], (instregex "^(U|S)?SHLL", "^SHL", "^SLI", "^SRI", "^(U|S)(SHR|XTL)", "^SHRN")>; // ASIMD shift by immed, complex -def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]?(Q|R){1,2}SHR")>; -def : InstRW<[A57Write_4cyc_1X], (instregex "^SQSHLU")>; +def : InstRW<[A57Write_4cyc_1X_NonMul_Forward], (instregex "^[SU]?(Q|R){1,2}SHR")>; +def : InstRW<[A57Write_4cyc_1X_NonMul_Forward], (instregex "^SQSHLU")>; +// ASIMD shift by register, basic, D-form +def : InstRW<[A57Write_3cyc_1X_NonMul_Forward], (instregex "^[SU]SHL(v1i8|v1i16|v1i32|v1i64|v8i8|v4i16|v2i32|b|d|h|s)")>; // ASIMD shift by register, basic, Q-form -def : InstRW<[A57Write_4cyc_2X], (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>; +def : InstRW<[A57Write_4cyc_2X_NonMul_Forward], (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>; // ASIMD shift by register, complex, D-form -def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU][QR]{1,2}SHL(v1i8|v1i16|v1i32|v1i64|v8i8|v4i16|v2i32|b|d|h|s)")>; +def : InstRW<[A57Write_4cyc_1X_NonMul_Forward], (instregex "^[SU][QR]{1,2}SHL(v1i8|v1i16|v1i32|v1i64|v8i8|v4i16|v2i32|b|d|h|s)")>; // ASIMD shift by register, complex, Q-form -def : InstRW<[A57Write_5cyc_2X], (instregex "^[SU][QR]{1,2}SHL(v16i8|v8i16|v4i32|v2i64)")>; +def : InstRW<[A57Write_5cyc_2X_NonMul_Forward], (instregex "^[SU][QR]{1,2}SHL(v16i8|v8i16|v4i32|v2i64)")>; // Vector - Floating Point @@ -473,18 +487,24 @@ // ASIMD FP 
max/min, reduce def : InstRW<[A57Write_10cyc_3V], (instregex "^(FMAX|FMIN)(NM)?Vv")>; +// Cortex A57 Software Optimization Guide Sec 3.15 +// Advances from FP mul and mul-accum to mul-accum + // ASIMD FP multiply, D-form, FZ -def : InstRW<[A57Write_5cyc_1V], (instregex "^FMULX?(v2f32|v1i32|v2i32|v1i64|32|64)")>; +def : InstRW<[A57Write_5cyc_1V_FP_Forward], (instregex "^FMULX?(v2f32|v1i32|v2i32|v1i64|32|64)")>; // ASIMD FP multiply, Q-form, FZ -def : InstRW<[A57Write_5cyc_2V], (instregex "^FMULX?(v4f32|v2f64|v4i32|v2i64)")>; +def : InstRW<[A57Write_5cyc_2V_FP_Forward], (instregex "^FMULX?(v4f32|v2f64|v4i32|v2i64)")>; // ASIMD FP multiply accumulate, D-form, FZ // ASIMD FP multiply accumulate, Q-form, FZ def A57WriteFPVMAD : SchedWriteRes<[A57UnitV]> { let Latency = 9; } def A57WriteFPVMAQ : SchedWriteRes<[A57UnitV, A57UnitV]> { let Latency = 10; } -def A57ReadFPVMA5 : SchedReadAdvance<5, [A57WriteFPVMAD, A57WriteFPVMAQ]>; +// D-form and Q-form have different advances +def A57ReadFPVMA5 : SchedReadAdvance<5, [A57WriteFPVMAD, A57WriteFPVMAQ, A57Write_5cyc_1V_FP_Forward, A57Write_5cyc_2V_FP_Forward]>; +def A57ReadFPVMA6 : SchedReadAdvance<6, [A57WriteFPVMAD, A57WriteFPVMAQ, A57Write_5cyc_1V_FP_Forward, A57Write_5cyc_2V_FP_Forward]>; + def : InstRW<[A57WriteFPVMAD, A57ReadFPVMA5], (instregex "^FML[AS](v2f32|v1i32|v2i32|v1i64)")>; -def : InstRW<[A57WriteFPVMAQ, A57ReadFPVMA5], (instregex "^FML[AS](v4f32|v2f64|v4i32|v2i64)")>; +def : InstRW<[A57WriteFPVMAQ, A57ReadFPVMA6], (instregex "^FML[AS](v4f32|v2f64|v4i32|v2i64)")>; // ASIMD FP round, D-form def : InstRW<[A57Write_5cyc_1V], (instregex "^FRINT[AIMNPXZ](v2f32)")>; @@ -547,8 +567,9 @@ def : InstRW<[A57Write_5cyc_1V], (instregex "^F(ADD|SUB)[DS]rr")>; +// Cortex A57 Software Optimization Guide Sec 3.10 def A57WriteFPMA : SchedWriteRes<[A57UnitV]> { let Latency = 9; } -def A57ReadFPMA5 : SchedReadAdvance<5, [A57WriteFPMA]>; +def A57ReadFPMA5 : SchedReadAdvance<5, [A57WriteFPMA, WriteFMul]>; def A57ReadFPM : 
SchedReadAdvance<0>; def : InstRW<[A57WriteFPMA, A57ReadFPM, A57ReadFPM, A57ReadFPMA5], (instregex "^FN?M(ADD|SUB)[DS]rrr")>; Index: llvm/lib/Target/AArch64/AArch64SchedA57WriteRes.td =================================================================== --- llvm/lib/Target/AArch64/AArch64SchedA57WriteRes.td +++ llvm/lib/Target/AArch64/AArch64SchedA57WriteRes.td @@ -13,6 +13,10 @@ // Prefix: A57Write // Latency: #cyc // MicroOp Count/Types: #(B|I|M|L|S|X|W|V) +// Postfix (optional): (XYZ)_Forward +// +// The postfix is added to differentiate SchedWriteRes that are used in +// subsequent SchedReadAdvances. // // e.g. A57Write_6cyc_1I_6S_4V means the total latency is 6 and there are // 11 micro-ops to be issued down one I pipe, six S pipes and four V pipes. @@ -25,7 +29,9 @@ def A57Write_5cyc_1L : SchedWriteRes<[A57UnitL]> { let Latency = 5; } def A57Write_5cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 5; } def A57Write_5cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 5; } +def A57Write_5cyc_1V_FP_Forward : SchedWriteRes<[A57UnitV]> { let Latency = 5; } def A57Write_5cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 5; } +def A57Write_5cyc_1W_Mul_Forward : SchedWriteRes<[A57UnitW]> { let Latency = 5; } def A57Write_10cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 10; } def A57Write_17cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 17; let ResourceCycles = [17]; } @@ -43,8 +49,11 @@ def A57Write_3cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 3; } def A57Write_3cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 3; } def A57Write_3cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 3; } +def A57Write_3cyc_1X_NonMul_Forward : SchedWriteRes<[A57UnitX]> { let Latency = 3; } def A57Write_4cyc_1L : SchedWriteRes<[A57UnitL]> { let Latency = 4; } +def A57Write_4cyc_1I : SchedWriteRes<[A57UnitI]> { let Latency = 4; } def A57Write_4cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 4; } +def A57Write_4cyc_1X_NonMul_Forward : SchedWriteRes<[A57UnitX]> { let 
Latency = 4; } def A57Write_9cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 9; } def A57Write_6cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 6; } def A57Write_6cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 6; } @@ -93,6 +102,10 @@ let Latency = 6; let NumMicroOps = 2; } +def A57Write_6cyc_2W_Mul_Forward : SchedWriteRes<[A57UnitW, A57UnitW]> { + let Latency = 6; + let NumMicroOps = 2; +} def A57Write_5cyc_1I_1L : SchedWriteRes<[A57UnitI, A57UnitL]> { let Latency = 5; @@ -102,10 +115,18 @@ let Latency = 5; let NumMicroOps = 2; } +def A57Write_5cyc_2V_FP_Forward : SchedWriteRes<[A57UnitV, A57UnitV]> { + let Latency = 5; + let NumMicroOps = 2; +} def A57Write_5cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> { let Latency = 5; let NumMicroOps = 2; } +def A57Write_5cyc_2X_NonMul_Forward : SchedWriteRes<[A57UnitX, A57UnitX]> { + let Latency = 5; + let NumMicroOps = 2; +} def A57Write_10cyc_1L_1V : SchedWriteRes<[A57UnitL, A57UnitV]> { let Latency = 10; @@ -171,6 +192,10 @@ let Latency = 4; let NumMicroOps = 2; } +def A57Write_4cyc_2X_NonMul_Forward : SchedWriteRes<[A57UnitX, A57UnitX]> { + let Latency = 4; + let NumMicroOps = 2; +} //===----------------------------------------------------------------------===// Index: llvm/test/CodeGen/AArch64/aarch64-misched-forwarding-A57.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/aarch64-misched-forwarding-A57.ll @@ -0,0 +1,74 @@ +; REQUIRES: asserts +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=cortex-a57 -enable-misched -enable-post-misched=false -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s + +define <2 x i32> @mlamlaD(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E) { +; CHECK: ********** MI Scheduling ********** +; CHECK-NEXT: mlamlaD:%bb.0 +; CHECK: MLAv2i32 +; CHECK: Latency : 5 +; CHECK: Successors +; CHECK-NEXT: SU(6): Data Latency=1 +; CHECK: MLAv2i32 +; CHECK: Latency : 5 +; CHECK: Successors +; 
CHECK-NEXT: SU(7): Data Latency=5 + %tmp0 = mul <2 x i32> %A, %B; + %tmp1 = add <2 x i32> %E, %tmp0; + %tmp2 = mul <2 x i32> %C, %D; + %tmp3 = add <2 x i32> %tmp1, %tmp2; + ret <2 x i32> %tmp3 +} + +define <4 x i32> @mlamlaQ(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E) { +; CHECK: ********** MI Scheduling ********** +; CHECK-NEXT: mlamlaQ:%bb.0 +; CHECK: MLAv4i32 +; CHECK: Latency : 6 +; CHECK: Successors +; CHECK-NEXT: SU(6): Data Latency=2 +; CHECK: MLAv4i32 +; CHECK: Latency : 6 +; CHECK: Successors +; CHECK-NEXT: SU(7): Data Latency=6 + %tmp0 = mul <4 x i32> %A, %B; + %tmp1 = add <4 x i32> %E, %tmp0; + %tmp2 = mul <4 x i32> %C, %D; + %tmp3 = add <4 x i32> %tmp1, %tmp2; + ret <4 x i32> %tmp3 +} + +define <2 x i32> @mulmlaD(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D) { +; CHECK: ********** MI Scheduling ********** +; CHECK-NEXT: mulmlaD:%bb.0 +; CHECK: MULv2i32 +; CHECK: Latency : 5 +; CHECK: Successors +; CHECK-NEXT: SU(5): Data Latency=1 + +; CHECK: MLAv2i32 +; CHECK: Latency : 5 +; CHECK: Successors +; CHECK-NEXT: SU(6): Data Latency=5 + %tmp0 = mul <2 x i32> %A, %B; + %tmp1 = mul <2 x i32> %C, %D; + %tmp2 = add <2 x i32> %tmp0, %tmp1; + ret <2 x i32> %tmp2 +} + +define <4 x i32> @mulmlaQ(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D) { +; CHECK: ********** MI Scheduling ********** +; CHECK-NEXT: mulmlaQ:%bb.0 +; CHECK: MULv4i32 +; CHECK: Latency : 6 +; CHECK: Successors +; CHECK-NEXT: SU(5): Data Latency=2 + +; CHECK: MLAv4i32 +; CHECK: Latency : 6 +; CHECK: Successors +; CHECK-NEXT: SU(6): Data Latency=6 + %tmp0 = mul <4 x i32> %A, %B; + %tmp1 = mul <4 x i32> %C, %D; + %tmp2 = add <4 x i32> %tmp0, %tmp1; + ret <4 x i32> %tmp2 +}