diff --git a/llvm/lib/Target/AArch64/AArch64SchedA57.td b/llvm/lib/Target/AArch64/AArch64SchedA57.td --- a/llvm/lib/Target/AArch64/AArch64SchedA57.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA57.td @@ -93,7 +93,7 @@ def : SchedAlias; def : SchedAlias; def : SchedAlias; -def : SchedAlias; +def : WriteRes { let Latency = 5;} def : SchedAlias; def : SchedAlias; def : SchedAlias; @@ -350,12 +350,16 @@ // D form - v8i8_v8i16, v4i16_v4i32, v2i32_v2i64 // Q form - v16i8_v8i16, v8i16_v4i32, v4i32_v2i64 +// Cortex A57 Software Optimization Guide Sec 3.14 +// Advance for absolute diff accum, pairwise add and accumulate, shift accumulate +def A57ReadIVA3 : SchedReadAdvance<3, [A57Write_4cyc_1X_NonMul_Forward, A57Write_5cyc_2X_NonMul_Forward]>; + // ASIMD absolute diff accum, D-form -def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]ABA(v8i8|v4i16|v2i32)$")>; +def : InstRW<[A57Write_4cyc_1X_NonMul_Forward, A57ReadIVA3], (instregex "^[SU]ABA(v8i8|v4i16|v2i32)$")>; // ASIMD absolute diff accum, Q-form -def : InstRW<[A57Write_5cyc_2X], (instregex "^[SU]ABA(v16i8|v8i16|v4i32)$")>; +def : InstRW<[A57Write_5cyc_2X_NonMul_Forward, A57ReadIVA3], (instregex "^[SU]ABA(v16i8|v8i16|v4i32)$")>; // ASIMD absolute diff accum long -def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]ABAL")>; +def : InstRW<[A57Write_4cyc_1X_NonMul_Forward, A57ReadIVA3], (instregex "^[SU]ABAL")>; // ASIMD arith, reduce, 4H/4S def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v$")>; @@ -372,32 +376,41 @@ def : InstRW<[A57Write_8cyc_2X], (instregex "^[SU](MIN|MAX)Vv16i8v$")>; // ASIMD multiply, D-form -def : InstRW<[A57Write_5cyc_1W], (instregex "^(P?MUL|SQR?DMULH)(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)(_indexed)?$")>; +// MUL +def : InstRW<[A57Write_5cyc_1W_Mul_Forward], (instregex "^MUL(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)(_indexed)?$")>; +// PMUL, SQDMULH, SQRDMULH +def : InstRW<[A57Write_5cyc_1W], (instregex 
"^(PMUL|SQR?DMULH)(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)(_indexed)?$")>; + // ASIMD multiply, Q-form -def : InstRW<[A57Write_6cyc_2W], (instregex "^(P?MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>; +// MUL +def : InstRW<[A57Write_6cyc_2W_Mul_Forward], (instregex "^MUL(v16i8|v8i16|v4i32)(_indexed)?$")>; +// PMUL, SQDMULH, SQRDMULH +def : InstRW<[A57Write_6cyc_2W], (instregex "^(PMUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>; + +// Cortex A57 Software Optimization Guide Sec 3.14 +def A57ReadIVMA4 : SchedReadAdvance<4 , [A57Write_5cyc_1W_Mul_Forward, A57Write_6cyc_2W_Mul_Forward]>; +def A57ReadIVMA3 : SchedReadAdvance<3 , [A57Write_5cyc_1W_Mul_Forward, A57Write_6cyc_2W_Mul_Forward]>; // ASIMD multiply accumulate, D-form -def : InstRW<[A57Write_5cyc_1W], (instregex "^ML[AS](v8i8|v4i16|v2i32)(_indexed)?$")>; +def : InstRW<[A57Write_5cyc_1W_Mul_Forward, A57ReadIVMA4], (instregex "^ML[AS](v8i8|v4i16|v2i32)(_indexed)?$")>; // ASIMD multiply accumulate, Q-form -def : InstRW<[A57Write_6cyc_2W], (instregex "^ML[AS](v16i8|v8i16|v4i32)(_indexed)?$")>; +def : InstRW<[A57Write_6cyc_2W_Mul_Forward, A57ReadIVMA4], (instregex "^ML[AS](v16i8|v8i16|v4i32)(_indexed)?$")>; // ASIMD multiply accumulate long // ASIMD multiply accumulate saturating long -def A57WriteIVMA : SchedWriteRes<[A57UnitW]> { let Latency = 5; } -def A57ReadIVMA4 : SchedReadAdvance<4, [A57WriteIVMA]>; -def : InstRW<[A57WriteIVMA, A57ReadIVMA4], (instregex "^(S|U|SQD)ML[AS]L")>; +def : InstRW<[A57Write_5cyc_1W_Mul_Forward, A57ReadIVMA4], (instregex "^(S|U)ML[AS]L")>; +def : InstRW<[A57Write_5cyc_1W_Mul_Forward, A57ReadIVMA3], (instregex "^SQDML[AS]L")>; // ASIMD multiply long -def : InstRW<[A57Write_5cyc_1W], (instregex "^(S|U|SQD)MULL")>; +def : InstRW<[A57Write_5cyc_1W_Mul_Forward], (instregex "^(S|U)MULL")>; +def : InstRW<[A57Write_5cyc_1W], (instregex "^SQDMULL")>; def : InstRW<[A57Write_5cyc_1W], (instregex "^PMULL(v8i8|v16i8)")>; def : InstRW<[A57Write_3cyc_1W], (instregex "^PMULL(v1i64|v2i64)")>; // 
ASIMD pairwise add and accumulate // ASIMD shift accumulate -def A57WriteIVA : SchedWriteRes<[A57UnitX]> { let Latency = 4; } -def A57ReadIVA3 : SchedReadAdvance<3, [A57WriteIVA]>; -def : InstRW<[A57WriteIVA, A57ReadIVA3], (instregex "^[SU]ADALP")>; -def : InstRW<[A57WriteIVA, A57ReadIVA3], (instregex "^(S|SR|U|UR)SRA")>; +def : InstRW<[A57Write_4cyc_1X_NonMul_Forward, A57ReadIVA3], (instregex "^[SU]ADALP")>; +def : InstRW<[A57Write_4cyc_1X_NonMul_Forward, A57ReadIVA3], (instregex "^(S|SR|U|UR)SRA")>; // ASIMD shift by immed, complex def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]?(Q|R){1,2}SHR")>; @@ -474,17 +487,22 @@ def : InstRW<[A57Write_10cyc_3V], (instregex "^(FMAX|FMIN)(NM)?Vv")>; // ASIMD FP multiply, D-form, FZ -def : InstRW<[A57Write_5cyc_1V], (instregex "^FMULX?(v2f32|v1i32|v2i32|v1i64|32|64)")>; +def : InstRW<[A57Write_5cyc_1V_FP_Forward], (instregex "^FMULX?(v2f32|v1i32|v2i32|v1i64|32|64)")>; // ASIMD FP multiply, Q-form, FZ -def : InstRW<[A57Write_5cyc_2V], (instregex "^FMULX?(v4f32|v2f64|v4i32|v2i64)")>; +def : InstRW<[A57Write_5cyc_2V_FP_Forward], (instregex "^FMULX?(v4f32|v2f64|v4i32|v2i64)")>; // ASIMD FP multiply accumulate, D-form, FZ // ASIMD FP multiply accumulate, Q-form, FZ def A57WriteFPVMAD : SchedWriteRes<[A57UnitV]> { let Latency = 9; } def A57WriteFPVMAQ : SchedWriteRes<[A57UnitV, A57UnitV]> { let Latency = 10; } -def A57ReadFPVMA5 : SchedReadAdvance<5, [A57WriteFPVMAD, A57WriteFPVMAQ]>; + +// Cortex A57 Software Optimization Guide Sec 3.15 +// Advances from FP mul and mul-accum to mul-accum +def A57ReadFPVMA5 : SchedReadAdvance<5, [A57WriteFPVMAD, A57WriteFPVMAQ, A57Write_5cyc_1V_FP_Forward, A57Write_5cyc_2V_FP_Forward]>; +def A57ReadFPVMA6 : SchedReadAdvance<6, [A57WriteFPVMAD, A57WriteFPVMAQ, A57Write_5cyc_1V_FP_Forward, A57Write_5cyc_2V_FP_Forward]>; + def : InstRW<[A57WriteFPVMAD, A57ReadFPVMA5], (instregex "^FML[AS](v2f32|v1i32|v2i32|v1i64)")>; -def : InstRW<[A57WriteFPVMAQ, A57ReadFPVMA5], (instregex 
"^FML[AS](v4f32|v2f64|v4i32|v2i64)")>; +def : InstRW<[A57WriteFPVMAQ, A57ReadFPVMA6], (instregex "^FML[AS](v4f32|v2f64|v4i32|v2i64)")>; // ASIMD FP round, D-form def : InstRW<[A57Write_5cyc_1V], (instregex "^FRINT[AIMNPXZ](v2f32)")>; @@ -547,8 +565,9 @@ def : InstRW<[A57Write_5cyc_1V], (instregex "^F(ADD|SUB)[DS]rr")>; +// Cortex A57 Software Optimization Guide Sec 3.10 def A57WriteFPMA : SchedWriteRes<[A57UnitV]> { let Latency = 9; } -def A57ReadFPMA5 : SchedReadAdvance<5, [A57WriteFPMA]>; +def A57ReadFPMA5 : SchedReadAdvance<5, [A57WriteFPMA, WriteFMul]>; def A57ReadFPM : SchedReadAdvance<0>; def : InstRW<[A57WriteFPMA, A57ReadFPM, A57ReadFPM, A57ReadFPMA5], (instregex "^FN?M(ADD|SUB)[DS]rrr")>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedA57WriteRes.td b/llvm/lib/Target/AArch64/AArch64SchedA57WriteRes.td --- a/llvm/lib/Target/AArch64/AArch64SchedA57WriteRes.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA57WriteRes.td @@ -13,6 +13,10 @@ // Prefix: A57Write // Latency: #cyc // MicroOp Count/Types: #(B|I|M|L|S|X|W|V) +// Postfix (optional): (XYZ)_Forward +// +// The postfix is added to differentiate SchedWriteRes that are used in +// subsequent SchedReadAdvances. // // e.g. A57Write_6cyc_1I_6S_4V means the total latency is 6 and there are // 11 micro-ops to be issued down one I pipe, six S pipes and four V pipes. 
@@ -25,7 +29,9 @@ def A57Write_5cyc_1L : SchedWriteRes<[A57UnitL]> { let Latency = 5; } def A57Write_5cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 5; } def A57Write_5cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 5; } +def A57Write_5cyc_1V_FP_Forward : SchedWriteRes<[A57UnitV]> { let Latency = 5; } def A57Write_5cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 5; } +def A57Write_5cyc_1W_Mul_Forward : SchedWriteRes<[A57UnitW]> { let Latency = 5; } def A57Write_10cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 10; } def A57Write_17cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 17; let ResourceCycles = [17]; } @@ -45,6 +51,7 @@ def A57Write_3cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 3; } def A57Write_4cyc_1L : SchedWriteRes<[A57UnitL]> { let Latency = 4; } def A57Write_4cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 4; } +def A57Write_4cyc_1X_NonMul_Forward : SchedWriteRes<[A57UnitX]> { let Latency = 4; } def A57Write_9cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 9; } def A57Write_6cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 6; } def A57Write_6cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 6; } @@ -93,6 +100,10 @@ let Latency = 6; let NumMicroOps = 2; } +def A57Write_6cyc_2W_Mul_Forward : SchedWriteRes<[A57UnitW, A57UnitW]> { + let Latency = 6; + let NumMicroOps = 2; +} def A57Write_5cyc_1I_1L : SchedWriteRes<[A57UnitI, A57UnitL]> { let Latency = 5; @@ -102,10 +113,18 @@ let Latency = 5; let NumMicroOps = 2; } +def A57Write_5cyc_2V_FP_Forward : SchedWriteRes<[A57UnitV, A57UnitV]> { + let Latency = 5; + let NumMicroOps = 2; +} def A57Write_5cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> { let Latency = 5; let NumMicroOps = 2; } +def A57Write_5cyc_2X_NonMul_Forward : SchedWriteRes<[A57UnitX, A57UnitX]> { + let Latency = 5; + let NumMicroOps = 2; +} def A57Write_10cyc_1L_1V : SchedWriteRes<[A57UnitL, A57UnitV]> { let Latency = 10; diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/forwarding-A57.s 
b/llvm/test/tools/llvm-mca/AArch64/Cortex/forwarding-A57.s new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/forwarding-A57.s @@ -0,0 +1,501 @@ +# RUN: llvm-mca -march=aarch64 -mcpu=cortex-a57 -iterations=1 -timeline < %s | FileCheck %s + +# CHECK: [0] Code Region +# CHECK: Instructions: 2 +# CHECK-NEXT: Total Cycles: 12 +# CHECK: Timeline view: +# CHECK: [0,0] DeeeeeER .. fmul v0.2s, v1.2s, v2.2s +# CHECK-NEXT: [0,1] DeeeeeeeeeER fmla v0.2s, v1.2s, v2.2s + +# CHECK: [1] Code Region +# CHECK: Instructions: 2 +# CHECK-NEXT: Total Cycles: 13 +# CHECK: Timeline view: +# CHECK: [0,0] DeeeeeER . . fmul v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [0,1] DeeeeeeeeeeER fmla v0.4s, v1.4s, v2.4s + +# CHECK: [2] Code Region +# CHECK: Instructions: 2 +# CHECK-NEXT: Total Cycles: 12 +# CHECK: Timeline view: +# CHECK: [0,0] DeeeeeER .. fmulx v0.2s, v1.2s, v2.2s +# CHECK-NEXT: [0,1] DeeeeeeeeeER fmls v0.2s, v1.2s, v2.2s + +# CHECK: [3] Code Region +# CHECK: Instructions: 2 +# CHECK-NEXT: Total Cycles: 13 +# CHECK: Timeline view: +# CHECK: [0,0] DeeeeeER . . fmulx v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [0,1] DeeeeeeeeeeER fmls v0.4s, v1.4s, v2.4s + +# CHECK: [4] Code Region +# CHECK: Instructions: 2 +# CHECK-NEXT: Total Cycles: 16 +# CHECK: Timeline view: +# CHECK: [0,0] DeeeeeeeeeER . fmla v0.2s, v1.2s, v2.2s +# CHECK-NEXT: [0,1] D====eeeeeeeeeER fmla v0.2s, v3.2s, v4.2s + +# CHECK: [5] Code Region +# CHECK: Instructions: 2 +# CHECK-NEXT: Total Cycles: 16 +# CHECK: Timeline view: +# CHECK: [0,0] DeeeeeeeeeER . fmls v0.2s, v1.2s, v2.2s +# CHECK-NEXT: [0,1] D====eeeeeeeeeER fmls v0.2s, v3.2s, v4.2s + +# CHECK: [6] Code Region +# CHECK: Instructions: 2 +# CHECK-NEXT: Total Cycles: 12 +# CHECK: Timeline view: +# CHECK: [0,0] DeeeeeER .. fmul d4, d5, d6 +# CHECK-NEXT: [0,1] DeeeeeeeeeER fmadd d1, d2, d3, d4 + +# CHECK: [7] Code Region +# CHECK: Instructions: 2 +# CHECK-NEXT: Total Cycles: 12 +# CHECK: Timeline view: +# CHECK: [0,0] DeeeeeER .. 
fmul d4, d5, d6 +# CHECK-NEXT: [0,1] DeeeeeeeeeER fmadd d1, d2, d3, d4 + +# CHECK: [8] Code Region +# CHECK: Instructions: 2 +# CHECK-NEXT: Total Cycles: 16 +# CHECK: Timeline view: +# CHECK: [0,0] DeeeeeeeeeER . fmadd d4, d5, d6, d7 +# CHECK-NEXT: [0,1] D====eeeeeeeeeER fmadd d1, d2, d3, d4 + +# CHECK: [9] Code Region +# CHECK: Instructions: 2 +# CHECK-NEXT: Total Cycles: 16 +# CHECK: Timeline view: +# CHECK: [0,0] DeeeeeeeeeER . fmsub d4, d5, d6, d7 +# CHECK-NEXT: [0,1] D====eeeeeeeeeER fmsub d1, d2, d3, d4 + +# CHECK: [10] Code Region +# CHECK: Instructions: 2 +# CHECK-NEXT: Total Cycles: 16 +# CHECK: Timeline view: +# CHECK: [0,0] DeeeeeeeeeER . fnmadd d4, d5, d6, d7 +# CHECK-NEXT: [0,1] D====eeeeeeeeeER fnmadd d1, d2, d3, d4 + +# CHECK: [11] Code Region +# CHECK: Instructions: 2 +# CHECK-NEXT: Total Cycles: 16 +# CHECK: Timeline view: +# CHECK: [0,0] DeeeeeeeeeER . fnmsub d4, d5, d6, d7 +# CHECK-NEXT: [0,1] D====eeeeeeeeeER fnmsub d1, d2, d3, d4 + +# CHECK: [12] Code Region +# CHECK: Instructions: 2 +# CHECK-NEXT: Total Cycles: 8 +# CHECK: Timeline view: +# CHECK: [0,0] DeeeeER. saba v0.2s, v1.2s, v2.2s +# CHECK-NEXT: [0,1] D=eeeeER saba v0.2s, v3.2s, v4.2s + +# CHECK: [13] Code Region +# CHECK: Instructions: 2 +# CHECK-NEXT: Total Cycles: 8 +# CHECK: Timeline view: +# CHECK: [0,0] DeeeeER. sabal v0.2d, v1.2s, v2.2s +# CHECK-NEXT: [0,1] D=eeeeER sabal v0.2d, v3.2s, v4.2s + +# CHECK: [14] Code Region +# CHECK: Instructions: 2 +# CHECK-NEXT: Total Cycles: 8 +# CHECK: Timeline view: +# CHECK: [0,0] DeeeeER. uaba v0.2s, v1.2s, v2.2s +# CHECK-NEXT: [0,1] D=eeeeER uaba v0.2s, v3.2s, v4.2s + +# CHECK: [15] Code Region +# CHECK: Instructions: 2 +# CHECK-NEXT: Total Cycles: 8 +# CHECK: Timeline view: +# CHECK: [0,0] DeeeeER. uabal v0.2d, v1.2s, v2.2s +# CHECK-NEXT: [0,1] D=eeeeER uabal v0.2d, v3.2s, v4.2s + +# CHECK: [16] Code Region +# CHECK: Instructions: 2 +# CHECK-NEXT: Total Cycles: 8 +# CHECK: Timeline view: +# CHECK: [0,0] DeeeeER. 
sadalp v0.1d, v1.2s +# CHECK-NEXT: [0,1] D=eeeeER sadalp v0.1d, v2.2s + +# CHECK: [17] Code Region +# CHECK: Instructions: 2 +# CHECK-NEXT: Total Cycles: 8 +# CHECK: Timeline view: +# CHECK: [0,0] DeeeeER. uadalp v0.1d, v1.2s +# CHECK-NEXT: [0,1] D=eeeeER uadalp v0.1d, v2.2s + +# CHECK: [18] Code Region +# CHECK: Instructions: 2 +# CHECK-NEXT: Total Cycles: 8 +# CHECK: Timeline view: +# CHECK: [0,0] DeeeeER. srsra v0.8b, v1.8b, #3 +# CHECK-NEXT: [0,1] D=eeeeER srsra v0.8b, v2.8b, #3 + +# CHECK: [19] Code Region +# CHECK: Instructions: 2 +# CHECK-NEXT: Total Cycles: 8 +# CHECK: Timeline view: +# CHECK: [0,0] DeeeeER. ursra v0.8b, v1.8b, #3 +# CHECK-NEXT: [0,1] D=eeeeER ursra v0.8b, v2.8b, #3 + +# CHECK: [20] Code Region +# CHECK: Instructions: 2 +# CHECK-NEXT: Total Cycles: 8 +# CHECK: Timeline view: +# CHECK: [0,0] DeeeeER. usra v0.4s, v1.4s, #3 +# CHECK-NEXT: [0,1] D=eeeeER usra v0.4s, v2.4s, #3 + +# CHECK: [21] Code Region +# CHECK: Instructions: 2 +# CHECK-NEXT: Total Cycles: 9 +# CHECK: Timeline view: +# CHECK: [0,0] DeeeeeER. mul v0.2s, v1.2s, v2.2s +# CHECK-NEXT: [0,1] D=eeeeeER mla v0.2s, v1.2s, v2.2s + +# CHECK: [22] Code Region +# CHECK: Instructions: 2 +# CHECK-NEXT: Total Cycles: 13 +# CHECK: Timeline view: +# CHECK: [0,0] DeeeeeER . . pmul v0.8b, v1.8b, v2.8b +# CHECK-NEXT: [0,1] D=====eeeeeER mla v0.8b, v1.8b, v2.8b + +# CHECK: [23] Code Region +# CHECK: Instructions: 2 +# CHECK-NEXT: Total Cycles: 13 +# CHECK: Timeline view: +# CHECK: [0,0] DeeeeeER . . sqdmulh v0.2s, v1.2s, v2.2s +# CHECK-NEXT: [0,1] D=====eeeeeER mla v0.2s, v1.2s, v2.2s + +# CHECK: [24] Code Region +# CHECK: Instructions: 2 +# CHECK-NEXT: Total Cycles: 13 +# CHECK: Timeline view: +# CHECK: [0,0] DeeeeeER . . sqrdmulh v0.2s, v1.2s, v2.2s +# CHECK-NEXT: [0,1] D=====eeeeeER mla v0.2s, v1.2s, v2.2s + +# CHECK: [25] Code Region +# CHECK: Instructions: 2 +# CHECK-NEXT: Total Cycles: 9 +# CHECK: Timeline view: +# CHECK: [0,0] DeeeeeER. 
smull v0.2d, v1.2s, v2.2s +# CHECK-NEXT: [0,1] D=eeeeeER smlal v0.2d, v1.2s, v2.2s + +# CHECK: [26] Code Region +# CHECK: Instructions: 2 +# CHECK-NEXT: Total Cycles: 9 +# CHECK: Timeline view: +# CHECK: [0,0] DeeeeeER. umull v0.2d, v1.2s, v2.2s +# CHECK-NEXT: [0,1] D=eeeeeER umlal v0.2d, v1.2s, v2.2s + +# CHECK: [27] Code Region +# CHECK: Instructions: 2 +# CHECK-NEXT: Total Cycles: 13 +# CHECK: Timeline view: +# CHECK: [0,0] DeeeeeER . . sqdmull v0.2d, v1.2s, v2.2s +# CHECK-NEXT: [0,1] D=====eeeeeER smlal v0.2d, v1.2s, v2.2s + +# CHECK: [28] Code Region +# CHECK: Instructions: 2 +# CHECK-NEXT: Total Cycles: 13 +# CHECK: Timeline view: +# CHECK: [0,0] DeeeeeER . . pmull v0.8h, v1.8b, v2.8b +# CHECK-NEXT: [0,1] D=====eeeeeER smlal v0.8h, v1.8b, v2.8b + +# CHECK: [29] Code Region +# CHECK: Instructions: 2 +# CHECK-NEXT: Total Cycles: 13 +# CHECK: Timeline view: +# CHECK: [0,0] DeeeeeER . . pmull2 v0.8h, v1.16b, v2.16b +# CHECK-NEXT: [0,1] D=====eeeeeER smlal v0.8h, v1.8b, v2.8b + +# CHECK: [30] Code Region +# CHECK: Instructions: 2 +# CHECK-NEXT: Total Cycles: 9 +# CHECK: Timeline view: +# CHECK: [0,0] DeeeeeER. mla v0.2s, v1.2s, v2.2s +# CHECK-NEXT: [0,1] D=eeeeeER mla v0.2s, v1.2s, v2.2s + +# CHECK: [31] Code Region +# CHECK: Instructions: 2 +# CHECK-NEXT: Total Cycles: 11 +# CHECK: Timeline view: +# CHECK: [0,0] DeeeeeeER . mla v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [0,1] .D=eeeeeeER mla v0.4s, v1.4s, v2.4s + +# CHECK: [32] Code Region +# CHECK: Instructions: 2 +# CHECK-NEXT: Total Cycles: 9 +# CHECK: Timeline view: +# CHECK: [0,0] DeeeeeER. mls v0.2s, v1.2s, v2.2s +# CHECK-NEXT: [0,1] D=eeeeeER mls v0.2s, v1.2s, v2.2s + +# CHECK: [33] Code Region +# CHECK: Instructions: 2 +# CHECK-NEXT: Total Cycles: 11 +# CHECK: Timeline view: +# CHECK: [0,0] DeeeeeeER . 
mls v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [0,1] .D=eeeeeeER mls v0.4s, v1.4s, v2.4s + +# CHECK: [34] Code Region +# CHECK: Instructions: 2 +# CHECK-NEXT: Total Cycles: 9 +# CHECK: Timeline view: +# CHECK: [0,0] DeeeeeER. smlal v0.2d, v1.2s, v2.2s +# CHECK-NEXT: [0,1] D=eeeeeER smlal v0.2d, v1.2s, v2.2s + +# CHECK: [35] Code Region +# CHECK: Instructions: 2 +# CHECK-NEXT: Total Cycles: 9 +# CHECK: Timeline view: +# CHECK: [0,0] DeeeeeER. smlsl v0.2d, v1.2s, v2.2s +# CHECK-NEXT: [0,1] D=eeeeeER smlsl v0.2d, v1.2s, v2.2s + +# CHECK: [36] Code Region +# CHECK: Instructions: 2 +# CHECK-NEXT: Total Cycles: 9 +# CHECK: Timeline view: +# CHECK: [0,0] DeeeeeER. umlal v0.2d, v1.2s, v2.2s +# CHECK-NEXT: [0,1] D=eeeeeER umlal v0.2d, v1.2s, v2.2s + +# CHECK: [37] Code Region +# CHECK: Instructions: 2 +# CHECK-NEXT: Total Cycles: 9 +# CHECK: Timeline view: +# CHECK: [0,0] DeeeeeER. umlsl v0.2d, v1.2s, v2.2s +# CHECK-NEXT: [0,1] D=eeeeeER umlsl v0.2d, v1.2s, v2.2s + +# CHECK: [38] Code Region +# CHECK: Instructions: 2 +# CHECK-NEXT: Total Cycles: 10 +# CHECK: Timeline view: +# CHECK: [0,0] DeeeeeER . sqdmlal v0.2d, v1.2s, v2.2s +# CHECK-NEXT: [0,1] D==eeeeeER sqdmlal v0.2d, v1.2s, v2.2s + +# CHECK: [39] Code Region +# CHECK: Instructions: 2 +# CHECK-NEXT: Total Cycles: 10 +# CHECK: Timeline view: +# CHECK: [0,0] DeeeeeER . 
sqdmlsl v0.2d, v1.2s, v2.2s +# CHECK-NEXT: [0,1] D==eeeeeER sqdmlsl v0.2d, v1.2s, v2.2s + +# ASIMD FP Instructions +# FMUL, FMULX, FMLA, FMLS are impacted +# testing only a subset of combinations +# LLVM-MCA-BEGIN +fmul v0.2s, v1.2s, v2.2s +fmla v0.2s, v1.2s, v2.2s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +fmul v0.4s, v1.4s, v2.4s +fmla v0.4s, v1.4s, v2.4s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +fmulx v0.2s, v1.2s, v2.2s +fmls v0.2s, v1.2s, v2.2s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +fmulx v0.4s, v1.4s, v2.4s +fmls v0.4s, v1.4s, v2.4s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +fmla v0.2s, v1.2s, v2.2s +fmla v0.2s, v3.2s, v4.2s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +fmls v0.2s, v1.2s, v2.2s +fmls v0.2s, v3.2s, v4.2s +# LLVM-MCA-END + + +# FP Multiply Instructions +# FMUL, FNMUL, FMADD, FMSUB, FNMADD, FNMSUB are impacted +# testing only a subset of combinations +# LLVM-MCA-BEGIN +fmul d4, d5, d6 +fmadd d1, d2, d3, d4 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +fmul d4, d5, d6 +fmadd d1, d2, d3, d4 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +fmadd d4, d5, d6, d7 +fmadd d1, d2, d3, d4 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +fmsub d4, d5, d6, d7 +fmsub d1, d2, d3, d4 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +fnmadd d4, d5, d6, d7 +fnmadd d1, d2, d3, d4 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +fnmsub d4, d5, d6, d7 +fnmsub d1, d2, d3, d4 +# LLVM-MCA-END + + + +# ASIMD Integer Instructions X-Unit +# SABA, UABA, SABAL, UABAL, SADALP, UADALP, SSRA, SRSRA, USRA, URSRA are impacted +# testing only a subset of combinations + +# LLVM-MCA-BEGIN +saba v0.2s, v1.2s, v2.2s +saba v0.2s, v3.2s, v4.2s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +sabal v0.2d, v1.2s, v2.2s +sabal v0.2d, v3.2s, v4.2s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +uaba v0.2s, v1.2s, v2.2s +uaba v0.2s, v3.2s, v4.2s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +uabal v0.2d, v1.2s, v2.2s +uabal v0.2d, v3.2s, v4.2s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +sadalp v0.1d, v1.2s +sadalp v0.1d, v2.2s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +uadalp v0.1d, v1.2s +uadalp v0.1d, v2.2s +# 
LLVM-MCA-END + +# LLVM-MCA-BEGIN +srsra v0.8b, v1.8b, #3 +srsra v0.8b, v2.8b, #3 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +ursra v0.8b, v1.8b, #3 +ursra v0.8b, v2.8b, #3 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +usra v0.4s, v1.4s, #3 +usra v0.4s, v2.4s, #3 +# LLVM-MCA-END + + +# ASIMD Multiply Instructions W-Unit +# pmuls and sqd/sqrdmuls don't forward + +# MULs +# LLVM-MCA-BEGIN +mul v0.2s, v1.2s, v2.2s +mla v0.2s, v1.2s, v2.2s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +pmul v0.8b, v1.8b, v2.8b +mla v0.8b, v1.8b, v2.8b +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +sqdmulh v0.2s, v1.2s, v2.2s +mla v0.2s, v1.2s, v2.2s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +sqrdmulh v0.2s, v1.2s, v2.2s +mla v0.2s, v1.2s, v2.2s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +smull v0.2d, v1.2s, v2.2s +smlal v0.2d, v1.2s, v2.2s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +umull v0.2d, v1.2s, v2.2s +umlal v0.2d, v1.2s, v2.2s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +sqdmull v0.2d, v1.2s, v2.2s +smlal v0.2d, v1.2s, v2.2s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +pmull.8h v0, v1, v2 +smlal.8h v0, v1, v2 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +pmull2.8h v0, v1, v2 +smlal.8h v0, v1, v2 +# LLVM-MCA-END + + +# MLAs +# LLVM-MCA-BEGIN +mla v0.2s, v1.2s, v2.2s +mla v0.2s, v1.2s, v2.2s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +mla v0.4s, v1.4s, v2.4s +mla v0.4s, v1.4s, v2.4s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +mls v0.2s, v1.2s, v2.2s +mls v0.2s, v1.2s, v2.2s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +mls v0.4s, v1.4s, v2.4s +mls v0.4s, v1.4s, v2.4s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +smlal v0.2d, v1.2s, v2.2s +smlal v0.2d, v1.2s, v2.2s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +smlsl v0.2d, v1.2s, v2.2s +smlsl v0.2d, v1.2s, v2.2s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +umlal v0.2d, v1.2s, v2.2s +umlal v0.2d, v1.2s, v2.2s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +umlsl v0.2d, v1.2s, v2.2s +umlsl v0.2d, v1.2s, v2.2s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +sqdmlal v0.2d, v1.2s, v2.2s +sqdmlal v0.2d, v1.2s, v2.2s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN +sqdmlsl v0.2d, v1.2s, v2.2s 
+sqdmlsl v0.2d, v1.2s, v2.2s +# LLVM-MCA-END