Index: llvm/trunk/lib/Target/PowerPC/P9InstrResources.td =================================================================== --- llvm/trunk/lib/Target/PowerPC/P9InstrResources.td +++ llvm/trunk/lib/Target/PowerPC/P9InstrResources.td @@ -32,9 +32,8 @@ // Two cycle ALU vector operation that uses an entire superslice. // Uses both ALU units (the even ALUE and odd ALUO units), two pipelines -// (EXECE, EXECO) and all three dispatches (DISP) to the given superslice. -def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C, - DISP_1C, DISP_1C, DISP_1C], +// (EXECE, EXECO) and 1 dispatches (DISP) to the given superslice. +def : InstRW<[P9_ALUE_2C, P9_ALUO_2C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C], (instrs (instregex "VADDU(B|H|W|D)M$"), (instregex "VAND(C)?$"), @@ -86,7 +85,7 @@ // Restricted Dispatch ALU operation for 3 cycles. The operation runs on a // single slice. However, since it is Restricted it requires all 3 dispatches // (DISP) for that superslice. -def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_3SLOTS_1C], (instrs (instregex "TABORT(D|W)C(I)?$"), (instregex "MTFSB(0|1)$"), @@ -102,7 +101,7 @@ )>; // Standard Dispatch ALU operation for 3 cycles. Only one slice used. -def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C], (instrs (instregex "XSMAX(C|J)?DP$"), (instregex "XSMIN(C|J)?DP$"), @@ -119,7 +118,7 @@ )>; // Standard Dispatch ALU operation for 2 cycles. Only one slice used. -def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C], (instrs (instregex "S(L|R)D$"), (instregex "SRAD(I)?$"), @@ -172,7 +171,7 @@ // Restricted Dispatch ALU operation for 2 cycles. The operation runs on a // single slice. However, since it is Restricted it requires all 3 dispatches // (DISP) for that superslice. 
-def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_3SLOTS_1C], (instrs (instregex "RLDC(L|R)$"), (instregex "RLWIMI(8)?$"), @@ -199,9 +198,8 @@ // Three cycle ALU vector operation that uses an entire superslice. // Uses both ALU units (the even ALUE and odd ALUO units), two pipelines -// (EXECE, EXECO) and all three dispatches (DISP) to the given superslice. -def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C, - DISP_1C, DISP_1C, DISP_1C], +// (EXECE, EXECO) and 1 dispatches (DISP) to the given superslice. +def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C], (instrs (instregex "M(T|F)VSCR$"), (instregex "VCMPNEZ(B|H|W)$"), @@ -286,8 +284,7 @@ // 7 cycle DP vector operation that uses an entire superslice. // Uses both DP units (the even DPE and odd DPO units), two pipelines // (EXECE, EXECO) and all three dispatches (DISP) to the given superslice. -def : InstRW<[P9_DPE_7C, P9_DPO_7C, IP_EXECE_1C, IP_EXECO_1C, - DISP_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_DPE_7C, P9_DPO_7C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C], (instrs VADDFP, VCTSXS, @@ -397,7 +394,7 @@ // 5 cycle Restricted DP operation. One DP unit, one EXEC pipeline and all three // dispatch units for the superslice. -def : InstRW<[P9_DP_5C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_DP_5C, IP_EXEC_1C, DISP_3SLOTS_1C], (instrs (instregex "MADD(HD|HDU|LD|LD8)$"), (instregex "MUL(HD|HW|LD|LI|LI8|LW)(U)?$") @@ -405,7 +402,7 @@ // 7 cycle Restricted DP operation. One DP unit, one EXEC pipeline and all three // dispatch units for the superslice. -def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_3SLOTS_1C], (instrs FRSP, (instregex "FRI(N|P|Z|M)(D|S)$"), @@ -448,25 +445,25 @@ // 7 cycle Restricted DP operation and one 3 cycle ALU operation. // These operations can be done in parallel. -// The DP is restricted so we need a full 5 dispatches. 
+// The DP is restricted so we need a full 4 dispatches. def : InstRW<[P9_DP_7C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_3SLOTS_1C, DISP_1C], (instrs (instregex "FSEL(D|S)o$") )>; // 5 Cycle Restricted DP operation and one 2 cycle ALU operation. def : InstRW<[P9_DPOpAndALUOp_7C, IP_EXEC_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_3SLOTS_1C, DISP_1C], (instrs (instregex "MUL(H|L)(D|W)(U)?o$") )>; // 7 cycle Restricted DP operation and one 3 cycle ALU operation. // These operations must be done sequentially. -// The DP is restricted so we need a full 5 dispatches. +// The DP is restricted so we need a full 4 dispatches. def : InstRW<[P9_DPOpAndALU2Op_10C, IP_EXEC_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_3SLOTS_1C, DISP_1C], (instrs (instregex "FRI(N|P|Z|M)(D|S)o$"), (instregex "FRE(S)?o$"), @@ -482,8 +479,8 @@ FRSPo )>; -// 7 cycle DP operation. One DP unit, one EXEC pipeline and two dispatch units. -def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C, DISP_1C], +// 7 cycle DP operation. One DP unit, one EXEC pipeline and 1 dispatch units. +def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_1C], (instrs XSADDDP, XSADDSP, @@ -519,9 +516,9 @@ )>; // Three Cycle PM operation. Only one PM unit per superslice so we use the whole -// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// superslice. That includes both exec pipelines (EXECO, EXECE) and one // dispatches. -def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C], (instrs (instregex "LVS(L|R)$"), (instregex "VSPLTIS(W|H|B)$"), @@ -627,9 +624,9 @@ )>; // 12 Cycle DFU operation. Only one DFU unit per CPU so we use a whole -// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// superslice. That includes both exec pipelines (EXECO, EXECE) and one // dispatches. 
-def : InstRW<[P9_DFU_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_DFU_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C], (instrs BCDSRo, XSADDQP, @@ -651,17 +648,17 @@ )>; // 23 Cycle DFU operation. Only one DFU unit per CPU so we use a whole -// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// superslice. That includes both exec pipelines (EXECO, EXECE) and one // dispatches. -def : InstRW<[P9_DFU_23C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_DFU_23C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C], (instrs BCDCTSQo )>; // 24 Cycle DFU operation. Only one DFU unit per CPU so we use a whole -// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// superslice. That includes both exec pipelines (EXECO, EXECE) and one // dispatches. -def : InstRW<[P9_DFU_24C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_DFU_24C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C], (instrs XSMADDQP, XSMADDQPO, @@ -676,17 +673,17 @@ )>; // 37 Cycle DFU operation. Only one DFU unit per CPU so we use a whole -// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// superslice. That includes both exec pipelines (EXECO, EXECE) and one // dispatches. -def : InstRW<[P9_DFU_37C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_DFU_37C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C], (instrs BCDCFSQo )>; // 58 Cycle DFU operation. Only one DFU unit per CPU so we use a whole -// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// superslice. That includes both exec pipelines (EXECO, EXECE) and one // dispatches. -def : InstRW<[P9_DFU_58C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_DFU_58C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C], (instrs XSDIVQP, XSDIVQPO @@ -695,20 +692,20 @@ // 76 Cycle DFU operation. Only one DFU unit per CPU so we use a whole // superslice. 
That includes both exec pipelines (EXECO, EXECE) and all three // dispatches. -def : InstRW<[P9_DFU_76C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_DFU_76C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C], (instrs XSSQRTQP, XSSQRTQPO )>; // 6 Cycle Load uses a single slice. -def : InstRW<[P9_LS_6C, IP_AGEN_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_LS_6C, IP_AGEN_1C, DISP_1C], (instrs (instregex "LXVL(L)?") )>; // 5 Cycle Load uses a single slice. -def : InstRW<[P9_LS_5C, IP_AGEN_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_LS_5C, IP_AGEN_1C, DISP_1C], (instrs (instregex "LVE(B|H|W)X$"), (instregex "LVX(L)?"), @@ -727,7 +724,7 @@ )>; // 4 Cycle Load uses a single slice. -def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_1C], (instrs (instregex "DCB(F|T|ST)(EP)?$"), (instregex "DCBZ(L)?(EP)?$"), @@ -757,7 +754,7 @@ // 4 Cycle Restricted load uses a single slice but the dispatch for the whole // superslice. -def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_LS_4C, IP_AGEN_1C, DISP_3SLOTS_1C], (instrs LFIWZX, LFDX, @@ -767,7 +764,7 @@ // Cracked Load Instructions. // Load instructions that can be done in parallel. def : InstRW<[P9_LS_4C, P9_LS_4C, IP_AGEN_1C, IP_AGEN_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_PAIR_1C], (instrs SLBIA, SLBIE, @@ -781,17 +778,26 @@ // Requires Load and ALU pieces totaling 6 cycles. The Load and ALU // operations can be run in parallel. def : InstRW<[P9_LS_4C, P9_ALU_2C, IP_EXEC_1C, IP_AGEN_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_PAIR_1C, DISP_PAIR_1C], + (instrs + (instregex "L(W|H)ZU(X)?(8)?$") +)>; + +// Cracked TEND Instruction. +// Requires Load and ALU pieces totaling 6 cycles. The Load and ALU +// operations can be run in parallel. 
+def : InstRW<[P9_LS_4C, P9_ALU_2C, IP_EXEC_1C, IP_AGEN_1C, + DISP_1C, DISP_1C], (instrs - (instregex "L(W|H)ZU(X)?(8)?$"), TEND )>; + // Cracked Store Instruction // Consecutive Store and ALU instructions. The store is restricted and requires // three dispatches. def : InstRW<[P9_StoreAndALUOp_3C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_3SLOTS_1C, DISP_1C], (instrs (instregex "ST(B|H|W|D)CX$") )>; @@ -799,7 +805,7 @@ // Cracked Load Instruction. // Two consecutive load operations for a total of 8 cycles. def : InstRW<[P9_LoadAndLoadOp_8C, IP_AGEN_1C, IP_AGEN_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_1C, DISP_1C], (instrs LDMX )>; @@ -808,7 +814,7 @@ // Requires consecutive Load and ALU pieces totaling 6 cycles. The Load and ALU // operations cannot be done at the same time and so their latencies are added. def : InstRW<[P9_LoadAndALUOp_6C, IP_EXEC_1C, IP_AGEN_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_1C, DISP_1C], (instrs (instregex "LHA(X)?(8)?$"), (instregex "CP_PASTE(8)?o$"), @@ -821,7 +827,7 @@ // operations cannot be done at the same time and so their latencies are added. // Full 6 dispatches are required as this is both cracked and restricted. def : InstRW<[P9_LoadAndALUOp_6C, IP_EXEC_1C, IP_AGEN_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_3SLOTS_1C, DISP_3SLOTS_1C], (instrs LFIWAX )>; @@ -830,8 +836,7 @@ // Requires consecutive Load and ALU pieces totaling 7 cycles. The Load and ALU // operations cannot be done at the same time and so their latencies are added. // Full 4 dispatches are required as this is a cracked instruction. -def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_LoadAndALUOp_7C, IP_AGEN_1C, IP_EXEC_1C, DISP_1C, DISP_1C], (instrs LXSIWAX, LIWAX @@ -843,7 +848,7 @@ // their latencies are added. // Full 6 dispatches are required as this is a restricted instruction. 
def : InstRW<[P9_LoadAndALU2Op_7C, IP_AGEN_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_3SLOTS_1C, DISP_3SLOTS_1C], (instrs LFSX, LFS @@ -853,8 +858,7 @@ // Requires consecutive Load and ALU pieces totaling 8 cycles. The Load and ALU // operations cannot be done at the same time and so their latencies are added. // Full 4 dispatches are required as this is a cracked instruction. -def : InstRW<[P9_LoadAndALU2Op_8C, IP_AGEN_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_LoadAndALU2Op_8C, IP_AGEN_1C, IP_EXEC_1C, DISP_1C, DISP_1C], (instrs LXSSP, LXSSPX, @@ -865,7 +869,7 @@ // Cracked 3-Way Load Instruction // Load with two ALU operations that depend on each other def : InstRW<[P9_LoadAndALUOp_6C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_PAIR_1C, DISP_PAIR_1C, DISP_1C], (instrs (instregex "LHAU(X)?(8)?$"), LWAUX @@ -875,10 +879,10 @@ // Since the Load and the PM cannot be done at the same time the latencies are // added. Requires 8 cycles. // Since the PM requires the full superslice we need both EXECE, EXECO pipelines -// as well as 3 dispatches for the PM. The Load requires the remaining 2 +// as well as 1 dispatches for the PM. The Load requires the remaining 1 // dispatches. def : InstRW<[P9_LoadAndPMOp_8C, IP_AGEN_1C, IP_EXECE_1C, IP_EXECO_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_1C, DISP_1C], (instrs LXVH8X, LXVDSX, @@ -887,7 +891,7 @@ // Single slice Restricted store operation. The restricted operation requires // all three dispatches for the superslice. 
-def : InstRW<[P9_LS_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_LS_1C, IP_EXEC_1C, IP_AGEN_1C, DISP_3SLOTS_1C], (instrs (instregex "STF(S|D|IWX|SX|DX)$"), (instregex "STXS(D|DX|SPX|IWX|IBX|IHX|SP)(v)?$"), @@ -904,10 +908,9 @@ )>; // Vector Store Instruction -// Requires the whole superslice and therefore requires all three dispatches +// Requires the whole superslice and therefore requires one dispatches // as well as both the Even and Odd exec pipelines. -def : InstRW<[P9_LS_1C, IP_EXECE_1C, IP_EXECO_1C, IP_AGEN_1C, - DISP_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_LS_1C, IP_EXECE_1C, IP_EXECO_1C, IP_AGEN_1C, DISP_1C], (instrs (instregex "STVE(B|H|W)X$"), (instregex "STVX(L)?$"), @@ -915,18 +918,18 @@ )>; // 5 Cycle DIV operation. Only one DIV unit per superslice so we use the whole -// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// superslice. That includes both exec pipelines (EXECO, EXECE) and two // dispatches. -def : InstRW<[P9_DIV_5C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_DIV_5C, IP_EXECE_1C, IP_EXECO_1C, DISP_EVEN_1C], (instrs (instregex "MTCTR(8)?(loop)?$"), (instregex "MTLR(8)?$") )>; // 12 Cycle DIV operation. Only one DIV unit per superslice so we use the whole -// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// superslice. That includes both exec pipelines (EXECO, EXECE) and two // dispatches. -def : InstRW<[P9_DIV_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_DIV_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_EVEN_1C], (instrs (instregex "M(T|F)VRSAVE(v)?$"), (instregex "M(T|F)PMR$"), @@ -937,10 +940,9 @@ )>; // 16 Cycle DIV operation. Only one DIV unit per superslice so we use the whole -// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// superslice. That includes both exec pipelines (EXECO, EXECE) and two // dispatches. 
-def : InstRW<[P9_DIV_16C_8, IP_EXECO_1C, IP_EXECE_1C, - DISP_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_DIV_16C_8, IP_EXECO_1C, IP_EXECE_1C, DISP_EVEN_1C], (instrs DIVW, DIVWU, @@ -948,10 +950,9 @@ )>; // 24 Cycle DIV operation. Only one DIV unit per superslice so we use the whole -// superslice. That includes both exec pipelines (EXECO, EXECE) and all three +// superslice. That includes both exec pipelines (EXECO, EXECE) and two // dispatches. -def : InstRW<[P9_DIV_24C_8, IP_EXECO_1C, IP_EXECE_1C, - DISP_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_DIV_24C_8, IP_EXECO_1C, IP_EXECE_1C, DISP_EVEN_1C], (instrs DIVWE, DIVD, @@ -965,8 +966,7 @@ // 40 Cycle DIV operation. Only one DIV unit per superslice so we use the whole // superslice. That includes both exec pipelines (EXECO, EXECE) and all three // dispatches. -def : InstRW<[P9_DIV_40C_8, IP_EXECO_1C, IP_EXECE_1C, - DISP_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_DIV_40C_8, IP_EXECO_1C, IP_EXECE_1C, DISP_EVEN_1C], (instrs DIVDE, DIVDEU @@ -976,7 +976,7 @@ // and one full superslice for the DIV operation since there is only one DIV // per superslice. Latency of DIV plus ALU is 26. def : InstRW<[P9_IntDivAndALUOp_18C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_EVEN_1C, DISP_1C], (instrs (instregex "DIVW(U)?(O)?o$") )>; @@ -985,7 +985,7 @@ // and one full superslice for the DIV operation since there is only one DIV // per superslice. Latency of DIV plus ALU is 26. def : InstRW<[P9_IntDivAndALUOp_26C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_EVEN_1C, DISP_1C], (instrs DIVDo, DIVDUo, @@ -997,7 +997,7 @@ // and one full superslice for the DIV operation since there is only one DIV // per superslice. Latency of DIV plus ALU is 42. 
def : InstRW<[P9_IntDivAndALUOp_42C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_EVEN_1C, DISP_1C], (instrs DIVDEo, DIVDEUo @@ -1011,7 +1011,7 @@ // instructions running together on two pipelines and 6 dispatches. // ALU ops are 2 cycles each. def : InstRW<[P9_ALU_2C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_3SLOTS_1C, DISP_3SLOTS_1C], (instrs MTCRF, MTCRF8 @@ -1020,10 +1020,10 @@ // Cracked ALU operations. // Here the two ALU ops can actually be done in parallel and therefore the // latencies are not added together. Otherwise this is like having two -// instructions running together on two pipelines and 4 dispatches. +// instructions running together on two pipelines and 2 dispatches. // ALU ops are 2 cycles each. def : InstRW<[P9_ALU_2C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_1C, DISP_1C], (instrs (instregex "ADDC(8)?o$"), (instregex "SUBFC(8)?o$") @@ -1035,7 +1035,7 @@ // One of the ALU ops is restricted the other is not so we have a total of // 5 dispatches. def : InstRW<[P9_ALU_2C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_3SLOTS_1C, DISP_1C], (instrs (instregex "F(N)?ABS(D|S)o$"), (instregex "FCPSGN(D|S)o$"), @@ -1046,10 +1046,10 @@ // Cracked ALU operations. // Here the two ALU ops can actually be done in parallel and therefore the // latencies are not added together. Otherwise this is like having two -// instructions running together on two pipelines and 4 dispatches. +// instructions running together on two pipelines and 2 dispatches. // ALU ops are 3 cycles each. def : InstRW<[P9_ALU_3C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_1C, DISP_1C], (instrs MCRFS )>; @@ -1060,7 +1060,7 @@ // instructions running together on two pipelines and 6 dispatches. // ALU ops are 3 cycles each. 
def : InstRW<[P9_ALU_3C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_3SLOTS_1C, DISP_3SLOTS_1C], (instrs (instregex "MTFSF(b|o)?$"), (instregex "MTFSFI(o)?$") @@ -1070,7 +1070,7 @@ // The two ops cannot be done in parallel. // One of the ALU ops is restricted and takes 3 dispatches. def : InstRW<[P9_ALUOpAndALUOp_4C, IP_EXEC_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_3SLOTS_1C, DISP_1C], (instrs (instregex "RLD(I)?C(R|L)o$"), (instregex "RLW(IMI|INM|NM)(8)?o$"), @@ -1085,7 +1085,7 @@ // The two ops cannot be done in parallel. // Both of the ALU ops are restricted and take 3 dispatches. def : InstRW<[P9_ALU2OpAndALU2Op_6C, IP_EXEC_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_3SLOTS_1C, DISP_3SLOTS_1C], (instrs (instregex "MFFS(L|CE|o)?$") )>; @@ -1094,16 +1094,14 @@ // total of 6 cycles. All of the ALU operations are also restricted so each // takes 3 dispatches for a total of 9. def : InstRW<[P9_ALUOpAndALUOpAndALUOp_6C, IP_EXEC_1C, IP_EXEC_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, - DISP_1C, DISP_1C], + DISP_3SLOTS_1C, DISP_3SLOTS_1C, DISP_3SLOTS_1C], (instrs (instregex "MFCR(8)?$") )>; // Cracked instruction made of two ALU ops. // The two ops cannot be done in parallel. -def : InstRW<[P9_ALUOpAndALUOp_4C, IP_EXEC_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_ALUOpAndALUOp_4C, IP_EXEC_1C, IP_EXEC_1C, DISP_1C, DISP_1C], (instrs (instregex "EXTSWSLIo$"), (instregex "SRAD(I)?o$"), @@ -1113,110 +1111,110 @@ )>; // 33 Cycle DP Instruction Restricted. Takes one slice and 3 dispatches. -def : InstRW<[P9_DP_33C_8, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_DP_33C_8, IP_EXEC_1C, DISP_3SLOTS_1C], (instrs FDIV )>; // 33 Cycle DP Instruction Restricted and Cracked with 3 Cycle ALU. 
def : InstRW<[P9_DPOpAndALU2Op_36C_8, IP_EXEC_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_3SLOTS_1C, DISP_1C], (instrs FDIVo )>; // 36 Cycle DP Instruction. // Instruction can be done on a single slice. -def : InstRW<[P9_DP_36C_10, IP_EXEC_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_DP_36C_10, IP_EXEC_1C, DISP_1C], (instrs XSSQRTDP )>; // 36 Cycle DP Instruction Restricted. Takes one slice and 3 dispatches. -def : InstRW<[P9_DP_36C_10, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_DP_36C_10, IP_EXEC_1C, DISP_3SLOTS_1C], (instrs FSQRT )>; // 36 Cycle DP Vector Instruction. def : InstRW<[P9_DPE_36C_10, P9_DPO_36C_10, IP_EXECE_1C, IP_EXECO_1C, - DISP_1C, DISP_1C, DISP_1C], + DISP_1C], (instrs XVSQRTDP )>; // 27 Cycle DP Vector Instruction. def : InstRW<[P9_DPE_27C_10, P9_DPO_27C_10, IP_EXECE_1C, IP_EXECO_1C, - DISP_1C, DISP_1C, DISP_1C], + DISP_1C], (instrs XVSQRTSP )>; // 36 Cycle DP Instruction Restricted and Cracked with 3 Cycle ALU. def : InstRW<[P9_DPOpAndALU2Op_39C_10, IP_EXEC_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_3SLOTS_1C, DISP_1C], (instrs FSQRTo )>; // 26 Cycle DP Instruction. -def : InstRW<[P9_DP_26C_5, IP_EXEC_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_DP_26C_5, IP_EXEC_1C, DISP_1C], (instrs XSSQRTSP )>; // 26 Cycle DP Instruction Restricted. Takes one slice and 3 dispatches. -def : InstRW<[P9_DP_26C_5, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_DP_26C_5, IP_EXEC_1C, DISP_3SLOTS_1C], (instrs FSQRTS )>; // 26 Cycle DP Instruction Restricted and Cracked with 3 Cycle ALU. def : InstRW<[P9_DPOpAndALU2Op_29C_5, IP_EXEC_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_3SLOTS_1C, DISP_1C], (instrs FSQRTSo )>; -// 33 Cycle DP Instruction. Takes one slice and 2 dispatches. -def : InstRW<[P9_DP_33C_8, IP_EXEC_1C, DISP_1C, DISP_1C], +// 33 Cycle DP Instruction. Takes one slice and 1 dispatches. 
+def : InstRW<[P9_DP_33C_8, IP_EXEC_1C, DISP_1C], (instrs XSDIVDP )>; // 22 Cycle DP Instruction Restricted. Takes one slice and 3 dispatches. -def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_3SLOTS_1C], (instrs FDIVS )>; // 22 Cycle DP Instruction Restricted and Cracked with 2 Cycle ALU. def : InstRW<[P9_DPOpAndALU2Op_25C_5, IP_EXEC_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_3SLOTS_1C, DISP_1C], (instrs FDIVSo )>; -// 22 Cycle DP Instruction. Takes one slice and 2 dispatches. -def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_1C, DISP_1C], +// 22 Cycle DP Instruction. Takes one slice and 1 dispatches. +def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_1C], (instrs XSDIVSP )>; // 24 Cycle DP Vector Instruction. Takes one full superslice. -// Includes both EXECE, EXECO pipelines and all 3 dispatches for the given +// Includes both EXECE, EXECO pipelines and 1 dispatch for the given // superslice. def : InstRW<[P9_DPE_24C_8, P9_DPO_24C_8, IP_EXECE_1C, IP_EXECO_1C, - DISP_1C, DISP_1C, DISP_1C], + DISP_1C], (instrs XVDIVSP )>; // 33 Cycle DP Vector Instruction. Takes one full superslice. -// Includes both EXECE, EXECO pipelines and all 3 dispatches for the given +// Includes both EXECE, EXECO pipelines and 1 dispatch for the given // superslice. def : InstRW<[P9_DPE_33C_8, P9_DPO_33C_8, IP_EXECE_1C, IP_EXECO_1C, - DISP_1C, DISP_1C, DISP_1C], + DISP_1C], (instrs XVDIVDP )>; @@ -1225,12 +1223,11 @@ // The Load and one of the ALU ops cannot be run at the same time and so the // latencies are added together for 6 cycles. The remainaing ALU is 2 cycles. // Both the load and the ALU that depends on it are restricted and so they take -// a total of 6 dispatches. The final 2 dispatches come from the second ALU op. +// a total of 7 dispatches. The final 2 dispatches come from the second ALU op. // The two EXEC pipelines are for the 2 ALUs while the AGEN is for the load. 
def : InstRW<[P9_LoadAndALU2Op_7C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_3SLOTS_1C, DISP_3SLOTS_1C, DISP_1C], (instrs (instregex "LF(SU|SUX)$") )>; @@ -1239,7 +1236,7 @@ // the store and so it can be run at the same time as the store. The store is // also restricted. def : InstRW<[P9_LS_1C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_3SLOTS_1C, DISP_1C], (instrs (instregex "STF(S|D)U(X)?$"), (instregex "ST(B|H|W|D)U(X)?(8)?$") @@ -1248,7 +1245,7 @@ // Cracked instruction made up of a Load and an ALU. The ALU does not depend on // the load and so it can be run at the same time as the load. def : InstRW<[P9_LS_4C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_PAIR_1C, DISP_PAIR_1C], (instrs (instregex "LBZU(X)?(8)?$"), (instregex "LDU(X)?$") @@ -1261,7 +1258,7 @@ // are from the ALU. The AGEN pipeline is from the load and the EXEC pipeline // is required for the ALU. def : InstRW<[P9_LS_4C, P9_ALU_2C, IP_AGEN_1C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C], + DISP_3SLOTS_1C, DISP_1C], (instrs (instregex "LF(DU|DUX)$") )>; @@ -1269,9 +1266,9 @@ // Crypto Instructions // 6 Cycle CY operation. Only one CY unit per CPU so we use a whole -// superslice. That includes both exec pipelines (EXECO, EXECE) and all three -// dispatches. -def : InstRW<[P9_CY_6C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C, DISP_1C, DISP_1C], +// superslice. That includes both exec pipelines (EXECO, EXECE) and one +// dispatch. 
+def : InstRW<[P9_CY_6C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C], (instrs (instregex "VPMSUM(B|H|W|D)$"), (instregex "V(N)?CIPHER(LAST)?$"), @@ -1281,7 +1278,7 @@ // Branch Instructions // Two Cycle Branch -def : InstRW<[P9_BR_2C, DISP_1C, DISP_1C], +def : InstRW<[P9_BR_2C, DISP_BR_1C], (instrs (instregex "BCCCTR(L)?(8)?$"), (instregex "BCCL(A|R|RL)?$"), @@ -1312,8 +1309,7 @@ // Five Cycle Branch with a 2 Cycle ALU Op // Operations must be done consecutively and not in parallel. -def : InstRW<[P9_BROpAndALUOp_7C, IP_EXEC_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C], +def : InstRW<[P9_BROpAndALUOp_7C, IP_EXEC_1C, DISP_BR_1C, DISP_1C], (instrs ADDPCIS )>; @@ -1323,17 +1319,15 @@ // Atomic Load def : InstRW<[P9_LS_1C, P9_LS_1C, P9_LS_4C, P9_LS_4C, P9_LS_4C, IP_EXEC_1C, IP_EXEC_1C, IP_AGEN_1C, IP_AGEN_1C, IP_AGEN_1C, - IP_AGEN_1C, IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, - DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, - DISP_1C], + IP_AGEN_1C, IP_AGEN_1C, DISP_1C, DISP_3SLOTS_1C, + DISP_3SLOTS_1C, DISP_1C, DISP_1C, DISP_1C], (instrs (instregex "L(D|W)AT$") )>; // Atomic Store def : InstRW<[P9_LS_1C, P9_LS_4C, P9_LS_4C, IP_EXEC_1C, IP_AGEN_1C, IP_AGEN_1C, - IP_AGEN_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, DISP_1C, - DISP_1C], + IP_AGEN_1C, DISP_1C, DISP_3SLOTS_1C, DISP_1C], (instrs (instregex "ST(D|W)AT$") )>; Index: llvm/trunk/lib/Target/PowerPC/PPCScheduleP9.td =================================================================== --- llvm/trunk/lib/Target/PowerPC/PPCScheduleP9.td +++ llvm/trunk/lib/Target/PowerPC/PPCScheduleP9.td @@ -50,8 +50,21 @@ // ***************** Processor Resources ***************** - //Dispatcher: - def DISPATCHER : ProcResource<12>; + // Dispatcher slots: + // x0, x1, x2, and x3 are the dedicated slice dispatch ports, where each + // corresponds to one of the four execution slices. 
+ def DISPx02 : ProcResource<2>; + def DISPx13 : ProcResource<2>; + // The xa and xb ports can be used to send an iop to either of the two slices + // of the superslice, but are restricted to iops with only two primary sources. + def DISPxab : ProcResource<2>; + // b0 and b1 are dedicated dispatch ports into the branch slice. + def DISPb01 : ProcResource<2>; + + // Any non BR dispatch ports + def DISP_NBR + : ProcResGroup<[ DISPx02, DISPx13, DISPxab]>; + def DISP_SS : ProcResGroup<[ DISPx02, DISPx13]>; // Issue Ports // An instruction can go down one of two issue queues. @@ -116,8 +129,37 @@ // ***************** SchedWriteRes Definitions ***************** - //Dispatcher - def DISP_1C : SchedWriteRes<[DISPATCHER]> { + // Dispatcher + // Dispatch Rules: '-' or 'V' + // Vector ('V') - vector iops (128-bit operand) take only one decode and + // dispatch slot but are dispatched to both the even and odd slices of a + // superslice. + def DISP_1C : SchedWriteRes<[DISP_NBR]> { + let NumMicroOps = 0; + let Latency = 1; + } + // Dispatch Rules: 'E' + // Even slice ('E')- certain operations must be sent only to an even slice. + // Also consumes odd dispatch slice slot of the same superslice at dispatch + def DISP_EVEN_1C : SchedWriteRes<[ DISPx02, DISPx13 ]> { + let NumMicroOps = 0; + let Latency = 1; + } + // Dispatch Rules: 'P' + // Paired ('P') - certain cracked and expanded iops are paired such that they + // must dispatch together to the same superslice. 
+ def DISP_PAIR_1C : SchedWriteRes<[ DISP_SS, DISP_SS]> { + let NumMicroOps = 0; + let Latency = 1; + } + // Tuple Restricted ('R') - certain iops preclude dispatching more than one + // operation per slice for the superslice to which they are dispatched + def DISP_3SLOTS_1C : SchedWriteRes<[DISPx02, DISPx13, DISPxab]> { + let NumMicroOps = 0; + let Latency = 1; + } + // Each execution and branch slice can receive up to two iops per cycle + def DISP_BR_1C : SchedWriteRes<[ DISPxab ]> { let NumMicroOps = 0; let Latency = 1; } Index: llvm/trunk/test/CodeGen/PowerPC/build-vector-tests.ll =================================================================== --- llvm/trunk/test/CodeGen/PowerPC/build-vector-tests.ll +++ llvm/trunk/test/CodeGen/PowerPC/build-vector-tests.ll @@ -2012,9 +2012,9 @@ ; P9BE-NEXT: lfd f0, 24(r3) ; P9BE-NEXT: lfd f1, 16(r3) ; P9BE-NEXT: lfd f2, 8(r3) +; P9BE-NEXT: xxmrghd vs0, vs0, vs2 ; P9BE-NEXT: lfd f3, 0(r3) ; P9BE-NEXT: xxmrghd vs1, vs1, vs3 -; P9BE-NEXT: xxmrghd vs0, vs0, vs2 ; P9BE-NEXT: xvcvdpsxws v2, vs1 ; P9BE-NEXT: xvcvdpsxws v3, vs0 ; P9BE-NEXT: vmrgew v2, v3, v2 @@ -2025,8 +2025,8 @@ ; P9LE-NEXT: lfd f0, 24(r3) ; P9LE-NEXT: lfd f2, 8(r3) ; P9LE-NEXT: lfd f1, 16(r3) -; P9LE-NEXT: lfd f3, 0(r3) ; P9LE-NEXT: xxmrghd vs0, vs2, vs0 +; P9LE-NEXT: lfd f3, 0(r3) ; P9LE-NEXT: xvcvdpsxws v2, vs0 ; P9LE-NEXT: xxmrghd vs0, vs3, vs1 ; P9LE-NEXT: xvcvdpsxws v3, vs0 @@ -3596,9 +3596,9 @@ ; P9BE-NEXT: lfd f0, 24(r3) ; P9BE-NEXT: lfd f1, 16(r3) ; P9BE-NEXT: lfd f2, 8(r3) +; P9BE-NEXT: xxmrghd vs0, vs0, vs2 ; P9BE-NEXT: lfd f3, 0(r3) ; P9BE-NEXT: xxmrghd vs1, vs1, vs3 -; P9BE-NEXT: xxmrghd vs0, vs0, vs2 ; P9BE-NEXT: xvcvdpuxws v2, vs1 ; P9BE-NEXT: xvcvdpuxws v3, vs0 ; P9BE-NEXT: vmrgew v2, v3, v2 @@ -3609,8 +3609,8 @@ ; P9LE-NEXT: lfd f0, 24(r3) ; P9LE-NEXT: lfd f2, 8(r3) ; P9LE-NEXT: lfd f1, 16(r3) -; P9LE-NEXT: lfd f3, 0(r3) ; P9LE-NEXT: xxmrghd vs0, vs2, vs0 +; P9LE-NEXT: lfd f3, 0(r3) ; P9LE-NEXT: xvcvdpuxws v2, vs0 ; P9LE-NEXT: xxmrghd vs0, vs3, 
vs1 ; P9LE-NEXT: xvcvdpuxws v3, vs0 Index: llvm/trunk/test/CodeGen/PowerPC/csr-save-restore-order.ll =================================================================== --- llvm/trunk/test/CodeGen/PowerPC/csr-save-restore-order.ll +++ llvm/trunk/test/CodeGen/PowerPC/csr-save-restore-order.ll @@ -58,18 +58,30 @@ ; CHECK-PWR9-NEXT: std r14, 240(r1) # 8-byte Folded Spill ; CHECK-PWR9-NEXT: std r15, 248(r1) # 8-byte Folded Spill ; CHECK-PWR9-NEXT: std r16, 256(r1) # 8-byte Folded Spill +; CHECK-PWR9-NEXT: stxv v20, 48(r1) # 16-byte Folded Spill +; CHECK-PWR9-NEXT: stxv v21, 64(r1) # 16-byte Folded Spill +; CHECK-PWR9-NEXT: stxv v22, 80(r1) # 16-byte Folded Spill ; CHECK-PWR9-NEXT: std r17, 264(r1) # 8-byte Folded Spill +; CHECK-PWR9-NEXT: stxv v23, 96(r1) # 16-byte Folded Spill ; CHECK-PWR9-NEXT: std r18, 272(r1) # 8-byte Folded Spill ; CHECK-PWR9-NEXT: std r19, 280(r1) # 8-byte Folded Spill +; CHECK-PWR9-NEXT: stxv v24, 112(r1) # 16-byte Folded Spill ; CHECK-PWR9-NEXT: std r20, 288(r1) # 8-byte Folded Spill +; CHECK-PWR9-NEXT: stxv v25, 128(r1) # 16-byte Folded Spill ; CHECK-PWR9-NEXT: std r21, 296(r1) # 8-byte Folded Spill +; CHECK-PWR9-NEXT: stxv v26, 144(r1) # 16-byte Folded Spill ; CHECK-PWR9-NEXT: std r22, 304(r1) # 8-byte Folded Spill ; CHECK-PWR9-NEXT: std r23, 312(r1) # 8-byte Folded Spill +; CHECK-PWR9-NEXT: stxv v27, 160(r1) # 16-byte Folded Spill ; CHECK-PWR9-NEXT: std r24, 320(r1) # 8-byte Folded Spill +; CHECK-PWR9-NEXT: stxv v28, 176(r1) # 16-byte Folded Spill ; CHECK-PWR9-NEXT: std r25, 328(r1) # 8-byte Folded Spill +; CHECK-PWR9-NEXT: stxv v29, 192(r1) # 16-byte Folded Spill ; CHECK-PWR9-NEXT: std r26, 336(r1) # 8-byte Folded Spill ; CHECK-PWR9-NEXT: std r27, 344(r1) # 8-byte Folded Spill +; CHECK-PWR9-NEXT: stxv v30, 208(r1) # 16-byte Folded Spill ; CHECK-PWR9-NEXT: std r28, 352(r1) # 8-byte Folded Spill +; CHECK-PWR9-NEXT: stxv v31, 224(r1) # 16-byte Folded Spill ; CHECK-PWR9-NEXT: std r29, 360(r1) # 8-byte Folded Spill ; CHECK-PWR9-NEXT: std r30, 
368(r1) # 8-byte Folded Spill ; CHECK-PWR9-NEXT: std r31, 376(r1) # 8-byte Folded Spill @@ -91,20 +103,8 @@ ; CHECK-PWR9-NEXT: stfd f29, 504(r1) # 8-byte Folded Spill ; CHECK-PWR9-NEXT: stfd f30, 512(r1) # 8-byte Folded Spill ; CHECK-PWR9-NEXT: stfd f31, 520(r1) # 8-byte Folded Spill -; CHECK-PWR9-NEXT: stxv v20, 48(r1) # 16-byte Folded Spill ; CHECK-PWR9-NEXT: std r4, 40(r1) # 8-byte Folded Spill ; CHECK-PWR9-NEXT: std r3, 32(r1) # 8-byte Folded Spill -; CHECK-PWR9-NEXT: stxv v21, 64(r1) # 16-byte Folded Spill -; CHECK-PWR9-NEXT: stxv v22, 80(r1) # 16-byte Folded Spill -; CHECK-PWR9-NEXT: stxv v23, 96(r1) # 16-byte Folded Spill -; CHECK-PWR9-NEXT: stxv v24, 112(r1) # 16-byte Folded Spill -; CHECK-PWR9-NEXT: stxv v25, 128(r1) # 16-byte Folded Spill -; CHECK-PWR9-NEXT: stxv v26, 144(r1) # 16-byte Folded Spill -; CHECK-PWR9-NEXT: stxv v27, 160(r1) # 16-byte Folded Spill -; CHECK-PWR9-NEXT: stxv v28, 176(r1) # 16-byte Folded Spill -; CHECK-PWR9-NEXT: stxv v29, 192(r1) # 16-byte Folded Spill -; CHECK-PWR9-NEXT: stxv v30, 208(r1) # 16-byte Folded Spill -; CHECK-PWR9-NEXT: stxv v31, 224(r1) # 16-byte Folded Spill ; CHECK-PWR9-NEXT: #APP ; CHECK-PWR9-NEXT: nop ; CHECK-PWR9-NEXT: #NO_APP @@ -129,18 +129,6 @@ ; CHECK-PWR9-NEXT: lfd f28, 496(r1) # 8-byte Folded Reload ; CHECK-PWR9-NEXT: lfd f27, 488(r1) # 8-byte Folded Reload ; CHECK-PWR9-NEXT: lfd f26, 480(r1) # 8-byte Folded Reload -; CHECK-PWR9-NEXT: lfd f25, 472(r1) # 8-byte Folded Reload -; CHECK-PWR9-NEXT: lfd f24, 464(r1) # 8-byte Folded Reload -; CHECK-PWR9-NEXT: lfd f23, 456(r1) # 8-byte Folded Reload -; CHECK-PWR9-NEXT: lfd f22, 448(r1) # 8-byte Folded Reload -; CHECK-PWR9-NEXT: lfd f21, 440(r1) # 8-byte Folded Reload -; CHECK-PWR9-NEXT: lfd f20, 432(r1) # 8-byte Folded Reload -; CHECK-PWR9-NEXT: lfd f19, 424(r1) # 8-byte Folded Reload -; CHECK-PWR9-NEXT: lfd f18, 416(r1) # 8-byte Folded Reload -; CHECK-PWR9-NEXT: lfd f17, 408(r1) # 8-byte Folded Reload -; CHECK-PWR9-NEXT: lfd f16, 400(r1) # 8-byte Folded Reload -; 
CHECK-PWR9-NEXT: lfd f15, 392(r1) # 8-byte Folded Reload -; CHECK-PWR9-NEXT: lfd f14, 384(r1) # 8-byte Folded Reload ; CHECK-PWR9-NEXT: ld r31, 376(r1) # 8-byte Folded Reload ; CHECK-PWR9-NEXT: ld r30, 368(r1) # 8-byte Folded Reload ; CHECK-PWR9-NEXT: ld r29, 360(r1) # 8-byte Folded Reload @@ -148,17 +136,29 @@ ; CHECK-PWR9-NEXT: ld r27, 344(r1) # 8-byte Folded Reload ; CHECK-PWR9-NEXT: ld r26, 336(r1) # 8-byte Folded Reload ; CHECK-PWR9-NEXT: ld r25, 328(r1) # 8-byte Folded Reload +; CHECK-PWR9-NEXT: lfd f25, 472(r1) # 8-byte Folded Reload ; CHECK-PWR9-NEXT: ld r24, 320(r1) # 8-byte Folded Reload ; CHECK-PWR9-NEXT: ld r23, 312(r1) # 8-byte Folded Reload ; CHECK-PWR9-NEXT: ld r22, 304(r1) # 8-byte Folded Reload +; CHECK-PWR9-NEXT: lfd f24, 464(r1) # 8-byte Folded Reload ; CHECK-PWR9-NEXT: ld r21, 296(r1) # 8-byte Folded Reload ; CHECK-PWR9-NEXT: ld r20, 288(r1) # 8-byte Folded Reload ; CHECK-PWR9-NEXT: ld r19, 280(r1) # 8-byte Folded Reload +; CHECK-PWR9-NEXT: lfd f23, 456(r1) # 8-byte Folded Reload ; CHECK-PWR9-NEXT: ld r18, 272(r1) # 8-byte Folded Reload ; CHECK-PWR9-NEXT: ld r17, 264(r1) # 8-byte Folded Reload ; CHECK-PWR9-NEXT: ld r16, 256(r1) # 8-byte Folded Reload +; CHECK-PWR9-NEXT: lfd f22, 448(r1) # 8-byte Folded Reload ; CHECK-PWR9-NEXT: ld r15, 248(r1) # 8-byte Folded Reload ; CHECK-PWR9-NEXT: ld r14, 240(r1) # 8-byte Folded Reload +; CHECK-PWR9-NEXT: lfd f21, 440(r1) # 8-byte Folded Reload +; CHECK-PWR9-NEXT: lfd f20, 432(r1) # 8-byte Folded Reload +; CHECK-PWR9-NEXT: lfd f19, 424(r1) # 8-byte Folded Reload +; CHECK-PWR9-NEXT: lfd f18, 416(r1) # 8-byte Folded Reload +; CHECK-PWR9-NEXT: lfd f17, 408(r1) # 8-byte Folded Reload +; CHECK-PWR9-NEXT: lfd f16, 400(r1) # 8-byte Folded Reload +; CHECK-PWR9-NEXT: lfd f15, 392(r1) # 8-byte Folded Reload +; CHECK-PWR9-NEXT: lfd f14, 384(r1) # 8-byte Folded Reload ; CHECK-PWR9-NEXT: addi r1, r1, 528 ; CHECK-PWR9-NEXT: blr entry: Index: llvm/trunk/test/CodeGen/PowerPC/vec_conv_fp32_to_i8_elts.ll 
=================================================================== --- llvm/trunk/test/CodeGen/PowerPC/vec_conv_fp32_to_i8_elts.ll +++ llvm/trunk/test/CodeGen/PowerPC/vec_conv_fp32_to_i8_elts.ll @@ -494,22 +494,101 @@ ; ; CHECK-P9-LABEL: test16elt: ; CHECK-P9: # %bb.0: # %entry -; CHECK-P9-NEXT: lxv vs0, 0(r3) +; CHECK-P9-NEXT: lxv vs2, 0(r3) +; CHECK-P9-NEXT: xxsldwi vs3, vs2, vs2, 3 +; CHECK-P9-NEXT: xscvspdpn f3, vs3 +; CHECK-P9-NEXT: xscvdpsxws f3, f3 +; CHECK-P9-NEXT: lxv vs0, 48(r3) +; CHECK-P9-NEXT: lxv vs1, 32(r3) +; CHECK-P9-NEXT: lxv vs4, 16(r3) +; CHECK-P9-NEXT: mfvsrwz r3, f3 +; CHECK-P9-NEXT: mtvsrd f3, r3 +; CHECK-P9-NEXT: xxswapd v2, vs3 +; CHECK-P9-NEXT: xxswapd vs3, vs2 +; CHECK-P9-NEXT: xscvspdpn f3, vs3 +; CHECK-P9-NEXT: xscvdpsxws f3, f3 +; CHECK-P9-NEXT: mfvsrwz r3, f3 +; CHECK-P9-NEXT: mtvsrd f3, r3 +; CHECK-P9-NEXT: xxswapd v3, vs3 +; CHECK-P9-NEXT: xscvspdpn f3, vs2 +; CHECK-P9-NEXT: xxsldwi vs2, vs2, vs2, 1 +; CHECK-P9-NEXT: xscvspdpn f2, vs2 +; CHECK-P9-NEXT: xscvdpsxws f3, f3 +; CHECK-P9-NEXT: xscvdpsxws f2, f2 +; CHECK-P9-NEXT: mfvsrwz r3, f3 +; CHECK-P9-NEXT: mtvsrd f3, r3 +; CHECK-P9-NEXT: mfvsrwz r3, f2 +; CHECK-P9-NEXT: mtvsrd f2, r3 +; CHECK-P9-NEXT: xxswapd v4, vs2 +; CHECK-P9-NEXT: xxsldwi vs2, vs4, vs4, 3 +; CHECK-P9-NEXT: xscvspdpn f2, vs2 +; CHECK-P9-NEXT: xscvdpsxws f2, f2 +; CHECK-P9-NEXT: vmrglb v2, v3, v2 +; CHECK-P9-NEXT: xxswapd v3, vs3 +; CHECK-P9-NEXT: vmrglb v3, v3, v4 +; CHECK-P9-NEXT: vmrglh v2, v3, v2 +; CHECK-P9-NEXT: mfvsrwz r3, f2 +; CHECK-P9-NEXT: mtvsrd f2, r3 +; CHECK-P9-NEXT: xxswapd v3, vs2 +; CHECK-P9-NEXT: xxswapd vs2, vs4 +; CHECK-P9-NEXT: xscvspdpn f2, vs2 +; CHECK-P9-NEXT: xscvdpsxws f2, f2 +; CHECK-P9-NEXT: mfvsrwz r3, f2 +; CHECK-P9-NEXT: mtvsrd f2, r3 +; CHECK-P9-NEXT: xxswapd v4, vs2 +; CHECK-P9-NEXT: xscvspdpn f2, vs4 +; CHECK-P9-NEXT: xscvdpsxws f2, f2 +; CHECK-P9-NEXT: mfvsrwz r3, f2 +; CHECK-P9-NEXT: mtvsrd f2, r3 +; CHECK-P9-NEXT: vmrglb v3, v4, v3 +; CHECK-P9-NEXT: xxswapd v4, vs2 +; 
CHECK-P9-NEXT: xxsldwi vs2, vs4, vs4, 1 +; CHECK-P9-NEXT: xscvspdpn f2, vs2 +; CHECK-P9-NEXT: xscvdpsxws f2, f2 +; CHECK-P9-NEXT: mfvsrwz r3, f2 +; CHECK-P9-NEXT: mtvsrd f2, r3 +; CHECK-P9-NEXT: xxswapd v5, vs2 +; CHECK-P9-NEXT: xxsldwi vs2, vs1, vs1, 3 +; CHECK-P9-NEXT: xscvspdpn f2, vs2 +; CHECK-P9-NEXT: xscvdpsxws f2, f2 +; CHECK-P9-NEXT: vmrglb v4, v4, v5 +; CHECK-P9-NEXT: vmrglh v3, v4, v3 +; CHECK-P9-NEXT: vmrglw v2, v3, v2 +; CHECK-P9-NEXT: mfvsrwz r3, f2 +; CHECK-P9-NEXT: mtvsrd f2, r3 +; CHECK-P9-NEXT: xxswapd v3, vs2 +; CHECK-P9-NEXT: xxswapd vs2, vs1 +; CHECK-P9-NEXT: xscvspdpn f2, vs2 +; CHECK-P9-NEXT: xscvdpsxws f2, f2 +; CHECK-P9-NEXT: mfvsrwz r3, f2 +; CHECK-P9-NEXT: mtvsrd f2, r3 +; CHECK-P9-NEXT: xxswapd v4, vs2 +; CHECK-P9-NEXT: xscvspdpn f2, vs1 +; CHECK-P9-NEXT: xxsldwi vs1, vs1, vs1, 1 +; CHECK-P9-NEXT: xscvspdpn f1, vs1 +; CHECK-P9-NEXT: xscvdpsxws f2, f2 +; CHECK-P9-NEXT: xscvdpsxws f1, f1 +; CHECK-P9-NEXT: mfvsrwz r3, f2 +; CHECK-P9-NEXT: mtvsrd f2, r3 +; CHECK-P9-NEXT: mfvsrwz r3, f1 +; CHECK-P9-NEXT: mtvsrd f1, r3 +; CHECK-P9-NEXT: xxswapd v5, vs1 ; CHECK-P9-NEXT: xxsldwi vs1, vs0, vs0, 3 ; CHECK-P9-NEXT: xscvspdpn f1, vs1 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 -; CHECK-P9-NEXT: mfvsrwz r4, f1 -; CHECK-P9-NEXT: mtvsrd f1, r4 -; CHECK-P9-NEXT: xxswapd v2, vs1 +; CHECK-P9-NEXT: vmrglb v3, v4, v3 +; CHECK-P9-NEXT: xxswapd v4, vs2 +; CHECK-P9-NEXT: vmrglb v4, v4, v5 +; CHECK-P9-NEXT: vmrglh v3, v4, v3 +; CHECK-P9-NEXT: mfvsrwz r3, f1 +; CHECK-P9-NEXT: mtvsrd f1, r3 +; CHECK-P9-NEXT: xxswapd v4, vs1 ; CHECK-P9-NEXT: xxswapd vs1, vs0 ; CHECK-P9-NEXT: xscvspdpn f1, vs1 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 -; CHECK-P9-NEXT: lxv vs2, 48(r3) -; CHECK-P9-NEXT: lxv vs3, 32(r3) -; CHECK-P9-NEXT: lxv vs4, 16(r3) ; CHECK-P9-NEXT: mfvsrwz r3, f1 ; CHECK-P9-NEXT: mtvsrd f1, r3 -; CHECK-P9-NEXT: xxswapd v3, vs1 +; CHECK-P9-NEXT: xxswapd v5, vs1 ; CHECK-P9-NEXT: xscvspdpn f1, vs0 ; CHECK-P9-NEXT: xxsldwi vs0, vs0, vs0, 1 ; CHECK-P9-NEXT: xscvspdpn f0, vs0 @@ 
-519,87 +598,8 @@ ; CHECK-P9-NEXT: mtvsrd f1, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f0 ; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: xxswapd v4, vs0 -; CHECK-P9-NEXT: xxsldwi vs0, vs4, vs4, 3 -; CHECK-P9-NEXT: xscvspdpn f0, vs0 -; CHECK-P9-NEXT: xscvdpsxws f0, f0 -; CHECK-P9-NEXT: vmrglb v2, v3, v2 -; CHECK-P9-NEXT: xxswapd v3, vs1 -; CHECK-P9-NEXT: vmrglb v3, v3, v4 -; CHECK-P9-NEXT: vmrglh v2, v3, v2 -; CHECK-P9-NEXT: mfvsrwz r3, f0 -; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: xxswapd v3, vs0 -; CHECK-P9-NEXT: xxswapd vs0, vs4 -; CHECK-P9-NEXT: xscvspdpn f0, vs0 -; CHECK-P9-NEXT: xscvdpsxws f0, f0 -; CHECK-P9-NEXT: mfvsrwz r3, f0 -; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: xxswapd v4, vs0 -; CHECK-P9-NEXT: xscvspdpn f0, vs4 -; CHECK-P9-NEXT: xscvdpsxws f0, f0 -; CHECK-P9-NEXT: mfvsrwz r3, f0 -; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: vmrglb v3, v4, v3 -; CHECK-P9-NEXT: xxswapd v4, vs0 -; CHECK-P9-NEXT: xxsldwi vs0, vs4, vs4, 1 -; CHECK-P9-NEXT: xscvspdpn f0, vs0 -; CHECK-P9-NEXT: xscvdpsxws f0, f0 -; CHECK-P9-NEXT: mfvsrwz r3, f0 -; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: xxswapd v5, vs0 -; CHECK-P9-NEXT: xxsldwi vs0, vs3, vs3, 3 -; CHECK-P9-NEXT: xscvspdpn f0, vs0 -; CHECK-P9-NEXT: xscvdpsxws f0, f0 -; CHECK-P9-NEXT: vmrglb v4, v4, v5 -; CHECK-P9-NEXT: vmrglh v3, v4, v3 -; CHECK-P9-NEXT: vmrglw v2, v3, v2 -; CHECK-P9-NEXT: mfvsrwz r3, f0 -; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: xxswapd v3, vs0 -; CHECK-P9-NEXT: xxswapd vs0, vs3 -; CHECK-P9-NEXT: xscvspdpn f0, vs0 -; CHECK-P9-NEXT: xscvdpsxws f0, f0 -; CHECK-P9-NEXT: mfvsrwz r3, f0 -; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: xxswapd v4, vs0 -; CHECK-P9-NEXT: xscvspdpn f0, vs3 -; CHECK-P9-NEXT: xscvdpsxws f0, f0 -; CHECK-P9-NEXT: mfvsrwz r3, f0 -; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: vmrglb v3, v4, v3 -; CHECK-P9-NEXT: xxswapd v4, vs0 -; CHECK-P9-NEXT: xxsldwi vs0, vs3, vs3, 1 -; CHECK-P9-NEXT: xscvspdpn f0, vs0 -; CHECK-P9-NEXT: xscvdpsxws f0, f0 -; CHECK-P9-NEXT: 
mfvsrwz r3, f0 -; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: xxswapd v5, vs0 -; CHECK-P9-NEXT: xxsldwi vs0, vs2, vs2, 3 -; CHECK-P9-NEXT: xscvspdpn f0, vs0 -; CHECK-P9-NEXT: xscvdpsxws f0, f0 -; CHECK-P9-NEXT: mfvsrwz r3, f0 -; CHECK-P9-NEXT: vmrglb v4, v4, v5 -; CHECK-P9-NEXT: vmrglh v3, v4, v3 -; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: xxswapd v4, vs0 -; CHECK-P9-NEXT: xxswapd vs0, vs2 -; CHECK-P9-NEXT: xscvspdpn f0, vs0 -; CHECK-P9-NEXT: xscvdpsxws f0, f0 -; CHECK-P9-NEXT: mfvsrwz r3, f0 -; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: xxswapd v5, vs0 -; CHECK-P9-NEXT: xscvspdpn f0, vs2 -; CHECK-P9-NEXT: xscvdpsxws f0, f0 -; CHECK-P9-NEXT: mfvsrwz r3, f0 -; CHECK-P9-NEXT: mtvsrd f0, r3 ; CHECK-P9-NEXT: vmrglb v4, v5, v4 -; CHECK-P9-NEXT: xxswapd v5, vs0 -; CHECK-P9-NEXT: xxsldwi vs0, vs2, vs2, 1 -; CHECK-P9-NEXT: xscvspdpn f0, vs0 -; CHECK-P9-NEXT: xscvdpsxws f0, f0 -; CHECK-P9-NEXT: mfvsrwz r3, f0 -; CHECK-P9-NEXT: mtvsrd f0, r3 +; CHECK-P9-NEXT: xxswapd v5, vs1 ; CHECK-P9-NEXT: xxswapd v0, vs0 ; CHECK-P9-NEXT: vmrglb v5, v5, v0 ; CHECK-P9-NEXT: vmrglh v4, v5, v4 @@ -1212,22 +1212,101 @@ ; ; CHECK-P9-LABEL: test16elt_signed: ; CHECK-P9: # %bb.0: # %entry -; CHECK-P9-NEXT: lxv vs0, 0(r3) +; CHECK-P9-NEXT: lxv vs2, 0(r3) +; CHECK-P9-NEXT: xxsldwi vs3, vs2, vs2, 3 +; CHECK-P9-NEXT: xscvspdpn f3, vs3 +; CHECK-P9-NEXT: xscvdpsxws f3, f3 +; CHECK-P9-NEXT: lxv vs0, 48(r3) +; CHECK-P9-NEXT: lxv vs1, 32(r3) +; CHECK-P9-NEXT: lxv vs4, 16(r3) +; CHECK-P9-NEXT: mfvsrwz r3, f3 +; CHECK-P9-NEXT: mtvsrd f3, r3 +; CHECK-P9-NEXT: xxswapd v2, vs3 +; CHECK-P9-NEXT: xxswapd vs3, vs2 +; CHECK-P9-NEXT: xscvspdpn f3, vs3 +; CHECK-P9-NEXT: xscvdpsxws f3, f3 +; CHECK-P9-NEXT: mfvsrwz r3, f3 +; CHECK-P9-NEXT: mtvsrd f3, r3 +; CHECK-P9-NEXT: xxswapd v3, vs3 +; CHECK-P9-NEXT: xscvspdpn f3, vs2 +; CHECK-P9-NEXT: xxsldwi vs2, vs2, vs2, 1 +; CHECK-P9-NEXT: xscvspdpn f2, vs2 +; CHECK-P9-NEXT: xscvdpsxws f3, f3 +; CHECK-P9-NEXT: xscvdpsxws f2, f2 +; CHECK-P9-NEXT: mfvsrwz r3, f3 +; 
CHECK-P9-NEXT: mtvsrd f3, r3 +; CHECK-P9-NEXT: mfvsrwz r3, f2 +; CHECK-P9-NEXT: mtvsrd f2, r3 +; CHECK-P9-NEXT: xxswapd v4, vs2 +; CHECK-P9-NEXT: xxsldwi vs2, vs4, vs4, 3 +; CHECK-P9-NEXT: xscvspdpn f2, vs2 +; CHECK-P9-NEXT: xscvdpsxws f2, f2 +; CHECK-P9-NEXT: vmrglb v2, v3, v2 +; CHECK-P9-NEXT: xxswapd v3, vs3 +; CHECK-P9-NEXT: vmrglb v3, v3, v4 +; CHECK-P9-NEXT: vmrglh v2, v3, v2 +; CHECK-P9-NEXT: mfvsrwz r3, f2 +; CHECK-P9-NEXT: mtvsrd f2, r3 +; CHECK-P9-NEXT: xxswapd v3, vs2 +; CHECK-P9-NEXT: xxswapd vs2, vs4 +; CHECK-P9-NEXT: xscvspdpn f2, vs2 +; CHECK-P9-NEXT: xscvdpsxws f2, f2 +; CHECK-P9-NEXT: mfvsrwz r3, f2 +; CHECK-P9-NEXT: mtvsrd f2, r3 +; CHECK-P9-NEXT: xxswapd v4, vs2 +; CHECK-P9-NEXT: xscvspdpn f2, vs4 +; CHECK-P9-NEXT: xscvdpsxws f2, f2 +; CHECK-P9-NEXT: mfvsrwz r3, f2 +; CHECK-P9-NEXT: mtvsrd f2, r3 +; CHECK-P9-NEXT: vmrglb v3, v4, v3 +; CHECK-P9-NEXT: xxswapd v4, vs2 +; CHECK-P9-NEXT: xxsldwi vs2, vs4, vs4, 1 +; CHECK-P9-NEXT: xscvspdpn f2, vs2 +; CHECK-P9-NEXT: xscvdpsxws f2, f2 +; CHECK-P9-NEXT: mfvsrwz r3, f2 +; CHECK-P9-NEXT: mtvsrd f2, r3 +; CHECK-P9-NEXT: xxswapd v5, vs2 +; CHECK-P9-NEXT: xxsldwi vs2, vs1, vs1, 3 +; CHECK-P9-NEXT: xscvspdpn f2, vs2 +; CHECK-P9-NEXT: xscvdpsxws f2, f2 +; CHECK-P9-NEXT: vmrglb v4, v4, v5 +; CHECK-P9-NEXT: vmrglh v3, v4, v3 +; CHECK-P9-NEXT: vmrglw v2, v3, v2 +; CHECK-P9-NEXT: mfvsrwz r3, f2 +; CHECK-P9-NEXT: mtvsrd f2, r3 +; CHECK-P9-NEXT: xxswapd v3, vs2 +; CHECK-P9-NEXT: xxswapd vs2, vs1 +; CHECK-P9-NEXT: xscvspdpn f2, vs2 +; CHECK-P9-NEXT: xscvdpsxws f2, f2 +; CHECK-P9-NEXT: mfvsrwz r3, f2 +; CHECK-P9-NEXT: mtvsrd f2, r3 +; CHECK-P9-NEXT: xxswapd v4, vs2 +; CHECK-P9-NEXT: xscvspdpn f2, vs1 +; CHECK-P9-NEXT: xxsldwi vs1, vs1, vs1, 1 +; CHECK-P9-NEXT: xscvspdpn f1, vs1 +; CHECK-P9-NEXT: xscvdpsxws f2, f2 +; CHECK-P9-NEXT: xscvdpsxws f1, f1 +; CHECK-P9-NEXT: mfvsrwz r3, f2 +; CHECK-P9-NEXT: mtvsrd f2, r3 +; CHECK-P9-NEXT: mfvsrwz r3, f1 +; CHECK-P9-NEXT: mtvsrd f1, r3 +; CHECK-P9-NEXT: xxswapd v5, vs1 ; 
CHECK-P9-NEXT: xxsldwi vs1, vs0, vs0, 3 ; CHECK-P9-NEXT: xscvspdpn f1, vs1 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 -; CHECK-P9-NEXT: mfvsrwz r4, f1 -; CHECK-P9-NEXT: mtvsrd f1, r4 -; CHECK-P9-NEXT: xxswapd v2, vs1 +; CHECK-P9-NEXT: vmrglb v3, v4, v3 +; CHECK-P9-NEXT: xxswapd v4, vs2 +; CHECK-P9-NEXT: vmrglb v4, v4, v5 +; CHECK-P9-NEXT: vmrglh v3, v4, v3 +; CHECK-P9-NEXT: mfvsrwz r3, f1 +; CHECK-P9-NEXT: mtvsrd f1, r3 +; CHECK-P9-NEXT: xxswapd v4, vs1 ; CHECK-P9-NEXT: xxswapd vs1, vs0 ; CHECK-P9-NEXT: xscvspdpn f1, vs1 ; CHECK-P9-NEXT: xscvdpsxws f1, f1 -; CHECK-P9-NEXT: lxv vs2, 48(r3) -; CHECK-P9-NEXT: lxv vs3, 32(r3) -; CHECK-P9-NEXT: lxv vs4, 16(r3) ; CHECK-P9-NEXT: mfvsrwz r3, f1 ; CHECK-P9-NEXT: mtvsrd f1, r3 -; CHECK-P9-NEXT: xxswapd v3, vs1 +; CHECK-P9-NEXT: xxswapd v5, vs1 ; CHECK-P9-NEXT: xscvspdpn f1, vs0 ; CHECK-P9-NEXT: xxsldwi vs0, vs0, vs0, 1 ; CHECK-P9-NEXT: xscvspdpn f0, vs0 @@ -1237,87 +1316,8 @@ ; CHECK-P9-NEXT: mtvsrd f1, r3 ; CHECK-P9-NEXT: mfvsrwz r3, f0 ; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: xxswapd v4, vs0 -; CHECK-P9-NEXT: xxsldwi vs0, vs4, vs4, 3 -; CHECK-P9-NEXT: xscvspdpn f0, vs0 -; CHECK-P9-NEXT: xscvdpsxws f0, f0 -; CHECK-P9-NEXT: vmrglb v2, v3, v2 -; CHECK-P9-NEXT: xxswapd v3, vs1 -; CHECK-P9-NEXT: vmrglb v3, v3, v4 -; CHECK-P9-NEXT: vmrglh v2, v3, v2 -; CHECK-P9-NEXT: mfvsrwz r3, f0 -; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: xxswapd v3, vs0 -; CHECK-P9-NEXT: xxswapd vs0, vs4 -; CHECK-P9-NEXT: xscvspdpn f0, vs0 -; CHECK-P9-NEXT: xscvdpsxws f0, f0 -; CHECK-P9-NEXT: mfvsrwz r3, f0 -; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: xxswapd v4, vs0 -; CHECK-P9-NEXT: xscvspdpn f0, vs4 -; CHECK-P9-NEXT: xscvdpsxws f0, f0 -; CHECK-P9-NEXT: mfvsrwz r3, f0 -; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: vmrglb v3, v4, v3 -; CHECK-P9-NEXT: xxswapd v4, vs0 -; CHECK-P9-NEXT: xxsldwi vs0, vs4, vs4, 1 -; CHECK-P9-NEXT: xscvspdpn f0, vs0 -; CHECK-P9-NEXT: xscvdpsxws f0, f0 -; CHECK-P9-NEXT: mfvsrwz r3, f0 -; CHECK-P9-NEXT: mtvsrd f0, r3 
-; CHECK-P9-NEXT: xxswapd v5, vs0 -; CHECK-P9-NEXT: xxsldwi vs0, vs3, vs3, 3 -; CHECK-P9-NEXT: xscvspdpn f0, vs0 -; CHECK-P9-NEXT: xscvdpsxws f0, f0 -; CHECK-P9-NEXT: vmrglb v4, v4, v5 -; CHECK-P9-NEXT: vmrglh v3, v4, v3 -; CHECK-P9-NEXT: vmrglw v2, v3, v2 -; CHECK-P9-NEXT: mfvsrwz r3, f0 -; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: xxswapd v3, vs0 -; CHECK-P9-NEXT: xxswapd vs0, vs3 -; CHECK-P9-NEXT: xscvspdpn f0, vs0 -; CHECK-P9-NEXT: xscvdpsxws f0, f0 -; CHECK-P9-NEXT: mfvsrwz r3, f0 -; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: xxswapd v4, vs0 -; CHECK-P9-NEXT: xscvspdpn f0, vs3 -; CHECK-P9-NEXT: xscvdpsxws f0, f0 -; CHECK-P9-NEXT: mfvsrwz r3, f0 -; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: vmrglb v3, v4, v3 -; CHECK-P9-NEXT: xxswapd v4, vs0 -; CHECK-P9-NEXT: xxsldwi vs0, vs3, vs3, 1 -; CHECK-P9-NEXT: xscvspdpn f0, vs0 -; CHECK-P9-NEXT: xscvdpsxws f0, f0 -; CHECK-P9-NEXT: mfvsrwz r3, f0 -; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: xxswapd v5, vs0 -; CHECK-P9-NEXT: xxsldwi vs0, vs2, vs2, 3 -; CHECK-P9-NEXT: xscvspdpn f0, vs0 -; CHECK-P9-NEXT: xscvdpsxws f0, f0 -; CHECK-P9-NEXT: mfvsrwz r3, f0 -; CHECK-P9-NEXT: vmrglb v4, v4, v5 -; CHECK-P9-NEXT: vmrglh v3, v4, v3 -; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: xxswapd v4, vs0 -; CHECK-P9-NEXT: xxswapd vs0, vs2 -; CHECK-P9-NEXT: xscvspdpn f0, vs0 -; CHECK-P9-NEXT: xscvdpsxws f0, f0 -; CHECK-P9-NEXT: mfvsrwz r3, f0 -; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: xxswapd v5, vs0 -; CHECK-P9-NEXT: xscvspdpn f0, vs2 -; CHECK-P9-NEXT: xscvdpsxws f0, f0 -; CHECK-P9-NEXT: mfvsrwz r3, f0 -; CHECK-P9-NEXT: mtvsrd f0, r3 ; CHECK-P9-NEXT: vmrglb v4, v5, v4 -; CHECK-P9-NEXT: xxswapd v5, vs0 -; CHECK-P9-NEXT: xxsldwi vs0, vs2, vs2, 1 -; CHECK-P9-NEXT: xscvspdpn f0, vs0 -; CHECK-P9-NEXT: xscvdpsxws f0, f0 -; CHECK-P9-NEXT: mfvsrwz r3, f0 -; CHECK-P9-NEXT: mtvsrd f0, r3 +; CHECK-P9-NEXT: xxswapd v5, vs1 ; CHECK-P9-NEXT: xxswapd v0, vs0 ; CHECK-P9-NEXT: vmrglb v5, v5, v0 ; CHECK-P9-NEXT: vmrglh v4, v5, v4