diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -6347,7 +6347,6 @@ (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub), ssub))>; - def : Pat<(i64 (intOp (v4i32 V128:$Rn))), (i64 (EXTRACT_SUBREG (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp --- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp +++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp @@ -35,6 +35,16 @@ // 5. %reg = INSERT_SUBREG %reg(tied-def 0), %subreg, subidx // ==> %reg:subidx = SUBREG_TO_REG 0, %subreg, subidx // +// 6. %intermediate:gpr32 = COPY %src:fpr128 +// %dst:fpr128 = INSvi32gpr %dst_vec:fpr128, dst_index, %intermediate:gpr32 +// ==> %dst:fpr128 = INSvi32lane %dst_vec:fpr128, dst_index, %src:fpr128, 0 +// +// In cases where a source FPR is copied to a GPR in order to be copied +// to a destination FPR, we can directly copy the value between the FPRs, eliminating the use of +// the integer unit. +// When we match a pattern of INSvi[X]gpr that is preceded by a chain of COPY instructions from an FPR source, +// we use INSvi[X]lane to replace the COPY and INSvi[X]gpr instructions.
+// //===----------------------------------------------------------------------===// #include "AArch64ExpandImm.h" @@ -99,6 +109,7 @@ bool visitAND(unsigned Opc, MachineInstr &MI); bool visitORR(MachineInstr &MI); bool visitINSERT(MachineInstr &MI); + bool visitINSviGPR(MachineInstr &MI, unsigned Opc); bool runOnMachineFunction(MachineFunction &MF) override; StringRef getPassName() const override { @@ -523,6 +534,64 @@ return true; } +bool AArch64MIPeepholeOpt::visitINSviGPR(MachineInstr &MI, unsigned Opc) { + // Check if this INSvi[X]gpr comes from COPY of a source FPR128 + // + // From + // %intermediate1:gpr64 = COPY %src:fpr128 + // %intermediate2:gpr32 = COPY %intermediate1:gpr64 + // %dst:fpr128 = INSvi[X]gpr %dst_vec:fpr128, dst_index, %intermediate2:gpr32 + // To + // %dst:fpr128 = INSvi[X]lane %dst_vec:fpr128, dst_index, %src:fpr128, src_index + // where src_index = 0, X = [8|16|32|64] + + if (!MI.isRegTiedToDefOperand(1)) + return false; + + if (!MI.getOperand(2).isImm()) + return false; + + Register SrcReg; + SmallVector ReplaceMIList; + MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg()); + + // For a chain of COPY instructions, find the initial source register + // and check if it's an FPR128 + while (true) { + if (!SrcMI || SrcMI->getOpcode() != TargetOpcode::COPY) + return false; + + if(!SrcMI->getOperand(1).getReg().isVirtual()) { + return false; + } + + ReplaceMIList.push_back(SrcMI); + if(MRI->getRegClass(SrcMI->getOperand(1).getReg()) == &AArch64::FPR128RegClass) { + SrcReg = SrcMI->getOperand(1).getReg(); + break; + } + SrcMI = MRI->getUniqueVRegDef(SrcMI->getOperand(1).getReg()); + } + + Register DstReg = MI.getOperand(0).getReg(); + MachineInstr *INSvilaneMI = + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), + TII->get(Opc), DstReg) + .add(MI.getOperand(1)) + .add(MI.getOperand(2)) + .addUse(SrcReg, RegState::Kill) + .addImm(0); + + for (auto ReplaceMI : ReplaceMIList) { + LLVM_DEBUG(dbgs() << "Removed: " << ReplaceMI << 
"\n"); + ReplaceMI->eraseFromParent(); + } + + LLVM_DEBUG(dbgs() << MI << " replace by:\n: " << *INSvilaneMI << "\n"); + MI.eraseFromParent(); + return true; +} + bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; @@ -586,6 +655,18 @@ {AArch64::ADDXri, AArch64::ADDSXri}, MI); break; + case AArch64::INSvi64gpr: + Changed = visitINSviGPR(MI, AArch64::INSvi64lane); + break; + case AArch64::INSvi32gpr: + Changed = visitINSviGPR(MI, AArch64::INSvi32lane); + break; + case AArch64::INSvi16gpr: + Changed = visitINSviGPR(MI, AArch64::INSvi16lane); + break; + case AArch64::INSvi8gpr: + Changed = visitINSviGPR(MI, AArch64::INSvi8lane); + break; } } } diff --git a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll --- a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll @@ -13,8 +13,7 @@ ; CHECK-NEXT: movi.2d v0, #0000000000000000 ; CHECK-NEXT: movi d1, #0000000000000000 ; CHECK-NEXT: uaddlv.8h s0, v0 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: mov.s v1[0], w8 +; CHECK-NEXT: mov.s v1[0], v0[0] ; CHECK-NEXT: ucvtf.2s v0, v1 ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret @@ -32,8 +31,7 @@ ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: movi.2d v0, #0000000000000000 ; CHECK-NEXT: uaddlv.8h s1, v0 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov.s v0[0], w8 +; CHECK-NEXT: mov.s v0[0], v1[0] ; CHECK-NEXT: ucvtf.4s v0, v0 ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret @@ -50,12 +48,11 @@ ; CHECK-LABEL: _insert_vec_v16i32_uaddlv_from_v8i16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: movi.2d v0, #0000000000000000 -; CHECK-NEXT: movi.2d v2, #0000000000000000 -; CHECK-NEXT: uaddlv.8h s1, v0 +; CHECK-NEXT: movi.2d v1, #0000000000000000 +; CHECK-NEXT: uaddlv.8h s2, v0 ; CHECK-NEXT: stp q0, q0, [x0, #32] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov.s v2[0], w8 -; CHECK-NEXT: 
ucvtf.4s v1, v2 +; CHECK-NEXT: mov.s v1[0], v2[0] +; CHECK-NEXT: ucvtf.4s v1, v1 ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret @@ -71,16 +68,15 @@ ; CHECK-LABEL: _insert_vec_v23i32_uaddlv_from_v8i16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: movi.2d v0, #0000000000000000 -; CHECK-NEXT: movi.2d v2, #0000000000000000 -; CHECK-NEXT: uaddlv.8h s1, v0 +; CHECK-NEXT: add x8, x0, #88 +; CHECK-NEXT: movi.2d v1, #0000000000000000 +; CHECK-NEXT: uaddlv.8h s2, v0 ; CHECK-NEXT: stp q0, q0, [x0, #16] ; CHECK-NEXT: stp q0, q0, [x0, #48] -; CHECK-NEXT: str d0, [x0, #80] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov.s v2[0], w8 -; CHECK-NEXT: add x8, x0, #88 ; CHECK-NEXT: st1.s { v0 }[2], [x8] -; CHECK-NEXT: ucvtf.4s v1, v2 +; CHECK-NEXT: mov.s v1[0], v2[0] +; CHECK-NEXT: str d0, [x0, #80] +; CHECK-NEXT: ucvtf.4s v1, v1 ; CHECK-NEXT: str q1, [x0] ; CHECK-NEXT: ret @@ -98,8 +94,7 @@ ; CHECK-NEXT: movi.2d v0, #0000000000000000 ; CHECK-NEXT: movi d1, #0000000000000000 ; CHECK-NEXT: uaddlv.16b h0, v0 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: mov.s v1[0], w8 +; CHECK-NEXT: mov.s v1[0], v0[0] ; CHECK-NEXT: ucvtf.2s v0, v1 ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret @@ -118,8 +113,7 @@ ; CHECK-NEXT: movi.2d v0, #0000000000000000 ; CHECK-NEXT: movi d1, #0000000000000000 ; CHECK-NEXT: uaddlv.8b h0, v0 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: mov.s v1[0], w8 +; CHECK-NEXT: mov.s v1[0], v0[0] ; CHECK-NEXT: ucvtf.2s v0, v1 ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret @@ -138,8 +132,7 @@ ; CHECK-NEXT: movi.2d v0, #0000000000000000 ; CHECK-NEXT: movi d1, #0000000000000000 ; CHECK-NEXT: uaddlv.4h s0, v0 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: mov.s v1[0], w8 +; CHECK-NEXT: mov.s v1[0], v0[0] ; CHECK-NEXT: ucvtf.2s v0, v1 ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret @@ -157,12 +150,11 @@ ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: movi.2d v1, #0000000000000000 ; CHECK-NEXT: movi d0, #0000000000000000 -; CHECK-NEXT: movi.2d v3, #0000000000000000 -; CHECK-NEXT: uaddlv.4s d2, v1 +; CHECK-NEXT: 
movi.2d v2, #0000000000000000 +; CHECK-NEXT: uaddlv.4s d3, v1 ; CHECK-NEXT: str d1, [x0, #16] -; CHECK-NEXT: fmov x8, d2 -; CHECK-NEXT: mov.d v3[0], x8 -; CHECK-NEXT: ucvtf.2d v2, v3 +; CHECK-NEXT: mov.d v2[0], v3[0] +; CHECK-NEXT: ucvtf.2d v2, v2 ; CHECK-NEXT: fcvtn v2.2s, v2.2d ; CHECK-NEXT: mov.d v2[1], v0[0] ; CHECK-NEXT: str q2, [x0] @@ -181,8 +173,7 @@ ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: movi.2d v0, #0000000000000000 ; CHECK-NEXT: uaddlv.4s d1, v0 -; CHECK-NEXT: fmov x8, d1 -; CHECK-NEXT: mov.d v0[0], x8 +; CHECK-NEXT: mov.d v0[0], v1[0] ; CHECK-NEXT: ucvtf.2d v0, v0 ; CHECK-NEXT: fcvtn v0.2s, v0.2d ; CHECK-NEXT: str d0, [x0] @@ -203,8 +194,7 @@ ; CHECK-NEXT: str wzr, [x0, #16] ; CHECK-NEXT: movi d0, #0000000000000000 ; CHECK-NEXT: uaddlv.4s d2, v1 -; CHECK-NEXT: fmov x8, d2 -; CHECK-NEXT: mov.d v1[0], x8 +; CHECK-NEXT: mov.d v1[0], v2[0] ; CHECK-NEXT: ucvtf.2d v1, v1 ; CHECK-NEXT: fcvtn v1.2s, v1.2d ; CHECK-NEXT: mov.d v1[1], v0[0] @@ -226,8 +216,7 @@ ; CHECK-NEXT: stp xzr, xzr, [x0, #16] ; CHECK-NEXT: movi d0, #0000000000000000 ; CHECK-NEXT: uaddlv.8h s1, v1 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov.h v0[0], w8 +; CHECK-NEXT: mov.h v0[0], v1[0] ; CHECK-NEXT: ushll.4s v0, v0, #0 ; CHECK-NEXT: ucvtf.4s v0, v0 ; CHECK-NEXT: str q0, [x0] @@ -246,11 +235,10 @@ ; CHECK-LABEL: _insert_vec_v3i16_uaddlv_from_v8i16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: movi.2d v1, #0000000000000000 +; CHECK-NEXT: add x8, x0, #8 ; CHECK-NEXT: movi d0, #0000000000000000 ; CHECK-NEXT: uaddlv.8h s1, v1 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov.h v0[0], w8 -; CHECK-NEXT: add x8, x0, #8 +; CHECK-NEXT: mov.h v0[0], v1[0] ; CHECK-NEXT: ushll.4s v0, v0, #0 ; CHECK-NEXT: ucvtf.4s v0, v0 ; CHECK-NEXT: st1.s { v0 }[2], [x8] @@ -271,12 +259,11 @@ ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: movi.2d v1, #0000000000000000 ; CHECK-NEXT: movi d0, #0000000000000000 -; CHECK-NEXT: movi.2d v3, #0000000000000000 -; CHECK-NEXT: uaddlv.4h s2, v1 +; CHECK-NEXT: movi.2d v2, #0000000000000000 +; 
CHECK-NEXT: uaddlv.4h s3, v1 ; CHECK-NEXT: stp q1, q1, [x0, #32] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov.s v3[0], w8 -; CHECK-NEXT: ucvtf.2d v2, v3 +; CHECK-NEXT: mov.s v2[0], v3[0] +; CHECK-NEXT: ucvtf.2d v2, v2 ; CHECK-NEXT: fcvtn v2.2s, v2.2d ; CHECK-NEXT: mov.d v2[1], v0[0] ; CHECK-NEXT: stp q2, q1, [x0] @@ -298,8 +285,7 @@ ; CHECK-NEXT: movi d0, #0000000000000000 ; CHECK-NEXT: uaddlv.8b h2, v1 ; CHECK-NEXT: stp q1, q1, [x0, #32] -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov.b v0[0], w8 +; CHECK-NEXT: mov.b v0[0], v2[0] ; CHECK-NEXT: zip1.8b v0, v0, v0 ; CHECK-NEXT: bic.4h v0, #255, lsl #8 ; CHECK-NEXT: ushll.4s v0, v0, #0 @@ -323,8 +309,7 @@ ; CHECK-NEXT: stp xzr, xzr, [x0, #16] ; CHECK-NEXT: movi d0, #0000000000000000 ; CHECK-NEXT: uaddlv.8b h1, v1 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov.h v0[0], w8 +; CHECK-NEXT: mov.h v0[0], v1[0] ; CHECK-NEXT: bic.4h v0, #255, lsl #8 ; CHECK-NEXT: ushll.4s v0, v0, #0 ; CHECK-NEXT: ucvtf.4s v0, v0 @@ -348,8 +333,7 @@ ; CHECK-NEXT: movi d0, #0000000000000000 ; CHECK-NEXT: stp xzr, xzr, [x0, #32] ; CHECK-NEXT: uaddlv.4h s1, v1 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov.h v0[0], w8 +; CHECK-NEXT: mov.h v0[0], v1[0] ; CHECK-NEXT: ushll.4s v0, v0, #0 ; CHECK-NEXT: ucvtf.4s v0, v0 ; CHECK-NEXT: str q0, [x0] @@ -370,8 +354,7 @@ ; CHECK-NEXT: movi.2d v0, #0000000000000000 ; CHECK-NEXT: stp xzr, xzr, [x0, #16] ; CHECK-NEXT: uaddlv.4s d1, v0 -; CHECK-NEXT: fmov x8, d1 -; CHECK-NEXT: mov.s v0[0], w8 +; CHECK-NEXT: mov.s v0[0], v1[0] ; CHECK-NEXT: ucvtf.4s v0, v0 ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret @@ -389,12 +372,11 @@ ; CHECK-LABEL: _insert_vec_v16i32_uaddlv_from_v4i32: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: movi.2d v0, #0000000000000000 -; CHECK-NEXT: movi.2d v2, #0000000000000000 -; CHECK-NEXT: uaddlv.4s d1, v0 +; CHECK-NEXT: movi.2d v1, #0000000000000000 +; CHECK-NEXT: uaddlv.4s d2, v0 ; CHECK-NEXT: stp q0, q0, [x0, #32] -; CHECK-NEXT: fmov x8, d1 -; CHECK-NEXT: mov.s v2[0], w8 -; CHECK-NEXT: ucvtf.4s v1, 
v2 +; CHECK-NEXT: mov.s v1[0], v2[0] +; CHECK-NEXT: ucvtf.4s v1, v1 ; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret @@ -413,8 +395,7 @@ ; CHECK-NEXT: movi.2d v0, #0000000000000000 ; CHECK-NEXT: movi d1, #0000000000000000 ; CHECK-NEXT: uaddlv.4s d0, v0 -; CHECK-NEXT: fmov x8, d0 -; CHECK-NEXT: mov.h v1[0], w8 +; CHECK-NEXT: mov.h v1[0], v0[0] ; CHECK-NEXT: ushll.4s v0, v1, #0 ; CHECK-NEXT: ucvtf.4s v0, v0 ; CHECK-NEXT: str q0, [x0] @@ -436,8 +417,7 @@ ; CHECK-NEXT: movi d0, #0000000000000000 ; CHECK-NEXT: uaddlv.4s d2, v1 ; CHECK-NEXT: stp q1, q1, [x0, #32] -; CHECK-NEXT: fmov x8, d2 -; CHECK-NEXT: mov.h v0[0], w8 +; CHECK-NEXT: mov.h v0[0], v2[0] ; CHECK-NEXT: ushll.4s v0, v0, #0 ; CHECK-NEXT: ucvtf.4s v0, v0 ; CHECK-NEXT: stp q0, q1, [x0] @@ -459,8 +439,7 @@ ; CHECK-NEXT: stp xzr, xzr, [x0, #16] ; CHECK-NEXT: movi d0, #0000000000000000 ; CHECK-NEXT: uaddlv.4s d1, v1 -; CHECK-NEXT: fmov x8, d1 -; CHECK-NEXT: mov.h v0[0], w8 +; CHECK-NEXT: mov.h v0[0], v1[0] ; CHECK-NEXT: bic.4h v0, #255, lsl #8 ; CHECK-NEXT: ushll.4s v0, v0, #0 ; CHECK-NEXT: ucvtf.4s v0, v0 @@ -483,8 +462,7 @@ ; CHECK-NEXT: movi d0, #0000000000000000 ; CHECK-NEXT: uaddlv.4s d2, v1 ; CHECK-NEXT: stp q1, q1, [x0, #32] -; CHECK-NEXT: fmov x8, d2 -; CHECK-NEXT: mov.b v0[0], w8 +; CHECK-NEXT: mov.b v0[0], v2[0] ; CHECK-NEXT: zip1.8b v0, v0, v0 ; CHECK-NEXT: bic.4h v0, #255, lsl #8 ; CHECK-NEXT: ushll.4s v0, v0, #0 @@ -506,8 +484,7 @@ ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: movi.2d v0, #0000000000000000 ; CHECK-NEXT: uaddlv.8h s1, v0 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov.s v0[2], w8 +; CHECK-NEXT: mov.s v0[2], v1[0] ; CHECK-NEXT: ucvtf.4s v0, v0 ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll --- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll +++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll @@ -3373,143 +3373,136 @@ define <16 x i8> @test_signed_v16f64_v16i8(<16 x double> %f) { ; 
CHECK-LABEL: test_signed_v16f64_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: mov d16, v0.d[1] -; CHECK-NEXT: mov w8, #127 -; CHECK-NEXT: fcvtzs w11, d0 -; CHECK-NEXT: mov w9, #-128 -; CHECK-NEXT: fcvtzs w13, d1 -; CHECK-NEXT: mov d0, v2.d[1] -; CHECK-NEXT: fcvtzs w14, d2 -; CHECK-NEXT: fcvtzs w10, d16 -; CHECK-NEXT: mov d16, v1.d[1] -; CHECK-NEXT: mov d1, v3.d[1] -; CHECK-NEXT: fcvtzs w15, d0 -; CHECK-NEXT: cmp w10, #127 -; CHECK-NEXT: csel w10, w10, w8, lt -; CHECK-NEXT: fcvtzs w12, d16 -; CHECK-NEXT: cmn w10, #128 -; CHECK-NEXT: csel w10, w10, w9, gt -; CHECK-NEXT: cmp w11, #127 -; CHECK-NEXT: csel w11, w11, w8, lt -; CHECK-NEXT: cmn w11, #128 -; CHECK-NEXT: csel w11, w11, w9, gt -; CHECK-NEXT: cmp w12, #127 -; CHECK-NEXT: csel w12, w12, w8, lt -; CHECK-NEXT: cmn w12, #128 -; CHECK-NEXT: csel w12, w12, w9, gt -; CHECK-NEXT: cmp w13, #127 -; CHECK-NEXT: csel w13, w13, w8, lt -; CHECK-NEXT: fmov s0, w11 -; CHECK-NEXT: cmn w13, #128 -; CHECK-NEXT: csel w11, w13, w9, gt -; CHECK-NEXT: cmp w15, #127 -; CHECK-NEXT: mov v0.s[1], w10 -; CHECK-NEXT: csel w10, w15, w8, lt -; CHECK-NEXT: cmn w10, #128 -; CHECK-NEXT: fcvtzs w13, d3 -; CHECK-NEXT: fmov s2, w11 -; CHECK-NEXT: csel w10, w10, w9, gt -; CHECK-NEXT: cmp w14, #127 -; CHECK-NEXT: fcvtzs w11, d1 -; CHECK-NEXT: mov w15, v0.s[1] -; CHECK-NEXT: csel w14, w14, w8, lt -; CHECK-NEXT: mov v2.s[1], w12 -; CHECK-NEXT: cmn w14, #128 -; CHECK-NEXT: csel w12, w14, w9, gt -; CHECK-NEXT: cmp w11, #127 -; CHECK-NEXT: csel w11, w11, w8, lt -; CHECK-NEXT: mov d1, v4.d[1] -; CHECK-NEXT: mov v0.b[1], w15 -; CHECK-NEXT: cmn w11, #128 -; CHECK-NEXT: fmov w14, s2 -; CHECK-NEXT: csel w11, w11, w9, gt -; CHECK-NEXT: fmov s3, w12 -; CHECK-NEXT: cmp w13, #127 -; CHECK-NEXT: mov w12, v2.s[1] -; CHECK-NEXT: csel w13, w13, w8, lt -; CHECK-NEXT: mov v0.b[2], w14 -; CHECK-NEXT: cmn w13, #128 -; CHECK-NEXT: mov v3.s[1], w10 -; CHECK-NEXT: csel w13, w13, w9, gt -; CHECK-NEXT: fcvtzs w15, d1 -; CHECK-NEXT: fcvtzs w14, d4 -; CHECK-NEXT: mov d1, v5.d[1] -; 
CHECK-NEXT: mov v0.b[3], w12 -; CHECK-NEXT: fmov s4, w13 -; CHECK-NEXT: cmp w15, #127 -; CHECK-NEXT: fmov w13, s3 -; CHECK-NEXT: csel w10, w15, w8, lt -; CHECK-NEXT: mov w12, v3.s[1] -; CHECK-NEXT: cmn w10, #128 -; CHECK-NEXT: fcvtzs w15, d1 -; CHECK-NEXT: csel w10, w10, w9, gt -; CHECK-NEXT: cmp w14, #127 -; CHECK-NEXT: mov v0.b[4], w13 -; CHECK-NEXT: csel w14, w14, w8, lt -; CHECK-NEXT: mov v4.s[1], w11 -; CHECK-NEXT: cmn w14, #128 -; CHECK-NEXT: csel w14, w14, w9, gt -; CHECK-NEXT: fcvtzs w13, d5 -; CHECK-NEXT: cmp w15, #127 -; CHECK-NEXT: mov d2, v6.d[1] -; CHECK-NEXT: mov v0.b[5], w12 -; CHECK-NEXT: csel w11, w15, w8, lt -; CHECK-NEXT: fmov w12, s4 -; CHECK-NEXT: cmn w11, #128 -; CHECK-NEXT: fmov s1, w14 -; CHECK-NEXT: csel w11, w11, w9, gt -; CHECK-NEXT: cmp w13, #127 -; CHECK-NEXT: mov w14, v4.s[1] -; CHECK-NEXT: mov v0.b[6], w12 -; CHECK-NEXT: csel w13, w13, w8, lt -; CHECK-NEXT: mov v1.s[1], w10 -; CHECK-NEXT: cmn w13, #128 -; CHECK-NEXT: fcvtzs w15, d2 -; CHECK-NEXT: csel w13, w13, w9, gt -; CHECK-NEXT: fcvtzs w10, d6 -; CHECK-NEXT: mov v0.b[7], w14 -; CHECK-NEXT: cmp w15, #127 -; CHECK-NEXT: fmov w14, s1 -; CHECK-NEXT: csel w12, w15, w8, lt -; CHECK-NEXT: fmov s2, w13 -; CHECK-NEXT: mov w13, v1.s[1] -; CHECK-NEXT: mov d1, v7.d[1] -; CHECK-NEXT: cmn w12, #128 -; CHECK-NEXT: fcvtzs w15, d7 -; CHECK-NEXT: csel w12, w12, w9, gt -; CHECK-NEXT: cmp w10, #127 -; CHECK-NEXT: mov v0.b[8], w14 -; CHECK-NEXT: csel w10, w10, w8, lt -; CHECK-NEXT: mov v2.s[1], w11 -; CHECK-NEXT: cmn w10, #128 -; CHECK-NEXT: fcvtzs w11, d1 -; CHECK-NEXT: csel w10, w10, w9, gt -; CHECK-NEXT: mov v0.b[9], w13 -; CHECK-NEXT: fmov w14, s2 -; CHECK-NEXT: cmp w11, #127 -; CHECK-NEXT: fmov s1, w10 -; CHECK-NEXT: csel w10, w11, w8, lt -; CHECK-NEXT: cmn w10, #128 -; CHECK-NEXT: mov w13, v2.s[1] -; CHECK-NEXT: mov v0.b[10], w14 -; CHECK-NEXT: csel w10, w10, w9, gt -; CHECK-NEXT: cmp w15, #127 -; CHECK-NEXT: mov v1.s[1], w12 -; CHECK-NEXT: csel w8, w15, w8, lt -; CHECK-NEXT: cmn w8, #128 -; 
CHECK-NEXT: csel w8, w8, w9, gt -; CHECK-NEXT: mov v0.b[11], w13 -; CHECK-NEXT: fmov w9, s1 -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: mov w8, v1.s[1] -; CHECK-NEXT: mov v0.b[12], w9 -; CHECK-NEXT: mov v2.s[1], w10 -; CHECK-NEXT: mov v0.b[13], w8 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov w9, v2.s[1] -; CHECK-NEXT: mov v0.b[14], w8 -; CHECK-NEXT: mov v0.b[15], w9 -; CHECK-NEXT: ret +; CHECK-NEXT: mov d16, v0.d[1] +; CHECK-NEXT: mov w8, #127 +; CHECK-NEXT: fcvtzs w11, d0 +; CHECK-NEXT: mov w9, #-128 +; CHECK-NEXT: mov d0, v2.d[1] +; CHECK-NEXT: fcvtzs w13, d1 +; CHECK-NEXT: fcvtzs w10, d16 +; CHECK-NEXT: mov d16, v1.d[1] +; CHECK-NEXT: mov d1, v3.d[1] +; CHECK-NEXT: fcvtzs w14, d0 +; CHECK-NEXT: cmp w10, #127 +; CHECK-NEXT: csel w10, w10, w8, lt +; CHECK-NEXT: fcvtzs w12, d16 +; CHECK-NEXT: cmn w10, #128 +; CHECK-NEXT: csel w10, w10, w9, gt +; CHECK-NEXT: cmp w11, #127 +; CHECK-NEXT: csel w11, w11, w8, lt +; CHECK-NEXT: cmn w11, #128 +; CHECK-NEXT: csel w11, w11, w9, gt +; CHECK-NEXT: cmp w12, #127 +; CHECK-NEXT: csel w12, w12, w8, lt +; CHECK-NEXT: cmn w12, #128 +; CHECK-NEXT: csel w12, w12, w9, gt +; CHECK-NEXT: cmp w13, #127 +; CHECK-NEXT: fmov s0, w11 +; CHECK-NEXT: csel w11, w13, w8, lt +; CHECK-NEXT: cmn w11, #128 +; CHECK-NEXT: fcvtzs w13, d2 +; CHECK-NEXT: csel w11, w11, w9, gt +; CHECK-NEXT: cmp w14, #127 +; CHECK-NEXT: mov v0.s[1], w10 +; CHECK-NEXT: csel w14, w14, w8, lt +; CHECK-NEXT: cmn w14, #128 +; CHECK-NEXT: fcvtzs w10, d3 +; CHECK-NEXT: fmov s2, w11 +; CHECK-NEXT: csel w11, w14, w9, gt +; CHECK-NEXT: cmp w13, #127 +; CHECK-NEXT: mov w14, v0.s[1] +; CHECK-NEXT: csel w13, w13, w8, lt +; CHECK-NEXT: cmn w13, #128 +; CHECK-NEXT: csel w13, w13, w9, gt +; CHECK-NEXT: mov v2.s[1], w12 +; CHECK-NEXT: fcvtzs w12, d1 +; CHECK-NEXT: mov v0.b[1], w14 +; CHECK-NEXT: mov d1, v4.d[1] +; CHECK-NEXT: fmov s3, w13 +; CHECK-NEXT: cmp w12, #127 +; CHECK-NEXT: csel w12, w12, w8, lt +; CHECK-NEXT: fcvtzs w14, d4 +; CHECK-NEXT: cmn w12, #128 +; CHECK-NEXT: mov v3.s[1], 
w11 +; CHECK-NEXT: mov w11, v2.s[1] +; CHECK-NEXT: mov v0.b[2], v2.b[0] +; CHECK-NEXT: csel w12, w12, w9, gt +; CHECK-NEXT: fcvtzs w13, d1 +; CHECK-NEXT: cmp w10, #127 +; CHECK-NEXT: csel w10, w10, w8, lt +; CHECK-NEXT: mov d1, v5.d[1] +; CHECK-NEXT: cmn w10, #128 +; CHECK-NEXT: mov d2, v6.d[1] +; CHECK-NEXT: csel w10, w10, w9, gt +; CHECK-NEXT: cmp w13, #127 +; CHECK-NEXT: mov v0.b[3], w11 +; CHECK-NEXT: csel w13, w13, w8, lt +; CHECK-NEXT: cmn w13, #128 +; CHECK-NEXT: fcvtzs w11, d1 +; CHECK-NEXT: csel w13, w13, w9, gt +; CHECK-NEXT: cmp w14, #127 +; CHECK-NEXT: fmov s1, w10 +; CHECK-NEXT: csel w10, w14, w8, lt +; CHECK-NEXT: mov w14, v3.s[1] +; CHECK-NEXT: cmn w10, #128 +; CHECK-NEXT: mov v0.b[4], v3.b[0] +; CHECK-NEXT: csel w10, w10, w9, gt +; CHECK-NEXT: mov v1.s[1], w12 +; CHECK-NEXT: cmp w11, #127 +; CHECK-NEXT: csel w11, w11, w8, lt +; CHECK-NEXT: fcvtzs w12, d5 +; CHECK-NEXT: cmn w11, #128 +; CHECK-NEXT: mov v0.b[5], w14 +; CHECK-NEXT: fcvtzs w14, d2 +; CHECK-NEXT: fmov s2, w10 +; CHECK-NEXT: csel w10, w11, w9, gt +; CHECK-NEXT: mov w11, v1.s[1] +; CHECK-NEXT: cmp w12, #127 +; CHECK-NEXT: csel w12, w12, w8, lt +; CHECK-NEXT: mov v0.b[6], v1.b[0] +; CHECK-NEXT: cmn w12, #128 +; CHECK-NEXT: mov v2.s[1], w13 +; CHECK-NEXT: csel w12, w12, w9, gt +; CHECK-NEXT: cmp w14, #127 +; CHECK-NEXT: csel w13, w14, w8, lt +; CHECK-NEXT: mov v0.b[7], w11 +; CHECK-NEXT: fcvtzs w11, d6 +; CHECK-NEXT: cmn w13, #128 +; CHECK-NEXT: fmov s1, w12 +; CHECK-NEXT: csel w12, w13, w9, gt +; CHECK-NEXT: mov w13, v2.s[1] +; CHECK-NEXT: cmp w11, #127 +; CHECK-NEXT: mov v0.b[8], v2.b[0] +; CHECK-NEXT: mov d2, v7.d[1] +; CHECK-NEXT: csel w11, w11, w8, lt +; CHECK-NEXT: cmn w11, #128 +; CHECK-NEXT: mov v1.s[1], w10 +; CHECK-NEXT: csel w10, w11, w9, gt +; CHECK-NEXT: mov v0.b[9], w13 +; CHECK-NEXT: fcvtzs w11, d2 +; CHECK-NEXT: fmov s2, w10 +; CHECK-NEXT: fcvtzs w13, d7 +; CHECK-NEXT: mov w10, v1.s[1] +; CHECK-NEXT: mov v0.b[10], v1.b[0] +; CHECK-NEXT: cmp w11, #127 +; CHECK-NEXT: csel w11, 
w11, w8, lt +; CHECK-NEXT: cmn w11, #128 +; CHECK-NEXT: mov v2.s[1], w12 +; CHECK-NEXT: mov v0.b[11], w10 +; CHECK-NEXT: csel w10, w11, w9, gt +; CHECK-NEXT: cmp w13, #127 +; CHECK-NEXT: csel w8, w13, w8, lt +; CHECK-NEXT: cmn w8, #128 +; CHECK-NEXT: csel w8, w8, w9, gt +; CHECK-NEXT: mov w9, v2.s[1] +; CHECK-NEXT: mov v0.b[12], v2.b[0] +; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: mov v0.b[13], w9 +; CHECK-NEXT: mov v1.s[1], w10 +; CHECK-NEXT: mov v0.b[14], v1.b[0] +; CHECK-NEXT: mov w8, v1.s[1] +; CHECK-NEXT: mov v0.b[15], w8 +; CHECK-NEXT: ret %x = call <16 x i8> @llvm.fptosi.sat.v16f64.v16i8(<16 x double> %f) ret <16 x i8> %x } diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll --- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll +++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll @@ -2803,110 +2803,103 @@ define <16 x i8> @test_unsigned_v16f64_v16i8(<16 x double> %f) { ; CHECK-LABEL: test_unsigned_v16f64_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: mov d16, v0.d[1] -; CHECK-NEXT: fcvtzu w10, d0 -; CHECK-NEXT: mov d0, v1.d[1] -; CHECK-NEXT: mov w8, #255 -; CHECK-NEXT: fcvtzu w12, d1 -; CHECK-NEXT: mov d1, v2.d[1] -; CHECK-NEXT: fcvtzu w9, d16 -; CHECK-NEXT: fcvtzu w11, d0 -; CHECK-NEXT: cmp w9, #255 -; CHECK-NEXT: csel w9, w9, w8, lo -; CHECK-NEXT: cmp w10, #255 -; CHECK-NEXT: csel w10, w10, w8, lo -; CHECK-NEXT: cmp w11, #255 -; CHECK-NEXT: fmov s0, w10 -; CHECK-NEXT: csel w10, w11, w8, lo -; CHECK-NEXT: cmp w12, #255 -; CHECK-NEXT: csel w11, w12, w8, lo -; CHECK-NEXT: mov v0.s[1], w9 -; CHECK-NEXT: fcvtzu w9, d1 -; CHECK-NEXT: fmov s1, w11 -; CHECK-NEXT: fcvtzu w11, d2 -; CHECK-NEXT: cmp w9, #255 -; CHECK-NEXT: mov d2, v3.d[1] -; CHECK-NEXT: mov w12, v0.s[1] -; CHECK-NEXT: csel w9, w9, w8, lo -; CHECK-NEXT: mov v1.s[1], w10 -; CHECK-NEXT: cmp w11, #255 -; CHECK-NEXT: csel w11, w11, w8, lo -; CHECK-NEXT: fcvtzu w10, d2 -; CHECK-NEXT: mov d2, v4.d[1] -; CHECK-NEXT: mov v0.b[1], w12 -; CHECK-NEXT: fmov w13, s1 
-; CHECK-NEXT: mov w12, v1.s[1] -; CHECK-NEXT: fmov s1, w11 -; CHECK-NEXT: fcvtzu w11, d3 -; CHECK-NEXT: cmp w10, #255 -; CHECK-NEXT: mov v0.b[2], w13 -; CHECK-NEXT: mov v1.s[1], w9 -; CHECK-NEXT: csel w9, w10, w8, lo -; CHECK-NEXT: cmp w11, #255 -; CHECK-NEXT: fcvtzu w10, d2 -; CHECK-NEXT: csel w11, w11, w8, lo -; CHECK-NEXT: mov d2, v5.d[1] -; CHECK-NEXT: mov v0.b[3], w12 -; CHECK-NEXT: fmov w12, s1 -; CHECK-NEXT: cmp w10, #255 -; CHECK-NEXT: mov w13, v1.s[1] -; CHECK-NEXT: fmov s1, w11 -; CHECK-NEXT: fcvtzu w11, d4 -; CHECK-NEXT: mov v0.b[4], w12 -; CHECK-NEXT: mov v1.s[1], w9 -; CHECK-NEXT: csel w9, w10, w8, lo -; CHECK-NEXT: cmp w11, #255 -; CHECK-NEXT: csel w10, w11, w8, lo -; CHECK-NEXT: mov v0.b[5], w13 -; CHECK-NEXT: fcvtzu w13, d2 -; CHECK-NEXT: fmov w11, s1 -; CHECK-NEXT: mov w12, v1.s[1] -; CHECK-NEXT: fmov s1, w10 -; CHECK-NEXT: fcvtzu w10, d5 -; CHECK-NEXT: cmp w13, #255 -; CHECK-NEXT: mov v0.b[6], w11 -; CHECK-NEXT: mov d2, v6.d[1] -; CHECK-NEXT: mov v1.s[1], w9 -; CHECK-NEXT: csel w9, w13, w8, lo -; CHECK-NEXT: cmp w10, #255 -; CHECK-NEXT: fcvtzu w13, d6 -; CHECK-NEXT: csel w10, w10, w8, lo -; CHECK-NEXT: mov v0.b[7], w12 -; CHECK-NEXT: fcvtzu w12, d2 -; CHECK-NEXT: fmov w11, s1 -; CHECK-NEXT: fmov s2, w10 -; CHECK-NEXT: mov w10, v1.s[1] -; CHECK-NEXT: cmp w12, #255 -; CHECK-NEXT: mov d1, v7.d[1] -; CHECK-NEXT: mov v0.b[8], w11 -; CHECK-NEXT: mov v2.s[1], w9 -; CHECK-NEXT: csel w9, w12, w8, lo -; CHECK-NEXT: cmp w13, #255 -; CHECK-NEXT: csel w11, w13, w8, lo -; CHECK-NEXT: fcvtzu w13, d7 -; CHECK-NEXT: mov v0.b[9], w10 -; CHECK-NEXT: fmov w10, s2 -; CHECK-NEXT: fmov s3, w11 -; CHECK-NEXT: fcvtzu w11, d1 -; CHECK-NEXT: mov w12, v2.s[1] -; CHECK-NEXT: mov v0.b[10], w10 -; CHECK-NEXT: mov v3.s[1], w9 -; CHECK-NEXT: cmp w11, #255 -; CHECK-NEXT: csel w9, w11, w8, lo -; CHECK-NEXT: cmp w13, #255 -; CHECK-NEXT: csel w8, w13, w8, lo -; CHECK-NEXT: mov v0.b[11], w12 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: fmov s1, w8 -; CHECK-NEXT: mov w8, v3.s[1] -; 
CHECK-NEXT: mov v0.b[12], w10 -; CHECK-NEXT: mov v1.s[1], w9 -; CHECK-NEXT: mov v0.b[13], w8 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov w9, v1.s[1] -; CHECK-NEXT: mov v0.b[14], w8 -; CHECK-NEXT: mov v0.b[15], w9 -; CHECK-NEXT: ret +; CHECK-NEXT: mov d16, v0.d[1] +; CHECK-NEXT: fcvtzu w10, d0 +; CHECK-NEXT: mov d0, v1.d[1] +; CHECK-NEXT: mov w8, #255 +; CHECK-NEXT: fcvtzu w12, d1 +; CHECK-NEXT: mov d1, v2.d[1] +; CHECK-NEXT: fcvtzu w9, d16 +; CHECK-NEXT: fcvtzu w11, d0 +; CHECK-NEXT: cmp w9, #255 +; CHECK-NEXT: csel w9, w9, w8, lo +; CHECK-NEXT: cmp w10, #255 +; CHECK-NEXT: csel w10, w10, w8, lo +; CHECK-NEXT: cmp w11, #255 +; CHECK-NEXT: fmov s0, w10 +; CHECK-NEXT: csel w10, w11, w8, lo +; CHECK-NEXT: cmp w12, #255 +; CHECK-NEXT: csel w11, w12, w8, lo +; CHECK-NEXT: fcvtzu w12, d2 +; CHECK-NEXT: mov v0.s[1], w9 +; CHECK-NEXT: fcvtzu w9, d1 +; CHECK-NEXT: mov d1, v3.d[1] +; CHECK-NEXT: fmov s2, w11 +; CHECK-NEXT: cmp w9, #255 +; CHECK-NEXT: mov w11, v0.s[1] +; CHECK-NEXT: csel w9, w9, w8, lo +; CHECK-NEXT: cmp w12, #255 +; CHECK-NEXT: mov v2.s[1], w10 +; CHECK-NEXT: csel w12, w12, w8, lo +; CHECK-NEXT: fcvtzu w10, d1 +; CHECK-NEXT: mov v0.b[1], w11 +; CHECK-NEXT: fcvtzu w11, d3 +; CHECK-NEXT: fmov s3, w12 +; CHECK-NEXT: mov w12, v2.s[1] +; CHECK-NEXT: cmp w10, #255 +; CHECK-NEXT: mov d1, v4.d[1] +; CHECK-NEXT: csel w10, w10, w8, lo +; CHECK-NEXT: mov v0.b[2], v2.b[0] +; CHECK-NEXT: cmp w11, #255 +; CHECK-NEXT: mov v3.s[1], w9 +; CHECK-NEXT: csel w11, w11, w8, lo +; CHECK-NEXT: fcvtzu w9, d1 +; CHECK-NEXT: mov d1, v5.d[1] +; CHECK-NEXT: mov v0.b[3], w12 +; CHECK-NEXT: fmov s2, w11 +; CHECK-NEXT: mov w11, v3.s[1] +; CHECK-NEXT: fcvtzu w12, d4 +; CHECK-NEXT: cmp w9, #255 +; CHECK-NEXT: csel w9, w9, w8, lo +; CHECK-NEXT: mov v0.b[4], v3.b[0] +; CHECK-NEXT: cmp w12, #255 +; CHECK-NEXT: mov v2.s[1], w10 +; CHECK-NEXT: csel w12, w12, w8, lo +; CHECK-NEXT: fcvtzu w10, d1 +; CHECK-NEXT: mov v0.b[5], w11 +; CHECK-NEXT: fcvtzu w11, d5 +; CHECK-NEXT: fmov s1, w12 +; 
CHECK-NEXT: mov w12, v2.s[1] +; CHECK-NEXT: cmp w10, #255 +; CHECK-NEXT: csel w10, w10, w8, lo +; CHECK-NEXT: cmp w11, #255 +; CHECK-NEXT: mov v0.b[6], v2.b[0] +; CHECK-NEXT: mov d2, v6.d[1] +; CHECK-NEXT: mov v1.s[1], w9 +; CHECK-NEXT: csel w11, w11, w8, lo +; CHECK-NEXT: fcvtzu w9, d6 +; CHECK-NEXT: mov v0.b[7], w12 +; CHECK-NEXT: fcvtzu w12, d2 +; CHECK-NEXT: fmov s2, w11 +; CHECK-NEXT: mov w11, v1.s[1] +; CHECK-NEXT: cmp w12, #255 +; CHECK-NEXT: mov v0.b[8], v1.b[0] +; CHECK-NEXT: csel w12, w12, w8, lo +; CHECK-NEXT: cmp w9, #255 +; CHECK-NEXT: mov d1, v7.d[1] +; CHECK-NEXT: mov v2.s[1], w10 +; CHECK-NEXT: csel w9, w9, w8, lo +; CHECK-NEXT: mov v0.b[9], w11 +; CHECK-NEXT: fcvtzu w11, d7 +; CHECK-NEXT: fmov s3, w9 +; CHECK-NEXT: mov w9, v2.s[1] +; CHECK-NEXT: fcvtzu w10, d1 +; CHECK-NEXT: mov v0.b[10], v2.b[0] +; CHECK-NEXT: mov v3.s[1], w12 +; CHECK-NEXT: cmp w10, #255 +; CHECK-NEXT: mov v0.b[11], w9 +; CHECK-NEXT: csel w9, w10, w8, lo +; CHECK-NEXT: cmp w11, #255 +; CHECK-NEXT: mov w10, v3.s[1] +; CHECK-NEXT: csel w8, w11, w8, lo +; CHECK-NEXT: mov v0.b[12], v3.b[0] +; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: mov v0.b[13], w10 +; CHECK-NEXT: mov v1.s[1], w9 +; CHECK-NEXT: mov v0.b[14], v1.b[0] +; CHECK-NEXT: mov w8, v1.s[1] +; CHECK-NEXT: mov v0.b[15], w8 +; CHECK-NEXT: ret %x = call <16 x i8> @llvm.fptoui.sat.v16f64.v16i8(<16 x double> %f) ret <16 x i8> %x } diff --git a/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll b/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll --- a/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll +++ b/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll @@ -37,20 +37,19 @@ define <8 x i8> @extract_2_v4i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: extract_2_v4i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: mov w9, v0.s[2] -; CHECK-NEXT: mov w10, v0.s[3] -; CHECK-NEXT: mov v0.b[1], w8 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov v0.b[2], w9 -; CHECK-NEXT: mov w9, v1.s[1] -; CHECK-NEXT: mov v0.b[3], 
w10 -; CHECK-NEXT: mov v0.b[4], w8 -; CHECK-NEXT: mov w8, v1.s[2] -; CHECK-NEXT: mov v0.b[5], w9 -; CHECK-NEXT: mov w9, v1.s[3] -; CHECK-NEXT: mov v0.b[6], w8 -; CHECK-NEXT: mov v0.b[7], w9 +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: mov w9, v0.s[2] +; CHECK-NEXT: mov w10, v0.s[3] +; CHECK-NEXT: mov v0.b[1], w8 +; CHECK-NEXT: mov w8, v1.s[1] +; CHECK-NEXT: mov v0.b[2], w9 +; CHECK-NEXT: mov w9, v1.s[2] +; CHECK-NEXT: mov v0.b[3], w10 +; CHECK-NEXT: mov v0.b[4], v1.b[0] +; CHECK-NEXT: mov v0.b[5], w8 +; CHECK-NEXT: mov w8, v1.s[3] +; CHECK-NEXT: mov v0.b[6], w9 +; CHECK-NEXT: mov v0.b[7], w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir b/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir @@ -0,0 +1,592 @@ +# RUN: llc -run-pass=aarch64-mi-peephole-opt -simplify-mir -o - %s | FileCheck %s +--- | + ; ModuleID = '/Users/nilanjana/Documents/code/llvm-project/llvm/test/CodeGen/AArch64/tmp.ll' + source_filename = "/Users/nilanjana/Documents/code/llvm-project/llvm/test/CodeGen/AArch64/tmp.ll" + target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + target triple = "arm64-apple-ios" + + ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) + declare i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8>) #0 + + ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) + declare i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8>) #0 + + ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) + declare i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16>) #0 + + ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) + declare i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16>) #0 + + ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) + declare i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32>) #0 + + 
define void @insert_vec_v6i64_uaddlv_from_v4i32(ptr %0) { + entry: + %vaddlv = tail call i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32> zeroinitializer) + %1 = insertelement <6 x i64> zeroinitializer, i64 %vaddlv, i64 0 + %2 = uitofp <6 x i64> %1 to <6 x float> + store <6 x float> %2, ptr %0, align 8 + ret void + } + + define void @insert_vec_v2i32_uaddlv_from_v8i16(ptr %0) { + entry: + %vaddlv = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> zeroinitializer) + %1 = insertelement <2 x i32> zeroinitializer, i32 %vaddlv, i64 0 + %2 = uitofp <2 x i32> %1 to <2 x float> + store <2 x float> %2, ptr %0, align 8 + ret void + } + + define void @insert_vec_v8i16_uaddlv_from_v8i16(ptr %0) { + entry: + %vaddlv = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> zeroinitializer) + %1 = trunc i32 %vaddlv to i16 + %2 = insertelement <8 x i16> zeroinitializer, i16 %1, i64 0 + %3 = uitofp <8 x i16> %2 to <8 x float> + store <8 x float> %3, ptr %0, align 8 + ret void + } + + define void @insert_vec_v16i8_uaddlv_from_v4i32(ptr %0) { + entry: + %vaddlv = tail call i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32> zeroinitializer) + %1 = trunc i64 %vaddlv to i8 + %2 = insertelement <16 x i8> zeroinitializer, i8 %1, i64 0 + %3 = uitofp <16 x i8> %2 to <16 x float> + store <16 x float> %3, ptr %0, align 8 + ret void + } + + define void @insert_vec_v2i32_uaddlv_from_v8i16_nz_index(ptr %0) { + entry: + %vaddlv = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> zeroinitializer) + %1 = insertelement <4 x i32> zeroinitializer, i32 %vaddlv, i64 2 + %2 = uitofp <4 x i32> %1 to <4 x float> + store <4 x float> %2, ptr %0, align 8 + ret void + } + + attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) } + +... 
+--- +name: insert_vec_v6i64_uaddlv_from_v4i32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +callsEHReturn: false +callsUnwindInit: false +hasEHCatchret: false +hasEHScopes: false +hasEHFunclets: false +debugInstrRef: false +failsVerification: false +tracksDebugUserValues: false +registers: + - { id: 0, class: gpr64common, preferred-register: '' } + - { id: 1, class: fpr128, preferred-register: '' } + - { id: 2, class: fpr64, preferred-register: '' } + - { id: 3, class: fpr128, preferred-register: '' } + - { id: 4, class: fpr128, preferred-register: '' } + - { id: 5, class: gpr64, preferred-register: '' } + - { id: 6, class: fpr128, preferred-register: '' } + - { id: 7, class: fpr128, preferred-register: '' } + - { id: 8, class: fpr64, preferred-register: '' } + - { id: 9, class: fpr128, preferred-register: '' } + - { id: 10, class: fpr128, preferred-register: '' } + - { id: 11, class: fpr128, preferred-register: '' } + - { id: 12, class: fpr64, preferred-register: '' } + - { id: 13, class: fpr128, preferred-register: '' } + - { id: 14, class: fpr128, preferred-register: '' } + - { id: 15, class: fpr128, preferred-register: '' } + - { id: 16, class: gpr64all, preferred-register: '' } + - { id: 17, class: fpr64, preferred-register: '' } +liveins: + - { reg: '$x0', virtual-reg: '%0' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + functionContext: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: 
{} +body: | + ; CHECK-LABEL: name: insert_vec_v6i64_uaddlv_from_v4i32 + ; CHECK: bb.0.entry: + ; CHECK: liveins: $x0 + ; CHECK: %0:gpr64common = COPY $x0 + ; CHECK: %1:fpr128 = MOVIv2d_ns 0 + ; CHECK: %2:fpr64 = UADDLVv4i32v %1 + ; CHECK: %4:fpr128 = IMPLICIT_DEF + ; CHECK: %3:fpr128 = INSERT_SUBREG %4, killed %2, %subreg.dsub + ; CHECK: %7:fpr128 = INSvi64lane %1, 0, killed %3, 0 + ; CHECK: %8:fpr64 = MOVID 0 + ; CHECK: %10:fpr128 = IMPLICIT_DEF + ; CHECK: %9:fpr128 = INSERT_SUBREG %10, killed %8, %subreg.dsub + ; CHECK: %11:fpr128 = nofpexcept UCVTFv2f64 killed %7, implicit $fpcr + ; CHECK: %12:fpr64 = nofpexcept FCVTNv2i32 killed %11, implicit $fpcr + ; CHECK: %14:fpr128 = IMPLICIT_DEF + ; CHECK: %13:fpr128 = INSERT_SUBREG %14, killed %12, %subreg.dsub + ; CHECK: %15:fpr128 = INSvi64lane %13, 1, killed %9, 0 + ; CHECK: %17:fpr64 = COPY %1.dsub + ; CHECK: STRDui killed %17, %0, 2 :: (store (s64) into %ir.0 + 16) + ; CHECK: STRQui killed %15, %0, 0 :: (store (s128) into %ir.0, align 8) + ; CHECK: RET_ReallyLR + bb.0.entry: + liveins: $x0 + + %0:gpr64common = COPY $x0 + %1:fpr128 = MOVIv2d_ns 0 + %2:fpr64 = UADDLVv4i32v %1 + %4:fpr128 = IMPLICIT_DEF + %3:fpr128 = INSERT_SUBREG %4, killed %2, %subreg.dsub + %5:gpr64 = COPY %3.dsub + %7:fpr128 = INSvi64gpr %1, 0, killed %5 + %8:fpr64 = MOVID 0 + %10:fpr128 = IMPLICIT_DEF + %9:fpr128 = INSERT_SUBREG %10, killed %8, %subreg.dsub + %11:fpr128 = nofpexcept UCVTFv2f64 killed %7, implicit $fpcr + %12:fpr64 = nofpexcept FCVTNv2i32 killed %11, implicit $fpcr + %14:fpr128 = IMPLICIT_DEF + %13:fpr128 = INSERT_SUBREG %14, killed %12, %subreg.dsub + %15:fpr128 = INSvi64lane %13, 1, killed %9, 0 + %17:fpr64 = COPY %1.dsub + STRDui killed %17, %0, 2 :: (store (s64) into %ir.0 + 16) + STRQui killed %15, %0, 0 :: (store (s128) into %ir.0, align 8) + RET_ReallyLR + +... 
+--- +name: insert_vec_v2i32_uaddlv_from_v8i16 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +callsEHReturn: false +callsUnwindInit: false +hasEHCatchret: false +hasEHScopes: false +hasEHFunclets: false +debugInstrRef: false +failsVerification: false +tracksDebugUserValues: false +registers: + - { id: 0, class: gpr64common, preferred-register: '' } + - { id: 1, class: fpr128, preferred-register: '' } + - { id: 2, class: fpr32, preferred-register: '' } + - { id: 3, class: fpr128, preferred-register: '' } + - { id: 4, class: fpr128, preferred-register: '' } + - { id: 5, class: gpr32, preferred-register: '' } + - { id: 6, class: fpr64, preferred-register: '' } + - { id: 7, class: fpr128, preferred-register: '' } + - { id: 8, class: fpr128, preferred-register: '' } + - { id: 9, class: fpr128, preferred-register: '' } + - { id: 10, class: fpr64, preferred-register: '' } + - { id: 11, class: fpr64, preferred-register: '' } +liveins: + - { reg: '$x0', virtual-reg: '%0' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + functionContext: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: insert_vec_v2i32_uaddlv_from_v8i16 + ; CHECK: bb.0.entry: + ; CHECK: liveins: $x0 + ; CHECK: %0:gpr64common = COPY $x0 + ; CHECK: %1:fpr128 = MOVIv2d_ns 0 + ; CHECK: %2:fpr32 = UADDLVv8i16v killed %1 + ; CHECK: %4:fpr128 = IMPLICIT_DEF + ; CHECK: %3:fpr128 = INSERT_SUBREG %4, killed %2, 
%subreg.ssub + ; CHECK: %6:fpr64 = MOVID 0 + ; CHECK: %8:fpr128 = IMPLICIT_DEF + ; CHECK: %7:fpr128 = INSERT_SUBREG %8, killed %6, %subreg.dsub + ; CHECK: %9:fpr128 = INSvi32lane %7, 0, killed %3, 0 + ; CHECK: %10:fpr64 = COPY %9.dsub + ; CHECK: %11:fpr64 = nofpexcept UCVTFv2f32 killed %10, implicit $fpcr + ; CHECK: STRDui killed %11, %0, 0 :: (store (s64) into %ir.0) + ; CHECK: RET_ReallyLR + bb.0.entry: + liveins: $x0 + + %0:gpr64common = COPY $x0 + %1:fpr128 = MOVIv2d_ns 0 + %2:fpr32 = UADDLVv8i16v killed %1 + %4:fpr128 = IMPLICIT_DEF + %3:fpr128 = INSERT_SUBREG %4, killed %2, %subreg.ssub + %5:gpr32 = COPY %3.ssub + %6:fpr64 = MOVID 0 + %8:fpr128 = IMPLICIT_DEF + %7:fpr128 = INSERT_SUBREG %8, killed %6, %subreg.dsub + %9:fpr128 = INSvi32gpr %7, 0, killed %5 + %10:fpr64 = COPY %9.dsub + %11:fpr64 = nofpexcept UCVTFv2f32 killed %10, implicit $fpcr + STRDui killed %11, %0, 0 :: (store (s64) into %ir.0) + RET_ReallyLR + +... +--- +name: insert_vec_v8i16_uaddlv_from_v8i16 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +callsEHReturn: false +callsUnwindInit: false +hasEHCatchret: false +hasEHScopes: false +hasEHFunclets: false +debugInstrRef: false +failsVerification: false +tracksDebugUserValues: false +registers: + - { id: 0, class: gpr64common, preferred-register: '' } + - { id: 1, class: fpr128, preferred-register: '' } + - { id: 2, class: fpr32, preferred-register: '' } + - { id: 3, class: fpr128, preferred-register: '' } + - { id: 4, class: fpr128, preferred-register: '' } + - { id: 5, class: gpr32, preferred-register: '' } + - { id: 6, class: fpr64, preferred-register: '' } + - { id: 7, class: fpr128, preferred-register: '' } + - { id: 8, class: fpr128, preferred-register: '' } + - { id: 9, class: fpr128, preferred-register: '' } + - { id: 10, class: fpr64, preferred-register: '' } + - { id: 11, class: fpr128, preferred-register: '' } + - { id: 
12, class: fpr128, preferred-register: '' } + - { id: 13, class: gpr32, preferred-register: '' } +liveins: + - { reg: '$x0', virtual-reg: '%0' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + functionContext: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: insert_vec_v8i16_uaddlv_from_v8i16 + ; CHECK: bb.0.entry: + ; CHECK: liveins: $x0 + ; CHECK: %0:gpr64common = COPY $x0 + ; CHECK: %1:fpr128 = MOVIv2d_ns 0 + ; CHECK: %2:fpr32 = UADDLVv8i16v killed %1 + ; CHECK: %4:fpr128 = IMPLICIT_DEF + ; CHECK: %3:fpr128 = INSERT_SUBREG %4, killed %2, %subreg.ssub + ; CHECK: %6:fpr64 = MOVID 0 + ; CHECK: %8:fpr128 = IMPLICIT_DEF + ; CHECK: %7:fpr128 = INSERT_SUBREG %8, killed %6, %subreg.dsub + ; CHECK: %9:fpr128 = INSvi16lane %7, 0, killed %3, 0 + ; CHECK: %10:fpr64 = COPY %9.dsub + ; CHECK: %11:fpr128 = USHLLv4i16_shift killed %10, 0 + ; CHECK: %12:fpr128 = nofpexcept UCVTFv4f32 killed %11, implicit $fpcr + ; CHECK: %13:gpr32 = COPY $wzr + ; CHECK: STRWui %13, %0, 7 :: (store (s32) into %ir.0 + 28) + ; CHECK: STRWui %13, %0, 6 :: (store (s32) into %ir.0 + 24, align 8) + ; CHECK: STRWui %13, %0, 5 :: (store (s32) into %ir.0 + 20) + ; CHECK: STRWui %13, %0, 4 :: (store (s32) into %ir.0 + 16, align 8) + ; CHECK: STRQui killed %12, %0, 0 :: (store (s128) into %ir.0, align 8) + ; CHECK: RET_ReallyLR + bb.0.entry: + liveins: $x0 + + %0:gpr64common = COPY $x0 + %1:fpr128 = MOVIv2d_ns 0 + %2:fpr32 = UADDLVv8i16v killed %1 + %4:fpr128 = IMPLICIT_DEF + %3:fpr128 = INSERT_SUBREG %4, killed %2, 
%subreg.ssub + %5:gpr32 = COPY %3.ssub + %6:fpr64 = MOVID 0 + %8:fpr128 = IMPLICIT_DEF + %7:fpr128 = INSERT_SUBREG %8, killed %6, %subreg.dsub + %9:fpr128 = INSvi16gpr %7, 0, killed %5 + %10:fpr64 = COPY %9.dsub + %11:fpr128 = USHLLv4i16_shift killed %10, 0 + %12:fpr128 = nofpexcept UCVTFv4f32 killed %11, implicit $fpcr + %13:gpr32 = COPY $wzr + STRWui %13, %0, 7 :: (store (s32) into %ir.0 + 28) + STRWui %13, %0, 6 :: (store (s32) into %ir.0 + 24, align 8) + STRWui %13, %0, 5 :: (store (s32) into %ir.0 + 20) + STRWui %13, %0, 4 :: (store (s32) into %ir.0 + 16, align 8) + STRQui killed %12, %0, 0 :: (store (s128) into %ir.0, align 8) + RET_ReallyLR + +... +--- +name: insert_vec_v16i8_uaddlv_from_v4i32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +callsEHReturn: false +callsUnwindInit: false +hasEHCatchret: false +hasEHScopes: false +hasEHFunclets: false +debugInstrRef: false +failsVerification: false +tracksDebugUserValues: false +registers: + - { id: 0, class: gpr64common, preferred-register: '' } + - { id: 1, class: fpr128, preferred-register: '' } + - { id: 2, class: fpr64, preferred-register: '' } + - { id: 3, class: fpr128, preferred-register: '' } + - { id: 4, class: fpr128, preferred-register: '' } + - { id: 5, class: gpr64all, preferred-register: '' } + - { id: 6, class: gpr32, preferred-register: '' } + - { id: 7, class: fpr64, preferred-register: '' } + - { id: 8, class: fpr128, preferred-register: '' } + - { id: 9, class: fpr128, preferred-register: '' } + - { id: 10, class: fpr128, preferred-register: '' } + - { id: 11, class: fpr64, preferred-register: '' } + - { id: 12, class: fpr64, preferred-register: '' } + - { id: 13, class: fpr64, preferred-register: '' } + - { id: 14, class: fpr64, preferred-register: '' } + - { id: 15, class: fpr128, preferred-register: '' } + - { id: 16, class: fpr128, preferred-register: '' } + - { id: 17, 
class: fpr128, preferred-register: '' } +liveins: + - { reg: '$x0', virtual-reg: '%0' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + functionContext: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: insert_vec_v16i8_uaddlv_from_v4i32 + ; CHECK: bb.0.entry: + ; CHECK: liveins: $x0 + ; CHECK: %0:gpr64common = COPY $x0 + ; CHECK: %1:fpr128 = MOVIv2d_ns 0 + ; CHECK: %2:fpr64 = UADDLVv4i32v %1 + ; CHECK: %4:fpr128 = IMPLICIT_DEF + ; CHECK: %3:fpr128 = INSERT_SUBREG %4, killed %2, %subreg.dsub + ; CHECK: %7:fpr64 = MOVID 0 + ; CHECK: %9:fpr128 = IMPLICIT_DEF + ; CHECK: %8:fpr128 = INSERT_SUBREG %9, killed %7, %subreg.dsub + ; CHECK: %10:fpr128 = INSvi8lane %8, 0, killed %3, 0 + ; CHECK: %11:fpr64 = COPY %10.dsub + ; CHECK: %13:fpr64 = IMPLICIT_DEF + ; CHECK: %12:fpr64 = ZIP1v8i8 killed %11, killed %13 + ; CHECK: %14:fpr64 = BICv4i16 %12, 255, 8 + ; CHECK: %15:fpr128 = USHLLv4i16_shift killed %14, 0 + ; CHECK: %16:fpr128 = nofpexcept UCVTFv4f32 killed %15, implicit $fpcr + ; CHECK: STRQui %1, %0, 3 :: (store (s128) into %ir.0 + 48, align 8) + ; CHECK: STRQui %1, %0, 2 :: (store (s128) into %ir.0 + 32, align 8) + ; CHECK: STRQui %1, %0, 1 :: (store (s128) into %ir.0 + 16, align 8) + ; CHECK: STRQui killed %16, %0, 0 :: (store (s128) into %ir.0, align 8) + ; CHECK: RET_ReallyLR + bb.0.entry: + liveins: $x0 + + %0:gpr64common = COPY $x0 + %1:fpr128 = MOVIv2d_ns 0 + %2:fpr64 = UADDLVv4i32v %1 + %4:fpr128 = IMPLICIT_DEF + %3:fpr128 = INSERT_SUBREG %4, killed %2, %subreg.dsub + 
%5:gpr64all = COPY %3.dsub + %6:gpr32 = COPY %5.sub_32 + %7:fpr64 = MOVID 0 + %9:fpr128 = IMPLICIT_DEF + %8:fpr128 = INSERT_SUBREG %9, killed %7, %subreg.dsub + %10:fpr128 = INSvi8gpr %8, 0, killed %6 + %11:fpr64 = COPY %10.dsub + %13:fpr64 = IMPLICIT_DEF + %12:fpr64 = ZIP1v8i8 killed %11, killed %13 + %14:fpr64 = BICv4i16 %12, 255, 8 + %15:fpr128 = USHLLv4i16_shift killed %14, 0 + %16:fpr128 = nofpexcept UCVTFv4f32 killed %15, implicit $fpcr + STRQui %1, %0, 3 :: (store (s128) into %ir.0 + 48, align 8) + STRQui %1, %0, 2 :: (store (s128) into %ir.0 + 32, align 8) + STRQui %1, %0, 1 :: (store (s128) into %ir.0 + 16, align 8) + STRQui killed %16, %0, 0 :: (store (s128) into %ir.0, align 8) + RET_ReallyLR + +... +--- +name: insert_vec_v2i32_uaddlv_from_v8i16_nz_index +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +callsEHReturn: false +callsUnwindInit: false +hasEHCatchret: false +hasEHScopes: false +hasEHFunclets: false +debugInstrRef: false +failsVerification: false +tracksDebugUserValues: false +registers: + - { id: 0, class: gpr64common, preferred-register: '' } + - { id: 1, class: fpr128, preferred-register: '' } + - { id: 2, class: fpr32, preferred-register: '' } + - { id: 3, class: fpr128, preferred-register: '' } + - { id: 4, class: fpr128, preferred-register: '' } + - { id: 5, class: gpr32, preferred-register: '' } + - { id: 6, class: fpr128, preferred-register: '' } + - { id: 7, class: fpr128, preferred-register: '' } + - { id: 8, class: fpr128, preferred-register: '' } +liveins: + - { reg: '$x0', virtual-reg: '%0' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + functionContext: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + 
hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: insert_vec_v2i32_uaddlv_from_v8i16_nz_index + ; CHECK: bb.0.entry: + ; CHECK: liveins: $x0 + ; CHECK: %0:gpr64common = COPY $x0 + ; CHECK: %1:fpr128 = MOVIv2d_ns 0 + ; CHECK: %2:fpr32 = UADDLVv8i16v %1 + ; CHECK: %4:fpr128 = IMPLICIT_DEF + ; CHECK: %3:fpr128 = INSERT_SUBREG %4, killed %2, %subreg.ssub + ; CHECK: %7:fpr128 = INSvi32lane %1, 2, killed %3, 0 + ; CHECK: %8:fpr128 = nofpexcept UCVTFv4f32 killed %7, implicit $fpcr + ; CHECK: STRQui killed %8, %0, 0 :: (store (s128) into %ir.0, align 8) + ; CHECK: RET_ReallyLR + bb.0.entry: + liveins: $x0 + + %0:gpr64common = COPY $x0 + %1:fpr128 = MOVIv2d_ns 0 + %2:fpr32 = UADDLVv8i16v %1 + %4:fpr128 = IMPLICIT_DEF + %3:fpr128 = INSERT_SUBREG %4, killed %2, %subreg.ssub + %5:gpr32 = COPY %3.ssub + %7:fpr128 = INSvi32gpr %1, 2, killed %5 + %8:fpr128 = nofpexcept UCVTFv4f32 killed %7, implicit $fpcr + STRQui killed %8, %0, 0 :: (store (s128) into %ir.0, align 8) + RET_ReallyLR + +... 
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll @@ -571,26 +571,25 @@ define void @masked_gather_v2f16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_gather_v2f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr s1, [x0] -; CHECK-NEXT: movi d0, #0000000000000000 -; CHECK-NEXT: ptrue p0.d, vl4 -; CHECK-NEXT: fcmeq v1.4h, v1.4h, #0.0 -; CHECK-NEXT: sshll v1.4s, v1.4h, #0 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov w9, v1.s[1] -; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: mov v0.h[0], w8 -; CHECK-NEXT: mov v0.h[1], w9 -; CHECK-NEXT: shl v0.4h, v0.4h, #15 -; CHECK-NEXT: cmlt v0.4h, v0.4h, #0 -; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1h { z0.d }, p0/z, [z1.d] -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: str s0, [x0] -; CHECK-NEXT: ret +; CHECK-NEXT: ldr s1, [x0] +; CHECK-NEXT: movi d0, #0000000000000000 +; CHECK-NEXT: ptrue p0.d, vl4 +; CHECK-NEXT: fcmeq v1.4h, v1.4h, #0.0 +; CHECK-NEXT: sshll v1.4s, v1.4h, #0 +; CHECK-NEXT: mov v0.h[0], v1.h[0] +; CHECK-NEXT: mov w8, v1.s[1] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: shl v0.4h, v0.4h, #15 +; CHECK-NEXT: cmlt v0.4h, v0.4h, #0 +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: sunpklo z0.d, z0.s +; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; CHECK-NEXT: ld1h { z0.d }, p0/z, [z1.d] +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: str s0, [x0] +; CHECK-NEXT: ret %cval = load <2 x half>, ptr %a %ptrs = load <2 x ptr>, ptr %b %mask = fcmp oeq <2 x half> %cval, zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll --- 
a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll @@ -12,20 +12,19 @@ define <2 x half> @masked_load_v2f16(ptr %ap, ptr %bp) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_load_v2f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr s1, [x0] -; CHECK-NEXT: movi d0, #0000000000000000 -; CHECK-NEXT: ldr s2, [x1] -; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: fcmeq v1.4h, v1.4h, v2.4h -; CHECK-NEXT: sshll v1.4s, v1.4h, #0 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov w9, v1.s[1] -; CHECK-NEXT: mov v0.h[0], w8 -; CHECK-NEXT: mov v0.h[1], w9 -; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 -; CHECK-NEXT: ret +; CHECK-NEXT: ldr s1, [x0] +; CHECK-NEXT: movi d0, #0000000000000000 +; CHECK-NEXT: ldr s2, [x1] +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: fcmeq v1.4h, v1.4h, v2.4h +; CHECK-NEXT: sshll v1.4s, v1.4h, #0 +; CHECK-NEXT: mov v0.h[0], v1.h[0] +; CHECK-NEXT: mov w8, v1.s[1] +; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: ret %a = load <2 x half>, ptr %ap %b = load <2 x half>, ptr %bp %mask = fcmp oeq <2 x half> %a, %b diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll @@ -533,25 +533,24 @@ define void @masked_scatter_v2f16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_scatter_v2f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr s1, [x0] -; CHECK-NEXT: movi d0, #0000000000000000 -; CHECK-NEXT: ptrue p0.d, vl4 -; CHECK-NEXT: fcmeq v2.4h, v1.4h, #0.0 -; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: sshll v2.4s, v2.4h, #0 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov 
w9, v2.s[1] -; CHECK-NEXT: ldr q2, [x1] -; CHECK-NEXT: mov v0.h[0], w8 -; CHECK-NEXT: mov v0.h[1], w9 -; CHECK-NEXT: shl v0.4h, v0.4h, #15 -; CHECK-NEXT: cmlt v0.4h, v0.4h, #0 -; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: uunpklo z0.d, z1.s -; CHECK-NEXT: st1h { z0.d }, p0, [z2.d] -; CHECK-NEXT: ret +; CHECK-NEXT: ldr s1, [x0] +; CHECK-NEXT: movi d0, #0000000000000000 +; CHECK-NEXT: ptrue p0.d, vl4 +; CHECK-NEXT: fcmeq v2.4h, v1.4h, #0.0 +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: sshll v2.4s, v2.4h, #0 +; CHECK-NEXT: mov v0.h[0], v2.h[0] +; CHECK-NEXT: mov w8, v2.s[1] +; CHECK-NEXT: ldr q2, [x1] +; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: shl v0.4h, v0.4h, #15 +; CHECK-NEXT: cmlt v0.4h, v0.4h, #0 +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: sunpklo z0.d, z0.s +; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; CHECK-NEXT: uunpklo z0.d, z1.s +; CHECK-NEXT: st1h { z0.d }, p0, [z2.d] +; CHECK-NEXT: ret %vals = load <2 x half>, ptr %a %ptrs = load <2 x ptr>, ptr %b %mask = fcmp oeq <2 x half> %vals, zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll @@ -12,21 +12,20 @@ define void @masked_store_v2f16(ptr %ap, ptr %bp) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_store_v2f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr s1, [x0] -; CHECK-NEXT: movi d0, #0000000000000000 -; CHECK-NEXT: ldr s2, [x1] -; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: fcmeq v2.4h, v1.4h, v2.4h -; CHECK-NEXT: sshll v2.4s, v2.4h, #0 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov w9, v2.s[1] -; CHECK-NEXT: mov v0.h[0], w8 -; CHECK-NEXT: mov v0.h[1], w9 -; CHECK-NEXT: shl v0.4h, v0.4h, #15 -; CHECK-NEXT: cmlt v0.4h, v0.4h, #0 -; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 -; CHECK-NEXT: st1h { z1.h }, 
p0, [x1] -; CHECK-NEXT: ret +; CHECK-NEXT: ldr s1, [x0] +; CHECK-NEXT: movi d0, #0000000000000000 +; CHECK-NEXT: ldr s2, [x1] +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: fcmeq v2.4h, v1.4h, v2.4h +; CHECK-NEXT: sshll v2.4s, v2.4h, #0 +; CHECK-NEXT: mov v0.h[0], v2.h[0] +; CHECK-NEXT: mov w8, v2.s[1] +; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: shl v0.4h, v0.4h, #15 +; CHECK-NEXT: cmlt v0.4h, v0.4h, #0 +; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: st1h { z1.h }, p0, [x1] +; CHECK-NEXT: ret %a = load <2 x half>, ptr %ap %b = load <2 x half>, ptr %bp %mask = fcmp oeq <2 x half> %a, %b