diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp --- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp +++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp @@ -36,11 +36,16 @@ #include "AArch64ExpandImm.h" #include "AArch64InstrInfo.h" +#include "AArch64MachineFunctionInfo.h" +#include "AArch64RegisterInfo.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/SetVector.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Register.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" using namespace llvm; @@ -97,6 +102,12 @@ template bool visitAND(unsigned Opc, MachineInstr &MI); bool visitORR(MachineInstr &MI); + bool visitCopy(MachineInstr &MI); + + bool hasSameNumberOfBits(const TargetRegisterClass *FPRegClass, + const TargetRegisterClass *GPRRegClass); + bool isGPRegister(const TargetRegisterClass *RC); + bool isFPRRegister(const TargetRegisterClass *RC); bool runOnMachineFunction(MachineFunction &MF) override; StringRef getPassName() const override { @@ -155,6 +166,25 @@ return true; } +bool AArch64MIPeepholeOpt::hasSameNumberOfBits( + const TargetRegisterClass *FPRegClass, + const TargetRegisterClass *GPRRegClass) { + return (FPRegClass == &AArch64::FPR32RegClass && + GPRRegClass == &AArch64::GPR32RegClass) || + (FPRegClass == &AArch64::FPR64RegClass && + GPRRegClass == &AArch64::GPR64RegClass); +} + +bool AArch64MIPeepholeOpt::isFPRRegister(const TargetRegisterClass *RC) { + // If RC is of FPR128RegClass, it couldn't be copied from GPR register class; + // so FPR128RegClass is omitted. + return (RC == &AArch64::FPR32RegClass || RC == &AArch64::FPR64RegClass); +} + +bool AArch64MIPeepholeOpt::isGPRegister(const TargetRegisterClass *RC) { + return (RC == &AArch64::GPR32RegClass || RC == &AArch64::GPR64RegClass); +} + template bool AArch64MIPeepholeOpt::visitAND( unsigned Opc, MachineInstr &MI) { @@ -189,6 +219,156 @@ }); } +// Combine a load into GPR followed by a copy to FPR to a load into FPR +// directly. +// +// For example, +// %2:gpr64 = LDRXui %1:gpr64common, 1 :: (load (s64) from %ir.3) +// %3:fpr64 = COPY %2:gpr64 +// => +// %3:fpr64 = LDRDui %1:gpr64common, 1 :: (load (s64) from %ir.3) +bool AArch64MIPeepholeOpt::visitCopy(MachineInstr &MI) { + // Don't mess with bundled instructions (e.g., copy is + // the start of a bundled instruction). + if (MI.isBundled()) + return false; + + const MachineOperand &DstOperand = MI.getOperand(0); + const MachineOperand &SrcOperand = MI.getOperand(1); + if (!DstOperand.isReg() || !SrcOperand.isReg()) + return false; + + Register DstReg = DstOperand.getReg(); + Register SrcReg = SrcOperand.getReg(); + if (!DstReg.isVirtual() || !SrcReg.isVirtual() || + !MRI->hasOneNonDBGUse(SrcReg)) + return false; + + const TargetRegisterClass *DstRegClass = MRI->getRegClass(DstReg); + const TargetRegisterClass *SrcRegClass = MRI->getRegClass(SrcReg); + if (!isFPRRegister(DstRegClass) || !isGPRegister(SrcRegClass) || + !hasSameNumberOfBits(DstRegClass, SrcRegClass)) + return false; + + // FIXME: When SrcMI and MI are not in the same basic block but + // the two basic blocks dominates each other, it's still possible + // to detect load-fold barrier between SrcMI and MI. + auto *SrcMI = MRI->getUniqueVRegDef(SrcReg); + if (!SrcMI || !SrcMI->mayLoad() || SrcMI->isBundled() || + SrcMI->getParent() != MI.getParent()) + return false; + + // Bail if there is a load fold barrier between SrcMI and MI. + // + // Note, we find the (cross-register-bank) copy and follow the def to the load + // instead of finding load and follow the use to the copy for two reasons: + // 1. Keep DstReg allocation at the same position rather than advancing it. + // 2. [Minor] Avoid disrupting instruction iteration by erasing an instruction + // that's to be looked into by the work loop. + for (const MachineInstr &Instr : + make_range(std::next(MachineBasicBlock::iterator(SrcMI)), + MachineBasicBlock::iterator(MI))) { + if (Instr.isLoadFoldBarrier()) + return false; + } + + // Initializing SawStore to false is fine here, since it's already verified + // above that there are no load-fold-barriers between SrcMI and MI. + bool SawStore = false; + if (!SrcMI->isSafeToMove(nullptr, SawStore)) + return false; + + int64_t NewOpCode = -1; + + switch (SrcMI->getOpcode()) { + case AArch64::LDRXui: + NewOpCode = AArch64::LDRDui; + break; + case AArch64::LDRWui: + NewOpCode = AArch64::LDRSui; + break; + case AArch64::LDRXroW: + NewOpCode = AArch64::LDRDroW; + break; + case AArch64::LDRXroX: + NewOpCode = AArch64::LDRDroX; + break; + case AArch64::LDRWroW: + NewOpCode = AArch64::LDRSroW; + break; + case AArch64::LDRWroX: + NewOpCode = AArch64::LDRSroX; + break; + case AArch64::LDRWpre: + NewOpCode = AArch64::LDRSpre; + break; + case AArch64::LDRXpre: + NewOpCode = AArch64::LDRDpre; + break; + case AArch64::LDRWpost: + NewOpCode = AArch64::LDRSpost; + break; + case AArch64::LDRXpost: + NewOpCode = AArch64::LDRDpost; + break; + } + + MachineInstrBuilder MIB; + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + int ReplacedRegIndex = -1; + switch (NewOpCode) { + default: + break; + case AArch64::LDRDui: + case AArch64::LDRSui: { + MIB = BuildMI(MBB, MI, DL, TII->get(NewOpCode), DstReg) + .add(SrcMI->getOperand(1)) + .add(SrcMI->getOperand(2)); + ReplacedRegIndex = 0; + break; + } + case AArch64::LDRDroW: + case AArch64::LDRDroX: + case AArch64::LDRSroW: + case AArch64::LDRSroX: { + MIB = BuildMI(MBB, MI, DL, TII->get(NewOpCode), DstOperand.getReg()) + .add(SrcMI->getOperand(1)) + .add(SrcMI->getOperand(2)) + .add(SrcMI->getOperand(3)); + ReplacedRegIndex = 0; + break; + } + case AArch64::LDRSpre: + case AArch64::LDRDpre: + case AArch64::LDRSpost: + case AArch64::LDRDpost: { + MIB = BuildMI(MBB, MI, DL, TII->get(NewOpCode)) + .add(SrcMI->getOperand(0)) + .add(DstOperand) + .add(SrcMI->getOperand(2)) + .add(SrcMI->getOperand(3)); + ReplacedRegIndex = 1; + break; + } + } + + if (ReplacedRegIndex != -1) { + MIB.setMemRefs(SrcMI->memoperands()) + .setMIFlags(SrcMI->getFlags()) + .copyImplicitOps(*SrcMI); + Register ReplacedReg = SrcMI->getOperand(ReplacedRegIndex).getReg(); + // Replace 'ReplacedReg' with 'DstReg' so all debug uses of 'ReplacedReg' + // are updated. + MRI->replaceRegWith(ReplacedReg, DstReg); + SrcMI->eraseFromParent(); + MI.eraseFromParent(); + return true; + } + + return false; +} + bool AArch64MIPeepholeOpt::visitORR(MachineInstr &MI) { // Check this ORR comes from below zero-extend pattern. // @@ -534,6 +714,9 @@ {AArch64::ADDXri, AArch64::ADDSXri}, MI); break; + case AArch64::COPY: + Changed = visitCopy(MI); + break; } } } diff --git a/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll b/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll --- a/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll +++ b/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll @@ -223,16 +223,15 @@ define <32 x i8> @zext_v32i1(<32 x i1> %arg) { ; CHECK-LABEL: zext_v32i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr w8, [sp, #64] +; CHECK-NEXT: ldr s1, [sp, #64] ; CHECK-NEXT: fmov s0, w0 -; CHECK-NEXT: ldr w9, [sp] -; CHECK-NEXT: ldr w10, [sp, #8] -; CHECK-NEXT: fmov s1, w8 ; CHECK-NEXT: ldr w8, [sp, #72] +; CHECK-NEXT: ldr w9, [sp] ; CHECK-NEXT: mov.b v0[1], w1 -; CHECK-NEXT: movi.16b v2, #1 +; CHECK-NEXT: ldr w10, [sp, #8] ; CHECK-NEXT: mov.b v1[1], w8 ; CHECK-NEXT: ldr w8, [sp, #80] +; CHECK-NEXT: movi.16b v2, #1 ; CHECK-NEXT: mov.b v0[2], w2 ; CHECK-NEXT: mov.b v1[2], w8 ; CHECK-NEXT: ldr w8, [sp, #88] @@ -290,66 +289,65 @@ define <32 x i8> @sext_v32i1(<32 x i1> %arg) { ; CHECK-LABEL: sext_v32i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr w8, [sp, #64] -; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: ldr s0, [sp, #64] +; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: ldr w8, [sp, #72] ; CHECK-NEXT: ldr w9, [sp] +; CHECK-NEXT: mov.b v1[1], w1 ; CHECK-NEXT: ldr w10, [sp, #8] -; CHECK-NEXT: fmov s1, w8 -; CHECK-NEXT: ldr w8, [sp, #72] -; CHECK-NEXT: mov.b v0[1], w1 -; CHECK-NEXT: mov.b v1[1], w8 +; CHECK-NEXT: mov.b v0[1], w8 ; CHECK-NEXT: ldr w8, [sp, #80] -; CHECK-NEXT: mov.b v0[2], w2 -; CHECK-NEXT: mov.b v1[2], w8 +; CHECK-NEXT: mov.b v1[2], w2 +; CHECK-NEXT: mov.b v0[2], w8 ; CHECK-NEXT: ldr w8, [sp, #88] -; CHECK-NEXT: mov.b v0[3], w3 -; CHECK-NEXT: mov.b v1[3], w8 +; CHECK-NEXT: mov.b v1[3], w3 +; CHECK-NEXT: mov.b v0[3], w8 ; CHECK-NEXT: ldr w8, [sp, #96] -; CHECK-NEXT: mov.b v0[4], w4 -; CHECK-NEXT: mov.b v1[4], w8 +; CHECK-NEXT: mov.b v1[4], w4 +; CHECK-NEXT: mov.b v0[4], w8 ; CHECK-NEXT: ldr w8, [sp, #104] -; CHECK-NEXT: mov.b v0[5], w5 -; CHECK-NEXT: mov.b v1[5], w8 +; CHECK-NEXT: mov.b v1[5], w5 +; CHECK-NEXT: mov.b v0[5], w8 ; CHECK-NEXT: ldr w8, [sp, #112] -; CHECK-NEXT: mov.b v0[6], w6 -; CHECK-NEXT: mov.b v1[6], w8 +; CHECK-NEXT: mov.b v1[6], w6 +; CHECK-NEXT: mov.b v0[6], w8 ; CHECK-NEXT: ldr w8, [sp, #120] -; CHECK-NEXT: mov.b v0[7], w7 -; CHECK-NEXT: mov.b v1[7], w8 +; CHECK-NEXT: mov.b v1[7], w7 +; CHECK-NEXT: mov.b v0[7], w8 ; CHECK-NEXT: ldr w8, [sp, #128] -; CHECK-NEXT: mov.b v0[8], w9 +; CHECK-NEXT: mov.b v1[8], w9 ; CHECK-NEXT: ldr w9, [sp, #16] -; CHECK-NEXT: mov.b v1[8], w8 +; CHECK-NEXT: mov.b v0[8], w8 ; CHECK-NEXT: ldr w8, [sp, #136] -; CHECK-NEXT: mov.b v0[9], w10 +; CHECK-NEXT: mov.b v1[9], w10 ; CHECK-NEXT: ldr w10, [sp, #24] -; CHECK-NEXT: mov.b v1[9], w8 +; CHECK-NEXT: mov.b v0[9], w8 ; CHECK-NEXT: ldr w8, [sp, #144] -; CHECK-NEXT: mov.b v0[10], w9 +; CHECK-NEXT: mov.b v1[10], w9 ; CHECK-NEXT: ldr w9, [sp, #32] -; CHECK-NEXT: mov.b v1[10], w8 +; CHECK-NEXT: mov.b v0[10], w8 ; CHECK-NEXT: ldr w8, [sp, #152] -; CHECK-NEXT: mov.b v0[11], w10 +; CHECK-NEXT: mov.b v1[11], w10 ; CHECK-NEXT: ldr w10, [sp, #40] -; CHECK-NEXT: mov.b v1[11], w8 +; CHECK-NEXT: mov.b v0[11], w8 ; CHECK-NEXT: ldr w8, [sp, #160] -; CHECK-NEXT: mov.b v0[12], w9 +; CHECK-NEXT: mov.b v1[12], w9 ; CHECK-NEXT: ldr w9, [sp, #48] -; CHECK-NEXT: mov.b v1[12], w8 +; CHECK-NEXT: mov.b v0[12], w8 ; CHECK-NEXT: ldr w8, [sp, #168] -; CHECK-NEXT: mov.b v0[13], w10 +; CHECK-NEXT: mov.b v1[13], w10 ; CHECK-NEXT: ldr w10, [sp, #56] -; CHECK-NEXT: mov.b v1[13], w8 +; CHECK-NEXT: mov.b v0[13], w8 ; CHECK-NEXT: ldr w8, [sp, #176] -; CHECK-NEXT: mov.b v0[14], w9 -; CHECK-NEXT: mov.b v1[14], w8 +; CHECK-NEXT: mov.b v1[14], w9 +; CHECK-NEXT: mov.b v0[14], w8 ; CHECK-NEXT: ldr w8, [sp, #184] -; CHECK-NEXT: mov.b v0[15], w10 -; CHECK-NEXT: mov.b v1[15], w8 -; CHECK-NEXT: shl.16b v0, v0, #7 +; CHECK-NEXT: mov.b v1[15], w10 +; CHECK-NEXT: mov.b v0[15], w8 ; CHECK-NEXT: shl.16b v1, v1, #7 -; CHECK-NEXT: cmlt.16b v0, v0, #0 -; CHECK-NEXT: cmlt.16b v1, v1, #0 +; CHECK-NEXT: shl.16b v2, v0, #7 +; CHECK-NEXT: cmlt.16b v0, v1, #0 +; CHECK-NEXT: cmlt.16b v1, v2, #0 ; CHECK-NEXT: ret %res = sext <32 x i1> %arg to <32 x i8> ret <32 x i8> %res @@ -358,131 +356,128 @@ define <64 x i8> @zext_v64i1(<64 x i1> %arg) { ; CHECK-LABEL: zext_v64i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr w8, [sp, #320] -; CHECK-NEXT: fmov s0, w0 -; CHECK-NEXT: ldr w9, [sp, #64] -; CHECK-NEXT: ldr w10, [sp, #192] -; CHECK-NEXT: fmov s3, w8 +; CHECK-NEXT: ldr s0, [sp, #320] +; CHECK-NEXT: fmov s4, w0 ; CHECK-NEXT: ldr w8, [sp, #328] -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: ldr w9, [sp, #200] -; CHECK-NEXT: fmov s2, w10 -; CHECK-NEXT: ldr w10, [sp, #336] -; CHECK-NEXT: mov.b v3[1], w8 -; CHECK-NEXT: ldr w8, [sp, #72] -; CHECK-NEXT: mov.b v0[1], w1 -; CHECK-NEXT: ldr w11, [sp, #352] -; CHECK-NEXT: mov.b v2[1], w9 +; CHECK-NEXT: ldr s1, [sp, #64] +; CHECK-NEXT: ldr w9, [sp, #72] +; CHECK-NEXT: ldr s2, [sp, #192] +; CHECK-NEXT: ldr w10, [sp, #200] +; CHECK-NEXT: mov.b v1[1], w9 ; CHECK-NEXT: ldr w9, [sp, #80] -; CHECK-NEXT: mov.b v1[1], w8 -; CHECK-NEXT: ldr w8, [sp, #344] -; CHECK-NEXT: mov.b v3[2], w10 -; CHECK-NEXT: ldr w10, [sp, #208] -; CHECK-NEXT: mov.b v0[2], w2 -; CHECK-NEXT: ldr w12, [sp, #368] -; CHECK-NEXT: ldr w13, [sp, #384] -; CHECK-NEXT: mov.b v1[2], w9 -; CHECK-NEXT: ldr w9, [sp, #360] -; CHECK-NEXT: mov.b v2[2], w10 +; CHECK-NEXT: mov.b v0[1], w8 +; CHECK-NEXT: ldr w8, [sp, #336] +; CHECK-NEXT: mov.b v2[1], w10 ; CHECK-NEXT: ldr w10, [sp, #88] -; CHECK-NEXT: mov.b v3[3], w8 -; CHECK-NEXT: ldr w8, [sp, #216] -; CHECK-NEXT: mov.b v0[3], w3 -; CHECK-NEXT: ldr w14, [sp, #400] +; CHECK-NEXT: mov.b v4[1], w1 +; CHECK-NEXT: ldr w11, [sp, #96] +; CHECK-NEXT: mov.b v1[2], w9 +; CHECK-NEXT: ldr w9, [sp, #208] +; CHECK-NEXT: mov.b v0[2], w8 +; CHECK-NEXT: ldr w8, [sp, #344] +; CHECK-NEXT: ldr w12, [sp, #104] +; CHECK-NEXT: mov.b v2[2], w9 +; CHECK-NEXT: ldr w9, [sp, #352] +; CHECK-NEXT: mov.b v4[2], w2 +; CHECK-NEXT: ldr w13, [sp, #112] ; CHECK-NEXT: mov.b v1[3], w10 -; CHECK-NEXT: ldr w10, [sp, #376] -; CHECK-NEXT: mov.b v2[3], w8 -; CHECK-NEXT: ldr w8, [sp, #96] -; CHECK-NEXT: mov.b v3[4], w11 +; CHECK-NEXT: ldr w10, [sp, #216] +; CHECK-NEXT: mov.b v0[3], w8 +; CHECK-NEXT: ldr w8, [sp, #360] +; CHECK-NEXT: ldr w14, [sp, #120] +; CHECK-NEXT: mov.b v2[3], w10 +; CHECK-NEXT: ldr w10, [sp, #368] +; CHECK-NEXT: mov.b v4[3], w3 +; CHECK-NEXT: ldr w15, [sp, #128] +; CHECK-NEXT: mov.b v1[4], w11 ; CHECK-NEXT: ldr w11, [sp, #224] -; CHECK-NEXT: mov.b v0[4], w4 -; CHECK-NEXT: ldr w15, [sp, #416] -; CHECK-NEXT: mov.b v1[4], w8 -; CHECK-NEXT: ldr w8, [sp, #392] +; CHECK-NEXT: mov.b v0[4], w9 +; CHECK-NEXT: ldr w9, [sp, #376] +; CHECK-NEXT: ldr w16, [sp, #136] ; CHECK-NEXT: mov.b v2[4], w11 -; CHECK-NEXT: ldr w11, [sp, #104] -; CHECK-NEXT: mov.b v3[5], w9 -; CHECK-NEXT: ldr w9, [sp, #232] -; CHECK-NEXT: mov.b v0[5], w5 -; CHECK-NEXT: ldr w16, [sp, #432] -; CHECK-NEXT: mov.b v1[5], w11 -; CHECK-NEXT: ldr w11, [sp, #408] -; CHECK-NEXT: mov.b v2[5], w9 -; CHECK-NEXT: ldr w9, [sp, #112] -; CHECK-NEXT: mov.b v3[6], w12 -; CHECK-NEXT: ldr w12, [sp, #240] -; CHECK-NEXT: mov.b v0[6], w6 -; CHECK-NEXT: mov.b v1[6], w9 +; CHECK-NEXT: ldr w11, [sp, #384] +; CHECK-NEXT: mov.b v4[4], w4 +; CHECK-NEXT: mov.b v1[5], w12 +; CHECK-NEXT: ldr w12, [sp, #232] +; CHECK-NEXT: mov.b v0[5], w8 +; CHECK-NEXT: ldr w8, [sp, #392] +; CHECK-NEXT: movi.16b v5, #1 +; CHECK-NEXT: mov.b v2[5], w12 +; CHECK-NEXT: ldr w12, [sp, #400] +; CHECK-NEXT: mov.b v4[5], w5 +; CHECK-NEXT: mov.b v1[6], w13 +; CHECK-NEXT: ldr w13, [sp, #240] +; CHECK-NEXT: mov.b v0[6], w10 +; CHECK-NEXT: ldr w10, [sp, #408] +; CHECK-NEXT: mov.b v2[6], w13 +; CHECK-NEXT: ldr w13, [sp, #416] +; CHECK-NEXT: mov.b v4[6], w6 +; CHECK-NEXT: mov.b v1[7], w14 +; CHECK-NEXT: ldr w14, [sp, #248] +; CHECK-NEXT: mov.b v0[7], w9 ; CHECK-NEXT: ldr w9, [sp, #424] -; CHECK-NEXT: mov.b v2[6], w12 -; CHECK-NEXT: ldr w12, [sp, #120] -; CHECK-NEXT: mov.b v3[7], w10 -; CHECK-NEXT: ldr w10, [sp, #248] -; CHECK-NEXT: mov.b v0[7], w7 -; CHECK-NEXT: mov.b v1[7], w12 -; CHECK-NEXT: ldr w12, [sp] -; CHECK-NEXT: mov.b v2[7], w10 -; CHECK-NEXT: ldr w10, [sp, #128] -; CHECK-NEXT: mov.b v3[8], w13 -; CHECK-NEXT: ldr w13, [sp, #256] -; CHECK-NEXT: mov.b v0[8], w12 -; CHECK-NEXT: ldr w12, [sp, #440] -; CHECK-NEXT: mov.b v1[8], w10 -; CHECK-NEXT: ldr w10, [sp, #8] -; CHECK-NEXT: mov.b v2[8], w13 -; CHECK-NEXT: ldr w13, [sp, #136] -; CHECK-NEXT: mov.b v3[9], w8 -; CHECK-NEXT: ldr w8, [sp, #264] -; CHECK-NEXT: mov.b v0[9], w10 -; CHECK-NEXT: ldr w10, [sp, #272] -; CHECK-NEXT: mov.b v1[9], w13 -; CHECK-NEXT: ldr w13, [sp, #16] -; CHECK-NEXT: mov.b v2[9], w8 -; CHECK-NEXT: ldr w8, [sp, #144] -; CHECK-NEXT: mov.b v3[10], w14 -; CHECK-NEXT: ldr w14, [sp, #280] -; CHECK-NEXT: mov.b v0[10], w13 -; CHECK-NEXT: ldr w13, [sp, #296] -; CHECK-NEXT: mov.b v1[10], w8 +; CHECK-NEXT: mov.b v2[7], w14 +; CHECK-NEXT: ldr w14, [sp, #432] +; CHECK-NEXT: mov.b v4[7], w7 +; CHECK-NEXT: mov.b v1[8], w15 +; CHECK-NEXT: ldr w15, [sp, #256] +; CHECK-NEXT: mov.b v0[8], w11 +; CHECK-NEXT: ldr w11, [sp] +; CHECK-NEXT: mov.b v2[8], w15 +; CHECK-NEXT: ldr w15, [sp, #440] +; CHECK-NEXT: mov.b v4[8], w11 +; CHECK-NEXT: ldr w11, [sp, #144] +; CHECK-NEXT: mov.b v1[9], w16 +; CHECK-NEXT: ldr w16, [sp, #264] +; CHECK-NEXT: mov.b v0[9], w8 +; CHECK-NEXT: ldr w8, [sp, #8] +; CHECK-NEXT: mov.b v2[9], w16 +; CHECK-NEXT: ldr w16, [sp, #272] +; CHECK-NEXT: mov.b v4[9], w8 +; CHECK-NEXT: ldr w8, [sp, #16] +; CHECK-NEXT: mov.b v1[10], w11 +; CHECK-NEXT: ldr w11, [sp, #152] +; CHECK-NEXT: mov.b v0[10], w12 +; CHECK-NEXT: ldr w12, [sp, #280] +; CHECK-NEXT: mov.b v2[10], w16 +; CHECK-NEXT: mov.b v4[10], w8 ; CHECK-NEXT: ldr w8, [sp, #24] -; CHECK-NEXT: mov.b v2[10], w10 -; CHECK-NEXT: ldr w10, [sp, #152] -; CHECK-NEXT: mov.b v3[11], w11 -; CHECK-NEXT: ldr w11, [sp, #288] -; CHECK-NEXT: mov.b v0[11], w8 +; CHECK-NEXT: mov.b v1[11], w11 +; CHECK-NEXT: ldr w11, [sp, #160] +; CHECK-NEXT: mov.b v0[11], w10 +; CHECK-NEXT: ldr w10, [sp, #288] +; CHECK-NEXT: mov.b v2[11], w12 +; CHECK-NEXT: mov.b v4[11], w8 ; CHECK-NEXT: ldr w8, [sp, #32] -; CHECK-NEXT: mov.b v1[11], w10 -; CHECK-NEXT: ldr w10, [sp, #160] -; CHECK-NEXT: mov.b v2[11], w14 -; CHECK-NEXT: mov.b v3[12], w15 -; CHECK-NEXT: mov.b v0[12], w8 +; CHECK-NEXT: mov.b v1[12], w11 +; CHECK-NEXT: ldr w11, [sp, #168] +; CHECK-NEXT: mov.b v0[12], w13 +; CHECK-NEXT: mov.b v2[12], w10 +; CHECK-NEXT: ldr w10, [sp, #296] +; CHECK-NEXT: mov.b v4[12], w8 ; CHECK-NEXT: ldr w8, [sp, #40] -; CHECK-NEXT: mov.b v1[12], w10 -; CHECK-NEXT: ldr w10, [sp, #168] -; CHECK-NEXT: mov.b v2[12], w11 -; CHECK-NEXT: ldr w11, [sp, #312] -; CHECK-NEXT: mov.b v3[13], w9 +; CHECK-NEXT: mov.b v1[13], w11 +; CHECK-NEXT: ldr w11, [sp, #176] +; CHECK-NEXT: mov.b v0[13], w9 ; CHECK-NEXT: ldr w9, [sp, #304] -; CHECK-NEXT: mov.b v0[13], w8 +; CHECK-NEXT: mov.b v2[13], w10 +; CHECK-NEXT: ldr w10, [sp, #184] +; CHECK-NEXT: mov.b v4[13], w8 ; CHECK-NEXT: ldr w8, [sp, #48] -; CHECK-NEXT: mov.b v1[13], w10 -; CHECK-NEXT: ldr w10, [sp, #176] -; CHECK-NEXT: mov.b v2[13], w13 -; CHECK-NEXT: mov.b v3[14], w16 -; CHECK-NEXT: mov.b v0[14], w8 -; CHECK-NEXT: ldr w8, [sp, #56] -; CHECK-NEXT: mov.b v1[14], w10 +; CHECK-NEXT: mov.b v1[14], w11 +; CHECK-NEXT: mov.b v0[14], w14 ; CHECK-NEXT: mov.b v2[14], w9 -; CHECK-NEXT: ldr w9, [sp, #184] -; CHECK-NEXT: movi.16b v4, #1 -; CHECK-NEXT: mov.b v0[15], w8 -; CHECK-NEXT: mov.b v1[15], w9 -; CHECK-NEXT: mov.b v2[15], w11 -; CHECK-NEXT: mov.b v3[15], w12 -; CHECK-NEXT: and.16b v0, v0, v4 -; CHECK-NEXT: and.16b v1, v1, v4 -; CHECK-NEXT: and.16b v2, v2, v4 -; CHECK-NEXT: and.16b v3, v3, v4 +; CHECK-NEXT: ldr w9, [sp, #312] +; CHECK-NEXT: mov.b v4[14], w8 +; CHECK-NEXT: ldr w8, [sp, #56] +; CHECK-NEXT: mov.b v1[15], w10 +; CHECK-NEXT: mov.b v0[15], w15 +; CHECK-NEXT: mov.b v2[15], w9 +; CHECK-NEXT: mov.b v4[15], w8 +; CHECK-NEXT: and.16b v1, v1, v5 +; CHECK-NEXT: and.16b v3, v0, v5 +; CHECK-NEXT: and.16b v2, v2, v5 +; CHECK-NEXT: and.16b v0, v4, v5 ; CHECK-NEXT: ret %res = zext <64 x i1> %arg to <64 x i8> ret <64 x i8> %res @@ -491,133 +486,130 @@ define <64 x i8> @sext_v64i1(<64 x i1> %arg) { ; CHECK-LABEL: sext_v64i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr w8, [sp, #320] +; CHECK-NEXT: ldr s0, [sp, #320] ; CHECK-NEXT: fmov s3, w0 -; CHECK-NEXT: ldr w9, [sp, #64] -; CHECK-NEXT: ldr w10, [sp, #192] -; CHECK-NEXT: fmov s0, w8 ; CHECK-NEXT: ldr w8, [sp, #328] -; CHECK-NEXT: fmov s1, w9 +; CHECK-NEXT: ldr s1, [sp, #64] ; CHECK-NEXT: ldr w9, [sp, #72] -; CHECK-NEXT: fmov s2, w10 -; CHECK-NEXT: ldr w10, [sp, #80] -; CHECK-NEXT: mov.b v0[1], w8 -; CHECK-NEXT: ldr w8, [sp, #200] +; CHECK-NEXT: ldr s2, [sp, #192] +; CHECK-NEXT: ldr w10, [sp, #200] ; CHECK-NEXT: mov.b v1[1], w9 -; CHECK-NEXT: ldr w9, [sp, #336] +; CHECK-NEXT: ldr w9, [sp, #80] +; CHECK-NEXT: mov.b v0[1], w8 +; CHECK-NEXT: ldr w8, [sp, #336] +; CHECK-NEXT: mov.b v2[1], w10 +; CHECK-NEXT: ldr w10, [sp, #88] ; CHECK-NEXT: mov.b v3[1], w1 -; CHECK-NEXT: ldr w11, [sp, #88] -; CHECK-NEXT: mov.b v2[1], w8 -; CHECK-NEXT: ldr w8, [sp, #344] -; CHECK-NEXT: mov.b v0[2], w9 +; CHECK-NEXT: ldr w11, [sp, #96] +; CHECK-NEXT: mov.b v1[2], w9 ; CHECK-NEXT: ldr w9, [sp, #208] -; CHECK-NEXT: mov.b v1[2], w10 -; CHECK-NEXT: ldr w10, [sp, #352] -; CHECK-NEXT: mov.b v3[2], w2 -; CHECK-NEXT: ldr w12, [sp, #96] +; CHECK-NEXT: mov.b v0[2], w8 +; CHECK-NEXT: ldr w8, [sp, #344] +; CHECK-NEXT: ldr w12, [sp, #104] ; CHECK-NEXT: mov.b v2[2], w9 -; CHECK-NEXT: ldr w9, [sp, #360] +; CHECK-NEXT: ldr w9, [sp, #352] +; CHECK-NEXT: mov.b v3[2], w2 +; CHECK-NEXT: ldr w13, [sp, #112] +; CHECK-NEXT: mov.b v1[3], w10 +; CHECK-NEXT: ldr w10, [sp, #216] ; CHECK-NEXT: mov.b v0[3], w8 -; CHECK-NEXT: ldr w8, [sp, #216] -; CHECK-NEXT: mov.b v1[3], w11 -; CHECK-NEXT: ldr w13, [sp, #104] +; CHECK-NEXT: ldr w8, [sp, #360] +; CHECK-NEXT: ldr w14, [sp, #120] +; CHECK-NEXT: mov.b v2[3], w10 +; CHECK-NEXT: ldr w10, [sp, #368] ; CHECK-NEXT: mov.b v3[3], w3 -; CHECK-NEXT: ldr w11, [sp, #368] -; CHECK-NEXT: mov.b v2[3], w8 -; CHECK-NEXT: ldr w14, [sp, #112] -; CHECK-NEXT: mov.b v0[4], w10 -; CHECK-NEXT: ldr w10, [sp, #224] -; CHECK-NEXT: mov.b v1[4], w12 -; CHECK-NEXT: ldr w8, [sp, #376] +; CHECK-NEXT: ldr w15, [sp, #128] +; CHECK-NEXT: mov.b v1[4], w11 +; CHECK-NEXT: ldr w11, [sp, #224] +; CHECK-NEXT: mov.b v0[4], w9 +; CHECK-NEXT: ldr w9, [sp, #376] +; CHECK-NEXT: ldr w16, [sp, #136] +; CHECK-NEXT: mov.b v2[4], w11 +; CHECK-NEXT: ldr w11, [sp, #384] ; CHECK-NEXT: mov.b v3[4], w4 -; CHECK-NEXT: ldr w15, [sp, #120] -; CHECK-NEXT: mov.b v2[4], w10 -; CHECK-NEXT: ldr w12, [sp, #384] -; CHECK-NEXT: mov.b v0[5], w9 -; CHECK-NEXT: ldr w9, [sp, #232] -; CHECK-NEXT: mov.b v1[5], w13 -; CHECK-NEXT: ldr w16, [sp, #128] +; CHECK-NEXT: mov.b v1[5], w12 +; CHECK-NEXT: ldr w12, [sp, #232] +; CHECK-NEXT: mov.b v0[5], w8 +; CHECK-NEXT: ldr w8, [sp, #392] +; CHECK-NEXT: mov.b v2[5], w12 +; CHECK-NEXT: ldr w12, [sp, #400] ; CHECK-NEXT: mov.b v3[5], w5 -; CHECK-NEXT: ldr w10, [sp, #392] -; CHECK-NEXT: mov.b v2[5], w9 -; CHECK-NEXT: ldr w13, [sp, #400] -; CHECK-NEXT: mov.b v0[6], w11 -; CHECK-NEXT: ldr w11, [sp, #240] -; CHECK-NEXT: mov.b v1[6], w14 -; CHECK-NEXT: ldr w9, [sp, #408] +; CHECK-NEXT: mov.b v1[6], w13 +; CHECK-NEXT: ldr w13, [sp, #240] +; CHECK-NEXT: mov.b v0[6], w10 +; CHECK-NEXT: ldr w10, [sp, #408] +; CHECK-NEXT: mov.b v2[6], w13 +; CHECK-NEXT: ldr w13, [sp, #416] ; CHECK-NEXT: mov.b v3[6], w6 -; CHECK-NEXT: ldr w14, [sp, #416] -; CHECK-NEXT: mov.b v2[6], w11 -; CHECK-NEXT: ldr w11, [sp, #424] -; CHECK-NEXT: mov.b v0[7], w8 -; CHECK-NEXT: ldr w8, [sp, #248] -; CHECK-NEXT: mov.b v1[7], w15 -; CHECK-NEXT: ldr w15, [sp, #432] +; CHECK-NEXT: mov.b v1[7], w14 +; CHECK-NEXT: ldr w14, [sp, #248] +; CHECK-NEXT: mov.b v0[7], w9 +; CHECK-NEXT: ldr w9, [sp, #424] +; CHECK-NEXT: mov.b v2[7], w14 +; CHECK-NEXT: ldr w14, [sp, #432] ; CHECK-NEXT: mov.b v3[7], w7 -; CHECK-NEXT: mov.b v2[7], w8 -; CHECK-NEXT: ldr w8, [sp] -; CHECK-NEXT: mov.b v0[8], w12 -; CHECK-NEXT: ldr w12, [sp, #256] -; CHECK-NEXT: mov.b v1[8], w16 -; CHECK-NEXT: ldr w16, [sp, #440] -; CHECK-NEXT: mov.b v3[8], w8 -; CHECK-NEXT: ldr w8, [sp, #136] -; CHECK-NEXT: mov.b v2[8], w12 -; CHECK-NEXT: ldr w12, [sp, #8] -; CHECK-NEXT: mov.b v0[9], w10 -; CHECK-NEXT: ldr w10, [sp, #264] -; CHECK-NEXT: mov.b v1[9], w8 -; CHECK-NEXT: ldr w8, [sp, #272] -; CHECK-NEXT: mov.b v3[9], w12 -; CHECK-NEXT: ldr w12, [sp, #144] -; CHECK-NEXT: mov.b v2[9], w10 -; CHECK-NEXT: ldr w10, [sp, #16] -; CHECK-NEXT: mov.b v0[10], w13 -; CHECK-NEXT: ldr w13, [sp, #280] -; CHECK-NEXT: mov.b v1[10], w12 -; CHECK-NEXT: ldr w12, [sp, #152] -; CHECK-NEXT: mov.b v3[10], w10 -; CHECK-NEXT: ldr w10, [sp, #160] -; CHECK-NEXT: mov.b v2[10], w8 +; CHECK-NEXT: mov.b v1[8], w15 +; CHECK-NEXT: ldr w15, [sp, #256] +; CHECK-NEXT: mov.b v0[8], w11 +; CHECK-NEXT: ldr w11, [sp] +; CHECK-NEXT: mov.b v2[8], w15 +; CHECK-NEXT: ldr w15, [sp, #440] +; CHECK-NEXT: mov.b v3[8], w11 +; CHECK-NEXT: ldr w11, [sp, #144] +; CHECK-NEXT: mov.b v1[9], w16 +; CHECK-NEXT: ldr w16, [sp, #264] +; CHECK-NEXT: mov.b v0[9], w8 +; CHECK-NEXT: ldr w8, [sp, #8] +; CHECK-NEXT: mov.b v2[9], w16 +; CHECK-NEXT: ldr w16, [sp, #272] +; CHECK-NEXT: mov.b v3[9], w8 +; CHECK-NEXT: ldr w8, [sp, #16] +; CHECK-NEXT: mov.b v1[10], w11 +; CHECK-NEXT: ldr w11, [sp, #152] +; CHECK-NEXT: mov.b v0[10], w12 +; CHECK-NEXT: ldr w12, [sp, #280] +; CHECK-NEXT: mov.b v2[10], w16 +; CHECK-NEXT: mov.b v3[10], w8 ; CHECK-NEXT: ldr w8, [sp, #24] -; CHECK-NEXT: mov.b v0[11], w9 -; CHECK-NEXT: ldr w9, [sp, #288] -; CHECK-NEXT: mov.b v1[11], w12 -; CHECK-NEXT: ldr w12, [sp, #296] +; CHECK-NEXT: mov.b v1[11], w11 +; CHECK-NEXT: ldr w11, [sp, #160] +; CHECK-NEXT: mov.b v0[11], w10 +; CHECK-NEXT: ldr w10, [sp, #288] +; CHECK-NEXT: mov.b v2[11], w12 ; CHECK-NEXT: mov.b v3[11], w8 ; CHECK-NEXT: ldr w8, [sp, #32] -; CHECK-NEXT: mov.b v2[11], w13 -; CHECK-NEXT: mov.b v0[12], w14 -; CHECK-NEXT: mov.b v1[12], w10 -; CHECK-NEXT: ldr w10, [sp, #168] +; CHECK-NEXT: mov.b v1[12], w11 +; CHECK-NEXT: ldr w11, [sp, #168] +; CHECK-NEXT: mov.b v0[12], w13 +; CHECK-NEXT: mov.b v2[12], w10 +; CHECK-NEXT: ldr w10, [sp, #296] ; CHECK-NEXT: mov.b v3[12], w8 ; CHECK-NEXT: ldr w8, [sp, #40] -; CHECK-NEXT: mov.b v2[12], w9 +; CHECK-NEXT: mov.b v1[13], w11 +; CHECK-NEXT: ldr w11, [sp, #176] +; CHECK-NEXT: mov.b v0[13], w9 ; CHECK-NEXT: ldr w9, [sp, #304] -; CHECK-NEXT: mov.b v0[13], w11 -; CHECK-NEXT: ldr w11, [sp, #312] -; CHECK-NEXT: mov.b v1[13], w10 -; CHECK-NEXT: ldr w10, [sp, #176] +; CHECK-NEXT: mov.b v2[13], w10 +; CHECK-NEXT: ldr w10, [sp, #184] ; CHECK-NEXT: mov.b v3[13], w8 ; CHECK-NEXT: ldr w8, [sp, #48] -; CHECK-NEXT: mov.b v2[13], w12 -; CHECK-NEXT: mov.b v0[14], w15 -; CHECK-NEXT: mov.b v1[14], w10 -; CHECK-NEXT: ldr w10, [sp, #184] +; CHECK-NEXT: mov.b v1[14], w11 +; CHECK-NEXT: mov.b v0[14], w14 +; CHECK-NEXT: mov.b v2[14], w9 +; CHECK-NEXT: ldr w9, [sp, #312] ; CHECK-NEXT: mov.b v3[14], w8 ; CHECK-NEXT: ldr w8, [sp, #56] -; CHECK-NEXT: mov.b v2[14], w9 -; CHECK-NEXT: mov.b v0[15], w16 ; CHECK-NEXT: mov.b v1[15], w10 +; CHECK-NEXT: mov.b v0[15], w15 +; CHECK-NEXT: mov.b v2[15], w9 ; CHECK-NEXT: mov.b v3[15], w8 -; CHECK-NEXT: mov.b v2[15], w11 -; CHECK-NEXT: shl.16b v4, v0, #7 ; CHECK-NEXT: shl.16b v1, v1, #7 -; CHECK-NEXT: shl.16b v3, v3, #7 +; CHECK-NEXT: shl.16b v4, v0, #7 ; CHECK-NEXT: shl.16b v2, v2, #7 -; CHECK-NEXT: cmlt.16b v0, v3, #0 +; CHECK-NEXT: shl.16b v0, v3, #7 ; CHECK-NEXT: cmlt.16b v1, v1, #0 ; CHECK-NEXT: cmlt.16b v2, v2, #0 +; CHECK-NEXT: cmlt.16b v0, v0, #0 ; CHECK-NEXT: cmlt.16b v3, v4, #0 ; CHECK-NEXT: ret %res = sext <64 x i1> %arg to <64 x i8> diff --git a/llvm/test/CodeGen/AArch64/arm64-vmul.ll b/llvm/test/CodeGen/AArch64/arm64-vmul.ll --- a/llvm/test/CodeGen/AArch64/arm64-vmul.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vmul.ll @@ -217,10 +217,8 @@ define i32 @sqdmulh_1s(i32* %A, i32* %B) nounwind { ; CHECK-LABEL: sqdmulh_1s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr w8, [x0] -; CHECK-NEXT: ldr w9, [x1] -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 +; CHECK-NEXT: ldr s0, [x0] +; CHECK-NEXT: ldr s1, [x1] ; CHECK-NEXT: sqdmulh s0, s0, s1 ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret @@ -291,10 +289,8 @@ define i32 @sqrdmulh_1s(i32* %A, i32* %B) nounwind { ; CHECK-LABEL: sqrdmulh_1s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr w8, [x0] -; CHECK-NEXT: ldr w9, [x1] -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 +; CHECK-NEXT: ldr s0, [x0] +; CHECK-NEXT: ldr s1, [x1] ; CHECK-NEXT: sqrdmulh s0, s0, s1 ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/dp1.ll b/llvm/test/CodeGen/AArch64/dp1.ll --- a/llvm/test/CodeGen/AArch64/dp1.ll +++ b/llvm/test/CodeGen/AArch64/dp1.ll @@ -241,8 +241,7 @@ ; CHECK-GISEL: // %bb.0: ; CHECK-GISEL-NEXT: adrp x8, :got:var64 ; CHECK-GISEL-NEXT: ldr x8, [x8, :got_lo12:var64] -; CHECK-GISEL-NEXT: ldr x9, [x8] -; CHECK-GISEL-NEXT: fmov d0, x9 +; CHECK-GISEL-NEXT: ldr d0, [x8] ; CHECK-GISEL-NEXT: cnt v0.8b, v0.8b ; CHECK-GISEL-NEXT: uaddlv h0, v0.8b ; CHECK-GISEL-NEXT: fmov w9, s0 diff --git a/llvm/test/CodeGen/AArch64/neon-extadd.ll b/llvm/test/CodeGen/AArch64/neon-extadd.ll --- a/llvm/test/CodeGen/AArch64/neon-extadd.ll +++ b/llvm/test/CodeGen/AArch64/neon-extadd.ll @@ -532,69 +532,63 @@ define <16 x i32> @i12(<16 x i12> %s0, <16 x i12> %s1) { ; CHECK-LABEL: i12: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr w12, [sp, #32] -; CHECK-NEXT: fmov s5, w0 -; CHECK-NEXT: ldr w15, [sp] +; CHECK-NEXT: ldr w10, [sp, #40] ; CHECK-NEXT: fmov s4, w4 -; CHECK-NEXT: ldr w14, [sp, #40] -; CHECK-NEXT: fmov s0, w12 -; CHECK-NEXT: ldr w16, [sp, #48] -; CHECK-NEXT: fmov s1, w15 -; CHECK-NEXT: ldr w15, [sp, #8] -; CHECK-NEXT: ldr w18, [sp, #16] -; CHECK-NEXT: mov v0.h[1], w14 -; CHECK-NEXT: ldr w17, [sp, #56] -; CHECK-NEXT: mov v1.h[1], w15 -; CHECK-NEXT: ldr w0, [sp, #24] -; CHECK-NEXT: mov v5.h[1], w1 -; CHECK-NEXT: ldr w13, [sp, #64] -; CHECK-NEXT: ldr w1, [sp, #128] -; CHECK-NEXT: mov v0.h[2], w16 -; CHECK-NEXT: ldr w16, [sp, #96] -; CHECK-NEXT: mov v1.h[2], w18 -; CHECK-NEXT: ldr w10, [sp, #72] -; CHECK-NEXT: mov v5.h[2], w2 -; CHECK-NEXT: ldr w2, [sp, #160] -; CHECK-NEXT: mov v4.h[1], w5 -; CHECK-NEXT: ldr w5, [sp, #168] -; CHECK-NEXT: mov v0.h[3], w17 -; CHECK-NEXT: ldr w14, [sp, #104] -; CHECK-NEXT: mov v1.h[3], w0 -; CHECK-NEXT: ldr w18, [sp, #136] -; CHECK-NEXT: fmov s6, w1 +; CHECK-NEXT: ldr s0, [sp, #32] +; CHECK-NEXT: fmov s5, w0 +; CHECK-NEXT: ldr w13, [sp, #8] +; CHECK-NEXT: ldr s1, [sp] +; CHECK-NEXT: mov v0.h[1], w10 +; CHECK-NEXT: ldr w12, [sp, #48] +; CHECK-NEXT: ldr w16, [sp, #16] +; CHECK-NEXT: mov v1.h[1], w13 +; CHECK-NEXT: ldr w14, [sp, #56] +; CHECK-NEXT: ldr w18, [sp, #24] +; CHECK-NEXT: mov v0.h[2], w12 +; CHECK-NEXT: ldr w11, [sp, #72] +; CHECK-NEXT: ldr w13, [sp, #104] +; CHECK-NEXT: mov v1.h[2], w16 +; CHECK-NEXT: ldr w17, [sp, #136] +; CHECK-NEXT: ldr s6, [sp, #128] +; CHECK-NEXT: mov v0.h[3], w14 +; CHECK-NEXT: ldr s7, [sp, #96] +; CHECK-NEXT: ldr s16, [sp, #64] +; CHECK-NEXT: mov v1.h[3], w18 +; CHECK-NEXT: ldr w18, [sp, #168] ; CHECK-NEXT: ldr w0, [sp, #176] -; CHECK-NEXT: fmov s7, w16 -; CHECK-NEXT: fmov s16, w13 ; CHECK-NEXT: ushll v2.4s, v0.4h, #0 ; CHECK-NEXT: ldr w9, [sp, #80] ; CHECK-NEXT: movi v0.4s, #15, msl #8 ; CHECK-NEXT: ldr w12, [sp, #112] ; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: ldr w17, [sp, #144] -; CHECK-NEXT: mov v6.h[1], w18 -; CHECK-NEXT: ldr w4, [sp, #184] -; CHECK-NEXT: mov v7.h[1], w14 +; CHECK-NEXT: ldr w15, [sp, #144] +; CHECK-NEXT: mov v4.h[1], w5 +; CHECK-NEXT: ldr w16, [sp, #184] +; CHECK-NEXT: mov v5.h[1], w1 ; CHECK-NEXT: ldr w8, [sp, #88] ; CHECK-NEXT: and v3.16b, v2.16b, v0.16b -; CHECK-NEXT: ldr w11, [sp, #120] +; CHECK-NEXT: ldr w10, [sp, #120] ; CHECK-NEXT: and v2.16b, v1.16b, v0.16b -; CHECK-NEXT: ldr w15, [sp, #152] -; CHECK-NEXT: fmov s1, w2 -; CHECK-NEXT: mov v16.h[1], w10 +; CHECK-NEXT: ldr s1, [sp, #160] +; CHECK-NEXT: mov v6.h[1], w17 +; CHECK-NEXT: ldr w14, [sp, #152] +; CHECK-NEXT: mov v7.h[1], w13 +; CHECK-NEXT: mov v1.h[1], w18 +; CHECK-NEXT: mov v16.h[1], w11 ; CHECK-NEXT: mov v4.h[2], w6 -; CHECK-NEXT: mov v1.h[1], w5 -; CHECK-NEXT: mov v6.h[2], w17 +; CHECK-NEXT: mov v5.h[2], w2 +; CHECK-NEXT: mov v1.h[2], w0 +; CHECK-NEXT: mov v6.h[2], w15 ; CHECK-NEXT: mov v7.h[2], w12 ; CHECK-NEXT: mov v16.h[2], w9 -; CHECK-NEXT: mov v1.h[2], w0 +; CHECK-NEXT: mov v1.h[3], w16 ; CHECK-NEXT: mov v4.h[3], w7 ; CHECK-NEXT: mov v5.h[3], w3 -; CHECK-NEXT: mov v6.h[3], w15 -; CHECK-NEXT: mov v1.h[3], w4 -; CHECK-NEXT: mov v7.h[3], w11 +; CHECK-NEXT: mov v6.h[3], w14 +; CHECK-NEXT: mov v7.h[3], w10 ; CHECK-NEXT: mov v16.h[3], w8 -; CHECK-NEXT: ushll v4.4s, v4.4h, #0 ; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: ushll v4.4s, v4.4h, #0 ; CHECK-NEXT: ushll v5.4s, v5.4h, #0 ; CHECK-NEXT: ushll v6.4s, v6.4h, #0 ; CHECK-NEXT: and v17.16b, v1.16b, v0.16b