diff --git a/llvm/lib/Target/ARM/ARM.h b/llvm/lib/Target/ARM/ARM.h
--- a/llvm/lib/Target/ARM/ARM.h
+++ b/llvm/lib/Target/ARM/ARM.h
@@ -58,6 +58,7 @@
 Pass *createMVEGatherScatterLoweringPass();
 FunctionPass *createARMSLSHardeningPass();
 FunctionPass *createARMIndirectThunks();
+Pass *createMVELaneInterleavingPass();
 
 void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
                                   ARMAsmPrinter &AP);
@@ -76,6 +77,7 @@
 void initializeMVETailPredicationPass(PassRegistry &);
 void initializeMVEGatherScatterLoweringPass(PassRegistry &);
 void initializeARMSLSHardeningPass(PassRegistry &);
+void initializeMVELaneInterleavingPass(PassRegistry &);
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
--- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -102,6 +102,7 @@
   initializeARMBlockPlacementPass(Registry);
   initializeMVEGatherScatterLoweringPass(Registry);
   initializeARMSLSHardeningPass(Registry);
+  initializeMVELaneInterleavingPass(Registry);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -416,6 +417,7 @@
   }));
 
   addPass(createMVEGatherScatterLoweringPass());
+  addPass(createMVELaneInterleavingPass());
 
   TargetPassConfig::addIRPasses();
diff --git a/llvm/lib/Target/ARM/CMakeLists.txt b/llvm/lib/Target/ARM/CMakeLists.txt
--- a/llvm/lib/Target/ARM/CMakeLists.txt
+++ b/llvm/lib/Target/ARM/CMakeLists.txt
@@ -56,6 +56,7 @@
   ARMTargetTransformInfo.cpp
   MLxExpansionPass.cpp
   MVEGatherScatterLowering.cpp
+  MVELaneInterleavingPass.cpp
   MVETailPredication.cpp
   MVEVPTBlockPass.cpp
   MVETPAndVPTOptimisationsPass.cpp
diff --git a/llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp b/llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp
@@ -0,0 +1,328 @@
+//===- MVELaneInterleavingPass.cpp - Interleave for MVE instructions -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass interleaves around sext/zext/trunc instructions. MVE does not have
+// a single sext/zext or trunc instruction that takes the bottom half of a
+// vector and extends it to a full width, like NEON has with MOVL. Instead it
+// is expected that this happens through top/bottom instructions. So the MVE
+// equivalent VMOVLT/B instructions take either the even or odd elements of the
+// input and extend them to the larger type, producing a vector with half the
+// number of elements, each of double the bitwidth. As there is no simple
+// instruction, we often have to turn sext/zext/trunc into a series of lane
+// moves (or stack loads/stores, which we do not do yet).
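+//
+// For example, given a v8i16 input <a0, a1, a2, a3, a4, a5, a6, a7>,
+// VMOVLB.s16 sign extends the even (bottom) lanes a0, a2, a4, a6 to i32,
+// while VMOVLT.s16 sign extends the odd (top) lanes a1, a3, a5, a7.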
+//
+// This pass takes vector code that starts at truncs, looks for interconnected
+// blobs of operations that end with sext/zext (or constants/splats) of the
+// form:
+//   %sa = sext v8i16 %a to v8i32
+//   %sb = sext v8i16 %b to v8i32
+//   %add = add v8i32 %sa, %sb
+//   %r = trunc %add to v8i16
+// And adds shuffles to allow the use of VMOVL/VMOVN instructions:
+//   %sha = shuffle v8i16 %a, undef, <0, 2, 4, 6, 1, 3, 5, 7>
+//   %sa = sext v8i16 %sha to v8i32
+//   %shb = shuffle v8i16 %b, undef, <0, 2, 4, 6, 1, 3, 5, 7>
+//   %sb = sext v8i16 %shb to v8i32
+//   %add = add v8i32 %sa, %sb
+//   %r = trunc %add to v8i16
+//   %shr = shuffle v8i16 %r, undef, <0, 4, 1, 5, 2, 6, 3, 7>
+// Which can then be split and lowered to MVE instructions efficiently:
+//   %sa_b = VMOVLB.s16 %a
+//   %sa_t = VMOVLT.s16 %a
+//   %sb_b = VMOVLB.s16 %b
+//   %sb_t = VMOVLT.s16 %b
+//   %add_b = VADD.i32 %sa_b, %sb_b
+//   %add_t = VADD.i32 %sa_t, %sb_t
+//   %r = VMOVNT.i16 %add_b, %add_t
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMSubtarget.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsARM.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include <algorithm>
+#include <cassert>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mve-laneinterleave"
+
+cl::opt<bool> EnableInterleave(
+    "enable-mve-interleave", cl::Hidden, cl::init(true),
+    cl::desc("Enable interleave MVE vector operation lowering"));
+
+namespace {
+
+class MVELaneInterleaving : public FunctionPass {
+public:
+  static char ID; // Pass identification, replacement for typeid
+
+  explicit MVELaneInterleaving() : FunctionPass(ID) {
+    initializeMVELaneInterleavingPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override;
+
+  StringRef getPassName() const override { return "MVE lane interleaving"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<TargetPassConfig>();
+    FunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // end anonymous namespace
+
+char MVELaneInterleaving::ID = 0;
+
+INITIALIZE_PASS(MVELaneInterleaving, DEBUG_TYPE, "MVE lane interleaving", false,
+                false)
+
+Pass *llvm::createMVELaneInterleavingPass() {
+  return new MVELaneInterleaving();
+}
+
+static bool isProfitableToInterleave(SmallSetVector<Instruction *, 4> &Exts,
+                                     SmallSetVector<Instruction *, 4> &Truncs) {
+  // This is not always beneficial to transform. Exts can be incorporated into
+  // loads, Truncs can be folded into stores.
+  // Truncs are usually the same number of instructions,
+  //  VSTRH.32(A);VSTRH.32(B) vs VSTRH.16(VMOVNT A, B) with interleaving
+  // Exts are unfortunately more instructions in the general case:
+  //  A=VLDRH.32; B=VLDRH.32;
+  // vs with interleaving:
+  //  T=VLDRH.16; A=VMOVLB T; B=VMOVLT T
+  // But those VMOVLs may be folded into a VMULL.
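+  // For example, a VMOVLB of each operand feeding a VMUL.i32 can be folded
+  // into a single VMULLB.s16, making the extra extends essentially free.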
+
+  // But expensive extends/truncs are always good to remove.
+  for (auto *E : Exts)
+    if (!isa<LoadInst>(E->getOperand(0))) {
+      LLVM_DEBUG(dbgs() << "Beneficial due to " << *E << "\n");
+      return true;
+    }
+  for (auto *T : Truncs)
+    if (T->hasOneUse() && !isa<StoreInst>(*T->user_begin())) {
+      LLVM_DEBUG(dbgs() << "Beneficial due to " << *T << "\n");
+      return true;
+    }
+
+  // Otherwise, we know we have ext(load)s. See if all of the Exts feed a mul,
+  // which can become a VMULL. This is a simple heuristic and certainly not
+  // perfect.
+  for (auto *E : Exts) {
+    if (!E->hasOneUse() ||
+        cast<Instruction>(*E->user_begin())->getOpcode() != Instruction::Mul) {
+      LLVM_DEBUG(dbgs() << "Not beneficial due to " << *E << "\n");
+      return false;
+    }
+  }
+  return true;
+}
+
+static bool tryInterleave(Instruction *Start,
+                          SmallPtrSetImpl<Instruction *> &Visited) {
+  LLVM_DEBUG(dbgs() << "tryInterleave from " << *Start << "\n");
+  auto *VT = cast<FixedVectorType>(Start->getType());
+
+  if (!isa<Instruction>(Start->getOperand(0)))
+    return false;
+
+  // Look for connected operations starting from Exts, terminating at Truncs.
+  std::vector<Instruction *> Worklist;
+  Worklist.push_back(Start);
+  Worklist.push_back(cast<Instruction>(Start->getOperand(0)));
+
+  SmallSetVector<Instruction *, 4> Truncs;
+  SmallSetVector<Instruction *, 4> Exts;
+  SmallSetVector<Instruction *, 4> Ops;
+
+  while (!Worklist.empty()) {
+    Instruction *I = Worklist.back();
+    Worklist.pop_back();
+
+    switch (I->getOpcode()) {
+    // Truncs
+    case Instruction::Trunc:
+      if (Truncs.count(I))
+        continue;
+      Truncs.insert(I);
+      Visited.insert(I);
+      break;
+
+    // Extend leaves
+    case Instruction::SExt:
+    case Instruction::ZExt:
+      if (Exts.count(I))
+        continue;
+      for (auto *Use : I->users())
+        Worklist.push_back(cast<Instruction>(Use));
+      Exts.insert(I);
+      break;
+
+    // Binary/ternary ops
+    case Instruction::Add:
+    case Instruction::Sub:
+    case Instruction::Mul:
+    case Instruction::AShr:
+    case Instruction::LShr:
+    case Instruction::Shl:
+    case Instruction::ICmp:
+    case Instruction::Select:
+      if (Ops.count(I))
+        continue;
+      Ops.insert(I);
+
+      for (Use &Op : I->operands()) {
+        if (isa<Instruction>(Op))
+          Worklist.push_back(cast<Instruction>(&Op));
+        else
+          return false;
+      }
+
+      for (auto *Use : I->users())
+        Worklist.push_back(cast<Instruction>(Use));
+      break;
+
+    default:
+      LLVM_DEBUG(dbgs() << "  Unhandled instruction: " << *I << "\n");
+      return false;
+    }
+  }
+
+  LLVM_DEBUG({
+    dbgs() << "Found group:\n  Exts:";
+    for (auto *I : Exts)
+      dbgs() << "  " << *I << "\n";
+    dbgs() << "  Ops:";
+    for (auto *I : Ops)
+      dbgs() << "  " << *I << "\n";
+    dbgs() << "  Truncs:";
+    for (auto *I : Truncs)
+      dbgs() << "  " << *I << "\n";
+  });
+
+  assert(!Truncs.empty() && "Expected some truncs");
+  assert(!Exts.empty() && "Expected some leaves");
+
+  // Check types
+  unsigned NumElts = VT->getNumElements();
+  unsigned BaseElts = VT->getScalarSizeInBits() == 16
+                          ? 8
+                          : (VT->getScalarSizeInBits() == 8 ? 16 : 0);
+  if (BaseElts == 0 || NumElts % BaseElts != 0) {
+    LLVM_DEBUG(dbgs() << "  Type is unsupported\n");
+    return false;
+  }
+  if (Start->getOperand(0)->getType()->getScalarSizeInBits() !=
+      VT->getScalarSizeInBits() * 2) {
+    LLVM_DEBUG(dbgs() << "  Type not double sized\n");
+    return false;
+  }
+  for (Instruction *I : Exts)
+    if (I->getOperand(0)->getType() != VT) {
+      LLVM_DEBUG(dbgs() << "  Wrong type on " << *I << "\n");
+      return false;
+    }
+  for (Instruction *I : Truncs)
+    if (I->getType() != VT) {
+      LLVM_DEBUG(dbgs() << "  Wrong type on " << *I << "\n");
+      return false;
+    }
+
+  // Check that it looks beneficial
+  if (!isProfitableToInterleave(Exts, Truncs))
+    return false;
+
+  // Create new shuffles around the extends / truncs / other leaves.
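+  // TruncMask below is the inverse permutation of LeafMask, so along each
+  // ext -> ops -> trunc chain the two shuffles cancel out, which is what
+  // lets the shuffled extends lower to VMOVLB/VMOVLT and the shuffled
+  // truncs to VMOVNB/VMOVNT.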
+  IRBuilder<> Builder(Start);
+
+  SmallVector<int, 16> LeafMask;
+  SmallVector<int, 16> TruncMask;
+  // LeafMask : 0, 2, 4, 6, 1, 3, 5, 7   8, 10, 12, 14, 9, 11, 13, 15
+  // TruncMask: 0, 4, 1, 5, 2, 6, 3, 7   8, 12, 9, 13, 10, 14, 11, 15
+  for (unsigned Base = 0; Base < NumElts; Base += BaseElts) {
+    for (unsigned i = 0; i < BaseElts / 2; i++)
+      LeafMask.push_back(Base + i * 2);
+    for (unsigned i = 0; i < BaseElts / 2; i++)
+      LeafMask.push_back(Base + i * 2 + 1);
+  }
+  for (unsigned Base = 0; Base < NumElts; Base += BaseElts) {
+    for (unsigned i = 0; i < BaseElts / 2; i++) {
+      TruncMask.push_back(Base + i);
+      TruncMask.push_back(Base + i + BaseElts / 2);
+    }
+  }
+
+  for (Instruction *I : Exts) {
+    LLVM_DEBUG(dbgs() << "Replacing ext " << *I << "\n");
+    Builder.SetInsertPoint(I);
+    Value *Shuffle = Builder.CreateShuffleVector(I->getOperand(0), LeafMask);
+    bool Sext = isa<SExtInst>(I);
+    Value *Ext = Sext ? Builder.CreateSExt(Shuffle, I->getType())
+                      : Builder.CreateZExt(Shuffle, I->getType());
+    I->replaceAllUsesWith(Ext);
+    LLVM_DEBUG(dbgs() << "  with " << *Shuffle << "\n");
+  }
+
+  for (Instruction *I : Truncs) {
+    LLVM_DEBUG(dbgs() << "Replacing trunc " << *I << "\n");
+
+    Builder.SetInsertPoint(I->getParent(), ++I->getIterator());
+    Value *Shuf = Builder.CreateShuffleVector(I, TruncMask);
+    I->replaceAllUsesWith(Shuf);
+    cast<Instruction>(Shuf)->setOperand(0, I);
+
+    LLVM_DEBUG(dbgs() << "  with " << *Shuf << "\n");
+  }
+
+  return true;
+}
+
+bool MVELaneInterleaving::runOnFunction(Function &F) {
+  if (!EnableInterleave)
+    return false;
+  auto &TPC = getAnalysis<TargetPassConfig>();
+  auto &TM = TPC.getTM<TargetMachine>();
+  auto *ST = &TM.getSubtarget<ARMSubtarget>(F);
+  if (!ST->hasMVEIntegerOps())
+    return false;
+
+  bool Changed = false;
+
+  SmallPtrSet<Instruction *, 16> Visited;
+  for (Instruction &I : reverse(instructions(F))) {
+    if (I.getType()->isVectorTy() &&
+        (isa<TruncInst>(I) || isa<FPTruncInst>(I)) && !Visited.count(&I))
+      Changed |= tryInterleave(&I, Visited);
+  }
+
+  return Changed;
+}
diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll
--- a/llvm/test/CodeGen/ARM/O3-pipeline.ll
+++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll
@@ -10,6 +10,7 @@
 ; CHECK-NEXT:    Dominator Tree Construction
 ; CHECK-NEXT:    Natural Loop Information
 ; CHECK-NEXT:    MVE gather/scatter lowering
+; CHECK-NEXT:    MVE lane interleaving
 ; CHECK-NEXT:    Module Verifier
 ; CHECK-NEXT:    Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT:    Canonicalize natural loops
diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll
--- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll
@@ -76,34 +76,22 @@
 define arm_aapcs_vfpcc <8 x i16> @loads_i16(<8 x i16> *%A, <8 x i16> *%B, <8 x i16> *%C) {
 ; CHECK-LABEL: loads_i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.s32 q0, [r1]
-; CHECK-NEXT:    vldrh.s32 q1, [r0]
-; CHECK-NEXT:    vldrh.s32 q2, [r0, #8]
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
-; CHECK-NEXT:    vldrh.u32 q1, [r2]
-; CHECK-NEXT:    vneg.s32 q1, q1
-; CHECK-NEXT:    vshl.s32 q1, q0, q1
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov.16 q0[0], r3
-; CHECK-NEXT:    vmov r3, s5
-; CHECK-NEXT:    vmov.16 q0[1], r3
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    vmov.16 q0[2], r3
-; CHECK-NEXT:    vmov r3, s7
-; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
-; CHECK-NEXT:    vmov.16 q0[3], r3
-; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vldrh.u32 q2, [r2, #8]
-; CHECK-NEXT:    vneg.s32 q2, q2
-; CHECK-NEXT:    vshl.s32 q1, q1, q2
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:
vmov.16 q0[5], r0 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vmovlb.s16 q1, q0 +; CHECK-NEXT: vmovlb.s16 q3, q2 +; CHECK-NEXT: vmovlt.s16 q0, q0 +; CHECK-NEXT: vmovlt.s16 q2, q2 +; CHECK-NEXT: vadd.i32 q0, q2, q0 +; CHECK-NEXT: vldrw.u32 q2, [r2] +; CHECK-NEXT: vadd.i32 q1, q3, q1 +; CHECK-NEXT: vmovlt.u16 q3, q2 +; CHECK-NEXT: vneg.s32 q3, q3 +; CHECK-NEXT: vshl.s32 q3, q0, q3 +; CHECK-NEXT: vmovlb.u16 q0, q2 +; CHECK-NEXT: vneg.s32 q0, q0 +; CHECK-NEXT: vshl.s32 q0, q1, q0 +; CHECK-NEXT: vmovnt.i32 q0, q3 ; CHECK-NEXT: bx lr entry: %a = load <8 x i16>, <8 x i16> *%A, align 4 @@ -121,50 +109,22 @@ define arm_aapcs_vfpcc <16 x i8> @loads_i8(<16 x i8> *%A, <16 x i8> *%B, <16 x i8> *%C) { ; CHECK-LABEL: loads_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.s16 q0, [r1] -; CHECK-NEXT: vldrb.s16 q1, [r0] -; CHECK-NEXT: vldrb.s16 q2, [r0, #8] -; CHECK-NEXT: vadd.i16 q0, q1, q0 -; CHECK-NEXT: vldrb.u16 q1, [r2] -; CHECK-NEXT: vneg.s16 q1, q1 -; CHECK-NEXT: vshl.s16 q1, q0, q1 -; CHECK-NEXT: vmov.u16 r3, q1[0] -; CHECK-NEXT: vmov.8 q0[0], r3 -; CHECK-NEXT: vmov.u16 r3, q1[1] -; CHECK-NEXT: vmov.8 q0[1], r3 -; CHECK-NEXT: vmov.u16 r3, q1[2] -; CHECK-NEXT: vmov.8 q0[2], r3 -; CHECK-NEXT: vmov.u16 r3, q1[3] -; CHECK-NEXT: vmov.8 q0[3], r3 -; CHECK-NEXT: vmov.u16 r3, q1[4] -; CHECK-NEXT: vmov.8 q0[4], r3 -; CHECK-NEXT: vmov.u16 r3, q1[5] -; CHECK-NEXT: vmov.8 q0[5], r3 -; CHECK-NEXT: vmov.u16 r3, q1[6] -; CHECK-NEXT: vmov.8 q0[6], r3 -; CHECK-NEXT: vmov.u16 r3, q1[7] -; CHECK-NEXT: vldrb.s16 q1, [r1, #8] -; CHECK-NEXT: vmov.8 q0[7], r3 -; CHECK-NEXT: vadd.i16 q1, q2, q1 -; CHECK-NEXT: vldrb.u16 q2, [r2, #8] -; CHECK-NEXT: vneg.s16 q2, q2 -; CHECK-NEXT: vshl.s16 q1, q1, q2 -; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.8 q0[8], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.8 q0[9], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.8 q0[10], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.8 q0[11], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vmov.8 q0[12], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.8 q0[13], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vmov.8 q0[14], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.8 q0[15], r0 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vmovlb.s8 q1, q0 +; CHECK-NEXT: vmovlb.s8 q3, q2 +; CHECK-NEXT: vmovlt.s8 q0, q0 +; CHECK-NEXT: vmovlt.s8 q2, q2 +; CHECK-NEXT: vadd.i16 q0, q2, q0 +; CHECK-NEXT: vldrw.u32 q2, [r2] +; CHECK-NEXT: vadd.i16 q1, q3, q1 +; CHECK-NEXT: vmovlt.u8 q3, q2 +; CHECK-NEXT: vneg.s16 q3, q3 +; CHECK-NEXT: vshl.s16 q3, q0, q3 +; CHECK-NEXT: vmovlb.u8 q0, q2 +; CHECK-NEXT: vneg.s16 q0, q0 +; CHECK-NEXT: vshl.s16 q0, q1, q0 +; CHECK-NEXT: vmovnt.i16 q0, q3 ; CHECK-NEXT: bx lr entry: %a = load <16 x i8>, <16 x i8> *%A, align 4 diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll --- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll +++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll @@ -96,52 +96,11 @@ define arm_aapcs_vfpcc <8 x i16> @ext_add_trunc_v8i16(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: ext_add_trunc_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.u16 r1, q1[0] -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; 
CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.u16 r1, q1[1] -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 -; CHECK-NEXT: vmov.u16 r1, q0[4] -; CHECK-NEXT: vadd.i32 q3, q3, q2 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.16 q2[0], r0 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov.u16 r0, q0[6] -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vmov.u16 r1, q1[4] -; CHECK-NEXT: vmov q4[2], q4[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.u16 r1, q1[5] -; CHECK-NEXT: vmov q4[3], q4[1], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 -; CHECK-NEXT: vadd.i32 q0, q3, q4 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmov q0, q2 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vrev32.16 q3, q0 +; CHECK-NEXT: vrev32.16 q2, q1 +; CHECK-NEXT: vadd.i32 q2, q3, q2 +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: vmovnt.i32 q0, q2 ; CHECK-NEXT: bx lr entry: %sa = sext <8 x i16> %a to <8 x i32> @@ -154,108 +113,11 @@ define arm_aapcs_vfpcc <16 x i8> @ext_add_trunc_v16i8(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: ext_add_trunc_v16i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.u8 r0, q1[0] -; CHECK-NEXT: vmov.16 q2[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[1] -; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[2] -; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[3] -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[4] -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov.u8 r0, q1[5] -; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[6] -; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov.u8 r0, q1[7] -; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmov.u8 r0, q0[0] -; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[1] -; CHECK-NEXT: vmov.16 q3[1], r0 -; CHECK-NEXT: vmov.u8 r0, q0[2] -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov.u8 r0, q0[3] -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov.u8 r0, q0[4] -; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov.u8 r0, q0[5] -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov.u8 r0, q0[6] -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov.u8 r0, q0[7] -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vadd.i16 q3, q3, q2 -; CHECK-NEXT: vmov.u16 r0, q3[0] -; CHECK-NEXT: vmov.8 q2[0], r0 -; CHECK-NEXT: vmov.u16 r0, q3[1] -; CHECK-NEXT: vmov.8 q2[1], r0 -; CHECK-NEXT: vmov.u16 r0, q3[2] -; CHECK-NEXT: vmov.8 q2[2], r0 -; CHECK-NEXT: vmov.u16 r0, q3[3] -; CHECK-NEXT: vmov.8 q2[3], r0 -; CHECK-NEXT: vmov.u16 r0, q3[4] -; CHECK-NEXT: vmov.8 q2[4], r0 -; CHECK-NEXT: vmov.u16 r0, q3[5] -; CHECK-NEXT: vmov.8 q2[5], r0 -; CHECK-NEXT: vmov.u16 r0, q3[6] -; CHECK-NEXT: vmov.8 q2[6], r0 -; CHECK-NEXT: vmov.u16 r0, q3[7] -; CHECK-NEXT: vmov.8 q2[7], r0 -; CHECK-NEXT: vmov.u8 r0, q0[8] -; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[9] -; 
CHECK-NEXT: vmov.16 q3[1], r0 -; CHECK-NEXT: vmov.u8 r0, q0[10] -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov.u8 r0, q0[11] -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov.u8 r0, q0[12] -; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov.u8 r0, q0[13] -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov.u8 r0, q0[14] -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov.u8 r0, q1[8] -; CHECK-NEXT: vmov.16 q4[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[9] -; CHECK-NEXT: vmov.16 q4[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[10] -; CHECK-NEXT: vmov.16 q4[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[11] -; CHECK-NEXT: vmov.16 q4[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[12] -; CHECK-NEXT: vmov.16 q4[4], r0 -; CHECK-NEXT: vmov.u8 r0, q1[13] -; CHECK-NEXT: vmov.16 q4[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[14] -; CHECK-NEXT: vmov.16 q4[6], r0 -; CHECK-NEXT: vmov.u8 r0, q1[15] -; CHECK-NEXT: vmov.16 q4[7], r0 -; CHECK-NEXT: vmov.u8 r0, q0[15] -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vadd.i16 q0, q3, q4 -; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vmov.8 q2[8], r0 -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.8 q2[9], r0 -; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.8 q2[10], r0 -; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vmov.8 q2[11], r0 -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.8 q2[12], r0 -; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.8 q2[13], r0 -; CHECK-NEXT: vmov.u16 r0, q0[6] -; CHECK-NEXT: vmov.8 q2[14], r0 -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: vmov.8 q2[15], r0 -; CHECK-NEXT: vmov q0, q2 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vrev16.8 q3, q0 +; CHECK-NEXT: vrev16.8 q2, q1 +; CHECK-NEXT: vadd.i16 q2, q3, q2 +; CHECK-NEXT: vadd.i16 q0, q0, q1 +; CHECK-NEXT: vmovnt.i16 q0, q2 ; CHECK-NEXT: bx lr entry: %sa = sext <16 x i8> %a to <16 x i16> @@ -268,95 +130,19 @@ define arm_aapcs_vfpcc <16 x i16> @ext_add_trunc_v16i16(<16 x i16> %a, <16 x i16> %b) { ; CHECK-LABEL: ext_add_trunc_v16i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vmov.u16 r0, q2[2] -; CHECK-NEXT: vmov.u16 r1, q2[0] -; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q2[3] -; CHECK-NEXT: vmov.u16 r1, q2[1] -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q4[2] -; CHECK-NEXT: vmov.u16 r1, q4[0] -; CHECK-NEXT: vmov q5[2], q5[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q4[3] -; CHECK-NEXT: vmov.u16 r1, q4[1] -; CHECK-NEXT: vmov q5[3], q5[1], r1, r0 -; CHECK-NEXT: vmov.u16 r1, q4[4] -; CHECK-NEXT: vadd.i32 q5, q5, q0 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov r0, s21 -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov r0, s22 -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov r0, s23 -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.u16 r0, q4[6] -; CHECK-NEXT: vmov q5[2], q5[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q2[6] -; CHECK-NEXT: vmov.u16 r1, q2[4] -; CHECK-NEXT: vmov q6[2], q6[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q2[7] -; CHECK-NEXT: vmov.u16 r1, q2[5] -; CHECK-NEXT: vmov q6[3], q6[1], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q4[7] -; CHECK-NEXT: vmov.u16 r1, q4[5] -; CHECK-NEXT: vmov q5[3], q5[1], r1, r0 -; CHECK-NEXT: vmov.u16 r1, q3[0] -; CHECK-NEXT: vadd.i32 q2, q5, q6 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov r0, s11 -; 
CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vmov.u16 r0, q3[2] -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q3[3] -; CHECK-NEXT: vmov.u16 r1, q3[1] -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.u16 r1, q1[0] -; CHECK-NEXT: vmov q4[2], q4[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.u16 r1, q1[1] -; CHECK-NEXT: vmov q4[3], q4[1], r1, r0 -; CHECK-NEXT: vmov.u16 r1, q1[4] -; CHECK-NEXT: vadd.i32 q4, q4, q2 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov.16 q2[0], r0 -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov r0, s19 -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vmov q4[2], q4[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q3[6] -; CHECK-NEXT: vmov.u16 r1, q3[4] -; CHECK-NEXT: vmov q5[2], q5[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q3[7] -; CHECK-NEXT: vmov.u16 r1, q3[5] -; CHECK-NEXT: vmov q5[3], q5[1], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.u16 r1, q1[5] -; CHECK-NEXT: vmov q4[3], q4[1], r1, r0 -; CHECK-NEXT: vadd.i32 q1, q4, q5 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmov q1, q2 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vrev32.16 q5, q0 +; CHECK-NEXT: vrev32.16 q4, q2 +; CHECK-NEXT: vadd.i32 q0, q0, q2 +; CHECK-NEXT: vadd.i32 q4, q5, q4 +; CHECK-NEXT: vmovnt.i32 q0, q4 +; CHECK-NEXT: vrev32.16 q4, q1 +; CHECK-NEXT: vrev32.16 q2, q3 +; CHECK-NEXT: vadd.i32 q1, q1, q3 +; CHECK-NEXT: vadd.i32 q2, q4, q2 +; CHECK-NEXT: vmovnt.i32 q1, q2 +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr entry: %sa = sext <16 x i16> %a to <16 x i32> @@ -369,207 +155,19 @@ define arm_aapcs_vfpcc <32 x i8> @ext_add_trunc_v32i8(<32 x i8> %a, <32 x i8> %b) { ; CHECK-LABEL: ext_add_trunc_v32i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vmov.u8 r0, q2[0] -; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.u8 r0, q2[1] -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov.u8 r0, q2[2] -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov.u8 r0, q2[3] -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.u8 r0, q2[4] -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.u8 r0, q2[5] -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov.u8 r0, q2[6] -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u8 r0, q2[7] -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vmov.u8 r0, q4[0] -; CHECK-NEXT: vmov.16 q5[0], r0 -; CHECK-NEXT: vmov.u8 r0, q4[1] -; CHECK-NEXT: vmov.16 q5[1], r0 -; CHECK-NEXT: vmov.u8 r0, q4[2] -; CHECK-NEXT: vmov.16 q5[2], r0 -; CHECK-NEXT: vmov.u8 r0, q4[3] -; CHECK-NEXT: vmov.16 q5[3], r0 -; CHECK-NEXT: vmov.u8 r0, q4[4] -; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vmov.u8 r0, q4[5] -; CHECK-NEXT: vmov.16 q5[5], r0 -; CHECK-NEXT: vmov.u8 r0, q4[6] -; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.u8 r0, q4[7] -; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vadd.i16 q5, q5, q0 -; CHECK-NEXT: vmov.u16 r0, q5[0] -; CHECK-NEXT: vmov.8 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q5[1] -; CHECK-NEXT: vmov.8 q0[1], r0 -; CHECK-NEXT: 
vmov.u16 r0, q5[2] -; CHECK-NEXT: vmov.8 q0[2], r0 -; CHECK-NEXT: vmov.u16 r0, q5[3] -; CHECK-NEXT: vmov.8 q0[3], r0 -; CHECK-NEXT: vmov.u16 r0, q5[4] -; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov.u16 r0, q5[5] -; CHECK-NEXT: vmov.8 q0[5], r0 -; CHECK-NEXT: vmov.u16 r0, q5[6] -; CHECK-NEXT: vmov.8 q0[6], r0 -; CHECK-NEXT: vmov.u16 r0, q5[7] -; CHECK-NEXT: vmov.8 q0[7], r0 -; CHECK-NEXT: vmov.u8 r0, q4[8] -; CHECK-NEXT: vmov.16 q5[0], r0 -; CHECK-NEXT: vmov.u8 r0, q4[9] -; CHECK-NEXT: vmov.16 q5[1], r0 -; CHECK-NEXT: vmov.u8 r0, q4[10] -; CHECK-NEXT: vmov.16 q5[2], r0 -; CHECK-NEXT: vmov.u8 r0, q4[11] -; CHECK-NEXT: vmov.16 q5[3], r0 -; CHECK-NEXT: vmov.u8 r0, q4[12] -; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vmov.u8 r0, q4[13] -; CHECK-NEXT: vmov.16 q5[5], r0 -; CHECK-NEXT: vmov.u8 r0, q4[14] -; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.u8 r0, q2[8] -; CHECK-NEXT: vmov.16 q6[0], r0 -; CHECK-NEXT: vmov.u8 r0, q2[9] -; CHECK-NEXT: vmov.16 q6[1], r0 -; CHECK-NEXT: vmov.u8 r0, q2[10] -; CHECK-NEXT: vmov.16 q6[2], r0 -; CHECK-NEXT: vmov.u8 r0, q2[11] -; CHECK-NEXT: vmov.16 q6[3], r0 -; CHECK-NEXT: vmov.u8 r0, q2[12] -; CHECK-NEXT: vmov.16 q6[4], r0 -; CHECK-NEXT: vmov.u8 r0, q2[13] -; CHECK-NEXT: vmov.16 q6[5], r0 -; CHECK-NEXT: vmov.u8 r0, q2[14] -; CHECK-NEXT: vmov.16 q6[6], r0 -; CHECK-NEXT: vmov.u8 r0, q2[15] -; CHECK-NEXT: vmov.16 q6[7], r0 -; CHECK-NEXT: vmov.u8 r0, q4[15] -; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vadd.i16 q2, q5, q6 -; CHECK-NEXT: vmov.u16 r0, q2[0] -; CHECK-NEXT: vmov.8 q0[8], r0 -; CHECK-NEXT: vmov.u16 r0, q2[1] -; CHECK-NEXT: vmov.8 q0[9], r0 -; CHECK-NEXT: vmov.u16 r0, q2[2] -; CHECK-NEXT: vmov.8 q0[10], r0 -; CHECK-NEXT: vmov.u16 r0, q2[3] -; CHECK-NEXT: vmov.8 q0[11], r0 -; CHECK-NEXT: vmov.u16 r0, q2[4] -; CHECK-NEXT: vmov.8 q0[12], r0 -; CHECK-NEXT: vmov.u16 r0, q2[5] -; CHECK-NEXT: vmov.8 q0[13], r0 -; CHECK-NEXT: vmov.u16 r0, q2[6] -; CHECK-NEXT: vmov.8 q0[14], r0 -; CHECK-NEXT: vmov.u16 r0, q2[7] -; CHECK-NEXT: vmov.8 q0[15], r0 -; CHECK-NEXT: vmov.u8 r0, q3[0] -; CHECK-NEXT: vmov.16 q2[0], r0 -; CHECK-NEXT: vmov.u8 r0, q3[1] -; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov.u8 r0, q3[2] -; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov.u8 r0, q3[3] -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov.u8 r0, q3[4] -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov.u8 r0, q3[5] -; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov.u8 r0, q3[6] -; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov.u8 r0, q3[7] -; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmov.u8 r0, q1[0] -; CHECK-NEXT: vmov.16 q4[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[1] -; CHECK-NEXT: vmov.16 q4[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[2] -; CHECK-NEXT: vmov.16 q4[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[3] -; CHECK-NEXT: vmov.16 q4[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[4] -; CHECK-NEXT: vmov.16 q4[4], r0 -; CHECK-NEXT: vmov.u8 r0, q1[5] -; CHECK-NEXT: vmov.16 q4[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[6] -; CHECK-NEXT: vmov.16 q4[6], r0 -; CHECK-NEXT: vmov.u8 r0, q1[7] -; CHECK-NEXT: vmov.16 q4[7], r0 -; CHECK-NEXT: vadd.i16 q4, q4, q2 -; CHECK-NEXT: vmov.u16 r0, q4[0] -; CHECK-NEXT: vmov.8 q2[0], r0 -; CHECK-NEXT: vmov.u16 r0, q4[1] -; CHECK-NEXT: vmov.8 q2[1], r0 -; CHECK-NEXT: vmov.u16 r0, q4[2] -; CHECK-NEXT: vmov.8 q2[2], r0 -; CHECK-NEXT: vmov.u16 r0, q4[3] -; CHECK-NEXT: vmov.8 q2[3], r0 -; CHECK-NEXT: vmov.u16 r0, q4[4] -; CHECK-NEXT: vmov.8 q2[4], r0 -; CHECK-NEXT: vmov.u16 r0, q4[5] -; CHECK-NEXT: vmov.8 q2[5], r0 -; CHECK-NEXT: vmov.u16 r0, q4[6] -; CHECK-NEXT: 
vmov.8 q2[6], r0 -; CHECK-NEXT: vmov.u16 r0, q4[7] -; CHECK-NEXT: vmov.8 q2[7], r0 -; CHECK-NEXT: vmov.u8 r0, q1[8] -; CHECK-NEXT: vmov.16 q4[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[9] -; CHECK-NEXT: vmov.16 q4[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[10] -; CHECK-NEXT: vmov.16 q4[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[11] -; CHECK-NEXT: vmov.16 q4[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[12] -; CHECK-NEXT: vmov.16 q4[4], r0 -; CHECK-NEXT: vmov.u8 r0, q1[13] -; CHECK-NEXT: vmov.16 q4[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[14] -; CHECK-NEXT: vmov.16 q4[6], r0 -; CHECK-NEXT: vmov.u8 r0, q3[8] -; CHECK-NEXT: vmov.16 q5[0], r0 -; CHECK-NEXT: vmov.u8 r0, q3[9] -; CHECK-NEXT: vmov.16 q5[1], r0 -; CHECK-NEXT: vmov.u8 r0, q3[10] -; CHECK-NEXT: vmov.16 q5[2], r0 -; CHECK-NEXT: vmov.u8 r0, q3[11] -; CHECK-NEXT: vmov.16 q5[3], r0 -; CHECK-NEXT: vmov.u8 r0, q3[12] -; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vmov.u8 r0, q3[13] -; CHECK-NEXT: vmov.16 q5[5], r0 -; CHECK-NEXT: vmov.u8 r0, q3[14] -; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.u8 r0, q3[15] -; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vmov.u8 r0, q1[15] -; CHECK-NEXT: vmov.16 q4[7], r0 -; CHECK-NEXT: vadd.i16 q1, q4, q5 -; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.8 q2[8], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.8 q2[9], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.8 q2[10], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.8 q2[11], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vmov.8 q2[12], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.8 q2[13], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vmov.8 q2[14], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.8 q2[15], r0 -; CHECK-NEXT: vmov q1, q2 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vrev16.8 q5, q0 +; CHECK-NEXT: vrev16.8 q4, q2 +; CHECK-NEXT: vadd.i16 q0, q0, q2 +; CHECK-NEXT: vadd.i16 q4, q5, q4 +; CHECK-NEXT: vmovnt.i16 q0, q4 +; CHECK-NEXT: vrev16.8 q4, q1 +; CHECK-NEXT: vrev16.8 q2, q3 +; CHECK-NEXT: vadd.i16 q1, q1, q3 +; CHECK-NEXT: vadd.i16 q2, q4, q2 +; CHECK-NEXT: vmovnt.i16 q1, q2 +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr entry: %sa = sext <32 x i8> %a to <32 x i16> @@ -1075,70 +673,31 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vmov.u16 r1, q1[4] -; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.u16 r1, q1[5] -; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q0[6] -; CHECK-NEXT: vmov.u16 r1, q0[4] -; CHECK-NEXT: vmovlb.u16 q2, q2 -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: vneg.s32 q5, q2 -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmovlb.s16 q3, q3 -; CHECK-NEXT: vmov.u16 r1, q1[0] +; CHECK-NEXT: vmovlt.u16 q2, q1 +; CHECK-NEXT: vmovlt.s16 q3, q0 ; CHECK-NEXT: vadd.i32 q4, q3, q2 -; CHECK-NEXT: vcmp.i32 eq, q3, q2 +; CHECK-NEXT: vneg.s32 q5, q2 ; CHECK-NEXT: vshl.s32 q4, q4, q5 ; CHECK-NEXT: vneg.s32 q5, q3 -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.u16 r1, q1[1] ; CHECK-NEXT: vsub.i32 q4, q4, q2 -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.u16 r1, q0[0] +; CHECK-NEXT: vcmp.i32 eq, q3, 
q2 ; CHECK-NEXT: vmul.i32 q4, q4, q2 -; CHECK-NEXT: vmovlb.u16 q1, q3 -; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 -; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vmov.u16 r1, q0[1] +; CHECK-NEXT: vmovlb.u16 q1, q1 ; CHECK-NEXT: vshl.u32 q4, q4, q5 -; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 +; CHECK-NEXT: vmovlb.s16 q0, q0 ; CHECK-NEXT: vshl.u32 q4, q4, q2 -; CHECK-NEXT: vmovlb.s16 q0, q3 -; CHECK-NEXT: vpsel q2, q4, q2 ; CHECK-NEXT: vadd.i32 q3, q0, q1 +; CHECK-NEXT: vpsel q2, q4, q2 ; CHECK-NEXT: vneg.s32 q4, q1 -; CHECK-NEXT: vcmp.i32 eq, q0, q1 ; CHECK-NEXT: vshl.s32 q3, q3, q4 ; CHECK-NEXT: vneg.s32 q4, q0 ; CHECK-NEXT: vsub.i32 q3, q3, q1 +; CHECK-NEXT: vcmp.i32 eq, q0, q1 ; CHECK-NEXT: vmul.i32 q3, q3, q1 ; CHECK-NEXT: vshl.u32 q3, q3, q4 ; CHECK-NEXT: vshl.u32 q3, q3, q1 -; CHECK-NEXT: vpsel q1, q3, q1 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vpsel q0, q3, q1 +; CHECK-NEXT: vmovnt.i32 q0, q2 ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr entry: @@ -1161,126 +720,31 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vmov.u8 r0, q1[8] -; CHECK-NEXT: vmov.16 q2[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[9] -; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[10] -; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[11] -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[12] -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov.u8 r0, q1[13] -; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[14] -; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov.u8 r0, q1[15] -; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmov.u8 r0, q0[8] -; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[9] -; CHECK-NEXT: vmov.16 q3[1], r0 -; CHECK-NEXT: vmov.u8 r0, q0[10] -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov.u8 r0, q0[11] -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov.u8 r0, q0[12] -; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov.u8 r0, q0[13] -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov.u8 r0, q0[14] -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov.u8 r0, q0[15] -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vmovlb.u8 q2, q2 -; CHECK-NEXT: vmovlb.s8 q3, q3 -; CHECK-NEXT: vneg.s16 q5, q2 +; CHECK-NEXT: vmovlt.u8 q2, q1 +; CHECK-NEXT: vmovlt.s8 q3, q0 ; CHECK-NEXT: vadd.i16 q4, q3, q2 -; CHECK-NEXT: vmov.u8 r0, q1[0] +; CHECK-NEXT: vneg.s16 q5, q2 ; CHECK-NEXT: vshl.s16 q4, q4, q5 ; CHECK-NEXT: vneg.s16 q5, q3 -; CHECK-NEXT: vcmp.i16 eq, q3, q2 -; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[1] ; CHECK-NEXT: vsub.i16 q4, q4, q2 -; CHECK-NEXT: vmov.16 q3[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[2] -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[3] -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[4] -; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov.u8 r0, q1[5] -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[6] -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov.u8 r0, q1[7] -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vmov.u8 r0, q0[0] -; 
CHECK-NEXT: vmovlb.u8 q1, q3 -; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[1] +; CHECK-NEXT: vcmp.i16 eq, q3, q2 ; CHECK-NEXT: vmul.i16 q4, q4, q2 -; CHECK-NEXT: vmov.16 q3[1], r0 -; CHECK-NEXT: vmov.u8 r0, q0[2] -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov.u8 r0, q0[3] -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vmov.u8 r0, q0[4] -; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov.u8 r0, q0[5] -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov.u8 r0, q0[6] -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov.u8 r0, q0[7] +; CHECK-NEXT: vmovlb.u8 q1, q1 ; CHECK-NEXT: vshl.u16 q4, q4, q5 -; CHECK-NEXT: vmov.16 q3[7], r0 +; CHECK-NEXT: vmovlb.s8 q0, q0 ; CHECK-NEXT: vshl.u16 q4, q4, q2 -; CHECK-NEXT: vmovlb.s8 q0, q3 -; CHECK-NEXT: vpsel q2, q4, q2 ; CHECK-NEXT: vadd.i16 q3, q0, q1 +; CHECK-NEXT: vpsel q2, q4, q2 ; CHECK-NEXT: vneg.s16 q4, q1 -; CHECK-NEXT: vcmp.i16 eq, q0, q1 ; CHECK-NEXT: vshl.s16 q3, q3, q4 ; CHECK-NEXT: vneg.s16 q4, q0 ; CHECK-NEXT: vsub.i16 q3, q3, q1 +; CHECK-NEXT: vcmp.i16 eq, q0, q1 ; CHECK-NEXT: vmul.i16 q3, q3, q1 ; CHECK-NEXT: vshl.u16 q3, q3, q4 ; CHECK-NEXT: vshl.u16 q3, q3, q1 -; CHECK-NEXT: vpsel q1, q3, q1 -; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.8 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.8 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.8 q0[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.8 q0[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.8 q0[5], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vmov.8 q0[6], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.8 q0[7], r0 -; CHECK-NEXT: vmov.u16 r0, q2[0] -; CHECK-NEXT: vmov.8 q0[8], r0 -; CHECK-NEXT: vmov.u16 r0, q2[1] -; CHECK-NEXT: vmov.8 q0[9], r0 -; CHECK-NEXT: vmov.u16 r0, q2[2] -; CHECK-NEXT: vmov.8 q0[10], r0 -; CHECK-NEXT: vmov.u16 r0, q2[3] -; CHECK-NEXT: vmov.8 q0[11], r0 -; CHECK-NEXT: vmov.u16 r0, q2[4] -; CHECK-NEXT: vmov.8 q0[12], r0 -; CHECK-NEXT: vmov.u16 r0, q2[5] -; CHECK-NEXT: vmov.8 q0[13], r0 -; CHECK-NEXT: vmov.u16 r0, q2[6] -; CHECK-NEXT: vmov.8 q0[14], r0 -; CHECK-NEXT: vmov.u16 r0, q2[7] -; CHECK-NEXT: vmov.8 q0[15], r0 +; CHECK-NEXT: vpsel q0, q3, q1 +; CHECK-NEXT: vmovnt.i16 q0, q2 ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr entry: