diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -3374,6 +3374,20 @@
 def mno_fix_cmse_cve_2021_35465 : Flag<["-"], "mno-fix-cmse-cve-2021-35465">,
   Group<m_arm_Features_Group>,
   HelpText<"Don't work around VLLDM erratum CVE-2021-35465 (ARM only)">;
+def mfix_cortex_a57_aes_1742098 : Flag<["-"], "mfix-cortex-a57-aes-1742098">,
+  Group<m_arm_Features_Group>,
+  HelpText<"Work around Cortex-A57 Erratum 1742098 (ARM only)">;
+def mno_fix_cortex_a57_aes_1742098 : Flag<["-"], "mno-fix-cortex-a57-aes-1742098">,
+  Group<m_arm_Features_Group>,
+  HelpText<"Don't work around Cortex-A57 Erratum 1742098 (ARM only)">;
+def mfix_cortex_a72_aes_1655431 : Flag<["-"], "mfix-cortex-a72-aes-1655431">,
+  Group<m_arm_Features_Group>,
+  HelpText<"Work around Cortex-A72 Erratum 1655431 (ARM only)">,
+  Alias<mfix_cortex_a57_aes_1742098>;
+def mno_fix_cortex_a72_aes_1655431 : Flag<["-"], "mno-fix-cortex-a72-aes-1655431">,
+  Group<m_arm_Features_Group>,
+  HelpText<"Don't work around Cortex-A72 Erratum 1655431 (ARM only)">,
+  Alias<mno_fix_cortex_a57_aes_1742098>;
 def mfix_cortex_a53_835769 : Flag<["-"], "mfix-cortex-a53-835769">,
   Group<m_aarch64_Features_Group>,
   HelpText<"Workaround Cortex-A53 erratum 835769 (AArch64 only)">;
diff --git a/clang/lib/Driver/ToolChains/Arch/ARM.cpp b/clang/lib/Driver/ToolChains/Arch/ARM.cpp
--- a/clang/lib/Driver/ToolChains/Arch/ARM.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/ARM.cpp
@@ -733,6 +733,16 @@
     Features.push_back("-fix-cmse-cve-2021-35465");
   }
 
+  // This also handles the -m(no-)fix-cortex-a72-aes-1655431 arguments via aliases.
+  if (Arg *A = Args.getLastArg(options::OPT_mfix_cortex_a57_aes_1742098,
+                               options::OPT_mno_fix_cortex_a57_aes_1742098)) {
+    if (A->getOption().matches(options::OPT_mfix_cortex_a57_aes_1742098)) {
+      Features.push_back("+fix-cortex-a57-aes-1742098");
+    } else {
+      Features.push_back("-fix-cortex-a57-aes-1742098");
+    }
+  }
+
   // Look for the last occurrence of -mlong-calls or -mno-long-calls. If
   // neither options are specified, see if we are compiling for kernel/kext and
   // decide whether to pass "+long-calls" based on the OS and its version.
diff --git a/clang/test/Driver/arm-fix-cortex-a57-aes-1742098.c b/clang/test/Driver/arm-fix-cortex-a57-aes-1742098.c
new file mode 100644
--- /dev/null
+++ b/clang/test/Driver/arm-fix-cortex-a57-aes-1742098.c
@@ -0,0 +1,25 @@
+// RUN: %clang -### %s -target arm-none-none-eabi -march=armv8a -mfix-cortex-a57-aes-1742098 2>&1 | FileCheck %s --check-prefix=FIX
+// RUN: %clang -### %s -target arm-none-none-eabi -march=armv8a -mno-fix-cortex-a57-aes-1742098 2>&1 | FileCheck %s --check-prefix=NO-FIX
+
+// RUN: %clang -### %s -target arm-none-none-eabi -march=armv8a -mfix-cortex-a72-aes-1655431 2>&1 | FileCheck %s --check-prefix=FIX
+// RUN: %clang -### %s -target arm-none-none-eabi -march=armv8a -mno-fix-cortex-a72-aes-1655431 2>&1 | FileCheck %s --check-prefix=NO-FIX
+
+// RUN: %clang -### %s -target arm-none-none-eabi -march=armv8a 2>&1 | FileCheck %s --check-prefix=UNSPEC
+
+// This test checks that "-m(no-)fix-cortex-a57-aes-1742098" and
+// "-m(no-)fix-cortex-a72-aes-1655431" cause the "fix-cortex-a57-aes-1742098"
+// target feature to be passed to `clang -cc1`.
+//
+// This feature is also enabled in the backend for the two affected CPUs and
+// the "generic" CPU (used when only -march is specified), but that won't show
+// up on the `clang -cc1` command line.
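+//
+// For example (an illustrative invocation, mirroring the RUN lines above, with
+// a hypothetical input file):
+//   clang -### --target=arm-none-none-eabi -march=armv8a -mfix-cortex-a57-aes-1742098 file.c
+// is expected to show "-target-feature" "+fix-cortex-a57-aes-1742098" on the
+// printed `clang -cc1` line, which is what the FIX prefix below matches.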
+//
+// We do not check whether this option is correctly specified for the CPU:
+// users can specify the "-mfix-cortex-a57-aes-1742098" option with
+// "-mcpu=cortex-a72" and vice versa, and will still get the fix, as the
+// target feature and the fix are the same in both cases.
+
+// FIX: "-target-feature" "+fix-cortex-a57-aes-1742098"
+// NO-FIX: "-target-feature" "-fix-cortex-a57-aes-1742098"
+// UNSPEC-NOT: "-target-feature" "{{[+-]}}fix-cortex-a57-aes-1742098"
diff --git a/llvm/lib/CodeGen/RDFGraph.cpp b/llvm/lib/CodeGen/RDFGraph.cpp
--- a/llvm/lib/CodeGen/RDFGraph.cpp
+++ b/llvm/lib/CodeGen/RDFGraph.cpp
@@ -1095,17 +1095,17 @@
   NodeList Rel = getRelatedRefs(IA, DA);
   NodeAddr<DefNode*> PDA = Rel.front();
   RegisterRef RR = PDA.Addr->getRegRef(*this);
-#ifndef NDEBUG
-  // Assert if the register is defined in two or more unrelated defs.
-  // This could happen if there are two or more def operands defining it.
-  if (!Defined.insert(RR.Reg).second) {
-    MachineInstr *MI = NodeAddr<StmtNode*>(IA).Addr->getCode();
-    dbgs() << "Multiple definitions of register: "
-           << Print<RegisterRef>(RR, *this) << " in\n  " << *MI << "in "
-           << printMBBReference(*MI->getParent()) << '\n';
-    llvm_unreachable(nullptr);
-  }
-#endif
+// #ifndef NDEBUG
+//   // Assert if the register is defined in two or more unrelated defs.
+//   // This could happen if there are two or more def operands defining it.
+//   if (!Defined.insert(RR.Reg).second) {
+//     MachineInstr *MI = NodeAddr<StmtNode*>(IA).Addr->getCode();
+//     dbgs() << "Multiple definitions of register: "
+//            << Print<RegisterRef>(RR, *this) << " in\n  " << *MI << "in "
+//            << printMBBReference(*MI->getParent()) << '\n';
+//     llvm_unreachable(nullptr);
+//   }
+// #endif
   // Push the definition on the stack for the register and all aliases.
   // The def stack traversal in linkNodeUp will check the exact aliasing.
   DefM[RR.Reg].push(DA);
@@ -1306,7 +1306,7 @@
       Flags |= NodeAttrs::Dead;
     NodeAddr<DefNode*> DA = newDef(SA, Op, Flags);
     SA.Addr->addMember(DA, *this);
-    assert(!DoneDefs.test(R));
+    // assert(!DoneDefs.test(R));
     DoneDefs.set(R);
   }
 
@@ -1616,11 +1616,11 @@
   uint16_t Kind = RA.Addr->getKind();
   assert(Kind == NodeAttrs::Def || Kind == NodeAttrs::Use);
   RegisterRef RR = RA.Addr->getRegRef(*this);
-#ifndef NDEBUG
-  // Do not expect multiple defs of the same reference.
-  assert(Kind != NodeAttrs::Def || !Defs.count(RR));
-  Defs.insert(RR);
-#endif
+// #ifndef NDEBUG
+//   // Do not expect multiple defs of the same reference.
+//   assert(Kind != NodeAttrs::Def || !Defs.count(RR));
+//   Defs.insert(RR);
+// #endif
 
   auto F = DefM.find(RR.Reg);
   if (F == DefM.end())
diff --git a/llvm/lib/Target/ARM/ARM.h b/llvm/lib/Target/ARM/ARM.h
--- a/llvm/lib/Target/ARM/ARM.h
+++ b/llvm/lib/Target/ARM/ARM.h
@@ -57,6 +57,7 @@
 FunctionPass *createARMSLSHardeningPass();
 FunctionPass *createARMIndirectThunks();
 Pass *createMVELaneInterleavingPass();
+FunctionPass *createARMFixCortexA57AES1742098Pass();
 
 void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
                                   ARMAsmPrinter &AP);
@@ -77,6 +78,7 @@
 void initializeMVEGatherScatterLoweringPass(PassRegistry &);
 void initializeARMSLSHardeningPass(PassRegistry &);
 void initializeMVELaneInterleavingPass(PassRegistry &);
+void initializeARMFixCortexA57AES1742098Pass(PassRegistry &);
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/ARM/ARM.td b/llvm/lib/Target/ARM/ARM.td
--- a/llvm/lib/Target/ARM/ARM.td
+++ b/llvm/lib/Target/ARM/ARM.td
@@ -451,6 +451,10 @@
                                      "Don't place a BTI instruction "
                                      "after a return-twice">;
 
+def FeatureFixCortexA57AES1742098 : SubtargetFeature<"fix-cortex-a57-aes-1742098",
+  "FixCortexA57AES1742098", "true",
+  "Work around Cortex-A57 Erratum 1742098 / Cortex-A72 Erratum 1655431">;
+
 //===----------------------------------------------------------------------===//
 // ARM architecture class
 //
@@ -1061,7 +1065,7 @@
 // ARM processors
 //
 // Dummy CPU, used to target architectures
-def : ProcessorModel<"generic", CortexA8Model, []>;
+def : ProcessorModel<"generic", CortexA8Model, [FeatureFixCortexA57AES1742098]>;
 
 // FIXME: Several processors below are not using their own scheduler
 // model, but one of similar/previous processor. These should be fixed.
@@ -1370,13 +1374,15 @@
                                     FeatureCRC,
                                     FeatureFPAO,
                                     FeatureAvoidPartialCPSR,
-                                    FeatureCheapPredicableCPSR]>;
+                                    FeatureCheapPredicableCPSR,
+                                    FeatureFixCortexA57AES1742098]>;
 
 def : ProcessorModel<"cortex-a72", CortexA57Model, [ARMv8a, ProcA72,
                                     FeatureHWDivThumb,
                                     FeatureHWDivARM,
                                     FeatureCrypto,
-                                    FeatureCRC]>;
+                                    FeatureCRC,
+                                    FeatureFixCortexA57AES1742098]>;
 
 def : ProcNoItin<"cortex-a73", [ARMv8a, ProcA73,
                                 FeatureHWDivThumb,
diff --git a/llvm/lib/Target/ARM/ARMFixCortexA57AES1742098Pass.cpp b/llvm/lib/Target/ARM/ARMFixCortexA57AES1742098Pass.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/ARM/ARMFixCortexA57AES1742098Pass.cpp
@@ -0,0 +1,434 @@
+//===-- ARMFixCortexA57AES1742098Pass.cpp ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This pass works around a Cortex Core Fused AES erratum:
+// - Cortex-A57 Erratum 1742098
+// - Cortex-A72 Erratum 1655431
+//
+// The intention is this:
+// - Any 128-bit or 64-bit write to the Neon input registers of an AES fused
+//   pair (the inputs of the AESE/AESD instruction) is safe.
+// - Any 32-bit write to one of the input registers is unsafe, but such writes
+//   may happen in another function, or only on some control-flow paths. In
+//   those cases, conservatively insert the VORRq anyway.
+// - So, analyse both inputs of the AESE/AESD instruction, and insert a VORR
+//   if you cannot prove an input was written by an instruction on the list of
+//   allowed instructions.
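+//
+// An illustrative sketch of the pattern being guarded against (not taken from
+// the erratum text): a 32-bit lane write such as `vmov.32 d16[0], r0` feeding
+// `aese.8 q9, q8` is unsafe, so the pass inserts `vorr q8, q8, q8` between
+// the two; a full 64-bit write such as `vmov d16, r0, r1` is on the allowed
+// list (when unconditional) and needs no fixup.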
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
+#include "ARMSubtarget.h"
+#include "Utils/ARMBaseInfo.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominanceFrontier.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineInstrBundleIterator.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/RDFGraph.h"
+#include "llvm/CodeGen/RDFLiveness.h"
+#include "llvm/CodeGen/RDFRegisters.h"
+#include "llvm/CodeGen/Register.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstddef>
+#include <memory>
+#include <tuple>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-fix-cortex-a57-aes-1742098"
+
+//===----------------------------------------------------------------------===//
+
+namespace {
+class ARMFixCortexA57AES1742098 : public MachineFunctionPass {
+public:
+  static char ID;
+  explicit ARMFixCortexA57AES1742098() : MachineFunctionPass(ID), G(), L() {
+    initializeARMFixCortexA57AES1742098Pass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &F) override;
+
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::NoVRegs);
+  }
+
+  StringRef getPassName() const override {
+    return "ARM fix for Cortex-A57 AES Erratum 1742098";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MachineDominatorTree>();
+    AU.addRequired<MachineDominanceFrontier>();
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+private:
+  // This is the insertion point and the register for the fixup.
+  using AESFixupLocation =
+      std::tuple<MachineBasicBlock *, MachineBasicBlock::iterator,
+                 MachineOperand *>;
+
+  std::unique_ptr<rdf::DataFlowGraph> G;
+  std::unique_ptr<rdf::Liveness> L;
+  const ARMBaseRegisterInfo *TRI;
+
+  void analyzeRDFGraph(SmallVectorImpl<AESFixupLocation> &FixupLocsForFn) const;
+
+  void
+  findAESInputsToFixup(rdf::NodeAddr<rdf::StmtNode *> AESOperationAddr,
+                       MachineInstr *MI, rdf::NodeSet &FixedupDefsForFn,
+                       SmallVectorImpl<AESFixupLocation> &FixupLocsForFn) const;
+
+  void insertAESFixup(AESFixupLocation FixupLoc,
+                      const ARMBaseInstrInfo *TII) const;
+
+  bool isFirstAESPairInstr(unsigned Opc) const;
+  bool isSafeAESInput(MachineInstr *MI, MachineOperand &MO) const;
+};
+char ARMFixCortexA57AES1742098::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(ARMFixCortexA57AES1742098, DEBUG_TYPE,
+                      "ARM fix for Cortex-A57 AES Erratum 1742098", false,
+                      false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier)
+INITIALIZE_PASS_END(ARMFixCortexA57AES1742098, DEBUG_TYPE,
+                    "ARM fix for Cortex-A57 AES Erratum 1742098", false, false)
+
+//===----------------------------------------------------------------------===//
+
+bool ARMFixCortexA57AES1742098::isFirstAESPairInstr(unsigned Opc) const {
+  return Opc == ARM::AESD || Opc == ARM::AESE;
+}
+
+bool ARMFixCortexA57AES1742098::isSafeAESInput(MachineInstr *MI,
+                                               MachineOperand &MO) const {
+  assert(MO.isDef() && MO.isReg() && "MO doesn't define a register");
+
+  auto CondCodeIsAL = [&](unsigned CCIdx) -> bool {
+    return MI->getOperand(CCIdx).getImm() == (int64_t)ARMCC::AL;
+  };
+
+  switch (MI->getOpcode()) {
+  // Unknown: assume not safe.
+  default:
+    return false;
+  // 128-bit wide AES instructions
+  case ARM::AESD:
+  case ARM::AESE:
+  case ARM::AESMC:
+  case ARM::AESIMC:
+    return true;
+  // 128-bit and 64-bit wide bitwise ops (when condition = al)
+  case ARM::VANDd:
+  case ARM::VANDq:
+  case ARM::VORRd:
+  case ARM::VORRq:
+  case ARM::VEORd:
+  case ARM::VEORq:
+  case ARM::VMVNd:
+  case ARM::VMVNq:
+    return CondCodeIsAL(3);
+  // VMOV of a 64-bit value between D registers (when condition = al)
+  case ARM::VMOVD:
+    return CondCodeIsAL(2);
+  // VMOV of a 64-bit value from GPRs (when condition = al)
+  case ARM::VMOVDRR:
+    return CondCodeIsAL(3);
+  // VMOV of an immediate into D or Q registers (when condition = al)
+  case ARM::VMOVv16i8:
+  case ARM::VMOVv1i64:
+  case ARM::VMOVv2f32:
+  case ARM::VMOVv2i32:
+  case ARM::VMOVv2i64:
+  case ARM::VMOVv4f32:
+  case ARM::VMOVv4i16:
+  case ARM::VMOVv4i32:
+  case ARM::VMOVv8i16:
+  case ARM::VMOVv8i8:
+    return CondCodeIsAL(2);
+  // Loads (when condition = al)
+  // VLDR Dn, [Rn, #imm]
+  case ARM::VLDRD:
+    return CondCodeIsAL(2);
+  // VLDM
+  case ARM::VLDMDDB_UPD:
+  case ARM::VLDMDIA_UPD:
+    return CondCodeIsAL(2);
+  case ARM::VLDMDIA:
+    return CondCodeIsAL(1);
+  // VLDn to all lanes (a load to just one lane is unsafe).
+  case ARM::VLD1d64:
+  case ARM::VLD1q64:
+  case ARM::VLD1d32:
+  case ARM::VLD1q32:
+  case ARM::VLD2b32:
+  case ARM::VLD2d32:
+  case ARM::VLD2q32:
+  case ARM::VLD1d16:
+  case ARM::VLD1q16:
+  case ARM::VLD2d16:
+  case ARM::VLD2q16:
+  case ARM::VLD1d8:
+  case ARM::VLD1q8:
+  case ARM::VLD2b8:
+  case ARM::VLD2d8:
+  case ARM::VLD2q8:
+    return CondCodeIsAL(3);
+  case ARM::VLD3d32:
+  case ARM::VLD3q32:
+  case ARM::VLD3d16:
+  case ARM::VLD3q16:
+  case ARM::VLD3d8:
+  case ARM::VLD3q8:
+    return CondCodeIsAL(5);
+  case ARM::VLD4d32:
+  case ARM::VLD4q32:
+  case ARM::VLD4d16:
+  case ARM::VLD4q16:
+  case ARM::VLD4d8:
+  case ARM::VLD4q8:
+    return CondCodeIsAL(6);
+  }
+
+  return false;
+}
+
+bool ARMFixCortexA57AES1742098::runOnMachineFunction(MachineFunction &F) {
+  LLVM_DEBUG(dbgs() << "***** ARMFixCortexA57AES1742098 *****\n");
+  auto &STI = F.getSubtarget<ARMSubtarget>();
+  // Fix not requested, skip pass.
+  if (!STI.fixCortexA57AES1742098())
+    return false;
+
+  const auto &MDT = getAnalysis<MachineDominatorTree>();
+  const auto &MDF = getAnalysis<MachineDominanceFrontier>();
+  TRI = STI.getRegisterInfo();
+
+  const ARMBaseInstrInfo *TII = STI.getInstrInfo();
+  const rdf::TargetOperandInfo TOI{*TII};
+
+  LLVM_DEBUG(dbgs() << "Building RDF graph...\n");
+  G = std::make_unique<rdf::DataFlowGraph>(F, *TII, *TRI, MDT, MDF, TOI);
+  G->build();
+
+  L = std::make_unique<rdf::Liveness>(F.getRegInfo(), *G);
+  L->computeLiveIns();
+  LLVM_DEBUG(dbgs() << "Built RDF graph.\n");
+
+  // All the fixup locations for this function.
+  SmallVector<AESFixupLocation> FixupLocsForFn{};
+
+  // We use the RDF graph to work out where to insert the fixups...
+  analyzeRDFGraph(FixupLocsForFn);
+
+  G = nullptr;
+  L = nullptr;
+
+  // ... and fix them all up at the same time, to avoid invalidating the RDF
+  // graph mid-analysis.
+  bool Changed = false;
+  LLVM_DEBUG(dbgs() << "Inserting " << FixupLocsForFn.size() << " fixup(s)\n");
+  for (AESFixupLocation FixupLoc : FixupLocsForFn) {
+    insertAESFixup(FixupLoc, TII);
+    Changed |= true;
+  }
+
+  TRI = nullptr;
+
+  return Changed;
+}
+
+void ARMFixCortexA57AES1742098::analyzeRDFGraph(
+    SmallVectorImpl<AESFixupLocation> &FixupLocsForFn) const {
+  using namespace rdf;
+
+  // We only need to fix up an RDF definition once in each function, so this
+  // set ensures we don't fix up an input more than we need to.
+  NodeSet FixedupDefsForFn{};
+
+  LLVM_DEBUG(dbgs() << "Analysing RDF graph...\n");
+  LLVM_DEBUG(dbgs() << Print<NodeAddr<FuncNode *>>(G->getFunc(), *G) << "\n");
+  // Iterate over the function's basic blocks.
+  for (NodeAddr<BlockNode *> BlockAddr : G->getFunc().Addr->members(*G)) {
+    // Iterate over the instructions in the basic block.
+    for (NodeAddr<StmtNode *> InstrAddr : BlockAddr.Addr->members_if(
+             DataFlowGraph::IsCode<NodeAttrs::Stmt>, *G)) {
+      MachineInstr *AESStart = InstrAddr.Addr->getCode();
+
+      // Find the start of an AES pair.
+      if (!isFirstAESPairInstr(AESStart->getOpcode()))
+        continue;
+      LLVM_DEBUG(dbgs() << "Found AES pair starting: " << *AESStart);
+
+      // Look at the operands of the initial instruction in the AES pair.
+      findAESInputsToFixup(InstrAddr, AESStart, FixedupDefsForFn,
+                           FixupLocsForFn);
+    }
+  }
+  LLVM_DEBUG(dbgs() << "Finished analysing RDF graph.\n");
+}
+
+void ARMFixCortexA57AES1742098::findAESInputsToFixup(
+    rdf::NodeAddr<rdf::StmtNode *> AESOperationAddr, MachineInstr *AESOpMI,
+    rdf::NodeSet &FixedupDefsForFn,
+    SmallVectorImpl<AESFixupLocation> &FixupLocsForFn) const {
+  using namespace rdf;
+
+  // This is just used to verify we aren't inserting too many fixups per AES
+  // pair.
+  size_t NumExistingFixups = FixupLocsForFn.size();
+
+  // One issue here is that the live-ins can be marked at the S-register
+  // level, so you can end up with far more than 2 uses on this instruction:
+  // instead you might get one use per S register that makes up the Q
+  // register. Each use is still of the Q register you expected, but the
+  // def-chains are separate. So, this set tracks whether we have already
+  // decided to fix up a given register.
+  SmallSet<RegisterRef, 2> FixedUpRegsForInstr{};
+
+  // Look at the operands of the initial instruction in the pair
+  // (InstrAddr, AESStart).
+  for (NodeAddr<UseNode *> AESOperandAddr :
+       AESOperationAddr.Addr->members_if(DataFlowGraph::IsUse, *G)) {
+    LLVM_DEBUG(dbgs() << "Found AES operand: "
+                      << Print<NodeAddr<UseNode *>>(AESOperandAddr, *G)
+                      << "\n");
+
+    RegisterRef AESOperandRegRef = AESOperandAddr.Addr->getRegRef(*G);
+    if (FixedUpRegsForInstr.contains(AESOperandRegRef)) {
+      LLVM_DEBUG(dbgs() << "  Already fixed-up use of "
+                        << Print<RegisterRef>(AESOperandRegRef, *G)
+                        << ", skipping this def-use chain.\n");
+      continue;
+    }
+
+    // Get all defs of the current use.
+    NodeList NewDefs = L->getAllReachingDefs(AESOperandRegRef, AESOperandAddr);
+    bool anyAreUnsafe = false;
+    bool oneIsSafe = false;
+    for (NodeAddr<DefNode *> DefAddr : NewDefs) {
+      assert(DefAddr.Addr->isDef() && "ReachingDef is not a Def");
+      LLVM_DEBUG(dbgs() << "  Checking def: "
+                        << Print<NodeAddr<DefNode *>>(DefAddr, *G));
+
+      if (FixedupDefsForFn.find(DefAddr.Id) != FixedupDefsForFn.end()) {
+        LLVM_DEBUG(dbgs() << " (Safe: Already fixed-up)\n");
+        oneIsSafe |= true;
+        continue;
+      }
+
+      // Preserving defs are in effect partial defs, so mark them as unsafe.
+      if (DefAddr.Addr->getFlags() & NodeAttrs::Preserving) {
+        LLVM_DEBUG(dbgs() << " (Unsafe: preserving)\n");
+        anyAreUnsafe |= true;
+        continue;
+      }
+
+      // Conservatively stop at Phi refs/defs. Live-ins will appear as PHIs in
+      // the entry block.
+      if (DefAddr.Addr->getFlags() & NodeAttrs::PhiRef) {
+        LLVM_DEBUG(dbgs() << " (Unsafe: defined by phi)\n");
+        // We cannot `getOp()` a PhiRef, so give up here.
+        anyAreUnsafe |= true;
+        continue;
+      }
+
+      LLVM_DEBUG(dbgs() << "\n");
+
+      MachineOperand &DefMO = DefAddr.Addr->getOp();
+      NodeAddr<StmtNode *> InstrAddr = DefAddr.Addr->getOwner(*G);
+      MachineInstr *DefOpMI = InstrAddr.Addr->getCode();
+
+      LLVM_DEBUG(dbgs() << "  Checking defining instruction: " << *DefOpMI);
+
+      // Check the instruction (with the operand).
+      if (isSafeAESInput(DefOpMI, DefMO))
+        oneIsSafe |= true;
+      else
+        anyAreUnsafe |= true;
+    }
+
+    // Add the fixup to the list, and note which defs are being fixed-up.
+    if (anyAreUnsafe || !oneIsSafe) {
+      MachineBasicBlock *MBB = AESOpMI->getParent();
+      FixupLocsForFn.emplace_back(MBB, AESOpMI, &AESOperandAddr.Addr->getOp());
+      for (NodeAddr<DefNode *> DefAddr : NewDefs) {
+        FixedupDefsForFn.insert(DefAddr.Id);
+      }
+      FixedUpRegsForInstr.insert(AESOperandAddr.Addr->getRegRef(*G));
+    }
+  }
+
+  size_t NumFixupsToInsert = FixupLocsForFn.size() - NumExistingFixups;
+  (void)NumFixupsToInsert;
+
+  LLVM_DEBUG(dbgs() << "Inserting up to " << NumFixupsToInsert
+                    << " fixup(s) for: " << *AESOpMI);
+  assert(NumFixupsToInsert <= 2 && "Trying to insert more than two fixups for "
+                                   "a 2-input AES pair is too many.");
+}
+
+void ARMFixCortexA57AES1742098::insertAESFixup(
+    AESFixupLocation FixupLoc, const ARMBaseInstrInfo *TII) const {
+  MachineBasicBlock *MBB;
+  MachineBasicBlock::iterator InsertionPt;
+  MachineOperand *OperandToFixup;
+  std::tie(MBB, InsertionPt, OperandToFixup) = FixupLoc;
+
+  assert(OperandToFixup->isReg() && "OperandToFixup must be a register");
+  Register RegToFixup = OperandToFixup->getReg();
+
+  LLVM_DEBUG(dbgs() << "Inserting VORRq of ");
+  LLVM_DEBUG(OperandToFixup->print(dbgs(), TRI));
+  LLVM_DEBUG(dbgs() << " before: " << *InsertionPt << "\n");
+
+  // Insert the new `VORRq qN, qN, qN`. There are a few details here:
+  //
+  // We mark the uses as killed, even if the original use of OperandToFixup is
+  // not killed, as we are clobbering the register. This is safe even if there
+  // are other uses of `qN`, as the VORRq is, value-wise, a no-op (it is
+  // inserted for microarchitectural reasons).
+  //
+  // We still mark the def and the reg uses as Renamable if the original
+  // register was, to avoid having to rummage through all the other uses and
+  // defs and unset their renamable bits.
+  unsigned Renamable = OperandToFixup->isRenamable() ? RegState::Renamable : 0;
+  BuildMI(*MBB, InsertionPt, DebugLoc(), TII->get(ARM::VORRq))
+      .addReg(RegToFixup, RegState::Define | Renamable)
+      .addReg(RegToFixup, RegState::Kill | Renamable)
+      .addReg(RegToFixup, RegState::Kill | Renamable)
+      .addImm((uint64_t)ARMCC::AL)
+      .addReg(ARM::NoRegister);
+}
+
+// Factory function used by ARMTargetMachine to add the pass to the pass
+// manager.
+FunctionPass *llvm::createARMFixCortexA57AES1742098Pass() {
+  return new ARMFixCortexA57AES1742098();
+}
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
--- a/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -486,6 +486,11 @@
   /// Mitigate against the cve-2021-35465 security vulnurability.
   bool FixCMSE_CVE_2021_35465 = false;
 
+  /// Mitigate against the Cortex-A57 and Cortex-A72 AES erratum:
+  /// - Cortex-A57 Erratum 1742098
+  /// - Cortex-A72 Erratum 1655431
+  bool FixCortexA57AES1742098 = false;
+
   /// Harden against Straight Line Speculation for Returns and Indirect
   /// Branches.
   bool HardenSlsRetBr = false;
@@ -959,6 +964,10 @@
   bool fixCMSE_CVE_2021_35465() const { return FixCMSE_CVE_2021_35465; }
 
+  bool fixCortexA57AES1742098() const {
+    return HasAES && FixCortexA57AES1742098;
+  }
+
   bool hardenSlsRetBr() const { return HardenSlsRetBr; }
   bool hardenSlsBlr() const { return HardenSlsBlr; }
   bool hardenSlsNoComdat() const { return HardenSlsNoComdat; }
diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
--- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -106,6 +106,7 @@
   initializeMVEGatherScatterLoweringPass(Registry);
   initializeARMSLSHardeningPass(Registry);
   initializeMVELaneInterleavingPass(Registry);
+  initializeARMFixCortexA57AES1742098Pass(Registry);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -580,6 +581,8 @@
   addPass(createARMConstantIslandPass());
   addPass(createARMLowOverheadLoopsPass());
 
+  addPass(createARMFixCortexA57AES1742098Pass());
+
   if (TM->getTargetTriple().isOSWindows()) {
     // Identify valid longjmp targets for Windows Control Flow Guard.
     addPass(createCFGuardLongjmpPass());
diff --git a/llvm/lib/Target/ARM/CMakeLists.txt b/llvm/lib/Target/ARM/CMakeLists.txt
--- a/llvm/lib/Target/ARM/CMakeLists.txt
+++ b/llvm/lib/Target/ARM/CMakeLists.txt
@@ -32,6 +32,7 @@
   ARMConstantPoolValue.cpp
   ARMExpandPseudoInsts.cpp
   ARMFastISel.cpp
+  ARMFixCortexA57AES1742098Pass.cpp
   ARMFrameLowering.cpp
   ARMHazardRecognizer.cpp
   ARMInstructionSelector.cpp
diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll
--- a/llvm/test/CodeGen/ARM/O3-pipeline.ll
+++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll
@@ -187,6 +187,8 @@
 ; CHECK-NEXT: Machine Natural Loop Construction
 ; CHECK-NEXT: ReachingDefAnalysis
 ; CHECK-NEXT: ARM Low Overhead Loops pass
+; CHECK-NEXT: Machine Dominance Frontier Construction
+; CHECK-NEXT: ARM fix for Cortex-A57 AES Erratum 1742098
 ; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: Machine Optimization Remark Emitter
 ; CHECK-NEXT: ARM Assembly Printer
diff --git a/llvm/test/CodeGen/ARM/aes-erratum-fix.ll b/llvm/test/CodeGen/ARM/aes-erratum-fix.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/aes-erratum-fix.ll
@@ -0,0 +1,3327 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple armv8---eabi -mattr=+aes,+fix-cortex-a57-aes-1742098 -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=CHECK-FIX,CHECK-FIX-NOSCHED
+; RUN: llc -mtriple armv8---eabi -mattr=+aes,-fix-cortex-a57-aes-1742098 -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=CHECK-NOFIX
+
+; These CPUs should have the fix enabled by default. They use different
+; FileCheck prefixes because some instructions are scheduled differently.
+;
+; RUN: llc -mtriple armv8---eabi -mcpu=cortex-a57 -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=CHECK-FIX,CHECK-CORTEX-FIX
+; RUN: llc -mtriple armv8---eabi -mcpu=cortex-a72 -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=CHECK-FIX,CHECK-CORTEX-FIX
+
+; This checks that adding `+fix-cortex-a57-aes-1742098` causes a `vorr` to be
+; inserted wherever the compiler cannot prove that either input to the first
+; AES instruction in a fused AES pair was set by a 64-bit or 128-bit Neon
+; register write. All other register writes are unsafe and require a `vorr`
+; to protect the AES input.
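+;
+; For example, in `aese_set8_via_ptr` below, the `vld1.8 {d16[0]}` lane load
+; only partially writes the AES input register, so the FIX configurations
+; insert `vorr q8, q8, q8` before the `aese.8`, while CHECK-NOFIX expects no
+; `vorr`.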
+ +declare <16 x i8> @llvm.arm.neon.aese(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.aesmc(<16 x i8>) +declare <16 x i8> @llvm.arm.neon.aesd(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.aesimc(<16 x i8>) + + +define void @aese_once_via_ptr(<16 x i8>* %0, <16 x i8>* %1) nounwind { +; CHECK-FIX-LABEL: aese_once_via_ptr: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-FIX-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-FIX-NEXT: aese.8 q9, q8 +; CHECK-FIX-NEXT: aesmc.8 q8, q9 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aese_once_via_ptr: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-NOFIX-NEXT: aese.8 q9, q8 +; CHECK-NOFIX-NEXT: aesmc.8 q8, q9 +; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-NOFIX-NEXT: bx lr + %3 = load <16 x i8>, <16 x i8>* %1, align 8 + %4 = load <16 x i8>, <16 x i8>* %0, align 8 + %5 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %3, <16 x i8> %4) + %6 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %5) + store <16 x i8> %6, <16 x i8>* %1, align 8 + ret void +} + +define <16 x i8> @aese_once_via_val(<16 x i8> %0, <16 x i8> %1) nounwind { +; CHECK-FIX-NOSCHED-LABEL: aese_once_via_val: +; CHECK-FIX-NOSCHED: @ %bb.0: +; CHECK-FIX-NOSCHED-NEXT: vmov d17, r2, r3 +; CHECK-FIX-NOSCHED-NEXT: mov r12, sp +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d18, d19}, [r12] +; CHECK-FIX-NOSCHED-NEXT: vmov d16, r0, r1 +; CHECK-FIX-NOSCHED-NEXT: aese.8 q9, q8 +; CHECK-FIX-NOSCHED-NEXT: aesmc.8 q8, q9 +; CHECK-FIX-NOSCHED-NEXT: vmov r0, r1, d16 +; CHECK-FIX-NOSCHED-NEXT: vmov r2, r3, d17 +; CHECK-FIX-NOSCHED-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aese_once_via_val: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: vmov d17, r2, r3 +; CHECK-NOFIX-NEXT: mov r12, sp +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r12] +; CHECK-NOFIX-NEXT: vmov d16, r0, r1 +; CHECK-NOFIX-NEXT: aese.8 q9, q8 +; CHECK-NOFIX-NEXT: aesmc.8 q8, q9 +; CHECK-NOFIX-NEXT: vmov r0, r1, d16 +; CHECK-NOFIX-NEXT: vmov r2, r3, d17 +; CHECK-NOFIX-NEXT: bx lr +; +; CHECK-CORTEX-FIX-LABEL: aese_once_via_val: +; CHECK-CORTEX-FIX: @ %bb.0: +; CHECK-CORTEX-FIX-NEXT: vmov d19, r2, r3 +; CHECK-CORTEX-FIX-NEXT: vmov d18, r0, r1 +; CHECK-CORTEX-FIX-NEXT: mov r12, sp +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r12] +; CHECK-CORTEX-FIX-NEXT: aese.8 q8, q9 +; CHECK-CORTEX-FIX-NEXT: aesmc.8 q8, q8 +; CHECK-CORTEX-FIX-NEXT: vmov r0, r1, d16 +; CHECK-CORTEX-FIX-NEXT: vmov r2, r3, d17 +; CHECK-CORTEX-FIX-NEXT: bx lr + %3 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %1, <16 x i8> %0) + %4 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %3) + ret <16 x i8> %4 +} + +define void @aese_twice_via_ptr(<16 x i8>* %0, <16 x i8>* %1) nounwind { +; CHECK-FIX-LABEL: aese_twice_via_ptr: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-FIX-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-FIX-NEXT: aese.8 q9, q8 +; CHECK-FIX-NEXT: aesmc.8 q8, q9 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-FIX-NEXT: aese.8 q8, q9 +; CHECK-FIX-NEXT: aesmc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aese_twice_via_ptr: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-NOFIX-NEXT: aese.8 q9, q8 +; CHECK-NOFIX-NEXT: aesmc.8 q8, q9 +; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, 
[r0] +; CHECK-NOFIX-NEXT: aese.8 q8, q9 +; CHECK-NOFIX-NEXT: aesmc.8 q8, q8 +; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-NOFIX-NEXT: bx lr + %3 = load <16 x i8>, <16 x i8>* %1, align 8 + %4 = load <16 x i8>, <16 x i8>* %0, align 8 + %5 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %3, <16 x i8> %4) + %6 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %5) + store <16 x i8> %6, <16 x i8>* %1, align 8 + %7 = load <16 x i8>, <16 x i8>* %0, align 8 + %8 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %6, <16 x i8> %7) + %9 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %8) + store <16 x i8> %9, <16 x i8>* %1, align 8 + ret void +} + +define <16 x i8> @aese_twice_via_val(<16 x i8> %0, <16 x i8> %1) nounwind { +; CHECK-FIX-NOSCHED-LABEL: aese_twice_via_val: +; CHECK-FIX-NOSCHED: @ %bb.0: +; CHECK-FIX-NOSCHED-NEXT: vmov d17, r2, r3 +; CHECK-FIX-NOSCHED-NEXT: mov r12, sp +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d18, d19}, [r12] +; CHECK-FIX-NOSCHED-NEXT: vmov d16, r0, r1 +; CHECK-FIX-NOSCHED-NEXT: aese.8 q9, q8 +; CHECK-FIX-NOSCHED-NEXT: aesmc.8 q9, q9 +; CHECK-FIX-NOSCHED-NEXT: aese.8 q9, q8 +; CHECK-FIX-NOSCHED-NEXT: aesmc.8 q8, q9 +; CHECK-FIX-NOSCHED-NEXT: vmov r0, r1, d16 +; CHECK-FIX-NOSCHED-NEXT: vmov r2, r3, d17 +; CHECK-FIX-NOSCHED-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aese_twice_via_val: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: vmov d17, r2, r3 +; CHECK-NOFIX-NEXT: mov r12, sp +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r12] +; CHECK-NOFIX-NEXT: vmov d16, r0, r1 +; CHECK-NOFIX-NEXT: aese.8 q9, q8 +; CHECK-NOFIX-NEXT: aesmc.8 q9, q9 +; CHECK-NOFIX-NEXT: aese.8 q9, q8 +; CHECK-NOFIX-NEXT: aesmc.8 q8, q9 +; CHECK-NOFIX-NEXT: vmov r0, r1, d16 +; CHECK-NOFIX-NEXT: vmov r2, r3, d17 +; CHECK-NOFIX-NEXT: bx lr +; +; CHECK-CORTEX-FIX-LABEL: aese_twice_via_val: +; CHECK-CORTEX-FIX: @ %bb.0: +; CHECK-CORTEX-FIX-NEXT: vmov d19, r2, r3 +; CHECK-CORTEX-FIX-NEXT: vmov d18, r0, r1 +; CHECK-CORTEX-FIX-NEXT: mov r12, sp +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r12] +; CHECK-CORTEX-FIX-NEXT: aese.8 q8, q9 +; CHECK-CORTEX-FIX-NEXT: aesmc.8 q8, q8 +; CHECK-CORTEX-FIX-NEXT: aese.8 q8, q9 +; CHECK-CORTEX-FIX-NEXT: aesmc.8 q8, q8 +; CHECK-CORTEX-FIX-NEXT: vmov r0, r1, d16 +; CHECK-CORTEX-FIX-NEXT: vmov r2, r3, d17 +; CHECK-CORTEX-FIX-NEXT: bx lr + %3 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %1, <16 x i8> %0) + %4 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %3) + %5 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %4, <16 x i8> %0) + %6 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %5) + ret <16 x i8> %6 +} + +define void @aese_loop_via_ptr(i32 %0, <16 x i8>* %1, <16 x i8>* %2) nounwind { +; CHECK-FIX-NOSCHED-LABEL: aese_loop_via_ptr: +; CHECK-FIX-NOSCHED: @ %bb.0: +; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 +; CHECK-FIX-NOSCHED-NEXT: bxeq lr +; CHECK-FIX-NOSCHED-NEXT: .LBB4_1: @ =>This Inner Loop Header: Depth=1 +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-FIX-NOSCHED-NEXT: subs r0, r0, #1 +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-FIX-NOSCHED-NEXT: aese.8 q9, q8 +; CHECK-FIX-NOSCHED-NEXT: aesmc.8 q8, q9 +; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NOSCHED-NEXT: bne .LBB4_1 +; CHECK-FIX-NOSCHED-NEXT: @ %bb.2: +; CHECK-FIX-NOSCHED-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aese_loop_via_ptr: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: cmp r0, #0 +; CHECK-NOFIX-NEXT: bxeq lr +; CHECK-NOFIX-NEXT: .LBB4_1: @ =>This Inner Loop Header: Depth=1 +; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NOFIX-NEXT: subs r0, r0, #1 +; CHECK-NOFIX-NEXT: 
vld1.64 {d18, d19}, [r2] +; CHECK-NOFIX-NEXT: aese.8 q9, q8 +; CHECK-NOFIX-NEXT: aesmc.8 q8, q9 +; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-NOFIX-NEXT: bne .LBB4_1 +; CHECK-NOFIX-NEXT: @ %bb.2: +; CHECK-NOFIX-NEXT: bx lr +; +; CHECK-CORTEX-FIX-LABEL: aese_loop_via_ptr: +; CHECK-CORTEX-FIX: @ %bb.0: +; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 +; CHECK-CORTEX-FIX-NEXT: bxeq lr +; CHECK-CORTEX-FIX-NEXT: .LBB4_1: @ =>This Inner Loop Header: Depth=1 +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-CORTEX-FIX-NEXT: subs r0, r0, #1 +; CHECK-CORTEX-FIX-NEXT: aese.8 q9, q8 +; CHECK-CORTEX-FIX-NEXT: aesmc.8 q8, q9 +; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-CORTEX-FIX-NEXT: bne .LBB4_1 +; CHECK-CORTEX-FIX-NEXT: @ %bb.2: +; CHECK-CORTEX-FIX-NEXT: bx lr + %4 = icmp eq i32 %0, 0 + br i1 %4, label %5, label %6 + +5: + ret void + +6: + %7 = phi i32 [ %12, %6 ], [ 0, %3 ] + %8 = load <16 x i8>, <16 x i8>* %2, align 8 + %9 = load <16 x i8>, <16 x i8>* %1, align 8 + %10 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %8, <16 x i8> %9) + %11 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %10) + store <16 x i8> %11, <16 x i8>* %2, align 8 + %12 = add nuw i32 %7, 1 + %13 = icmp eq i32 %12, %0 + br i1 %13, label %5, label %6 +} + +define <16 x i8> @aese_loop_via_val(i32 %0, <16 x i8> %1, <16 x i8> %2) nounwind { +; CHECK-FIX-LABEL: aese_loop_via_val: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: add r1, sp, #8 +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: beq .LBB5_3 +; CHECK-FIX-NEXT: @ %bb.1: +; CHECK-FIX-NEXT: vldr d19, [sp] +; CHECK-FIX-NEXT: vmov d18, r2, r3 +; CHECK-FIX-NEXT: .LBB5_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-FIX-NEXT: vorr q8, q8, q8 +; CHECK-FIX-NEXT: aese.8 q8, q9 +; CHECK-FIX-NEXT: subs r0, r0, #1 +; CHECK-FIX-NEXT: aesmc.8 q8, q8 +; CHECK-FIX-NEXT: bne .LBB5_2 +; CHECK-FIX-NEXT: .LBB5_3: +; CHECK-FIX-NEXT: vmov r0, r1, d16 +; CHECK-FIX-NEXT: vmov r2, r3, d17 +; CHECK-FIX-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aese_loop_via_val: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: add r1, sp, #8 +; CHECK-NOFIX-NEXT: cmp r0, #0 +; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NOFIX-NEXT: beq .LBB5_3 +; CHECK-NOFIX-NEXT: @ %bb.1: +; CHECK-NOFIX-NEXT: vldr d19, [sp] +; CHECK-NOFIX-NEXT: vmov d18, r2, r3 +; CHECK-NOFIX-NEXT: .LBB5_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-NOFIX-NEXT: aese.8 q8, q9 +; CHECK-NOFIX-NEXT: subs r0, r0, #1 +; CHECK-NOFIX-NEXT: aesmc.8 q8, q8 +; CHECK-NOFIX-NEXT: bne .LBB5_2 +; CHECK-NOFIX-NEXT: .LBB5_3: +; CHECK-NOFIX-NEXT: vmov r0, r1, d16 +; CHECK-NOFIX-NEXT: vmov r2, r3, d17 +; CHECK-NOFIX-NEXT: bx lr + %4 = icmp eq i32 %0, 0 + br i1 %4, label %5, label %7 + +5: + %6 = phi <16 x i8> [ %2, %3 ], [ %11, %7 ] + ret <16 x i8> %6 + +7: + %8 = phi i32 [ %12, %7 ], [ 0, %3 ] + %9 = phi <16 x i8> [ %11, %7 ], [ %2, %3 ] + %10 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %9, <16 x i8> %1) + %11 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %10) + %12 = add nuw i32 %8, 1 + %13 = icmp eq i32 %12, %0 + br i1 %13, label %5, label %7 +} + +define void @aese_set8_via_ptr(i8* %0, <16 x i8>* %1, <16 x i8>* %2) nounwind { +; CHECK-FIX-NOSCHED-LABEL: aese_set8_via_ptr: +; CHECK-FIX-NOSCHED: @ %bb.0: +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-FIX-NOSCHED-NEXT: vld1.8 {d16[0]}, [r0] +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-FIX-NOSCHED-NEXT: vorr q8, q8, q8 +; CHECK-FIX-NOSCHED-NEXT: aese.8 q9, q8 +; 
CHECK-FIX-NOSCHED-NEXT: aesmc.8 q8, q9 +; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NOSCHED-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aese_set8_via_ptr: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NOFIX-NEXT: vld1.8 {d16[0]}, [r0] +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-NOFIX-NEXT: aese.8 q9, q8 +; CHECK-NOFIX-NEXT: aesmc.8 q8, q9 +; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-NOFIX-NEXT: bx lr +; +; CHECK-CORTEX-FIX-LABEL: aese_set8_via_ptr: +; CHECK-CORTEX-FIX: @ %bb.0: +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-CORTEX-FIX-NEXT: vld1.8 {d16[0]}, [r0] +; CHECK-CORTEX-FIX-NEXT: vorr q8, q8, q8 +; CHECK-CORTEX-FIX-NEXT: aese.8 q9, q8 +; CHECK-CORTEX-FIX-NEXT: aesmc.8 q8, q9 +; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-CORTEX-FIX-NEXT: bx lr + %4 = load <16 x i8>, <16 x i8>* %2, align 8 + %5 = load i8, i8* %0, align 1 + %6 = load <16 x i8>, <16 x i8>* %1, align 8 + %7 = insertelement <16 x i8> %6, i8 %5, i64 0 + %8 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %4, <16 x i8> %7) + %9 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %8) + store <16 x i8> %9, <16 x i8>* %2, align 8 + ret void +} + +define void @aese_set8_via_val(i8 zeroext %0, <16 x i8>* %1, <16 x i8>* %2) nounwind { +; CHECK-FIX-LABEL: aese_set8_via_val: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-FIX-NEXT: vmov.8 d16[0], r0 +; CHECK-FIX-NEXT: vorr q8, q8, q8 +; CHECK-FIX-NEXT: aese.8 q8, q9 +; CHECK-FIX-NEXT: aesmc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aese_set8_via_val: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-NOFIX-NEXT: vmov.8 d16[0], r0 +; CHECK-NOFIX-NEXT: aese.8 q8, q9 +; CHECK-NOFIX-NEXT: aesmc.8 q8, q8 +; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-NOFIX-NEXT: bx lr + %4 = load <16 x i8>, <16 x i8>* %2, align 8 + %5 = insertelement <16 x i8> %4, i8 %0, i64 0 + %6 = load <16 x i8>, <16 x i8>* %1, align 8 + %7 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %5, <16 x i8> %6) + %8 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %7) + store <16 x i8> %8, <16 x i8>* %2, align 8 + ret void +} + +define void @aese_set8_cond_via_ptr(i1 zeroext %0, i8* %1, <16 x i8>* %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-LABEL: aese_set8_cond_via_ptr: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r3] +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: beq .LBB8_2 +; CHECK-FIX-NEXT: @ %bb.1: +; CHECK-FIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-FIX-NEXT: vld1.8 {d18[0]}, [r1] +; CHECK-FIX-NEXT: b .LBB8_3 +; CHECK-FIX-NEXT: .LBB8_2: +; CHECK-FIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-FIX-NEXT: .LBB8_3: +; CHECK-FIX-NEXT: vorr q9, q9, q9 +; CHECK-FIX-NEXT: aese.8 q8, q9 +; CHECK-FIX-NEXT: aesmc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r3] +; CHECK-FIX-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aese_set8_cond_via_ptr: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r3] +; CHECK-NOFIX-NEXT: cmp r0, #0 +; CHECK-NOFIX-NEXT: beq .LBB8_2 +; CHECK-NOFIX-NEXT: @ %bb.1: +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-NOFIX-NEXT: vld1.8 {d18[0]}, [r1] +; CHECK-NOFIX-NEXT: b .LBB8_3 +; CHECK-NOFIX-NEXT: .LBB8_2: +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-NOFIX-NEXT: .LBB8_3: +; CHECK-NOFIX-NEXT: aese.8 q8, q9 +; 
CHECK-NOFIX-NEXT: aesmc.8 q8, q8 +; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r3] +; CHECK-NOFIX-NEXT: bx lr + %5 = load <16 x i8>, <16 x i8>* %3, align 8 + br i1 %0, label %6, label %10 + +6: + %7 = load i8, i8* %1, align 1 + %8 = load <16 x i8>, <16 x i8>* %2, align 8 + %9 = insertelement <16 x i8> %8, i8 %7, i64 0 + br label %12 + +10: + %11 = load <16 x i8>, <16 x i8>* %2, align 8 + br label %12 + +12: + %13 = phi <16 x i8> [ %9, %6 ], [ %11, %10 ] + %14 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %5, <16 x i8> %13) + %15 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %14) + store <16 x i8> %15, <16 x i8>* %3, align 8 + ret void +} + +define void @aese_set8_cond_via_val(i1 zeroext %0, i8 zeroext %1, <16 x i8>* %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-LABEL: aese_set8_cond_via_val: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r3] +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: beq .LBB9_2 +; CHECK-FIX-NEXT: @ %bb.1: +; CHECK-FIX-NEXT: vmov.8 d16[0], r1 +; CHECK-FIX-NEXT: .LBB9_2: @ %select.end +; CHECK-FIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-FIX-NEXT: vorr q8, q8, q8 +; CHECK-FIX-NEXT: aese.8 q8, q9 +; CHECK-FIX-NEXT: aesmc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r3] +; CHECK-FIX-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aese_set8_cond_via_val: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r3] +; CHECK-NOFIX-NEXT: cmp r0, #0 +; CHECK-NOFIX-NEXT: beq .LBB9_2 +; CHECK-NOFIX-NEXT: @ %bb.1: +; CHECK-NOFIX-NEXT: vmov.8 d16[0], r1 +; CHECK-NOFIX-NEXT: .LBB9_2: @ %select.end +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-NOFIX-NEXT: aese.8 q8, q9 +; CHECK-NOFIX-NEXT: aesmc.8 q8, q8 +; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r3] +; CHECK-NOFIX-NEXT: bx lr + %5 = load <16 x i8>, <16 x i8>* %3, align 8 + %6 = insertelement <16 x i8> %5, i8 %1, i64 0 + %7 = select i1 %0, <16 x i8> %6, <16 x i8> %5 + %8 = load <16 x i8>, <16 x i8>* %2, align 8 + %9 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %7, <16 x i8> %8) + %10 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %9) + store <16 x i8> %10, <16 x i8>* %3, align 8 + ret void +} + +define void @aese_set8_loop_via_ptr(i32 %0, i8* %1, <16 x i8>* %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-NOSCHED-LABEL: aese_set8_loop_via_ptr: +; CHECK-FIX-NOSCHED: @ %bb.0: +; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 +; CHECK-FIX-NOSCHED-NEXT: bxeq lr +; CHECK-FIX-NOSCHED-NEXT: .LBB10_1: +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NOSCHED-NEXT: vld1.8 {d16[0]}, [r1] +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d18, d19}, [r3] +; CHECK-FIX-NOSCHED-NEXT: .LBB10_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-FIX-NOSCHED-NEXT: vorr q9, q9, q9 +; CHECK-FIX-NOSCHED-NEXT: vorr q8, q8, q8 +; CHECK-FIX-NOSCHED-NEXT: aese.8 q9, q8 +; CHECK-FIX-NOSCHED-NEXT: subs r0, r0, #1 +; CHECK-FIX-NOSCHED-NEXT: aesmc.8 q9, q9 +; CHECK-FIX-NOSCHED-NEXT: bne .LBB10_2 +; CHECK-FIX-NOSCHED-NEXT: @ %bb.3: +; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d18, d19}, [r3] +; CHECK-FIX-NOSCHED-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aese_set8_loop_via_ptr: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: cmp r0, #0 +; CHECK-NOFIX-NEXT: bxeq lr +; CHECK-NOFIX-NEXT: .LBB10_1: +; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-NOFIX-NEXT: vld1.8 {d16[0]}, [r1] +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r3] +; CHECK-NOFIX-NEXT: .LBB10_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-NOFIX-NEXT: aese.8 q9, q8 +; CHECK-NOFIX-NEXT: subs r0, r0, #1 +; CHECK-NOFIX-NEXT: aesmc.8 q9, q9 +; CHECK-NOFIX-NEXT: bne .LBB10_2 +; CHECK-NOFIX-NEXT: @ %bb.3: +; 
CHECK-NOFIX-NEXT: vst1.64 {d18, d19}, [r3] +; CHECK-NOFIX-NEXT: bx lr +; +; CHECK-CORTEX-FIX-LABEL: aese_set8_loop_via_ptr: +; CHECK-CORTEX-FIX: @ %bb.0: +; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 +; CHECK-CORTEX-FIX-NEXT: bxeq lr +; CHECK-CORTEX-FIX-NEXT: .LBB10_1: +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d18, d19}, [r3] +; CHECK-CORTEX-FIX-NEXT: vld1.8 {d16[0]}, [r1] +; CHECK-CORTEX-FIX-NEXT: .LBB10_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-CORTEX-FIX-NEXT: vorr q9, q9, q9 +; CHECK-CORTEX-FIX-NEXT: vorr q8, q8, q8 +; CHECK-CORTEX-FIX-NEXT: aese.8 q9, q8 +; CHECK-CORTEX-FIX-NEXT: subs r0, r0, #1 +; CHECK-CORTEX-FIX-NEXT: aesmc.8 q9, q9 +; CHECK-CORTEX-FIX-NEXT: bne .LBB10_2 +; CHECK-CORTEX-FIX-NEXT: @ %bb.3: +; CHECK-CORTEX-FIX-NEXT: vst1.64 {d18, d19}, [r3] +; CHECK-CORTEX-FIX-NEXT: bx lr + %5 = load i8, i8* %1, align 1 + %6 = load <16 x i8>, <16 x i8>* %2, align 8 + %7 = insertelement <16 x i8> %6, i8 %5, i64 0 + %8 = icmp eq i32 %0, 0 + br i1 %8, label %12, label %9 + +9: + %10 = load <16 x i8>, <16 x i8>* %3, align 8 + br label %13 + +11: + store <16 x i8> %17, <16 x i8>* %3, align 8 + br label %12 + +12: + ret void + +13: + %14 = phi <16 x i8> [ %10, %9 ], [ %17, %13 ] + %15 = phi i32 [ 0, %9 ], [ %18, %13 ] + %16 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %14, <16 x i8> %7) + %17 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %16) + %18 = add nuw i32 %15, 1 + %19 = icmp eq i32 %18, %0 + br i1 %19, label %11, label %13 +} + +define void @aese_set8_loop_via_val(i32 %0, i8 zeroext %1, <16 x i8>* %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-NOSCHED-LABEL: aese_set8_loop_via_val: +; CHECK-FIX-NOSCHED: @ %bb.0: +; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 +; CHECK-FIX-NOSCHED-NEXT: bxeq lr +; CHECK-FIX-NOSCHED-NEXT: .LBB11_1: @ =>This Inner Loop Header: Depth=1 +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NOSCHED-NEXT: subs r0, r0, #1 +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d18, d19}, [r3] +; CHECK-FIX-NOSCHED-NEXT: vmov.8 d16[0], r1 +; CHECK-FIX-NOSCHED-NEXT: vorr q8, q8, q8 +; CHECK-FIX-NOSCHED-NEXT: aese.8 q9, q8 +; CHECK-FIX-NOSCHED-NEXT: aesmc.8 q8, q9 +; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d16, d17}, [r3] +; CHECK-FIX-NOSCHED-NEXT: bne .LBB11_1 +; CHECK-FIX-NOSCHED-NEXT: @ %bb.2: +; CHECK-FIX-NOSCHED-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aese_set8_loop_via_val: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: cmp r0, #0 +; CHECK-NOFIX-NEXT: bxeq lr +; CHECK-NOFIX-NEXT: .LBB11_1: @ =>This Inner Loop Header: Depth=1 +; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-NOFIX-NEXT: subs r0, r0, #1 +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r3] +; CHECK-NOFIX-NEXT: vmov.8 d16[0], r1 +; CHECK-NOFIX-NEXT: aese.8 q9, q8 +; CHECK-NOFIX-NEXT: aesmc.8 q8, q9 +; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r3] +; CHECK-NOFIX-NEXT: bne .LBB11_1 +; CHECK-NOFIX-NEXT: @ %bb.2: +; CHECK-NOFIX-NEXT: bx lr +; +; CHECK-CORTEX-FIX-LABEL: aese_set8_loop_via_val: +; CHECK-CORTEX-FIX: @ %bb.0: +; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 +; CHECK-CORTEX-FIX-NEXT: bxeq lr +; CHECK-CORTEX-FIX-NEXT: .LBB11_1: @ =>This Inner Loop Header: Depth=1 +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d18, d19}, [r3] +; CHECK-CORTEX-FIX-NEXT: subs r0, r0, #1 +; CHECK-CORTEX-FIX-NEXT: vmov.8 d16[0], r1 +; CHECK-CORTEX-FIX-NEXT: vorr q8, q8, q8 +; CHECK-CORTEX-FIX-NEXT: aese.8 q9, q8 +; CHECK-CORTEX-FIX-NEXT: aesmc.8 q8, q9 +; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r3] +; CHECK-CORTEX-FIX-NEXT: bne .LBB11_1 +; CHECK-CORTEX-FIX-NEXT: @ %bb.2: 
+; CHECK-CORTEX-FIX-NEXT: bx lr + %5 = icmp eq i32 %0, 0 + br i1 %5, label %6, label %7 + +6: + ret void + +7: + %8 = phi i32 [ %14, %7 ], [ 0, %4 ] + %9 = load <16 x i8>, <16 x i8>* %2, align 8 + %10 = insertelement <16 x i8> %9, i8 %1, i64 0 + %11 = load <16 x i8>, <16 x i8>* %3, align 8 + %12 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %11, <16 x i8> %10) + %13 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %12) + store <16 x i8> %13, <16 x i8>* %3, align 8 + %14 = add nuw i32 %8, 1 + %15 = icmp eq i32 %14, %0 + br i1 %15, label %6, label %7 +} + +define void @aese_set16_via_ptr(i16* %0, <16 x i8>* %1, <16 x i8>* %2) nounwind { +; CHECK-FIX-NOSCHED-LABEL: aese_set16_via_ptr: +; CHECK-FIX-NOSCHED: @ %bb.0: +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-FIX-NOSCHED-NEXT: vld1.16 {d16[0]}, [r0:16] +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-FIX-NOSCHED-NEXT: vorr q8, q8, q8 +; CHECK-FIX-NOSCHED-NEXT: aese.8 q9, q8 +; CHECK-FIX-NOSCHED-NEXT: aesmc.8 q8, q9 +; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NOSCHED-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aese_set16_via_ptr: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NOFIX-NEXT: vld1.16 {d16[0]}, [r0:16] +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-NOFIX-NEXT: aese.8 q9, q8 +; CHECK-NOFIX-NEXT: aesmc.8 q8, q9 +; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-NOFIX-NEXT: bx lr +; +; CHECK-CORTEX-FIX-LABEL: aese_set16_via_ptr: +; CHECK-CORTEX-FIX: @ %bb.0: +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-CORTEX-FIX-NEXT: vld1.16 {d16[0]}, [r0:16] +; CHECK-CORTEX-FIX-NEXT: vorr q8, q8, q8 +; CHECK-CORTEX-FIX-NEXT: aese.8 q9, q8 +; CHECK-CORTEX-FIX-NEXT: aesmc.8 q8, q9 +; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-CORTEX-FIX-NEXT: bx lr + %4 = load <16 x i8>, <16 x i8>* %2, align 8 + %5 = load i16, i16* %0, align 2 + %6 = bitcast <16 x i8>* %1 to <8 x i16>* + %7 = load <8 x i16>, <8 x i16>* %6, align 8 + %8 = insertelement <8 x i16> %7, i16 %5, i64 0 + %9 = bitcast <8 x i16> %8 to <16 x i8> + %10 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %4, <16 x i8> %9) + %11 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %10) + store <16 x i8> %11, <16 x i8>* %2, align 8 + ret void +} + +define void @aese_set16_via_val(i16 zeroext %0, <16 x i8>* %1, <16 x i8>* %2) nounwind { +; CHECK-FIX-LABEL: aese_set16_via_val: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-FIX-NEXT: vmov.16 d16[0], r0 +; CHECK-FIX-NEXT: vorr q8, q8, q8 +; CHECK-FIX-NEXT: aese.8 q8, q9 +; CHECK-FIX-NEXT: aesmc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aese_set16_via_val: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-NOFIX-NEXT: vmov.16 d16[0], r0 +; CHECK-NOFIX-NEXT: aese.8 q8, q9 +; CHECK-NOFIX-NEXT: aesmc.8 q8, q8 +; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-NOFIX-NEXT: bx lr + %4 = bitcast <16 x i8>* %2 to <8 x i16>* + %5 = load <8 x i16>, <8 x i16>* %4, align 8 + %6 = insertelement <8 x i16> %5, i16 %0, i64 0 + %7 = bitcast <8 x i16> %6 to <16 x i8> + %8 = load <16 x i8>, <16 x i8>* %1, align 8 + %9 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %7, <16 x i8> %8) + %10 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %9) + store <16 x i8> %10, <16 x i8>* %2, align 8 + ret void +} + +define 
void @aese_set16_cond_via_ptr(i1 zeroext %0, i16* %1, <16 x i8>* %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-LABEL: aese_set16_cond_via_ptr: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r3] +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: beq .LBB14_2 +; CHECK-FIX-NEXT: @ %bb.1: +; CHECK-FIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-FIX-NEXT: vld1.16 {d18[0]}, [r1:16] +; CHECK-FIX-NEXT: b .LBB14_3 +; CHECK-FIX-NEXT: .LBB14_2: +; CHECK-FIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-FIX-NEXT: .LBB14_3: +; CHECK-FIX-NEXT: vorr q9, q9, q9 +; CHECK-FIX-NEXT: aese.8 q8, q9 +; CHECK-FIX-NEXT: aesmc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r3] +; CHECK-FIX-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aese_set16_cond_via_ptr: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r3] +; CHECK-NOFIX-NEXT: cmp r0, #0 +; CHECK-NOFIX-NEXT: beq .LBB14_2 +; CHECK-NOFIX-NEXT: @ %bb.1: +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-NOFIX-NEXT: vld1.16 {d18[0]}, [r1:16] +; CHECK-NOFIX-NEXT: b .LBB14_3 +; CHECK-NOFIX-NEXT: .LBB14_2: +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-NOFIX-NEXT: .LBB14_3: +; CHECK-NOFIX-NEXT: aese.8 q8, q9 +; CHECK-NOFIX-NEXT: aesmc.8 q8, q8 +; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r3] +; CHECK-NOFIX-NEXT: bx lr + %5 = load <16 x i8>, <16 x i8>* %3, align 8 + br i1 %0, label %6, label %11 + +6: + %7 = load i16, i16* %1, align 2 + %8 = bitcast <16 x i8>* %2 to <8 x i16>* + %9 = load <8 x i16>, <8 x i16>* %8, align 8 + %10 = insertelement <8 x i16> %9, i16 %7, i64 0 + br label %14 + +11: + %12 = bitcast <16 x i8>* %2 to <8 x i16>* + %13 = load <8 x i16>, <8 x i16>* %12, align 8 + br label %14 + +14: + %15 = phi <8 x i16> [ %10, %6 ], [ %13, %11 ] + %16 = bitcast <8 x i16> %15 to <16 x i8> + %17 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %5, <16 x i8> %16) + %18 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %17) + store <16 x i8> %18, <16 x i8>* %3, align 8 + ret void +} + +define void @aese_set16_cond_via_val(i1 zeroext %0, i16 zeroext %1, <16 x i8>* %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-LABEL: aese_set16_cond_via_val: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r3] +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: beq .LBB15_2 +; CHECK-FIX-NEXT: @ %bb.1: +; CHECK-FIX-NEXT: vmov.16 d16[0], r1 +; CHECK-FIX-NEXT: .LBB15_2: @ %select.end +; CHECK-FIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-FIX-NEXT: vorr q8, q8, q8 +; CHECK-FIX-NEXT: aese.8 q8, q9 +; CHECK-FIX-NEXT: aesmc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r3] +; CHECK-FIX-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aese_set16_cond_via_val: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r3] +; CHECK-NOFIX-NEXT: cmp r0, #0 +; CHECK-NOFIX-NEXT: beq .LBB15_2 +; CHECK-NOFIX-NEXT: @ %bb.1: +; CHECK-NOFIX-NEXT: vmov.16 d16[0], r1 +; CHECK-NOFIX-NEXT: .LBB15_2: @ %select.end +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-NOFIX-NEXT: aese.8 q8, q9 +; CHECK-NOFIX-NEXT: aesmc.8 q8, q8 +; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r3] +; CHECK-NOFIX-NEXT: bx lr + %5 = bitcast <16 x i8>* %3 to <8 x i16>* + %6 = load <8 x i16>, <8 x i16>* %5, align 8 + %7 = insertelement <8 x i16> %6, i16 %1, i64 0 + %8 = select i1 %0, <8 x i16> %7, <8 x i16> %6 + %9 = bitcast <8 x i16> %8 to <16 x i8> + %10 = load <16 x i8>, <16 x i8>* %2, align 8 + %11 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %9, <16 x i8> %10) + %12 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %11) + store <16 x i8> %12, <16 x i8>* %3, align 8 + ret void +} + +define void 
@aese_set16_loop_via_ptr(i32 %0, i16* %1, <16 x i8>* %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-NOSCHED-LABEL: aese_set16_loop_via_ptr: +; CHECK-FIX-NOSCHED: @ %bb.0: +; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 +; CHECK-FIX-NOSCHED-NEXT: bxeq lr +; CHECK-FIX-NOSCHED-NEXT: .LBB16_1: +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NOSCHED-NEXT: vld1.16 {d16[0]}, [r1:16] +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d18, d19}, [r3] +; CHECK-FIX-NOSCHED-NEXT: .LBB16_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-FIX-NOSCHED-NEXT: vorr q9, q9, q9 +; CHECK-FIX-NOSCHED-NEXT: vorr q8, q8, q8 +; CHECK-FIX-NOSCHED-NEXT: aese.8 q9, q8 +; CHECK-FIX-NOSCHED-NEXT: subs r0, r0, #1 +; CHECK-FIX-NOSCHED-NEXT: aesmc.8 q9, q9 +; CHECK-FIX-NOSCHED-NEXT: bne .LBB16_2 +; CHECK-FIX-NOSCHED-NEXT: @ %bb.3: +; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d18, d19}, [r3] +; CHECK-FIX-NOSCHED-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aese_set16_loop_via_ptr: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: cmp r0, #0 +; CHECK-NOFIX-NEXT: bxeq lr +; CHECK-NOFIX-NEXT: .LBB16_1: +; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-NOFIX-NEXT: vld1.16 {d16[0]}, [r1:16] +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r3] +; CHECK-NOFIX-NEXT: .LBB16_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-NOFIX-NEXT: aese.8 q9, q8 +; CHECK-NOFIX-NEXT: subs r0, r0, #1 +; CHECK-NOFIX-NEXT: aesmc.8 q9, q9 +; CHECK-NOFIX-NEXT: bne .LBB16_2 +; CHECK-NOFIX-NEXT: @ %bb.3: +; CHECK-NOFIX-NEXT: vst1.64 {d18, d19}, [r3] +; CHECK-NOFIX-NEXT: bx lr +; +; CHECK-CORTEX-FIX-LABEL: aese_set16_loop_via_ptr: +; CHECK-CORTEX-FIX: @ %bb.0: +; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 +; CHECK-CORTEX-FIX-NEXT: bxeq lr +; CHECK-CORTEX-FIX-NEXT: .LBB16_1: +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d18, d19}, [r3] +; CHECK-CORTEX-FIX-NEXT: vld1.16 {d16[0]}, [r1:16] +; CHECK-CORTEX-FIX-NEXT: .LBB16_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-CORTEX-FIX-NEXT: vorr q9, q9, q9 +; CHECK-CORTEX-FIX-NEXT: vorr q8, q8, q8 +; CHECK-CORTEX-FIX-NEXT: aese.8 q9, q8 +; CHECK-CORTEX-FIX-NEXT: subs r0, r0, #1 +; CHECK-CORTEX-FIX-NEXT: aesmc.8 q9, q9 +; CHECK-CORTEX-FIX-NEXT: bne .LBB16_2 +; CHECK-CORTEX-FIX-NEXT: @ %bb.3: +; CHECK-CORTEX-FIX-NEXT: vst1.64 {d18, d19}, [r3] +; CHECK-CORTEX-FIX-NEXT: bx lr + %5 = load i16, i16* %1, align 2 + %6 = bitcast <16 x i8>* %2 to <8 x i16>* + %7 = load <8 x i16>, <8 x i16>* %6, align 8 + %8 = insertelement <8 x i16> %7, i16 %5, i64 0 + %9 = bitcast <8 x i16> %8 to <16 x i8> + %10 = icmp eq i32 %0, 0 + br i1 %10, label %14, label %11 + +11: + %12 = load <16 x i8>, <16 x i8>* %3, align 8 + br label %15 + +13: + store <16 x i8> %19, <16 x i8>* %3, align 8 + br label %14 + +14: + ret void + +15: + %16 = phi <16 x i8> [ %12, %11 ], [ %19, %15 ] + %17 = phi i32 [ 0, %11 ], [ %20, %15 ] + %18 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %16, <16 x i8> %9) + %19 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %18) + %20 = add nuw i32 %17, 1 + %21 = icmp eq i32 %20, %0 + br i1 %21, label %13, label %15 +} + +define void @aese_set16_loop_via_val(i32 %0, i16 zeroext %1, <16 x i8>* %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-NOSCHED-LABEL: aese_set16_loop_via_val: +; CHECK-FIX-NOSCHED: @ %bb.0: +; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 +; CHECK-FIX-NOSCHED-NEXT: bxeq lr +; CHECK-FIX-NOSCHED-NEXT: .LBB17_1: @ =>This Inner Loop Header: Depth=1 +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NOSCHED-NEXT: subs r0, r0, #1 +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d18, d19}, [r3] +; CHECK-FIX-NOSCHED-NEXT: vmov.16 d16[0], 
r1 +; CHECK-FIX-NOSCHED-NEXT: vorr q8, q8, q8 +; CHECK-FIX-NOSCHED-NEXT: aese.8 q9, q8 +; CHECK-FIX-NOSCHED-NEXT: aesmc.8 q8, q9 +; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d16, d17}, [r3] +; CHECK-FIX-NOSCHED-NEXT: bne .LBB17_1 +; CHECK-FIX-NOSCHED-NEXT: @ %bb.2: +; CHECK-FIX-NOSCHED-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aese_set16_loop_via_val: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: cmp r0, #0 +; CHECK-NOFIX-NEXT: bxeq lr +; CHECK-NOFIX-NEXT: .LBB17_1: @ =>This Inner Loop Header: Depth=1 +; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-NOFIX-NEXT: subs r0, r0, #1 +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r3] +; CHECK-NOFIX-NEXT: vmov.16 d16[0], r1 +; CHECK-NOFIX-NEXT: aese.8 q9, q8 +; CHECK-NOFIX-NEXT: aesmc.8 q8, q9 +; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r3] +; CHECK-NOFIX-NEXT: bne .LBB17_1 +; CHECK-NOFIX-NEXT: @ %bb.2: +; CHECK-NOFIX-NEXT: bx lr +; +; CHECK-CORTEX-FIX-LABEL: aese_set16_loop_via_val: +; CHECK-CORTEX-FIX: @ %bb.0: +; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 +; CHECK-CORTEX-FIX-NEXT: bxeq lr +; CHECK-CORTEX-FIX-NEXT: .LBB17_1: @ =>This Inner Loop Header: Depth=1 +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d18, d19}, [r3] +; CHECK-CORTEX-FIX-NEXT: subs r0, r0, #1 +; CHECK-CORTEX-FIX-NEXT: vmov.16 d16[0], r1 +; CHECK-CORTEX-FIX-NEXT: vorr q8, q8, q8 +; CHECK-CORTEX-FIX-NEXT: aese.8 q9, q8 +; CHECK-CORTEX-FIX-NEXT: aesmc.8 q8, q9 +; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r3] +; CHECK-CORTEX-FIX-NEXT: bne .LBB17_1 +; CHECK-CORTEX-FIX-NEXT: @ %bb.2: +; CHECK-CORTEX-FIX-NEXT: bx lr + %5 = bitcast <16 x i8>* %2 to <8 x i16>* + %6 = icmp eq i32 %0, 0 + br i1 %6, label %7, label %8 + +7: + ret void + +8: + %9 = phi i32 [ %16, %8 ], [ 0, %4 ] + %10 = load <8 x i16>, <8 x i16>* %5, align 8 + %11 = insertelement <8 x i16> %10, i16 %1, i64 0 + %12 = bitcast <8 x i16> %11 to <16 x i8> + %13 = load <16 x i8>, <16 x i8>* %3, align 8 + %14 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %13, <16 x i8> %12) + %15 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %14) + store <16 x i8> %15, <16 x i8>* %3, align 8 + %16 = add nuw i32 %9, 1 + %17 = icmp eq i32 %16, %0 + br i1 %17, label %7, label %8 +} + +define void @aese_set32_via_ptr(i32* %0, <16 x i8>* %1, <16 x i8>* %2) nounwind { +; CHECK-FIX-NOSCHED-LABEL: aese_set32_via_ptr: +; CHECK-FIX-NOSCHED: @ %bb.0: +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-FIX-NOSCHED-NEXT: vld1.32 {d16[0]}, [r0:32] +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-FIX-NOSCHED-NEXT: vorr q8, q8, q8 +; CHECK-FIX-NOSCHED-NEXT: aese.8 q9, q8 +; CHECK-FIX-NOSCHED-NEXT: aesmc.8 q8, q9 +; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NOSCHED-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aese_set32_via_ptr: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NOFIX-NEXT: vld1.32 {d16[0]}, [r0:32] +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-NOFIX-NEXT: aese.8 q9, q8 +; CHECK-NOFIX-NEXT: aesmc.8 q8, q9 +; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-NOFIX-NEXT: bx lr +; +; CHECK-CORTEX-FIX-LABEL: aese_set32_via_ptr: +; CHECK-CORTEX-FIX: @ %bb.0: +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-CORTEX-FIX-NEXT: vld1.32 {d16[0]}, [r0:32] +; CHECK-CORTEX-FIX-NEXT: vorr q8, q8, q8 +; CHECK-CORTEX-FIX-NEXT: aese.8 q9, q8 +; CHECK-CORTEX-FIX-NEXT: aesmc.8 q8, q9 +; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-CORTEX-FIX-NEXT: bx lr + %4 = load <16 x i8>, <16 x i8>* %2, align 8 + 
%5 = load i32, i32* %0, align 4 + %6 = bitcast <16 x i8>* %1 to <4 x i32>* + %7 = load <4 x i32>, <4 x i32>* %6, align 8 + %8 = insertelement <4 x i32> %7, i32 %5, i64 0 + %9 = bitcast <4 x i32> %8 to <16 x i8> + %10 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %4, <16 x i8> %9) + %11 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %10) + store <16 x i8> %11, <16 x i8>* %2, align 8 + ret void +} + +define void @aese_set32_via_val(i32 %0, <16 x i8>* %1, <16 x i8>* %2) nounwind { +; CHECK-FIX-LABEL: aese_set32_via_val: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-FIX-NEXT: vmov.32 d16[0], r0 +; CHECK-FIX-NEXT: vorr q8, q8, q8 +; CHECK-FIX-NEXT: aese.8 q8, q9 +; CHECK-FIX-NEXT: aesmc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aese_set32_via_val: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-NOFIX-NEXT: vmov.32 d16[0], r0 +; CHECK-NOFIX-NEXT: aese.8 q8, q9 +; CHECK-NOFIX-NEXT: aesmc.8 q8, q8 +; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-NOFIX-NEXT: bx lr + %4 = bitcast <16 x i8>* %2 to <4 x i32>* + %5 = load <4 x i32>, <4 x i32>* %4, align 8 + %6 = insertelement <4 x i32> %5, i32 %0, i64 0 + %7 = bitcast <4 x i32> %6 to <16 x i8> + %8 = load <16 x i8>, <16 x i8>* %1, align 8 + %9 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %7, <16 x i8> %8) + %10 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %9) + store <16 x i8> %10, <16 x i8>* %2, align 8 + ret void +} + +define void @aese_set32_cond_via_ptr(i1 zeroext %0, i32* %1, <16 x i8>* %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-LABEL: aese_set32_cond_via_ptr: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r3] +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: beq .LBB20_2 +; CHECK-FIX-NEXT: @ %bb.1: +; CHECK-FIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-FIX-NEXT: vld1.32 {d18[0]}, [r1:32] +; CHECK-FIX-NEXT: b .LBB20_3 +; CHECK-FIX-NEXT: .LBB20_2: +; CHECK-FIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-FIX-NEXT: .LBB20_3: +; CHECK-FIX-NEXT: vorr q9, q9, q9 +; CHECK-FIX-NEXT: aese.8 q8, q9 +; CHECK-FIX-NEXT: aesmc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r3] +; CHECK-FIX-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aese_set32_cond_via_ptr: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r3] +; CHECK-NOFIX-NEXT: cmp r0, #0 +; CHECK-NOFIX-NEXT: beq .LBB20_2 +; CHECK-NOFIX-NEXT: @ %bb.1: +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-NOFIX-NEXT: vld1.32 {d18[0]}, [r1:32] +; CHECK-NOFIX-NEXT: b .LBB20_3 +; CHECK-NOFIX-NEXT: .LBB20_2: +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-NOFIX-NEXT: .LBB20_3: +; CHECK-NOFIX-NEXT: aese.8 q8, q9 +; CHECK-NOFIX-NEXT: aesmc.8 q8, q8 +; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r3] +; CHECK-NOFIX-NEXT: bx lr + %5 = load <16 x i8>, <16 x i8>* %3, align 8 + br i1 %0, label %6, label %11 + +6: + %7 = load i32, i32* %1, align 4 + %8 = bitcast <16 x i8>* %2 to <4 x i32>* + %9 = load <4 x i32>, <4 x i32>* %8, align 8 + %10 = insertelement <4 x i32> %9, i32 %7, i64 0 + br label %14 + +11: + %12 = bitcast <16 x i8>* %2 to <4 x i32>* + %13 = load <4 x i32>, <4 x i32>* %12, align 8 + br label %14 + +14: + %15 = phi <4 x i32> [ %10, %6 ], [ %13, %11 ] + %16 = bitcast <4 x i32> %15 to <16 x i8> + %17 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %5, <16 x i8> %16) + %18 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %17) + store <16 x i8> %18, <16 x i8>* %3, 
align 8 + ret void +} + +define void @aese_set32_cond_via_val(i1 zeroext %0, i32 %1, <16 x i8>* %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-LABEL: aese_set32_cond_via_val: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r3] +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: beq .LBB21_2 +; CHECK-FIX-NEXT: @ %bb.1: +; CHECK-FIX-NEXT: vmov.32 d16[0], r1 +; CHECK-FIX-NEXT: .LBB21_2: @ %select.end +; CHECK-FIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-FIX-NEXT: vorr q8, q8, q8 +; CHECK-FIX-NEXT: aese.8 q8, q9 +; CHECK-FIX-NEXT: aesmc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r3] +; CHECK-FIX-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aese_set32_cond_via_val: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r3] +; CHECK-NOFIX-NEXT: cmp r0, #0 +; CHECK-NOFIX-NEXT: beq .LBB21_2 +; CHECK-NOFIX-NEXT: @ %bb.1: +; CHECK-NOFIX-NEXT: vmov.32 d16[0], r1 +; CHECK-NOFIX-NEXT: .LBB21_2: @ %select.end +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-NOFIX-NEXT: aese.8 q8, q9 +; CHECK-NOFIX-NEXT: aesmc.8 q8, q8 +; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r3] +; CHECK-NOFIX-NEXT: bx lr + %5 = bitcast <16 x i8>* %3 to <4 x i32>* + %6 = load <4 x i32>, <4 x i32>* %5, align 8 + %7 = insertelement <4 x i32> %6, i32 %1, i64 0 + %8 = select i1 %0, <4 x i32> %7, <4 x i32> %6 + %9 = bitcast <4 x i32> %8 to <16 x i8> + %10 = load <16 x i8>, <16 x i8>* %2, align 8 + %11 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %9, <16 x i8> %10) + %12 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %11) + store <16 x i8> %12, <16 x i8>* %3, align 8 + ret void +} + +define void @aese_set32_loop_via_ptr(i32 %0, i32* %1, <16 x i8>* %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-NOSCHED-LABEL: aese_set32_loop_via_ptr: +; CHECK-FIX-NOSCHED: @ %bb.0: +; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 +; CHECK-FIX-NOSCHED-NEXT: bxeq lr +; CHECK-FIX-NOSCHED-NEXT: .LBB22_1: +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NOSCHED-NEXT: vld1.32 {d16[0]}, [r1:32] +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d18, d19}, [r3] +; CHECK-FIX-NOSCHED-NEXT: .LBB22_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-FIX-NOSCHED-NEXT: vorr q9, q9, q9 +; CHECK-FIX-NOSCHED-NEXT: vorr q8, q8, q8 +; CHECK-FIX-NOSCHED-NEXT: aese.8 q9, q8 +; CHECK-FIX-NOSCHED-NEXT: subs r0, r0, #1 +; CHECK-FIX-NOSCHED-NEXT: aesmc.8 q9, q9 +; CHECK-FIX-NOSCHED-NEXT: bne .LBB22_2 +; CHECK-FIX-NOSCHED-NEXT: @ %bb.3: +; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d18, d19}, [r3] +; CHECK-FIX-NOSCHED-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aese_set32_loop_via_ptr: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: cmp r0, #0 +; CHECK-NOFIX-NEXT: bxeq lr +; CHECK-NOFIX-NEXT: .LBB22_1: +; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-NOFIX-NEXT: vld1.32 {d16[0]}, [r1:32] +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r3] +; CHECK-NOFIX-NEXT: .LBB22_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-NOFIX-NEXT: aese.8 q9, q8 +; CHECK-NOFIX-NEXT: subs r0, r0, #1 +; CHECK-NOFIX-NEXT: aesmc.8 q9, q9 +; CHECK-NOFIX-NEXT: bne .LBB22_2 +; CHECK-NOFIX-NEXT: @ %bb.3: +; CHECK-NOFIX-NEXT: vst1.64 {d18, d19}, [r3] +; CHECK-NOFIX-NEXT: bx lr +; +; CHECK-CORTEX-FIX-LABEL: aese_set32_loop_via_ptr: +; CHECK-CORTEX-FIX: @ %bb.0: +; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 +; CHECK-CORTEX-FIX-NEXT: bxeq lr +; CHECK-CORTEX-FIX-NEXT: .LBB22_1: +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d18, d19}, [r3] +; CHECK-CORTEX-FIX-NEXT: vld1.32 {d16[0]}, [r1:32] +; CHECK-CORTEX-FIX-NEXT: .LBB22_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-CORTEX-FIX-NEXT: vorr q9, q9, q9 
+; CHECK-CORTEX-FIX-NEXT: vorr q8, q8, q8 +; CHECK-CORTEX-FIX-NEXT: aese.8 q9, q8 +; CHECK-CORTEX-FIX-NEXT: subs r0, r0, #1 +; CHECK-CORTEX-FIX-NEXT: aesmc.8 q9, q9 +; CHECK-CORTEX-FIX-NEXT: bne .LBB22_2 +; CHECK-CORTEX-FIX-NEXT: @ %bb.3: +; CHECK-CORTEX-FIX-NEXT: vst1.64 {d18, d19}, [r3] +; CHECK-CORTEX-FIX-NEXT: bx lr + %5 = load i32, i32* %1, align 4 + %6 = bitcast <16 x i8>* %2 to <4 x i32>* + %7 = load <4 x i32>, <4 x i32>* %6, align 8 + %8 = insertelement <4 x i32> %7, i32 %5, i64 0 + %9 = bitcast <4 x i32> %8 to <16 x i8> + %10 = icmp eq i32 %0, 0 + br i1 %10, label %14, label %11 + +11: + %12 = load <16 x i8>, <16 x i8>* %3, align 8 + br label %15 + +13: + store <16 x i8> %19, <16 x i8>* %3, align 8 + br label %14 + +14: + ret void + +15: + %16 = phi <16 x i8> [ %12, %11 ], [ %19, %15 ] + %17 = phi i32 [ 0, %11 ], [ %20, %15 ] + %18 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %16, <16 x i8> %9) + %19 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %18) + %20 = add nuw i32 %17, 1 + %21 = icmp eq i32 %20, %0 + br i1 %21, label %13, label %15 +} + +define void @aese_set32_loop_via_val(i32 %0, i32 %1, <16 x i8>* %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-NOSCHED-LABEL: aese_set32_loop_via_val: +; CHECK-FIX-NOSCHED: @ %bb.0: +; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 +; CHECK-FIX-NOSCHED-NEXT: bxeq lr +; CHECK-FIX-NOSCHED-NEXT: .LBB23_1: @ =>This Inner Loop Header: Depth=1 +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NOSCHED-NEXT: subs r0, r0, #1 +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d18, d19}, [r3] +; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[0], r1 +; CHECK-FIX-NOSCHED-NEXT: vorr q8, q8, q8 +; CHECK-FIX-NOSCHED-NEXT: aese.8 q9, q8 +; CHECK-FIX-NOSCHED-NEXT: aesmc.8 q8, q9 +; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d16, d17}, [r3] +; CHECK-FIX-NOSCHED-NEXT: bne .LBB23_1 +; CHECK-FIX-NOSCHED-NEXT: @ %bb.2: +; CHECK-FIX-NOSCHED-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aese_set32_loop_via_val: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: cmp r0, #0 +; CHECK-NOFIX-NEXT: bxeq lr +; CHECK-NOFIX-NEXT: .LBB23_1: @ =>This Inner Loop Header: Depth=1 +; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-NOFIX-NEXT: subs r0, r0, #1 +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r3] +; CHECK-NOFIX-NEXT: vmov.32 d16[0], r1 +; CHECK-NOFIX-NEXT: aese.8 q9, q8 +; CHECK-NOFIX-NEXT: aesmc.8 q8, q9 +; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r3] +; CHECK-NOFIX-NEXT: bne .LBB23_1 +; CHECK-NOFIX-NEXT: @ %bb.2: +; CHECK-NOFIX-NEXT: bx lr +; +; CHECK-CORTEX-FIX-LABEL: aese_set32_loop_via_val: +; CHECK-CORTEX-FIX: @ %bb.0: +; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 +; CHECK-CORTEX-FIX-NEXT: bxeq lr +; CHECK-CORTEX-FIX-NEXT: .LBB23_1: @ =>This Inner Loop Header: Depth=1 +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d18, d19}, [r3] +; CHECK-CORTEX-FIX-NEXT: subs r0, r0, #1 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[0], r1 +; CHECK-CORTEX-FIX-NEXT: vorr q8, q8, q8 +; CHECK-CORTEX-FIX-NEXT: aese.8 q9, q8 +; CHECK-CORTEX-FIX-NEXT: aesmc.8 q8, q9 +; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r3] +; CHECK-CORTEX-FIX-NEXT: bne .LBB23_1 +; CHECK-CORTEX-FIX-NEXT: @ %bb.2: +; CHECK-CORTEX-FIX-NEXT: bx lr + %5 = bitcast <16 x i8>* %2 to <4 x i32>* + %6 = icmp eq i32 %0, 0 + br i1 %6, label %7, label %8 + +7: + ret void + +8: + %9 = phi i32 [ %16, %8 ], [ 0, %4 ] + %10 = load <4 x i32>, <4 x i32>* %5, align 8 + %11 = insertelement <4 x i32> %10, i32 %1, i64 0 + %12 = bitcast <4 x i32> %11 to <16 x i8> + %13 = load <16 x i8>, <16 x i8>* %3, align 8 + %14 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> 
%13, <16 x i8> %12) + %15 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %14) + store <16 x i8> %15, <16 x i8>* %3, align 8 + %16 = add nuw i32 %9, 1 + %17 = icmp eq i32 %16, %0 + br i1 %17, label %7, label %8 +} + +define void @aese_set64_via_ptr(i64* %0, <16 x i8>* %1, <16 x i8>* %2) nounwind { +; CHECK-FIX-NOSCHED-LABEL: aese_set64_via_ptr: +; CHECK-FIX-NOSCHED: @ %bb.0: +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-FIX-NOSCHED-NEXT: vldr d16, [r0] +; CHECK-FIX-NOSCHED-NEXT: vorr q8, q8, q8 +; CHECK-FIX-NOSCHED-NEXT: aese.8 q9, q8 +; CHECK-FIX-NOSCHED-NEXT: aesmc.8 q8, q9 +; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NOSCHED-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aese_set64_via_ptr: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-NOFIX-NEXT: vldr d16, [r0] +; CHECK-NOFIX-NEXT: aese.8 q9, q8 +; CHECK-NOFIX-NEXT: aesmc.8 q8, q9 +; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-NOFIX-NEXT: bx lr +; +; CHECK-CORTEX-FIX-LABEL: aese_set64_via_ptr: +; CHECK-CORTEX-FIX: @ %bb.0: +; CHECK-CORTEX-FIX-NEXT: vldr d16, [r0] +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d20, d21}, [r2] +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-CORTEX-FIX-NEXT: vorr d18, d16, d16 +; CHECK-CORTEX-FIX-NEXT: aese.8 q10, q9 +; CHECK-CORTEX-FIX-NEXT: aesmc.8 q8, q10 +; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-CORTEX-FIX-NEXT: bx lr + %4 = load <16 x i8>, <16 x i8>* %2, align 8 + %5 = load i64, i64* %0, align 8 + %6 = bitcast <16 x i8>* %1 to <2 x i64>* + %7 = load <2 x i64>, <2 x i64>* %6, align 8 + %8 = insertelement <2 x i64> %7, i64 %5, i64 0 + %9 = bitcast <2 x i64> %8 to <16 x i8> + %10 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %4, <16 x i8> %9) + %11 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %10) + store <16 x i8> %11, <16 x i8>* %2, align 8 + ret void +} + +define void @aese_set64_via_val(i64 %0, <16 x i8>* %1, <16 x i8>* %2) nounwind { +; CHECK-FIX-NOSCHED-LABEL: aese_set64_via_val: +; CHECK-FIX-NOSCHED: @ %bb.0: +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r3] +; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[0], r0 +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[1], r1 +; CHECK-FIX-NOSCHED-NEXT: vorr q8, q8, q8 +; CHECK-FIX-NOSCHED-NEXT: aese.8 q8, q9 +; CHECK-FIX-NOSCHED-NEXT: aesmc.8 q8, q8 +; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d16, d17}, [r3] +; CHECK-FIX-NOSCHED-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aese_set64_via_val: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r3] +; CHECK-NOFIX-NEXT: vmov.32 d16[0], r0 +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-NOFIX-NEXT: vmov.32 d16[1], r1 +; CHECK-NOFIX-NEXT: aese.8 q8, q9 +; CHECK-NOFIX-NEXT: aesmc.8 q8, q8 +; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r3] +; CHECK-NOFIX-NEXT: bx lr +; +; CHECK-CORTEX-FIX-LABEL: aese_set64_via_val: +; CHECK-CORTEX-FIX: @ %bb.0: +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r3] +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[0], r0 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[1], r1 +; CHECK-CORTEX-FIX-NEXT: vorr q8, q8, q8 +; CHECK-CORTEX-FIX-NEXT: aese.8 q8, q9 +; CHECK-CORTEX-FIX-NEXT: aesmc.8 q8, q8 +; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r3] +; CHECK-CORTEX-FIX-NEXT: bx lr + %4 = bitcast <16 x i8>* %2 to <2 x i64>* + %5 = load <2 x i64>, <2 x i64>* %4, align 8 + %6 = insertelement <2 x i64> %5, i64 %0, i64 0 + %7 = bitcast <2 x 
i64> %6 to <16 x i8> + %8 = load <16 x i8>, <16 x i8>* %1, align 8 + %9 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %7, <16 x i8> %8) + %10 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %9) + store <16 x i8> %10, <16 x i8>* %2, align 8 + ret void +} + +define void @aese_set64_cond_via_ptr(i1 zeroext %0, i64* %1, <16 x i8>* %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-NOSCHED-LABEL: aese_set64_cond_via_ptr: +; CHECK-FIX-NOSCHED: @ %bb.0: +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r3] +; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 +; CHECK-FIX-NOSCHED-NEXT: beq .LBB26_2 +; CHECK-FIX-NOSCHED-NEXT: @ %bb.1: +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-FIX-NOSCHED-NEXT: vldr d18, [r1] +; CHECK-FIX-NOSCHED-NEXT: b .LBB26_3 +; CHECK-FIX-NOSCHED-NEXT: .LBB26_2: +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-FIX-NOSCHED-NEXT: .LBB26_3: +; CHECK-FIX-NOSCHED-NEXT: vorr q9, q9, q9 +; CHECK-FIX-NOSCHED-NEXT: aese.8 q8, q9 +; CHECK-FIX-NOSCHED-NEXT: aesmc.8 q8, q8 +; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d16, d17}, [r3] +; CHECK-FIX-NOSCHED-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aese_set64_cond_via_ptr: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r3] +; CHECK-NOFIX-NEXT: cmp r0, #0 +; CHECK-NOFIX-NEXT: beq .LBB26_2 +; CHECK-NOFIX-NEXT: @ %bb.1: +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-NOFIX-NEXT: vldr d18, [r1] +; CHECK-NOFIX-NEXT: b .LBB26_3 +; CHECK-NOFIX-NEXT: .LBB26_2: +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-NOFIX-NEXT: .LBB26_3: +; CHECK-NOFIX-NEXT: aese.8 q8, q9 +; CHECK-NOFIX-NEXT: aesmc.8 q8, q8 +; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r3] +; CHECK-NOFIX-NEXT: bx lr +; +; CHECK-CORTEX-FIX-LABEL: aese_set64_cond_via_ptr: +; CHECK-CORTEX-FIX: @ %bb.0: +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r3] +; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 +; CHECK-CORTEX-FIX-NEXT: beq .LBB26_2 +; CHECK-CORTEX-FIX-NEXT: @ %bb.1: +; CHECK-CORTEX-FIX-NEXT: vldr d20, [r1] +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-CORTEX-FIX-NEXT: vorr d18, d20, d20 +; CHECK-CORTEX-FIX-NEXT: b .LBB26_3 +; CHECK-CORTEX-FIX-NEXT: .LBB26_2: +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-CORTEX-FIX-NEXT: .LBB26_3: +; CHECK-CORTEX-FIX-NEXT: vorr q9, q9, q9 +; CHECK-CORTEX-FIX-NEXT: aese.8 q8, q9 +; CHECK-CORTEX-FIX-NEXT: aesmc.8 q8, q8 +; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r3] +; CHECK-CORTEX-FIX-NEXT: bx lr + %5 = load <16 x i8>, <16 x i8>* %3, align 8 + br i1 %0, label %6, label %11 + +6: + %7 = load i64, i64* %1, align 8 + %8 = bitcast <16 x i8>* %2 to <2 x i64>* + %9 = load <2 x i64>, <2 x i64>* %8, align 8 + %10 = insertelement <2 x i64> %9, i64 %7, i64 0 + br label %14 + +11: + %12 = bitcast <16 x i8>* %2 to <2 x i64>* + %13 = load <2 x i64>, <2 x i64>* %12, align 8 + br label %14 + +14: + %15 = phi <2 x i64> [ %10, %6 ], [ %13, %11 ] + %16 = bitcast <2 x i64> %15 to <16 x i8> + %17 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %5, <16 x i8> %16) + %18 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %17) + store <16 x i8> %18, <16 x i8>* %3, align 8 + ret void +} + +define void @aese_set64_cond_via_val(i1 zeroext %0, i64 %1, <16 x i8>* %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-NOSCHED-LABEL: aese_set64_cond_via_val: +; CHECK-FIX-NOSCHED: @ %bb.0: +; CHECK-FIX-NOSCHED-NEXT: ldr r1, [sp, #4] +; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 +; CHECK-FIX-NOSCHED-NEXT: ldr r12, [sp] +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-FIX-NOSCHED-NEXT: beq .LBB27_2 +; CHECK-FIX-NOSCHED-NEXT: @ %bb.1: +; CHECK-FIX-NOSCHED-NEXT: 
vmov.32 d16[0], r2 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[1], r3 +; CHECK-FIX-NOSCHED-NEXT: .LBB27_2: @ %select.end +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d18, d19}, [r12] +; CHECK-FIX-NOSCHED-NEXT: vorr q8, q8, q8 +; CHECK-FIX-NOSCHED-NEXT: aese.8 q8, q9 +; CHECK-FIX-NOSCHED-NEXT: aesmc.8 q8, q8 +; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-FIX-NOSCHED-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aese_set64_cond_via_val: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: ldr r1, [sp, #4] +; CHECK-NOFIX-NEXT: cmp r0, #0 +; CHECK-NOFIX-NEXT: ldr r12, [sp] +; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NOFIX-NEXT: beq .LBB27_2 +; CHECK-NOFIX-NEXT: @ %bb.1: +; CHECK-NOFIX-NEXT: vmov.32 d16[0], r2 +; CHECK-NOFIX-NEXT: vmov.32 d16[1], r3 +; CHECK-NOFIX-NEXT: .LBB27_2: @ %select.end +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r12] +; CHECK-NOFIX-NEXT: aese.8 q8, q9 +; CHECK-NOFIX-NEXT: aesmc.8 q8, q8 +; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-NOFIX-NEXT: bx lr +; +; CHECK-CORTEX-FIX-LABEL: aese_set64_cond_via_val: +; CHECK-CORTEX-FIX: @ %bb.0: +; CHECK-CORTEX-FIX-NEXT: ldr r1, [sp, #4] +; CHECK-CORTEX-FIX-NEXT: ldr r12, [sp] +; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-CORTEX-FIX-NEXT: beq .LBB27_2 +; CHECK-CORTEX-FIX-NEXT: @ %bb.1: +; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[0], r2 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[1], r3 +; CHECK-CORTEX-FIX-NEXT: .LBB27_2: @ %select.end +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d18, d19}, [r12] +; CHECK-CORTEX-FIX-NEXT: vorr q8, q8, q8 +; CHECK-CORTEX-FIX-NEXT: aese.8 q8, q9 +; CHECK-CORTEX-FIX-NEXT: aesmc.8 q8, q8 +; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-CORTEX-FIX-NEXT: bx lr + %5 = bitcast <16 x i8>* %3 to <2 x i64>* + %6 = load <2 x i64>, <2 x i64>* %5, align 8 + %7 = insertelement <2 x i64> %6, i64 %1, i64 0 + %8 = select i1 %0, <2 x i64> %7, <2 x i64> %6 + %9 = bitcast <2 x i64> %8 to <16 x i8> + %10 = load <16 x i8>, <16 x i8>* %2, align 8 + %11 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %9, <16 x i8> %10) + %12 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %11) + store <16 x i8> %12, <16 x i8>* %3, align 8 + ret void +} + +define void @aese_set64_loop_via_ptr(i32 %0, i64* %1, <16 x i8>* %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-NOSCHED-LABEL: aese_set64_loop_via_ptr: +; CHECK-FIX-NOSCHED: @ %bb.0: +; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 +; CHECK-FIX-NOSCHED-NEXT: bxeq lr +; CHECK-FIX-NOSCHED-NEXT: .LBB28_1: +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NOSCHED-NEXT: vldr d16, [r1] +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d18, d19}, [r3] +; CHECK-FIX-NOSCHED-NEXT: .LBB28_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-FIX-NOSCHED-NEXT: vorr q9, q9, q9 +; CHECK-FIX-NOSCHED-NEXT: vorr q8, q8, q8 +; CHECK-FIX-NOSCHED-NEXT: aese.8 q9, q8 +; CHECK-FIX-NOSCHED-NEXT: subs r0, r0, #1 +; CHECK-FIX-NOSCHED-NEXT: aesmc.8 q9, q9 +; CHECK-FIX-NOSCHED-NEXT: bne .LBB28_2 +; CHECK-FIX-NOSCHED-NEXT: @ %bb.3: +; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d18, d19}, [r3] +; CHECK-FIX-NOSCHED-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aese_set64_loop_via_ptr: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: cmp r0, #0 +; CHECK-NOFIX-NEXT: bxeq lr +; CHECK-NOFIX-NEXT: .LBB28_1: +; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-NOFIX-NEXT: vldr d16, [r1] +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r3] +; CHECK-NOFIX-NEXT: .LBB28_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-NOFIX-NEXT: aese.8 q9, q8 +; CHECK-NOFIX-NEXT: subs r0, r0, #1 +; CHECK-NOFIX-NEXT: aesmc.8 q9, q9 +; CHECK-NOFIX-NEXT: 
bne .LBB28_2 +; CHECK-NOFIX-NEXT: @ %bb.3: +; CHECK-NOFIX-NEXT: vst1.64 {d18, d19}, [r3] +; CHECK-NOFIX-NEXT: bx lr +; +; CHECK-CORTEX-FIX-LABEL: aese_set64_loop_via_ptr: +; CHECK-CORTEX-FIX: @ %bb.0: +; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 +; CHECK-CORTEX-FIX-NEXT: bxeq lr +; CHECK-CORTEX-FIX-NEXT: .LBB28_1: +; CHECK-CORTEX-FIX-NEXT: vldr d18, [r1] +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-CORTEX-FIX-NEXT: vorr d16, d18, d18 +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d18, d19}, [r3] +; CHECK-CORTEX-FIX-NEXT: .LBB28_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-CORTEX-FIX-NEXT: vorr q9, q9, q9 +; CHECK-CORTEX-FIX-NEXT: aese.8 q9, q8 +; CHECK-CORTEX-FIX-NEXT: subs r0, r0, #1 +; CHECK-CORTEX-FIX-NEXT: aesmc.8 q9, q9 +; CHECK-CORTEX-FIX-NEXT: bne .LBB28_2 +; CHECK-CORTEX-FIX-NEXT: @ %bb.3: +; CHECK-CORTEX-FIX-NEXT: vst1.64 {d18, d19}, [r3] +; CHECK-CORTEX-FIX-NEXT: bx lr + %5 = load i64, i64* %1, align 8 + %6 = bitcast <16 x i8>* %2 to <2 x i64>* + %7 = load <2 x i64>, <2 x i64>* %6, align 8 + %8 = insertelement <2 x i64> %7, i64 %5, i64 0 + %9 = bitcast <2 x i64> %8 to <16 x i8> + %10 = icmp eq i32 %0, 0 + br i1 %10, label %14, label %11 + +11: + %12 = load <16 x i8>, <16 x i8>* %3, align 8 + br label %15 + +13: + store <16 x i8> %19, <16 x i8>* %3, align 8 + br label %14 + +14: + ret void + +15: + %16 = phi <16 x i8> [ %12, %11 ], [ %19, %15 ] + %17 = phi i32 [ 0, %11 ], [ %20, %15 ] + %18 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %16, <16 x i8> %9) + %19 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %18) + %20 = add nuw i32 %17, 1 + %21 = icmp eq i32 %20, %0 + br i1 %21, label %13, label %15 +} + +define void @aese_set64_loop_via_val(i32 %0, i64 %1, <16 x i8>* %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-NOSCHED-LABEL: aese_set64_loop_via_val: +; CHECK-FIX-NOSCHED: @ %bb.0: +; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 +; CHECK-FIX-NOSCHED-NEXT: bxeq lr +; CHECK-FIX-NOSCHED-NEXT: .LBB29_1: +; CHECK-FIX-NOSCHED-NEXT: ldr r1, [sp, #4] +; CHECK-FIX-NOSCHED-NEXT: ldr r12, [sp] +; CHECK-FIX-NOSCHED-NEXT: .LBB29_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r12] +; CHECK-FIX-NOSCHED-NEXT: subs r0, r0, #1 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[0], r2 +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[1], r3 +; CHECK-FIX-NOSCHED-NEXT: vorr q8, q8, q8 +; CHECK-FIX-NOSCHED-NEXT: aese.8 q9, q8 +; CHECK-FIX-NOSCHED-NEXT: aesmc.8 q8, q9 +; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-FIX-NOSCHED-NEXT: bne .LBB29_2 +; CHECK-FIX-NOSCHED-NEXT: @ %bb.3: +; CHECK-FIX-NOSCHED-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aese_set64_loop_via_val: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: cmp r0, #0 +; CHECK-NOFIX-NEXT: bxeq lr +; CHECK-NOFIX-NEXT: .LBB29_1: +; CHECK-NOFIX-NEXT: ldr r1, [sp, #4] +; CHECK-NOFIX-NEXT: ldr r12, [sp] +; CHECK-NOFIX-NEXT: .LBB29_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r12] +; CHECK-NOFIX-NEXT: subs r0, r0, #1 +; CHECK-NOFIX-NEXT: vmov.32 d16[0], r2 +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-NOFIX-NEXT: vmov.32 d16[1], r3 +; CHECK-NOFIX-NEXT: aese.8 q9, q8 +; CHECK-NOFIX-NEXT: aesmc.8 q8, q9 +; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-NOFIX-NEXT: bne .LBB29_2 +; CHECK-NOFIX-NEXT: @ %bb.3: +; CHECK-NOFIX-NEXT: bx lr +; +; CHECK-CORTEX-FIX-LABEL: aese_set64_loop_via_val: +; CHECK-CORTEX-FIX: @ %bb.0: +; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 +; CHECK-CORTEX-FIX-NEXT: bxeq lr +; CHECK-CORTEX-FIX-NEXT: .LBB29_1: +; 
CHECK-CORTEX-FIX-NEXT: ldr r1, [sp, #4] +; CHECK-CORTEX-FIX-NEXT: ldr r12, [sp] +; CHECK-CORTEX-FIX-NEXT: .LBB29_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r12] +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-CORTEX-FIX-NEXT: subs r0, r0, #1 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[0], r2 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[1], r3 +; CHECK-CORTEX-FIX-NEXT: vorr q8, q8, q8 +; CHECK-CORTEX-FIX-NEXT: aese.8 q9, q8 +; CHECK-CORTEX-FIX-NEXT: aesmc.8 q8, q9 +; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-CORTEX-FIX-NEXT: bne .LBB29_2 +; CHECK-CORTEX-FIX-NEXT: @ %bb.3: +; CHECK-CORTEX-FIX-NEXT: bx lr + %5 = bitcast <16 x i8>* %2 to <2 x i64>* + %6 = icmp eq i32 %0, 0 + br i1 %6, label %7, label %8 + +7: + ret void + +8: + %9 = phi i32 [ %16, %8 ], [ 0, %4 ] + %10 = load <2 x i64>, <2 x i64>* %5, align 8 + %11 = insertelement <2 x i64> %10, i64 %1, i64 0 + %12 = bitcast <2 x i64> %11 to <16 x i8> + %13 = load <16 x i8>, <16 x i8>* %3, align 8 + %14 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %13, <16 x i8> %12) + %15 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %14) + store <16 x i8> %15, <16 x i8>* %3, align 8 + %16 = add nuw i32 %9, 1 + %17 = icmp eq i32 %16, %0 + br i1 %17, label %7, label %8 +} + +define void @aesd_once_via_ptr(<16 x i8>* %0, <16 x i8>* %1) nounwind { +; CHECK-FIX-LABEL: aesd_once_via_ptr: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-FIX-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-FIX-NEXT: aesd.8 q9, q8 +; CHECK-FIX-NEXT: aesimc.8 q8, q9 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aesd_once_via_ptr: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-NOFIX-NEXT: aesd.8 q9, q8 +; CHECK-NOFIX-NEXT: aesimc.8 q8, q9 +; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-NOFIX-NEXT: bx lr + %3 = load <16 x i8>, <16 x i8>* %1, align 8 + %4 = load <16 x i8>, <16 x i8>* %0, align 8 + %5 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %3, <16 x i8> %4) + %6 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %5) + store <16 x i8> %6, <16 x i8>* %1, align 8 + ret void +} + +define <16 x i8> @aesd_once_via_val(<16 x i8> %0, <16 x i8> %1) nounwind { +; CHECK-FIX-NOSCHED-LABEL: aesd_once_via_val: +; CHECK-FIX-NOSCHED: @ %bb.0: +; CHECK-FIX-NOSCHED-NEXT: vmov d17, r2, r3 +; CHECK-FIX-NOSCHED-NEXT: mov r12, sp +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d18, d19}, [r12] +; CHECK-FIX-NOSCHED-NEXT: vmov d16, r0, r1 +; CHECK-FIX-NOSCHED-NEXT: aesd.8 q9, q8 +; CHECK-FIX-NOSCHED-NEXT: aesimc.8 q8, q9 +; CHECK-FIX-NOSCHED-NEXT: vmov r0, r1, d16 +; CHECK-FIX-NOSCHED-NEXT: vmov r2, r3, d17 +; CHECK-FIX-NOSCHED-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aesd_once_via_val: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: vmov d17, r2, r3 +; CHECK-NOFIX-NEXT: mov r12, sp +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r12] +; CHECK-NOFIX-NEXT: vmov d16, r0, r1 +; CHECK-NOFIX-NEXT: aesd.8 q9, q8 +; CHECK-NOFIX-NEXT: aesimc.8 q8, q9 +; CHECK-NOFIX-NEXT: vmov r0, r1, d16 +; CHECK-NOFIX-NEXT: vmov r2, r3, d17 +; CHECK-NOFIX-NEXT: bx lr +; +; CHECK-CORTEX-FIX-LABEL: aesd_once_via_val: +; CHECK-CORTEX-FIX: @ %bb.0: +; CHECK-CORTEX-FIX-NEXT: vmov d19, r2, r3 +; CHECK-CORTEX-FIX-NEXT: vmov d18, r0, r1 +; CHECK-CORTEX-FIX-NEXT: mov r12, sp +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r12] +; CHECK-CORTEX-FIX-NEXT: aesd.8 q8, q9 +; CHECK-CORTEX-FIX-NEXT: aesimc.8 q8, q8 +; CHECK-CORTEX-FIX-NEXT: vmov r0, r1, d16 +; 
CHECK-CORTEX-FIX-NEXT: vmov r2, r3, d17 +; CHECK-CORTEX-FIX-NEXT: bx lr + %3 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %1, <16 x i8> %0) + %4 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %3) + ret <16 x i8> %4 +} + +define void @aesd_twice_via_ptr(<16 x i8>* %0, <16 x i8>* %1) nounwind { +; CHECK-FIX-LABEL: aesd_twice_via_ptr: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-FIX-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-FIX-NEXT: aesd.8 q9, q8 +; CHECK-FIX-NEXT: aesimc.8 q8, q9 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-FIX-NEXT: aesd.8 q8, q9 +; CHECK-FIX-NEXT: aesimc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aesd_twice_via_ptr: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-NOFIX-NEXT: aesd.8 q9, q8 +; CHECK-NOFIX-NEXT: aesimc.8 q8, q9 +; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NOFIX-NEXT: aesd.8 q8, q9 +; CHECK-NOFIX-NEXT: aesimc.8 q8, q8 +; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r1] +; CHECK-NOFIX-NEXT: bx lr + %3 = load <16 x i8>, <16 x i8>* %1, align 8 + %4 = load <16 x i8>, <16 x i8>* %0, align 8 + %5 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %3, <16 x i8> %4) + %6 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %5) + store <16 x i8> %6, <16 x i8>* %1, align 8 + %7 = load <16 x i8>, <16 x i8>* %0, align 8 + %8 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %6, <16 x i8> %7) + %9 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %8) + store <16 x i8> %9, <16 x i8>* %1, align 8 + ret void +} + +define <16 x i8> @aesd_twice_via_val(<16 x i8> %0, <16 x i8> %1) nounwind { +; CHECK-FIX-NOSCHED-LABEL: aesd_twice_via_val: +; CHECK-FIX-NOSCHED: @ %bb.0: +; CHECK-FIX-NOSCHED-NEXT: vmov d17, r2, r3 +; CHECK-FIX-NOSCHED-NEXT: mov r12, sp +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d18, d19}, [r12] +; CHECK-FIX-NOSCHED-NEXT: vmov d16, r0, r1 +; CHECK-FIX-NOSCHED-NEXT: aesd.8 q9, q8 +; CHECK-FIX-NOSCHED-NEXT: aesimc.8 q9, q9 +; CHECK-FIX-NOSCHED-NEXT: aesd.8 q9, q8 +; CHECK-FIX-NOSCHED-NEXT: aesimc.8 q8, q9 +; CHECK-FIX-NOSCHED-NEXT: vmov r0, r1, d16 +; CHECK-FIX-NOSCHED-NEXT: vmov r2, r3, d17 +; CHECK-FIX-NOSCHED-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aesd_twice_via_val: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: vmov d17, r2, r3 +; CHECK-NOFIX-NEXT: mov r12, sp +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r12] +; CHECK-NOFIX-NEXT: vmov d16, r0, r1 +; CHECK-NOFIX-NEXT: aesd.8 q9, q8 +; CHECK-NOFIX-NEXT: aesimc.8 q9, q9 +; CHECK-NOFIX-NEXT: aesd.8 q9, q8 +; CHECK-NOFIX-NEXT: aesimc.8 q8, q9 +; CHECK-NOFIX-NEXT: vmov r0, r1, d16 +; CHECK-NOFIX-NEXT: vmov r2, r3, d17 +; CHECK-NOFIX-NEXT: bx lr +; +; CHECK-CORTEX-FIX-LABEL: aesd_twice_via_val: +; CHECK-CORTEX-FIX: @ %bb.0: +; CHECK-CORTEX-FIX-NEXT: vmov d19, r2, r3 +; CHECK-CORTEX-FIX-NEXT: vmov d18, r0, r1 +; CHECK-CORTEX-FIX-NEXT: mov r12, sp +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r12] +; CHECK-CORTEX-FIX-NEXT: aesd.8 q8, q9 +; CHECK-CORTEX-FIX-NEXT: aesimc.8 q8, q8 +; CHECK-CORTEX-FIX-NEXT: aesd.8 q8, q9 +; CHECK-CORTEX-FIX-NEXT: aesimc.8 q8, q8 +; CHECK-CORTEX-FIX-NEXT: vmov r0, r1, d16 +; CHECK-CORTEX-FIX-NEXT: vmov r2, r3, d17 +; CHECK-CORTEX-FIX-NEXT: bx lr + %3 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %1, <16 x i8> %0) + %4 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %3) + %5 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %4, <16 x i8> %0) + %6 = 
call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %5) + ret <16 x i8> %6 +} + +define void @aesd_loop_via_ptr(i32 %0, <16 x i8>* %1, <16 x i8>* %2) nounwind { +; CHECK-FIX-NOSCHED-LABEL: aesd_loop_via_ptr: +; CHECK-FIX-NOSCHED: @ %bb.0: +; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 +; CHECK-FIX-NOSCHED-NEXT: bxeq lr +; CHECK-FIX-NOSCHED-NEXT: .LBB34_1: @ =>This Inner Loop Header: Depth=1 +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-FIX-NOSCHED-NEXT: subs r0, r0, #1 +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-FIX-NOSCHED-NEXT: aesd.8 q9, q8 +; CHECK-FIX-NOSCHED-NEXT: aesimc.8 q8, q9 +; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NOSCHED-NEXT: bne .LBB34_1 +; CHECK-FIX-NOSCHED-NEXT: @ %bb.2: +; CHECK-FIX-NOSCHED-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aesd_loop_via_ptr: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: cmp r0, #0 +; CHECK-NOFIX-NEXT: bxeq lr +; CHECK-NOFIX-NEXT: .LBB34_1: @ =>This Inner Loop Header: Depth=1 +; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NOFIX-NEXT: subs r0, r0, #1 +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-NOFIX-NEXT: aesd.8 q9, q8 +; CHECK-NOFIX-NEXT: aesimc.8 q8, q9 +; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-NOFIX-NEXT: bne .LBB34_1 +; CHECK-NOFIX-NEXT: @ %bb.2: +; CHECK-NOFIX-NEXT: bx lr +; +; CHECK-CORTEX-FIX-LABEL: aesd_loop_via_ptr: +; CHECK-CORTEX-FIX: @ %bb.0: +; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 +; CHECK-CORTEX-FIX-NEXT: bxeq lr +; CHECK-CORTEX-FIX-NEXT: .LBB34_1: @ =>This Inner Loop Header: Depth=1 +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-CORTEX-FIX-NEXT: subs r0, r0, #1 +; CHECK-CORTEX-FIX-NEXT: aesd.8 q9, q8 +; CHECK-CORTEX-FIX-NEXT: aesimc.8 q8, q9 +; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-CORTEX-FIX-NEXT: bne .LBB34_1 +; CHECK-CORTEX-FIX-NEXT: @ %bb.2: +; CHECK-CORTEX-FIX-NEXT: bx lr + %4 = icmp eq i32 %0, 0 + br i1 %4, label %5, label %6 + +5: + ret void + +6: + %7 = phi i32 [ %12, %6 ], [ 0, %3 ] + %8 = load <16 x i8>, <16 x i8>* %2, align 8 + %9 = load <16 x i8>, <16 x i8>* %1, align 8 + %10 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %8, <16 x i8> %9) + %11 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %10) + store <16 x i8> %11, <16 x i8>* %2, align 8 + %12 = add nuw i32 %7, 1 + %13 = icmp eq i32 %12, %0 + br i1 %13, label %5, label %6 +} + +define <16 x i8> @aesd_loop_via_val(i32 %0, <16 x i8> %1, <16 x i8> %2) nounwind { +; CHECK-FIX-LABEL: aesd_loop_via_val: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: add r1, sp, #8 +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-FIX-NEXT: beq .LBB35_3 +; CHECK-FIX-NEXT: @ %bb.1: +; CHECK-FIX-NEXT: vldr d19, [sp] +; CHECK-FIX-NEXT: vmov d18, r2, r3 +; CHECK-FIX-NEXT: .LBB35_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-FIX-NEXT: vorr q8, q8, q8 +; CHECK-FIX-NEXT: aesd.8 q8, q9 +; CHECK-FIX-NEXT: subs r0, r0, #1 +; CHECK-FIX-NEXT: aesimc.8 q8, q8 +; CHECK-FIX-NEXT: bne .LBB35_2 +; CHECK-FIX-NEXT: .LBB35_3: +; CHECK-FIX-NEXT: vmov r0, r1, d16 +; CHECK-FIX-NEXT: vmov r2, r3, d17 +; CHECK-FIX-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aesd_loop_via_val: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: add r1, sp, #8 +; CHECK-NOFIX-NEXT: cmp r0, #0 +; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NOFIX-NEXT: beq .LBB35_3 +; CHECK-NOFIX-NEXT: @ %bb.1: +; CHECK-NOFIX-NEXT: vldr d19, [sp] +; CHECK-NOFIX-NEXT: vmov d18, r2, r3 +; CHECK-NOFIX-NEXT: .LBB35_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-NOFIX-NEXT: aesd.8 q8, 
q9 +; CHECK-NOFIX-NEXT: subs r0, r0, #1 +; CHECK-NOFIX-NEXT: aesimc.8 q8, q8 +; CHECK-NOFIX-NEXT: bne .LBB35_2 +; CHECK-NOFIX-NEXT: .LBB35_3: +; CHECK-NOFIX-NEXT: vmov r0, r1, d16 +; CHECK-NOFIX-NEXT: vmov r2, r3, d17 +; CHECK-NOFIX-NEXT: bx lr + %4 = icmp eq i32 %0, 0 + br i1 %4, label %5, label %7 + +5: + %6 = phi <16 x i8> [ %2, %3 ], [ %11, %7 ] + ret <16 x i8> %6 + +7: + %8 = phi i32 [ %12, %7 ], [ 0, %3 ] + %9 = phi <16 x i8> [ %11, %7 ], [ %2, %3 ] + %10 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %9, <16 x i8> %1) + %11 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %10) + %12 = add nuw i32 %8, 1 + %13 = icmp eq i32 %12, %0 + br i1 %13, label %5, label %7 +} + +define void @aesd_set8_via_ptr(i8* %0, <16 x i8>* %1, <16 x i8>* %2) nounwind { +; CHECK-FIX-NOSCHED-LABEL: aesd_set8_via_ptr: +; CHECK-FIX-NOSCHED: @ %bb.0: +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-FIX-NOSCHED-NEXT: vld1.8 {d16[0]}, [r0] +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-FIX-NOSCHED-NEXT: vorr q8, q8, q8 +; CHECK-FIX-NOSCHED-NEXT: aesd.8 q9, q8 +; CHECK-FIX-NOSCHED-NEXT: aesimc.8 q8, q9 +; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NOSCHED-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aesd_set8_via_ptr: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NOFIX-NEXT: vld1.8 {d16[0]}, [r0] +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-NOFIX-NEXT: aesd.8 q9, q8 +; CHECK-NOFIX-NEXT: aesimc.8 q8, q9 +; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-NOFIX-NEXT: bx lr +; +; CHECK-CORTEX-FIX-LABEL: aesd_set8_via_ptr: +; CHECK-CORTEX-FIX: @ %bb.0: +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-CORTEX-FIX-NEXT: vld1.8 {d16[0]}, [r0] +; CHECK-CORTEX-FIX-NEXT: vorr q8, q8, q8 +; CHECK-CORTEX-FIX-NEXT: aesd.8 q9, q8 +; CHECK-CORTEX-FIX-NEXT: aesimc.8 q8, q9 +; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-CORTEX-FIX-NEXT: bx lr + %4 = load <16 x i8>, <16 x i8>* %2, align 8 + %5 = load i8, i8* %0, align 1 + %6 = load <16 x i8>, <16 x i8>* %1, align 8 + %7 = insertelement <16 x i8> %6, i8 %5, i64 0 + %8 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %4, <16 x i8> %7) + %9 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %8) + store <16 x i8> %9, <16 x i8>* %2, align 8 + ret void +} + +define void @aesd_set8_via_val(i8 zeroext %0, <16 x i8>* %1, <16 x i8>* %2) nounwind { +; CHECK-FIX-LABEL: aesd_set8_via_val: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-FIX-NEXT: vmov.8 d16[0], r0 +; CHECK-FIX-NEXT: vorr q8, q8, q8 +; CHECK-FIX-NEXT: aesd.8 q8, q9 +; CHECK-FIX-NEXT: aesimc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aesd_set8_via_val: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-NOFIX-NEXT: vmov.8 d16[0], r0 +; CHECK-NOFIX-NEXT: aesd.8 q8, q9 +; CHECK-NOFIX-NEXT: aesimc.8 q8, q8 +; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-NOFIX-NEXT: bx lr + %4 = load <16 x i8>, <16 x i8>* %2, align 8 + %5 = insertelement <16 x i8> %4, i8 %0, i64 0 + %6 = load <16 x i8>, <16 x i8>* %1, align 8 + %7 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %5, <16 x i8> %6) + %8 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %7) + store <16 x i8> %8, <16 x i8>* %2, align 8 + ret void +} + +define void @aesd_set8_cond_via_ptr(i1 zeroext %0, i8* %1, <16 x i8>* %2, <16 x 
i8>* %3) nounwind { +; CHECK-FIX-LABEL: aesd_set8_cond_via_ptr: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r3] +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: beq .LBB38_2 +; CHECK-FIX-NEXT: @ %bb.1: +; CHECK-FIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-FIX-NEXT: vld1.8 {d18[0]}, [r1] +; CHECK-FIX-NEXT: b .LBB38_3 +; CHECK-FIX-NEXT: .LBB38_2: +; CHECK-FIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-FIX-NEXT: .LBB38_3: +; CHECK-FIX-NEXT: vorr q9, q9, q9 +; CHECK-FIX-NEXT: aesd.8 q8, q9 +; CHECK-FIX-NEXT: aesimc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r3] +; CHECK-FIX-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aesd_set8_cond_via_ptr: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r3] +; CHECK-NOFIX-NEXT: cmp r0, #0 +; CHECK-NOFIX-NEXT: beq .LBB38_2 +; CHECK-NOFIX-NEXT: @ %bb.1: +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-NOFIX-NEXT: vld1.8 {d18[0]}, [r1] +; CHECK-NOFIX-NEXT: b .LBB38_3 +; CHECK-NOFIX-NEXT: .LBB38_2: +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-NOFIX-NEXT: .LBB38_3: +; CHECK-NOFIX-NEXT: aesd.8 q8, q9 +; CHECK-NOFIX-NEXT: aesimc.8 q8, q8 +; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r3] +; CHECK-NOFIX-NEXT: bx lr + %5 = load <16 x i8>, <16 x i8>* %3, align 8 + br i1 %0, label %6, label %10 + +6: + %7 = load i8, i8* %1, align 1 + %8 = load <16 x i8>, <16 x i8>* %2, align 8 + %9 = insertelement <16 x i8> %8, i8 %7, i64 0 + br label %12 + +10: + %11 = load <16 x i8>, <16 x i8>* %2, align 8 + br label %12 + +12: + %13 = phi <16 x i8> [ %9, %6 ], [ %11, %10 ] + %14 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %5, <16 x i8> %13) + %15 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %14) + store <16 x i8> %15, <16 x i8>* %3, align 8 + ret void +} + +define void @aesd_set8_cond_via_val(i1 zeroext %0, i8 zeroext %1, <16 x i8>* %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-LABEL: aesd_set8_cond_via_val: +; CHECK-FIX: @ %bb.0: +; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r3] +; CHECK-FIX-NEXT: cmp r0, #0 +; CHECK-FIX-NEXT: beq .LBB39_2 +; CHECK-FIX-NEXT: @ %bb.1: +; CHECK-FIX-NEXT: vmov.8 d16[0], r1 +; CHECK-FIX-NEXT: .LBB39_2: @ %select.end +; CHECK-FIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-FIX-NEXT: vorr q8, q8, q8 +; CHECK-FIX-NEXT: aesd.8 q8, q9 +; CHECK-FIX-NEXT: aesimc.8 q8, q8 +; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r3] +; CHECK-FIX-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aesd_set8_cond_via_val: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r3] +; CHECK-NOFIX-NEXT: cmp r0, #0 +; CHECK-NOFIX-NEXT: beq .LBB39_2 +; CHECK-NOFIX-NEXT: @ %bb.1: +; CHECK-NOFIX-NEXT: vmov.8 d16[0], r1 +; CHECK-NOFIX-NEXT: .LBB39_2: @ %select.end +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-NOFIX-NEXT: aesd.8 q8, q9 +; CHECK-NOFIX-NEXT: aesimc.8 q8, q8 +; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r3] +; CHECK-NOFIX-NEXT: bx lr + %5 = load <16 x i8>, <16 x i8>* %3, align 8 + %6 = insertelement <16 x i8> %5, i8 %1, i64 0 + %7 = select i1 %0, <16 x i8> %6, <16 x i8> %5 + %8 = load <16 x i8>, <16 x i8>* %2, align 8 + %9 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %7, <16 x i8> %8) + %10 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %9) + store <16 x i8> %10, <16 x i8>* %3, align 8 + ret void +} + +define void @aesd_set8_loop_via_ptr(i32 %0, i8* %1, <16 x i8>* %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-NOSCHED-LABEL: aesd_set8_loop_via_ptr: +; CHECK-FIX-NOSCHED: @ %bb.0: +; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 +; CHECK-FIX-NOSCHED-NEXT: bxeq lr +; CHECK-FIX-NOSCHED-NEXT: .LBB40_1: +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, 
[r2] +; CHECK-FIX-NOSCHED-NEXT: vld1.8 {d16[0]}, [r1] +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d18, d19}, [r3] +; CHECK-FIX-NOSCHED-NEXT: .LBB40_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-FIX-NOSCHED-NEXT: vorr q9, q9, q9 +; CHECK-FIX-NOSCHED-NEXT: vorr q8, q8, q8 +; CHECK-FIX-NOSCHED-NEXT: aesd.8 q9, q8 +; CHECK-FIX-NOSCHED-NEXT: subs r0, r0, #1 +; CHECK-FIX-NOSCHED-NEXT: aesimc.8 q9, q9 +; CHECK-FIX-NOSCHED-NEXT: bne .LBB40_2 +; CHECK-FIX-NOSCHED-NEXT: @ %bb.3: +; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d18, d19}, [r3] +; CHECK-FIX-NOSCHED-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aesd_set8_loop_via_ptr: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: cmp r0, #0 +; CHECK-NOFIX-NEXT: bxeq lr +; CHECK-NOFIX-NEXT: .LBB40_1: +; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-NOFIX-NEXT: vld1.8 {d16[0]}, [r1] +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r3] +; CHECK-NOFIX-NEXT: .LBB40_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-NOFIX-NEXT: aesd.8 q9, q8 +; CHECK-NOFIX-NEXT: subs r0, r0, #1 +; CHECK-NOFIX-NEXT: aesimc.8 q9, q9 +; CHECK-NOFIX-NEXT: bne .LBB40_2 +; CHECK-NOFIX-NEXT: @ %bb.3: +; CHECK-NOFIX-NEXT: vst1.64 {d18, d19}, [r3] +; CHECK-NOFIX-NEXT: bx lr +; +; CHECK-CORTEX-FIX-LABEL: aesd_set8_loop_via_ptr: +; CHECK-CORTEX-FIX: @ %bb.0: +; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 +; CHECK-CORTEX-FIX-NEXT: bxeq lr +; CHECK-CORTEX-FIX-NEXT: .LBB40_1: +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d18, d19}, [r3] +; CHECK-CORTEX-FIX-NEXT: vld1.8 {d16[0]}, [r1] +; CHECK-CORTEX-FIX-NEXT: .LBB40_2: @ =>This Inner Loop Header: Depth=1 +; CHECK-CORTEX-FIX-NEXT: vorr q9, q9, q9 +; CHECK-CORTEX-FIX-NEXT: vorr q8, q8, q8 +; CHECK-CORTEX-FIX-NEXT: aesd.8 q9, q8 +; CHECK-CORTEX-FIX-NEXT: subs r0, r0, #1 +; CHECK-CORTEX-FIX-NEXT: aesimc.8 q9, q9 +; CHECK-CORTEX-FIX-NEXT: bne .LBB40_2 +; CHECK-CORTEX-FIX-NEXT: @ %bb.3: +; CHECK-CORTEX-FIX-NEXT: vst1.64 {d18, d19}, [r3] +; CHECK-CORTEX-FIX-NEXT: bx lr + %5 = load i8, i8* %1, align 1 + %6 = load <16 x i8>, <16 x i8>* %2, align 8 + %7 = insertelement <16 x i8> %6, i8 %5, i64 0 + %8 = icmp eq i32 %0, 0 + br i1 %8, label %12, label %9 + +9: + %10 = load <16 x i8>, <16 x i8>* %3, align 8 + br label %13 + +11: + store <16 x i8> %17, <16 x i8>* %3, align 8 + br label %12 + +12: + ret void + +13: + %14 = phi <16 x i8> [ %10, %9 ], [ %17, %13 ] + %15 = phi i32 [ 0, %9 ], [ %18, %13 ] + %16 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %14, <16 x i8> %7) + %17 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %16) + %18 = add nuw i32 %15, 1 + %19 = icmp eq i32 %18, %0 + br i1 %19, label %11, label %13 +} + +define void @aesd_set8_loop_via_val(i32 %0, i8 zeroext %1, <16 x i8>* %2, <16 x i8>* %3) nounwind { +; CHECK-FIX-NOSCHED-LABEL: aesd_set8_loop_via_val: +; CHECK-FIX-NOSCHED: @ %bb.0: +; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 +; CHECK-FIX-NOSCHED-NEXT: bxeq lr +; CHECK-FIX-NOSCHED-NEXT: .LBB41_1: @ =>This Inner Loop Header: Depth=1 +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-FIX-NOSCHED-NEXT: subs r0, r0, #1 +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d18, d19}, [r3] +; CHECK-FIX-NOSCHED-NEXT: vmov.8 d16[0], r1 +; CHECK-FIX-NOSCHED-NEXT: vorr q8, q8, q8 +; CHECK-FIX-NOSCHED-NEXT: aesd.8 q9, q8 +; CHECK-FIX-NOSCHED-NEXT: aesimc.8 q8, q9 +; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d16, d17}, [r3] +; CHECK-FIX-NOSCHED-NEXT: bne .LBB41_1 +; CHECK-FIX-NOSCHED-NEXT: @ %bb.2: +; CHECK-FIX-NOSCHED-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aesd_set8_loop_via_val: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: cmp r0, #0 +; CHECK-NOFIX-NEXT: bxeq lr +; 
CHECK-NOFIX-NEXT: .LBB41_1: @ =>This Inner Loop Header: Depth=1 +; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-NOFIX-NEXT: subs r0, r0, #1 +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r3] +; CHECK-NOFIX-NEXT: vmov.8 d16[0], r1 +; CHECK-NOFIX-NEXT: aesd.8 q9, q8 +; CHECK-NOFIX-NEXT: aesimc.8 q8, q9 +; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r3] +; CHECK-NOFIX-NEXT: bne .LBB41_1 +; CHECK-NOFIX-NEXT: @ %bb.2: +; CHECK-NOFIX-NEXT: bx lr +; +; CHECK-CORTEX-FIX-LABEL: aesd_set8_loop_via_val: +; CHECK-CORTEX-FIX: @ %bb.0: +; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 +; CHECK-CORTEX-FIX-NEXT: bxeq lr +; CHECK-CORTEX-FIX-NEXT: .LBB41_1: @ =>This Inner Loop Header: Depth=1 +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d18, d19}, [r3] +; CHECK-CORTEX-FIX-NEXT: subs r0, r0, #1 +; CHECK-CORTEX-FIX-NEXT: vmov.8 d16[0], r1 +; CHECK-CORTEX-FIX-NEXT: vorr q8, q8, q8 +; CHECK-CORTEX-FIX-NEXT: aesd.8 q9, q8 +; CHECK-CORTEX-FIX-NEXT: aesimc.8 q8, q9 +; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r3] +; CHECK-CORTEX-FIX-NEXT: bne .LBB41_1 +; CHECK-CORTEX-FIX-NEXT: @ %bb.2: +; CHECK-CORTEX-FIX-NEXT: bx lr + %5 = icmp eq i32 %0, 0 + br i1 %5, label %6, label %7 + +6: + ret void + +7: + %8 = phi i32 [ %14, %7 ], [ 0, %4 ] + %9 = load <16 x i8>, <16 x i8>* %2, align 8 + %10 = insertelement <16 x i8> %9, i8 %1, i64 0 + %11 = load <16 x i8>, <16 x i8>* %3, align 8 + %12 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %11, <16 x i8> %10) + %13 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %12) + store <16 x i8> %13, <16 x i8>* %3, align 8 + %14 = add nuw i32 %8, 1 + %15 = icmp eq i32 %14, %0 + br i1 %15, label %6, label %7 +} + +define void @aesd_set16_via_ptr(i16* %0, <16 x i8>* %1, <16 x i8>* %2) nounwind { +; CHECK-FIX-NOSCHED-LABEL: aesd_set16_via_ptr: +; CHECK-FIX-NOSCHED: @ %bb.0: +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-FIX-NOSCHED-NEXT: vld1.16 {d16[0]}, [r0:16] +; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-FIX-NOSCHED-NEXT: vorr q8, q8, q8 +; CHECK-FIX-NOSCHED-NEXT: aesd.8 q9, q8 +; CHECK-FIX-NOSCHED-NEXT: aesimc.8 q8, q9 +; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-FIX-NOSCHED-NEXT: bx lr +; +; CHECK-NOFIX-LABEL: aesd_set16_via_ptr: +; CHECK-NOFIX: @ %bb.0: +; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NOFIX-NEXT: vld1.16 {d16[0]}, [r0:16] +; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-NOFIX-NEXT: aesd.8 q9, q8 +; CHECK-NOFIX-NEXT: aesimc.8 q8, q9 +; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-NOFIX-NEXT: bx lr +; +; CHECK-CORTEX-FIX-LABEL: aesd_set16_via_ptr: +; CHECK-CORTEX-FIX: @ %bb.0: +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-CORTEX-FIX-NEXT: vld1.64 {d18, d19}, [r2] +; CHECK-CORTEX-FIX-NEXT: vld1.16 {d16[0]}, [r0:16] +; CHECK-CORTEX-FIX-NEXT: vorr q8, q8, q8 +; CHECK-CORTEX-FIX-NEXT: aesd.8 q9, q8 +; CHECK-CORTEX-FIX-NEXT: aesimc.8 q8, q9 +; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r2] +; CHECK-CORTEX-FIX-NEXT: bx lr + %4 = load <16 x i8>, <16 x i8>* %2, align 8 + %5 = load i16, i16* %0, align 2 + %6 = bitcast <16 x i8>* %1 to <8 x i16>* + %7 = load <8 x i16>, <8 x i16>* %6, align 8 + %8 = insertelement <8 x i16> %7, i16 %5, i64 0 + %9 = bitcast <8 x i16> %8 to <16 x i8> + %10 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %4, <16 x i8> %9) + %11 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %10) + store <16 x i8> %11, <16 x i8>* %2, align 8 + ret void +} + +define void @aesd_set16_via_val(i16 zeroext %0, <16 x i8>* %1, <16 x i8>* %2) nounwind { +; CHECK-FIX-LABEL: 
aesd_set16_via_val:
+; CHECK-FIX: @ %bb.0:
+; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r2]
+; CHECK-FIX-NEXT: vld1.64 {d18, d19}, [r1]
+; CHECK-FIX-NEXT: vmov.16 d16[0], r0
+; CHECK-FIX-NEXT: vorr q8, q8, q8
+; CHECK-FIX-NEXT: aesd.8 q8, q9
+; CHECK-FIX-NEXT: aesimc.8 q8, q8
+; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r2]
+; CHECK-FIX-NEXT: bx lr
+;
+; CHECK-NOFIX-LABEL: aesd_set16_via_val:
+; CHECK-NOFIX: @ %bb.0:
+; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r2]
+; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r1]
+; CHECK-NOFIX-NEXT: vmov.16 d16[0], r0
+; CHECK-NOFIX-NEXT: aesd.8 q8, q9
+; CHECK-NOFIX-NEXT: aesimc.8 q8, q8
+; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r2]
+; CHECK-NOFIX-NEXT: bx lr
+  %4 = bitcast <16 x i8>* %2 to <8 x i16>*
+  %5 = load <8 x i16>, <8 x i16>* %4, align 8
+  %6 = insertelement <8 x i16> %5, i16 %0, i64 0
+  %7 = bitcast <8 x i16> %6 to <16 x i8>
+  %8 = load <16 x i8>, <16 x i8>* %1, align 8
+  %9 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %7, <16 x i8> %8)
+  %10 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %9)
+  store <16 x i8> %10, <16 x i8>* %2, align 8
+  ret void
+}
+
+define void @aesd_set16_cond_via_ptr(i1 zeroext %0, i16* %1, <16 x i8>* %2, <16 x i8>* %3) nounwind {
+; CHECK-FIX-LABEL: aesd_set16_cond_via_ptr:
+; CHECK-FIX: @ %bb.0:
+; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r3]
+; CHECK-FIX-NEXT: cmp r0, #0
+; CHECK-FIX-NEXT: beq .LBB44_2
+; CHECK-FIX-NEXT: @ %bb.1:
+; CHECK-FIX-NEXT: vld1.64 {d18, d19}, [r2]
+; CHECK-FIX-NEXT: vld1.16 {d18[0]}, [r1:16]
+; CHECK-FIX-NEXT: b .LBB44_3
+; CHECK-FIX-NEXT: .LBB44_2:
+; CHECK-FIX-NEXT: vld1.64 {d18, d19}, [r2]
+; CHECK-FIX-NEXT: .LBB44_3:
+; CHECK-FIX-NEXT: vorr q9, q9, q9
+; CHECK-FIX-NEXT: aesd.8 q8, q9
+; CHECK-FIX-NEXT: aesimc.8 q8, q8
+; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r3]
+; CHECK-FIX-NEXT: bx lr
+;
+; CHECK-NOFIX-LABEL: aesd_set16_cond_via_ptr:
+; CHECK-NOFIX: @ %bb.0:
+; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r3]
+; CHECK-NOFIX-NEXT: cmp r0, #0
+; CHECK-NOFIX-NEXT: beq .LBB44_2
+; CHECK-NOFIX-NEXT: @ %bb.1:
+; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r2]
+; CHECK-NOFIX-NEXT: vld1.16 {d18[0]}, [r1:16]
+; CHECK-NOFIX-NEXT: b .LBB44_3
+; CHECK-NOFIX-NEXT: .LBB44_2:
+; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r2]
+; CHECK-NOFIX-NEXT: .LBB44_3:
+; CHECK-NOFIX-NEXT: aesd.8 q8, q9
+; CHECK-NOFIX-NEXT: aesimc.8 q8, q8
+; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r3]
+; CHECK-NOFIX-NEXT: bx lr
+  %5 = load <16 x i8>, <16 x i8>* %3, align 8
+  br i1 %0, label %6, label %11
+
+6:
+  %7 = load i16, i16* %1, align 2
+  %8 = bitcast <16 x i8>* %2 to <8 x i16>*
+  %9 = load <8 x i16>, <8 x i16>* %8, align 8
+  %10 = insertelement <8 x i16> %9, i16 %7, i64 0
+  br label %14
+
+11:
+  %12 = bitcast <16 x i8>* %2 to <8 x i16>*
+  %13 = load <8 x i16>, <8 x i16>* %12, align 8
+  br label %14
+
+14:
+  %15 = phi <8 x i16> [ %10, %6 ], [ %13, %11 ]
+  %16 = bitcast <8 x i16> %15 to <16 x i8>
+  %17 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %5, <16 x i8> %16)
+  %18 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %17)
+  store <16 x i8> %18, <16 x i8>* %3, align 8
+  ret void
+}
+
+define void @aesd_set16_cond_via_val(i1 zeroext %0, i16 zeroext %1, <16 x i8>* %2, <16 x i8>* %3) nounwind {
+; CHECK-FIX-LABEL: aesd_set16_cond_via_val:
+; CHECK-FIX: @ %bb.0:
+; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r3]
+; CHECK-FIX-NEXT: cmp r0, #0
+; CHECK-FIX-NEXT: beq .LBB45_2
+; CHECK-FIX-NEXT: @ %bb.1:
+; CHECK-FIX-NEXT: vmov.16 d16[0], r1
+; CHECK-FIX-NEXT: .LBB45_2: @ %select.end
+; CHECK-FIX-NEXT: vld1.64 {d18, d19}, [r2]
+; CHECK-FIX-NEXT: vorr q8, q8, q8
+; CHECK-FIX-NEXT: aesd.8 q8, q9
+; CHECK-FIX-NEXT: aesimc.8 q8, q8
+; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r3]
+; CHECK-FIX-NEXT: bx lr
+;
+; CHECK-NOFIX-LABEL: aesd_set16_cond_via_val:
+; CHECK-NOFIX: @ %bb.0:
+; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r3]
+; CHECK-NOFIX-NEXT: cmp r0, #0
+; CHECK-NOFIX-NEXT: beq .LBB45_2
+; CHECK-NOFIX-NEXT: @ %bb.1:
+; CHECK-NOFIX-NEXT: vmov.16 d16[0], r1
+; CHECK-NOFIX-NEXT: .LBB45_2: @ %select.end
+; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r2]
+; CHECK-NOFIX-NEXT: aesd.8 q8, q9
+; CHECK-NOFIX-NEXT: aesimc.8 q8, q8
+; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r3]
+; CHECK-NOFIX-NEXT: bx lr
+  %5 = bitcast <16 x i8>* %3 to <8 x i16>*
+  %6 = load <8 x i16>, <8 x i16>* %5, align 8
+  %7 = insertelement <8 x i16> %6, i16 %1, i64 0
+  %8 = select i1 %0, <8 x i16> %7, <8 x i16> %6
+  %9 = bitcast <8 x i16> %8 to <16 x i8>
+  %10 = load <16 x i8>, <16 x i8>* %2, align 8
+  %11 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %9, <16 x i8> %10)
+  %12 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %11)
+  store <16 x i8> %12, <16 x i8>* %3, align 8
+  ret void
+}
+
+define void @aesd_set16_loop_via_ptr(i32 %0, i16* %1, <16 x i8>* %2, <16 x i8>* %3) nounwind {
+; CHECK-FIX-NOSCHED-LABEL: aesd_set16_loop_via_ptr:
+; CHECK-FIX-NOSCHED: @ %bb.0:
+; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0
+; CHECK-FIX-NOSCHED-NEXT: bxeq lr
+; CHECK-FIX-NOSCHED-NEXT: .LBB46_1:
+; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r2]
+; CHECK-FIX-NOSCHED-NEXT: vld1.16 {d16[0]}, [r1:16]
+; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d18, d19}, [r3]
+; CHECK-FIX-NOSCHED-NEXT: .LBB46_2: @ =>This Inner Loop Header: Depth=1
+; CHECK-FIX-NOSCHED-NEXT: vorr q9, q9, q9
+; CHECK-FIX-NOSCHED-NEXT: vorr q8, q8, q8
+; CHECK-FIX-NOSCHED-NEXT: aesd.8 q9, q8
+; CHECK-FIX-NOSCHED-NEXT: subs r0, r0, #1
+; CHECK-FIX-NOSCHED-NEXT: aesimc.8 q9, q9
+; CHECK-FIX-NOSCHED-NEXT: bne .LBB46_2
+; CHECK-FIX-NOSCHED-NEXT: @ %bb.3:
+; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d18, d19}, [r3]
+; CHECK-FIX-NOSCHED-NEXT: bx lr
+;
+; CHECK-NOFIX-LABEL: aesd_set16_loop_via_ptr:
+; CHECK-NOFIX: @ %bb.0:
+; CHECK-NOFIX-NEXT: cmp r0, #0
+; CHECK-NOFIX-NEXT: bxeq lr
+; CHECK-NOFIX-NEXT: .LBB46_1:
+; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r2]
+; CHECK-NOFIX-NEXT: vld1.16 {d16[0]}, [r1:16]
+; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r3]
+; CHECK-NOFIX-NEXT: .LBB46_2: @ =>This Inner Loop Header: Depth=1
+; CHECK-NOFIX-NEXT: aesd.8 q9, q8
+; CHECK-NOFIX-NEXT: subs r0, r0, #1
+; CHECK-NOFIX-NEXT: aesimc.8 q9, q9
+; CHECK-NOFIX-NEXT: bne .LBB46_2
+; CHECK-NOFIX-NEXT: @ %bb.3:
+; CHECK-NOFIX-NEXT: vst1.64 {d18, d19}, [r3]
+; CHECK-NOFIX-NEXT: bx lr
+;
+; CHECK-CORTEX-FIX-LABEL: aesd_set16_loop_via_ptr:
+; CHECK-CORTEX-FIX: @ %bb.0:
+; CHECK-CORTEX-FIX-NEXT: cmp r0, #0
+; CHECK-CORTEX-FIX-NEXT: bxeq lr
+; CHECK-CORTEX-FIX-NEXT: .LBB46_1:
+; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r2]
+; CHECK-CORTEX-FIX-NEXT: vld1.64 {d18, d19}, [r3]
+; CHECK-CORTEX-FIX-NEXT: vld1.16 {d16[0]}, [r1:16]
+; CHECK-CORTEX-FIX-NEXT: .LBB46_2: @ =>This Inner Loop Header: Depth=1
+; CHECK-CORTEX-FIX-NEXT: vorr q9, q9, q9
+; CHECK-CORTEX-FIX-NEXT: vorr q8, q8, q8
+; CHECK-CORTEX-FIX-NEXT: aesd.8 q9, q8
+; CHECK-CORTEX-FIX-NEXT: subs r0, r0, #1
+; CHECK-CORTEX-FIX-NEXT: aesimc.8 q9, q9
+; CHECK-CORTEX-FIX-NEXT: bne .LBB46_2
+; CHECK-CORTEX-FIX-NEXT: @ %bb.3:
+; CHECK-CORTEX-FIX-NEXT: vst1.64 {d18, d19}, [r3]
+; CHECK-CORTEX-FIX-NEXT: bx lr
+  %5 = load i16, i16* %1, align 2
+  %6 = bitcast <16 x i8>* %2 to <8 x i16>*
+  %7 = load <8 x i16>, <8 x i16>* %6, align 8
+  %8 = insertelement <8 x i16> %7, i16 %5, i64 0
+  %9 = bitcast <8 x i16> %8 to <16 x i8>
+  %10 = icmp eq i32 %0, 0
+  br i1 %10, label %14, label %11
+
+11:
+  %12 = load <16 x i8>, <16 x i8>* %3, align 8
+  br label %15
+
+13:
+  store <16 x i8> %19, <16 x i8>* %3, align 8
+  br label %14
+
+14:
+  ret void
+
+15:
+  %16 = phi <16 x i8> [ %12, %11 ], [ %19, %15 ]
+  %17 = phi i32 [ 0, %11 ], [ %20, %15 ]
+  %18 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %16, <16 x i8> %9)
+  %19 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %18)
+  %20 = add nuw i32 %17, 1
+  %21 = icmp eq i32 %20, %0
+  br i1 %21, label %13, label %15
+}
+
+define void @aesd_set16_loop_via_val(i32 %0, i16 zeroext %1, <16 x i8>* %2, <16 x i8>* %3) nounwind {
+; CHECK-FIX-NOSCHED-LABEL: aesd_set16_loop_via_val:
+; CHECK-FIX-NOSCHED: @ %bb.0:
+; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0
+; CHECK-FIX-NOSCHED-NEXT: bxeq lr
+; CHECK-FIX-NOSCHED-NEXT: .LBB47_1: @ =>This Inner Loop Header: Depth=1
+; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r2]
+; CHECK-FIX-NOSCHED-NEXT: subs r0, r0, #1
+; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d18, d19}, [r3]
+; CHECK-FIX-NOSCHED-NEXT: vmov.16 d16[0], r1
+; CHECK-FIX-NOSCHED-NEXT: vorr q8, q8, q8
+; CHECK-FIX-NOSCHED-NEXT: aesd.8 q9, q8
+; CHECK-FIX-NOSCHED-NEXT: aesimc.8 q8, q9
+; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d16, d17}, [r3]
+; CHECK-FIX-NOSCHED-NEXT: bne .LBB47_1
+; CHECK-FIX-NOSCHED-NEXT: @ %bb.2:
+; CHECK-FIX-NOSCHED-NEXT: bx lr
+;
+; CHECK-NOFIX-LABEL: aesd_set16_loop_via_val:
+; CHECK-NOFIX: @ %bb.0:
+; CHECK-NOFIX-NEXT: cmp r0, #0
+; CHECK-NOFIX-NEXT: bxeq lr
+; CHECK-NOFIX-NEXT: .LBB47_1: @ =>This Inner Loop Header: Depth=1
+; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r2]
+; CHECK-NOFIX-NEXT: subs r0, r0, #1
+; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r3]
+; CHECK-NOFIX-NEXT: vmov.16 d16[0], r1
+; CHECK-NOFIX-NEXT: aesd.8 q9, q8
+; CHECK-NOFIX-NEXT: aesimc.8 q8, q9
+; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r3]
+; CHECK-NOFIX-NEXT: bne .LBB47_1
+; CHECK-NOFIX-NEXT: @ %bb.2:
+; CHECK-NOFIX-NEXT: bx lr
+;
+; CHECK-CORTEX-FIX-LABEL: aesd_set16_loop_via_val:
+; CHECK-CORTEX-FIX: @ %bb.0:
+; CHECK-CORTEX-FIX-NEXT: cmp r0, #0
+; CHECK-CORTEX-FIX-NEXT: bxeq lr
+; CHECK-CORTEX-FIX-NEXT: .LBB47_1: @ =>This Inner Loop Header: Depth=1
+; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r2]
+; CHECK-CORTEX-FIX-NEXT: vld1.64 {d18, d19}, [r3]
+; CHECK-CORTEX-FIX-NEXT: subs r0, r0, #1
+; CHECK-CORTEX-FIX-NEXT: vmov.16 d16[0], r1
+; CHECK-CORTEX-FIX-NEXT: vorr q8, q8, q8
+; CHECK-CORTEX-FIX-NEXT: aesd.8 q9, q8
+; CHECK-CORTEX-FIX-NEXT: aesimc.8 q8, q9
+; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r3]
+; CHECK-CORTEX-FIX-NEXT: bne .LBB47_1
+; CHECK-CORTEX-FIX-NEXT: @ %bb.2:
+; CHECK-CORTEX-FIX-NEXT: bx lr
+  %5 = bitcast <16 x i8>* %2 to <8 x i16>*
+  %6 = icmp eq i32 %0, 0
+  br i1 %6, label %7, label %8
+
+7:
+  ret void
+
+8:
+  %9 = phi i32 [ %16, %8 ], [ 0, %4 ]
+  %10 = load <8 x i16>, <8 x i16>* %5, align 8
+  %11 = insertelement <8 x i16> %10, i16 %1, i64 0
+  %12 = bitcast <8 x i16> %11 to <16 x i8>
+  %13 = load <16 x i8>, <16 x i8>* %3, align 8
+  %14 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %13, <16 x i8> %12)
+  %15 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %14)
+  store <16 x i8> %15, <16 x i8>* %3, align 8
+  %16 = add nuw i32 %9, 1
+  %17 = icmp eq i32 %16, %0
+  br i1 %17, label %7, label %8
+}
+
+define void @aesd_set32_via_ptr(i32* %0, <16 x i8>* %1, <16 x i8>* %2) nounwind {
+; CHECK-FIX-NOSCHED-LABEL: aesd_set32_via_ptr:
+; CHECK-FIX-NOSCHED: @ %bb.0:
+; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r1]
+; CHECK-FIX-NOSCHED-NEXT: vld1.32 {d16[0]}, [r0:32]
+; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d18, d19}, [r2]
+; CHECK-FIX-NOSCHED-NEXT: vorr q8, q8, q8
+; CHECK-FIX-NOSCHED-NEXT: aesd.8 q9, q8
+; CHECK-FIX-NOSCHED-NEXT: aesimc.8 q8, q9
+; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d16, d17}, [r2]
+; CHECK-FIX-NOSCHED-NEXT: bx lr
+;
+; CHECK-NOFIX-LABEL: aesd_set32_via_ptr:
+; CHECK-NOFIX: @ %bb.0:
+; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r1]
+; CHECK-NOFIX-NEXT: vld1.32 {d16[0]}, [r0:32]
+; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r2]
+; CHECK-NOFIX-NEXT: aesd.8 q9, q8
+; CHECK-NOFIX-NEXT: aesimc.8 q8, q9
+; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r2]
+; CHECK-NOFIX-NEXT: bx lr
+;
+; CHECK-CORTEX-FIX-LABEL: aesd_set32_via_ptr:
+; CHECK-CORTEX-FIX: @ %bb.0:
+; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r1]
+; CHECK-CORTEX-FIX-NEXT: vld1.64 {d18, d19}, [r2]
+; CHECK-CORTEX-FIX-NEXT: vld1.32 {d16[0]}, [r0:32]
+; CHECK-CORTEX-FIX-NEXT: vorr q8, q8, q8
+; CHECK-CORTEX-FIX-NEXT: aesd.8 q9, q8
+; CHECK-CORTEX-FIX-NEXT: aesimc.8 q8, q9
+; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r2]
+; CHECK-CORTEX-FIX-NEXT: bx lr
+  %4 = load <16 x i8>, <16 x i8>* %2, align 8
+  %5 = load i32, i32* %0, align 4
+  %6 = bitcast <16 x i8>* %1 to <4 x i32>*
+  %7 = load <4 x i32>, <4 x i32>* %6, align 8
+  %8 = insertelement <4 x i32> %7, i32 %5, i64 0
+  %9 = bitcast <4 x i32> %8 to <16 x i8>
+  %10 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %4, <16 x i8> %9)
+  %11 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %10)
+  store <16 x i8> %11, <16 x i8>* %2, align 8
+  ret void
+}
+
+define void @aesd_set32_via_val(i32 %0, <16 x i8>* %1, <16 x i8>* %2) nounwind {
+; CHECK-FIX-LABEL: aesd_set32_via_val:
+; CHECK-FIX: @ %bb.0:
+; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r2]
+; CHECK-FIX-NEXT: vld1.64 {d18, d19}, [r1]
+; CHECK-FIX-NEXT: vmov.32 d16[0], r0
+; CHECK-FIX-NEXT: vorr q8, q8, q8
+; CHECK-FIX-NEXT: aesd.8 q8, q9
+; CHECK-FIX-NEXT: aesimc.8 q8, q8
+; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r2]
+; CHECK-FIX-NEXT: bx lr
+;
+; CHECK-NOFIX-LABEL: aesd_set32_via_val:
+; CHECK-NOFIX: @ %bb.0:
+; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r2]
+; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r1]
+; CHECK-NOFIX-NEXT: vmov.32 d16[0], r0
+; CHECK-NOFIX-NEXT: aesd.8 q8, q9
+; CHECK-NOFIX-NEXT: aesimc.8 q8, q8
+; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r2]
+; CHECK-NOFIX-NEXT: bx lr
+  %4 = bitcast <16 x i8>* %2 to <4 x i32>*
+  %5 = load <4 x i32>, <4 x i32>* %4, align 8
+  %6 = insertelement <4 x i32> %5, i32 %0, i64 0
+  %7 = bitcast <4 x i32> %6 to <16 x i8>
+  %8 = load <16 x i8>, <16 x i8>* %1, align 8
+  %9 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %7, <16 x i8> %8)
+  %10 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %9)
+  store <16 x i8> %10, <16 x i8>* %2, align 8
+  ret void
+}
+
+define void @aesd_set32_cond_via_ptr(i1 zeroext %0, i32* %1, <16 x i8>* %2, <16 x i8>* %3) nounwind {
+; CHECK-FIX-LABEL: aesd_set32_cond_via_ptr:
+; CHECK-FIX: @ %bb.0:
+; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r3]
+; CHECK-FIX-NEXT: cmp r0, #0
+; CHECK-FIX-NEXT: beq .LBB50_2
+; CHECK-FIX-NEXT: @ %bb.1:
+; CHECK-FIX-NEXT: vld1.64 {d18, d19}, [r2]
+; CHECK-FIX-NEXT: vld1.32 {d18[0]}, [r1:32]
+; CHECK-FIX-NEXT: b .LBB50_3
+; CHECK-FIX-NEXT: .LBB50_2:
+; CHECK-FIX-NEXT: vld1.64 {d18, d19}, [r2]
+; CHECK-FIX-NEXT: .LBB50_3:
+; CHECK-FIX-NEXT: vorr q9, q9, q9
+; CHECK-FIX-NEXT: aesd.8 q8, q9
+; CHECK-FIX-NEXT: aesimc.8 q8, q8
+; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r3]
+; CHECK-FIX-NEXT: bx lr
+;
+; CHECK-NOFIX-LABEL: aesd_set32_cond_via_ptr:
+; CHECK-NOFIX: @ %bb.0:
+; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r3]
+; CHECK-NOFIX-NEXT: cmp r0, #0
+; CHECK-NOFIX-NEXT: beq .LBB50_2
+; CHECK-NOFIX-NEXT: @ %bb.1:
+; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r2]
+; CHECK-NOFIX-NEXT: vld1.32 {d18[0]}, [r1:32]
+; CHECK-NOFIX-NEXT: b .LBB50_3
+; CHECK-NOFIX-NEXT: .LBB50_2:
+; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r2]
+; CHECK-NOFIX-NEXT: .LBB50_3:
+; CHECK-NOFIX-NEXT: aesd.8 q8, q9
+; CHECK-NOFIX-NEXT: aesimc.8 q8, q8
+; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r3]
+; CHECK-NOFIX-NEXT: bx lr
+  %5 = load <16 x i8>, <16 x i8>* %3, align 8
+  br i1 %0, label %6, label %11
+
+6:
+  %7 = load i32, i32* %1, align 4
+  %8 = bitcast <16 x i8>* %2 to <4 x i32>*
+  %9 = load <4 x i32>, <4 x i32>* %8, align 8
+  %10 = insertelement <4 x i32> %9, i32 %7, i64 0
+  br label %14
+
+11:
+  %12 = bitcast <16 x i8>* %2 to <4 x i32>*
+  %13 = load <4 x i32>, <4 x i32>* %12, align 8
+  br label %14
+
+14:
+  %15 = phi <4 x i32> [ %10, %6 ], [ %13, %11 ]
+  %16 = bitcast <4 x i32> %15 to <16 x i8>
+  %17 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %5, <16 x i8> %16)
+  %18 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %17)
+  store <16 x i8> %18, <16 x i8>* %3, align 8
+  ret void
+}
+
+define void @aesd_set32_cond_via_val(i1 zeroext %0, i32 %1, <16 x i8>* %2, <16 x i8>* %3) nounwind {
+; CHECK-FIX-LABEL: aesd_set32_cond_via_val:
+; CHECK-FIX: @ %bb.0:
+; CHECK-FIX-NEXT: vld1.64 {d16, d17}, [r3]
+; CHECK-FIX-NEXT: cmp r0, #0
+; CHECK-FIX-NEXT: beq .LBB51_2
+; CHECK-FIX-NEXT: @ %bb.1:
+; CHECK-FIX-NEXT: vmov.32 d16[0], r1
+; CHECK-FIX-NEXT: .LBB51_2: @ %select.end
+; CHECK-FIX-NEXT: vld1.64 {d18, d19}, [r2]
+; CHECK-FIX-NEXT: vorr q8, q8, q8
+; CHECK-FIX-NEXT: aesd.8 q8, q9
+; CHECK-FIX-NEXT: aesimc.8 q8, q8
+; CHECK-FIX-NEXT: vst1.64 {d16, d17}, [r3]
+; CHECK-FIX-NEXT: bx lr
+;
+; CHECK-NOFIX-LABEL: aesd_set32_cond_via_val:
+; CHECK-NOFIX: @ %bb.0:
+; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r3]
+; CHECK-NOFIX-NEXT: cmp r0, #0
+; CHECK-NOFIX-NEXT: beq .LBB51_2
+; CHECK-NOFIX-NEXT: @ %bb.1:
+; CHECK-NOFIX-NEXT: vmov.32 d16[0], r1
+; CHECK-NOFIX-NEXT: .LBB51_2: @ %select.end
+; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r2]
+; CHECK-NOFIX-NEXT: aesd.8 q8, q9
+; CHECK-NOFIX-NEXT: aesimc.8 q8, q8
+; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r3]
+; CHECK-NOFIX-NEXT: bx lr
+  %5 = bitcast <16 x i8>* %3 to <4 x i32>*
+  %6 = load <4 x i32>, <4 x i32>* %5, align 8
+  %7 = insertelement <4 x i32> %6, i32 %1, i64 0
+  %8 = select i1 %0, <4 x i32> %7, <4 x i32> %6
+  %9 = bitcast <4 x i32> %8 to <16 x i8>
+  %10 = load <16 x i8>, <16 x i8>* %2, align 8
+  %11 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %9, <16 x i8> %10)
+  %12 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %11)
+  store <16 x i8> %12, <16 x i8>* %3, align 8
+  ret void
+}
+
+define void @aesd_set32_loop_via_ptr(i32 %0, i32* %1, <16 x i8>* %2, <16 x i8>* %3) nounwind {
+; CHECK-FIX-NOSCHED-LABEL: aesd_set32_loop_via_ptr:
+; CHECK-FIX-NOSCHED: @ %bb.0:
+; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0
+; CHECK-FIX-NOSCHED-NEXT: bxeq lr
+; CHECK-FIX-NOSCHED-NEXT: .LBB52_1:
+; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r2]
+; CHECK-FIX-NOSCHED-NEXT: vld1.32 {d16[0]}, [r1:32]
+; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d18, d19}, [r3]
+; CHECK-FIX-NOSCHED-NEXT: .LBB52_2: @ =>This Inner Loop Header: Depth=1
+; CHECK-FIX-NOSCHED-NEXT: vorr q9, q9, q9
+; CHECK-FIX-NOSCHED-NEXT: vorr q8, q8, q8
+; CHECK-FIX-NOSCHED-NEXT: aesd.8 q9, q8
+; CHECK-FIX-NOSCHED-NEXT: subs r0, r0, #1
+; CHECK-FIX-NOSCHED-NEXT: aesimc.8 q9, q9
+; CHECK-FIX-NOSCHED-NEXT: bne .LBB52_2
+; CHECK-FIX-NOSCHED-NEXT: @ %bb.3:
+; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d18, d19}, [r3]
+; CHECK-FIX-NOSCHED-NEXT: bx lr
+;
+; CHECK-NOFIX-LABEL: aesd_set32_loop_via_ptr:
+; CHECK-NOFIX: @ %bb.0:
+; CHECK-NOFIX-NEXT: cmp r0, #0
+; CHECK-NOFIX-NEXT: bxeq lr
+; CHECK-NOFIX-NEXT: .LBB52_1:
+; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r2]
+; CHECK-NOFIX-NEXT: vld1.32 {d16[0]}, [r1:32]
+; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r3]
+; CHECK-NOFIX-NEXT: .LBB52_2: @ =>This Inner Loop Header: Depth=1
+; CHECK-NOFIX-NEXT: aesd.8 q9, q8
+; CHECK-NOFIX-NEXT: subs r0, r0, #1
+; CHECK-NOFIX-NEXT: aesimc.8 q9, q9
+; CHECK-NOFIX-NEXT: bne .LBB52_2
+; CHECK-NOFIX-NEXT: @ %bb.3:
+; CHECK-NOFIX-NEXT: vst1.64 {d18, d19}, [r3]
+; CHECK-NOFIX-NEXT: bx lr
+;
+; CHECK-CORTEX-FIX-LABEL: aesd_set32_loop_via_ptr:
+; CHECK-CORTEX-FIX: @ %bb.0:
+; CHECK-CORTEX-FIX-NEXT: cmp r0, #0
+; CHECK-CORTEX-FIX-NEXT: bxeq lr
+; CHECK-CORTEX-FIX-NEXT: .LBB52_1:
+; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r2]
+; CHECK-CORTEX-FIX-NEXT: vld1.64 {d18, d19}, [r3]
+; CHECK-CORTEX-FIX-NEXT: vld1.32 {d16[0]}, [r1:32]
+; CHECK-CORTEX-FIX-NEXT: .LBB52_2: @ =>This Inner Loop Header: Depth=1
+; CHECK-CORTEX-FIX-NEXT: vorr q9, q9, q9
+; CHECK-CORTEX-FIX-NEXT: vorr q8, q8, q8
+; CHECK-CORTEX-FIX-NEXT: aesd.8 q9, q8
+; CHECK-CORTEX-FIX-NEXT: subs r0, r0, #1
+; CHECK-CORTEX-FIX-NEXT: aesimc.8 q9, q9
+; CHECK-CORTEX-FIX-NEXT: bne .LBB52_2
+; CHECK-CORTEX-FIX-NEXT: @ %bb.3:
+; CHECK-CORTEX-FIX-NEXT: vst1.64 {d18, d19}, [r3]
+; CHECK-CORTEX-FIX-NEXT: bx lr
+  %5 = load i32, i32* %1, align 4
+  %6 = bitcast <16 x i8>* %2 to <4 x i32>*
+  %7 = load <4 x i32>, <4 x i32>* %6, align 8
+  %8 = insertelement <4 x i32> %7, i32 %5, i64 0
+  %9 = bitcast <4 x i32> %8 to <16 x i8>
+  %10 = icmp eq i32 %0, 0
+  br i1 %10, label %14, label %11
+
+11:
+  %12 = load <16 x i8>, <16 x i8>* %3, align 8
+  br label %15
+
+13:
+  store <16 x i8> %19, <16 x i8>* %3, align 8
+  br label %14
+
+14:
+  ret void
+
+15:
+  %16 = phi <16 x i8> [ %12, %11 ], [ %19, %15 ]
+  %17 = phi i32 [ 0, %11 ], [ %20, %15 ]
+  %18 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %16, <16 x i8> %9)
+  %19 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %18)
+  %20 = add nuw i32 %17, 1
+  %21 = icmp eq i32 %20, %0
+  br i1 %21, label %13, label %15
+}
+
+define void @aesd_set32_loop_via_val(i32 %0, i32 %1, <16 x i8>* %2, <16 x i8>* %3) nounwind {
+; CHECK-FIX-NOSCHED-LABEL: aesd_set32_loop_via_val:
+; CHECK-FIX-NOSCHED: @ %bb.0:
+; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0
+; CHECK-FIX-NOSCHED-NEXT: bxeq lr
+; CHECK-FIX-NOSCHED-NEXT: .LBB53_1: @ =>This Inner Loop Header: Depth=1
+; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r2]
+; CHECK-FIX-NOSCHED-NEXT: subs r0, r0, #1
+; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d18, d19}, [r3]
+; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[0], r1
+; CHECK-FIX-NOSCHED-NEXT: vorr q8, q8, q8
+; CHECK-FIX-NOSCHED-NEXT: aesd.8 q9, q8
+; CHECK-FIX-NOSCHED-NEXT: aesimc.8 q8, q9
+; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d16, d17}, [r3]
+; CHECK-FIX-NOSCHED-NEXT: bne .LBB53_1
+; CHECK-FIX-NOSCHED-NEXT: @ %bb.2:
+; CHECK-FIX-NOSCHED-NEXT: bx lr
+;
+; CHECK-NOFIX-LABEL: aesd_set32_loop_via_val:
+; CHECK-NOFIX: @ %bb.0:
+; CHECK-NOFIX-NEXT: cmp r0, #0
+; CHECK-NOFIX-NEXT: bxeq lr
+; CHECK-NOFIX-NEXT: .LBB53_1: @ =>This Inner Loop Header: Depth=1
+; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r2]
+; CHECK-NOFIX-NEXT: subs r0, r0, #1
+; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r3]
+; CHECK-NOFIX-NEXT: vmov.32 d16[0], r1
+; CHECK-NOFIX-NEXT: aesd.8 q9, q8
+; CHECK-NOFIX-NEXT: aesimc.8 q8, q9
+; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r3]
+; CHECK-NOFIX-NEXT: bne .LBB53_1
+; CHECK-NOFIX-NEXT: @ %bb.2:
+; CHECK-NOFIX-NEXT: bx lr
+;
+; CHECK-CORTEX-FIX-LABEL: aesd_set32_loop_via_val:
+; CHECK-CORTEX-FIX: @ %bb.0:
+; CHECK-CORTEX-FIX-NEXT: cmp r0, #0
+; CHECK-CORTEX-FIX-NEXT: bxeq lr
+; CHECK-CORTEX-FIX-NEXT: .LBB53_1: @ =>This Inner Loop Header: Depth=1
+; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r2]
+; CHECK-CORTEX-FIX-NEXT: vld1.64 {d18, d19}, [r3]
+; CHECK-CORTEX-FIX-NEXT: subs r0, r0, #1
+; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[0], r1
+; CHECK-CORTEX-FIX-NEXT: vorr q8, q8, q8
+; CHECK-CORTEX-FIX-NEXT: aesd.8 q9, q8
+; CHECK-CORTEX-FIX-NEXT: aesimc.8 q8, q9
+; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r3]
+; CHECK-CORTEX-FIX-NEXT: bne .LBB53_1
+; CHECK-CORTEX-FIX-NEXT: @ %bb.2:
+; CHECK-CORTEX-FIX-NEXT: bx lr
+  %5 = bitcast <16 x i8>* %2 to <4 x i32>*
+  %6 = icmp eq i32 %0, 0
+  br i1 %6, label %7, label %8
+
+7:
+  ret void
+
+8:
+  %9 = phi i32 [ %16, %8 ], [ 0, %4 ]
+  %10 = load <4 x i32>, <4 x i32>* %5, align 8
+  %11 = insertelement <4 x i32> %10, i32 %1, i64 0
+  %12 = bitcast <4 x i32> %11 to <16 x i8>
+  %13 = load <16 x i8>, <16 x i8>* %3, align 8
+  %14 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %13, <16 x i8> %12)
+  %15 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %14)
+  store <16 x i8> %15, <16 x i8>* %3, align 8
+  %16 = add nuw i32 %9, 1
+  %17 = icmp eq i32 %16, %0
+  br i1 %17, label %7, label %8
+}
+
+define void @aesd_set64_via_ptr(i64* %0, <16 x i8>* %1, <16 x i8>* %2) nounwind {
+; CHECK-FIX-NOSCHED-LABEL: aesd_set64_via_ptr:
+; CHECK-FIX-NOSCHED: @ %bb.0:
+; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r1]
+; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d18, d19}, [r2]
+; CHECK-FIX-NOSCHED-NEXT: vldr d16, [r0]
+; CHECK-FIX-NOSCHED-NEXT: vorr q8, q8, q8
+; CHECK-FIX-NOSCHED-NEXT: aesd.8 q9, q8
+; CHECK-FIX-NOSCHED-NEXT: aesimc.8 q8, q9
+; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d16, d17}, [r2]
+; CHECK-FIX-NOSCHED-NEXT: bx lr
+;
+; CHECK-NOFIX-LABEL: aesd_set64_via_ptr:
+; CHECK-NOFIX: @ %bb.0:
+; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r1]
+; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r2]
+; CHECK-NOFIX-NEXT: vldr d16, [r0]
+; CHECK-NOFIX-NEXT: aesd.8 q9, q8
+; CHECK-NOFIX-NEXT: aesimc.8 q8, q9
+; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r2]
+; CHECK-NOFIX-NEXT: bx lr
+;
+; CHECK-CORTEX-FIX-LABEL: aesd_set64_via_ptr:
+; CHECK-CORTEX-FIX: @ %bb.0:
+; CHECK-CORTEX-FIX-NEXT: vldr d16, [r0]
+; CHECK-CORTEX-FIX-NEXT: vld1.64 {d20, d21}, [r2]
+; CHECK-CORTEX-FIX-NEXT: vld1.64 {d18, d19}, [r1]
+; CHECK-CORTEX-FIX-NEXT: vorr d18, d16, d16
+; CHECK-CORTEX-FIX-NEXT: aesd.8 q10, q9
+; CHECK-CORTEX-FIX-NEXT: aesimc.8 q8, q10
+; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r2]
+; CHECK-CORTEX-FIX-NEXT: bx lr
+  %4 = load <16 x i8>, <16 x i8>* %2, align 8
+  %5 = load i64, i64* %0, align 8
+  %6 = bitcast <16 x i8>* %1 to <2 x i64>*
+  %7 = load <2 x i64>, <2 x i64>* %6, align 8
+  %8 = insertelement <2 x i64> %7, i64 %5, i64 0
+  %9 = bitcast <2 x i64> %8 to <16 x i8>
+  %10 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %4, <16 x i8> %9)
+  %11 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %10)
+  store <16 x i8> %11, <16 x i8>* %2, align 8
+  ret void
+}
+
+define void @aesd_set64_via_val(i64 %0, <16 x i8>* %1, <16 x i8>* %2) nounwind {
+; CHECK-FIX-NOSCHED-LABEL: aesd_set64_via_val:
+; CHECK-FIX-NOSCHED: @ %bb.0:
+; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r3]
+; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[0], r0
+; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d18, d19}, [r2]
+; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[1], r1
+; CHECK-FIX-NOSCHED-NEXT: vorr q8, q8, q8
+; CHECK-FIX-NOSCHED-NEXT: aesd.8 q8, q9
+; CHECK-FIX-NOSCHED-NEXT: aesimc.8 q8, q8
+; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d16, d17}, [r3]
+; CHECK-FIX-NOSCHED-NEXT: bx lr
+;
+; CHECK-NOFIX-LABEL: aesd_set64_via_val:
+; CHECK-NOFIX: @ %bb.0:
+; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r3]
+; CHECK-NOFIX-NEXT: vmov.32 d16[0], r0
+; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r2]
+; CHECK-NOFIX-NEXT: vmov.32 d16[1], r1
+; CHECK-NOFIX-NEXT: aesd.8 q8, q9
+; CHECK-NOFIX-NEXT: aesimc.8 q8, q8
+; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r3]
+; CHECK-NOFIX-NEXT: bx lr
+;
+; CHECK-CORTEX-FIX-LABEL: aesd_set64_via_val:
+; CHECK-CORTEX-FIX: @ %bb.0:
+; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r3]
+; CHECK-CORTEX-FIX-NEXT: vld1.64 {d18, d19}, [r2]
+; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[0], r0
+; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[1], r1
+; CHECK-CORTEX-FIX-NEXT: vorr q8, q8, q8
+; CHECK-CORTEX-FIX-NEXT: aesd.8 q8, q9
+; CHECK-CORTEX-FIX-NEXT: aesimc.8 q8, q8
+; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r3]
+; CHECK-CORTEX-FIX-NEXT: bx lr
+  %4 = bitcast <16 x i8>* %2 to <2 x i64>*
+  %5 = load <2 x i64>, <2 x i64>* %4, align 8
+  %6 = insertelement <2 x i64> %5, i64 %0, i64 0
+  %7 = bitcast <2 x i64> %6 to <16 x i8>
+  %8 = load <16 x i8>, <16 x i8>* %1, align 8
+  %9 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %7, <16 x i8> %8)
+  %10 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %9)
+  store <16 x i8> %10, <16 x i8>* %2, align 8
+  ret void
+}
+
+define void @aesd_set64_cond_via_ptr(i1 zeroext %0, i64* %1, <16 x i8>* %2, <16 x i8>* %3) nounwind {
+; CHECK-FIX-NOSCHED-LABEL: aesd_set64_cond_via_ptr:
+; CHECK-FIX-NOSCHED: @ %bb.0:
+; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r3]
+; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0
+; CHECK-FIX-NOSCHED-NEXT: beq .LBB56_2
+; CHECK-FIX-NOSCHED-NEXT: @ %bb.1:
+; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d18, d19}, [r2]
+; CHECK-FIX-NOSCHED-NEXT: vldr d18, [r1]
+; CHECK-FIX-NOSCHED-NEXT: b .LBB56_3
+; CHECK-FIX-NOSCHED-NEXT: .LBB56_2:
+; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d18, d19}, [r2]
+; CHECK-FIX-NOSCHED-NEXT: .LBB56_3:
+; CHECK-FIX-NOSCHED-NEXT: vorr q9, q9, q9
+; CHECK-FIX-NOSCHED-NEXT: aesd.8 q8, q9
+; CHECK-FIX-NOSCHED-NEXT: aesimc.8 q8, q8
+; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d16, d17}, [r3]
+; CHECK-FIX-NOSCHED-NEXT: bx lr
+;
+; CHECK-NOFIX-LABEL: aesd_set64_cond_via_ptr:
+; CHECK-NOFIX: @ %bb.0:
+; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r3]
+; CHECK-NOFIX-NEXT: cmp r0, #0
+; CHECK-NOFIX-NEXT: beq .LBB56_2
+; CHECK-NOFIX-NEXT: @ %bb.1:
+; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r2]
+; CHECK-NOFIX-NEXT: vldr d18, [r1]
+; CHECK-NOFIX-NEXT: b .LBB56_3
+; CHECK-NOFIX-NEXT: .LBB56_2:
+; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r2]
+; CHECK-NOFIX-NEXT: .LBB56_3:
+; CHECK-NOFIX-NEXT: aesd.8 q8, q9
+; CHECK-NOFIX-NEXT: aesimc.8 q8, q8
+; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r3]
+; CHECK-NOFIX-NEXT: bx lr
+;
+; CHECK-CORTEX-FIX-LABEL: aesd_set64_cond_via_ptr:
+; CHECK-CORTEX-FIX: @ %bb.0:
+; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r3]
+; CHECK-CORTEX-FIX-NEXT: cmp r0, #0
+; CHECK-CORTEX-FIX-NEXT: beq .LBB56_2
+; CHECK-CORTEX-FIX-NEXT: @ %bb.1:
+; CHECK-CORTEX-FIX-NEXT: vldr d20, [r1]
+; CHECK-CORTEX-FIX-NEXT: vld1.64 {d18, d19}, [r2]
+; CHECK-CORTEX-FIX-NEXT: vorr d18, d20, d20
+; CHECK-CORTEX-FIX-NEXT: b .LBB56_3
+; CHECK-CORTEX-FIX-NEXT: .LBB56_2:
+; CHECK-CORTEX-FIX-NEXT: vld1.64 {d18, d19}, [r2]
+; CHECK-CORTEX-FIX-NEXT: .LBB56_3:
+; CHECK-CORTEX-FIX-NEXT: vorr q9, q9, q9
+; CHECK-CORTEX-FIX-NEXT: aesd.8 q8, q9
+; CHECK-CORTEX-FIX-NEXT: aesimc.8 q8, q8
+; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r3]
+; CHECK-CORTEX-FIX-NEXT: bx lr
+  %5 = load <16 x i8>, <16 x i8>* %3, align 8
+  br i1 %0, label %6, label %11
+
+6:
+  %7 = load i64, i64* %1, align 8
+  %8 = bitcast <16 x i8>* %2 to <2 x i64>*
+  %9 = load <2 x i64>, <2 x i64>* %8, align 8
+  %10 = insertelement <2 x i64> %9, i64 %7, i64 0
+  br label %14
+
+11:
+  %12 = bitcast <16 x i8>* %2 to <2 x i64>*
+  %13 = load <2 x i64>, <2 x i64>* %12, align 8
+  br label %14
+
+14:
+  %15 = phi <2 x i64> [ %10, %6 ], [ %13, %11 ]
+  %16 = bitcast <2 x i64> %15 to <16 x i8>
+  %17 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %5, <16 x i8> %16)
+  %18 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %17)
+  store <16 x i8> %18, <16 x i8>* %3, align 8
+  ret void
+}
+
+define void @aesd_set64_cond_via_val(i1 zeroext %0, i64 %1, <16 x i8>* %2, <16 x i8>* %3) nounwind {
+; CHECK-FIX-NOSCHED-LABEL: aesd_set64_cond_via_val:
+; CHECK-FIX-NOSCHED: @ %bb.0:
+; CHECK-FIX-NOSCHED-NEXT: ldr r1, [sp, #4]
+; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0
+; CHECK-FIX-NOSCHED-NEXT: ldr r12, [sp]
+; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r1]
+; CHECK-FIX-NOSCHED-NEXT: beq .LBB57_2
+; CHECK-FIX-NOSCHED-NEXT: @ %bb.1:
+; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[0], r2
+; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[1], r3
+; CHECK-FIX-NOSCHED-NEXT: .LBB57_2: @ %select.end
+; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d18, d19}, [r12]
+; CHECK-FIX-NOSCHED-NEXT: vorr q8, q8, q8
+; CHECK-FIX-NOSCHED-NEXT: aesd.8 q8, q9
+; CHECK-FIX-NOSCHED-NEXT: aesimc.8 q8, q8
+; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d16, d17}, [r1]
+; CHECK-FIX-NOSCHED-NEXT: bx lr
+;
+; CHECK-NOFIX-LABEL: aesd_set64_cond_via_val:
+; CHECK-NOFIX: @ %bb.0:
+; CHECK-NOFIX-NEXT: ldr r1, [sp, #4]
+; CHECK-NOFIX-NEXT: cmp r0, #0
+; CHECK-NOFIX-NEXT: ldr r12, [sp]
+; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r1]
+; CHECK-NOFIX-NEXT: beq .LBB57_2
+; CHECK-NOFIX-NEXT: @ %bb.1:
+; CHECK-NOFIX-NEXT: vmov.32 d16[0], r2
+; CHECK-NOFIX-NEXT: vmov.32 d16[1], r3
+; CHECK-NOFIX-NEXT: .LBB57_2: @ %select.end
+; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r12]
+; CHECK-NOFIX-NEXT: aesd.8 q8, q9
+; CHECK-NOFIX-NEXT: aesimc.8 q8, q8
+; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r1]
+; CHECK-NOFIX-NEXT: bx lr
+;
+; CHECK-CORTEX-FIX-LABEL: aesd_set64_cond_via_val:
+; CHECK-CORTEX-FIX: @ %bb.0:
+; CHECK-CORTEX-FIX-NEXT: ldr r1, [sp, #4]
+; CHECK-CORTEX-FIX-NEXT: ldr r12, [sp]
+; CHECK-CORTEX-FIX-NEXT: cmp r0, #0
+; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r1]
+; CHECK-CORTEX-FIX-NEXT: beq .LBB57_2
+; CHECK-CORTEX-FIX-NEXT: @ %bb.1:
+; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[0], r2
+; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[1], r3
+; CHECK-CORTEX-FIX-NEXT: .LBB57_2: @ %select.end
+; CHECK-CORTEX-FIX-NEXT: vld1.64 {d18, d19}, [r12]
+; CHECK-CORTEX-FIX-NEXT: vorr q8, q8, q8
+; CHECK-CORTEX-FIX-NEXT: aesd.8 q8, q9
+; CHECK-CORTEX-FIX-NEXT: aesimc.8 q8, q8
+; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r1]
+; CHECK-CORTEX-FIX-NEXT: bx lr
+  %5 = bitcast <16 x i8>* %3 to <2 x i64>*
+  %6 = load <2 x i64>, <2 x i64>* %5, align 8
+  %7 = insertelement <2 x i64> %6, i64 %1, i64 0
+  %8 = select i1 %0, <2 x i64> %7, <2 x i64> %6
+  %9 = bitcast <2 x i64> %8 to <16 x i8>
+  %10 = load <16 x i8>, <16 x i8>* %2, align 8
+  %11 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %9, <16 x i8> %10)
+  %12 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %11)
+  store <16 x i8> %12, <16 x i8>* %3, align 8
+  ret void
+}
+
+define void @aesd_set64_loop_via_ptr(i32 %0, i64* %1, <16 x i8>* %2, <16 x i8>* %3) nounwind {
+; CHECK-FIX-NOSCHED-LABEL: aesd_set64_loop_via_ptr:
+; CHECK-FIX-NOSCHED: @ %bb.0:
+; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0
+; CHECK-FIX-NOSCHED-NEXT: bxeq lr
+; CHECK-FIX-NOSCHED-NEXT: .LBB58_1:
+; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r2]
+; CHECK-FIX-NOSCHED-NEXT: vldr d16, [r1]
+; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d18, d19}, [r3]
+; CHECK-FIX-NOSCHED-NEXT: .LBB58_2: @ =>This Inner Loop Header: Depth=1
+; CHECK-FIX-NOSCHED-NEXT: vorr q9, q9, q9
+; CHECK-FIX-NOSCHED-NEXT: vorr q8, q8, q8
+; CHECK-FIX-NOSCHED-NEXT: aesd.8 q9, q8
+; CHECK-FIX-NOSCHED-NEXT: subs r0, r0, #1
+; CHECK-FIX-NOSCHED-NEXT: aesimc.8 q9, q9
+; CHECK-FIX-NOSCHED-NEXT: bne .LBB58_2
+; CHECK-FIX-NOSCHED-NEXT: @ %bb.3:
+; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d18, d19}, [r3]
+; CHECK-FIX-NOSCHED-NEXT: bx lr
+;
+; CHECK-NOFIX-LABEL: aesd_set64_loop_via_ptr:
+; CHECK-NOFIX: @ %bb.0:
+; CHECK-NOFIX-NEXT: cmp r0, #0
+; CHECK-NOFIX-NEXT: bxeq lr
+; CHECK-NOFIX-NEXT: .LBB58_1:
+; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r2]
+; CHECK-NOFIX-NEXT: vldr d16, [r1]
+; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r3]
+; CHECK-NOFIX-NEXT: .LBB58_2: @ =>This Inner Loop Header: Depth=1
+; CHECK-NOFIX-NEXT: aesd.8 q9, q8
+; CHECK-NOFIX-NEXT: subs r0, r0, #1
+; CHECK-NOFIX-NEXT: aesimc.8 q9, q9
+; CHECK-NOFIX-NEXT: bne .LBB58_2
+; CHECK-NOFIX-NEXT: @ %bb.3:
+; CHECK-NOFIX-NEXT: vst1.64 {d18, d19}, [r3]
+; CHECK-NOFIX-NEXT: bx lr
+;
+; CHECK-CORTEX-FIX-LABEL: aesd_set64_loop_via_ptr:
+; CHECK-CORTEX-FIX: @ %bb.0:
+; CHECK-CORTEX-FIX-NEXT: cmp r0, #0
+; CHECK-CORTEX-FIX-NEXT: bxeq lr
+; CHECK-CORTEX-FIX-NEXT: .LBB58_1:
+; CHECK-CORTEX-FIX-NEXT: vldr d18, [r1]
+; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r2]
+; CHECK-CORTEX-FIX-NEXT: vorr d16, d18, d18
+; CHECK-CORTEX-FIX-NEXT: vld1.64 {d18, d19}, [r3]
+; CHECK-CORTEX-FIX-NEXT: .LBB58_2: @ =>This Inner Loop Header: Depth=1
+; CHECK-CORTEX-FIX-NEXT: vorr q9, q9, q9
+; CHECK-CORTEX-FIX-NEXT: aesd.8 q9, q8
+; CHECK-CORTEX-FIX-NEXT: subs r0, r0, #1
+; CHECK-CORTEX-FIX-NEXT: aesimc.8 q9, q9
+; CHECK-CORTEX-FIX-NEXT: bne .LBB58_2
+; CHECK-CORTEX-FIX-NEXT: @ %bb.3:
+; CHECK-CORTEX-FIX-NEXT: vst1.64 {d18, d19}, [r3]
+; CHECK-CORTEX-FIX-NEXT: bx lr
+  %5 = load i64, i64* %1, align 8
+  %6 = bitcast <16 x i8>* %2 to <2 x i64>*
+  %7 = load <2 x i64>, <2 x i64>* %6, align 8
+  %8 = insertelement <2 x i64> %7, i64 %5, i64 0
+  %9 = bitcast <2 x i64> %8 to <16 x i8>
+  %10 = icmp eq i32 %0, 0
+  br i1 %10, label %14, label %11
+
+11:
+  %12 = load <16 x i8>, <16 x i8>* %3, align 8
+  br label %15
+
+13:
+  store <16 x i8> %19, <16 x i8>* %3, align 8
+  br label %14
+
+14:
+  ret void
+
+15:
+  %16 = phi <16 x i8> [ %12, %11 ], [ %19, %15 ]
+  %17 = phi i32 [ 0, %11 ], [ %20, %15 ]
+  %18 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %16, <16 x i8> %9)
+  %19 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %18)
+  %20 = add nuw i32 %17, 1
+  %21 = icmp eq i32 %20, %0
+  br i1 %21, label %13, label %15
+}
+
+define void @aesd_set64_loop_via_val(i32 %0, i64 %1, <16 x i8>* %2, <16 x i8>* %3) nounwind {
+; CHECK-FIX-NOSCHED-LABEL: aesd_set64_loop_via_val:
+; CHECK-FIX-NOSCHED: @ %bb.0:
+; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0
+; CHECK-FIX-NOSCHED-NEXT: bxeq lr
+; CHECK-FIX-NOSCHED-NEXT: .LBB59_1:
+; CHECK-FIX-NOSCHED-NEXT: ldr r1, [sp, #4]
+; CHECK-FIX-NOSCHED-NEXT: ldr r12, [sp]
+; CHECK-FIX-NOSCHED-NEXT: .LBB59_2: @ =>This Inner Loop Header: Depth=1
+; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r12]
+; CHECK-FIX-NOSCHED-NEXT: subs r0, r0, #1
+; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[0], r2
+; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d18, d19}, [r1]
+; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[1], r3
+; CHECK-FIX-NOSCHED-NEXT: vorr q8, q8, q8
+; CHECK-FIX-NOSCHED-NEXT: aesd.8 q9, q8
+; CHECK-FIX-NOSCHED-NEXT: aesimc.8 q8, q9
+; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d16, d17}, [r1]
+; CHECK-FIX-NOSCHED-NEXT: bne .LBB59_2
+; CHECK-FIX-NOSCHED-NEXT: @ %bb.3:
+; CHECK-FIX-NOSCHED-NEXT: bx lr
+;
+; CHECK-NOFIX-LABEL: aesd_set64_loop_via_val:
+; CHECK-NOFIX: @ %bb.0:
+; CHECK-NOFIX-NEXT: cmp r0, #0
+; CHECK-NOFIX-NEXT: bxeq lr
+; CHECK-NOFIX-NEXT: .LBB59_1:
+; CHECK-NOFIX-NEXT: ldr r1, [sp, #4]
+; CHECK-NOFIX-NEXT: ldr r12, [sp]
+; CHECK-NOFIX-NEXT: .LBB59_2: @ =>This Inner Loop Header: Depth=1
+; CHECK-NOFIX-NEXT: vld1.64 {d16, d17}, [r12]
+; CHECK-NOFIX-NEXT: subs r0, r0, #1
+; CHECK-NOFIX-NEXT: vmov.32 d16[0], r2
+; CHECK-NOFIX-NEXT: vld1.64 {d18, d19}, [r1]
+; CHECK-NOFIX-NEXT: vmov.32 d16[1], r3
+; CHECK-NOFIX-NEXT: aesd.8 q9, q8
+; CHECK-NOFIX-NEXT: aesimc.8 q8, q9
+; CHECK-NOFIX-NEXT: vst1.64 {d16, d17}, [r1]
+; CHECK-NOFIX-NEXT: bne .LBB59_2
+; CHECK-NOFIX-NEXT: @ %bb.3:
+; CHECK-NOFIX-NEXT: bx lr
+;
+; CHECK-CORTEX-FIX-LABEL: aesd_set64_loop_via_val:
+; CHECK-CORTEX-FIX: @ %bb.0:
+; CHECK-CORTEX-FIX-NEXT: cmp r0, #0
+; CHECK-CORTEX-FIX-NEXT: bxeq lr
+; CHECK-CORTEX-FIX-NEXT: .LBB59_1:
+; CHECK-CORTEX-FIX-NEXT: ldr r1, [sp, #4]
+; CHECK-CORTEX-FIX-NEXT: ldr r12, [sp]
+; CHECK-CORTEX-FIX-NEXT: .LBB59_2: @ =>This Inner Loop Header: Depth=1
+; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r12]
+; CHECK-CORTEX-FIX-NEXT: vld1.64 {d18, d19}, [r1]
+; CHECK-CORTEX-FIX-NEXT: subs r0, r0, #1
+; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[0], r2
+; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[1], r3
+; CHECK-CORTEX-FIX-NEXT: vorr q8, q8, q8
+; CHECK-CORTEX-FIX-NEXT: aesd.8 q9, q8
+; CHECK-CORTEX-FIX-NEXT: aesimc.8 q8, q9
+; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r1]
+; CHECK-CORTEX-FIX-NEXT: bne .LBB59_2
+; CHECK-CORTEX-FIX-NEXT: @ %bb.3:
+; CHECK-CORTEX-FIX-NEXT: bx lr
+  %5 = bitcast <16 x i8>* %2 to <2 x i64>*
+  %6 = icmp eq i32 %0, 0
+  br i1 %6, label %7, label %8
+
+7:
+  ret void
+
+8:
+  %9 = phi i32 [ %16, %8 ], [ 0, %4 ]
+  %10 = load <2 x i64>, <2 x i64>* %5, align 8
+  %11 = insertelement <2 x i64> %10, i64 %1, i64 0
+  %12 = bitcast <2 x i64> %11 to <16 x i8>
+  %13 = load <16 x i8>, <16 x i8>* %3, align 8
+  %14 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %13, <16 x i8> %12)
+  %15 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %14)
+  store <16 x i8> %15, <16 x i8>* %3, align 8
+  %16 = add nuw i32 %9, 1
+  %17 = icmp eq i32 %16, %0
+  br i1 %17, label %7, label %8
+}
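
For reference, the IR patterns exercised above correspond to ACLE source along the following lines (a hypothetical sketch, not part of the patch): the program overwrites one lane of the AES data operand and then feeds it to the AESD/AESIMC intrinsics, the "set a lane, then AES" shape that the workaround guards by making the input whole again with a full 128-bit copy (the vorr qN, qN, qN instructions in the CHECK-FIX lines above).

// Hypothetical C equivalent of the @aesd_set32_via_val pattern; illustrative
// only, not part of the patch. Assumes an AArch32 target with the crypto
// extension, e.g. -march=armv8-a+crypto -mfix-cortex-a57-aes-1742098.
#include <arm_neon.h>
#include <stdint.h>

void aesd_set32_via_val_example(uint32_t v, uint8x16_t *key, uint8x16_t *data) {
  // Overwrite lane 0 of the data block before the AES round. With the fix
  // enabled, the backend emits a full 128-bit copy so that aesd.8 never
  // consumes a register whose halves were written independently.
  uint32x4_t d = vreinterpretq_u32_u8(*data);
  d = vsetq_lane_u32(v, d, 0);
  uint8x16_t round = vaesdq_u8(vreinterpretq_u8_u32(d), *key);
  *data = vaesimcq_u8(round);
}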