Index: lib/Target/AArch64/AArch64.h
===================================================================
--- lib/Target/AArch64/AArch64.h
+++ lib/Target/AArch64/AArch64.h
@@ -39,6 +39,7 @@
 FunctionPass *createAArch64ConditionOptimizerPass();
 FunctionPass *createAArch64AddressTypePromotionPass();
 FunctionPass *createAArch64A57FPLoadBalancing();
+FunctionPass *createAArch64FixCortexA53_835769();
 /// \brief Creates an ARM-specific Target Transformation Info pass.
 ImmutablePass *
 createAArch64TargetTransformInfoPass(const AArch64TargetMachine *TM);
Index: lib/Target/AArch64/AArch64FixCortexA53_835769.cpp
===================================================================
--- /dev/null
+++ lib/Target/AArch64/AArch64FixCortexA53_835769.cpp
@@ -0,0 +1,231 @@
+//===-- AArch64FixCortexA53_835769.cpp ------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This pass changes code to work around Cortex-A53 erratum 835769. It does so
+// by inserting a nop instruction in code sequences that in some circumstances
+// may trigger the erratum.
+// The nop is inserted between a pair of instructions of the following two
+// classes:
+//   instr 1: a mem-instr (load, store or prefetch).
+//   instr 2: a non-SIMD integer multiply-accumulate writing a 64-bit X register.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-fix-cortex-a53-835769"
+
+static cl::opt<bool>
+WorkAroundA53Erratum("aarch64-fix-cortex-a53-835769", cl::Hidden,
+                     cl::desc("Work around Cortex-A53 erratum 835769"),
+                     cl::init(false));
+
+STATISTIC(NumNopsAdded, "Number of Nops added to work around erratum 835769");
+
+//===----------------------------------------------------------------------===//
+// Helper functions
+
+// Is the instruction a match for the instruction that comes first in the
+// sequence of instructions that can trigger the erratum?
+static bool isFirstInstructionInSequence(MachineInstr *MI) {
+  // Must return true if this instruction is a load, a store or a prefetch.
+  switch (MI->getOpcode()) {
+  case AArch64::PRFMl:
+  case AArch64::PRFMroW:
+  case AArch64::PRFMroX:
+  case AArch64::PRFMui:
+  case AArch64::PRFUMi:
+    return true;
+  default:
+    return (MI->mayLoad() || MI->mayStore());
+  }
+}
+
+// Is the instruction a match for the instruction that comes second in the
+// sequence that can trigger the erratum?
+static bool isSecondInstructionInSequence(MachineInstr *MI) {
+  // Must return true for non-SIMD integer multiply-accumulates writing
+  // to a 64-bit register.
+  switch (MI->getOpcode()) {
+  // The erratum cannot be triggered when the destination register is 32 bits,
+  // therefore only include the following.
+  case AArch64::MSUBXrrr:
+  case AArch64::MADDXrrr:
+  case AArch64::SMADDLrrr:
+  case AArch64::SMSUBLrrr:
+  case AArch64::UMADDLrrr:
+  case AArch64::UMSUBLrrr:
+    // The erratum can only be triggered by multiply-adds, not by regular
+    // non-accumulating multiplies, i.e. when Ra=XZR='11111'.
+    return MI->getOperand(3).getReg() != AArch64::XZR;
+  default:
+    return false;
+  }
+}
+
+
+//===----------------------------------------------------------------------===//
+
+namespace {
+class AArch64FixCortexA53_835769 : public MachineFunctionPass {
+  const AArch64InstrInfo *TII;
+
+public:
+  static char ID;
+  explicit AArch64FixCortexA53_835769() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &F) override;
+
+  const char *getPassName() const override {
+    return "Workaround A53 erratum 835769 pass";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+private:
+  bool runOnBasicBlock(MachineBasicBlock &MBB);
+};
+char AArch64FixCortexA53_835769::ID = 0;
+
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+
+bool
+AArch64FixCortexA53_835769::runOnMachineFunction(MachineFunction &F) {
+  const TargetMachine &TM = F.getTarget();
+  if (!WorkAroundA53Erratum)
+    return false;
+
+  bool Changed = false;
+  DEBUG(dbgs() << "***** AArch64FixCortexA53_835769 *****\n");
+
+  TII = TM.getSubtarget<AArch64Subtarget>().getInstrInfo();
+
+  for (auto &MBB : F) {
+    Changed |= runOnBasicBlock(MBB);
+  }
+
+  return Changed;
+}
+
+// Return the block that was fallen through to get to MBB, if any,
+// otherwise nullptr.
+static MachineBasicBlock *getBBFallenThrough(MachineBasicBlock &MBB) {
+  // Get the previous machine basic block in the function.
+  MachineFunction::iterator MBBI = MBB;
+
+  // Can't go off the top of the function.
+  if (MBBI == MBB.getParent()->begin())
+    return nullptr;
+
+  MachineBasicBlock *PrevBB = std::prev(MBBI);
+  for (MachineBasicBlock *S : MBB.predecessors())
+    if (S == PrevBB)
+      return S;
+
+  return nullptr;
+}
+
+static MachineInstr *getLastNonPseudo(MachineBasicBlock *MBB) {
+  for (auto I = MBB->rbegin(), E = MBB->rend(); I != E; ++I) {
+    if (!I->isPseudo())
+      return &*I;
+  }
+
+  llvm_unreachable("Expected to find instruction");
+}
+
+static void insertNopBeforeInstruction(MachineBasicBlock &MBB, MachineInstr* MI,
+                                       const TargetInstrInfo *TII) {
+  // If we are the first instruction of the block, put the NOP at the end of
+  // the previous fallthrough block.
+  if (MI == &MBB.front()) {
+    MachineBasicBlock *PMBB = getBBFallenThrough(MBB);
+    assert(PMBB && "Expected basic block");
+    MachineInstr *I = getLastNonPseudo(PMBB);
+    assert(I && "Expected instruction");
+    DebugLoc DL = I->getDebugLoc();
+    BuildMI(PMBB, DL, TII->get(AArch64::HINT)).addImm(0);
+  } else {
+    DebugLoc DL = MI->getDebugLoc();
+    BuildMI(MBB, MI, DL, TII->get(AArch64::HINT)).addImm(0);
+  }
+
+  ++NumNopsAdded;
+}
+
+bool
+AArch64FixCortexA53_835769::runOnBasicBlock(MachineBasicBlock &MBB) {
+  bool Changed = false;
+  DEBUG(dbgs() << "Running on MBB: " << MBB << " - scanning instructions...\n");
+
+  // First, scan the basic block, looking for a sequence of 2 instructions
+  // that match the conditions under which the erratum may trigger.
+
+  // List of terminating instructions in matching sequences
+  std::vector<MachineInstr*> Sequences;
+  unsigned Idx = 0;
+  MachineInstr *PrevInstr = nullptr;
+
+  if (MachineBasicBlock *PMBB = getBBFallenThrough(MBB))
+    PrevInstr = getLastNonPseudo(PMBB);
+
+  for (auto &MI : MBB) {
+    MachineInstr *CurrInstr = &MI;
+    DEBUG(dbgs() << "  Examining: " << MI);
+    if (PrevInstr) {
+      DEBUG(dbgs() << "    PrevInstr: " << *PrevInstr
+                   << "    CurrInstr: " << *CurrInstr
+                   << "    isFirstInstructionInSequence(PrevInstr): "
+                   << isFirstInstructionInSequence(PrevInstr) << "\n"
+                   << "    isSecondInstructionInSequence(CurrInstr): "
+                   << isSecondInstructionInSequence(CurrInstr) << "\n");
+      if (isFirstInstructionInSequence(PrevInstr) &&
+          isSecondInstructionInSequence(CurrInstr)) {
+        DEBUG(dbgs() << "   ** pattern found at Idx " << Idx << "!\n");
+        Sequences.push_back(CurrInstr);
+      }
+    }
+    if (!CurrInstr->isPseudo())
+      PrevInstr = CurrInstr;
+    ++Idx;
+  }
+
+  DEBUG(dbgs() << "Scan complete, " << Sequences.size()
+               << " occurrences of pattern found.\n");
+
+  // Then update the basic block, inserting nops between the detected sequences.
+  for (auto &MI : Sequences) {
+    Changed = true;
+    insertNopBeforeInstruction(MBB, MI, TII);
+  }
+
+  return Changed;
+}
+
+// Factory function used by AArch64TargetMachine to add the pass to
+// the pass manager.
+FunctionPass *llvm::createAArch64FixCortexA53_835769() {
+  return new AArch64FixCortexA53_835769();
+}
Index: lib/Target/AArch64/AArch64TargetMachine.cpp
===================================================================
--- lib/Target/AArch64/AArch64TargetMachine.cpp
+++ lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -274,6 +274,7 @@
 }
 
 bool AArch64PassConfig::addPreEmitPass() {
+  addPass(createAArch64FixCortexA53_835769());
   // Relax conditional branch instructions if they're otherwise out of
   // range of their destination.
   addPass(createAArch64BranchRelaxation());
Index: lib/Target/AArch64/CMakeLists.txt
===================================================================
--- lib/Target/AArch64/CMakeLists.txt
+++ lib/Target/AArch64/CMakeLists.txt
@@ -26,6 +26,7 @@
   AArch64DeadRegisterDefinitionsPass.cpp
   AArch64ExpandPseudoInsts.cpp
   AArch64FastISel.cpp
+  AArch64FixCortexA53_835769.cpp
   AArch64FrameLowering.cpp
   AArch64ConditionOptimizer.cpp
   AArch64ISelDAGToDAG.cpp
Index: test/CodeGen/AArch64/aarch64-fix-cortex-a53-835769.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/aarch64-fix-cortex-a53-835769.ll
@@ -0,0 +1,524 @@
+; The regression tests need to check the order of emitted instructions and are
+; therefore fragile and reliant on instruction scheduling. The test cases have
+; been minimized as much as possible, but most of them could still break if the
+; instruction scheduling heuristics for cortex-a53 change.
+; RUN: llc < %s -mcpu=cortex-a53 -aarch64-fix-cortex-a53-835769=1 -stats 2>&1 \
+; RUN:   | FileCheck %s --check-prefix CHECK
+; RUN: llc < %s -mcpu=cortex-a53 -aarch64-fix-cortex-a53-835769=0 -stats 2>&1 \
+; RUN:   | FileCheck %s --check-prefix CHECK-NOWORKAROUND
+; The following RUN lines are just to verify whether or not this pass runs by
+; default for given CPUs. Given the fragility of the tests, this is only run on
+; a test case where the scheduler has no freedom at all to reschedule the
+; instructions, so that potentially very different scheduling heuristics will
+; not break the test case.
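+;
+; As a rough illustration of the pattern the CHECK lines below look for (an
+; assumed, simplified example; register choices are arbitrary): with the
+; workaround enabled, a sequence such as
+;     ldr  x1, [x2]
+;     madd x0, x3, x4, x5
+; is emitted as
+;     ldr  x1, [x2]
+;     nop
+;     madd x0, x3, x4, x5
+; whereas 32-bit destinations (e.g. madd w0, w3, w4, w5) and non-accumulating
+; multiplies (mul) are left unchanged.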
+; RUN: llc < %s -mcpu=generic    | FileCheck %s --check-prefix CHECK-BASIC-PASS-DISABLED
+; RUN: llc < %s -mcpu=cortex-a53 | FileCheck %s --check-prefix CHECK-BASIC-PASS-DISABLED
+; RUN: llc < %s -mcpu=cortex-a57 | FileCheck %s --check-prefix CHECK-BASIC-PASS-DISABLED
+; RUN: llc < %s -mcpu=cyclone    | FileCheck %s --check-prefix CHECK-BASIC-PASS-DISABLED
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+define i64 @f_load_madd_64(i64 %a, i64 %b, i64* nocapture readonly %c) #0 {
+entry:
+  %0 = load i64* %c, align 8
+  %mul = mul nsw i64 %0, %b
+  %add = add nsw i64 %mul, %a
+  ret i64 %add
+}
+; CHECK-LABEL: f_load_madd_64:
+; CHECK: ldr
+; CHECK-NEXT: nop
+; CHECK-NEXT: madd
+; CHECK-NOWORKAROUND-LABEL: f_load_madd_64:
+; CHECK-NOWORKAROUND: ldr
+; CHECK-NOWORKAROUND-NEXT: madd
+; CHECK-BASIC-PASS-DISABLED-LABEL: f_load_madd_64:
+; CHECK-BASIC-PASS-DISABLED: ldr
+; CHECK-BASIC-PASS-DISABLED-NEXT: madd
+
+
+define i32 @f_load_madd_32(i32 %a, i32 %b, i32* nocapture readonly %c) #0 {
+entry:
+  %0 = load i32* %c, align 4
+  %mul = mul nsw i32 %0, %b
+  %add = add nsw i32 %mul, %a
+  ret i32 %add
+}
+; CHECK-LABEL: f_load_madd_32:
+; CHECK: ldr
+; CHECK-NEXT: madd
+; CHECK-NOWORKAROUND-LABEL: f_load_madd_32:
+; CHECK-NOWORKAROUND: ldr
+; CHECK-NOWORKAROUND-NEXT: madd
+
+
+define i64 @f_load_msub_64(i64 %a, i64 %b, i64* nocapture readonly %c) #0 {
+entry:
+  %0 = load i64* %c, align 8
+  %mul = mul nsw i64 %0, %b
+  %sub = sub nsw i64 %a, %mul
+  ret i64 %sub
+}
+; CHECK-LABEL: f_load_msub_64:
+; CHECK: ldr
+; CHECK-NEXT: nop
+; CHECK-NEXT: msub
+; CHECK-NOWORKAROUND-LABEL: f_load_msub_64:
+; CHECK-NOWORKAROUND: ldr
+; CHECK-NOWORKAROUND-NEXT: msub
+
+
+define i32 @f_load_msub_32(i32 %a, i32 %b, i32* nocapture readonly %c) #0 {
+entry:
+  %0 = load i32* %c, align 4
+  %mul = mul nsw i32 %0, %b
+  %sub = sub nsw i32 %a, %mul
+  ret i32 %sub
+}
+; CHECK-LABEL: f_load_msub_32:
+; CHECK: ldr
+; CHECK-NEXT: msub
+; CHECK-NOWORKAROUND-LABEL: f_load_msub_32:
+; CHECK-NOWORKAROUND: ldr
+; CHECK-NOWORKAROUND-NEXT: msub
+
+
+define i64 @f_load_mul_64(i64 %a, i64 %b, i64* nocapture readonly %c) #0 {
+entry:
+  %0 = load i64* %c, align 8
+  %mul = mul nsw i64 %0, %b
+  ret i64 %mul
+}
+; CHECK-LABEL: f_load_mul_64:
+; CHECK: ldr
+; CHECK-NEXT: mul
+; CHECK-NOWORKAROUND-LABEL: f_load_mul_64:
+; CHECK-NOWORKAROUND: ldr
+; CHECK-NOWORKAROUND-NEXT: mul
+
+
+define i32 @f_load_mul_32(i32 %a, i32 %b, i32* nocapture readonly %c) #0 {
+entry:
+  %0 = load i32* %c, align 4
+  %mul = mul nsw i32 %0, %b
+  ret i32 %mul
+}
+; CHECK-LABEL: f_load_mul_32:
+; CHECK: ldr
+; CHECK-NEXT: mul
+; CHECK-NOWORKAROUND-LABEL: f_load_mul_32:
+; CHECK-NOWORKAROUND: ldr
+; CHECK-NOWORKAROUND-NEXT: mul
+
+
+define i64 @f_load_mneg_64(i64 %a, i64 %b, i64* nocapture readonly %c) #0 {
+entry:
+  %0 = load i64* %c, align 8
+  %mul = sub i64 0, %b
+  %sub = mul i64 %0, %mul
+  ret i64 %sub
+}
+; CHECK-LABEL: f_load_mneg_64:
+; CHECK-NOWORKAROUND-LABEL: f_load_mneg_64:
+; FIXME: only add further checks here once LLVM actually produces
+; neg instructions
+; FIXME-CHECK: ldr
+; FIXME-CHECK-NEXT: nop
+; FIXME-CHECK-NEXT: mneg
+; FIXME-CHECK-NOWORKAROUND: ldr
+; FIXME-CHECK-NOWORKAROUND-NEXT: mneg
+
+
+define i32 @f_load_mneg_32(i32 %a, i32 %b, i32* nocapture readonly %c) #0 {
+entry:
+  %0 = load i32* %c, align 4
+  %mul = sub i32 0, %b
+  %sub = mul i32 %0, %mul
+  ret i32 %sub
+}
+; CHECK-LABEL: f_load_mneg_32:
+; CHECK-NOWORKAROUND-LABEL: f_load_mneg_32:
+; FIXME: only add further checks here once LLVM actually produces
+; neg instructions
+; FIXME-CHECK: ldr
+; FIXME-CHECK-NEXT: mneg
+; FIXME-CHECK-NOWORKAROUND: ldr
+; FIXME-CHECK-NOWORKAROUND-NEXT: mneg
+
+
+define i64 @f_load_smaddl(i64 %a, i32 %b, i32 %c, i32* nocapture readonly %d) #0 {
+entry:
+  %conv = sext i32 %b to i64
+  %conv1 = sext i32 %c to i64
+  %mul = mul nsw i64 %conv1, %conv
+  %add = add nsw i64 %mul, %a
+  %0 = load i32* %d, align 4
+  %conv2 = sext i32 %0 to i64
+  %add3 = add nsw i64 %add, %conv2
+  ret i64 %add3
+}
+; CHECK-LABEL: f_load_smaddl:
+; CHECK: ldrsw
+; CHECK-NEXT: nop
+; CHECK-NEXT: smaddl
+; CHECK-NOWORKAROUND-LABEL: f_load_smaddl:
+; CHECK-NOWORKAROUND: ldrsw
+; CHECK-NOWORKAROUND-NEXT: smaddl
+
+
+define i64 @f_load_smsubl_64(i64 %a, i32 %b, i32 %c, i32* nocapture readonly %d) #0 {
+entry:
+  %conv = sext i32 %b to i64
+  %conv1 = sext i32 %c to i64
+  %mul = mul nsw i64 %conv1, %conv
+  %sub = sub i64 %a, %mul
+  %0 = load i32* %d, align 4
+  %conv2 = sext i32 %0 to i64
+  %add = add nsw i64 %sub, %conv2
+  ret i64 %add
+}
+; CHECK-LABEL: f_load_smsubl_64:
+; CHECK: ldrsw
+; CHECK-NEXT: nop
+; CHECK-NEXT: smsubl
+; CHECK-NOWORKAROUND-LABEL: f_load_smsubl_64:
+; CHECK-NOWORKAROUND: ldrsw
+; CHECK-NOWORKAROUND-NEXT: smsubl
+
+
+define i64 @f_load_smull(i64 %a, i32 %b, i32 %c, i32* nocapture readonly %d) #0 {
+entry:
+  %conv = sext i32 %b to i64
+  %conv1 = sext i32 %c to i64
+  %mul = mul nsw i64 %conv1, %conv
+  %0 = load i32* %d, align 4
+  %conv2 = sext i32 %0 to i64
+  %div = sdiv i64 %mul, %conv2
+  ret i64 %div
+}
+; CHECK-LABEL: f_load_smull:
+; CHECK: ldrsw
+; CHECK-NEXT: smull
+; CHECK-NOWORKAROUND-LABEL: f_load_smull:
+; CHECK-NOWORKAROUND: ldrsw
+; CHECK-NOWORKAROUND-NEXT: smull
+
+
+define i64 @f_load_smnegl_64(i64 %a, i32 %b, i32 %c, i32* nocapture readonly %d) #0 {
+entry:
+  %conv = sext i32 %b to i64
+  %conv1 = sext i32 %c to i64
+  %mul = sub nsw i64 0, %conv
+  %sub = mul i64 %conv1, %mul
+  %0 = load i32* %d, align 4
+  %conv2 = sext i32 %0 to i64
+  %div = sdiv i64 %sub, %conv2
+  ret i64 %div
+}
+; CHECK-LABEL: f_load_smnegl_64:
+; CHECK-NOWORKAROUND-LABEL: f_load_smnegl_64:
+; FIXME: only add further checks here once LLVM actually produces
+; smnegl instructions
+
+
+define i64 @f_load_umaddl(i64 %a, i32 %b, i32 %c, i32* nocapture readonly %d) #0 {
+entry:
+  %conv = zext i32 %b to i64
+  %conv1 = zext i32 %c to i64
+  %mul = mul i64 %conv1, %conv
+  %add = add i64 %mul, %a
+  %0 = load i32* %d, align 4
+  %conv2 = zext i32 %0 to i64
+  %add3 = add i64 %add, %conv2
+  ret i64 %add3
+}
+; CHECK-LABEL: f_load_umaddl:
+; CHECK: ldr
+; CHECK-NEXT: nop
+; CHECK-NEXT: umaddl
+; CHECK-NOWORKAROUND-LABEL: f_load_umaddl:
+; CHECK-NOWORKAROUND: ldr
+; CHECK-NOWORKAROUND-NEXT: umaddl
+
+
+define i64 @f_load_umsubl_64(i64 %a, i32 %b, i32 %c, i32* nocapture readonly %d) #0 {
+entry:
+  %conv = zext i32 %b to i64
+  %conv1 = zext i32 %c to i64
+  %mul = mul i64 %conv1, %conv
+  %sub = sub i64 %a, %mul
+  %0 = load i32* %d, align 4
+  %conv2 = zext i32 %0 to i64
+  %add = add i64 %sub, %conv2
+  ret i64 %add
+}
+; CHECK-LABEL: f_load_umsubl_64:
+; CHECK: ldr
+; CHECK-NEXT: nop
+; CHECK-NEXT: umsubl
+; CHECK-NOWORKAROUND-LABEL: f_load_umsubl_64:
+; CHECK-NOWORKAROUND: ldr
+; CHECK-NOWORKAROUND-NEXT: umsubl
+
+
+define i64 @f_load_umull(i64 %a, i32 %b, i32 %c, i32* nocapture readonly %d) #0 {
+entry:
+  %conv = zext i32 %b to i64
+  %conv1 = zext i32 %c to i64
+  %mul = mul i64 %conv1, %conv
+  %0 = load i32* %d, align 4
+  %conv2 = zext i32 %0 to i64
+  %div = udiv i64 %mul, %conv2
+  ret i64 %div
+}
+; CHECK-LABEL: f_load_umull:
+; CHECK: ldr
+; CHECK-NEXT: umull
+; CHECK-NOWORKAROUND-LABEL: f_load_umull:
+; CHECK-NOWORKAROUND: ldr
+; CHECK-NOWORKAROUND-NEXT: umull
+
+
+define i64 @f_load_umnegl_64(i64 %a, i32 %b, i32 %c, i32* nocapture readonly %d) #0 {
+entry:
+  %conv = zext i32 %b to i64
+  %conv1 = zext i32 %c to i64
+  %mul = sub nsw i64 0, %conv
+  %sub = mul i64 %conv1, %mul
+  %0 = load i32* %d, align 4
+  %conv2 = zext i32 %0 to i64
+  %div = udiv i64 %sub, %conv2
+  ret i64 %div
+}
+; CHECK-LABEL: f_load_umnegl_64:
+; CHECK-NOWORKAROUND-LABEL: f_load_umnegl_64:
+; FIXME: only add further checks here once LLVM actually produces
+; umnegl instructions
+
+
+define i64 @f_store_madd_64(i64 %a, i64 %b, i64* nocapture readonly %cp, i64* nocapture %e) #1 {
+entry:
+  %0 = load i64* %cp, align 8
+  store i64 %a, i64* %e, align 8
+  %mul = mul nsw i64 %0, %b
+  %add = add nsw i64 %mul, %a
+  ret i64 %add
+}
+; CHECK-LABEL: f_store_madd_64:
+; CHECK: str
+; CHECK-NEXT: nop
+; CHECK-NEXT: madd
+; CHECK-NOWORKAROUND-LABEL: f_store_madd_64:
+; CHECK-NOWORKAROUND: str
+; CHECK-NOWORKAROUND-NEXT: madd
+
+
+define i32 @f_store_madd_32(i32 %a, i32 %b, i32* nocapture readonly %cp, i32* nocapture %e) #1 {
+entry:
+  %0 = load i32* %cp, align 4
+  store i32 %a, i32* %e, align 4
+  %mul = mul nsw i32 %0, %b
+  %add = add nsw i32 %mul, %a
+  ret i32 %add
+}
+; CHECK-LABEL: f_store_madd_32:
+; CHECK: str
+; CHECK-NEXT: madd
+; CHECK-NOWORKAROUND-LABEL: f_store_madd_32:
+; CHECK-NOWORKAROUND: str
+; CHECK-NOWORKAROUND-NEXT: madd
+
+
+define i64 @f_store_msub_64(i64 %a, i64 %b, i64* nocapture readonly %cp, i64* nocapture %e) #1 {
+entry:
+  %0 = load i64* %cp, align 8
+  store i64 %a, i64* %e, align 8
+  %mul = mul nsw i64 %0, %b
+  %sub = sub nsw i64 %a, %mul
+  ret i64 %sub
+}
+; CHECK-LABEL: f_store_msub_64:
+; CHECK: str
+; CHECK-NEXT: nop
+; CHECK-NEXT: msub
+; CHECK-NOWORKAROUND-LABEL: f_store_msub_64:
+; CHECK-NOWORKAROUND: str
+; CHECK-NOWORKAROUND-NEXT: msub
+
+
+define i32 @f_store_msub_32(i32 %a, i32 %b, i32* nocapture readonly %cp, i32* nocapture %e) #1 {
+entry:
+  %0 = load i32* %cp, align 4
+  store i32 %a, i32* %e, align 4
+  %mul = mul nsw i32 %0, %b
+  %sub = sub nsw i32 %a, %mul
+  ret i32 %sub
+}
+; CHECK-LABEL: f_store_msub_32:
+; CHECK: str
+; CHECK-NEXT: msub
+; CHECK-NOWORKAROUND-LABEL: f_store_msub_32:
+; CHECK-NOWORKAROUND: str
+; CHECK-NOWORKAROUND-NEXT: msub
+
+
+define i64 @f_store_mul_64(i64 %a, i64 %b, i64* nocapture readonly %cp, i64* nocapture %e) #1 {
+entry:
+  %0 = load i64* %cp, align 8
+  store i64 %a, i64* %e, align 8
+  %mul = mul nsw i64 %0, %b
+  ret i64 %mul
+}
+; CHECK-LABEL: f_store_mul_64:
+; CHECK: str
+; CHECK-NEXT: mul
+; CHECK-NOWORKAROUND-LABEL: f_store_mul_64:
+; CHECK-NOWORKAROUND: str
+; CHECK-NOWORKAROUND-NEXT: mul
+
+
+define i32 @f_store_mul_32(i32 %a, i32 %b, i32* nocapture readonly %cp, i32* nocapture %e) #1 {
+entry:
+  %0 = load i32* %cp, align 4
+  store i32 %a, i32* %e, align 4
+  %mul = mul nsw i32 %0, %b
+  ret i32 %mul
+}
+; CHECK-LABEL: f_store_mul_32:
+; CHECK: str
+; CHECK-NEXT: mul
+; CHECK-NOWORKAROUND-LABEL: f_store_mul_32:
+; CHECK-NOWORKAROUND: str
+; CHECK-NOWORKAROUND-NEXT: mul
+
+
+define i64 @f_prefetch_madd_64(i64 %a, i64 %b, i64* nocapture readonly %cp, i64* nocapture %e) #1 {
+entry:
+  %0 = load i64* %cp, align 8
+  %1 = bitcast i64* %e to i8*
+  tail call void @llvm.prefetch(i8* %1, i32 0, i32 0, i32 1)
+  %mul = mul nsw i64 %0, %b
+  %add = add nsw i64 %mul, %a
+  ret i64 %add
+}
+; CHECK-LABEL: f_prefetch_madd_64:
+; CHECK: prfm
+; CHECK-NEXT: nop
+; CHECK-NEXT: madd
+; CHECK-NOWORKAROUND-LABEL: f_prefetch_madd_64:
+; CHECK-NOWORKAROUND: prfm
+; CHECK-NOWORKAROUND-NEXT: madd
+
+declare void @llvm.prefetch(i8* nocapture, i32, i32, i32) #2
+
+define i32 @f_prefetch_madd_32(i32 %a, i32 %b, i32* nocapture readonly %cp, i32* nocapture %e) #1 {
+entry:
+  %0 = load i32* %cp, align 4
+  %1 = bitcast i32* %e to i8*
+  tail call void @llvm.prefetch(i8* %1, i32 1, i32 0, i32 1)
+  %mul = mul nsw i32 %0, %b
+  %add = add nsw i32 %mul, %a
+  ret i32 %add
+}
+; CHECK-LABEL: f_prefetch_madd_32:
+; CHECK: prfm
+; CHECK-NEXT: madd
+; CHECK-NOWORKAROUND-LABEL: f_prefetch_madd_32:
+; CHECK-NOWORKAROUND: prfm
+; CHECK-NOWORKAROUND-NEXT: madd
+
+define i64 @f_prefetch_msub_64(i64 %a, i64 %b, i64* nocapture readonly %cp, i64* nocapture %e) #1 {
+entry:
+  %0 = load i64* %cp, align 8
+  %1 = bitcast i64* %e to i8*
+  tail call void @llvm.prefetch(i8* %1, i32 0, i32 1, i32 1)
+  %mul = mul nsw i64 %0, %b
+  %sub = sub nsw i64 %a, %mul
+  ret i64 %sub
+}
+; CHECK-LABEL: f_prefetch_msub_64:
+; CHECK: prfm
+; CHECK-NEXT: nop
+; CHECK-NEXT: msub
+; CHECK-NOWORKAROUND-LABEL: f_prefetch_msub_64:
+; CHECK-NOWORKAROUND: prfm
+; CHECK-NOWORKAROUND-NEXT: msub
+
+define i32 @f_prefetch_msub_32(i32 %a, i32 %b, i32* nocapture readonly %cp, i32* nocapture %e) #1 {
+entry:
+  %0 = load i32* %cp, align 4
+  %1 = bitcast i32* %e to i8*
+  tail call void @llvm.prefetch(i8* %1, i32 1, i32 1, i32 1)
+  %mul = mul nsw i32 %0, %b
+  %sub = sub nsw i32 %a, %mul
+  ret i32 %sub
+}
+; CHECK-LABEL: f_prefetch_msub_32:
+; CHECK: prfm
+; CHECK-NEXT: msub
+; CHECK-NOWORKAROUND-LABEL: f_prefetch_msub_32:
+; CHECK-NOWORKAROUND: prfm
+; CHECK-NOWORKAROUND-NEXT: msub
+
+define i64 @f_prefetch_mul_64(i64 %a, i64 %b, i64* nocapture readonly %cp, i64* nocapture %e) #1 {
+entry:
+  %0 = load i64* %cp, align 8
+  %1 = bitcast i64* %e to i8*
+  tail call void @llvm.prefetch(i8* %1, i32 0, i32 3, i32 1)
+  %mul = mul nsw i64 %0, %b
+  ret i64 %mul
+}
+; CHECK-LABEL: f_prefetch_mul_64:
+; CHECK: prfm
+; CHECK-NEXT: mul
+; CHECK-NOWORKAROUND-LABEL: f_prefetch_mul_64:
+; CHECK-NOWORKAROUND: prfm
+; CHECK-NOWORKAROUND-NEXT: mul
+
+define i32 @f_prefetch_mul_32(i32 %a, i32 %b, i32* nocapture readonly %cp, i32* nocapture %e) #1 {
+entry:
+  %0 = load i32* %cp, align 4
+  %1 = bitcast i32* %e to i8*
+  tail call void @llvm.prefetch(i8* %1, i32 1, i32 3, i32 1)
+  %mul = mul nsw i32 %0, %b
+  ret i32 %mul
+}
+; CHECK-LABEL: f_prefetch_mul_32:
+; CHECK: prfm
+; CHECK-NEXT: mul
+; CHECK-NOWORKAROUND-LABEL: f_prefetch_mul_32:
+; CHECK-NOWORKAROUND: prfm
+; CHECK-NOWORKAROUND-NEXT: mul
+
+define i64 @fall_through(i64 %a, i64 %b, i64* nocapture readonly %c) #0 {
+entry:
+  %0 = load i64* %c, align 8
+  br label %block1
+
+block1:
+  %mul = mul nsw i64 %0, %b
+  %add = add nsw i64 %mul, %a
+  %tmp = ptrtoint i8* blockaddress(@fall_through, %block1) to i64
+  %ret = add nsw i64 %tmp, %add
+  ret i64 %ret
+}
+; CHECK-LABEL: fall_through
+; CHECK: ldr
+; CHECK-NEXT: nop
+; CHECK-NEXT: .Ltmp
+; CHECK-NEXT: BB
+; CHECK-NEXT: madd
+; CHECK-NOWORKAROUND-LABEL: fall_through
+; CHECK-NOWORKAROUND: ldr
+; CHECK-NOWORKAROUND-NEXT: .Ltmp
+; CHECK-NOWORKAROUND-NEXT: BB
+; CHECK-NOWORKAROUND-NEXT: madd
+
+attributes #0 = { nounwind readonly "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind }
+
+
+; CHECK-LABEL: ... Statistics Collected ...
+; CHECK: 11 aarch64-fix-cortex-a53-835769 - Number of Nops added to work around erratum 835769