Index: llvm/trunk/lib/Target/AArch64/AArch64.td
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64.td
+++ llvm/trunk/lib/Target/AArch64/AArch64.td
@@ -61,9 +61,10 @@
                                      "Reserve X18, making it unavailable "
                                      "as a GPR">;

-def FeatureMergeNarrowLd : SubtargetFeature<"merge-narrow-ld",
-                                            "MergeNarrowLoads", "true",
-                                            "Merge narrow load instructions">;
+def FeatureMergeNarrowZeroSt : SubtargetFeature<"merge-narrow-zero-st",
+                                                "MergeNarrowZeroStores", "true",
+                                                "Merge narrow zero store "
+                                                "instructions">;

 def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true",
                                     "Use alias analysis during codegen">;
@@ -181,7 +182,7 @@
                                    FeatureCrypto,
                                    FeatureCustomCheapAsMoveHandling,
                                    FeatureFPARMv8,
-                                   FeatureMergeNarrowLd,
+                                   FeatureMergeNarrowZeroSt,
                                    FeatureNEON,
                                    FeaturePerfMon,
                                    FeaturePostRAScheduler,
@@ -252,7 +253,7 @@
                                    FeatureCrypto,
                                    FeatureCustomCheapAsMoveHandling,
                                    FeatureFPARMv8,
-                                   FeatureMergeNarrowLd,
+                                   FeatureMergeNarrowZeroSt,
                                    FeatureNEON,
                                    FeaturePerfMon,
                                    FeaturePostRAScheduler,
Index: llvm/trunk/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ llvm/trunk/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -38,7 +38,6 @@
 STATISTIC(NumPreFolded, "Number of pre-index updates folded");
 STATISTIC(NumUnscaledPairCreated,
           "Number of load/store from unscaled generated");
-STATISTIC(NumNarrowLoadsPromoted, "Number of narrow loads promoted");
 STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
 STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");

@@ -51,10 +50,6 @@
 static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
                                      cl::Hidden);

-static cl::opt<bool> EnableNarrowLdMerge("enable-narrow-ld-merge", cl::Hidden,
-                                         cl::init(false),
-                                         cl::desc("Enable narrow load merge"));
-
 #define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"

 namespace {
@@ -107,11 +102,11 @@
   bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
                          MachineBasicBlock::iterator &StoreI);

-  // Merge the two instructions indicated into a wider instruction.
+  // Merge the two instructions indicated into a wider narrow store instruction.
   MachineBasicBlock::iterator
-  mergeNarrowInsns(MachineBasicBlock::iterator I,
-                   MachineBasicBlock::iterator MergeMI,
-                   const LdStPairFlags &Flags);
+  mergeNarrowZeroStores(MachineBasicBlock::iterator I,
+                        MachineBasicBlock::iterator MergeMI,
+                        const LdStPairFlags &Flags);

   // Merge the two instructions indicated into a single pair-wise instruction.
   MachineBasicBlock::iterator
@@ -147,8 +142,8 @@
   mergeUpdateInsn(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Update, bool IsPreIdx);

-  // Find and merge foldable ldr/str instructions.
-  bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI);
+  // Find and merge zero store instructions.
+  bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);

   // Find and pair ldr/str instructions.
   bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);
@@ -156,7 +151,7 @@
   // Find and promote load instructions which read directly from store.
   bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);

-  bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt);
+  bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);

   bool runOnMachineFunction(MachineFunction &Fn) override;

@@ -173,23 +168,6 @@
 INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
                 AARCH64_LOAD_STORE_OPT_NAME, false, false)

-static unsigned getBitExtrOpcode(MachineInstr &MI) {
-  switch (MI.getOpcode()) {
-  default:
-    llvm_unreachable("Unexpected opcode.");
-  case AArch64::LDRBBui:
-  case AArch64::LDURBBi:
-  case AArch64::LDRHHui:
-  case AArch64::LDURHHi:
-    return AArch64::UBFMWri;
-  case AArch64::LDRSBWui:
-  case AArch64::LDURSBWi:
-  case AArch64::LDRSHWui:
-  case AArch64::LDURSHWi:
-    return AArch64::SBFMWri;
-  }
-}
-
 static bool isNarrowStore(unsigned Opc) {
   switch (Opc) {
   default:
@@ -202,30 +180,6 @@
   }
 }

-static bool isNarrowLoad(unsigned Opc) {
-  switch (Opc) {
-  default:
-    return false;
-  case AArch64::LDRHHui:
-  case AArch64::LDURHHi:
-  case AArch64::LDRBBui:
-  case AArch64::LDURBBi:
-  case AArch64::LDRSHWui:
-  case AArch64::LDURSHWi:
-  case AArch64::LDRSBWui:
-  case AArch64::LDURSBWi:
-    return true;
-  }
-}
-
-static bool isNarrowLoad(MachineInstr &MI) {
-  return isNarrowLoad(MI.getOpcode());
-}
-
-static bool isNarrowLoadOrStore(unsigned Opc) {
-  return isNarrowLoad(Opc) || isNarrowStore(Opc);
-}
-
 // Scaling factor for unscaled load or store.
 static int getMemScale(MachineInstr &MI) {
   switch (MI.getOpcode()) {
@@ -317,23 +271,11 @@
   case AArch64::STURSi:
   case AArch64::LDRSui:
   case AArch64::LDURSi:
-  case AArch64::LDRHHui:
-  case AArch64::LDURHHi:
-  case AArch64::LDRBBui:
-  case AArch64::LDURBBi:
     return Opc;
   case AArch64::LDRSWui:
     return AArch64::LDRWui;
   case AArch64::LDURSWi:
     return AArch64::LDURWi;
-  case AArch64::LDRSBWui:
-    return AArch64::LDRBBui;
-  case AArch64::LDRSHWui:
-    return AArch64::LDRHHui;
-  case AArch64::LDURSBWi:
-    return AArch64::LDURBBi;
-  case AArch64::LDURSHWi:
-    return AArch64::LDURHHi;
   }
 }

@@ -353,18 +295,6 @@
     return AArch64::STURXi;
   case AArch64::STRWui:
     return AArch64::STRXui;
-  case AArch64::LDRHHui:
-  case AArch64::LDRSHWui:
-    return AArch64::LDRWui;
-  case AArch64::LDURHHi:
-  case AArch64::LDURSHWi:
-    return AArch64::LDURWi;
-  case AArch64::LDRBBui:
-  case AArch64::LDRSBWui:
-    return AArch64::LDRHHui;
-  case AArch64::LDURBBi:
-  case AArch64::LDURSBWi:
-    return AArch64::LDURHHi;
   }
 }

@@ -608,23 +538,20 @@
          (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
 }

-static bool isPromotableZeroStoreOpcode(unsigned Opc) {
-  return isNarrowStore(Opc) || Opc == AArch64::STRWui || Opc == AArch64::STURWi;
-}
-
-static bool isPromotableZeroStoreOpcode(MachineInstr &MI) {
-  return isPromotableZeroStoreOpcode(MI.getOpcode());
-}
-
 static bool isPromotableZeroStoreInst(MachineInstr &MI) {
-  return (isPromotableZeroStoreOpcode(MI)) &&
+  unsigned Opc = MI.getOpcode();
+  return (Opc == AArch64::STRWui || Opc == AArch64::STURWi ||
+          isNarrowStore(Opc)) &&
          getLdStRegOp(MI).getReg() == AArch64::WZR;
 }

 MachineBasicBlock::iterator
-AArch64LoadStoreOpt::mergeNarrowInsns(MachineBasicBlock::iterator I,
-                                      MachineBasicBlock::iterator MergeMI,
-                                      const LdStPairFlags &Flags) {
+AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
+                                           MachineBasicBlock::iterator MergeMI,
+                                           const LdStPairFlags &Flags) {
+  assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
+         "Expected promotable zero stores.");
+
   MachineBasicBlock::iterator NextI = I;
   ++NextI;
   // If NextI is the second of the two instructions to be merged, we need
@@ -665,105 +592,9 @@
     OffsetImm /= 2;
   }

+  // Construct the new instruction.
   DebugLoc DL = I->getDebugLoc();
   MachineBasicBlock *MBB = I->getParent();
-  if (isNarrowLoad(Opc)) {
-    MachineInstr *RtNewDest = &*(MergeForward ? I : MergeMI);
-    // When merging small (< 32 bit) loads for big-endian targets, the order of
-    // the component parts gets swapped.
-    if (!Subtarget->isLittleEndian())
-      std::swap(RtMI, Rt2MI);
-    // Construct the new load instruction.
-    MachineInstr *NewMemMI, *BitExtMI1, *BitExtMI2;
-    NewMemMI =
-        BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))
-            .addOperand(getLdStRegOp(*RtNewDest))
-            .addOperand(BaseRegOp)
-            .addImm(OffsetImm)
-            .setMemRefs(I->mergeMemRefsWith(*MergeMI));
-    (void)NewMemMI;
-
-    DEBUG(
-        dbgs()
-        << "Creating the new load and extract. Replacing instructions:\n    ");
-    DEBUG(I->print(dbgs()));
-    DEBUG(dbgs() << "    ");
-    DEBUG(MergeMI->print(dbgs()));
-    DEBUG(dbgs() << "  with instructions:\n    ");
-    DEBUG((NewMemMI)->print(dbgs()));
-
-    int Width = getMemScale(*I) == 1 ? 8 : 16;
-    int LSBLow = 0;
-    int LSBHigh = Width;
-    int ImmsLow = LSBLow + Width - 1;
-    int ImmsHigh = LSBHigh + Width - 1;
-    MachineInstr *ExtDestMI = &*(MergeForward ? MergeMI : I);
-    if ((ExtDestMI == Rt2MI) == Subtarget->isLittleEndian()) {
-      // Create the bitfield extract for high bits.
-      BitExtMI1 =
-          BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*Rt2MI)))
-              .addOperand(getLdStRegOp(*Rt2MI))
-              .addReg(getLdStRegOp(*RtNewDest).getReg())
-              .addImm(LSBHigh)
-              .addImm(ImmsHigh);
-      // Create the bitfield extract for low bits.
-      if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) {
-        // For unsigned, prefer to use AND for low bits.
-        BitExtMI2 = BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::ANDWri))
-                        .addOperand(getLdStRegOp(*RtMI))
-                        .addReg(getLdStRegOp(*RtNewDest).getReg())
-                        .addImm(ImmsLow);
-      } else {
-        BitExtMI2 =
-            BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*RtMI)))
-                .addOperand(getLdStRegOp(*RtMI))
-                .addReg(getLdStRegOp(*RtNewDest).getReg())
-                .addImm(LSBLow)
-                .addImm(ImmsLow);
-      }
-    } else {
-      // Create the bitfield extract for low bits.
-      if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) {
-        // For unsigned, prefer to use AND for low bits.
-        BitExtMI1 = BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::ANDWri))
-                        .addOperand(getLdStRegOp(*RtMI))
-                        .addReg(getLdStRegOp(*RtNewDest).getReg())
-                        .addImm(ImmsLow);
-      } else {
-        BitExtMI1 =
-            BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*RtMI)))
-                .addOperand(getLdStRegOp(*RtMI))
-                .addReg(getLdStRegOp(*RtNewDest).getReg())
-                .addImm(LSBLow)
-                .addImm(ImmsLow);
-      }
-
-      // Create the bitfield extract for high bits.
-      BitExtMI2 =
-          BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*Rt2MI)))
-              .addOperand(getLdStRegOp(*Rt2MI))
-              .addReg(getLdStRegOp(*RtNewDest).getReg())
-              .addImm(LSBHigh)
-              .addImm(ImmsHigh);
-    }
-    (void)BitExtMI1;
-    (void)BitExtMI2;
-
-    DEBUG(dbgs() << "    ");
-    DEBUG((BitExtMI1)->print(dbgs()));
-    DEBUG(dbgs() << "    ");
-    DEBUG((BitExtMI2)->print(dbgs()));
-    DEBUG(dbgs() << "\n");
-
-    // Erase the old instructions.
-    I->eraseFromParent();
-    MergeMI->eraseFromParent();
-    return NextI;
-  }
-  assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
-         "Expected promotable zero store");
-
-  // Construct the new instruction.
   MachineInstrBuilder MIB;
   MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))
             .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
@@ -772,7 +603,7 @@
             .addOperand(BaseRegOp)
             .addImm(OffsetImm)
             .setMemRefs(I->mergeMemRefsWith(*MergeMI));
   (void)MIB;

-  DEBUG(dbgs() << "Creating wider load/store. Replacing instructions:\n    ");
+  DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n    ");
   DEBUG(I->print(dbgs()));
   DEBUG(dbgs() << "    ");
   DEBUG(MergeMI->print(dbgs()));
@@ -1179,13 +1010,14 @@
     return true;
   }

-  // If the second instruction isn't even a load/store, bail out.
+  // If the second instruction isn't even a mergable/pairable load/store, bail
+  // out.
   if (!PairIsValidLdStrOpc)
     return false;

-  // FIXME: We don't support merging narrow loads/stores with mixed
-  // scaled/unscaled offsets.
-  if (isNarrowLoadOrStore(OpcA) || isNarrowLoadOrStore(OpcB))
+  // FIXME: We don't support merging narrow stores with mixed scaled/unscaled
+  // offsets.
+  if (isNarrowStore(OpcA) || isNarrowStore(OpcB))
     return false;

   // Try to match an unscaled load/store with a scaled load/store.
@@ -1596,37 +1428,26 @@
   return false;
 }

-// Find narrow loads that can be converted into a single wider load with
-// bitfield extract instructions. Also merge adjacent zero stores into a wider
-// store.
-bool AArch64LoadStoreOpt::tryToMergeLdStInst(
+// Merge adjacent zero stores into a wider store.
+bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
     MachineBasicBlock::iterator &MBBI) {
-  assert((isNarrowLoad(*MBBI) || isPromotableZeroStoreOpcode(*MBBI)) &&
-         "Expected narrow op.");
+  assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store.");
   MachineInstr &MI = *MBBI;
   MachineBasicBlock::iterator E = MI.getParent()->end();

   if (!TII->isCandidateToMergeOrPair(MI))
     return false;

-  // For promotable zero stores, the stored value should be WZR.
-  if (isPromotableZeroStoreOpcode(MI) &&
-      getLdStRegOp(MI).getReg() != AArch64::WZR)
-    return false;
-
   // Look ahead up to LdStLimit instructions for a mergable instruction.
   LdStPairFlags Flags;
   MachineBasicBlock::iterator MergeMI =
       findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true);
   if (MergeMI != E) {
-    if (isNarrowLoad(MI)) {
-      ++NumNarrowLoadsPromoted;
-    } else if (isPromotableZeroStoreInst(MI)) {
-      ++NumZeroStoresPromoted;
-    }
+    ++NumZeroStoresPromoted;
+
     // Keeping the iterator straight is a pain, so we let the merge routine tell
     // us what the next instruction is after it's done mucking about.
-    MBBI = mergeNarrowInsns(MBBI, MergeMI, Flags);
+    MBBI = mergeNarrowZeroStores(MBBI, MergeMI, Flags);
     return true;
   }
   return false;
@@ -1667,7 +1488,7 @@
 }

 bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
-                                        bool enableNarrowLdOpt) {
+                                        bool EnableNarrowZeroStOpt) {
   bool Modified = false;
   // Four tranformations to do here:
   // 1) Find loads that directly read from stores and promote them by
@@ -1706,29 +1527,21 @@
       }
     }
   }
-  // 2) Find narrow loads that can be converted into a single wider load
-  //    with bitfield extract instructions.
-  //      e.g.,
-  //        ldrh w0, [x2]
-  //        ldrh w1, [x2, #2]
-  //        ; becomes
-  //        ldr w0, [x2]
-  //        ubfx w1, w0, #16, #16
-  //        and w0, w0, #ffff
-  //
-  //    Also merge adjacent zero stores into a wider store.
+  // 2) Merge adjacent zero stores into a wider store.
   //      e.g.,
   //        strh wzr, [x0]
   //        strh wzr, [x0, #2]
   //        ; becomes
   //        str wzr, [x0]
+  //      e.g.,
+  //        str wzr, [x0]
+  //        str wzr, [x0, #4]
+  //        ; becomes
+  //        str xzr, [x0]
   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
-       enableNarrowLdOpt && MBBI != E;) {
-    MachineInstr &MI = *MBBI;
-    unsigned Opc = MI.getOpcode();
-    if (isPromotableZeroStoreOpcode(Opc) ||
-        (EnableNarrowLdMerge && isNarrowLoad(Opc))) {
-      if (tryToMergeLdStInst(MBBI)) {
+       EnableNarrowZeroStOpt && MBBI != E;) {
+    if (isPromotableZeroStoreInst(*MBBI)) {
+      if (tryToMergeZeroStInst(MBBI)) {
         Modified = true;
       } else
         ++MBBI;
@@ -1889,10 +1702,10 @@
   UsedRegs.resize(TRI->getNumRegs());

   bool Modified = false;
-  bool enableNarrowLdOpt =
-      Subtarget->mergeNarrowLoads() && !Subtarget->requiresStrictAlign();
+  bool enableNarrowZeroStOpt =
+      Subtarget->mergeNarrowStores() && !Subtarget->requiresStrictAlign();
   for (auto &MBB : Fn)
-    Modified |= optimizeBlock(MBB, enableNarrowLdOpt);
+    Modified |= optimizeBlock(MBB, enableNarrowZeroStOpt);

   return Modified;
 }
Index: llvm/trunk/lib/Target/AArch64/AArch64Subtarget.h
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64Subtarget.h
+++ llvm/trunk/lib/Target/AArch64/AArch64Subtarget.h
@@ -71,7 +71,7 @@
   // StrictAlign - Disallow unaligned memory accesses.
   bool StrictAlign = false;
-  bool MergeNarrowLoads = false;
+  bool MergeNarrowZeroStores = false;
   bool UseAA = false;
   bool PredictableSelectIsExpensive = false;
   bool BalanceFPOps = false;

@@ -179,7 +179,7 @@
   bool hasCrypto() const { return HasCrypto; }
   bool hasCRC() const { return HasCRC; }
   bool hasRAS() const { return HasRAS; }
-  bool mergeNarrowLoads() const { return MergeNarrowLoads; }
+  bool mergeNarrowStores() const { return MergeNarrowZeroStores; }
   bool balanceFPOps() const { return BalanceFPOps; }
   bool predictableSelectIsExpensive() const {
     return PredictableSelectIsExpensive;
Index: llvm/trunk/test/CodeGen/AArch64/arm64-narrow-ldst-merge.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-narrow-ldst-merge.ll
+++ llvm/trunk/test/CodeGen/AArch64/arm64-narrow-ldst-merge.ll
@@ -1,329 +1,14 @@
-; RUN: llc < %s -mtriple aarch64--none-eabi -mcpu=cortex-a57 -verify-machineinstrs -enable-narrow-ld-merge=true | FileCheck %s --check-prefix=CHECK --check-prefix=LE
-; RUN: llc < %s -mtriple aarch64_be--none-eabi -mcpu=cortex-a57 -verify-machineinstrs -enable-narrow-ld-merge=true | FileCheck %s --check-prefix=CHECK --check-prefix=BE
-; RUN: llc < %s -mtriple aarch64--none-eabi -mcpu=kryo -verify-machineinstrs -enable-narrow-ld-merge=true | FileCheck %s --check-prefix=CHECK --check-prefix=LE
-
-; CHECK-LABEL: Ldrh_merge
-; CHECK-NOT: ldrh
-; CHECK: ldr [[NEW_DEST:w[0-9]+]]
-; CHECK-DAG: and [[LO_PART:w[0-9]+]], [[NEW_DEST]], #0xffff
-; CHECK-DAG: lsr [[HI_PART:w[0-9]+]], [[NEW_DEST]], #16
-; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]]
-; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]]
-define i16 @Ldrh_merge(i16* nocapture readonly %p) {
-  %1 = load i16, i16* %p, align 2
-  %arrayidx2 = getelementptr inbounds i16, i16* %p, i64 1
-  %2 = load i16, i16* %arrayidx2, align 2
-  %add = sub nuw nsw i16 %1, %2
-  ret i16 %add
-}
-
-; CHECK-LABEL: Ldurh_merge
-; CHECK-NOT: ldurh
-; CHECK: ldur [[NEW_DEST:w[0-9]+]]
-; CHECK-DAG: and [[LO_PART:w[0-9]+]], [[NEW_DEST]], #0xffff
-; CHECK-DAG: lsr [[HI_PART:w[0-9]+]], [[NEW_DEST]]
-; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]]
-; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]]
-define i16 @Ldurh_merge(i16* nocapture readonly %p) {
-entry:
-  %arrayidx = getelementptr inbounds i16, i16* %p, i64 -2
-  %0 = load i16, i16* %arrayidx
-  %arrayidx3 = getelementptr inbounds i16, i16* %p, i64 -1
-  %1 = load i16, i16* %arrayidx3
-  %add = sub nuw nsw i16 %0, %1
-  ret i16 %add
-}
-
-; CHECK-LABEL: Ldrh_4_merge
-; CHECK-NOT: ldrh
-; CHECK: ldp [[WORD1:w[0-9]+]], [[WORD2:w[0-9]+]], [x0]
-; CHECK-DAG: and [[WORD1LO:w[0-9]+]], [[WORD1]], #0xffff
-; CHECK-DAG: lsr [[WORD1HI:w[0-9]+]], [[WORD1]], #16
-; CHECK-DAG: and [[WORD2LO:w[0-9]+]], [[WORD2]], #0xffff
-; CHECK-DAG: lsr [[WORD2HI:w[0-9]+]], [[WORD2]], #16
-; LE-DAG: sub [[TEMP1:w[0-9]+]], [[WORD1HI]], [[WORD1LO]]
-; BE-DAG: sub [[TEMP1:w[0-9]+]], [[WORD1LO]], [[WORD1HI]]
-; LE: udiv [[TEMP2:w[0-9]+]], [[TEMP1]], [[WORD2LO]]
-; BE: udiv [[TEMP2:w[0-9]+]], [[TEMP1]], [[WORD2HI]]
-; LE: sub w0, [[TEMP2]], [[WORD2HI]]
-; BE: sub w0, [[TEMP2]], [[WORD2LO]]
-define i16 @Ldrh_4_merge(i16* nocapture readonly %P) {
-  %arrayidx = getelementptr inbounds i16, i16* %P, i64 0
-  %l0 = load i16, i16* %arrayidx
-  %arrayidx2 = getelementptr inbounds i16, i16* %P, i64 1
-  %l1 = load i16, i16* %arrayidx2
-  %arrayidx7 = getelementptr inbounds i16, i16* %P, i64 2
-  %l2 = load i16, i16* %arrayidx7
-  %arrayidx12 = getelementptr inbounds i16, i16* %P, i64 3
-  %l3 = load i16, i16* %arrayidx12
-  %add4 = sub nuw nsw i16 %l1, %l0
-  %add9 = udiv i16 %add4, %l2
-  %add14 = sub nuw nsw i16 %add9, %l3
-  ret i16 %add14
-}
-
-; CHECK-LABEL: Ldrsh_merge
-; CHECK: ldr [[NEW_DEST:w[0-9]+]]
-; CHECK-DAG: asr [[LO_PART:w[0-9]+]], [[NEW_DEST]], #16
-; CHECK-DAG: sxth [[HI_PART:w[0-9]+]], [[NEW_DEST]]
-; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]]
-; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]]
-
-define i32 @Ldrsh_merge(i16* %p) nounwind {
-  %add.ptr0 = getelementptr inbounds i16, i16* %p, i64 4
-  %tmp = load i16, i16* %add.ptr0
-  %add.ptr = getelementptr inbounds i16, i16* %p, i64 5
-  %tmp1 = load i16, i16* %add.ptr
-  %sexttmp = sext i16 %tmp to i32
-  %sexttmp1 = sext i16 %tmp1 to i32
-  %add = sub nsw i32 %sexttmp1, %sexttmp
-  ret i32 %add
-}
-
-; CHECK-LABEL: Ldrsh_zsext_merge
-; CHECK: ldr [[NEW_DEST:w[0-9]+]]
-; LE-DAG: and [[LO_PART:w[0-9]+]], [[NEW_DEST]], #0xffff
-; LE-DAG: asr [[HI_PART:w[0-9]+]], [[NEW_DEST]], #16
-; BE-DAG: sxth [[LO_PART:w[0-9]+]], [[NEW_DEST]]
-; BE-DAG: lsr [[HI_PART:w[0-9]+]], [[NEW_DEST]], #16
-; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]]
-; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]]
-define i32 @Ldrsh_zsext_merge(i16* %p) nounwind {
-  %add.ptr0 = getelementptr inbounds i16, i16* %p, i64 4
-  %tmp = load i16, i16* %add.ptr0
-  %add.ptr = getelementptr inbounds i16, i16* %p, i64 5
-  %tmp1 = load i16, i16* %add.ptr
-  %sexttmp = zext i16 %tmp to i32
-  %sexttmp1 = sext i16 %tmp1 to i32
-  %add = sub nsw i32 %sexttmp, %sexttmp1
-  ret i32 %add
-}
-
-; CHECK-LABEL: Ldrsh_szext_merge
-; CHECK: ldr [[NEW_DEST:w[0-9]+]]
-; LE-DAG: sxth [[LO_PART:w[0-9]+]], [[NEW_DEST]]
-; LE-DAG: lsr [[HI_PART:w[0-9]+]], [[NEW_DEST]], #16
-; BE-DAG: and [[LO_PART:w[0-9]+]], [[NEW_DEST]], #0xffff
-; BE-DAG: asr [[HI_PART:w[0-9]+]], [[NEW_DEST]], #16
-; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]]
-; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]]
-define i32 @Ldrsh_szext_merge(i16* %p) nounwind {
-  %add.ptr0 = getelementptr inbounds i16, i16* %p, i64 4
-  %tmp = load i16, i16* %add.ptr0
-  %add.ptr = getelementptr inbounds i16, i16* %p, i64 5
-  %tmp1 = load i16, i16* %add.ptr
-  %sexttmp = sext i16 %tmp to i32
-  %sexttmp1 = zext i16 %tmp1 to i32
-  %add = sub nsw i32 %sexttmp, %sexttmp1
-  ret i32 %add
-}
-
-; CHECK-LABEL: Ldrb_merge
-; CHECK: ldrh [[NEW_DEST:w[0-9]+]]
-; CHECK-DAG: and [[LO_PART:w[0-9]+]], [[NEW_DEST]], #0xff
-; CHECK-DAG: ubfx [[HI_PART:w[0-9]+]], [[NEW_DEST]], #8, #8
-; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]]
-; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]]
-define i32 @Ldrb_merge(i8* %p) nounwind {
-  %add.ptr0 = getelementptr inbounds i8, i8* %p, i64 2
-  %tmp = load i8, i8* %add.ptr0
-  %add.ptr = getelementptr inbounds i8, i8* %p, i64 3
-  %tmp1 = load i8, i8* %add.ptr
-  %sexttmp = zext i8 %tmp to i32
-  %sexttmp1 = zext i8 %tmp1 to i32
-  %add = sub nsw i32 %sexttmp, %sexttmp1
-  ret i32 %add
-}
-
-; CHECK-LABEL: Ldrsb_merge
-; CHECK: ldrh [[NEW_DEST:w[0-9]+]]
-; CHECK-DAG: sxtb [[LO_PART:w[0-9]+]], [[NEW_DEST]]
-; CHECK-DAG: sbfx [[HI_PART:w[0-9]+]], [[NEW_DEST]], #8, #8
-; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]]
-; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]]
-define i32 @Ldrsb_merge(i8* %p) nounwind {
-  %add.ptr0 = getelementptr inbounds i8, i8* %p, i64 2
-  %tmp = load i8, i8* %add.ptr0
-  %add.ptr = getelementptr inbounds i8, i8* %p, i64 3
-  %tmp1 = load i8, i8* %add.ptr
-  %sexttmp = sext i8 %tmp to i32
-  %sexttmp1 = sext i8 %tmp1 to i32
-  %add = sub nsw i32 %sexttmp, %sexttmp1
-  ret i32 %add
-}
-
-; CHECK-LABEL: Ldrsb_zsext_merge
-; CHECK: ldrh [[NEW_DEST:w[0-9]+]]
-; LE-DAG: and [[LO_PART:w[0-9]+]], [[NEW_DEST]], #0xff
-; LE-DAG: sbfx [[HI_PART:w[0-9]+]], [[NEW_DEST]], #8, #8
-; BE-DAG: sxtb [[LO_PART:w[0-9]+]], [[NEW_DEST]]
-; BE-DAG: ubfx [[HI_PART:w[0-9]+]], [[NEW_DEST]], #8, #8
-; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]]
-; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]]
-define i32 @Ldrsb_zsext_merge(i8* %p) nounwind {
-  %add.ptr0 = getelementptr inbounds i8, i8* %p, i64 2
-  %tmp = load i8, i8* %add.ptr0
-  %add.ptr = getelementptr inbounds i8, i8* %p, i64 3
-  %tmp1 = load i8, i8* %add.ptr
-  %sexttmp = zext i8 %tmp to i32
-  %sexttmp1 = sext i8 %tmp1 to i32
-  %add = sub nsw i32 %sexttmp, %sexttmp1
-  ret i32 %add
-}
-
-; CHECK-LABEL: Ldrsb_szext_merge
-; CHECK: ldrh [[NEW_DEST:w[0-9]+]]
-; LE-DAG: sxtb [[LO_PART:w[0-9]+]], [[NEW_DEST]]
-; LE-DAG: ubfx [[HI_PART:w[0-9]+]], [[NEW_DEST]], #8, #8
-; BE-DAG: and [[LO_PART:w[0-9]+]], [[NEW_DEST]], #0xff
-; BE-DAG: sbfx [[HI_PART:w[0-9]+]], [[NEW_DEST]], #8, #8
-; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]]
-; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]]
-define i32 @Ldrsb_szext_merge(i8* %p) nounwind {
-  %add.ptr0 = getelementptr inbounds i8, i8* %p, i64 2
-  %tmp = load i8, i8* %add.ptr0
-  %add.ptr = getelementptr inbounds i8, i8* %p, i64 3
-  %tmp1 = load i8, i8* %add.ptr
-  %sexttmp = sext i8 %tmp to i32
-  %sexttmp1 = zext i8 %tmp1 to i32
-  %add = sub nsw i32 %sexttmp, %sexttmp1
-  ret i32 %add
-}
-
-; CHECK-LABEL: Ldursh_merge
-; CHECK: ldur [[NEW_DEST:w[0-9]+]]
-; CHECK-DAG: asr [[LO_PART:w[0-9]+]], [[NEW_DEST]], #16
-; CHECK-DAG: sxth [[HI_PART:w[0-9]+]], [[NEW_DEST]]
-; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]]
-; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]]
-define i32 @Ldursh_merge(i16* %p) nounwind {
-  %add.ptr0 = getelementptr inbounds i16, i16* %p, i64 -1
-  %tmp = load i16, i16* %add.ptr0
-  %add.ptr = getelementptr inbounds i16, i16* %p, i64 -2
-  %tmp1 = load i16, i16* %add.ptr
-  %sexttmp = sext i16 %tmp to i32
-  %sexttmp1 = sext i16 %tmp1 to i32
-  %add = sub nsw i32 %sexttmp, %sexttmp1
-  ret i32 %add
-}
-
-; CHECK-LABEL: Ldursh_zsext_merge
-; CHECK: ldur [[NEW_DEST:w[0-9]+]]
-; LE-DAG: lsr [[LO_PART:w[0-9]+]], [[NEW_DEST]], #16
-; LE-DAG: sxth [[HI_PART:w[0-9]+]], [[NEW_DEST]]
-; BE-DAG: asr [[LO_PART:w[0-9]+]], [[NEW_DEST]], #16
-; BE-DAG: and [[HI_PART:w[0-9]+]], [[NEW_DEST]], #0xffff
-; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]]
-; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]]
-define i32 @Ldursh_zsext_merge(i16* %p) nounwind {
-  %add.ptr0 = getelementptr inbounds i16, i16* %p, i64 -1
-  %tmp = load i16, i16* %add.ptr0
-  %add.ptr = getelementptr inbounds i16, i16* %p, i64 -2
-  %tmp1 = load i16, i16* %add.ptr
-  %sexttmp = zext i16 %tmp to i32
-  %sexttmp1 = sext i16 %tmp1 to i32
-  %add = sub nsw i32 %sexttmp, %sexttmp1
-  ret i32 %add
-}
-
-; CHECK-LABEL: Ldursh_szext_merge
-; CHECK: ldur [[NEW_DEST:w[0-9]+]]
-; LE-DAG: asr [[LO_PART:w[0-9]+]], [[NEW_DEST]], #16
-; LE-DAG: and [[HI_PART:w[0-9]+]], [[NEW_DEST]], #0xffff
-; BE-DAG: lsr [[LO_PART:w[0-9]+]], [[NEW_DEST]], #16
-; BE-DAG: sxth [[HI_PART:w[0-9]+]], [[NEW_DEST]]
-; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]]
-; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]]
-define i32 @Ldursh_szext_merge(i16* %p) nounwind {
-  %add.ptr0 = getelementptr inbounds i16, i16* %p, i64 -1
-  %tmp = load i16, i16* %add.ptr0
-  %add.ptr = getelementptr inbounds i16, i16* %p, i64 -2
-  %tmp1 = load i16, i16* %add.ptr
-  %sexttmp = sext i16 %tmp to i32
-  %sexttmp1 = zext i16 %tmp1 to i32
-  %add = sub nsw i32 %sexttmp, %sexttmp1
-  ret i32 %add
-}
-
-; CHECK-LABEL: Ldurb_merge
-; CHECK: ldurh [[NEW_DEST:w[0-9]+]]
-; CHECK-DAG: ubfx [[LO_PART:w[0-9]+]], [[NEW_DEST]], #8, #8
-; CHECK-DAG: and [[HI_PART:w[0-9]+]], [[NEW_DEST]], #0xff
-; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]]
-; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]]
-define i32 @Ldurb_merge(i8* %p) nounwind {
-  %add.ptr0 = getelementptr inbounds i8, i8* %p, i64 -1
-  %tmp = load i8, i8* %add.ptr0
-  %add.ptr = getelementptr inbounds i8, i8* %p, i64 -2
-  %tmp1 = load i8, i8* %add.ptr
-  %sexttmp = zext i8 %tmp to i32
-  %sexttmp1 = zext i8 %tmp1 to i32
-  %add = sub nsw i32 %sexttmp, %sexttmp1
-  ret i32 %add
-}
-
-; CHECK-LABEL: Ldursb_merge
-; CHECK: ldurh [[NEW_DEST:w[0-9]+]]
-; CHECK-DAG: sbfx [[LO_PART:w[0-9]+]], [[NEW_DEST]], #8, #8
-; CHECK-DAG: sxtb [[HI_PART:w[0-9]+]], [[NEW_DEST]]
-; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]]
-; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]]
-define i32 @Ldursb_merge(i8* %p) nounwind {
-  %add.ptr0 = getelementptr inbounds i8, i8* %p, i64 -1
-  %tmp = load i8, i8* %add.ptr0
-  %add.ptr = getelementptr inbounds i8, i8* %p, i64 -2
-  %tmp1 = load i8, i8* %add.ptr
-  %sexttmp = sext i8 %tmp to i32
-  %sexttmp1 = sext i8 %tmp1 to i32
-  %add = sub nsw i32 %sexttmp, %sexttmp1
-  ret i32 %add
-}
-
-; CHECK-LABEL: Ldursb_zsext_merge
-; CHECK: ldurh [[NEW_DEST:w[0-9]+]]
-; LE-DAG: ubfx [[LO_PART:w[0-9]+]], [[NEW_DEST]], #8, #8
-; LE-DAG: sxtb [[HI_PART:w[0-9]+]], [[NEW_DEST]]
-; BE-DAG: sbfx [[LO_PART:w[0-9]+]], [[NEW_DEST]], #8, #8
-; BE-DAG: and [[HI_PART:w[0-9]+]], [[NEW_DEST]], #0xff
-; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]]
-; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]]
-define i32 @Ldursb_zsext_merge(i8* %p) nounwind {
-  %add.ptr0 = getelementptr inbounds i8, i8* %p, i64 -1
-  %tmp = load i8, i8* %add.ptr0
-  %add.ptr = getelementptr inbounds i8, i8* %p, i64 -2
-  %tmp1 = load i8, i8* %add.ptr
-  %sexttmp = zext i8 %tmp to i32
-  %sexttmp1 = sext i8 %tmp1 to i32
-  %add = sub nsw i32 %sexttmp, %sexttmp1
-  ret i32 %add
-}
-
-; CHECK-LABEL: Ldursb_szext_merge
-; CHECK: ldurh [[NEW_DEST:w[0-9]+]]
-; LE-DAG: sbfx [[LO_PART:w[0-9]+]], [[NEW_DEST]], #8, #8
-; LE-DAG: and [[HI_PART:w[0-9]+]], [[NEW_DEST]], #0xff
-; BE-DAG: ubfx [[LO_PART:w[0-9]+]], [[NEW_DEST]], #8, #8
-; BE-DAG: sxtb [[HI_PART:w[0-9]+]], [[NEW_DEST]]
-; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]]
-; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]]
-define i32 @Ldursb_szext_merge(i8* %p) nounwind {
-  %add.ptr0 = getelementptr inbounds i8, i8* %p, i64 -1
-  %tmp = load i8, i8* %add.ptr0
-  %add.ptr = getelementptr inbounds i8, i8* %p, i64 -2
-  %tmp1 = load i8, i8* %add.ptr
-  %sexttmp = sext i8 %tmp to i32
-  %sexttmp1 = zext i8 %tmp1 to i32
-  %add = sub nsw i32 %sexttmp, %sexttmp1
-  ret i32 %add
-}
+; RUN: llc < %s -mtriple aarch64--none-eabi -mcpu=cortex-a57 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=LE
+; RUN: llc < %s -mtriple aarch64_be--none-eabi -mcpu=cortex-a57 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=BE
+; RUN: llc < %s -mtriple aarch64--none-eabi -mcpu=kryo -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=LE

 ; CHECK-LABEL: Strh_zero
 ; CHECK: str wzr
 define void @Strh_zero(i16* nocapture %P, i32 %n) {
 entry:
-  %idxprom = sext i32 %n to i64 
+  %idxprom = sext i32 %n to i64
   %arrayidx = getelementptr inbounds i16, i16* %P, i64 %idxprom
-  store i16 0, i16* %arrayidx 
+  store i16 0, i16* %arrayidx
   %add = add nsw i32 %n, 1
   %idxprom1 = sext i32 %add to i64
   %arrayidx2 = getelementptr inbounds i16, i16* %P, i64 %idxprom1
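
; Illustration only, not part of the patch above: the zero-store merge this
; change keeps also fires on a pair of adjacent 32-bit zero stores, rewriting
; two "str wzr" into a single "str xzr" (the second example in optimizeBlock's
; comment). A minimal sketch under the same RUN lines as Strh_zero; the
; function name and CHECK lines below are hypothetical, assuming the merge
; fires as described.
; CHECK-LABEL: Str_zero_pair_example
; CHECK: str xzr
define void @Str_zero_pair_example(i32* nocapture %P) {
entry:
  ; Two adjacent word-sized zero stores at P[0] and P[1].
  %arrayidx1 = getelementptr inbounds i32, i32* %P, i64 1
  store i32 0, i32* %P
  store i32 0, i32* %arrayidx1
  ret void
}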