Index: include/llvm/Target/TargetInstrInfo.h =================================================================== --- include/llvm/Target/TargetInstrInfo.h +++ include/llvm/Target/TargetInstrInfo.h @@ -1270,6 +1270,33 @@ unsigned CallFrameSetupOpcode, CallFrameDestroyOpcode; }; +/// \brief Provide DenseMapInfo for TargetInstrInfo::RegSubRegPair. +template<> +struct DenseMapInfo { + typedef DenseMapInfo RegInfo; + + static inline TargetInstrInfo::RegSubRegPair getEmptyKey() { + return TargetInstrInfo::RegSubRegPair(RegInfo::getEmptyKey(), + RegInfo::getEmptyKey()); + } + static inline TargetInstrInfo::RegSubRegPair getTombstoneKey() { + return TargetInstrInfo::RegSubRegPair(RegInfo::getTombstoneKey(), + RegInfo::getTombstoneKey()); + } + /// \brief Reuse getHashValue implementation from + /// std::pair. + static unsigned getHashValue(const TargetInstrInfo::RegSubRegPair &Val) { + std::pair PairVal = + std::make_pair(Val.Reg, Val.SubReg); + return DenseMapInfo>::getHashValue(PairVal); + } + static bool isEqual(const TargetInstrInfo::RegSubRegPair &LHS, + const TargetInstrInfo::RegSubRegPair &RHS) { + return RegInfo::isEqual(LHS.Reg, RHS.Reg) && + RegInfo::isEqual(LHS.SubReg, RHS.SubReg); + } +}; + } // End llvm namespace #endif Index: lib/CodeGen/PeepholeOptimizer.cpp =================================================================== --- lib/CodeGen/PeepholeOptimizer.cpp +++ lib/CodeGen/PeepholeOptimizer.cpp @@ -98,6 +98,12 @@ DisableAdvCopyOpt("disable-adv-copy-opt", cl::Hidden, cl::init(false), cl::desc("Disable advanced copy optimization")); +// Limit the number of PHI instructions to process +// in PeepholeOptimizer::getNextSource. +static cl::opt RewritePHILimit( + "rewrite-phi-limit", cl::Hidden, cl::init(10), + cl::desc("Limit the length of PHI chains nested PHIs to lookup")); + STATISTIC(NumReuse, "Number of extension results reused"); STATISTIC(NumCmps, "Number of compares eliminated"); STATISTIC(NumImmFold, "Number of move immediate folded"); @@ -132,6 +138,10 @@ } } + /// \brief Track Def -> Use info used for rewriting copies. + typedef SmallDenseMap + RewriteMapTy; + private: bool optimizeCmpInstr(MachineInstr *MI, MachineBasicBlock *MBB); bool optimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB, @@ -143,7 +153,8 @@ bool optimizeCoalescableCopy(MachineInstr *MI); bool optimizeUncoalescableCopy(MachineInstr *MI, SmallPtrSetImpl &LocalMIs); - bool findNextSource(unsigned &Reg, unsigned &SubReg); + bool findNextSource(unsigned Reg, unsigned SubReg, + RewriteMapTy &RewriteMap); bool isMoveImmediate(MachineInstr *MI, SmallSet &ImmDefRegs, DenseMap &ImmDefMIs); @@ -220,6 +231,14 @@ assert(Idx < getNumSources() && "SubReg source out of index"); return RegSrcs[Idx].SubReg; } + + bool operator==(const ValueTrackerResult &Other) { + for (int i = 0, e = Other.getNumSources(); i != e; ++i) + if (Other.getSrcReg(i) != this->getSrcReg(i) || + Other.getSrcSubReg(i) != this->getSrcSubReg(i)) + return false; + return true; + } }; /// \brief Helper class to track the possible sources of a value defined by @@ -281,6 +300,8 @@ /// \brief Specialized version of getNextSource for SubregToReg /// instructions. ValueTrackerResult getNextSourceFromSubregToReg(); + /// \brief Specialized version of getNextSource for PHI instructions. + ValueTrackerResult getNextSourceFromPHI(); public: /// \brief Create a ValueTracker instance for the value defined by \p Reg. @@ -589,58 +610,133 @@ /// \brief Try to find the next source that share the same register file /// for the value defined by \p Reg and \p SubReg. -/// When true is returned, \p Reg and \p SubReg are updated with the -/// register number and sub-register index of the new source. +/// When true is returned, the \p RewriteMap can be used by the client to +/// retrieve all Def -> Use along the way up to the next source. Any found +/// Use that is not itself a key for another entry, is the next source to +/// use. During the search for the next source, multiple sources can be found +/// given multiple incoming sources of a PHI instruction. In this case, we +/// look in each PHI source for the next source; all found next sources must +/// share the same register file as \p Reg and \p SubReg. The client should +/// then be capable to rewrite all intermediate PHIs to get the next source. /// \return False if no alternative sources are available. True otherwise. -bool PeepholeOptimizer::findNextSource(unsigned &Reg, unsigned &SubReg) { +bool PeepholeOptimizer::findNextSource(unsigned Reg, unsigned SubReg, + RewriteMapTy &RewriteMap) { // Do not try to find a new source for a physical register. // So far we do not have any motivating example for doing that. // Thus, instead of maintaining untested code, we will revisit that if // that changes at some point. if (TargetRegisterInfo::isPhysicalRegister(Reg)) return false; - const TargetRegisterClass *DefRC = MRI->getRegClass(Reg); - unsigned DefSubReg = SubReg; - unsigned Src; - unsigned SrcSubReg; + SmallVector SrcToLook; + TargetInstrInfo::RegSubRegPair CurSrcPair(Reg, SubReg); + SrcToLook.push_back(CurSrcPair); bool ShouldRewrite = false; - // Follow the chain of copies until we reach the top of the use-def chain - // or find a more suitable source. - ValueTracker ValTracker(Reg, DefSubReg, *MRI, !DisableAdvCopyOpt, TII); - do { - ValueTrackerResult Res = ValTracker.getNextSource(); - if (!Res.isValid()) - break; - Src = Res.getSrcReg(0); - SrcSubReg = Res.getSrcSubReg(0); - - // Do not extend the live-ranges of physical registers as they add - // constraints to the register allocator. - // Moreover, if we want to extend the live-range of a physical register, - // unlike SSA virtual register, we will have to check that they are not - // redefine before the related use. - if (TargetRegisterInfo::isPhysicalRegister(Src)) - break; + unsigned PHILimit = RewritePHILimit; + while (!SrcToLook.empty() && PHILimit) { + TargetInstrInfo::RegSubRegPair Pair = SrcToLook.pop_back_val(); + CurSrcPair = Pair; + ValueTracker ValTracker(CurSrcPair.Reg, CurSrcPair.SubReg, *MRI, + !DisableAdvCopyOpt, TII); + ValueTrackerResult Res; + + do { + // Follow the chain of copies until we reach the top of the use-def chain + // or find a more suitable source. + Res = ValTracker.getNextSource(); + if (!Res.isValid()) + break; + + // Insert the Def -> Use entry for the recently found source. + ValueTrackerResult CurSrcRes = RewriteMap.lookup(CurSrcPair); + if (CurSrcRes.isValid()) { + assert(CurSrcRes == Res && "ValueTrackerResult found must match"); + break; + } + RewriteMap.insert(std::make_pair(CurSrcPair, Res)); + + // ValueTrackerResult usually have one source unless it's the result from + // a PHI instruction. Add the found PHI edges to be looked up further. + unsigned NumSrcs = Res.getNumSources(); + if (NumSrcs > 1) { + PHILimit--; + for (unsigned i = 0; i < NumSrcs; ++i) + SrcToLook.push_back(TargetInstrInfo::RegSubRegPair( + Res.getSrcReg(i), Res.getSrcSubReg(i))); + break; + } - const TargetRegisterClass *SrcRC = MRI->getRegClass(Src); + CurSrcPair.Reg = Res.getSrcReg(0); + CurSrcPair.SubReg = Res.getSrcSubReg(0); + // Do not extend the live-ranges of physical registers as they add + // constraints to the register allocator. Moreover, if we want to extend + // the live-range of a physical register, unlike SSA virtual register, + // we will have to check that they aren't redefine before the related use. + if (TargetRegisterInfo::isPhysicalRegister(CurSrcPair.Reg)) + return false; - // If this source does not incur a cross register bank copy, use it. - ShouldRewrite = shareSameRegisterFile(*TRI, DefRC, DefSubReg, SrcRC, - SrcSubReg); - } while (!ShouldRewrite); + const TargetRegisterClass *SrcRC = MRI->getRegClass(CurSrcPair.Reg); + + // If this source does not incur a cross register bank copy, use it. + ShouldRewrite = shareSameRegisterFile(*TRI, DefRC, SubReg, SrcRC, + CurSrcPair.SubReg); + } while (!ShouldRewrite); + + // Continue looking for new sources... + if (Res.isValid()) + continue; + + if (!PHILimit) { + DEBUG(dbgs() << "findNextSource: PHI limit reached\n"); + return false; + } + + // Do not continue searching for a new source if the there's at least + // one use-def which cannot be rewritten. + if (!ShouldRewrite) + return false; + } // If we did not find a more suitable source, there is nothing to optimize. - if (!ShouldRewrite || Src == Reg) + if (CurSrcPair.Reg == Reg) return false; - Reg = Src; - SubReg = SrcSubReg; return true; } +/// \brief Insert a PHI instruction with incoming edges \p SrcRegs that are +/// guaranteed to have the same register class. This is necessary whenever we +/// successfully traverse a PHI instruction and find suitable sources coming +/// from its edges. By inserting a new PHI, we provide a rewritten PHI def +/// suitable to be used in a new COPY instruction. +MachineInstr * +insertPHI(MachineRegisterInfo *MRI, const TargetInstrInfo *TII, + const SmallVectorImpl &SrcRegs, + MachineInstr *OrigPHI) { + assert(!SrcRegs.empty() && "No sources to create a PHI instruction?"); + + const TargetRegisterClass *NewRC = MRI->getRegClass(SrcRegs[0].Reg); + unsigned NewVR = MRI->createVirtualRegister(NewRC); + MachineBasicBlock *MBB = OrigPHI->getParent(); + MachineInstrBuilder MIB = BuildMI(*MBB, OrigPHI, OrigPHI->getDebugLoc(), + TII->get(TargetOpcode::PHI), NewVR); + + unsigned MBBOpIdx = 2; + for (auto RegPair : SrcRegs) { + MIB.addReg(RegPair.Reg, 0, RegPair.SubReg); + MIB.addMBB(OrigPHI->getOperand(MBBOpIdx).getMBB()); + // Since we're extended the lifetime of RegPair.Reg, clear the + // kill flags to account for that and make RegPair.Reg reaches + // the new PHI. + MRI->clearKillFlags(RegPair.Reg); + MBBOpIdx += 2; + } + + return MIB; +} + namespace { /// \brief Helper class to rewrite the arguments of a copy-like instruction. class CopyRewriter { @@ -714,16 +810,65 @@ return true; } - /// \brief Rewrite the current source with \p NewSrcReg and \p NewSecSubReg - /// by creating a new COPY instruction. \p DefReg and \p DefSubReg contain the - /// definition to be rewritten from the original copylike instruction. - /// \return The new COPY if the rewriting was possible, nullptr otherwise. - /// This is needed to handle Uncoalescable copies, since they are copy - /// like instructions that aren't recognized by the register allocator. - virtual MachineInstr *RewriteCurrentSource(unsigned DefReg, - unsigned DefSubReg, - unsigned NewSrcReg, - unsigned NewSrcSubReg) { + /// \brief Given a \p Def.Reg and Def.SubReg pair, use \p RewriteMap to find + /// the new source to use for rewrite. If \p HandleMultipleSources is true and + /// multiple sources for a given \p Def are found along the way, we found a + /// PHI instructions that needs to be rewritten. + /// TODO: HandleMultipleSources should be removed once we test PHI handling + /// with coalescable copies. + TargetInstrInfo::RegSubRegPair + getNewSource(MachineRegisterInfo *MRI, const TargetInstrInfo *TII, + TargetInstrInfo::RegSubRegPair Def, + PeepholeOptimizer::RewriteMapTy &RewriteMap, + bool HandleMultipleSources = true) { + + TargetInstrInfo::RegSubRegPair LookupSrc(Def.Reg, Def.SubReg); + do { + ValueTrackerResult Res = RewriteMap.lookup(LookupSrc); + // If there are no entries on the map, LookupSrc is the new source. + if (!Res.isValid()) + return LookupSrc; + + // There's only one source for this definition, keep searching... + unsigned NumSrcs = Res.getNumSources(); + if (NumSrcs == 1) { + LookupSrc.Reg = Res.getSrcReg(0); + LookupSrc.SubReg = Res.getSrcSubReg(0); + continue; + } + + if (!HandleMultipleSources) + break; + + // Multiple sources, recurse into each source to find a new source + // for it. Then, rewrite the PHI accordingly to its new edges. + SmallVector NewPHISrcs; + for (unsigned i = 0; i < NumSrcs; ++i) { + TargetInstrInfo::RegSubRegPair PHISrc(Res.getSrcReg(i), + Res.getSrcSubReg(i)); + NewPHISrcs.push_back( + getNewSource(MRI, TII, PHISrc, RewriteMap, HandleMultipleSources)); + } + + // Build the new PHI node and return its def register as the new source. + MachineInstr *OrigPHI = const_cast(Res.getInst()); + MachineInstr *NewPHI = insertPHI(MRI, TII, NewPHISrcs, OrigPHI); + const MachineOperand &MODef = NewPHI->getOperand(0); + return TargetInstrInfo::RegSubRegPair(MODef.getReg(), MODef.getSubReg()); + + } while (1); + + return TargetInstrInfo::RegSubRegPair(0, 0); + } + + /// \brief Rewrite the source found through \p Def, by using the \p RewriteMap + /// and create a new COPY instruction. More info about RewriteMap in + /// PeepholeOptimizer::findNextSource. Right now this is only used to handle + /// Uncoalescable copies, since they are copy like instructions that aren't + /// recognized by the register allocator. + virtual MachineInstr * + RewriteSource(TargetInstrInfo::RegSubRegPair Def, + PeepholeOptimizer::RewriteMapTy &RewriteMap) { return nullptr; } }; @@ -771,31 +916,41 @@ return true; } - /// \brief Rewrite the current source with \p NewSrcReg and \p NewSrcSubReg - /// by creating a new COPY instruction. \p DefReg and \p DefSubReg contain the - /// definition to be rewritten from the original copylike instruction. - /// \return The new COPY if the rewriting was possible, nullptr otherwise. - MachineInstr *RewriteCurrentSource(unsigned DefReg, unsigned DefSubReg, - unsigned NewSrcReg, - unsigned NewSrcSubReg) override { - assert(!TargetRegisterInfo::isPhysicalRegister(DefReg) && + /// \brief Rewrite the source found through \p Def, by using the \p RewriteMap + /// and create a new COPY instruction. More info about RewriteMap in + /// PeepholeOptimizer::findNextSource. Right now this is only used to handle + /// Uncoalescable copies, since they are copy like instructions that aren't + /// recognized by the register allocator. + MachineInstr * + RewriteSource(TargetInstrInfo::RegSubRegPair Def, + PeepholeOptimizer::RewriteMapTy &RewriteMap) override { + assert(!TargetRegisterInfo::isPhysicalRegister(Def.Reg) && "We do not rewrite physical registers"); - const TargetRegisterClass *DefRC = MRI.getRegClass(DefReg); + // Find the new source to use in the COPY rewrite. + TargetInstrInfo::RegSubRegPair NewSrc = + getNewSource(&MRI, &TII, Def, RewriteMap); + + // Insert the COPY. + const TargetRegisterClass *DefRC = MRI.getRegClass(Def.Reg); unsigned NewVR = MRI.createVirtualRegister(DefRC); MachineInstr *NewCopy = BuildMI(*CopyLike.getParent(), &CopyLike, CopyLike.getDebugLoc(), TII.get(TargetOpcode::COPY), NewVR) - .addReg(NewSrcReg, 0, NewSrcSubReg); + .addReg(NewSrc.Reg, 0, NewSrc.SubReg); - NewCopy->getOperand(0).setSubReg(DefSubReg); - if (DefSubReg) + NewCopy->getOperand(0).setSubReg(Def.SubReg); + if (Def.SubReg) NewCopy->getOperand(0).setIsUndef(); - MRI.replaceRegWith(DefReg, NewVR); + MRI.replaceRegWith(Def.Reg, NewVR); MRI.clearKillFlags(NewVR); + // We extended the lifetime of NewSrc.Reg, clear the kill flags to + // account for that. + MRI.clearKillFlags(NewSrc.Reg); + return NewCopy; } }; @@ -1039,16 +1194,25 @@ unsigned SrcReg, SrcSubReg, TrackReg, TrackSubReg; while (CpyRewriter->getNextRewritableSource(SrcReg, SrcSubReg, TrackReg, TrackSubReg)) { - unsigned NewSrc = TrackReg; - unsigned NewSubReg = TrackSubReg; + // Keep track of PHI nodes and its incoming edges when looking for sources. + RewriteMapTy RewriteMap; // Try to find a more suitable source. If we failed to do so, or get the // actual source, move to the next source. - if (!findNextSource(NewSrc, NewSubReg) || SrcReg == NewSrc) + if (!findNextSource(TrackReg, TrackSubReg, RewriteMap)) + continue; + + // Get the new source to rewrite. TODO: Only enable handling of multiple + // sources (PHIs) once we have a motivating example and testcases for it. + TargetInstrInfo::RegSubRegPair TrackPair(TrackReg, TrackSubReg); + TargetInstrInfo::RegSubRegPair NewSrc = CpyRewriter->getNewSource( + MRI, TII, TrackPair, RewriteMap, false /* multiple sources */); + if (SrcReg == NewSrc.Reg) continue; + // Rewrite source. - if (CpyRewriter->RewriteCurrentSource(NewSrc, NewSubReg)) { + if (CpyRewriter->RewriteCurrentSource(NewSrc.Reg, NewSrc.SubReg)) { // We may have extended the live-range of NewSrc, account for that. - MRI->clearKillFlags(NewSrc); + MRI->clearKillFlags(NewSrc.Reg); Changed = true; } } @@ -1077,9 +1241,7 @@ assert(MI && isUncoalescableCopy(*MI) && "Invalid argument"); // Check if we can rewrite all the values defined by this instruction. - SmallVector< - std::pair, - 4> RewritePairs; + SmallVector RewritePairs; // Get the right rewriter for the current copy. std::unique_ptr CpyRewriter(getCopyRewriter(*MI, *TII, *MRI)); // If none exists, bails out. @@ -1089,39 +1251,32 @@ // Rewrite each rewritable source by generating new COPYs. This works // differently from optimizeCoalescableCopy since it first makes sure that all // definitions can be rewritten. - unsigned SrcReg, SrcSubReg, TrackReg, TrackSubReg; - while (CpyRewriter->getNextRewritableSource(SrcReg, SrcSubReg, TrackReg, - TrackSubReg)) { + RewriteMapTy RewriteMap; + unsigned Reg, SubReg, CopyDefReg, CopyDefSubReg; + while (CpyRewriter->getNextRewritableSource(Reg, SubReg, CopyDefReg, + CopyDefSubReg)) { // If a physical register is here, this is probably for a good reason. // Do not rewrite that. - if (TargetRegisterInfo::isPhysicalRegister(TrackReg)) + if (TargetRegisterInfo::isPhysicalRegister(CopyDefReg)) return false; // If we do not know how to rewrite this definition, there is no point // in trying to kill this instruction. - TargetInstrInfo::RegSubRegPair Def(TrackReg, TrackSubReg); - TargetInstrInfo::RegSubRegPair Src = Def; - if (!findNextSource(Src.Reg, Src.SubReg)) + TargetInstrInfo::RegSubRegPair Def(CopyDefReg, CopyDefSubReg); + if (!findNextSource(Def.Reg, Def.SubReg, RewriteMap)) return false; - RewritePairs.push_back(std::make_pair(Def, Src)); + RewritePairs.push_back(Def); } // The change is possible for all defs, do it. - for (const auto &PairDefSrc : RewritePairs) { - const auto &Def = PairDefSrc.first; - const auto &Src = PairDefSrc.second; - + for (const auto &Def : RewritePairs) { // Rewrite the "copy" in a way the register coalescer understands. - MachineInstr *NewCopy = CpyRewriter->RewriteCurrentSource( - Def.Reg, Def.SubReg, Src.Reg, Src.SubReg); + MachineInstr *NewCopy = CpyRewriter->RewriteSource(Def, RewriteMap); assert(NewCopy && "Should be able to always generate a new copy"); - - // We extended the lifetime of Src and clear the kill flags to - // account for that. - MRI->clearKillFlags(Src.Reg); LocalMIs.insert(NewCopy); } + // MI is now dead. MI->eraseFromParent(); ++NumUncoalescableCopies; @@ -1529,6 +1684,26 @@ Def->getOperand(3).getImm()); } +/// \brief Explore each PHI incoming operand and return its sources +ValueTrackerResult ValueTracker::getNextSourceFromPHI() { + assert(Def->isPHI() && "Invalid definition"); + ValueTrackerResult Res; + + // If we look for a different subreg, bails as we do not + // support composing subreg yet. + if (Def->getOperand(0).getSubReg() != DefSubReg) + return ValueTrackerResult(); + + // Return all register sources for PHI instructions. + for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2) { + auto &MO = Def->getOperand(i); + assert(MO.isReg() && "Invalid PHI instruction"); + Res.addSource(MO.getReg(), MO.getSubReg()); + } + + return Res; +} + ValueTrackerResult ValueTracker::getNextSourceImpl() { assert(Def && "This method needs a valid definition"); @@ -1551,6 +1726,8 @@ return getNextSourceFromExtractSubreg(); if (Def->isSubregToReg()) return getNextSourceFromSubregToReg(); + if (Def->isPHI()) + return getNextSourceFromPHI(); return ValueTrackerResult(); } Index: lib/Target/X86/X86InstrMMX.td =================================================================== --- lib/Target/X86/X86InstrMMX.td +++ lib/Target/X86/X86InstrMMX.td @@ -249,6 +249,7 @@ (MMX_X86movd2w (x86mmx VR64:$src)))], IIC_MMX_MOV_REG_MM>, Sched<[WriteMove]>; +let isBitcast = 1 in def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR64:$dst, (bitconvert GR64:$src))], @@ -262,7 +263,7 @@ // These are 64 bit moves, but since the OS X assembler doesn't // recognize a register-register movq, we write them as // movd. -let SchedRW = [WriteMove] in { +let SchedRW = [WriteMove], isBitcast = 1 in { def MMX_MOVD64from64rr : MMXRI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR64:$src), "movd\t{$src, $dst|$dst, $src}", Index: test/CodeGen/X86/mmx-coalescing.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/mmx-coalescing.ll @@ -0,0 +1,84 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+sse2 | FileCheck %s + +%SA = type <{ %union.anon, i32, [4 x i8], i8*, i8*, i8*, i32, [4 x i8] }> +%union.anon = type { <1 x i64> } + +; Check that extra movd (copy) instructions aren't generated. + +define i32 @test(%SA* %pSA, i16* %A, i32 %B, i32 %C, i32 %D, i8* %E) { +entry: +; CHECK-LABEL: test +; CHECK: # BB#0: +; CHECK-NEXT: pshufw +; CHECK-NEXT: movd +; CHECK-NOT: movd +; CHECK-NEXT: testl + %shl = shl i32 1, %B + %shl1 = shl i32 %C, %B + %shl2 = shl i32 1, %D + %v = getelementptr inbounds %SA, %SA* %pSA, i64 0, i32 0, i32 0 + %v0 = load <1 x i64>, <1 x i64>* %v, align 8 + %SA0 = getelementptr inbounds %SA, %SA* %pSA, i64 0, i32 1 + %v1 = load i32, i32* %SA0, align 4 + %SA1 = getelementptr inbounds %SA, %SA* %pSA, i64 0, i32 3 + %v2 = load i8*, i8** %SA1, align 8 + %SA2 = getelementptr inbounds %SA, %SA* %pSA, i64 0, i32 4 + %v3 = load i8*, i8** %SA2, align 8 + %v4 = bitcast <1 x i64> %v0 to <4 x i16> + %v5 = bitcast <4 x i16> %v4 to x86_mmx + %v6 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %v5, i8 -18) + %v7 = bitcast x86_mmx %v6 to <4 x i16> + %v8 = bitcast <4 x i16> %v7 to <1 x i64> + %v9 = extractelement <1 x i64> %v8, i32 0 + %v10 = bitcast i64 %v9 to <2 x i32> + %v11 = extractelement <2 x i32> %v10, i32 0 + %cmp = icmp eq i32 %v11, 0 + br i1 %cmp, label %if.A, label %if.B + +if.A: +; CHECK: %if.A +; CHECK-NEXT: movd +; CHECK-NEXT: psllq + %pa = phi <1 x i64> [ %v8, %entry ], [ %vx, %if.C ] + %v17 = extractelement <1 x i64> %pa, i32 0 + %v18 = bitcast i64 %v17 to x86_mmx + %v19 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %v18, i32 %B) #2 + %v20 = bitcast x86_mmx %v19 to i64 + %v21 = insertelement <1 x i64> undef, i64 %v20, i32 0 + %cmp3 = icmp eq i64 %v20, 0 + br i1 %cmp3, label %if.C, label %merge + +if.B: + %v34 = bitcast <1 x i64> %v8 to <4 x i16> + %v35 = bitcast <4 x i16> %v34 to x86_mmx + %v36 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %v35, i8 -18) + %v37 = bitcast x86_mmx %v36 to <4 x i16> + %v38 = bitcast <4 x i16> %v37 to <1 x i64> + br label %if.C + +if.C: + %vx = phi <1 x i64> [ %v21, %if.A ], [ %v38, %if.B ] + %cvt = bitcast <1 x i64> %vx to <2 x i32> + %ex = extractelement <2 x i32> %cvt, i32 0 + %cmp2 = icmp eq i32 %ex, 0 + br i1 %cmp2, label %if.A, label %merge + +merge: +; CHECK: %merge +; CHECK-NOT: movd +; CHECK-NEXT: pshufw + %vy = phi <1 x i64> [ %v21, %if.A ], [ %vx, %if.C ] + %v130 = bitcast <1 x i64> %vy to <4 x i16> + %v131 = bitcast <4 x i16> %v130 to x86_mmx + %v132 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %v131, i8 -18) + %v133 = bitcast x86_mmx %v132 to <4 x i16> + %v134 = bitcast <4 x i16> %v133 to <1 x i64> + %v135 = extractelement <1 x i64> %v134, i32 0 + %v136 = bitcast i64 %v135 to <2 x i32> + %v137 = extractelement <2 x i32> %v136, i32 0 + ret i32 %v137 +} + + +declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8) +declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32)