Index: lib/CodeGen/MachineCSE.cpp
===================================================================
--- lib/CodeGen/MachineCSE.cpp
+++ lib/CodeGen/MachineCSE.cpp
@@ -20,6 +20,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CFG.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -50,6 +51,8 @@
 STATISTIC(NumCoalesces, "Number of copies coalesced");
 STATISTIC(NumCSEs,      "Number of common subexpression eliminated");
+STATISTIC(NumPREs,      "Number of partially redundant expressions"
+                        " transformed to fully redundant");
 STATISTIC(NumPhysCSEs,
           "Number of physreg referencing common subexpr eliminated");
 STATISTIC(NumCrossBBCSEs,
@@ -85,6 +88,7 @@
     void releaseMemory() override {
       ScopeMap.clear();
+      PREMap.clear();
       Exps.clear();
     }
 
@@ -98,6 +102,7 @@
     unsigned LookAheadLimit = 0;
     DenseMap<MachineBasicBlock *, ScopeType *> ScopeMap;
+    DenseMap<MachineInstr *, MachineBasicBlock *, MachineInstrExpressionTrait> PREMap;
     ScopedHTType VNT;
     SmallVector<MachineInstr *, 64> Exps;
     unsigned CurrVN = 0;
@@ -118,13 +123,17 @@
                           bool &NonLocal) const;
     bool isCSECandidate(MachineInstr *MI);
     bool isProfitableToCSE(unsigned CSReg, unsigned Reg,
-                           MachineInstr *CSMI, MachineInstr *MI);
+                           MachineBasicBlock *CSBB, MachineInstr *MI);
     void EnterScope(MachineBasicBlock *MBB);
     void ExitScope(MachineBasicBlock *MBB);
     bool ProcessBlock(MachineBasicBlock *MBB);
     void ExitScopeIfDone(MachineDomTreeNode *Node,
                          DenseMap<MachineDomTreeNode *, unsigned> &OpenChildren);
     bool PerformCSE(MachineDomTreeNode *Node);
+
+    bool isPRECandidate(MachineInstr *MI);
+    bool ProcessBlockPRE(MachineDominatorTree *MDT, MachineBasicBlock *MBB);
+    bool PerformSimplePRE(MachineDominatorTree *DT);
   };
 
 } // end anonymous namespace
@@ -404,9 +413,10 @@
 }
 
 /// isProfitableToCSE - Return true if it's profitable to eliminate MI with a
-/// common expression that defines Reg.
+/// common expression that defines Reg. CSBB is the basic block where CSReg is
+/// defined.
 bool MachineCSE::isProfitableToCSE(unsigned CSReg, unsigned Reg,
-                                   MachineInstr *CSMI, MachineInstr *MI) {
+                                   MachineBasicBlock *CSBB, MachineInstr *MI) {
   // FIXME: Heuristics that works around the lack the live range splitting.
 
   // If CSReg is used at all uses of Reg, CSE should not increase register
@@ -432,7 +442,6 @@
   // an immediate predecessor. We don't want to increase register pressure and
   // end up causing other computation to be spilled.
   if (TII->isAsCheapAsAMove(*MI)) {
-    MachineBasicBlock *CSBB = CSMI->getParent();
     MachineBasicBlock *BB = MI->getParent();
     if (CSBB != BB && !CSBB->isSuccessor(BB))
       return false;
@@ -597,7 +606,7 @@
                TargetRegisterInfo::isVirtualRegister(NewReg) &&
                "Do not CSE physical register defs!");
 
-        if (!isProfitableToCSE(NewReg, OldReg, CSMI, MI)) {
+        if (!isProfitableToCSE(NewReg, OldReg, CSMI->getParent(), MI)) {
           LLVM_DEBUG(dbgs() << "*** Not profitable, avoid CSE!\n");
           DoCSE = false;
           break;
@@ -742,6 +751,102 @@
   return Changed;
 }
 
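To make the intent of the additions below concrete: an expression is partially redundant when it is computed on some, but not all, paths that reach a later occurrence of the same computation. A minimal source-level sketch of the shape being targeted (illustrative only, not part of the patch; the pass itself operates on machine instructions):

    // 'x * 17' is computed on the path through 'if (c)' and then
    // recomputed unconditionally at the return, so the second
    // occurrence is partially redundant.
    int f(bool c, int x) {
      int r = 0;
      if (c)
        r = x * 17;
      return r + x * 17;
    }

Duplicating 'x * 17' into the nearest common dominator of the two occurrences (here the entry block) makes the later occurrence fully redundant, and the existing CSE step can then eliminate it. That is exactly the duplication step the new code implements.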
+// We use stronger conditions for PREed instrs than for CSEed ones to
+// decrease the number of PREed instrs that will not later be CSEed.
+bool MachineCSE::isPRECandidate(MachineInstr *MI) {
+  if (!isCSECandidate(MI) ||
+      MI->isTransient() ||
+      MI->isBranch() ||
+      MI->isBundled() ||
+      MI->isNotDuplicable() ||
+      MI->isAsCheapAsAMove() ||
+      MI->isBarrier() ||
+      MI->isReturn() ||
+      MI->getNumDefs() != 1 ||
+      MI->getNumExplicitDefs() != 1)
+    return false;
+
+  for (auto def : MI->defs())
+    if (!TRI->isVirtualRegister(def.getReg()))
+      return false;
+
+  for (auto use : MI->uses())
+    if (use.isReg() && !TRI->isVirtualRegister(use.getReg()))
+      return false;
+
+  return true;
+}
+
+bool MachineCSE::ProcessBlockPRE(MachineDominatorTree *DT,
+                                 MachineBasicBlock *MBB) {
+  bool Changed = false;
+  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) {
+    MachineInstr *MI = &*I;
+    ++I;
+
+    if (!isPRECandidate(MI))
+      continue;
+
+    if (!PREMap.count(MI)) {
+      PREMap[MI] = MBB;
+      continue;
+    }
+
+    auto MBB1 = PREMap[MI];
+    assert(!DT->properlyDominates(MBB, MBB1) &&
+           "MBB cannot properly dominate MBB1 during a DFS over the "
+           "dominator tree!");
+    auto CMBB = DT->findNearestCommonDominator(MBB, MBB1);
+
+    // Two instrs are partially redundant if their basic blocks are reachable
+    // from one to the other but neither dominates the other.
+    if (CMBB != MBB1) {
+      auto BB = MBB->getBasicBlock(), BB1 = MBB1->getBasicBlock();
+      if (BB != nullptr && BB1 != nullptr &&
+          (isPotentiallyReachable(BB1, BB) ||
+           isPotentiallyReachable(BB, BB1))) {
+
+        assert(MI->getOperand(0).isDef() &&
+               "First operand of instr with one explicit def must be this def");
+        unsigned VReg = MI->getOperand(0).getReg();
+        unsigned NewReg = MRI->cloneVirtualRegister(VReg);
+        if (!isProfitableToCSE(NewReg, VReg, CMBB, MI))
+          continue;
+        MachineInstr &NewMI =
+            TII->duplicate(*CMBB, CMBB->getFirstTerminator(), *MI);
+        NewMI.getOperand(0).setReg(NewReg);
+
+        PREMap[MI] = CMBB;
+        ++NumPREs;
+        Changed = true;
+      }
+    }
+  }
+  return Changed;
+}
+
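PerformSimplePRE, defined next, feeds every block to ProcessBlockPRE in dominator-tree preorder, using an explicit worklist instead of recursion. A self-contained sketch of that traversal shape (simplified stand-in types, not the LLVM API):

    #include <vector>

    // Minimal stand-in for a dominator-tree node.
    struct DomNode {
      std::vector<DomNode *> Children;
    };

    // Preorder walk with an explicit worklist, mirroring the do/while
    // loop in PerformSimplePRE: pop a node, queue its children, then
    // process the node's block.
    template <typename Visitor>
    void visitPreorder(DomNode *Root, Visitor Visit) {
      std::vector<DomNode *> Work{Root};
      do {
        DomNode *N = Work.back();
        Work.pop_back();
        for (DomNode *C : N->Children)
          Work.push_back(C);
        Visit(N);
      } while (!Work.empty());
    }

Because a node is only ever pushed while its parent is being visited, every block is processed after all blocks that dominate it; the assert in ProcessBlockPRE above relies on exactly this ordering.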
+// This simple PRE (partial redundancy elimination) pass doesn't actually
+// eliminate partial redundancy; it transforms it into full redundancy,
+// anticipating that the next CSE step will eliminate the newly created
+// redundancy. If CSE does not eliminate it, the duplicated instruction
+// remains dead and is removed later by the Remove Dead Machine
+// Instructions pass.
+bool MachineCSE::PerformSimplePRE(MachineDominatorTree *DT) {
+  SmallVector<MachineDomTreeNode *, 32> BBs;
+
+  PREMap.clear();
+  bool Changed = false;
+  BBs.push_back(DT->getRootNode());
+  do {
+    auto Node = BBs.pop_back_val();
+    const std::vector<MachineDomTreeNode *> &Children = Node->getChildren();
+    for (MachineDomTreeNode *Child : Children)
+      BBs.push_back(Child);
+
+    MachineBasicBlock *MBB = Node->getBlock();
+    Changed |= ProcessBlockPRE(DT, MBB);
+
+  } while (!BBs.empty());
+
+  return Changed;
+}
+
 bool MachineCSE::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
@@ -752,5 +857,8 @@
   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
   DT = &getAnalysis<MachineDominatorTree>();
   LookAheadLimit = TII->getMachineCSELookAheadLimit();
-  return PerformCSE(DT->getRootNode());
+  bool ChangedPRE, ChangedCSE;
+  ChangedPRE = PerformSimplePRE(DT);
+  ChangedCSE = PerformCSE(DT->getRootNode());
+  return ChangedPRE || ChangedCSE;
 }
Index: test/CodeGen/Mips/internalfunc.ll
===================================================================
--- test/CodeGen/Mips/internalfunc.ll
+++ test/CodeGen/Mips/internalfunc.ll
@@ -27,8 +27,7 @@
 if.end:                                           ; preds = %entry, %if.then
 ; CHECK: lw $[[R2:[0-9]+]], %got(sf2)
 ; CHECK: addiu ${{[0-9]+}}, $[[R2]], %lo(sf2)
-; CHECK: lw $[[R3:[0-9]+]], %got(caller.sf1)
-; CHECK: sw ${{[0-9]+}}, %lo(caller.sf1)($[[R3]])
+; CHECK: sw ${{[0-9]+}}, %lo(caller.sf1)($[[R1]])
   %tobool3 = icmp ne i32 %a0, 0
   %tmp4 = load void (...)*, void (...)** @gf1, align 4
   %cond = select i1 %tobool3, void (...)* %tmp4, void (...)* bitcast (void ()* @sf2 to void (...)*)
Index: test/CodeGen/X86/avx2-masked-gather.ll
===================================================================
--- test/CodeGen/X86/avx2-masked-gather.ll
+++ test/CodeGen/X86/avx2-masked-gather.ll
@@ -236,18 +236,17 @@
 ; NOGATHER-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm2
 ; NOGATHER-NEXT: .LBB4_4: # %else2
 ; NOGATHER-NEXT: vpextrb $8, %xmm1, %eax
+; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; NOGATHER-NEXT: testb $1, %al
 ; NOGATHER-NEXT: je .LBB4_6
 ; NOGATHER-NEXT: # %bb.5: # %cond.load4
-; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm3
-; NOGATHER-NEXT: vmovq %xmm3, %rax
+; NOGATHER-NEXT: vmovq %xmm0, %rax
 ; NOGATHER-NEXT: vpinsrd $2, (%rax), %xmm2, %xmm2
 ; NOGATHER-NEXT: .LBB4_6: # %else5
 ; NOGATHER-NEXT: vpextrb $12, %xmm1, %eax
 ; NOGATHER-NEXT: testb $1, %al
 ; NOGATHER-NEXT: je .LBB4_8
 ; NOGATHER-NEXT: # %bb.7: # %cond.load7
-; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax
 ; NOGATHER-NEXT: vpinsrd $3, (%rax), %xmm2, %xmm2
 ; NOGATHER-NEXT: .LBB4_8: # %else8
@@ -295,18 +294,17 @@
 ; NOGATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
 ; NOGATHER-NEXT: .LBB5_4: # %else2
 ; NOGATHER-NEXT: vpextrb $8, %xmm1, %eax
+; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; NOGATHER-NEXT: testb $1, %al
 ; NOGATHER-NEXT: je .LBB5_6
 ; NOGATHER-NEXT: # %bb.5: # %cond.load4
-; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm3
-; NOGATHER-NEXT: vmovq %xmm3, %rax
+; NOGATHER-NEXT: vmovq %xmm0, %rax
 ; NOGATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
 ; NOGATHER-NEXT: .LBB5_6: # %else5
 ; NOGATHER-NEXT: vpextrb $12, %xmm1, %eax
 ; NOGATHER-NEXT: testb $1, %al
 ; NOGATHER-NEXT: je .LBB5_8
 ; NOGATHER-NEXT: # %bb.7: # %cond.load7
-; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax
 ; NOGATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
 ; NOGATHER-NEXT: .LBB5_8: # %else8
@@ -366,11 +364,11 @@
 ; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
 ; NOGATHER-NEXT: .LBB6_4: # %else2
 ; NOGATHER-NEXT: vpextrb $4, %xmm0, %eax
+; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm3
 ; NOGATHER-NEXT: testb $1, %al
 ; NOGATHER-NEXT: je .LBB6_6
 ; NOGATHER-NEXT: # %bb.5: # %cond.load4
-; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm4
-; NOGATHER-NEXT: vmovq %xmm4, %rax
+; NOGATHER-NEXT: vmovq %xmm3, %rax
 ; NOGATHER-NEXT: vpinsrd $2, (%rax), %xmm1, %xmm4
 ; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
 ; NOGATHER-NEXT: .LBB6_6: # %else5
@@ -378,7 +376,6 @@
 ; NOGATHER-NEXT: testb $1, %al
 ; NOGATHER-NEXT: je .LBB6_8
 ; NOGATHER-NEXT: # %bb.7: # %cond.load7
-; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm3
 ; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax
 ; NOGATHER-NEXT: vpinsrd $3, (%rax), %xmm1, %xmm3
 ; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
@@ -402,11 +399,11 @@
 ; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
 ; NOGATHER-NEXT: .LBB6_12: # %else14
 ; NOGATHER-NEXT: vpextrb $12, %xmm0, %eax
+; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm2
 ; NOGATHER-NEXT: testb $1, %al
 ; NOGATHER-NEXT: je .LBB6_14
 ; NOGATHER-NEXT: # %bb.13: # %cond.load16
-; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm3
-; NOGATHER-NEXT: vmovq %xmm3, %rax
+; NOGATHER-NEXT: vmovq %xmm2, %rax
 ; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm3
 ; NOGATHER-NEXT: vpinsrd $2, (%rax), %xmm3, %xmm3
 ; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
@@ -415,8 +412,7 @@
 ; NOGATHER-NEXT: testb $1, %al
 ; NOGATHER-NEXT: je .LBB6_16
 ; NOGATHER-NEXT: # %bb.15: # %cond.load19
-; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm0
-; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax
+; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax
 ; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0
 ; NOGATHER-NEXT: vpinsrd $3, (%rax), %xmm0, %xmm0
 ; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
@@ -477,11 +473,11 @@
 ; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
 ; NOGATHER-NEXT: .LBB7_4: # %else2
 ; NOGATHER-NEXT: vpextrb $4, %xmm0, %eax
+; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm3
 ; NOGATHER-NEXT: testb $1, %al
 ; NOGATHER-NEXT: je .LBB7_6
 ; NOGATHER-NEXT: # %bb.5: # %cond.load4
-; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm4
-; NOGATHER-NEXT: vmovq %xmm4, %rax
+; NOGATHER-NEXT: vmovq %xmm3, %rax
 ; NOGATHER-NEXT: vinsertps {{.*#+}} xmm4 = xmm1[0,1],mem[0],xmm1[3]
 ; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
 ; NOGATHER-NEXT: .LBB7_6: # %else5
@@ -489,7 +485,6 @@
 ; NOGATHER-NEXT: testb $1, %al
 ; NOGATHER-NEXT: je .LBB7_8
 ; NOGATHER-NEXT: # %bb.7: # %cond.load7
-; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm3
 ; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax
 ; NOGATHER-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],mem[0]
 ; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
@@ -514,11 +509,11 @@
 ; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
 ; NOGATHER-NEXT: .LBB7_12: # %else14
 ; NOGATHER-NEXT: vpextrb $12, %xmm0, %eax
+; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm2
 ; NOGATHER-NEXT: testb $1, %al
 ; NOGATHER-NEXT: je .LBB7_14
 ; NOGATHER-NEXT: # %bb.13: # %cond.load16
-; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm3
-; NOGATHER-NEXT: vmovq %xmm3, %rax
+; NOGATHER-NEXT: vmovq %xmm2, %rax
 ; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm3
 ; NOGATHER-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3]
 ; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
@@ -527,8 +522,7 @@
 ; NOGATHER-NEXT: testb $1, %al
 ; NOGATHER-NEXT: je .LBB7_16
 ; NOGATHER-NEXT: # %bb.15: # %cond.load19
-; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm0
-; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax
+; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax
 ; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0
 ; NOGATHER-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
 ; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
@@ -583,11 +577,11 @@
 ; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
 ; NOGATHER-NEXT: .LBB8_4: # %else2
 ; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax
+; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm2
 ; NOGATHER-NEXT: testb $1, %al
 ; NOGATHER-NEXT: je .LBB8_6
 ; NOGATHER-NEXT: # %bb.5: # %cond.load4
-; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm3
-; NOGATHER-NEXT: vmovq %xmm3, %rax
+; NOGATHER-NEXT: vmovq %xmm2, %rax
 ; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm3
 ; NOGATHER-NEXT: vpinsrq $0, (%rax), %xmm3, %xmm3
 ; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
@@ -596,8 +590,7 @@
 ; NOGATHER-NEXT: testb $1, %al
 ; NOGATHER-NEXT: je .LBB8_8
 ; NOGATHER-NEXT: # %bb.7: # %cond.load7
-; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm0
-; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax
+; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax
 ; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0
 ; NOGATHER-NEXT: vpinsrq $1, (%rax), %xmm0, %xmm0
 ; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
@@ -652,11 +645,11 @@
 ; NOGATHER-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3]
 ; NOGATHER-NEXT: .LBB9_4: # %else2
 ; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax
+; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm2
 ; NOGATHER-NEXT: testb $1, %al
 ; NOGATHER-NEXT: je .LBB9_6
 ; NOGATHER-NEXT: # %bb.5: # %cond.load4
-; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm3
-; NOGATHER-NEXT: vmovq %xmm3, %rax
+; NOGATHER-NEXT: vmovq %xmm2, %rax
 ; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm3
 ; NOGATHER-NEXT: vmovlpd {{.*#+}} xmm3 = mem[0],xmm3[1]
 ; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
@@ -665,8 +658,7 @@
 ; NOGATHER-NEXT: testb $1, %al
 ; NOGATHER-NEXT: je .LBB9_8
 ; NOGATHER-NEXT: # %bb.7: # %cond.load7
-; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm0
-; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax
+; NOGATHER-NEXT: vpextrq $1, %xmm2, %rax
 ; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0
 ; NOGATHER-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
 ; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
Index: test/CodeGen/X86/masked_gather.ll
===================================================================
--- test/CodeGen/X86/masked_gather.ll
+++ test/CodeGen/X86/masked_gather.ll
@@ -70,17 +70,16 @@
 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT: vpextrb $8, %xmm1, %eax
 ; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT: je .LBB0_6
 ; AVX1-NEXT: # %bb.5: # %cond.load4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vmovq %xmm3, %rax
+; AVX1-NEXT: vmovq %xmm0, %rax
 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
 ; AVX1-NEXT: .LBB0_6: # %else5
 ; AVX1-NEXT: vpextrb $12, %xmm1, %eax
 ; AVX1-NEXT: testb $1, %al
 ; AVX1-NEXT: je .LBB0_8
 ; AVX1-NEXT: # %bb.7: # %cond.load7
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
 ; AVX1-NEXT: .LBB0_8: # %else8
@@ -111,17 +110,16 @@
 ; AVX2-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
 ; AVX2-NEXT: vpextrb $8, %xmm1, %eax
 ; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; AVX2-NEXT: je .LBB0_6
 ; AVX2-NEXT: # %bb.5: # %cond.load4
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vmovq %xmm3, %rax
+; AVX2-NEXT: vmovq %xmm0, %rax
 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
 ; AVX2-NEXT: .LBB0_6: # %else5
 ; AVX2-NEXT: vpextrb $12, %xmm1, %eax
 ; AVX2-NEXT: testb $1, %al
 ; AVX2-NEXT: je .LBB0_8
 ; AVX2-NEXT: # %bb.7: # %cond.load7
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
 ; AVX2-NEXT: .LBB0_8: # %else8
@@ -227,17 +225,16 @@
 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT: vpextrb $8, %xmm1, %eax
 ; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT: je .LBB1_6
 ; AVX1-NEXT: # %bb.5: # %cond.load4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vmovq %xmm3, %rax
+; AVX1-NEXT: vmovq %xmm0, %rax
 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
 ; AVX1-NEXT: .LBB1_6: # %else5
 ; AVX1-NEXT: vpextrb $12, %xmm1, %eax
 ; AVX1-NEXT: testb $1, %al
 ; AVX1-NEXT: je .LBB1_8
 ; AVX1-NEXT: # %bb.7: # %cond.load7
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
 ; AVX1-NEXT: .LBB1_8: # %else8
@@ -273,17 +270,16 @@
 ; AVX2-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
 ; AVX2-NEXT: vpextrb $8, %xmm1, %eax
 ; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; AVX2-NEXT: je .LBB1_6
 ; AVX2-NEXT: # %bb.5: # %cond.load4
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vmovq %xmm3, %rax
+; AVX2-NEXT: vmovq %xmm0, %rax
 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
 ; AVX2-NEXT: .LBB1_6: # %else5
 ; AVX2-NEXT: vpextrb $12, %xmm1, %eax
 ; AVX2-NEXT: testb $1, %al
 ; AVX2-NEXT: je .LBB1_8
 ; AVX2-NEXT: # %bb.7: # %cond.load7
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
 ; AVX2-NEXT: .LBB1_8: # %else8
@@ -388,17 +384,16 @@
 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT: vpextrb $8, %xmm1, %eax
 ; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT: je .LBB2_6
 ; AVX1-NEXT: # %bb.5: # %cond.load4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vmovq %xmm3, %rax
+; AVX1-NEXT: vmovq %xmm0, %rax
 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
 ; AVX1-NEXT: .LBB2_6: # %else5
 ; AVX1-NEXT: vpextrb $12, %xmm1, %eax
 ; AVX1-NEXT: testb $1, %al
 ; AVX1-NEXT: je .LBB2_8
 ; AVX1-NEXT: # %bb.7: # %cond.load7
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
 ; AVX1-NEXT: .LBB2_8: # %else8
@@ -433,17 +428,16 @@
 ; AVX2-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
 ; AVX2-NEXT: vpextrb $8, %xmm1, %eax
 ; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; AVX2-NEXT: je .LBB2_6
 ; AVX2-NEXT: # %bb.5: # %cond.load4
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vmovq %xmm3, %rax
+; AVX2-NEXT: vmovq %xmm0, %rax
 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
 ; AVX2-NEXT: .LBB2_6: # %else5
 ; AVX2-NEXT: vpextrb $12, %xmm1, %eax
 ; AVX2-NEXT: testb $1, %al
 ; AVX2-NEXT: je .LBB2_8
 ; AVX2-NEXT: # %bb.7: # %cond.load7
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
 ; AVX2-NEXT: .LBB2_8: # %else8
@@ -662,15 +656,15 @@
 ; AVX1-NEXT: vpinsrb $1, (%rax), %xmm3, %xmm3
 ; AVX1-NEXT: .LBB3_4: # %else2
 ; AVX1-NEXT: vpmovsxdq %xmm7, %xmm6
-; AVX1-NEXT: vpaddq %xmm5, %xmm4, %xmm8
+; AVX1-NEXT: vpaddq %xmm5, %xmm4, %xmm5
 ; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7
 ; AVX1-NEXT: vpcmpeqb %xmm7, %xmm2, %xmm7
 ; AVX1-NEXT: vpextrb $2, %xmm7, %eax
 ; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT: je .LBB3_6
 ; AVX1-NEXT: # %bb.5: # %cond.load4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vmovq %xmm5, %rax
+; AVX1-NEXT: vmovq %xmm0, %rax
 ; AVX1-NEXT: vpinsrb $2, (%rax), %xmm3, %xmm3
 ; AVX1-NEXT: .LBB3_6: # %else5
 ; AVX1-NEXT: vpaddq %xmm6, %xmm4, %xmm6
@@ -678,11 +672,10 @@
 ; AVX1-NEXT: testb $1, %al
 ; AVX1-NEXT: je .LBB3_8
 ; AVX1-NEXT: # %bb.7: # %cond.load7
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
 ; AVX1-NEXT: vpinsrb $3, (%rax), %xmm3, %xmm3
 ; AVX1-NEXT: .LBB3_8: # %else8
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm0
 ; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
 ; AVX1-NEXT: vpcmpeqb %xmm5, %xmm2, %xmm5
 ; AVX1-NEXT: vpextrb $4, %xmm5, %eax
@@ -702,15 +695,15 @@
 ; AVX1-NEXT: vpinsrb $5, (%rax), %xmm3, %xmm3
 ; AVX1-NEXT: .LBB3_12: # %else14
 ; AVX1-NEXT: vpmovsxdq %xmm6, %xmm6
-; AVX1-NEXT: vpaddq %xmm7, %xmm4, %xmm8
+; AVX1-NEXT: vpaddq %xmm7, %xmm4, %xmm5
 ; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7
 ; AVX1-NEXT: vpcmpeqb %xmm7, %xmm2, %xmm7
 ; AVX1-NEXT: vpextrb $6, %xmm7, %eax
 ; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT: je .LBB3_14
 ; AVX1-NEXT: # %bb.13: # %cond.load16
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vmovq %xmm5, %rax
+; AVX1-NEXT: vmovq %xmm0, %rax
 ; AVX1-NEXT: vpinsrb $6, (%rax), %xmm3, %xmm3
 ; AVX1-NEXT: .LBB3_14: # %else17
 ; AVX1-NEXT: vpaddq %xmm6, %xmm4, %xmm6
@@ -718,12 +711,11 @@
 ; AVX1-NEXT: testb $1, %al
 ; AVX1-NEXT: je .LBB3_16
 ; AVX1-NEXT: # %bb.15: # %cond.load19
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
 ; AVX1-NEXT: vpinsrb $7, (%rax), %xmm3, %xmm3
 ; AVX1-NEXT: .LBB3_16: # %else20
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm0
 ; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
 ; AVX1-NEXT: vpcmpeqb %xmm5, %xmm2, %xmm5
 ; AVX1-NEXT: vpextrb $8, %xmm5, %eax
@@ -748,10 +740,10 @@
 ; AVX1-NEXT: vpcmpeqb %xmm6, %xmm2, %xmm6
 ; AVX1-NEXT: vpextrb $10, %xmm6, %eax
 ; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT: je .LBB3_22
 ; AVX1-NEXT: # %bb.21: # %cond.load28
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT: vmovq %xmm7, %rax
+; AVX1-NEXT: vmovq %xmm0, %rax
 ; AVX1-NEXT: vpinsrb $10, (%rax), %xmm3, %xmm3
 ; AVX1-NEXT: .LBB3_22: # %else29
 ; AVX1-NEXT: vpaddq %xmm5, %xmm4, %xmm4
@@ -759,7 +751,6 @@
 ; AVX1-NEXT: testb $1, %al
 ; AVX1-NEXT: je .LBB3_24
 ; AVX1-NEXT: # %bb.23: # %cond.load31
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
 ; AVX1-NEXT: vpinsrb $11, (%rax), %xmm3, %xmm3
 ; AVX1-NEXT: .LBB3_24: # %else32
@@ -784,17 +775,16 @@
 ; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT: vpextrb $14, %xmm1, %eax
 ; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT: je .LBB3_30
 ; AVX1-NEXT: # %bb.29: # %cond.load40
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovq %xmm2, %rax
+; AVX1-NEXT: vmovq %xmm0, %rax
 ; AVX1-NEXT: vpinsrb $14, (%rax), %xmm3, %xmm3
 ; AVX1-NEXT: .LBB3_30: # %else41
 ; AVX1-NEXT: vpextrb $15, %xmm1, %eax
 ; AVX1-NEXT: testb $1, %al
 ; AVX1-NEXT: je .LBB3_32
 ; AVX1-NEXT: # %bb.31: # %cond.load43
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
 ; AVX1-NEXT: vpinsrb $15, (%rax), %xmm3, %xmm3
 ; AVX1-NEXT: .LBB3_32: # %else44
@@ -829,10 +819,10 @@
 ; AVX2-NEXT: vpcmpeqb %xmm6, %xmm2, %xmm6
 ; AVX2-NEXT: vpextrb $2, %xmm6, %eax
 ; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm5
 ; AVX2-NEXT: je .LBB3_6
 ; AVX2-NEXT: # %bb.5: # %cond.load4
-; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm7
-; AVX2-NEXT: vmovq %xmm7, %rax
+; AVX2-NEXT: vmovq %xmm5, %rax
 ; AVX2-NEXT: vpinsrb $2, (%rax), %xmm3, %xmm3
 ; AVX2-NEXT: .LBB3_6: # %else5
 ; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
@@ -840,7 +830,6 @@
 ; AVX2-NEXT: testb $1, %al
 ; AVX2-NEXT: je .LBB3_8
 ; AVX2-NEXT: # %bb.7: # %cond.load7
-; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm5
 ; AVX2-NEXT: vpextrq $1, %xmm5, %rax
 ; AVX2-NEXT: vpinsrb $3, (%rax), %xmm3, %xmm3
 ; AVX2-NEXT: .LBB3_8: # %else8
@@ -865,10 +854,10 @@
 ; AVX2-NEXT: vpcmpeqb %xmm5, %xmm2, %xmm5
 ; AVX2-NEXT: vpextrb $6, %xmm5, %eax
 ; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; AVX2-NEXT: je .LBB3_14
 ; AVX2-NEXT: # %bb.13: # %cond.load16
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm6
-; AVX2-NEXT: vmovq %xmm6, %rax
+; AVX2-NEXT: vmovq %xmm0, %rax
 ; AVX2-NEXT: vpinsrb $6, (%rax), %xmm3, %xmm3
 ; AVX2-NEXT: .LBB3_14: # %else17
 ; AVX2-NEXT: vpmovsxdq %xmm1, %ymm6
@@ -876,7 +865,6 @@
 ; AVX2-NEXT: testb $1, %al
 ; AVX2-NEXT: je .LBB3_16
 ; AVX2-NEXT: # %bb.15: # %cond.load19
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
 ; AVX2-NEXT: vpinsrb $7, (%rax), %xmm3, %xmm3
 ; AVX2-NEXT: .LBB3_16: # %else20
@@ -902,10 +890,10 @@
 ; AVX2-NEXT: vpcmpeqb %xmm5, %xmm2, %xmm5
 ; AVX2-NEXT: vpextrb $10, %xmm5, %eax
 ; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; AVX2-NEXT: je .LBB3_22
 ; AVX2-NEXT: # %bb.21: # %cond.load28
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm6
-; AVX2-NEXT: vmovq %xmm6, %rax
+; AVX2-NEXT: vmovq %xmm0, %rax
 ; AVX2-NEXT: vpinsrb $10, (%rax), %xmm3, %xmm3
 ; AVX2-NEXT: .LBB3_22: # %else29
 ; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
@@ -913,7 +901,6 @@
 ; AVX2-NEXT: testb $1, %al
 ; AVX2-NEXT: je .LBB3_24
 ; AVX2-NEXT: # %bb.23: # %cond.load31
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
 ; AVX2-NEXT: vpinsrb $11, (%rax), %xmm3, %xmm3
 ; AVX2-NEXT: .LBB3_24: # %else32
@@ -938,17 +925,16 @@
 ; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
 ; AVX2-NEXT: vpextrb $14, %xmm1, %eax
 ; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; AVX2-NEXT: je .LBB3_30
 ; AVX2-NEXT: # %bb.29: # %cond.load40
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vmovq %xmm2, %rax
+; AVX2-NEXT: vmovq %xmm0, %rax
 ; AVX2-NEXT: vpinsrb $14, (%rax), %xmm3, %xmm3
 ; AVX2-NEXT: .LBB3_30: # %else41
 ; AVX2-NEXT: vpextrb $15, %xmm1, %eax
 ; AVX2-NEXT: testb $1, %al
 ; AVX2-NEXT: je .LBB3_32
 ; AVX2-NEXT: # %bb.31: # %cond.load43
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
 ; AVX2-NEXT: vpinsrb $15, (%rax), %xmm3, %xmm3
 ; AVX2-NEXT: .LBB3_32: # %else44
@@ -1009,9 +995,9 @@
 ; AVX512-NEXT: kshiftrw $4, %k0, %k1
 ; AVX512-NEXT: kmovw %k1, %eax
 ; AVX512-NEXT: testb $1, %al
+; AVX512-NEXT: vextracti32x4 $2, %zmm4, %xmm5
 ; AVX512-NEXT: je .LBB3_10
 ; AVX512-NEXT: # %bb.9: # %cond.load10
-; AVX512-NEXT: vextracti32x4 $2, %zmm4, %xmm5
 ; AVX512-NEXT: vmovq %xmm5, %rax
 ; AVX512-NEXT: vpinsrb $4, (%rax), %xmm2, %xmm2
 ; AVX512-NEXT: .LBB3_10: # %else11
@@ -1020,7 +1006,6 @@
 ; AVX512-NEXT: testb $1, %al
 ; AVX512-NEXT: je .LBB3_12
 ; AVX512-NEXT: # %bb.11: # %cond.load13
-; AVX512-NEXT: vextracti32x4 $2, %zmm4, %xmm5
 ; AVX512-NEXT: vpextrq $1, %xmm5, %rax
 ; AVX512-NEXT: vpinsrb $5, (%rax), %xmm2, %xmm2
 ; AVX512-NEXT: .LBB3_12: # %else14
@@ -1032,10 +1017,10 @@
 ; AVX512-NEXT: kshiftrw $6, %k0, %k1
 ; AVX512-NEXT: kmovw %k1, %eax
 ; AVX512-NEXT: testb $1, %al
+; AVX512-NEXT: vextracti32x4 $3, %zmm4, %xmm4
 ; AVX512-NEXT: je .LBB3_14
 ; AVX512-NEXT: # %bb.13: # %cond.load16
-; AVX512-NEXT: vextracti32x4 $3, %zmm4, %xmm5
-; AVX512-NEXT: vmovq %xmm5, %rax
+; AVX512-NEXT: vmovq %xmm4, %rax
 ; AVX512-NEXT: vpinsrb $6, (%rax), %xmm2, %xmm2
 ; AVX512-NEXT: .LBB3_14: # %else17
 ; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0
@@ -1044,7 +1029,6 @@
 ; AVX512-NEXT: testb $1, %al
 ; AVX512-NEXT: je .LBB3_16
 ; AVX512-NEXT: # %bb.15: # %cond.load19
-; AVX512-NEXT: vextracti32x4 $3, %zmm4, %xmm4
 ; AVX512-NEXT: vpextrq $1, %xmm4, %rax
 ; AVX512-NEXT: vpinsrb $7, (%rax), %xmm2, %xmm2
 ; AVX512-NEXT: .LBB3_16: # %else20
@@ -1098,9 +1082,9 @@
 ; AVX512-NEXT: kshiftrw $12, %k0, %k1
 ; AVX512-NEXT: kmovw %k1, %eax
 ; AVX512-NEXT: testb $1, %al
+; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm3
 ; AVX512-NEXT: je .LBB3_26
 ; AVX512-NEXT: # %bb.25: # %cond.load34
-; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm3
 ; AVX512-NEXT: vmovq %xmm3, %rax
 ; AVX512-NEXT: vpinsrb $12, (%rax), %xmm2, %xmm2
 ; AVX512-NEXT: .LBB3_26: # %else35
@@ -1109,7 +1093,6 @@
 ; AVX512-NEXT: testb $1, %al
 ; AVX512-NEXT: je .LBB3_28
 ; AVX512-NEXT: # %bb.27: # %cond.load37
-; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm3
 ; AVX512-NEXT: vpextrq $1, %xmm3, %rax
 ; AVX512-NEXT: vpinsrb $13, (%rax), %xmm2, %xmm2
 ; AVX512-NEXT: .LBB3_28: # %else38
@@ -1120,10 +1103,10 @@
 ; AVX512-NEXT: kshiftrw $14, %k0, %k1
 ; AVX512-NEXT: kmovw %k1, %eax
 ; AVX512-NEXT: testb $1, %al
+; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm0
 ; AVX512-NEXT: je .LBB3_30
 ; AVX512-NEXT: # %bb.29: # %cond.load40
-; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX512-NEXT: vmovq %xmm1, %rax
+; AVX512-NEXT: vmovq %xmm0, %rax
 ; AVX512-NEXT: vpinsrb $14, (%rax), %xmm2, %xmm2
 ; AVX512-NEXT: .LBB3_30: # %else41
 ; AVX512-NEXT: kshiftrw $15, %k0, %k0
@@ -1131,7 +1114,6 @@
 ; AVX512-NEXT: testb $1, %al
 ; AVX512-NEXT: je .LBB3_32
 ; AVX512-NEXT: # %bb.31: # %cond.load43
-; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm0
 ; AVX512-NEXT: vpextrq $1, %xmm0, %rax
 ; AVX512-NEXT: vpinsrb $15, (%rax), %xmm2, %xmm2
 ; AVX512-NEXT: .LBB3_32: # %else44