Index: include/llvm/CodeGen/Passes.h =================================================================== --- include/llvm/CodeGen/Passes.h +++ include/llvm/CodeGen/Passes.h @@ -129,6 +129,10 @@ // instruction and update the MachineFunctionInfo with that information. extern char &ShrinkWrapID; + /// LiveRangeShrink pass. Move instruction close to its definition to shrink + /// the definition's live range. + extern char &LiveRangeShrinkID; + /// Greedy register allocator. extern char &RAGreedyID; Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -186,6 +186,7 @@ void initializeLiveDebugValuesPass(PassRegistry&); void initializeLiveDebugVariablesPass(PassRegistry&); void initializeLiveIntervalsPass(PassRegistry&); +void initializeLiveRangeShrinkPass(PassRegistry&); void initializeLiveRegMatrixPass(PassRegistry&); void initializeLiveStacksPass(PassRegistry&); void initializeLiveVariablesPass(PassRegistry&); Index: lib/CodeGen/CMakeLists.txt =================================================================== --- lib/CodeGen/CMakeLists.txt +++ lib/CodeGen/CMakeLists.txt @@ -48,6 +48,7 @@ LivePhysRegs.cpp LiveRangeCalc.cpp LiveRangeEdit.cpp + LiveRangeShrink.cpp LiveRegMatrix.cpp LiveRegUnits.cpp LiveStackAnalysis.cpp Index: lib/CodeGen/CodeGen.cpp =================================================================== --- lib/CodeGen/CodeGen.cpp +++ lib/CodeGen/CodeGen.cpp @@ -43,6 +43,7 @@ initializeLiveDebugValuesPass(Registry); initializeLiveDebugVariablesPass(Registry); initializeLiveIntervalsPass(Registry); + initializeLiveRangeShrinkPass(Registry); initializeLiveStacksPass(Registry); initializeLiveVariablesPass(Registry); initializeLocalStackSlotPassPass(Registry); Index: lib/CodeGen/LiveRangeShrink.cpp =================================================================== --- /dev/null +++ lib/CodeGen/LiveRangeShrink.cpp @@ -0,0 +1,202 @@ +//===-- LiveRangeShrink.cpp - Move instructions to shrink live range ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// \file +// This pass moves instructions close to the definition of its operands to +// shrink live range of the def instruction. The code motion is limited within +// the basic block. The moved instruction should have 1 def, and more than one +// uses, all of which are the only use of the def. +// +//===----------------------------------------------------------------------===// +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "lrshrink" + +using namespace llvm; + +namespace { +class LiveRangeShrink : public MachineFunctionPass { +public: + static char ID; + + LiveRangeShrink() : MachineFunctionPass(ID) { + initializeLiveRangeShrinkPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + StringRef getPassName() const override { return "Live Range Shrink"; } + + bool runOnMachineFunction(MachineFunction &MF) override; +}; +} // End anonymous namespace. + +char LiveRangeShrink::ID = 0; +char &llvm::LiveRangeShrinkID = LiveRangeShrink::ID; + +INITIALIZE_PASS(LiveRangeShrink, "lrshrink", "Live Range Shrink Pass", false, + false) +namespace { +typedef DenseMap InstOrderMap; + +/// Returns \p New if it's dominated by \p Old, otherwise return \p Old. +/// \p M maintains a map from instruction to its dominating order that satisfies +/// M[A] > M[B] guarantees that A is dominated by B. +/// If \p New is not in \p M, return \p Old. Otherwise if \p Old is null, return +/// \p New. +MachineInstr *FindDominatedInstruction(MachineInstr &New, MachineInstr *Old, + const InstOrderMap &M) { + auto NewIter = M.find(&New); + if (NewIter == M.end()) + return Old; + if (Old == nullptr) + return &New; + unsigned OrderOld = M.find(Old)->second; + unsigned OrderNew = NewIter->second; + if (OrderOld != OrderNew) + return OrderOld < OrderNew ? &New : Old; + // OrderOld == OrderNew, we need to iterate down from Old to see if it + // can reach New, if yes, New is dominated by Old. + for (MachineInstr *I = Old->getNextNode(); M.find(I)->second == OrderNew; + I = I->getNextNode()) + if (I == &New) + return &New; + return Old; +} + +/// Builds Instruction to its dominating order number map \p M by traversing +/// from instruction \p Start. Returns true if any store instruction is seen. +bool BuildInstOrderMap(MachineBasicBlock::iterator Start, InstOrderMap &M) { + bool SawStore = false; + M.clear(); + unsigned i = 0; + for (MachineInstr &I : make_range(Start, Start->getParent()->end())) { + M[&I] = i++; + if (I.mayStore()) + SawStore = true; + } + return SawStore; +} +} // end anonymous namespace + +bool LiveRangeShrink::runOnMachineFunction(MachineFunction &MF) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + + DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n'); + + InstOrderMap IOM; + for (MachineBasicBlock &MBB : MF) { + if (MBB.empty()) + continue; + bool SawStore = BuildInstOrderMap(MBB.begin(), IOM); + + // Map from register to instruction order (value of IOM) where the + // register is used last. When moving instructions up, we need to + // make sure all its defs (including dead def) will not cross its + // last use when moving up. + DenseMap UseMap; + + for (MachineBasicBlock::iterator Next = MBB.begin(); Next != MBB.end();) { + MachineInstr &MI = *Next; + ++Next; + if (MI.isPHI() || MI.isDebugValue()) + continue; + + unsigned CurrentOrder = IOM[&MI]; + unsigned Barrier = 0; + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isReg() || MO.isDebug()) + continue; + if (MO.isUse()) + UseMap[MO.getReg()] = CurrentOrder; + else if (MO.isDead() && UseMap.count(MO.getReg())) + // Barrier is the last instruction where MO get used. MI should not + // be moved above Barrier. + Barrier = std::max(Barrier, UseMap[MO.getReg()]); + } + + if (!MI.isSafeToMove(nullptr, SawStore)) { + // If MI has side effects, it should become a barrier for code motion. + // IOM is rebuild from the next instruction to prevent later + // instructions from being moved before this MI. + if (MI.hasUnmodeledSideEffects() && Next != MBB.end()) + BuildInstOrderMap(Next, IOM); + continue; + } + + const MachineOperand *DefMO = nullptr; + MachineInstr *Insert = nullptr; + + // Number of live-ranges that will be shortened. We do not count + // live-ranges that are defined by a COPY as it could be coalesced later. + unsigned NumEligibleUse = 0; + + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isReg() || MO.isDead() || MO.isDebug()) + continue; + unsigned Reg = MO.getReg(); + // Do not move the instruction if it def/uses a physical register, + // unless it is a constant physical register. + if (TargetRegisterInfo::isPhysicalRegister(Reg) && + !MRI.isConstantPhysReg(Reg)) { + Insert = nullptr; + break; + } + if (MO.isDef()) { + // Do not move if there is more than one def + if (DefMO) { + Insert = nullptr; + break; + } + DefMO = &MO; + } else if (MRI.hasOneNonDBGUse(Reg) && MRI.hasOneDef(Reg)) { + MachineInstr &DefInstr = *MRI.def_instr_begin(Reg); + if (!DefInstr.isCopy()) + NumEligibleUse++; + Insert = FindDominatedInstruction(DefInstr, Insert, IOM); + } else { + Insert = nullptr; + break; + } + } + // Move the instruction when # of shrinked live range > 1 + if (DefMO && Insert && NumEligibleUse > 1 && Barrier <= IOM[Insert]) { + MachineBasicBlock::iterator I = std::next(Insert->getIterator()); + // Skip all the PHI instructions + while (I != MBB.end() && I->isPHI()) + I = std::next(I); + if (I == MI.getIterator()) + continue; + + // Update the dominator order to be the same as the insertion point. + // We do this to maintain a non-decreasing order without need to update + // all instruction orders after the insertion point. + unsigned NewOrder = IOM[&*I]; + IOM[&MI] = NewOrder; + + // Find MI's debug value following MI. + MachineBasicBlock::iterator EndIter = std::next(MI.getIterator()); + if (MI.getOperand(0).isReg()) + for (; EndIter != MBB.end() && EndIter->isDebugValue() && + EndIter->getOperand(0).isReg() && + EndIter->getOperand(0).getReg() == MI.getOperand(0).getReg(); + ++EndIter) + IOM[&*EndIter] = NewOrder; + MBB.splice(I, &MBB, MI.getIterator(), EndIter); + } + } + } + return false; +} Index: lib/CodeGen/TargetPassConfig.cpp =================================================================== --- lib/CodeGen/TargetPassConfig.cpp +++ lib/CodeGen/TargetPassConfig.cpp @@ -607,6 +607,9 @@ addPass(&LocalStackSlotAllocationID, false); } + if (getOptLevel() != CodeGenOpt::None) + addPass(&LiveRangeShrinkID); + // Run pre-ra passes. addPreRegAlloc(); Index: test/CodeGen/AArch64/arm64-ccmp.ll =================================================================== --- test/CodeGen/AArch64/arm64-ccmp.ll +++ test/CodeGen/AArch64/arm64-ccmp.ll @@ -378,11 +378,11 @@ ; CHECK-NEXT: cmp x0, #13 ; CHECK-NOT: ccmp ; CHECK-NEXT: cset [[REG1:w[0-9]+]], gt +; CHECK-NEXT: and [[REG4:w[0-9]+]], [[REG0]], [[REG1]] ; CHECK-NEXT: cmp x2, #2 ; CHECK-NEXT: cset [[REG2:w[0-9]+]], lt ; CHECK-NEXT: cmp x2, #4 ; CHECK-NEXT: cset [[REG3:w[0-9]+]], gt -; CHECK-NEXT: and [[REG4:w[0-9]+]], [[REG0]], [[REG1]] ; CHECK-NEXT: and [[REG5:w[0-9]+]], [[REG2]], [[REG3]] ; CHECK-NEXT: orr [[REG6:w[0-9]+]], [[REG4]], [[REG5]] ; CHECK-NEXT: cmp [[REG6]], #0 Index: test/CodeGen/AArch64/arm64-misched-multimmo.ll =================================================================== --- test/CodeGen/AArch64/arm64-misched-multimmo.ll +++ test/CodeGen/AArch64/arm64-misched-multimmo.ll @@ -12,7 +12,7 @@ ; CHECK: Successors: ; CHECK-NOT: ch SU(4) ; CHECK: SU(3) -; CHECK: SU(4): STRWui %WZR, %X{{[0-9]+}} +; CHECK: SU(5): STRWui %WZR, %X{{[0-9]+}} define i32 @foo() { entry: %0 = load i32, i32* getelementptr inbounds ([100 x i32], [100 x i32]* @G2, i64 0, i64 0), align 4 Index: test/CodeGen/NVPTX/sched1.ll =================================================================== --- test/CodeGen/NVPTX/sched1.ll +++ test/CodeGen/NVPTX/sched1.ll @@ -6,10 +6,10 @@ ; CHECK: .func foo ; CHECK: ld.u32 ; CHECK-NEXT: ld.u32 -; CHECK-NEXT: ld.u32 -; CHECK-NEXT: ld.u32 ; CHECK-NEXT: add.s32 +; CHECK-NEXT: ld.u32 ; CHECK-NEXT: add.s32 +; CHECK-NEXT: ld.u32 ; CHECK-NEXT: add.s32 %ptr0 = getelementptr i32, i32* %a, i32 0 %val0 = load i32, i32* %ptr0 Index: test/CodeGen/NVPTX/sched2.ll =================================================================== --- test/CodeGen/NVPTX/sched2.ll +++ test/CodeGen/NVPTX/sched2.ll @@ -4,12 +4,12 @@ ; CHECK: .func foo ; CHECK: ld.v2.u32 ; CHECK-NEXT: ld.v2.u32 -; CHECK-NEXT: ld.v2.u32 -; CHECK-NEXT: ld.v2.u32 ; CHECK-NEXT: add.s32 ; CHECK-NEXT: add.s32 +; CHECK-NEXT: ld.v2.u32 ; CHECK-NEXT: add.s32 ; CHECK-NEXT: add.s32 +; CHECK-NEXT: ld.v2.u32 ; CHECK-NEXT: add.s32 ; CHECK-NEXT: add.s32 %ptr0 = getelementptr <2 x i32>, <2 x i32>* %a, i32 0 Index: test/CodeGen/NVPTX/vec8.ll =================================================================== --- test/CodeGen/NVPTX/vec8.ll +++ test/CodeGen/NVPTX/vec8.ll @@ -7,7 +7,7 @@ ; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [foo_param_0] ; CHECK-DAG: ld.param.v4.u8 {[[E4:%rs[0-9]+]], [[E5:%rs[0-9]+]], [[E6:%rs[0-9]+]], [[E7:%rs[0-9]+]]}, [foo_param_0+4] ; CHECK-DAG: ld.param.u32 %[[B:r[0-9+]]], [foo_param_1] -; CHECK: add.s16 [[T:%rs[0-9+]]], [[E1]], [[E6]]; +; CHECK-DAG: add.s16 [[T:%rs[0-9+]]], [[E1]], [[E6]]; ; CHECK: st.u8 [%[[B]]], [[T]]; %t0 = extractelement <8 x i8> %a, i32 1 %t1 = extractelement <8 x i8> %a, i32 6 Index: test/CodeGen/SPARC/LeonItinerariesUT.ll =================================================================== --- test/CodeGen/SPARC/LeonItinerariesUT.ll +++ test/CodeGen/SPARC/LeonItinerariesUT.ll @@ -28,9 +28,9 @@ ; LEON3_4_ITIN-LABEL: f32_ops: ; LEON3_4_ITIN: ld ; LEON3_4_ITIN-NEXT: ld -; LEON3_4_ITIN-NEXT: ld ; LEON3_4_ITIN-NEXT: fadds ; LEON3_4_ITIN-NEXT: ld +; LEON3_4_ITIN-NEXT: ld ; LEON3_4_ITIN-NEXT: fsubs ; LEON3_4_ITIN-NEXT: fmuls ; LEON3_4_ITIN-NEXT: retl @@ -47,4 +47,4 @@ %6 = fmul float %5, %3 %7 = fdiv float %6, %4 ret float %7 -} \ No newline at end of file +} Index: test/CodeGen/X86/2007-01-08-InstrSched.ll =================================================================== --- test/CodeGen/X86/2007-01-08-InstrSched.ll +++ test/CodeGen/X86/2007-01-08-InstrSched.ll @@ -13,10 +13,10 @@ ; CHECK: mulss ; CHECK: mulss -; CHECK: mulss -; CHECK: mulss ; CHECK: addss +; CHECK: mulss ; CHECK: addss +; CHECK: mulss ; CHECK: addss ; CHECK: ret } Index: test/CodeGen/X86/avg.ll =================================================================== --- test/CodeGen/X86/avg.ll +++ test/CodeGen/X86/avg.ll @@ -135,88 +135,87 @@ define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) { ; SSE2-LABEL: avg_v32i8: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm8 -; SSE2-NEXT: movdqa 16(%rdi), %xmm11 +; SSE2-NEXT: movdqa (%rdi), %xmm3 +; SSE2-NEXT: movdqa 16(%rdi), %xmm8 ; SSE2-NEXT: movdqa (%rsi), %xmm0 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 ; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm10 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm10, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm8, %xmm12 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm3, %xmm12 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm8, %xmm7 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm7, %xmm11 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm8, %xmm10 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm11, %xmm15 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm15, %xmm14 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm11, %xmm9 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm2, %xmm9 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: paddd %xmm6, %xmm9 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: paddd %xmm5, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: paddd %xmm12, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE2-NEXT: paddd %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE2-NEXT: paddd %xmm11, %xmm6 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: paddd %xmm7, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm1, %xmm13 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; SSE2-NEXT: paddd %xmm10, %xmm7 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: paddd %xmm11, %xmm1 -; SSE2-NEXT: paddd %xmm9, %xmm13 -; SSE2-NEXT: paddd %xmm15, %xmm2 -; SSE2-NEXT: paddd %xmm14, %xmm5 -; SSE2-NEXT: paddd %xmm8, %xmm0 -; SSE2-NEXT: paddd %xmm12, %xmm6 -; SSE2-NEXT: paddd %xmm10, %xmm3 -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm8, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] -; SSE2-NEXT: paddd %xmm4, %xmm7 -; SSE2-NEXT: paddd %xmm4, %xmm3 -; SSE2-NEXT: paddd %xmm4, %xmm6 -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: paddd %xmm4, %xmm5 +; SSE2-NEXT: paddd %xmm4, %xmm9 ; SSE2-NEXT: paddd %xmm4, %xmm2 -; SSE2-NEXT: paddd %xmm4, %xmm13 +; SSE2-NEXT: paddd %xmm4, %xmm5 +; SSE2-NEXT: paddd %xmm4, %xmm0 +; SSE2-NEXT: paddd %xmm4, %xmm6 +; SSE2-NEXT: paddd %xmm4, %xmm3 +; SSE2-NEXT: paddd %xmm4, %xmm7 ; SSE2-NEXT: paddd %xmm4, %xmm1 -; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: psrld $1, %xmm1 ; SSE2-NEXT: psrld $1, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm4, %xmm7 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: packuswb %xmm7, %xmm3 -; SSE2-NEXT: psrld $1, %xmm0 +; SSE2-NEXT: psrld $1, %xmm3 ; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: pand %xmm4, %xmm6 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: packuswb %xmm6, %xmm0 -; SSE2-NEXT: packuswb %xmm3, %xmm0 -; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm0 ; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE2-NEXT: pand %xmm4, %xmm9 ; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: packuswb %xmm5, %xmm2 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm13 -; SSE2-NEXT: pand %xmm4, %xmm13 +; SSE2-NEXT: packuswb %xmm9, %xmm2 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm5, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm6 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: packuswb %xmm6, %xmm3 +; SSE2-NEXT: pand %xmm4, %xmm7 ; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: packuswb %xmm13, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 +; SSE2-NEXT: packuswb %xmm7, %xmm1 +; SSE2-NEXT: packuswb %xmm3, %xmm1 ; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq @@ -259,198 +258,183 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) { ; SSE2-LABEL: avg_v64i8: ; SSE2: # BB#0: -; SSE2-NEXT: subq $152, %rsp -; SSE2-NEXT: .Lcfi0: -; SSE2-NEXT: .cfi_def_cfa_offset 160 -; SSE2-NEXT: movdqa (%rdi), %xmm1 -; SSE2-NEXT: movdqa 16(%rdi), %xmm4 -; SSE2-NEXT: movdqa 32(%rdi), %xmm5 -; SSE2-NEXT: movdqa 48(%rdi), %xmm6 +; SSE2-NEXT: movdqa (%rdi), %xmm6 +; SSE2-NEXT: movdqa 16(%rdi), %xmm2 +; SSE2-NEXT: movdqa 32(%rdi), %xmm1 +; SSE2-NEXT: movdqa 48(%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa (%rsi), %xmm5 +; SSE2-NEXT: movdqa 16(%rsi), %xmm13 +; SSE2-NEXT: movdqa 32(%rsi), %xmm11 ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm6, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm5, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm6, %xmm12 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm2, %xmm15 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm15, %xmm14 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm2, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm5, %xmm10 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm10, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: paddd %xmm7, %xmm3 ; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; SSE2-NEXT: paddd %xmm4, %xmm10 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm5, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: paddd %xmm12, %xmm3 +; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE2-NEXT: paddd %xmm6, %xmm5 ; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm6, %xmm8 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm8, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm6, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa (%rsi), %xmm14 -; SSE2-NEXT: movdqa %xmm14, %xmm7 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm7, %xmm15 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm13, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm4, %xmm12 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; SSE2-NEXT: paddd %xmm14, %xmm12 +; SSE2-NEXT: movdqa %xmm7, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm14, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; SSE2-NEXT: movdqa 16(%rsi), %xmm12 -; SSE2-NEXT: movdqa %xmm12, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE2-NEXT: paddd %xmm15, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3],xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm13, %xmm15 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; SSE2-NEXT: paddd %xmm8, %xmm15 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] +; SSE2-NEXT: paddd %xmm2, %xmm13 +; SSE2-NEXT: movdqa %xmm11, %xmm6 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm6, %xmm13 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm6, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] +; SSE2-NEXT: paddd %xmm5, %xmm9 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm12, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; SSE2-NEXT: movdqa 32(%rsi), %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm5, %xmm11 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE2-NEXT: paddd %xmm7, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3],xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm11, %xmm14 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] +; SSE2-NEXT: paddd %xmm2, %xmm14 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm5, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; SSE2-NEXT: paddd %xmm1, %xmm11 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: movdqa 48(%rsi), %xmm7 +; SSE2-NEXT: movdqa %xmm7, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; SSE2-NEXT: paddd %xmm1, %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE2-NEXT: paddd %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm5, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm7, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSE2-NEXT: paddd %xmm1, %xmm5 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: movdqa 48(%rsi), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: paddd %xmm8, %xmm4 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Folded Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload -; SSE2-NEXT: paddd (%rsp), %xmm11 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm12 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm10 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm6 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm13 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm14 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm9 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm15 # 16-byte Folded Reload +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; SSE2-NEXT: paddd %xmm2, %xmm7 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: paddd %xmm0, %xmm10 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload +; SSE2-NEXT: paddd %xmm0, %xmm2 +; SSE2-NEXT: paddd %xmm0, %xmm12 +; SSE2-NEXT: paddd %xmm0, %xmm4 ; SSE2-NEXT: paddd %xmm0, %xmm15 -; SSE2-NEXT: paddd %xmm0, %xmm7 -; SSE2-NEXT: paddd %xmm0, %xmm9 -; SSE2-NEXT: paddd %xmm0, %xmm14 ; SSE2-NEXT: paddd %xmm0, %xmm13 +; SSE2-NEXT: paddd %xmm0, %xmm9 ; SSE2-NEXT: paddd %xmm0, %xmm6 -; SSE2-NEXT: paddd %xmm0, %xmm10 -; SSE2-NEXT: paddd %xmm0, %xmm12 +; SSE2-NEXT: paddd %xmm0, %xmm14 ; SSE2-NEXT: paddd %xmm0, %xmm11 -; SSE2-NEXT: paddd %xmm0, %xmm5 -; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: paddd %xmm0, %xmm2 ; SSE2-NEXT: paddd %xmm0, %xmm8 -; SSE2-NEXT: paddd %xmm0, %xmm4 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload ; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: psrld $1, %xmm7 -; SSE2-NEXT: psrld $1, %xmm15 +; SSE2-NEXT: paddd %xmm0, %xmm5 +; SSE2-NEXT: paddd %xmm0, %xmm7 +; SSE2-NEXT: psrld $1, %xmm10 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: psrld $1, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm10 +; SSE2-NEXT: packuswb %xmm1, %xmm10 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: psrld $1, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: packuswb %xmm1, %xmm2 +; SSE2-NEXT: packuswb %xmm10, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: psrld $1, %xmm4 +; SSE2-NEXT: psrld $1, %xmm12 +; SSE2-NEXT: pand %xmm0, %xmm12 +; SSE2-NEXT: pand %xmm0, %xmm4 +; SSE2-NEXT: packuswb %xmm12, %xmm4 +; SSE2-NEXT: psrld $1, %xmm13 +; SSE2-NEXT: psrld $1, %xmm15 ; SSE2-NEXT: pand %xmm0, %xmm15 -; SSE2-NEXT: pand %xmm0, %xmm7 -; SSE2-NEXT: packuswb %xmm15, %xmm7 -; SSE2-NEXT: psrld $1, %xmm14 +; SSE2-NEXT: pand %xmm0, %xmm13 +; SSE2-NEXT: packuswb %xmm15, %xmm13 +; SSE2-NEXT: packuswb %xmm4, %xmm13 +; SSE2-NEXT: psrld $1, %xmm6 ; SSE2-NEXT: psrld $1, %xmm9 ; SSE2-NEXT: pand %xmm0, %xmm9 -; SSE2-NEXT: pand %xmm0, %xmm14 -; SSE2-NEXT: packuswb %xmm9, %xmm14 -; SSE2-NEXT: packuswb %xmm7, %xmm14 -; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: psrld $1, %xmm13 -; SSE2-NEXT: pand %xmm0, %xmm13 ; SSE2-NEXT: pand %xmm0, %xmm6 -; SSE2-NEXT: packuswb %xmm13, %xmm6 -; SSE2-NEXT: psrld $1, %xmm12 -; SSE2-NEXT: psrld $1, %xmm10 -; SSE2-NEXT: pand %xmm0, %xmm10 -; SSE2-NEXT: pand %xmm0, %xmm12 -; SSE2-NEXT: packuswb %xmm10, %xmm12 -; SSE2-NEXT: packuswb %xmm6, %xmm12 -; SSE2-NEXT: psrld $1, %xmm5 +; SSE2-NEXT: packuswb %xmm9, %xmm6 ; SSE2-NEXT: psrld $1, %xmm11 +; SSE2-NEXT: psrld $1, %xmm14 +; SSE2-NEXT: pand %xmm0, %xmm14 ; SSE2-NEXT: pand %xmm0, %xmm11 -; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: packuswb %xmm11, %xmm5 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload -; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: pand %xmm0, %xmm6 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: packuswb %xmm6, %xmm2 -; SSE2-NEXT: packuswb %xmm5, %xmm2 -; SSE2-NEXT: psrld $1, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm5 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm0, %xmm4 -; SSE2-NEXT: packuswb %xmm5, %xmm4 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: packuswb %xmm14, %xmm11 +; SSE2-NEXT: packuswb %xmm6, %xmm11 +; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: psrld $1, %xmm8 +; SSE2-NEXT: pand %xmm0, %xmm8 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: packuswb %xmm8, %xmm3 +; SSE2-NEXT: psrld $1, %xmm7 ; SSE2-NEXT: psrld $1, %xmm5 ; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm5, %xmm1 -; SSE2-NEXT: packuswb %xmm4, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm7 +; SSE2-NEXT: packuswb %xmm5, %xmm7 +; SSE2-NEXT: packuswb %xmm3, %xmm7 +; SSE2-NEXT: movdqu %xmm7, (%rax) +; SSE2-NEXT: movdqu %xmm11, (%rax) +; SSE2-NEXT: movdqu %xmm13, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) -; SSE2-NEXT: movdqu %xmm2, (%rax) -; SSE2-NEXT: movdqu %xmm12, (%rax) -; SSE2-NEXT: movdqu %xmm14, (%rax) -; SSE2-NEXT: addq $152, %rsp ; SSE2-NEXT: retq ; ; AVX2-LABEL: avg_v64i8: @@ -464,21 +448,21 @@ ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm12 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm15, %ymm7, %ymm7 -; AVX2-NEXT: vpaddd %ymm14, %ymm6, %ymm6 -; AVX2-NEXT: vpaddd %ymm13, %ymm5, %ymm5 -; AVX2-NEXT: vpaddd %ymm12, %ymm4, %ymm4 -; AVX2-NEXT: vpaddd %ymm11, %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm10, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm9, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm1 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm2, %ymm2 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm3 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm4, %ymm4 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm5, %ymm5 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm6, %ymm6 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm7, %ymm7 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm8 ; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm9 ; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm10 @@ -540,13 +524,13 @@ ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpaddd %zmm7, %zmm3, %zmm3 -; AVX512F-NEXT: vpaddd %zmm6, %zmm2, %zmm2 -; AVX512F-NEXT: vpaddd %zmm5, %zmm1, %zmm1 ; AVX512F-NEXT: vpaddd %zmm4, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpaddd %zmm4, %zmm1, %zmm1 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpaddd %zmm4, %zmm2, %zmm2 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpaddd %zmm4, %zmm3, %zmm3 ; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm4 ; AVX512F-NEXT: vpaddd %zmm4, %zmm0, %zmm0 ; AVX512F-NEXT: vpaddd %zmm4, %zmm1, %zmm1 @@ -673,27 +657,27 @@ define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) { ; SSE2-LABEL: avg_v16i16: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm4 -; SSE2-NEXT: movdqa 16(%rdi), %xmm5 +; SSE2-NEXT: movdqa (%rdi), %xmm2 +; SSE2-NEXT: movdqa 16(%rdi), %xmm4 ; SSE2-NEXT: movdqa (%rsi), %xmm0 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: movdqa %xmm4, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; SSE2-NEXT: movdqa %xmm5, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SSE2-NEXT: paddd %xmm6, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE2-NEXT: paddd %xmm5, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] ; SSE2-NEXT: paddd %xmm7, %xmm2 -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: paddd %xmm8, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE2-NEXT: paddd %xmm4, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] ; SSE2-NEXT: paddd %xmm4, %xmm3 ; SSE2-NEXT: paddd %xmm4, %xmm0 @@ -755,80 +739,79 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) { ; SSE2-LABEL: avg_v32i16: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm10 -; SSE2-NEXT: movdqa 16(%rdi), %xmm9 -; SSE2-NEXT: movdqa 32(%rdi), %xmm11 +; SSE2-NEXT: movdqa (%rdi), %xmm4 +; SSE2-NEXT: movdqa 16(%rdi), %xmm11 +; SSE2-NEXT: movdqa 32(%rdi), %xmm10 ; SSE2-NEXT: movdqa 48(%rdi), %xmm8 -; SSE2-NEXT: movdqa (%rsi), %xmm14 +; SSE2-NEXT: movdqa (%rsi), %xmm9 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 ; SSE2-NEXT: movdqa 32(%rsi), %xmm2 ; SSE2-NEXT: movdqa 48(%rsi), %xmm3 ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm10, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm9, %xmm12 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm11, %xmm15 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm11, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm10, %xmm12 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] ; SSE2-NEXT: movdqa %xmm8, %xmm13 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm14, %xmm7 +; SSE2-NEXT: movdqa %xmm9, %xmm7 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; SSE2-NEXT: paddd %xmm6, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; SSE2-NEXT: paddd %xmm4, %xmm9 ; SSE2-NEXT: movdqa %xmm1, %xmm6 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE2-NEXT: paddd %xmm5, %xmm6 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: paddd %xmm11, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm5 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSE2-NEXT: paddd %xmm12, %xmm5 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: paddd %xmm10, %xmm2 ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE2-NEXT: paddd %xmm13, %xmm4 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; SSE2-NEXT: paddd %xmm8, %xmm3 -; SSE2-NEXT: paddd %xmm13, %xmm4 -; SSE2-NEXT: paddd %xmm11, %xmm2 -; SSE2-NEXT: paddd %xmm15, %xmm5 -; SSE2-NEXT: paddd %xmm9, %xmm1 -; SSE2-NEXT: paddd %xmm12, %xmm6 -; SSE2-NEXT: paddd %xmm10, %xmm14 -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] ; SSE2-NEXT: paddd %xmm0, %xmm7 -; SSE2-NEXT: paddd %xmm0, %xmm14 +; SSE2-NEXT: paddd %xmm0, %xmm9 ; SSE2-NEXT: paddd %xmm0, %xmm6 ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: paddd %xmm0, %xmm5 ; SSE2-NEXT: paddd %xmm0, %xmm2 ; SSE2-NEXT: paddd %xmm0, %xmm4 ; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: psrld $1, %xmm14 +; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: psrld $1, %xmm4 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm5 +; SSE2-NEXT: psrld $1, %xmm1 +; SSE2-NEXT: psrld $1, %xmm6 +; SSE2-NEXT: psrld $1, %xmm9 ; SSE2-NEXT: psrld $1, %xmm7 ; SSE2-NEXT: pslld $16, %xmm7 ; SSE2-NEXT: psrad $16, %xmm7 -; SSE2-NEXT: pslld $16, %xmm14 -; SSE2-NEXT: psrad $16, %xmm14 -; SSE2-NEXT: packssdw %xmm7, %xmm14 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm6 +; SSE2-NEXT: pslld $16, %xmm9 +; SSE2-NEXT: psrad $16, %xmm9 +; SSE2-NEXT: packssdw %xmm7, %xmm9 ; SSE2-NEXT: pslld $16, %xmm6 ; SSE2-NEXT: psrad $16, %xmm6 ; SSE2-NEXT: pslld $16, %xmm1 ; SSE2-NEXT: psrad $16, %xmm1 ; SSE2-NEXT: packssdw %xmm6, %xmm1 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm5 ; SSE2-NEXT: pslld $16, %xmm5 ; SSE2-NEXT: psrad $16, %xmm5 ; SSE2-NEXT: pslld $16, %xmm2 ; SSE2-NEXT: psrad $16, %xmm2 ; SSE2-NEXT: packssdw %xmm5, %xmm2 -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: psrld $1, %xmm4 ; SSE2-NEXT: pslld $16, %xmm4 ; SSE2-NEXT: psrad $16, %xmm4 ; SSE2-NEXT: pslld $16, %xmm3 @@ -837,7 +820,7 @@ ; SSE2-NEXT: movdqu %xmm3, (%rax) ; SSE2-NEXT: movdqu %xmm2, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) -; SSE2-NEXT: movdqu %xmm14, (%rax) +; SSE2-NEXT: movdqu %xmm9, (%rax) ; SSE2-NEXT: retq ; ; AVX2-LABEL: avg_v32i16: @@ -847,13 +830,13 @@ ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm4 ; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1 @@ -884,9 +867,9 @@ ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpaddd %zmm3, %zmm1, %zmm1 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1 @@ -1047,88 +1030,87 @@ define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) { ; SSE2-LABEL: avg_v32i8_2: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm8 -; SSE2-NEXT: movdqa 16(%rdi), %xmm11 +; SSE2-NEXT: movdqa (%rdi), %xmm3 +; SSE2-NEXT: movdqa 16(%rdi), %xmm8 ; SSE2-NEXT: movdqa (%rsi), %xmm0 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 ; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm10 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm10, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm8, %xmm12 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm3, %xmm12 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm8, %xmm7 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm7, %xmm11 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm8, %xmm10 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm11, %xmm15 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm15, %xmm14 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm11, %xmm9 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm2, %xmm9 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: paddd %xmm6, %xmm9 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: paddd %xmm5, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: paddd %xmm12, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE2-NEXT: paddd %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE2-NEXT: paddd %xmm11, %xmm6 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: paddd %xmm7, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm1, %xmm13 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; SSE2-NEXT: paddd %xmm10, %xmm7 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: paddd %xmm11, %xmm1 -; SSE2-NEXT: paddd %xmm9, %xmm13 -; SSE2-NEXT: paddd %xmm15, %xmm2 -; SSE2-NEXT: paddd %xmm14, %xmm5 -; SSE2-NEXT: paddd %xmm8, %xmm0 -; SSE2-NEXT: paddd %xmm12, %xmm6 -; SSE2-NEXT: paddd %xmm10, %xmm3 -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm8, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] -; SSE2-NEXT: paddd %xmm4, %xmm7 -; SSE2-NEXT: paddd %xmm4, %xmm3 -; SSE2-NEXT: paddd %xmm4, %xmm6 -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: paddd %xmm4, %xmm5 +; SSE2-NEXT: paddd %xmm4, %xmm9 ; SSE2-NEXT: paddd %xmm4, %xmm2 -; SSE2-NEXT: paddd %xmm4, %xmm13 +; SSE2-NEXT: paddd %xmm4, %xmm5 +; SSE2-NEXT: paddd %xmm4, %xmm0 +; SSE2-NEXT: paddd %xmm4, %xmm6 +; SSE2-NEXT: paddd %xmm4, %xmm3 +; SSE2-NEXT: paddd %xmm4, %xmm7 ; SSE2-NEXT: paddd %xmm4, %xmm1 -; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: psrld $1, %xmm1 ; SSE2-NEXT: psrld $1, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm4, %xmm7 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: packuswb %xmm7, %xmm3 -; SSE2-NEXT: psrld $1, %xmm0 +; SSE2-NEXT: psrld $1, %xmm3 ; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: pand %xmm4, %xmm6 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: packuswb %xmm6, %xmm0 -; SSE2-NEXT: packuswb %xmm3, %xmm0 -; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm0 ; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE2-NEXT: pand %xmm4, %xmm9 ; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: packuswb %xmm5, %xmm2 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm13 -; SSE2-NEXT: pand %xmm4, %xmm13 +; SSE2-NEXT: packuswb %xmm9, %xmm2 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm5, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm6 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: packuswb %xmm6, %xmm3 +; SSE2-NEXT: pand %xmm4, %xmm7 ; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: packuswb %xmm13, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 +; SSE2-NEXT: packuswb %xmm7, %xmm1 +; SSE2-NEXT: packuswb %xmm3, %xmm1 ; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq @@ -1512,27 +1494,27 @@ define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) { ; SSE2-LABEL: avg_v16i16_2: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm4 -; SSE2-NEXT: movdqa 16(%rdi), %xmm5 +; SSE2-NEXT: movdqa (%rdi), %xmm2 +; SSE2-NEXT: movdqa 16(%rdi), %xmm4 ; SSE2-NEXT: movdqa (%rsi), %xmm0 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: movdqa %xmm4, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; SSE2-NEXT: movdqa %xmm5, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SSE2-NEXT: paddd %xmm6, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE2-NEXT: paddd %xmm5, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] ; SSE2-NEXT: paddd %xmm7, %xmm2 -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: paddd %xmm8, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE2-NEXT: paddd %xmm4, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] ; SSE2-NEXT: paddd %xmm4, %xmm3 ; SSE2-NEXT: paddd %xmm4, %xmm0 @@ -1594,80 +1576,79 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) { ; SSE2-LABEL: avg_v32i16_2: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm10 -; SSE2-NEXT: movdqa 16(%rdi), %xmm9 -; SSE2-NEXT: movdqa 32(%rdi), %xmm11 +; SSE2-NEXT: movdqa (%rdi), %xmm4 +; SSE2-NEXT: movdqa 16(%rdi), %xmm11 +; SSE2-NEXT: movdqa 32(%rdi), %xmm10 ; SSE2-NEXT: movdqa 48(%rdi), %xmm8 -; SSE2-NEXT: movdqa (%rsi), %xmm14 +; SSE2-NEXT: movdqa (%rsi), %xmm9 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 ; SSE2-NEXT: movdqa 32(%rsi), %xmm2 ; SSE2-NEXT: movdqa 48(%rsi), %xmm3 ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm10, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm9, %xmm12 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm11, %xmm15 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm11, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm10, %xmm12 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] ; SSE2-NEXT: movdqa %xmm8, %xmm13 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm14, %xmm7 +; SSE2-NEXT: movdqa %xmm9, %xmm7 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; SSE2-NEXT: paddd %xmm6, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; SSE2-NEXT: paddd %xmm4, %xmm9 ; SSE2-NEXT: movdqa %xmm1, %xmm6 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE2-NEXT: paddd %xmm5, %xmm6 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: paddd %xmm11, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm5 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSE2-NEXT: paddd %xmm12, %xmm5 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: paddd %xmm10, %xmm2 ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE2-NEXT: paddd %xmm13, %xmm4 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; SSE2-NEXT: paddd %xmm8, %xmm3 -; SSE2-NEXT: paddd %xmm13, %xmm4 -; SSE2-NEXT: paddd %xmm11, %xmm2 -; SSE2-NEXT: paddd %xmm15, %xmm5 -; SSE2-NEXT: paddd %xmm9, %xmm1 -; SSE2-NEXT: paddd %xmm12, %xmm6 -; SSE2-NEXT: paddd %xmm10, %xmm14 -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] ; SSE2-NEXT: paddd %xmm0, %xmm7 -; SSE2-NEXT: paddd %xmm0, %xmm14 +; SSE2-NEXT: paddd %xmm0, %xmm9 ; SSE2-NEXT: paddd %xmm0, %xmm6 ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: paddd %xmm0, %xmm5 ; SSE2-NEXT: paddd %xmm0, %xmm2 ; SSE2-NEXT: paddd %xmm0, %xmm4 ; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: psrld $1, %xmm14 +; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: psrld $1, %xmm4 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm5 +; SSE2-NEXT: psrld $1, %xmm1 +; SSE2-NEXT: psrld $1, %xmm6 +; SSE2-NEXT: psrld $1, %xmm9 ; SSE2-NEXT: psrld $1, %xmm7 ; SSE2-NEXT: pslld $16, %xmm7 ; SSE2-NEXT: psrad $16, %xmm7 -; SSE2-NEXT: pslld $16, %xmm14 -; SSE2-NEXT: psrad $16, %xmm14 -; SSE2-NEXT: packssdw %xmm7, %xmm14 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm6 +; SSE2-NEXT: pslld $16, %xmm9 +; SSE2-NEXT: psrad $16, %xmm9 +; SSE2-NEXT: packssdw %xmm7, %xmm9 ; SSE2-NEXT: pslld $16, %xmm6 ; SSE2-NEXT: psrad $16, %xmm6 ; SSE2-NEXT: pslld $16, %xmm1 ; SSE2-NEXT: psrad $16, %xmm1 ; SSE2-NEXT: packssdw %xmm6, %xmm1 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm5 ; SSE2-NEXT: pslld $16, %xmm5 ; SSE2-NEXT: psrad $16, %xmm5 ; SSE2-NEXT: pslld $16, %xmm2 ; SSE2-NEXT: psrad $16, %xmm2 ; SSE2-NEXT: packssdw %xmm5, %xmm2 -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: psrld $1, %xmm4 ; SSE2-NEXT: pslld $16, %xmm4 ; SSE2-NEXT: psrad $16, %xmm4 ; SSE2-NEXT: pslld $16, %xmm3 @@ -1676,7 +1657,7 @@ ; SSE2-NEXT: movdqu %xmm3, (%rax) ; SSE2-NEXT: movdqu %xmm2, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) -; SSE2-NEXT: movdqu %xmm14, (%rax) +; SSE2-NEXT: movdqu %xmm9, (%rax) ; SSE2-NEXT: retq ; ; AVX2-LABEL: avg_v32i16_2: @@ -1686,13 +1667,13 @@ ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm4 ; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1 @@ -1723,9 +1704,9 @@ ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpaddd %zmm3, %zmm1, %zmm1 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1 Index: test/CodeGen/X86/avx-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/avx-intrinsics-fast-isel.ll +++ test/CodeGen/X86/avx-intrinsics-fast-isel.ll @@ -2244,11 +2244,11 @@ ; X32: # BB#0: ; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; X32-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; X32-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; X32-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X32-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; X32-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_set_pd: @@ -2269,19 +2269,19 @@ ; X32: # BB#0: ; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero -; X32-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3] -; X32-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3] -; X32-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0] ; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] -; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] -; X32-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] +; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_set_ps: @@ -2881,10 +2881,10 @@ ; X32: # BB#0: ; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; X32-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero ; X32-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; X32-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm2[0] +; X32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; X32-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X32-NEXT: retl ; @@ -2908,16 +2908,16 @@ ; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; X32-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero ; X32-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] ; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] ; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[2,3] -; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0],xmm1[3] -; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] +; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X32-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X32-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero +; X32-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] +; X32-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3] +; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X32-NEXT: retl ; Index: test/CodeGen/X86/avx.ll =================================================================== --- test/CodeGen/X86/avx.ll +++ test/CodeGen/X86/avx.ll @@ -113,11 +113,11 @@ ; CHECK-NOT: mov ; CHECK: insertps $48 ; CHECK: insertps $48 +; CHECK: vaddps ; CHECK: insertps $48 ; CHECK: insertps $48 ; CHECK: vaddps ; CHECK: vaddps -; CHECK: vaddps ; CHECK-NEXT: ret %1 = getelementptr inbounds float, float* %fb, i64 %index %2 = load float, float* %1, align 4 Index: test/CodeGen/X86/avx512-cmp-kor-sequence.ll =================================================================== --- test/CodeGen/X86/avx512-cmp-kor-sequence.ll +++ test/CodeGen/X86/avx512-cmp-kor-sequence.ll @@ -13,10 +13,10 @@ ; CHECK: # BB#0: # %entry ; CHECK-NEXT: vcmpgeps %zmm4, %zmm0, %k0 ; CHECK-NEXT: vcmpgeps %zmm4, %zmm1, %k1 -; CHECK-NEXT: vcmpgeps %zmm4, %zmm2, %k2 -; CHECK-NEXT: vcmpgeps %zmm4, %zmm3, %k3 ; CHECK-NEXT: korw %k1, %k0, %k0 -; CHECK-NEXT: korw %k3, %k2, %k1 +; CHECK-NEXT: vcmpgeps %zmm4, %zmm2, %k1 +; CHECK-NEXT: vcmpgeps %zmm4, %zmm3, %k2 +; CHECK-NEXT: korw %k2, %k1, %k1 ; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: # kill: %AX %AX %EAX Index: test/CodeGen/X86/avx512-gather-scatter-intrin.ll =================================================================== --- test/CodeGen/X86/avx512-gather-scatter-intrin.ll +++ test/CodeGen/X86/avx512-gather-scatter-intrin.ll @@ -852,16 +852,16 @@ ; CHECK-NEXT: kxorw %k0, %k0, %k1 ; CHECK-NEXT: vmovaps %zmm1, %zmm3 ; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1} +; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: movw $1, %ax ; CHECK-NEXT: kmovd %eax, %k1 -; CHECK-NEXT: vmovaps %zmm1, %zmm4 -; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm4 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm3 +; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1} ; CHECK-NEXT: movw $220, %ax ; CHECK-NEXT: kmovd %eax, %k1 ; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} -; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm0 -; CHECK-NEXT: vaddps %zmm4, %zmm1, %zmm1 -; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vaddps %zmm3, %zmm1, %zmm0 +; CHECK-NEXT: vaddps %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 -1, i32 4) %res1 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 0, i32 4) Index: test/CodeGen/X86/avx512-intrinsics-upgrade.ll =================================================================== --- test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -9,8 +9,8 @@ ; CHECK-NEXT: vbroadcastss %xmm0, %zmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vbroadcastss %xmm0, %zmm1 {%k1} -; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vaddps %zmm1, %zmm2, %zmm1 +; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq @@ -30,8 +30,8 @@ ; CHECK-NEXT: vbroadcastsd %xmm0, %zmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1} -; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vaddpd %zmm1, %zmm2, %zmm1 +; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq @@ -51,8 +51,8 @@ ; CHECK-NEXT: vpbroadcastd %xmm0, %zmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpbroadcastd %xmm0, %zmm1 {%k1} -; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm1 +; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 -1) @@ -71,8 +71,8 @@ ; CHECK-NEXT: vpbroadcastq %xmm0, %zmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpbroadcastq %xmm0, %zmm1 {%k1} -; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm1, %zmm2, %zmm1 +; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 -1) @@ -91,8 +91,8 @@ ; CHECK-NEXT: vmovsldup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] -; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] ; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2) @@ -111,8 +111,8 @@ ; CHECK-NEXT: vmovshdup {{.*#+}} zmm2 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovshdup {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] -; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2) @@ -131,8 +131,8 @@ ; CHECK-NEXT: vmovddup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6] ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6] -; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6] ; CHECK-NEXT: vaddpd %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6] ; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 %x2) @@ -671,9 +671,9 @@ ; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) %res1 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) @@ -1616,9 +1616,9 @@ ; CHECK-NEXT: vshufpd {{.*#+}} zmm3 = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6] ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vshufpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6] +; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6] -; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 %x4) %res1 = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1) @@ -2031,8 +2031,8 @@ ; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm2 ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 4, <8 x i64> %x2, i8 %x3) @@ -2051,8 +2051,8 @@ ; CHECK-NEXT: vpsrld $4, %zmm0, %zmm2 ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpsrld $4, %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vpsrld $4, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vpsrld $4, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 4, <16 x i32> %x2, i16 %x3) @@ -2651,8 +2651,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpermilps {{.*#+}} zmm2 {%k1} = zmm0[2,3,0,1,7,6,5,4,9,8,11,10,12,13,14,15] ; CHECK-NEXT: vpermilps {{.*#+}} zmm1 {%k1} {z} = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] -; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12] ; CHECK-NEXT: vaddps %zmm1, %zmm2, %zmm1 +; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12] ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> , <16 x float> %x2, i16 %x3) @@ -2989,9 +2989,9 @@ ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm3 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vaddps %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 %x4) %res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 -1) @@ -3010,9 +3010,9 @@ ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm3 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4) %res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1) @@ -3030,9 +3030,9 @@ ; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm3 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vaddpd %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 %x4) %res1 = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 -1) @@ -3050,9 +3050,9 @@ ; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4) %res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1) Index: test/CodeGen/X86/avx512-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512-intrinsics.ll +++ test/CodeGen/X86/avx512-intrinsics.ll @@ -274,11 +274,11 @@ ; CHECK-NEXT: vmovaps %xmm2, %xmm3 ; CHECK-NEXT: vsqrtss %xmm1, %xmm0, %xmm3 {%k1} ; CHECK-NEXT: vsqrtss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vsqrtss {ru-sae}, %xmm1, %xmm0, %xmm4 {%k1} {z} +; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vsqrtss {ru-sae}, %xmm1, %xmm0, %xmm3 {%k1} {z} ; CHECK-NEXT: vsqrtss {rz-sae}, %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm1 -; CHECK-NEXT: vaddps %xmm0, %xmm4, %xmm0 -; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 ; CHECK-NEXT: retq %res0 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4) %res1 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 1) @@ -301,11 +301,11 @@ ; CHECK-NEXT: vmovapd %xmm2, %xmm3 ; CHECK-NEXT: vsqrtsd %xmm1, %xmm0, %xmm3 {%k1} ; CHECK-NEXT: vsqrtsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vsqrtsd {ru-sae}, %xmm1, %xmm0, %xmm4 {%k1} {z} +; CHECK-NEXT: vaddpd %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vsqrtsd {ru-sae}, %xmm1, %xmm0, %xmm3 {%k1} {z} ; CHECK-NEXT: vsqrtsd {rz-sae}, %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vaddpd %xmm2, %xmm3, %xmm1 -; CHECK-NEXT: vaddpd %xmm0, %xmm4, %xmm0 -; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 ; CHECK-NEXT: retq %res0 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4) %res1 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 1) @@ -477,11 +477,11 @@ define i64 @test_x86_avx512_cvtsd2usi64(<2 x double> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtsd2usi64: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtsd2usi %xmm0, %rcx -; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %rax -; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %rdx +; CHECK-NEXT: vcvtsd2usi %xmm0, %rax +; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %rcx +; CHECK-NEXT: addq %rax, %rcx +; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %rax ; CHECK-NEXT: addq %rcx, %rax -; CHECK-NEXT: addq %rdx, %rax ; CHECK-NEXT: retq %res = call i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double> %a0, i32 4) @@ -496,11 +496,11 @@ define i64 @test_x86_avx512_cvtsd2si64(<2 x double> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtsd2si64: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtsd2si %xmm0, %rcx -; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %rax -; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %rdx +; CHECK-NEXT: vcvtsd2si %xmm0, %rax +; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %rcx +; CHECK-NEXT: addq %rax, %rcx +; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %rax ; CHECK-NEXT: addq %rcx, %rax -; CHECK-NEXT: addq %rdx, %rax ; CHECK-NEXT: retq %res = call i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double> %a0, i32 4) @@ -515,11 +515,11 @@ define i64 @test_x86_avx512_cvtss2usi64(<4 x float> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtss2usi64: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtss2usi %xmm0, %rcx -; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %rax -; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %rdx +; CHECK-NEXT: vcvtss2usi %xmm0, %rax +; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %rcx +; CHECK-NEXT: addq %rax, %rcx +; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %rax ; CHECK-NEXT: addq %rcx, %rax -; CHECK-NEXT: addq %rdx, %rax ; CHECK-NEXT: retq %res = call i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float> %a0, i32 4) @@ -534,11 +534,11 @@ define i64 @test_x86_avx512_cvtss2si64(<4 x float> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtss2si64: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtss2si %xmm0, %rcx -; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %rax -; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %rdx +; CHECK-NEXT: vcvtss2si %xmm0, %rax +; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %rcx +; CHECK-NEXT: addq %rax, %rcx +; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %rax ; CHECK-NEXT: addq %rcx, %rax -; CHECK-NEXT: addq %rdx, %rax ; CHECK-NEXT: retq %res = call i64 @llvm.x86.avx512.vcvtss2si64(<4 x float> %a0, i32 4) @@ -553,11 +553,11 @@ define i32 @test_x86_avx512_cvtsd2usi32(<2 x double> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtsd2usi32: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtsd2usi %xmm0, %ecx -; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %eax -; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %edx +; CHECK-NEXT: vcvtsd2usi %xmm0, %eax +; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %ecx +; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %eax ; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: addl %edx, %eax ; CHECK-NEXT: retq %res = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 4) @@ -572,11 +572,11 @@ define i32 @test_x86_avx512_cvtsd2si32(<2 x double> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtsd2si32: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtsd2si %xmm0, %ecx -; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %eax -; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %edx +; CHECK-NEXT: vcvtsd2si %xmm0, %eax +; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %ecx +; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %eax ; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: addl %edx, %eax ; CHECK-NEXT: retq %res = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 4) @@ -591,11 +591,11 @@ define i32 @test_x86_avx512_cvtss2usi32(<4 x float> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtss2usi32: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtss2usi %xmm0, %ecx -; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %eax -; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %edx +; CHECK-NEXT: vcvtss2usi %xmm0, %eax +; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %ecx +; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %eax ; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: addl %edx, %eax ; CHECK-NEXT: retq %res = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 4) @@ -610,11 +610,11 @@ define i32 @test_x86_avx512_cvtss2si32(<4 x float> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtss2si32: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtss2si %xmm0, %ecx -; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %eax -; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %edx +; CHECK-NEXT: vcvtss2si %xmm0, %eax +; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %ecx +; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %eax ; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: addl %edx, %eax ; CHECK-NEXT: retq %res = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 4) @@ -683,8 +683,9 @@ ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vcvtps2ph $2, %zmm0, %ymm1 {%k1} ; CHECK-NEXT: vcvtps2ph $2, %zmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm1 ; CHECK-NEXT: vcvtps2ph $2, %zmm0, (%rsi) -; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm0 +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %res1 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1) %res2 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 %mask) @@ -3656,11 +3657,11 @@ ; CHECK-NEXT: vmovaps %xmm2, %xmm3 ; CHECK-NEXT: vgetexpss %xmm1, %xmm0, %xmm3 {%k1} ; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm4 {%k1} {z} +; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm3 {%k1} {z} ; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm1 -; CHECK-NEXT: vaddps %xmm0, %xmm4, %xmm0 -; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 ; CHECK-NEXT: retq %res0 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4) %res1 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8) @@ -3684,10 +3685,10 @@ ; CHECK-NEXT: vgetexpsd %xmm1, %xmm0, %xmm3 {%k1} ; CHECK-NEXT: vgetexpsd %xmm1, %xmm0, %xmm4 ; CHECK-NEXT: vgetexpsd {sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vaddpd %xmm2, %xmm3, %xmm2 ; CHECK-NEXT: vgetexpsd {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z} -; CHECK-NEXT: vaddpd %xmm2, %xmm3, %xmm1 ; CHECK-NEXT: vaddpd %xmm4, %xmm0, %xmm0 -; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 ; CHECK-NEXT: retq %res0 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4) %res1 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8) @@ -3903,11 +3904,11 @@ ; CHECK-NEXT: vmovapd %xmm2, %xmm3 ; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm3 {%k1} ; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm4 {%k1} {z} -; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm5 +; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm3 +; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm4 ; CHECK-NEXT: vgetmantsd $11, {sae}, %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm0 -; CHECK-NEXT: vaddpd %xmm5, %xmm2, %xmm1 -; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddpd %xmm4, %xmm2, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> %x2, i8 %x3, i32 4) %res1 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> zeroinitializer, i8 %x3, i32 4) @@ -3928,11 +3929,11 @@ ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm3 {%k1} {z} -; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm4 +; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm2 +; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm3 ; CHECK-NEXT: vgetmantss $11, {sae}, %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm1 -; CHECK-NEXT: vaddps %xmm4, %xmm0, %xmm0 -; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vaddps %xmm3, %xmm0, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> %x2, i8 %x3, i32 4) %res1 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> zeroinitializer, i8 %x3, i32 4) @@ -4434,8 +4435,8 @@ ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vprold $3, %zmm0, %zmm1 {%k1} ; CHECK-NEXT: vprold $3, %zmm0, %zmm2 {%k1} {z} -; CHECK-NEXT: vprold $3, %zmm0, %zmm0 ; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vprold $3, %zmm0, %zmm0 ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3) @@ -4454,8 +4455,8 @@ ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vprolq $3, %zmm0, %zmm1 {%k1} ; CHECK-NEXT: vprolq $3, %zmm0, %zmm2 {%k1} {z} -; CHECK-NEXT: vprolq $3, %zmm0, %zmm0 ; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vprolq $3, %zmm0, %zmm0 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3) @@ -4556,9 +4557,9 @@ ; CHECK-NEXT: vfixupimmpd $4, %zmm2, %zmm1, %zmm3 {%k1} ; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4 ; CHECK-NEXT: vfixupimmpd $5, %zmm2, %zmm1, %zmm4 {%k1} {z} +; CHECK-NEXT: vaddpd %zmm4, %zmm3, %zmm3 ; CHECK-NEXT: vfixupimmpd $3, {sae}, %zmm2, %zmm1, %zmm0 -; CHECK-NEXT: vaddpd %zmm4, %zmm3, %zmm1 -; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: retq %res = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 4, i8 %x4, i32 4) %res1 = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> zeroinitializer, <8 x double> %x1, <8 x i64> %x2, i32 5, i8 %x4, i32 4) @@ -4579,9 +4580,9 @@ ; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4 ; CHECK-NEXT: vmovapd %zmm0, %zmm5 ; CHECK-NEXT: vfixupimmpd $5, %zmm4, %zmm1, %zmm5 {%k1} {z} +; CHECK-NEXT: vaddpd %zmm5, %zmm3, %zmm3 ; CHECK-NEXT: vfixupimmpd $2, {sae}, %zmm2, %zmm1, %zmm0 -; CHECK-NEXT: vaddpd %zmm5, %zmm3, %zmm1 -; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: retq %res = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 %x4, i32 4) %res1 = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> zeroinitializer, i32 5, i8 %x4, i32 4) @@ -4603,9 +4604,9 @@ ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vmovaps %xmm0, %xmm5 ; CHECK-NEXT: vfixupimmss $5, %xmm4, %xmm1, %xmm5 {%k1} +; CHECK-NEXT: vaddps %xmm5, %xmm3, %xmm3 ; CHECK-NEXT: vfixupimmss $5, {sae}, %xmm2, %xmm1, %xmm0 -; CHECK-NEXT: vaddps %xmm5, %xmm3, %xmm1 -; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4, i32 4) %res1 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4, i32 4) @@ -4650,9 +4651,9 @@ ; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4 ; CHECK-NEXT: vmovaps %zmm0, %zmm5 ; CHECK-NEXT: vfixupimmps $5, %zmm4, %zmm1, %zmm5 {%k1} +; CHECK-NEXT: vaddps %zmm5, %zmm3, %zmm3 ; CHECK-NEXT: vfixupimmps $5, {sae}, %zmm2, %zmm1, %zmm0 -; CHECK-NEXT: vaddps %zmm5, %zmm3, %zmm1 -; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 %x4, i32 4) %res1 = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> zeroinitializer, i32 5, i16 %x4, i32 4) @@ -4721,9 +4722,9 @@ ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vmovapd %xmm0, %xmm5 ; CHECK-NEXT: vfixupimmsd $5, {sae}, %xmm4, %xmm1, %xmm5 {%k1} {z} +; CHECK-NEXT: vaddpd %xmm5, %xmm3, %xmm3 ; CHECK-NEXT: vfixupimmsd $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} -; CHECK-NEXT: vaddpd %xmm5, %xmm3, %xmm1 -; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 4) %res1 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 5, i8 %x4, i32 8) @@ -4821,12 +4822,12 @@ ; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm3 {%k1} ; CHECK-NEXT: vmovapd %xmm0, %xmm4 ; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm4 -; CHECK-NEXT: vmovapd %xmm0, %xmm5 -; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm5 {%k1} +; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vmovapd %xmm0, %xmm4 +; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm4 {%k1} ; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0 -; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm1 -; CHECK-NEXT: vaddpd %xmm5, %xmm0, %xmm0 -; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vaddpd %xmm4, %xmm0, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4) %res1 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4) @@ -4849,12 +4850,12 @@ ; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm3 {%k1} ; CHECK-NEXT: vmovaps %xmm0, %xmm4 ; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm4 -; CHECK-NEXT: vmovaps %xmm0, %xmm5 -; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm5 {%k1} +; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vmovaps %xmm0, %xmm4 +; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm4 {%k1} ; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm0 -; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm1 -; CHECK-NEXT: vaddps %xmm5, %xmm0, %xmm0 -; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vaddps %xmm4, %xmm0, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4) %res1 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) @@ -4909,12 +4910,12 @@ ; CHECK-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm3 {%k1} ; CHECK-NEXT: vmovapd %xmm2, %xmm4 ; CHECK-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm4 -; CHECK-NEXT: vmovapd %xmm2, %xmm5 -; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1} +; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vmovapd %xmm2, %xmm4 +; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm4 {%k1} ; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm2 -; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0 -; CHECK-NEXT: vaddpd %xmm5, %xmm2, %xmm1 -; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddpd %xmm4, %xmm2, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4) %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4) @@ -4937,12 +4938,12 @@ ; CHECK-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm3 {%k1} ; CHECK-NEXT: vmovaps %xmm2, %xmm4 ; CHECK-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm4 -; CHECK-NEXT: vmovaps %xmm2, %xmm5 -; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1} +; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vmovaps %xmm2, %xmm4 +; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm4 {%k1} ; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm2 -; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0 -; CHECK-NEXT: vaddps %xmm5, %xmm2, %xmm1 -; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddps %xmm4, %xmm2, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4) %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) @@ -5069,12 +5070,12 @@ ; CHECK-NEXT: vfmsub231sd %xmm1, %xmm0, %xmm3 {%k1} ; CHECK-NEXT: vmovapd %xmm2, %xmm4 ; CHECK-NEXT: vfmsub231sd %xmm1, %xmm0, %xmm4 -; CHECK-NEXT: vmovapd %xmm2, %xmm5 -; CHECK-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1} +; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vmovapd %xmm2, %xmm4 +; CHECK-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4 {%k1} ; CHECK-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 -; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0 -; CHECK-NEXT: vaddpd %xmm5, %xmm2, %xmm1 -; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddpd %xmm4, %xmm2, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4) %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4) @@ -5097,12 +5098,12 @@ ; CHECK-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm3 {%k1} ; CHECK-NEXT: vmovaps %xmm2, %xmm4 ; CHECK-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm4 -; CHECK-NEXT: vmovaps %xmm2, %xmm5 -; CHECK-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1} +; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vmovaps %xmm2, %xmm4 +; CHECK-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4 {%k1} ; CHECK-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 -; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0 -; CHECK-NEXT: vaddps %xmm5, %xmm2, %xmm1 -; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddps %xmm4, %xmm2, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4) %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) @@ -5125,12 +5126,12 @@ ; CHECK-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm3 {%k1} ; CHECK-NEXT: vmovapd %xmm2, %xmm4 ; CHECK-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm4 -; CHECK-NEXT: vmovapd %xmm2, %xmm5 -; CHECK-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1} +; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vmovapd %xmm2, %xmm4 +; CHECK-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4 {%k1} ; CHECK-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2 -; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0 -; CHECK-NEXT: vaddpd %xmm5, %xmm2, %xmm1 -; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddpd %xmm4, %xmm2, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4) %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4) @@ -5153,12 +5154,12 @@ ; CHECK-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm3 {%k1} ; CHECK-NEXT: vmovaps %xmm2, %xmm4 ; CHECK-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm4 -; CHECK-NEXT: vmovaps %xmm2, %xmm5 -; CHECK-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1} +; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vmovaps %xmm2, %xmm4 +; CHECK-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4 {%k1} ; CHECK-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2 -; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0 -; CHECK-NEXT: vaddps %xmm5, %xmm2, %xmm1 -; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddps %xmm4, %xmm2, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4) %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) Index: test/CodeGen/X86/avx512-mask-spills.ll =================================================================== --- test/CodeGen/X86/avx512-mask-spills.ll +++ test/CodeGen/X86/avx512-mask-spills.ll @@ -9,13 +9,11 @@ ; CHECK-NEXT: Lcfi0: ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vpcmpnleud %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill -; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 +; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 +; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill ; CHECK-NEXT: callq _f ; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload -; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: vpmovm2d %k0, %xmm0 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -34,14 +32,12 @@ ; CHECK-NEXT: Lcfi1: ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vpcmpnleud %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill -; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 +; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k1 +; CHECK-NEXT: korb %k1, %k0, %k0 ; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq _f ; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload -; CHECK-NEXT: korb %k1, %k0, %k0 ; CHECK-NEXT: vpmovm2w %k0, %xmm0 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -60,14 +56,12 @@ ; CHECK-NEXT: Lcfi2: ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill -; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 +; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq _f ; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload -; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload -; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: vpmovm2b %k0, %xmm0 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -85,14 +79,12 @@ ; CHECK-NEXT: Lcfi3: ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 +; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1 +; CHECK-NEXT: kord %k1, %k0, %k0 ; CHECK-NEXT: kmovd %k0, {{[0-9]+}}(%rsp) ## 4-byte Spill -; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, (%rsp) ## 4-byte Spill ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq _f ; CHECK-NEXT: kmovd {{[0-9]+}}(%rsp), %k0 ## 4-byte Reload -; CHECK-NEXT: kmovd (%rsp), %k1 ## 4-byte Reload -; CHECK-NEXT: kord %k1, %k0, %k0 ; CHECK-NEXT: vpmovm2b %k0, %ymm0 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -106,20 +98,18 @@ define <64 x i1> @test_64i1(<64 x i8> %a, <64 x i8> %b) { ; CHECK-LABEL: test_64i1: ; CHECK: ## BB#0: -; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: pushq %rax ; CHECK-NEXT: Lcfi4: -; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Spill -; CHECK-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Spill +; CHECK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1 +; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kmovq %k0, (%rsp) ## 8-byte Spill ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq _f -; CHECK-NEXT: kmovq {{[0-9]+}}(%rsp), %k0 ## 8-byte Reload -; CHECK-NEXT: kmovq {{[0-9]+}}(%rsp), %k1 ## 8-byte Reload -; CHECK-NEXT: korq %k1, %k0, %k0 +; CHECK-NEXT: kmovq (%rsp), %k0 ## 8-byte Reload ; CHECK-NEXT: vpmovm2b %k0, %zmm0 -; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq %cmp_res = icmp ugt <64 x i8> %a, %b Index: test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll =================================================================== --- test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -796,9 +796,9 @@ ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1 -; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_int_x86_avx512_mask_psrl_w_512: @@ -806,9 +806,9 @@ ; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm3 ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm2 ; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1 -; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.psrl.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.mask.psrl.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 -1) @@ -826,8 +826,8 @@ ; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm2 ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; @@ -836,8 +836,8 @@ ; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm2 ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm1 {%k1} -; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z} ; AVX512F-32-NEXT: vpaddw %zmm2, %zmm1, %zmm1 +; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z} ; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3) Index: test/CodeGen/X86/avx512bw-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512bw-intrinsics.ll +++ test/CodeGen/X86/avx512bw-intrinsics.ll @@ -2159,9 +2159,9 @@ ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1} ; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm3 {%k1} {z} +; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1 -; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_int_x86_avx512_mask_dbpsadbw_512: @@ -2169,9 +2169,9 @@ ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1} ; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm3 {%k1} {z} +; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm2 ; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm0 -; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1 -; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> %x3, i32 %x4) %res1 = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> zeroinitializer, i32 %x4) @@ -2411,9 +2411,9 @@ ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1} ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm3 {%k1} {z} +; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1 -; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_int_x86_avx512_mask_permvar_hi_512: @@ -2421,9 +2421,9 @@ ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1} ; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm3 {%k1} {z} +; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm2 ; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1 -; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3) Index: test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll =================================================================== --- test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll +++ test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll @@ -9,8 +9,8 @@ ; CHECK-NEXT: vpbroadcastb %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x78,0xd0] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpbroadcastb %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x78,0xc8] -; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x78,0xc0] ; CHECK-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc9] +; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x78,0xc0] ; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 -1) @@ -29,8 +29,8 @@ ; CHECK-NEXT: vpbroadcastb %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x78,0xd0] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpbroadcastb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x78,0xc8] -; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x78,0xc0] ; CHECK-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc9] +; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x78,0xc0] ; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1) @@ -49,8 +49,8 @@ ; CHECK-NEXT: vpbroadcastw %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0xd0] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpbroadcastw %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x79,0xc8] -; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x79,0xc0] ; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc9] +; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x79,0xc0] ; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 -1) @@ -69,8 +69,8 @@ ; CHECK-NEXT: vpbroadcastw %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0xd0] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpbroadcastw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x79,0xc8] -; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x79,0xc0] ; CHECK-NEXT: vpaddw %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc9] +; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x79,0xc0] ; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1) @@ -89,8 +89,8 @@ ; CHECK-NEXT: vpbroadcastb %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x78,0xd0] ; CHECK-NEXT: kmovq %rdi, %k1 ## encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; CHECK-NEXT: vpbroadcastb %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x78,0xc8] -; CHECK-NEXT: vpbroadcastb %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x78,0xc0] ; CHECK-NEXT: vpaddb %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc9] +; CHECK-NEXT: vpbroadcastb %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x78,0xc0] ; CHECK-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 -1) @@ -109,8 +109,8 @@ ; CHECK-NEXT: vpbroadcastw %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x79,0xd0] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpbroadcastw %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x79,0xc8] -; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x79,0xc0] ; CHECK-NEXT: vpaddw %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc9] +; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x79,0xc0] ; CHECK-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 -1) @@ -1476,9 +1476,9 @@ ; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd1,0xd9] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd1,0xd1] +; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xd3] ; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd1,0xc1] -; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xcb] -; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1] +; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) %res1 = call <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) @@ -1496,9 +1496,9 @@ ; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0xd9] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd1,0xd1] +; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xd3] ; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd1,0xc1] -; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xcb] -; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] +; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.psrl.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3) %res1 = call <16 x i16> @llvm.x86.avx512.mask.psrl.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 -1) @@ -1596,8 +1596,8 @@ ; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x71,0xd0,0x03] ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x71,0xd0,0x03] -; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x71,0xd0,0x03] ; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xca] +; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x71,0xd0,0x03] ; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3) @@ -1616,8 +1616,8 @@ ; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x71,0xd0,0x03] ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x71,0xd0,0x03] -; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x71,0xd0,0x03] ; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xca] +; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x71,0xd0,0x03] ; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.psrl.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3) Index: test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll =================================================================== --- test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll +++ test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll @@ -9,8 +9,8 @@ ; CHECK-NEXT: vplzcntd %xmm0, %xmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vplzcntd %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq %res = call <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) Index: test/CodeGen/X86/avx512cdvl-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512cdvl-intrinsics.ll +++ test/CodeGen/X86/avx512cdvl-intrinsics.ll @@ -7,8 +7,8 @@ ; CHECK-NEXT: vplzcntd %xmm0, %xmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vplzcntd %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq %1 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %x0, i1 false) Index: test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll =================================================================== --- test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll +++ test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll @@ -40,8 +40,8 @@ ; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm1 {%k1} -; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm0 {%k1} {z} ; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm0 {%k1} {z} ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: retq %res = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> %x2, i8 %x3) Index: test/CodeGen/X86/avx512dq-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512dq-intrinsics.ll +++ test/CodeGen/X86/avx512dq-intrinsics.ll @@ -414,8 +414,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] -; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x2.512(<4 x float> %x0, <16 x float> %x2, i16 %x3) @@ -434,8 +434,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] -; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x2.512(<4 x i32> %x0, <16 x i32> %x2, i16 %x3) Index: test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll =================================================================== --- test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll +++ test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll @@ -1568,8 +1568,8 @@ ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x19,0xc1,0x01] -; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc0,0x01] ; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca] +; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc0,0x01] ; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> %x2, i8 %x3) @@ -1588,9 +1588,9 @@ ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x18,0xd1,0x01] +; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xd3] ; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x18,0xc1,0x01] -; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xcb] -; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1] +; CHECK-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 %x4) %res1 = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 -1) @@ -1608,9 +1608,9 @@ ; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x38,0xd1,0x01] +; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xd3] ; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x38,0xc1,0x01] -; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xcb] -; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 %x4) %res1 = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 -1) Index: test/CodeGen/X86/avx512dqvl-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512dqvl-intrinsics.ll +++ test/CodeGen/X86/avx512dqvl-intrinsics.ll @@ -635,8 +635,8 @@ ; CHECK-NEXT: ## ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1] ; CHECK-NEXT: vbroadcastf32x2 %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x19,0xd0] ; CHECK-NEXT: ## ymm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1] -; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xc0] ; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca] +; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xc0] ; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x2.256(<4 x float> %x0, <8 x float> %x2, i8 %x3) @@ -680,8 +680,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x59,0xc8] ; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x59,0xd0] -; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xc0] ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca] +; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xc0] ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.broadcasti32x2.128(<4 x i32> %x0, <4 x i32> %x2, i8 %x3) Index: test/CodeGen/X86/avx512ifma-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512ifma-intrinsics.ll +++ test/CodeGen/X86/avx512ifma-intrinsics.ll @@ -13,8 +13,8 @@ ; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1} ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} -; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0 +; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -41,8 +41,8 @@ ; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1} {z} ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} {z} -; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0 +; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -69,8 +69,8 @@ ; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1} ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1} -; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0 +; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -97,8 +97,8 @@ ; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1} {z} ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1} {z} -; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0 +; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq Index: test/CodeGen/X86/avx512ifmavl-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512ifmavl-intrinsics.ll +++ test/CodeGen/X86/avx512ifmavl-intrinsics.ll @@ -14,8 +14,8 @@ ; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1} ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0 +; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -42,8 +42,8 @@ ; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1} ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z} ; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0 +; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z} ; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -70,8 +70,8 @@ ; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1} {z} ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1} {z} -; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0 +; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -98,8 +98,8 @@ ; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1} {z} ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1} {z} -; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z} ; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0 +; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z} ; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -126,8 +126,8 @@ ; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1} ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0 +; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -154,8 +154,8 @@ ; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1} ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z} ; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0 +; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z} ; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -182,8 +182,8 @@ ; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1} {z} ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1} {z} -; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0 +; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -210,8 +210,8 @@ ; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1} {z} ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1} {z} -; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z} ; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0 +; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z} ; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq Index: test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll =================================================================== --- test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -30,8 +30,8 @@ ; CHECK-NEXT: vpbroadcastd %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0xd0] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpbroadcastd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x58,0xc8] -; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x58,0xc0] ; CHECK-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc9] +; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x58,0xc0] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1) @@ -50,8 +50,8 @@ ; CHECK-NEXT: vpbroadcastq %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0xd0] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpbroadcastq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x59,0xc8] -; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x59,0xc0] ; CHECK-NEXT: vpaddq %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc9] +; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x59,0xc0] ; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> %x1,i8 -1) @@ -70,8 +70,8 @@ ; CHECK-NEXT: vpbroadcastq %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd0] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpbroadcastq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x59,0xc8] -; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x59,0xc0] ; CHECK-NEXT: vpaddq %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc9] +; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x59,0xc0] ; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> %x1,i8 -1) @@ -90,8 +90,8 @@ ; CHECK-NEXT: vbroadcastsd %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xd0] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vbroadcastsd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x19,0xc8] -; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x19,0xc0] ; CHECK-NEXT: vaddpd %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc9] +; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x19,0xc0] ; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> zeroinitializer, i8 -1) @@ -110,8 +110,8 @@ ; CHECK-NEXT: vbroadcastss %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x18,0xd0] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vbroadcastss %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x18,0xc8] -; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x18,0xc0] ; CHECK-NEXT: vaddps %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc9] +; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x18,0xc0] ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> zeroinitializer, i8 -1) @@ -130,8 +130,8 @@ ; CHECK-NEXT: vbroadcastss %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xd0] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vbroadcastss %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x18,0xc8] -; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x18,0xc0] ; CHECK-NEXT: vaddps %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc9] +; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x18,0xc0] ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1) @@ -152,9 +152,9 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovsldup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x12,0xc8] ; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0,0,2,2] +; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] ; CHECK-NEXT: vmovsldup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x12,0xc0] ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0,0,2,2] -; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2) @@ -175,9 +175,9 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovsldup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x12,0xc8] ; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6] +; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca] ; CHECK-NEXT: vmovsldup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x12,0xc0] ; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6] -; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca] ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2) @@ -198,9 +198,9 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovshdup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x16,0xc8] ; CHECK-NEXT: ## xmm1 {%k1} = xmm0[1,1,3,3] +; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] ; CHECK-NEXT: vmovshdup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x16,0xc0] ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[1,1,3,3] -; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2) @@ -221,9 +221,9 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovshdup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x16,0xc8] ; CHECK-NEXT: ## ymm1 {%k1} = ymm0[1,1,3,3,5,5,7,7] +; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca] ; CHECK-NEXT: vmovshdup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x16,0xc0] ; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7] -; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca] ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2) @@ -243,9 +243,9 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovddup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x12,0xc8] ; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0,0] +; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca] ; CHECK-NEXT: vmovddup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x12,0xc0] ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0,0] -; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca] ; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 %x2) @@ -266,9 +266,9 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovddup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x12,0xc8] ; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,0,2,2] +; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xca] ; CHECK-NEXT: vmovddup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0xa9,0x12,0xc0] ; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[0,0,2,2] -; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xca] ; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 %x2) @@ -3209,10 +3209,10 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xc6,0xd1,0x01] ; CHECK-NEXT: ## xmm2 {%k1} = xmm0[1],xmm1[0] +; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xd3] ; CHECK-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xc6,0xc1,0x01] ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[1],xmm1[0] -; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xcb] -; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc1] +; CHECK-NEXT: vaddpd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 1, <2 x double> %x3, i8 %x4) %res1 = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 1, <2 x double> %x3, i8 -1) @@ -3540,9 +3540,9 @@ ; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd3,0xd9] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xd3,0xd1] +; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xd3] ; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xd3,0xc1] -; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xcb] -; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) %res1 = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) @@ -3560,9 +3560,9 @@ ; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd3,0xd9] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xd3,0xd1] +; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xd3] ; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xd3,0xc1] -; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xcb] -; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3) %res1 = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 -1) @@ -3580,9 +3580,9 @@ ; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd2,0xd9] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd2,0xd1] +; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xd3] ; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd2,0xc1] -; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xcb] -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) %res1 = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) @@ -3600,9 +3600,9 @@ ; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd2,0xd9] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd2,0xd1] +; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xd3] ; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd2,0xc1] -; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xcb] -; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] +; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3) %res1 = call <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 -1) @@ -3720,8 +3720,8 @@ ; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x73,0xd0,0x03] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x73,0xd0,0x03] -; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x73,0xd0,0x03] ; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca] +; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x73,0xd0,0x03] ; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3) @@ -3740,8 +3740,8 @@ ; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x73,0xd0,0x03] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x73,0xd0,0x03] -; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x73,0xd0,0x03] ; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca] +; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x73,0xd0,0x03] ; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3) @@ -3760,8 +3760,8 @@ ; CHECK-NEXT: vpsrld $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x72,0xd0,0x03] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpsrld $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xd0,0x03] -; CHECK-NEXT: vpsrld $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x72,0xd0,0x03] ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca] +; CHECK-NEXT: vpsrld $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x72,0xd0,0x03] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3) @@ -3780,8 +3780,8 @@ ; CHECK-NEXT: vpsrld $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x72,0xd0,0x03] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpsrld $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xd0,0x03] -; CHECK-NEXT: vpsrld $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x72,0xd0,0x03] ; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca] +; CHECK-NEXT: vpsrld $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x72,0xd0,0x03] ; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3) @@ -4642,10 +4642,10 @@ ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: valignd $2, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x03,0xd1,0x02] ; CHECK-NEXT: ## xmm2 {%k1} = xmm1[2,3],xmm0[0,1] +; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xd3] ; CHECK-NEXT: valignd $2, %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0x89,0x03,0xc1,0x02] ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm1[2,3],xmm0[0,1] -; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xcb] -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 2, <4 x i32> %x3, i8 %x4) %res1 = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 2, <4 x i32> %x3, i8 -1) @@ -4817,9 +4817,9 @@ ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x18,0xd1,0x01] +; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xd3] ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xc1,0x01] -; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xcb] -; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1] +; CHECK-NEXT: vaddps %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 %x4) %res1 = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 -1) @@ -4837,9 +4837,9 @@ ; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x38,0xd1,0x01] +; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xd3] ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xc1,0x01] -; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xcb] -; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] +; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 %x4) Index: test/CodeGen/X86/avx512vl-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512vl-intrinsics.ll +++ test/CodeGen/X86/avx512vl-intrinsics.ll @@ -4368,8 +4368,8 @@ ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vprold $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xc8,0x03] ; CHECK-NEXT: vprold $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x72,0xc8,0x03] -; CHECK-NEXT: vprold $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc8,0x03] ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca] +; CHECK-NEXT: vprold $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc8,0x03] ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3) @@ -4388,8 +4388,8 @@ ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vprold $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xc8,0x03] ; CHECK-NEXT: vprold $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x72,0xc8,0x03] -; CHECK-NEXT: vprold $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc8,0x03] ; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca] +; CHECK-NEXT: vprold $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc8,0x03] ; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3) @@ -4408,8 +4408,8 @@ ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vprolq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x72,0xc8,0x03] ; CHECK-NEXT: vprolq $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0x89,0x72,0xc8,0x03] -; CHECK-NEXT: vprolq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc8,0x03] ; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca] +; CHECK-NEXT: vprolq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc8,0x03] ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3) @@ -4428,8 +4428,8 @@ ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vprolq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x72,0xc8,0x03] ; CHECK-NEXT: vprolq $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0xa9,0x72,0xc8,0x03] -; CHECK-NEXT: vprolq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc8,0x03] ; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca] +; CHECK-NEXT: vprolq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc8,0x03] ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3) @@ -4528,8 +4528,8 @@ ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vprord $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xc0,0x03] ; CHECK-NEXT: vprord $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x72,0xc0,0x03] -; CHECK-NEXT: vprord $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc0,0x03] ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca] +; CHECK-NEXT: vprord $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc0,0x03] ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3) @@ -4548,8 +4548,8 @@ ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vprord $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xc0,0x03] ; CHECK-NEXT: vprord $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x72,0xc0,0x03] -; CHECK-NEXT: vprord $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc0,0x03] ; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca] +; CHECK-NEXT: vprord $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc0,0x03] ; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3) @@ -4568,8 +4568,8 @@ ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vprorq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x72,0xc0,0x03] ; CHECK-NEXT: vprorq $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0x89,0x72,0xc0,0x03] -; CHECK-NEXT: vprorq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc0,0x03] ; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca] +; CHECK-NEXT: vprorq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc0,0x03] ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3) @@ -4588,8 +4588,8 @@ ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vprorq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x72,0xc0,0x03] ; CHECK-NEXT: vprorq $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0xa9,0x72,0xc0,0x03] -; CHECK-NEXT: vprorq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc0,0x03] ; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca] +; CHECK-NEXT: vprorq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc0,0x03] ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3) @@ -4690,9 +4690,9 @@ ; CHECK-NEXT: vfixupimmpd $5, %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf3,0xf5,0x09,0x54,0xda,0x05] ; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] ; CHECK-NEXT: vfixupimmpd $4, %xmm2, %xmm1, %xmm4 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0x89,0x54,0xe2,0x04] +; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xdc] ; CHECK-NEXT: vfixupimmpd $3, %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf3,0xf5,0x08,0x54,0xc2,0x03] -; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xcc] -; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0] +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> %x0, <2 x double> %x1,<2 x i64> %x2, i32 5, i8 %x4) %res1 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> zeroinitializer, <2 x double> %x1, <2 x i64> %x2, i32 4, i8 %x4) @@ -4732,9 +4732,9 @@ ; CHECK-NEXT: vfixupimmpd $4, %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf3,0xf5,0x29,0x54,0xda,0x04] ; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4] ; CHECK-NEXT: vfixupimmpd $5, %ymm2, %ymm1, %ymm4 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xe2,0x05] +; CHECK-NEXT: vaddpd %ymm4, %ymm3, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xdc] ; CHECK-NEXT: vfixupimmpd $3, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0xf5,0x28,0x54,0xc2,0x03] -; CHECK-NEXT: vaddpd %ymm4, %ymm3, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xcc] -; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0] +; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 4, i8 %x4) %res1 = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> zeroinitializer, <4 x double> %x1, <4 x i64> %x2 , i32 5, i8 %x4) @@ -4755,9 +4755,9 @@ ; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4] ; CHECK-NEXT: vmovapd %ymm0, %ymm5 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xe8] ; CHECK-NEXT: vfixupimmpd $4, %ymm4, %ymm1, %ymm5 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xec,0x04] +; CHECK-NEXT: vaddpd %ymm5, %ymm3, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xdd] ; CHECK-NEXT: vfixupimmpd $3, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0xf5,0x28,0x54,0xc2,0x03] -; CHECK-NEXT: vaddpd %ymm5, %ymm3, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xcd] -; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0] +; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 5, i8 %x4) %res1 = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> zeroinitializer, i32 4, i8 %x4) Index: test/CodeGen/X86/bswap_tree2.ll =================================================================== --- test/CodeGen/X86/bswap_tree2.ll +++ test/CodeGen/X86/bswap_tree2.ll @@ -9,31 +9,32 @@ define i32 @test1(i32 %x) nounwind { ; CHECK-LABEL: test1: ; CHECK: # BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl %ecx, %edx -; CHECK-NEXT: andl $16711680, %edx # imm = 0xFF0000 -; CHECK-NEXT: movl %ecx, %eax -; CHECK-NEXT: orl $-16777216, %eax # imm = 0xFF000000 -; CHECK-NEXT: shll $8, %edx -; CHECK-NEXT: shrl $8, %eax -; CHECK-NEXT: bswapl %ecx -; CHECK-NEXT: shrl $16, %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: andl $16711680, %ecx # imm = 0xFF0000 +; CHECK-NEXT: movl %eax, %edx +; CHECK-NEXT: orl $-16777216, %edx # imm = 0xFF000000 +; CHECK-NEXT: shll $8, %ecx +; CHECK-NEXT: shrl $8, %edx +; CHECK-NEXT: orl %ecx, %edx +; CHECK-NEXT: bswapl %eax +; CHECK-NEXT: shrl $16, %eax ; CHECK-NEXT: orl %edx, %eax -; CHECK-NEXT: orl %ecx, %eax ; CHECK-NEXT: retl ; ; CHECK64-LABEL: test1: ; CHECK64: # BB#0: -; CHECK64-NEXT: movl %edi, %ecx -; CHECK64-NEXT: andl $16711680, %ecx # imm = 0xFF0000 ; CHECK64-NEXT: movl %edi, %eax -; CHECK64-NEXT: orl $-16777216, %eax # imm = 0xFF000000 -; CHECK64-NEXT: shll $8, %ecx -; CHECK64-NEXT: shrl $8, %eax +; CHECK64-NEXT: andl $16711680, %eax # imm = 0xFF0000 +; CHECK64-NEXT: movl %edi, %ecx +; CHECK64-NEXT: orl $-16777216, %ecx # imm = 0xFF000000 +; CHECK64-NEXT: shll $8, %eax +; CHECK64-NEXT: shrl $8, %ecx +; CHECK64-NEXT: orl %eax, %ecx ; CHECK64-NEXT: bswapl %edi ; CHECK64-NEXT: shrl $16, %edi -; CHECK64-NEXT: orl %ecx, %eax -; CHECK64-NEXT: orl %edi, %eax +; CHECK64-NEXT: orl %ecx, %edi +; CHECK64-NEXT: movl %edi, %eax ; CHECK64-NEXT: retq %byte0 = and i32 %x, 255 ; 0x000000ff %byte1 = and i32 %x, 65280 ; 0x0000ff00 Index: test/CodeGen/X86/fold-tied-op.ll =================================================================== --- test/CodeGen/X86/fold-tied-op.ll +++ test/CodeGen/X86/fold-tied-op.ll @@ -6,9 +6,10 @@ target triple = "i386--netbsd" ; CHECK-LABEL: fn1 -; CHECK: addl {{.*#+}} 4-byte Folded Reload -; CHECK: imull {{.*#+}} 4-byte Folded Reload -; CHECK: orl {{.*#+}} 4-byte Folded Reload +; CHECK: orl {{.*#+}} 4-byte Folded Reload +; CHECK: addl {{.*#+}} 4-byte Folded Reload +; CHECK: xorl {{.*#+}} 4-byte Folded Reload +; CHECK: xorl {{.*#+}} 4-byte Folded Reload ; CHECK: retl %struct.XXH_state64_t = type { i32, i32, i64, i64, i64 } Index: test/CodeGen/X86/fp128-i128.ll =================================================================== --- test/CodeGen/X86/fp128-i128.ll +++ test/CodeGen/X86/fp128-i128.ll @@ -50,8 +50,8 @@ ; CHECK-NEXT: andq %rdi, %rcx ; CHECK-NEXT: movabsq $-281474976710656, %rdx # imm = 0xFFFF000000000000 ; CHECK-NEXT: andq -{{[0-9]+}}(%rsp), %rdx -; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: orq %rcx, %rdx +; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; CHECK-NEXT: jmp foo # TAILCALL Index: test/CodeGen/X86/haddsub-2.ll =================================================================== --- test/CodeGen/X86/haddsub-2.ll +++ test/CodeGen/X86/haddsub-2.ll @@ -933,14 +933,14 @@ ; AVX-NEXT: vsubss %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX-NEXT: vsubss %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3] -; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0] -; AVX-NEXT: vsubss %xmm4, %xmm3, %xmm3 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] -; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX-NEXT: vsubss %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3] +; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX-NEXT: vsubss %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] +; AVX-NEXT: vsubss %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0] ; AVX-NEXT: retq %vecext = extractelement <4 x float> %A, i32 2 %vecext1 = extractelement <4 x float> %A, i32 3 Index: test/CodeGen/X86/lrshrink.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/lrshrink.ll @@ -0,0 +1,25 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s + +; Checks if "%7 = add nuw nsw i64 %4, %2" is moved before the last call +; to minimize live-range. + +define i64 @test() { +; CHECK-LABEL: test: +; CHECK: call +; CHECK: call +; CHECK: add +; CHECK: call +; CHECK: add + %1 = tail call i32 @_Z3foov() + %2 = zext i32 %1 to i64 + %3 = tail call i32 @_Z3foov() + %4 = zext i32 %3 to i64 + %5 = tail call i32 @_Z3foov() + %6 = zext i32 %5 to i64 + %7 = add nuw nsw i64 %4, %2 + %8 = add nuw nsw i64 %6, %7 + ret i64 %8 +} + +declare i32 @_Z3foov() Index: test/CodeGen/X86/madd.ll =================================================================== --- test/CodeGen/X86/madd.ll +++ test/CodeGen/X86/madd.ll @@ -129,9 +129,9 @@ ; SSE2-NEXT: pmullw %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; SSE2-NEXT: paddd %xmm3, %xmm1 -; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: addq $16, %rsi ; SSE2-NEXT: addq $16, %rdi ; SSE2-NEXT: addq $-8, %rax @@ -246,23 +246,23 @@ ; SSE2-NEXT: pmullw %xmm4, %xmm5 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; SSE2-NEXT: psrad $16, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] -; SSE2-NEXT: psrad $16, %xmm5 -; SSE2-NEXT: movq {{.*#+}} xmm6 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psraw $8, %xmm6 -; SSE2-NEXT: movq {{.*#+}} xmm7 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psraw $8, %xmm7 -; SSE2-NEXT: pmullw %xmm6, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSE2-NEXT: psrad $16, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] -; SSE2-NEXT: psrad $16, %xmm7 -; SSE2-NEXT: paddd %xmm7, %xmm2 -; SSE2-NEXT: paddd %xmm6, %xmm3 -; SSE2-NEXT: paddd %xmm5, %xmm1 ; SSE2-NEXT: paddd %xmm4, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: paddd %xmm4, %xmm1 +; SSE2-NEXT: movq {{.*#+}} xmm4 = mem[0],zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psraw $8, %xmm4 +; SSE2-NEXT: movq {{.*#+}} xmm5 = mem[0],zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psraw $8, %xmm5 +; SSE2-NEXT: pmullw %xmm4, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: paddd %xmm4, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: paddd %xmm4, %xmm2 ; SSE2-NEXT: addq $16, %rsi ; SSE2-NEXT: addq $16, %rdi ; SSE2-NEXT: addq $-16, %rax Index: test/CodeGen/X86/merge-consecutive-loads-128.ll =================================================================== --- test/CodeGen/X86/merge-consecutive-loads-128.ll +++ test/CodeGen/X86/merge-consecutive-loads-128.ll @@ -270,9 +270,9 @@ ; SSE2: # BB#0: ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: merge_4f32_f32_012u: @@ -292,9 +292,9 @@ ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X32-SSE1-NEXT: retl ; ; X32-SSE41-LABEL: merge_4f32_f32_012u: @@ -321,9 +321,9 @@ ; SSE2: # BB#0: ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: merge_4f32_f32_019u: @@ -343,9 +343,9 @@ ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X32-SSE1-NEXT: retl ; ; X32-SSE41-LABEL: merge_4f32_f32_019u: Index: test/CodeGen/X86/misched-matrix.ll =================================================================== --- test/CodeGen/X86/misched-matrix.ll +++ test/CodeGen/X86/misched-matrix.ll @@ -17,9 +17,9 @@ ; ; TOPDOWN-LABEL: %for.body ; TOPDOWN: movl %{{.*}}, ( -; TOPDOWN: imull {{[0-9]*}}( +; TOPDOWN-NOT: imull {{[0-9]*}}( ; TOPDOWN: movl %{{.*}}, 4( -; TOPDOWN: imull {{[0-9]*}}( +; TOPDOWN-NOT: imull {{[0-9]*}}( ; TOPDOWN: movl %{{.*}}, 8( ; TOPDOWN: movl %{{.*}}, 12( ; TOPDOWN-LABEL: %for.end Index: test/CodeGen/X86/oddshuffles.ll =================================================================== --- test/CodeGen/X86/oddshuffles.ll +++ test/CodeGen/X86/oddshuffles.ll @@ -746,9 +746,9 @@ ; SSE2-LABEL: interleave_24i8_in: ; SSE2: # BB#0: ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] @@ -791,17 +791,17 @@ ; SSE42: # BB#0: ; SSE42-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE42-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE42-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE42-NEXT: movdqa %xmm0, %xmm1 -; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,8],zero,xmm1[1,9],zero,xmm1[2,10],zero,xmm1[3,11],zero,xmm1[4,12],zero,xmm1[5] -; SSE42-NEXT: movdqa %xmm2, %xmm3 +; SSE42-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE42-NEXT: movdqa %xmm0, %xmm2 +; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,8],zero,xmm2[1,9],zero,xmm2[2,10],zero,xmm2[3,11],zero,xmm2[4,12],zero,xmm2[5] +; SSE42-NEXT: movdqa %xmm1, %xmm3 ; SSE42-NEXT: pshufb {{.*#+}} xmm3 = zero,zero,xmm3[0],zero,zero,xmm3[1],zero,zero,xmm3[2],zero,zero,xmm3[3],zero,zero,xmm3[4],zero -; SSE42-NEXT: por %xmm1, %xmm3 +; SSE42-NEXT: por %xmm2, %xmm3 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u] -; SSE42-NEXT: pshufb {{.*#+}} xmm2 = zero,xmm2[5],zero,zero,xmm2[6],zero,zero,xmm2[7,u,u,u,u,u,u,u,u] -; SSE42-NEXT: por %xmm0, %xmm2 -; SSE42-NEXT: movq %xmm2, 16(%rdi) +; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u] +; SSE42-NEXT: por %xmm0, %xmm1 +; SSE42-NEXT: movq %xmm1, 16(%rdi) ; SSE42-NEXT: movdqu %xmm3, (%rdi) ; SSE42-NEXT: retq ; @@ -809,16 +809,16 @@ ; AVX: # BB#0: ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5] -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm2[0],zero,zero,xmm2[1],zero,zero,xmm2[2],zero,zero,xmm2[3],zero,zero,xmm2[4],zero -; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[1],zero,zero,xmm1[2],zero,zero,xmm1[3],zero,zero,xmm1[4],zero +; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[5],zero,zero,xmm2[6],zero,zero,xmm2[7,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovq %xmm0, 16(%rdi) -; AVX-NEXT: vmovdqu %xmm1, (%rdi) +; AVX-NEXT: vmovdqu %xmm2, (%rdi) ; AVX-NEXT: retq %s1 = load <8 x i8>, <8 x i8>* %q1, align 4 %s2 = load <8 x i8>, <8 x i8>* %q2, align 4 Index: test/CodeGen/X86/pmul.ll =================================================================== --- test/CodeGen/X86/pmul.ll +++ test/CodeGen/X86/pmul.ll @@ -1152,9 +1152,9 @@ ; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE2-NEXT: pmuludq %xmm4, %xmm2 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; SSE2-NEXT: pmuludq %xmm0, %xmm1 -; SSE2-NEXT: pmuludq %xmm4, %xmm2 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3] ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq @@ -1166,9 +1166,9 @@ ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: pmuludq %xmm2, %xmm4 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero ; SSE41-NEXT: pmuludq %xmm3, %xmm0 -; SSE41-NEXT: pmuludq %xmm2, %xmm4 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3] ; SSE41-NEXT: retq ; @@ -1312,17 +1312,17 @@ ; SSE2-NEXT: movdqa %xmm1, %xmm5 ; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE2-NEXT: movdqa %xmm2, %xmm8 -; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: movdqa %xmm2, %xmm7 ; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; SSE2-NEXT: pmuludq %xmm1, %xmm3 -; SSE2-NEXT: pmuludq %xmm7, %xmm5 +; SSE2-NEXT: pmuludq %xmm7, %xmm4 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] ; SSE2-NEXT: pmuludq %xmm0, %xmm2 -; SSE2-NEXT: pmuludq %xmm8, %xmm4 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm2[1,3] +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE2-NEXT: pmuludq %xmm0, %xmm5 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; SSE2-NEXT: pmuludq %xmm1, %xmm3 ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm3[1,3] ; SSE2-NEXT: movaps %xmm4, %xmm0 ; SSE2-NEXT: movaps %xmm5, %xmm1 @@ -1331,22 +1331,22 @@ ; SSE41-LABEL: mul_v8i64_zero_upper: ; SSE41: # BB#0: # %entry ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm8 = xmm4[0],zero,xmm4[1],zero +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: pmuludq %xmm4, %xmm1 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero +; SSE41-NEXT: pmuludq %xmm5, %xmm0 +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero +; SSE41-NEXT: pmuludq %xmm6, %xmm2 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero ; SSE41-NEXT: pmuludq %xmm7, %xmm1 -; SSE41-NEXT: pmuludq %xmm6, %xmm2 -; SSE41-NEXT: pmuludq %xmm5, %xmm0 -; SSE41-NEXT: pmuludq %xmm8, %xmm4 -; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3] ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] ; SSE41-NEXT: retq ; @@ -1356,11 +1356,11 @@ ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm1 -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,3],ymm0[1,3],ymm2[5,7],ymm0[5,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: retq ; @@ -1467,22 +1467,22 @@ ; SSE41-LABEL: mul_v8i64_sext: ; SSE41: # BB#0: ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,1,2,3] -; SSE41-NEXT: pmovsxwq %xmm3, %xmm8 +; SSE41-NEXT: pmovsxwq %xmm3, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovsxwq %xmm3, %xmm6 +; SSE41-NEXT: pmovsxwq %xmm3, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3] -; SSE41-NEXT: pmovsxwq %xmm3, %xmm7 -; SSE41-NEXT: pmovsxwq %xmm0, %xmm5 +; SSE41-NEXT: pmovsxwq %xmm3, %xmm6 +; SSE41-NEXT: pmovsxwq %xmm0, %xmm7 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] ; SSE41-NEXT: pmovsxdq %xmm0, %xmm3 +; SSE41-NEXT: pmuldq %xmm4, %xmm3 ; SSE41-NEXT: pmovsxdq %xmm2, %xmm2 +; SSE41-NEXT: pmuldq %xmm5, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE41-NEXT: pmovsxdq %xmm0, %xmm4 +; SSE41-NEXT: pmuldq %xmm6, %xmm4 ; SSE41-NEXT: pmovsxdq %xmm1, %xmm0 -; SSE41-NEXT: pmuldq %xmm5, %xmm0 -; SSE41-NEXT: pmuldq %xmm7, %xmm4 -; SSE41-NEXT: pmuldq %xmm6, %xmm2 -; SSE41-NEXT: pmuldq %xmm8, %xmm3 +; SSE41-NEXT: pmuldq %xmm7, %xmm0 ; SSE41-NEXT: movdqa %xmm4, %xmm1 ; SSE41-NEXT: retq ; @@ -1493,9 +1493,10 @@ ; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3 +; AVX2-NEXT: vpmuldq %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1 ; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpmuldq %ymm3, %ymm2, %ymm1 +; AVX2-NEXT: vmovdqa %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: mul_v8i64_sext: Index: test/CodeGen/X86/pr29112.ll =================================================================== --- test/CodeGen/X86/pr29112.ll +++ test/CodeGen/X86/pr29112.ll @@ -38,7 +38,8 @@ ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[0],xmm0[0],xmm8[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1],xmm1[3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm14 = xmm1[0,1,2],xmm3[1] -; CHECK-NEXT: vinsertps {{.*#+}} xmm10 = xmm10[0,1,2],xmm3[1] +; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm10[0,1,2],xmm3[1] +; CHECK-NEXT: vaddps %xmm14, %xmm1, %xmm10 ; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[1],xmm0[3] @@ -52,10 +53,9 @@ ; CHECK-NEXT: vmovaps %xmm15, %xmm1 ; CHECK-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill ; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm9 -; CHECK-NEXT: vaddps %xmm14, %xmm10, %xmm0 ; CHECK-NEXT: vaddps %xmm1, %xmm1, %xmm8 -; CHECK-NEXT: vaddps %xmm11, %xmm3, %xmm3 -; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: vaddps %xmm11, %xmm3, %xmm0 +; CHECK-NEXT: vaddps %xmm10, %xmm0, %xmm0 ; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: vmovaps %xmm8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: vmovaps %xmm9, (%rsp) Index: test/CodeGen/X86/pr31088.ll =================================================================== --- test/CodeGen/X86/pr31088.ll +++ test/CodeGen/X86/pr31088.ll @@ -150,12 +150,12 @@ ; F16C-NEXT: vcvtph2ps %xmm3, %xmm3 ; F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1 ; F16C-NEXT: vcvtph2ps %xmm1, %xmm1 +; F16C-NEXT: vaddss %xmm3, %xmm1, %xmm1 ; F16C-NEXT: vcvtps2ph $4, %xmm2, %xmm2 ; F16C-NEXT: vcvtph2ps %xmm2, %xmm2 ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; F16C-NEXT: vaddss %xmm3, %xmm1, %xmm1 ; F16C-NEXT: retq %retval = fadd <2 x half> %arg0, %arg1 ret <2 x half> %retval Index: test/CodeGen/X86/rotate.ll =================================================================== --- test/CodeGen/X86/rotate.ll +++ test/CodeGen/X86/rotate.ll @@ -33,8 +33,8 @@ ; 32-NEXT: movl %ebx, %esi ; 32-NEXT: xorl %ebx, %ebx ; 32-NEXT: .LBB0_4: -; 32-NEXT: orl %esi, %eax ; 32-NEXT: orl %ebx, %edx +; 32-NEXT: orl %esi, %eax ; 32-NEXT: popl %esi ; 32-NEXT: popl %edi ; 32-NEXT: popl %ebx @@ -86,8 +86,8 @@ ; 32-NEXT: movl %ebx, %esi ; 32-NEXT: xorl %ebx, %ebx ; 32-NEXT: .LBB1_4: -; 32-NEXT: orl %ebx, %eax ; 32-NEXT: orl %esi, %edx +; 32-NEXT: orl %ebx, %eax ; 32-NEXT: popl %esi ; 32-NEXT: popl %edi ; 32-NEXT: popl %ebx @@ -546,7 +546,7 @@ ; 32-LABEL: rotr1_64_mem: ; 32: # BB#0: ; 32-NEXT: pushl %esi -; 32-NEXT: movl 8(%esp), %eax +; 32-NEXT: movl {{[0-9]+}}(%esp), %eax ; 32-NEXT: movl (%eax), %ecx ; 32-NEXT: movl 4(%eax), %edx ; 32-NEXT: movl %edx, %esi @@ -555,11 +555,13 @@ ; 32-NEXT: movl %ecx, 4(%eax) ; 32-NEXT: movl %esi, (%eax) ; 32-NEXT: popl %esi - +; 32-NEXT: retl +; ; 64-LABEL: rotr1_64_mem: ; 64: # BB#0: ; 64-NEXT: rorq (%rdi) ; 64-NEXT: retq + %A = load i64, i64 *%Aptr %B = shl i64 %A, 63 %C = lshr i64 %A, 1 @@ -571,7 +573,7 @@ define void @rotr1_32_mem(i32* %Aptr) nounwind { ; 32-LABEL: rotr1_32_mem: ; 32: # BB#0: -; 32-NEXT: movl 4(%esp), %eax +; 32-NEXT: movl {{[0-9]+}}(%esp), %eax ; 32-NEXT: rorl (%eax) ; 32-NEXT: retl ; @@ -590,7 +592,7 @@ define void @rotr1_16_mem(i16* %Aptr) nounwind { ; 32-LABEL: rotr1_16_mem: ; 32: # BB#0: -; 32-NEXT: movl 4(%esp), %eax +; 32-NEXT: movl {{[0-9]+}}(%esp), %eax ; 32-NEXT: rorw (%eax) ; 32-NEXT: retl ; @@ -609,7 +611,7 @@ define void @rotr1_8_mem(i8* %Aptr) nounwind { ; 32-LABEL: rotr1_8_mem: ; 32: # BB#0: -; 32-NEXT: movl 4(%esp), %eax +; 32-NEXT: movl {{[0-9]+}}(%esp), %eax ; 32-NEXT: rorb (%eax) ; 32-NEXT: retl ; Index: test/CodeGen/X86/sad.ll =================================================================== --- test/CodeGen/X86/sad.ll +++ test/CodeGen/X86/sad.ll @@ -149,127 +149,131 @@ define i32 @sad_32i8() nounwind { ; SSE2-LABEL: sad_32i8: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: pxor %xmm11, %xmm11 -; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm12, %xmm12 -; SSE2-NEXT: pxor %xmm15, %xmm15 +; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 ; SSE2-NEXT: pxor %xmm13, %xmm13 +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pxor %xmm14, %xmm14 +; SSE2-NEXT: pxor %xmm15, %xmm15 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB1_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa a+1040(%rax), %xmm6 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa a+1040(%rax), %xmm8 ; SSE2-NEXT: movdqa a+1024(%rax), %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm8 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3],xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm11[8],xmm3[9],xmm11[9],xmm3[10],xmm11[10],xmm3[11],xmm11[11],xmm3[12],xmm11[12],xmm3[13],xmm11[13],xmm3[14],xmm11[14],xmm3[15],xmm11[15] -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] -; SSE2-NEXT: movdqa %xmm6, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3],xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm11[8],xmm6[9],xmm11[9],xmm6[10],xmm11[10],xmm6[11],xmm11[11],xmm6[12],xmm11[12],xmm6[13],xmm11[13],xmm6[14],xmm11[14],xmm6[15],xmm11[15] -; SSE2-NEXT: movdqa %xmm6, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm12[8],xmm3[9],xmm12[9],xmm3[10],xmm12[10],xmm3[11],xmm12[11],xmm3[12],xmm12[12],xmm3[13],xmm12[13],xmm3[14],xmm12[14],xmm3[15],xmm12[15] +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] +; SSE2-NEXT: movdqa %xmm8, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15] +; SSE2-NEXT: movdqa b+1024(%rax), %xmm11 +; SSE2-NEXT: movdqa %xmm11, %xmm10 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] +; SSE2-NEXT: movdqa %xmm10, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; SSE2-NEXT: psubd %xmm2, %xmm7 ; SSE2-NEXT: movdqa b+1040(%rax), %xmm9 -; SSE2-NEXT: movdqa %xmm9, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm11[8],xmm9[9],xmm11[9],xmm9[10],xmm11[10],xmm9[11],xmm11[11],xmm9[12],xmm11[12],xmm9[13],xmm11[13],xmm9[14],xmm11[14],xmm9[15],xmm11[15] -; SSE2-NEXT: movdqa %xmm9, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] -; SSE2-NEXT: psubd %xmm9, %xmm6 -; SSE2-NEXT: movdqa b+1024(%rax), %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; SSE2-NEXT: psubd %xmm10, %xmm7 -; SSE2-NEXT: movdqa %xmm2, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] +; SSE2-NEXT: psubd %xmm10, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] +; SSE2-NEXT: movdqa %xmm11, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] ; SSE2-NEXT: psubd %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm11[8],xmm4[9],xmm11[9],xmm4[10],xmm11[10],xmm4[11],xmm11[11],xmm4[12],xmm11[12],xmm4[13],xmm11[13],xmm4[14],xmm11[14],xmm4[15],xmm11[15] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; SSE2-NEXT: psubd %xmm9, %xmm0 -; SSE2-NEXT: movdqa %xmm4, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] -; SSE2-NEXT: psubd %xmm4, %xmm3 -; SSE2-NEXT: movdqa %xmm8, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; SSE2-NEXT: psubd %xmm9, %xmm5 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] -; SSE2-NEXT: psubd %xmm2, %xmm8 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3] -; SSE2-NEXT: psubd %xmm4, %xmm10 -; SSE2-NEXT: movdqa %xmm10, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm10 -; SSE2-NEXT: pxor %xmm2, %xmm10 -; SSE2-NEXT: movdqa %xmm8, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm8 -; SSE2-NEXT: pxor %xmm2, %xmm8 -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm5 -; SSE2-NEXT: pxor %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm7, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm7 -; SSE2-NEXT: pxor %xmm2, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; SSE2-NEXT: psubd %xmm11, %xmm3 +; SSE2-NEXT: movdqa %xmm6, %xmm10 +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] ; SSE2-NEXT: movdqa %xmm6, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm2, %xmm6 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: paddd %xmm6, %xmm14 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; SSE2-NEXT: psubd %xmm2, %xmm5 +; SSE2-NEXT: movdqa %xmm8, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] +; SSE2-NEXT: psubd %xmm6, %xmm0 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15] +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] +; SSE2-NEXT: psubd %xmm6, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] +; SSE2-NEXT: psubd %xmm9, %xmm8 +; SSE2-NEXT: movdqa %xmm7, %xmm6 +; SSE2-NEXT: psrad $31, %xmm6 +; SSE2-NEXT: paddd %xmm6, %xmm7 +; SSE2-NEXT: pxor %xmm6, %xmm7 ; SSE2-NEXT: paddd %xmm7, %xmm13 -; SSE2-NEXT: paddd %xmm1, %xmm15 +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: psrad $31, %xmm6 +; SSE2-NEXT: paddd %xmm6, %xmm4 +; SSE2-NEXT: pxor %xmm6, %xmm4 +; SSE2-NEXT: movdqa %xmm10, %xmm6 +; SSE2-NEXT: paddd %xmm4, %xmm6 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: paddd %xmm4, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload +; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm3 +; SSE2-NEXT: pxor %xmm1, %xmm3 ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm0, %xmm12 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload ; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT: paddd %xmm5, %xmm2 -; SSE2-NEXT: paddd %xmm8, %xmm3 -; SSE2-NEXT: paddd %xmm10, %xmm0 +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm5 +; SSE2-NEXT: pxor %xmm1, %xmm5 +; SSE2-NEXT: paddd %xmm5, %xmm14 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm0, %xmm15 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: paddd %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm8, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm8 +; SSE2-NEXT: pxor %xmm0, %xmm8 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: paddd %xmm8, %xmm0 ; SSE2-NEXT: addq $4, %rax ; SSE2-NEXT: jne .LBB1_1 ; SSE2-NEXT: # BB#2: # %middle.block -; SSE2-NEXT: paddd %xmm15, %xmm3 -; SSE2-NEXT: paddd %xmm14, %xmm1 -; SSE2-NEXT: paddd %xmm12, %xmm0 -; SSE2-NEXT: paddd %xmm13, %xmm2 -; SSE2-NEXT: paddd %xmm3, %xmm1 -; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: paddd %xmm15, %xmm6 +; SSE2-NEXT: paddd %xmm0, %xmm3 +; SSE2-NEXT: paddd %xmm6, %xmm3 +; SSE2-NEXT: paddd %xmm14, %xmm13 +; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: paddd %xmm3, %xmm4 +; SSE2-NEXT: paddd %xmm13, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; SSE2-NEXT: paddd %xmm4, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax @@ -398,288 +402,284 @@ define i32 @sad_avx64i8() nounwind { ; SSE2-LABEL: sad_avx64i8: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: subq $184, %rsp -; SSE2-NEXT: pxor %xmm15, %xmm15 +; SSE2-NEXT: subq $200, %rsp +; SSE2-NEXT: pxor %xmm14, %xmm14 ; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 -; SSE2-NEXT: pxor %xmm12, %xmm12 +; SSE2-NEXT: pxor %xmm15, %xmm15 +; SSE2-NEXT: pxor %xmm10, %xmm10 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pxor %xmm13, %xmm13 +; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: pxor %xmm14, %xmm14 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: pxor %xmm11, %xmm11 ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm11, %xmm11 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: pxor %xmm13, %xmm13 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB2_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm13, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm11, (%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm7, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm8, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm11, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm5, {{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm14, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm6, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm8, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm12, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa a+1040(%rax), %xmm6 -; SSE2-NEXT: movdqa a+1024(%rax), %xmm4 -; SSE2-NEXT: movdqa a+1056(%rax), %xmm11 -; SSE2-NEXT: movdqa a+1072(%rax), %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] -; SSE2-NEXT: movdqa %xmm11, %xmm1 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm15[8],xmm1[9],xmm15[9],xmm1[10],xmm15[10],xmm1[11],xmm15[11],xmm1[12],xmm15[12],xmm1[13],xmm15[13],xmm1[14],xmm15[14],xmm1[15],xmm15[15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3],xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7] -; SSE2-NEXT: movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3] -; SSE2-NEXT: movdqa %xmm4, %xmm12 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3],xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] -; SSE2-NEXT: movdqa %xmm12, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm15[8],xmm4[9],xmm15[9],xmm4[10],xmm15[10],xmm4[11],xmm15[11],xmm4[12],xmm15[12],xmm4[13],xmm15[13],xmm4[14],xmm15[14],xmm4[15],xmm15[15] -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7] -; SSE2-NEXT: movdqa %xmm6, %xmm14 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3],xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] -; SSE2-NEXT: movdqa %xmm14, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm15[8],xmm6[9],xmm15[9],xmm6[10],xmm15[10],xmm6[11],xmm15[11],xmm6[12],xmm15[12],xmm6[13],xmm15[13],xmm6[14],xmm15[14],xmm6[15],xmm15[15] -; SSE2-NEXT: movdqa %xmm6, %xmm8 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7] -; SSE2-NEXT: movdqa b+1040(%rax), %xmm9 -; SSE2-NEXT: movdqa %xmm9, %xmm13 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm15[8],xmm9[9],xmm15[9],xmm9[10],xmm15[10],xmm9[11],xmm15[11],xmm9[12],xmm15[12],xmm9[13],xmm15[13],xmm9[14],xmm15[14],xmm9[15],xmm15[15] -; SSE2-NEXT: movdqa %xmm9, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] -; SSE2-NEXT: psubd %xmm9, %xmm6 -; SSE2-NEXT: movdqa b+1024(%rax), %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3] -; SSE2-NEXT: psubd %xmm10, %xmm8 -; SSE2-NEXT: movdqa %xmm13, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] -; SSE2-NEXT: psubd %xmm13, %xmm14 -; SSE2-NEXT: movdqa %xmm2, %xmm10 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] -; SSE2-NEXT: psubd %xmm9, %xmm7 -; SSE2-NEXT: movdqa %xmm2, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE2-NEXT: psubd %xmm2, %xmm4 -; SSE2-NEXT: movdqa b+1056(%rax), %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3],xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] -; SSE2-NEXT: psubd %xmm9, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm10, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7] -; SSE2-NEXT: psubd %xmm10, %xmm12 -; SSE2-NEXT: movdqa %xmm2, %xmm10 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; SSE2-NEXT: psubd %xmm9, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm2, %xmm9 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] -; SSE2-NEXT: psubd %xmm2, %xmm11 -; SSE2-NEXT: movdqa %xmm1, %xmm13 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm15[8],xmm10[9],xmm15[9],xmm10[10],xmm15[10],xmm10[11],xmm15[11],xmm10[12],xmm15[12],xmm10[13],xmm15[13],xmm10[14],xmm15[14],xmm10[15],xmm15[15] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] -; SSE2-NEXT: psubd %xmm9, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm10, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3] -; SSE2-NEXT: psubd %xmm10, %xmm1 -; SSE2-NEXT: movdqa %xmm3, %xmm10 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE2-NEXT: psubd %xmm2, %xmm13 -; SSE2-NEXT: movdqa b+1072(%rax), %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE2-NEXT: movdqa %xmm2, %xmm9 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] -; SSE2-NEXT: psubd %xmm2, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] -; SSE2-NEXT: psubd %xmm9, %xmm10 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm15[8],xmm5[9],xmm15[9],xmm5[10],xmm15[10],xmm5[11],xmm15[11],xmm5[12],xmm15[12],xmm5[13],xmm15[13],xmm5[14],xmm15[14],xmm5[15],xmm15[15] -; SSE2-NEXT: movdqa %xmm5, %xmm9 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] +; SSE2-NEXT: movdqa %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm10, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm15, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movaps a+1040(%rax), %xmm0 +; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa a+1024(%rax), %xmm12 +; SSE2-NEXT: movdqa a+1056(%rax), %xmm15 +; SSE2-NEXT: movdqa a+1072(%rax), %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7] +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3] +; SSE2-NEXT: movdqa %xmm15, %xmm11 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm14[8],xmm11[9],xmm14[9],xmm11[10],xmm14[10],xmm11[11],xmm14[11],xmm11[12],xmm14[12],xmm11[13],xmm14[13],xmm11[14],xmm14[14],xmm11[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm11, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; SSE2-NEXT: movdqa %xmm15, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; SSE2-NEXT: psubd %xmm0, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE2-NEXT: psubd %xmm2, %xmm9 -; SSE2-NEXT: movdqa %xmm9, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm9 -; SSE2-NEXT: pxor %xmm0, %xmm9 -; SSE2-NEXT: movdqa %xmm5, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm5 -; SSE2-NEXT: pxor %xmm0, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; SSE2-NEXT: movdqa %xmm12, %xmm10 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3],xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] ; SSE2-NEXT: movdqa %xmm10, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm10 -; SSE2-NEXT: pxor %xmm0, %xmm10 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm13, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm13 -; SSE2-NEXT: pxor %xmm0, %xmm13 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm11, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm11 -; SSE2-NEXT: pxor %xmm0, %xmm11 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE2-NEXT: movdqa %xmm0, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15] ; SSE2-NEXT: movdqa %xmm12, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm12 -; SSE2-NEXT: pxor %xmm0, %xmm12 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE2-NEXT: movdqa %xmm0, %xmm13 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] +; SSE2-NEXT: movdqa b+1072(%rax), %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm14[8],xmm7[9],xmm14[9],xmm7[10],xmm14[10],xmm7[11],xmm14[11],xmm7[12],xmm14[12],xmm7[13],xmm14[13],xmm7[14],xmm14[14],xmm7[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm7, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; SSE2-NEXT: psubd %xmm0, %xmm1 +; SSE2-NEXT: movdqa b+1056(%rax), %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] +; SSE2-NEXT: psubd %xmm7, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7] +; SSE2-NEXT: psubd %xmm7, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] +; SSE2-NEXT: psubd %xmm3, %xmm4 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm14[8],xmm3[9],xmm14[9],xmm3[10],xmm14[10],xmm3[11],xmm14[11],xmm3[12],xmm14[12],xmm3[13],xmm14[13],xmm3[14],xmm14[14],xmm3[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7] +; SSE2-NEXT: psubd %xmm7, %xmm8 +; SSE2-NEXT: movdqa b+1024(%rax), %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] +; SSE2-NEXT: psubd %xmm3, %xmm11 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] +; SSE2-NEXT: psubd %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE2-NEXT: psubd %xmm0, %xmm15 +; SSE2-NEXT: movdqa %xmm7, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] +; SSE2-NEXT: psubd %xmm3, %xmm9 +; SSE2-NEXT: movdqa %xmm9, {{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm9 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3],xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; SSE2-NEXT: psubd %xmm0, %xmm10 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm14[8],xmm7[9],xmm14[9],xmm7[10],xmm14[10],xmm7[11],xmm14[11],xmm7[12],xmm14[12],xmm7[13],xmm14[13],xmm7[14],xmm14[14],xmm7[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm7, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE2-NEXT: psubd %xmm0, %xmm13 +; SSE2-NEXT: movdqa %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm9, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7] +; SSE2-NEXT: psubd %xmm7, %xmm12 +; SSE2-NEXT: movdqa b+1040(%rax), %xmm13 +; SSE2-NEXT: movdqa %xmm13, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] +; SSE2-NEXT: psubd %xmm7, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] +; SSE2-NEXT: psubd %xmm3, %xmm9 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm2, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm14[8],xmm13[9],xmm14[9],xmm13[10],xmm14[10],xmm13[11],xmm14[11],xmm13[12],xmm14[12],xmm13[13],xmm14[13],xmm13[14],xmm14[14],xmm13[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm13, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] +; SSE2-NEXT: psubd %xmm3, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] +; SSE2-NEXT: psubd %xmm13, %xmm2 ; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: psrad $31, %xmm3 +; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm1 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload +; SSE2-NEXT: paddd %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm6 +; SSE2-NEXT: pxor %xmm1, %xmm6 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm6, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm5 +; SSE2-NEXT: pxor %xmm1, %xmm5 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm5, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm8, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm8 +; SSE2-NEXT: pxor %xmm1, %xmm8 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm11, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm11 +; SSE2-NEXT: pxor %xmm1, %xmm11 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm11, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload +; SSE2-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: paddd %xmm4, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm11 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm15, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm15 +; SSE2-NEXT: pxor %xmm1, %xmm15 +; SSE2-NEXT: paddd %xmm15, %xmm2 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: paddd %xmm4, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm15 +; SSE2-NEXT: movdqa %xmm10, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm10 +; SSE2-NEXT: pxor %xmm1, %xmm10 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm10, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm6 +; SSE2-NEXT: pxor %xmm1, %xmm6 +; SSE2-NEXT: paddd %xmm6, %xmm3 +; SSE2-NEXT: movdqa %xmm12, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm12 +; SSE2-NEXT: pxor %xmm1, %xmm12 +; SSE2-NEXT: paddd %xmm12, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm0, %xmm13 +; SSE2-NEXT: movdqa %xmm9, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm0, %xmm4 +; SSE2-NEXT: paddd %xmm0, %xmm9 +; SSE2-NEXT: pxor %xmm0, %xmm9 +; SSE2-NEXT: paddd %xmm9, %xmm1 ; SSE2-NEXT: movdqa %xmm7, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: paddd %xmm0, %xmm7 ; SSE2-NEXT: pxor %xmm0, %xmm7 -; SSE2-NEXT: movdqa %xmm14, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm14 -; SSE2-NEXT: pxor %xmm0, %xmm14 -; SSE2-NEXT: movdqa %xmm8, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm8 -; SSE2-NEXT: pxor %xmm0, %xmm8 -; SSE2-NEXT: movdqa %xmm6, %xmm0 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: paddd %xmm7, %xmm0 +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm7, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm6 -; SSE2-NEXT: pxor %xmm0, %xmm6 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: paddd %xmm6, %xmm2 -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload -; SSE2-NEXT: paddd %xmm8, %xmm6 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: paddd %xmm14, %xmm2 -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: paddd %xmm7, %xmm2 -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm14 # 16-byte Reload -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: paddd %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Folded Reload -; SSE2-NEXT: paddd %xmm12, %xmm8 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload -; SSE2-NEXT: movdqa %xmm0, %xmm12 -; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd %xmm11, %xmm0 -; SSE2-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa (%rsp), %xmm11 # 16-byte Reload -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload +; SSE2-NEXT: paddd %xmm0, %xmm7 +; SSE2-NEXT: pxor %xmm0, %xmm7 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload -; SSE2-NEXT: paddd %xmm1, %xmm2 -; SSE2-NEXT: paddd %xmm13, %xmm7 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm13 # 16-byte Reload -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm10, %xmm1 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT: paddd %xmm5, %xmm3 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload -; SSE2-NEXT: paddd %xmm9, %xmm5 -; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload +; SSE2-NEXT: paddd %xmm7, %xmm0 ; SSE2-NEXT: addq $4, %rax ; SSE2-NEXT: jne .LBB2_1 ; SSE2-NEXT: # BB#2: # %middle.block -; SSE2-NEXT: paddd %xmm2, %xmm4 -; SSE2-NEXT: paddd %xmm3, %xmm6 -; SSE2-NEXT: movdqa %xmm12, %xmm2 -; SSE2-NEXT: paddd %xmm11, %xmm2 -; SSE2-NEXT: paddd %xmm13, %xmm14 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT: paddd %xmm7, %xmm3 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload -; SSE2-NEXT: paddd %xmm5, %xmm7 -; SSE2-NEXT: paddd %xmm0, %xmm8 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm3, %xmm8 +; SSE2-NEXT: paddd %xmm2, %xmm15 +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm13 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm8, %xmm13 +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm5, %xmm0 +; SSE2-NEXT: paddd %xmm11, %xmm10 +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: paddd %xmm10, %xmm1 +; SSE2-NEXT: paddd %xmm13, %xmm1 +; SSE2-NEXT: paddd %xmm15, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: paddd %xmm3, %xmm7 -; SSE2-NEXT: paddd %xmm4, %xmm6 -; SSE2-NEXT: paddd %xmm14, %xmm6 -; SSE2-NEXT: paddd %xmm0, %xmm7 -; SSE2-NEXT: paddd %xmm8, %xmm7 -; SSE2-NEXT: paddd %xmm6, %xmm7 -; SSE2-NEXT: paddd %xmm2, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,0,1] -; SSE2-NEXT: paddd %xmm7, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: addq $184, %rsp +; SSE2-NEXT: addq $200, %rsp ; SSE2-NEXT: retq ; ; AVX2-LABEL: sad_avx64i8: @@ -688,8 +688,8 @@ ; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00 ; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpxor %ymm3, %ymm3, %ymm3 ; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX2-NEXT: vpxor %ymm3, %ymm3, %ymm3 ; AVX2-NEXT: vpxor %ymm6, %ymm6, %ymm6 ; AVX2-NEXT: vpxor %ymm5, %ymm5, %ymm5 ; AVX2-NEXT: vpxor %ymm7, %ymm7, %ymm7 @@ -697,7 +697,6 @@ ; AVX2-NEXT: .LBB2_1: # %vector.body ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vmovdqu %ymm8, -{{[0-9]+}}(%rsp) # 32-byte Spill ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero @@ -705,48 +704,49 @@ ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm8, %ymm15, %ymm8 +; AVX2-NEXT: vmovdqu %ymm15, -{{[0-9]+}}(%rsp) # 32-byte Spill ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm14, %ymm14 +; AVX2-NEXT: vpsubd %ymm15, %ymm8, %ymm8 +; AVX2-NEXT: vmovdqu %ymm8, -{{[0-9]+}}(%rsp) # 32-byte Spill ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm13, %ymm13 +; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm9 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm12, %ymm12 +; AVX2-NEXT: vpsubd %ymm15, %ymm10, %ymm10 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpsubd %ymm15, %ymm11, %ymm11 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm10, %ymm10 +; AVX2-NEXT: vpsubd %ymm15, %ymm12, %ymm12 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm9 -; AVX2-NEXT: vmovdqu %ymm9, -{{[0-9]+}}(%rsp) # 32-byte Spill +; AVX2-NEXT: vpsubd %ymm15, %ymm13, %ymm13 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm15, %ymm14, %ymm14 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vmovdqu -{{[0-9]+}}(%rsp), %ymm9 # 32-byte Reload -; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm15 -; AVX2-NEXT: vpabsd %ymm8, %ymm8 +; AVX2-NEXT: vmovdqu -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Reload +; AVX2-NEXT: vpsubd %ymm15, %ymm8, %ymm15 +; AVX2-NEXT: vpabsd -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Folded Reload +; AVX2-NEXT: vpaddd %ymm7, %ymm8, %ymm7 +; AVX2-NEXT: vpabsd %ymm9, %ymm8 +; AVX2-NEXT: vpaddd %ymm5, %ymm8, %ymm5 +; AVX2-NEXT: vpabsd %ymm10, %ymm8 +; AVX2-NEXT: vpaddd %ymm6, %ymm8, %ymm6 +; AVX2-NEXT: vpabsd %ymm11, %ymm8 ; AVX2-NEXT: vpaddd %ymm3, %ymm8, %ymm3 -; AVX2-NEXT: vpabsd %ymm14, %ymm8 -; AVX2-NEXT: vpaddd %ymm1, %ymm8, %ymm1 -; AVX2-NEXT: vpabsd %ymm13, %ymm8 -; AVX2-NEXT: vpaddd %ymm2, %ymm8, %ymm2 ; AVX2-NEXT: vpabsd %ymm12, %ymm8 ; AVX2-NEXT: vpaddd %ymm0, %ymm8, %ymm0 -; AVX2-NEXT: vpabsd %ymm11, %ymm8 -; AVX2-NEXT: vpaddd %ymm4, %ymm8, %ymm4 -; AVX2-NEXT: vpabsd %ymm10, %ymm8 -; AVX2-NEXT: vpaddd %ymm6, %ymm8, %ymm6 -; AVX2-NEXT: vpabsd -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Folded Reload -; AVX2-NEXT: vpaddd %ymm5, %ymm8, %ymm5 +; AVX2-NEXT: vpabsd %ymm13, %ymm8 +; AVX2-NEXT: vpaddd %ymm2, %ymm8, %ymm2 +; AVX2-NEXT: vpabsd %ymm14, %ymm8 +; AVX2-NEXT: vpaddd %ymm1, %ymm8, %ymm1 ; AVX2-NEXT: vpabsd %ymm15, %ymm8 -; AVX2-NEXT: vpaddd %ymm7, %ymm8, %ymm7 +; AVX2-NEXT: vpaddd %ymm4, %ymm8, %ymm4 ; AVX2-NEXT: addq $4, %rax ; AVX2-NEXT: jne .LBB2_1 ; AVX2-NEXT: # BB#2: # %middle.block ; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm7, %ymm4, %ymm4 +; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpaddd %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1 -; AVX2-NEXT: vpaddd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -773,21 +773,21 @@ ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpsubd %zmm11, %zmm7, %zmm7 -; AVX512F-NEXT: vpsubd %zmm10, %zmm6, %zmm6 -; AVX512F-NEXT: vpsubd %zmm9, %zmm5, %zmm5 ; AVX512F-NEXT: vpsubd %zmm8, %zmm4, %zmm4 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpsubd %zmm8, %zmm5, %zmm5 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpsubd %zmm8, %zmm6, %zmm6 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpsubd %zmm8, %zmm7, %zmm7 ; AVX512F-NEXT: vpabsd %zmm4, %zmm4 -; AVX512F-NEXT: vpabsd %zmm5, %zmm5 -; AVX512F-NEXT: vpabsd %zmm6, %zmm6 -; AVX512F-NEXT: vpabsd %zmm7, %zmm7 -; AVX512F-NEXT: vpaddd %zmm3, %zmm7, %zmm3 -; AVX512F-NEXT: vpaddd %zmm2, %zmm6, %zmm2 -; AVX512F-NEXT: vpaddd %zmm1, %zmm5, %zmm1 ; AVX512F-NEXT: vpaddd %zmm0, %zmm4, %zmm0 +; AVX512F-NEXT: vpabsd %zmm5, %zmm4 +; AVX512F-NEXT: vpaddd %zmm1, %zmm4, %zmm1 +; AVX512F-NEXT: vpabsd %zmm6, %zmm4 +; AVX512F-NEXT: vpaddd %zmm2, %zmm4, %zmm2 +; AVX512F-NEXT: vpabsd %zmm7, %zmm4 +; AVX512F-NEXT: vpaddd %zmm3, %zmm4, %zmm3 ; AVX512F-NEXT: addq $4, %rax ; AVX512F-NEXT: jne .LBB2_1 ; AVX512F-NEXT: # BB#2: # %middle.block @@ -1154,59 +1154,54 @@ ; SSE2-LABEL: sad_nonloop_32i8: ; SSE2: # BB#0: ; SSE2-NEXT: movdqu (%rdi), %xmm0 -; SSE2-NEXT: movdqu 16(%rdi), %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm12 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3],xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm12, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm0, %xmm13 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3],xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm13, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm3, %xmm11 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE2-NEXT: movdqu (%rdx), %xmm5 -; SSE2-NEXT: movdqu 16(%rdx), %xmm7 -; SSE2-NEXT: movdqa %xmm7, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm2, %xmm14 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm7, %xmm15 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm4[4],xmm15[5],xmm4[5],xmm15[6],xmm4[6],xmm15[7],xmm4[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm5, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE2-NEXT: psubd %xmm5, %xmm0 -; SSE2-NEXT: psubd %xmm7, %xmm3 -; SSE2-NEXT: psubd %xmm2, %xmm13 -; SSE2-NEXT: psubd %xmm1, %xmm12 -; SSE2-NEXT: psubd %xmm8, %xmm6 -; SSE2-NEXT: psubd %xmm15, %xmm11 -; SSE2-NEXT: psubd %xmm14, %xmm10 -; SSE2-NEXT: psubd -{{[0-9]+}}(%rsp), %xmm9 # 16-byte Folded Reload -; SSE2-NEXT: movdqa %xmm9, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm9 -; SSE2-NEXT: pxor %xmm1, %xmm9 +; SSE2-NEXT: movdqu 16(%rdi), %xmm12 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm12, %xmm8 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm8, %xmm10 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm0, %xmm9 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm9, %xmm11 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm1[8],xmm12[9],xmm1[9],xmm12[10],xmm1[10],xmm12[11],xmm1[11],xmm12[12],xmm1[12],xmm12[13],xmm1[13],xmm12[14],xmm1[14],xmm12[15],xmm1[15] +; SSE2-NEXT: movdqa %xmm12, %xmm13 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: movdqu (%rdx), %xmm7 +; SSE2-NEXT: movdqu 16(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm6, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE2-NEXT: psubd %xmm5, %xmm10 +; SSE2-NEXT: movdqa %xmm7, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE2-NEXT: psubd %xmm5, %xmm11 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE2-NEXT: psubd %xmm5, %xmm13 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15] +; SSE2-NEXT: movdqa %xmm7, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE2-NEXT: psubd %xmm5, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE2-NEXT: psubd %xmm6, %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psubd %xmm2, %xmm9 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE2-NEXT: psubd %xmm3, %xmm12 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; SSE2-NEXT: psubd %xmm7, %xmm0 ; SSE2-NEXT: movdqa %xmm10, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm10 @@ -1215,33 +1210,37 @@ ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm11 ; SSE2-NEXT: pxor %xmm1, %xmm11 -; SSE2-NEXT: movdqa %xmm6, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm6 -; SSE2-NEXT: pxor %xmm1, %xmm6 -; SSE2-NEXT: movdqa %xmm12, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm12 -; SSE2-NEXT: pxor %xmm1, %xmm12 ; SSE2-NEXT: movdqa %xmm13, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm13 ; SSE2-NEXT: pxor %xmm1, %xmm13 -; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm4, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm3 -; SSE2-NEXT: pxor %xmm1, %xmm3 +; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: paddd %xmm13, %xmm4 +; SSE2-NEXT: paddd %xmm10, %xmm4 +; SSE2-NEXT: paddd %xmm11, %xmm4 +; SSE2-NEXT: movdqa %xmm8, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm8 +; SSE2-NEXT: pxor %xmm1, %xmm8 +; SSE2-NEXT: movdqa %xmm9, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm9 +; SSE2-NEXT: pxor %xmm1, %xmm9 +; SSE2-NEXT: movdqa %xmm12, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm12 +; SSE2-NEXT: pxor %xmm1, %xmm12 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: paddd %xmm3, %xmm0 -; SSE2-NEXT: paddd %xmm11, %xmm6 -; SSE2-NEXT: paddd %xmm9, %xmm6 -; SSE2-NEXT: paddd %xmm10, %xmm6 ; SSE2-NEXT: paddd %xmm12, %xmm0 -; SSE2-NEXT: paddd %xmm6, %xmm0 -; SSE2-NEXT: paddd %xmm13, %xmm0 +; SSE2-NEXT: paddd %xmm8, %xmm0 +; SSE2-NEXT: paddd %xmm4, %xmm0 +; SSE2-NEXT: paddd %xmm9, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] Index: test/CodeGen/X86/select.ll =================================================================== --- test/CodeGen/X86/select.ll +++ test/CodeGen/X86/select.ll @@ -299,20 +299,21 @@ ; GENERIC-NEXT: testb %dil, %dil ; GENERIC-NEXT: jne LBB7_4 ; GENERIC-NEXT: ## BB#5: +; GENERIC-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; GENERIC-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; GENERIC-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; GENERIC-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; GENERIC-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero ; GENERIC-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; GENERIC-NEXT: jmp LBB7_6 ; GENERIC-NEXT: LBB7_4: -; GENERIC-NEXT: movd %r9d, %xmm2 -; GENERIC-NEXT: movd %ecx, %xmm3 -; GENERIC-NEXT: movd %r8d, %xmm4 +; GENERIC-NEXT: movd %r9d, %xmm1 +; GENERIC-NEXT: movd %ecx, %xmm2 +; GENERIC-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; GENERIC-NEXT: movd %r8d, %xmm3 ; GENERIC-NEXT: movd %edx, %xmm1 ; GENERIC-NEXT: LBB7_6: -; GENERIC-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; GENERIC-NEXT: psubd {{.*}}(%rip), %xmm1 ; GENERIC-NEXT: psubd {{.*}}(%rip), %xmm0 ; GENERIC-NEXT: movq %xmm0, 16(%rsi) @@ -339,16 +340,19 @@ ; ATOM-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; ATOM-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero ; ATOM-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; ATOM-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; ATOM-NEXT: jmp LBB7_6 ; ATOM-NEXT: LBB7_4: -; ATOM-NEXT: movd %r9d, %xmm2 -; ATOM-NEXT: movd %ecx, %xmm3 -; ATOM-NEXT: movd %r8d, %xmm4 +; ATOM-NEXT: movd %r9d, %xmm1 +; ATOM-NEXT: movd %ecx, %xmm2 +; ATOM-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; ATOM-NEXT: movd %r8d, %xmm3 ; ATOM-NEXT: movd %edx, %xmm1 -; ATOM-NEXT: LBB7_6: -; ATOM-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; ATOM-NEXT: LBB7_6: ; ATOM-NEXT: psubd {{.*}}(%rip), %xmm0 ; ATOM-NEXT: psubd {{.*}}(%rip), %xmm1 ; ATOM-NEXT: movq %xmm0, 16(%rsi) Index: test/CodeGen/X86/setcc-wide-types.ll =================================================================== --- test/CodeGen/X86/setcc-wide-types.ll +++ test/CodeGen/X86/setcc-wide-types.ll @@ -58,25 +58,25 @@ ; SSE2-LABEL: ne_i256: ; SSE2: # BB#0: ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; SSE2-NEXT: movq %xmm4, %r8 +; SSE2-NEXT: movq %xmm4, %rax ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; SSE2-NEXT: movq %xmm4, %r9 -; SSE2-NEXT: movq %xmm0, %r10 -; SSE2-NEXT: movq %xmm1, %rsi +; SSE2-NEXT: movq %xmm4, %rcx +; SSE2-NEXT: movq %xmm0, %rdx +; SSE2-NEXT: movq %xmm1, %r8 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] ; SSE2-NEXT: movq %xmm0, %rdi +; SSE2-NEXT: xorq %rax, %rdi ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: movq %xmm2, %rcx -; SSE2-NEXT: movq %xmm3, %rdx -; SSE2-NEXT: xorq %rsi, %rdx -; SSE2-NEXT: xorq %r10, %rcx -; SSE2-NEXT: orq %rdx, %rcx -; SSE2-NEXT: xorq %r9, %rax -; SSE2-NEXT: xorq %r8, %rdi -; SSE2-NEXT: orq %rax, %rdi +; SSE2-NEXT: movq %xmm0, %rsi +; SSE2-NEXT: xorq %rcx, %rsi +; SSE2-NEXT: orq %rdi, %rsi +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: xorq %rdx, %rax +; SSE2-NEXT: movq %xmm3, %rcx +; SSE2-NEXT: xorq %r8, %rcx +; SSE2-NEXT: orq %rax, %rcx ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: orq %rcx, %rdi +; SSE2-NEXT: orq %rsi, %rcx ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; @@ -100,25 +100,25 @@ ; SSE2-LABEL: eq_i256: ; SSE2: # BB#0: ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; SSE2-NEXT: movq %xmm4, %r8 +; SSE2-NEXT: movq %xmm4, %rax ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; SSE2-NEXT: movq %xmm4, %r9 -; SSE2-NEXT: movq %xmm0, %r10 -; SSE2-NEXT: movq %xmm1, %rsi +; SSE2-NEXT: movq %xmm4, %rcx +; SSE2-NEXT: movq %xmm0, %rdx +; SSE2-NEXT: movq %xmm1, %r8 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] ; SSE2-NEXT: movq %xmm0, %rdi +; SSE2-NEXT: xorq %rax, %rdi ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: movq %xmm2, %rcx -; SSE2-NEXT: movq %xmm3, %rdx -; SSE2-NEXT: xorq %rsi, %rdx -; SSE2-NEXT: xorq %r10, %rcx -; SSE2-NEXT: orq %rdx, %rcx -; SSE2-NEXT: xorq %r9, %rax -; SSE2-NEXT: xorq %r8, %rdi -; SSE2-NEXT: orq %rax, %rdi +; SSE2-NEXT: movq %xmm0, %rsi +; SSE2-NEXT: xorq %rcx, %rsi +; SSE2-NEXT: orq %rdi, %rsi +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: xorq %rdx, %rax +; SSE2-NEXT: movq %xmm3, %rcx +; SSE2-NEXT: xorq %r8, %rcx +; SSE2-NEXT: orq %rax, %rcx ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: orq %rcx, %rdi +; SSE2-NEXT: orq %rsi, %rcx ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; Index: test/CodeGen/X86/shrink_vmul_sse.ll =================================================================== --- test/CodeGen/X86/shrink_vmul_sse.ll +++ test/CodeGen/X86/shrink_vmul_sse.ll @@ -20,9 +20,9 @@ ; CHECK-NEXT: movzbl 1(%edx,%ecx), %edi ; CHECK-NEXT: movzbl (%edx,%ecx), %edx ; CHECK-NEXT: movzbl 1(%eax,%ecx), %ebx +; CHECK-NEXT: imull %edi, %ebx ; CHECK-NEXT: movzbl (%eax,%ecx), %eax ; CHECK-NEXT: imull %edx, %eax -; CHECK-NEXT: imull %edi, %ebx ; CHECK-NEXT: movl %ebx, 4(%esi,%ecx,4) ; CHECK-NEXT: movl %eax, (%esi,%ecx,4) ; CHECK-NEXT: popl %esi Index: test/CodeGen/X86/sse-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/sse-intrinsics-fast-isel.ll +++ test/CodeGen/X86/sse-intrinsics-fast-isel.ll @@ -1537,9 +1537,9 @@ ; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X32-NEXT: retl ; @@ -1673,13 +1673,13 @@ define <4 x float> @test_mm_setr_ps(float %a0, float %a1, float %a2, float %a3) nounwind { ; X32-LABEL: test_mm_setr_ps: ; X32: # BB#0: +; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; X32-NEXT: retl ; ; X64-LABEL: test_mm_setr_ps: Index: test/CodeGen/X86/sse1.ll =================================================================== --- test/CodeGen/X86/sse1.ll +++ test/CodeGen/X86/sse1.ll @@ -78,8 +78,8 @@ ; X32-NEXT: .LBB1_8: # %entry ; X32-NEXT: xorps %xmm3, %xmm3 ; X32-NEXT: .LBB1_9: # %entry -; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; X32-NEXT: jne .LBB1_11 ; X32-NEXT: # BB#10: ; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -115,8 +115,8 @@ ; X64-NEXT: .LBB1_8: # %entry ; X64-NEXT: xorps %xmm3, %xmm3 ; X64-NEXT: .LBB1_9: # %entry -; X64-NEXT: testl %esi, %esi ; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X64-NEXT: testl %esi, %esi ; X64-NEXT: jne .LBB1_11 ; X64-NEXT: # BB#10: ; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero Index: test/CodeGen/X86/sse3-avx-addsub-2.ll =================================================================== --- test/CodeGen/X86/sse3-avx-addsub-2.ll +++ test/CodeGen/X86/sse3-avx-addsub-2.ll @@ -412,14 +412,14 @@ ; SSE-NEXT: movaps %xmm1, %xmm4 ; SSE-NEXT: movhlps {{.*#+}} xmm4 = xmm4[1,1] ; SSE-NEXT: subss %xmm4, %xmm3 -; SSE-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSE-NEXT: addss %xmm0, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE-NEXT: addss %xmm0, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE-NEXT: addss %xmm0, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: retq ; @@ -431,12 +431,12 @@ ; AVX-NEXT: vsubss %xmm4, %xmm3, %xmm3 ; AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] ; AVX-NEXT: vaddss %xmm0, %xmm4, %xmm4 +; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3] +; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm4[0],xmm2[2,3] -; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] ; AVX-NEXT: retq %1 = extractelement <4 x float> %A, i32 0 %2 = extractelement <4 x float> %B, i32 0 Index: test/CodeGen/X86/sse41.ll =================================================================== --- test/CodeGen/X86/sse41.ll +++ test/CodeGen/X86/sse41.ll @@ -273,8 +273,8 @@ ; X32: ## BB#0: ## %entry ; X32-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X32-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] -; X32-NEXT: addss %xmm1, %xmm0 ; X32-NEXT: addss %xmm2, %xmm3 +; X32-NEXT: addss %xmm1, %xmm0 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] ; X32-NEXT: retl ; @@ -282,8 +282,8 @@ ; X64: ## BB#0: ## %entry ; X64-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X64-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] -; X64-NEXT: addss %xmm1, %xmm0 ; X64-NEXT: addss %xmm2, %xmm3 +; X64-NEXT: addss %xmm1, %xmm0 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] ; X64-NEXT: retq entry: @@ -896,9 +896,9 @@ ; X32-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0] ; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] +; X32-NEXT: addps %xmm1, %xmm0 ; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0] ; X32-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0] -; X32-NEXT: addps %xmm1, %xmm0 ; X32-NEXT: addps %xmm2, %xmm3 ; X32-NEXT: addps %xmm3, %xmm0 ; X32-NEXT: retl @@ -908,9 +908,9 @@ ; X64-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0] ; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] +; X64-NEXT: addps %xmm1, %xmm0 ; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0] ; X64-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0] -; X64-NEXT: addps %xmm1, %xmm0 ; X64-NEXT: addps %xmm2, %xmm3 ; X64-NEXT: addps %xmm3, %xmm0 ; X64-NEXT: retq Index: test/CodeGen/X86/vec_int_to_fp.ll =================================================================== --- test/CodeGen/X86/vec_int_to_fp.ll +++ test/CodeGen/X86/vec_int_to_fp.ll @@ -4344,7 +4344,7 @@ ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB80_4 ; AVX1-NEXT: # BB#5: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm4 ; AVX1-NEXT: jmp .LBB80_6 ; AVX1-NEXT: .LBB80_4: ; AVX1-NEXT: movq %rax, %rcx @@ -4352,22 +4352,22 @@ ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 -; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm4 ; AVX1-NEXT: .LBB80_6: ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-NEXT: vmovq %xmm2, %rax ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB80_7 ; AVX1-NEXT: # BB#8: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 ; AVX1-NEXT: jmp .LBB80_9 ; AVX1-NEXT: .LBB80_7: ; AVX1-NEXT: movq %rax, %rcx ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 -; AVX1-NEXT: vaddss %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 +; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: .LBB80_9: ; AVX1-NEXT: vpextrq $1, %xmm2, %rax ; AVX1-NEXT: testq %rax, %rax @@ -4397,29 +4397,29 @@ ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5 ; AVX1-NEXT: vaddss %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: .LBB80_15: -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[2,3] ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB80_16 ; AVX1-NEXT: # BB#17: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4 ; AVX1-NEXT: jmp .LBB80_18 ; AVX1-NEXT: .LBB80_16: ; AVX1-NEXT: movq %rax, %rcx ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3 -; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4 +; AVX1-NEXT: vaddss %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: .LBB80_18: -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vmovq %xmm4, %rax +; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vmovq %xmm3, %rax ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB80_19 ; AVX1-NEXT: # BB#20: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm5 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0 ; AVX1-NEXT: jmp .LBB80_21 ; AVX1-NEXT: .LBB80_19: ; AVX1-NEXT: movq %rax, %rcx @@ -4427,25 +4427,25 @@ ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0 -; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm5 +; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: .LBB80_21: +; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0],xmm4[3] ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0] -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1],xmm5[0],xmm3[3] -; AVX1-NEXT: vpextrq $1, %xmm4, %rax +; AVX1-NEXT: vpextrq $1, %xmm3, %rax ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB80_22 ; AVX1-NEXT: # BB#23: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm1 ; AVX1-NEXT: jmp .LBB80_24 ; AVX1-NEXT: .LBB80_22: ; AVX1-NEXT: movq %rax, %rcx ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2 -; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm1 +; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: .LBB80_24: -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -4471,7 +4471,7 @@ ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB80_4 ; AVX2-NEXT: # BB#5: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm4 ; AVX2-NEXT: jmp .LBB80_6 ; AVX2-NEXT: .LBB80_4: ; AVX2-NEXT: movq %rax, %rcx @@ -4479,22 +4479,22 @@ ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 -; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm4 ; AVX2-NEXT: .LBB80_6: ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX2-NEXT: vmovq %xmm2, %rax ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB80_7 ; AVX2-NEXT: # BB#8: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 ; AVX2-NEXT: jmp .LBB80_9 ; AVX2-NEXT: .LBB80_7: ; AVX2-NEXT: movq %rax, %rcx ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 -; AVX2-NEXT: vaddss %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 +; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: .LBB80_9: ; AVX2-NEXT: vpextrq $1, %xmm2, %rax ; AVX2-NEXT: testq %rax, %rax @@ -4524,29 +4524,29 @@ ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5 ; AVX2-NEXT: vaddss %xmm5, %xmm5, %xmm5 ; AVX2-NEXT: .LBB80_15: -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3] +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[2,3] ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB80_16 ; AVX2-NEXT: # BB#17: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4 ; AVX2-NEXT: jmp .LBB80_18 ; AVX2-NEXT: .LBB80_16: ; AVX2-NEXT: movq %rax, %rcx ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3 -; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4 +; AVX2-NEXT: vaddss %xmm4, %xmm4, %xmm4 ; AVX2-NEXT: .LBB80_18: -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-NEXT: vmovq %xmm4, %rax +; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3] +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-NEXT: vmovq %xmm3, %rax ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB80_19 ; AVX2-NEXT: # BB#20: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm5 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0 ; AVX2-NEXT: jmp .LBB80_21 ; AVX2-NEXT: .LBB80_19: ; AVX2-NEXT: movq %rax, %rcx @@ -4554,25 +4554,25 @@ ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0 -; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm5 +; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: .LBB80_21: +; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0],xmm4[3] ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0] -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1],xmm5[0],xmm3[3] -; AVX2-NEXT: vpextrq $1, %xmm4, %rax +; AVX2-NEXT: vpextrq $1, %xmm3, %rax ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB80_22 ; AVX2-NEXT: # BB#23: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm1 ; AVX2-NEXT: jmp .LBB80_24 ; AVX2-NEXT: .LBB80_22: ; AVX2-NEXT: movq %rax, %rcx ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2 -; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm1 +; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: .LBB80_24: -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[0] ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; Index: test/CodeGen/X86/vector-bitreverse.ll =================================================================== --- test/CodeGen/X86/vector-bitreverse.ll +++ test/CodeGen/X86/vector-bitreverse.ll @@ -2372,10 +2372,10 @@ ; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1 ; AVX512F-NEXT: vpsrlq $24, %zmm0, %zmm2 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 -; AVX512F-NEXT: vpsrlq $8, %zmm0, %zmm3 -; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3 ; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1 -; AVX512F-NEXT: vporq %zmm1, %zmm3, %zmm1 +; AVX512F-NEXT: vpsrlq $8, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 +; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1 ; AVX512F-NEXT: vpsllq $8, %zmm0, %zmm2 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 ; AVX512F-NEXT: vpsllq $24, %zmm0, %zmm3 Index: test/CodeGen/X86/vector-blend.ll =================================================================== --- test/CodeGen/X86/vector-blend.ll +++ test/CodeGen/X86/vector-blend.ll @@ -848,10 +848,10 @@ ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm3 ; SSE2-NEXT: pandn %xmm5, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: pandn %xmm4, %xmm0 ; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: blend_logic_v8i32: @@ -860,10 +860,10 @@ ; SSSE3-NEXT: psrad $31, %xmm1 ; SSSE3-NEXT: pand %xmm1, %xmm3 ; SSSE3-NEXT: pandn %xmm5, %xmm1 +; SSSE3-NEXT: por %xmm3, %xmm1 ; SSSE3-NEXT: pand %xmm0, %xmm2 ; SSSE3-NEXT: pandn %xmm4, %xmm0 ; SSSE3-NEXT: por %xmm2, %xmm0 -; SSSE3-NEXT: por %xmm3, %xmm1 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: blend_logic_v8i32: Index: test/CodeGen/X86/vector-sqrt.ll =================================================================== --- test/CodeGen/X86/vector-sqrt.ll +++ test/CodeGen/X86/vector-sqrt.ll @@ -29,11 +29,11 @@ ; CHECK: # BB#0: # %entry ; CHECK-NEXT: vsqrtss (%rdi), %xmm0, %xmm0 ; CHECK-NEXT: vsqrtss 4(%rdi), %xmm1, %xmm1 -; CHECK-NEXT: vsqrtss 8(%rdi), %xmm2, %xmm2 -; CHECK-NEXT: vsqrtss 12(%rdi), %xmm3, %xmm3 ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] +; CHECK-NEXT: vsqrtss 8(%rdi), %xmm2, %xmm1 +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +; CHECK-NEXT: vsqrtss 12(%rdi), %xmm2, %xmm1 +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; CHECK-NEXT: retq entry: %0 = load float, float* %v, align 4 Index: test/CodeGen/X86/x86-interleaved-access.ll =================================================================== --- test/CodeGen/X86/x86-interleaved-access.ll +++ test/CodeGen/X86/x86-interleaved-access.ll @@ -11,13 +11,13 @@ ; AVX-NEXT: vmovupd 96(%rdi), %ymm3 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5 +; AVX-NEXT: vhaddpd %ymm5, %ymm4, %ymm4 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] ; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX-NEXT: vaddpd %ymm2, %ymm4, %ymm2 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX-NEXT: vhaddpd %ymm5, %ymm4, %ymm1 -; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm1 -; AVX-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ; AVX-NEXT: retq %wide.vec = load <16 x double>, <16 x double>* %ptr, align 16 %strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> @@ -39,11 +39,11 @@ ; AVX-NEXT: vmovupd 96(%rdi), %ymm3 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] -; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX-NEXT: vmulpd %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vmulpd %ymm0, %ymm4, %ymm0 ; AVX-NEXT: retq %wide.vec = load <16 x double>, <16 x double>* %ptr, align 16 %strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> @@ -124,9 +124,9 @@ ; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] ; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm1 -; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: retq %wide.vec = load <16 x i64>, <16 x i64>* %ptr, align 16