Index: llvm/include/llvm/CodeGen/SelectionDAGNodes.h =================================================================== --- llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -800,12 +800,30 @@ static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl &Visited, SmallVectorImpl &Worklist, - unsigned int MaxSteps = 0) { + unsigned int MaxSteps = 0, + bool TopologicalPrune = false) { + SmallVector DeferredNodes; if (Visited.count(N)) return true; + + // Node Id's are assigned in three places: As a topological + // ordering (> 0), during legalization (results in values set to + // 0), and new nodes (set to -1). If N has a topological id then we + // know that all nodes with ids smaller than it cannot be + // successors and we need not check them. Filter out all nodes + // that can't be matches. We add them to the worklist before exit + // in case of multiple calls. + + int NId = N->getNodeId(); + + bool Found = false; while (!Worklist.empty()) { const SDNode *M = Worklist.pop_back_val(); - bool Found = false; + int MId = M->getNodeId(); + if (TopologicalPrune && (NId > 0) && (MId > 0) && (MId < NId)) { + DeferredNodes.push_back(M); + continue; + } for (const SDValue &OpV : M->op_values()) { SDNode *Op = OpV.getNode(); if (Visited.insert(Op).second) @@ -814,11 +832,13 @@ Found = true; } if (Found) - return true; + break; if (MaxSteps != 0 && Visited.size() >= MaxSteps) - return false; + break; } - return false; + // Push deferred nodes back on worklist. + Worklist.append(DeferredNodes.begin(), DeferredNodes.end()); + return Found; } /// Return true if all the users of N are contained in Nodes. 
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -2132,48 +2132,53 @@ return nullptr; } -/// findNonImmUse - Return true if "Use" is a non-immediate use of "Def". -/// This function iteratively traverses up the operand chain, ignoring -/// certain nodes. -static bool findNonImmUse(SDNode *Use, SDNode* Def, SDNode *ImmedUse, - SDNode *Root, SmallPtrSetImpl &Visited, +/// findNonImmUse - Return true if "Def" is a predecessor of "Root" via a path +/// beyond "ImmedUse". We may ignore chains as they are checked separately. +static bool findNonImmUse(SDNode *Root, SDNode *Def, SDNode *ImmedUse, bool IgnoreChains) { - // The NodeID's are given uniques ID's where a node ID is guaranteed to be - // greater than all of its (recursive) operands. If we scan to a point where - // 'use' is smaller than the node we're scanning for, then we know we will - // never find it. - // - // The Use may be -1 (unassigned) if it is a newly allocated node. This can - // happen because we scan down to newly selected nodes in the case of glue - // uses. - std::vector WorkList; - WorkList.push_back(Use); + SmallPtrSet Visited; + SmallVector WorkList; + // Only check if we have non-immediate uses of Def. + if (ImmedUse->isOnlyUserOf(Def)) + return false; - while (!WorkList.empty()) { - Use = WorkList.back(); - WorkList.pop_back(); - if (Use->getNodeId() < Def->getNodeId() && Use->getNodeId() != -1) + // Initialize worklist to operands of Root. + for (const SDValue &Op : Root->op_values()) { + SDNode *N = Op.getNode(); + // Ignore chains (they are validated by HandleMergeInputChains) + if ((Op.getValueType() == MVT::Other && IgnoreChains)) continue; + WorkList.push_back(N); + } - // Don't revisit nodes if we already scanned it and didn't fail, we know we - // won't fail if we scan it again. 
- if (!Visited.insert(Use).second) + // We don't care about paths to Def that go through ImmedUse so mark it visited and + // mark non-def operands as used. + Visited.insert(ImmedUse); + for (const SDValue &Op : ImmedUse->op_values()) { + SDNode *N = Op.getNode(); + // Ignore chain deps (they are validated by + // HandleMergeInputChains) and immediate uses + if ((Op.getValueType() == MVT::Other && IgnoreChains) || (N == Def)) continue; + WorkList.push_back(N); + } - for (const SDValue &Op : Use->op_values()) { - // Ignore chain uses, they are validated by HandleMergeInputChains. - if (Op.getValueType() == MVT::Other && IgnoreChains) - continue; - + int DefId = Def->getNodeId(); + while (!WorkList.empty()) { + SDNode *Node = WorkList.pop_back_val(); + int NodeId = Node->getNodeId(); + // Operand's NodeId's are always strictly lower, so if NodeId is + // lower (and not unassigned, i.e. -1), we can stop search here. + if (DefId > 0 && NodeId > 0 && NodeId < DefId) + continue; + // Don't revisit nodes. + if (!Visited.insert(Node).second) + continue; + // Consider operands. + for (const SDValue &Op : Node->op_values()) { SDNode *N = Op.getNode(); - if (N == Def) { - if (Use == ImmedUse || Use == Root) - continue; // We are not looking for immediate use. - assert(N != Root); + if (N == Def) return true; - } - - // Traverse up the operand chain. WorkList.push_back(N); } } @@ -2254,8 +2259,7 @@ IgnoreChains = false; } - SmallPtrSet Visited; - return !findNonImmUse(Root, N.getNode(), U, Root, Visited, IgnoreChains); + return !findNonImmUse(Root, N.getNode(), U, IgnoreChains); } void SelectionDAGISel::Select_INLINEASM(SDNode *N) { @@ -2373,143 +2377,6 @@ DEBUG(dbgs() << "ISEL: Match complete!\n"); } -enum ChainResult { - CR_Simple, - CR_InducesCycle, - CR_LeadsToInteriorNode -}; - -/// WalkChainUsers - Walk down the users of the specified chained node that is -/// part of the pattern we're matching, looking at all of the users we find. 
-/// This determines whether something is an interior node, whether we have a -/// non-pattern node in between two pattern nodes (which prevent folding because -/// it would induce a cycle) and whether we have a TokenFactor node sandwiched -/// between pattern nodes (in which case the TF becomes part of the pattern). -/// -/// The walk we do here is guaranteed to be small because we quickly get down to -/// already selected nodes "below" us. -static ChainResult -WalkChainUsers(const SDNode *ChainedNode, - SmallVectorImpl &ChainedNodesInPattern, - DenseMap &TokenFactorResult, - SmallVectorImpl &InteriorChainedNodes) { - ChainResult Result = CR_Simple; - - for (SDNode::use_iterator UI = ChainedNode->use_begin(), - E = ChainedNode->use_end(); UI != E; ++UI) { - // Make sure the use is of the chain, not some other value we produce. - if (UI.getUse().getValueType() != MVT::Other) continue; - - SDNode *User = *UI; - - if (User->getOpcode() == ISD::HANDLENODE) // Root of the graph. - continue; - - // If we see an already-selected machine node, then we've gone beyond the - // pattern that we're selecting down into the already selected chunk of the - // DAG. - unsigned UserOpcode = User->getOpcode(); - if (User->isMachineOpcode() || - UserOpcode == ISD::CopyToReg || - UserOpcode == ISD::CopyFromReg || - UserOpcode == ISD::INLINEASM || - UserOpcode == ISD::EH_LABEL || - UserOpcode == ISD::LIFETIME_START || - UserOpcode == ISD::LIFETIME_END) { - // If their node ID got reset to -1 then they've already been selected. - // Treat them like a MachineOpcode. - if (User->getNodeId() == -1) - continue; - } - - // If we have a TokenFactor, we handle it specially. - if (User->getOpcode() != ISD::TokenFactor) { - // If the node isn't a token factor and isn't part of our pattern, then it - // must be a random chained node in between two nodes we're selecting. 
- // This happens when we have something like: - // x = load ptr - // call - // y = x+4 - // store y -> ptr - // Because we structurally match the load/store as a read/modify/write, - // but the call is chained between them. We cannot fold in this case - // because it would induce a cycle in the graph. - if (!std::count(ChainedNodesInPattern.begin(), - ChainedNodesInPattern.end(), User)) - return CR_InducesCycle; - - // Otherwise we found a node that is part of our pattern. For example in: - // x = load ptr - // y = x+4 - // store y -> ptr - // This would happen when we're scanning down from the load and see the - // store as a user. Record that there is a use of ChainedNode that is - // part of the pattern and keep scanning uses. - Result = CR_LeadsToInteriorNode; - InteriorChainedNodes.push_back(User); - continue; - } - - // If we found a TokenFactor, there are two cases to consider: first if the - // TokenFactor is just hanging "below" the pattern we're matching (i.e. no - // uses of the TF are in our pattern) we just want to ignore it. Second, - // the TokenFactor can be sandwiched in between two chained nodes, like so: - // [Load chain] - // ^ - // | - // [Load] - // ^ ^ - // | \ DAG's like cheese - // / \ do you? - // / | - // [TokenFactor] [Op] - // ^ ^ - // | | - // \ / - // \ / - // [Store] - // - // In this case, the TokenFactor becomes part of our match and we rewrite it - // as a new TokenFactor. - // - // To distinguish these two cases, do a recursive walk down the uses. - auto MemoizeResult = TokenFactorResult.find(User); - bool Visited = MemoizeResult != TokenFactorResult.end(); - // Recursively walk chain users only if the result is not memoized. 
- if (!Visited) { - auto Res = WalkChainUsers(User, ChainedNodesInPattern, TokenFactorResult, - InteriorChainedNodes); - MemoizeResult = TokenFactorResult.insert(std::make_pair(User, Res)).first; - } - switch (MemoizeResult->second) { - case CR_Simple: - // If the uses of the TokenFactor are just already-selected nodes, ignore - // it, it is "below" our pattern. - continue; - case CR_InducesCycle: - // If the uses of the TokenFactor lead to nodes that are not part of our - // pattern that are not selected, folding would turn this into a cycle, - // bail out now. - return CR_InducesCycle; - case CR_LeadsToInteriorNode: - break; // Otherwise, keep processing. - } - - // Okay, we know we're in the interesting interior case. The TokenFactor - // is now going to be considered part of the pattern so that we rewrite its - // uses (it may have uses that are not part of the pattern) with the - // ultimate chain result of the generated code. We will also add its chain - // inputs as inputs to the ultimate TokenFactor we create. - Result = CR_LeadsToInteriorNode; - if (!Visited) { - ChainedNodesInPattern.push_back(User); - InteriorChainedNodes.push_back(User); - } - } - - return Result; -} - /// HandleMergeInputChains - This implements the OPC_EmitMergeInputChains /// operation for when the pattern matched at least one node with a chains. The /// input vector contains a list of all of the chained nodes that we match. We @@ -2519,47 +2386,58 @@ static SDValue HandleMergeInputChains(SmallVectorImpl &ChainNodesMatched, SelectionDAG *CurDAG) { - // Used for memoization. Without it WalkChainUsers could take exponential - // time to run. - DenseMap TokenFactorResult; - // Walk all of the chained nodes we've matched, recursively scanning down the - // users of the chain result. This adds any TokenFactor nodes that are caught - // in between chained nodes to the chained and interior nodes list. 
- SmallVector InteriorChainedNodes; - for (unsigned i = 0, e = ChainNodesMatched.size(); i != e; ++i) { - if (WalkChainUsers(ChainNodesMatched[i], ChainNodesMatched, - TokenFactorResult, - InteriorChainedNodes) == CR_InducesCycle) - return SDValue(); // Would induce a cycle. - } - // Okay, we have walked all the matched nodes and collected TokenFactor nodes - // that we are interested in. Form our input TokenFactor node. + SmallPtrSet Visited; + SmallVector Worklist; SmallVector InputChains; - for (unsigned i = 0, e = ChainNodesMatched.size(); i != e; ++i) { - // Add the input chain of this node to the InputChains list (which will be - // the operands of the generated TokenFactor) if it's not an interior node. - SDNode *N = ChainNodesMatched[i]; - if (N->getOpcode() != ISD::TokenFactor) { - if (std::count(InteriorChainedNodes.begin(),InteriorChainedNodes.end(),N)) - continue; + unsigned int Max = 8192; - // Otherwise, add the input chain. - SDValue InChain = ChainNodesMatched[i]->getOperand(0); - assert(InChain.getValueType() == MVT::Other && "Not a chain"); - InputChains.push_back(InChain); - continue; - } + // Quick exit on trivial merge. + if (ChainNodesMatched.size() == 1) + return ChainNodesMatched[0]->getOperand(0); - // If we have a token factor, we want to add all inputs of the token factor - // that are not part of the pattern we're matching. - for (const SDValue &Op : N->op_values()) { - if (!std::count(ChainNodesMatched.begin(), ChainNodesMatched.end(), - Op.getNode())) - InputChains.push_back(Op); - } + // Add chains that aren't already added (internal). Peek through + // token factors. + std::function AddChains = [&](const SDValue V) { + if (V.getValueType() != MVT::Other) + return; + if (V->getOpcode() == ISD::EntryToken) + return; + // Newly selected nodes (-1) are always added directly. 
+ if (V->getNodeId() == -1) + InputChains.push_back(V); + else if (V->getOpcode() == ISD::TokenFactor) { + for (int i = 0, e = V->getNumOperands(); i != e; ++i) + AddChains(V->getOperand(i)); + } else if (!Visited.count(V.getNode())) + InputChains.push_back(V); + }; + + for (auto *N : ChainNodesMatched) { + Worklist.push_back(N); + Visited.insert(N); } + while (!Worklist.empty()) + AddChains(Worklist.pop_back_val()->getOperand(0)); + + // If one of these chains is a successor of input, we must have a + // node that is both the predecessor and successor of the + // to-be-merged nodes. Fail. + Visited.clear(); + for (SDValue V : InputChains) + Worklist.push_back(V.getNode()); + + for (auto *N : ChainNodesMatched) + if (SDNode::hasPredecessorHelper(N, Visited, Worklist, Max, true)) + return SDValue(); + // Fail conservatively if we stopped searching early. + if (Visited.size() >= Max) + return SDValue(); + // Return merged chain. + + if (InputChains.size() == 0) + return CurDAG->getEntryNode(); if (InputChains.size() == 1) return InputChains[0]; return CurDAG->getNode(ISD::TokenFactor, SDLoc(ChainNodesMatched[0]), Index: llvm/lib/Target/X86/X86ISelDAGToDAG.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -2095,12 +2095,23 @@ // the load output chain as an operand. Return InputChain by reference. SDValue Chain = StoreNode->getChain(); - bool ChainCheck = false; if (Chain == Load.getValue(1)) { - ChainCheck = true; InputChain = LoadNode->getChain(); - } else if (Chain.getOpcode() == ISD::TokenFactor) { + return true; + } + + if (Chain.getOpcode() == ISD::TokenFactor) { + // Fusing Load-Op-Store requires predecessors of store must also + // be predecessors of the load. This addition may cause a loop. We + // can check this by doing a search for Load in the new + // dependencies. 
As this can be expensive, heuristically prune + // this search by visiting the uses and make sure they all have + // smaller node id than the load. + + bool ChainCheck = false; SmallVector ChainOps; + SmallVector LoopWorklist; + SmallPtrSet Visited; for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) { SDValue Op = Chain.getOperand(i); if (Op == Load.getValue(1)) { @@ -2109,33 +2120,32 @@ ChainOps.push_back(Load.getOperand(0)); continue; } + LoopWorklist.push_back(Op.getNode()); + ChainOps.push_back(Op); + } - // Make sure using Op as part of the chain would not cause a cycle here. - // In theory, we could check whether the chain node is a predecessor of - // the load. But that can be very expensive. Instead visit the uses and - // make sure they all have smaller node id than the load. - int LoadId = LoadNode->getNodeId(); - for (SDNode::use_iterator UI = Op.getNode()->use_begin(), - UE = UI->use_end(); UI != UE; ++UI) { - if (UI.getUse().getResNo() != 0) - continue; - if (UI->getNodeId() > LoadId) + // If Loop Worklist is not empty. Check if we would make a loop. + if (ChainCheck) { + if (!LoopWorklist.empty()) { + unsigned int Max = 8192; + // if Load is predecessor to potentially loop inducing chain + // dependencies. + if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, + Max)) + return false; + // Fail conservatively if we ended loop search early. + if (Visited.size() >= Max) return false; } - ChainOps.push_back(Op); - } - - if (ChainCheck) // Make a new TokenFactor with all the other input chains except // for the load. InputChain = CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps); + return true; + } } - if (!ChainCheck) - return false; - - return true; + return false; } // Change a chain of {load; op; store} of the same value into a simple op @@ -2365,6 +2375,8 @@ MemOp[1] = LoadNode->getMemOperand(); Result->setMemRefs(MemOp, MemOp + 2); + // Update Load Chain uses as well. 
+ ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1)); ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1)); ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0)); CurDAG->RemoveDeadNode(Node); Index: llvm/test/CodeGen/X86/2012-01-16-mfence-nosse-flags.ll =================================================================== --- llvm/test/CodeGen/X86/2012-01-16-mfence-nosse-flags.ll +++ llvm/test/CodeGen/X86/2012-01-16-mfence-nosse-flags.ll @@ -14,7 +14,9 @@ ; clobbers EFLAGS. ; CHECK: lock orl {{.*}}, (%esp) -; CHECK-NEXT: testl [[REG:%e[a-z]+]], [[REG]] +; CHECK-NEXT: cmpl $0, [[REG:%e[a-z]+]] +; unfoldMemoryOperand should convert this back to testl. +; FIXME-NEXT: testl [[REG:%e[a-z]+]], [[REG]] if.then: ; preds = %entry tail call void bitcast (void (...)* @foo to void ()*)() nounwind Index: llvm/test/CodeGen/X86/avg.ll =================================================================== --- llvm/test/CodeGen/X86/avg.ll +++ llvm/test/CodeGen/X86/avg.ll @@ -90,12 +90,12 @@ define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) nounwind { ; SSE2-LABEL: avg_v32i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa 16(%rdi), %xmm0 -; SSE2-NEXT: movdqa (%rsi), %xmm1 -; SSE2-NEXT: pavgb (%rdi), %xmm1 -; SSE2-NEXT: pavgb 16(%rsi), %xmm0 -; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: movdqa (%rsi), %xmm0 +; SSE2-NEXT: movdqa 16(%rsi), %xmm1 +; SSE2-NEXT: pavgb (%rdi), %xmm0 +; SSE2-NEXT: pavgb 16(%rdi), %xmm1 ; SSE2-NEXT: movdqu %xmm1, (%rax) +; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v32i8: @@ -545,18 +545,18 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) nounwind { ; SSE2-LABEL: avg_v64i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa 32(%rdi), %xmm0 -; SSE2-NEXT: movdqa (%rsi), %xmm1 -; SSE2-NEXT: movdqa 16(%rsi), %xmm2 +; SSE2-NEXT: movdqa (%rsi), %xmm0 +; SSE2-NEXT: movdqa 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa 32(%rsi), %xmm2 ; SSE2-NEXT: movdqa 48(%rsi), %xmm3 -; SSE2-NEXT: pavgb (%rdi), %xmm1 -; SSE2-NEXT: pavgb 16(%rdi), %xmm2 -; 
SSE2-NEXT: pavgb 32(%rsi), %xmm0 +; SSE2-NEXT: pavgb (%rdi), %xmm0 +; SSE2-NEXT: pavgb 16(%rdi), %xmm1 +; SSE2-NEXT: pavgb 32(%rdi), %xmm2 ; SSE2-NEXT: pavgb 48(%rdi), %xmm3 ; SSE2-NEXT: movdqu %xmm3, (%rax) -; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: movdqu %xmm2, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) +; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v64i8: @@ -582,23 +582,23 @@ ; ; AVX2-LABEL: avg_v64i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX2-NEXT: vmovdqa (%rsi), %ymm1 -; AVX2-NEXT: vpavgb (%rdi), %ymm1, %ymm1 -; AVX2-NEXT: vpavgb 32(%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vmovdqa (%rsi), %ymm0 +; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vpavgb 32(%rdi), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, (%rax) +; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: avg_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512F-NEXT: vpavgb (%rdi), %ymm1, %ymm1 -; AVX512F-NEXT: vpavgb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqu %ymm0, (%rax) +; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX512F-NEXT: vpavgb (%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: vpavgb 32(%rdi), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqu %ymm1, (%rax) +; AVX512F-NEXT: vmovdqu %ymm0, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -678,12 +678,12 @@ define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) nounwind { ; SSE2-LABEL: avg_v16i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa 16(%rdi), %xmm0 -; SSE2-NEXT: movdqa (%rsi), %xmm1 -; SSE2-NEXT: pavgw (%rdi), %xmm1 -; SSE2-NEXT: pavgw 16(%rsi), %xmm0 -; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: movdqa (%rsi), %xmm0 +; SSE2-NEXT: movdqa 16(%rsi), %xmm1 +; SSE2-NEXT: pavgw (%rdi), %xmm0 +; SSE2-NEXT: pavgw 16(%rdi), %xmm1 ; SSE2-NEXT: movdqu %xmm1, (%rax) +; SSE2-NEXT: movdqu %xmm0, 
(%rax) ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v16i16: @@ -729,18 +729,18 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) nounwind { ; SSE2-LABEL: avg_v32i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa 32(%rdi), %xmm0 -; SSE2-NEXT: movdqa (%rsi), %xmm1 -; SSE2-NEXT: movdqa 16(%rsi), %xmm2 +; SSE2-NEXT: movdqa (%rsi), %xmm0 +; SSE2-NEXT: movdqa 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa 32(%rsi), %xmm2 ; SSE2-NEXT: movdqa 48(%rsi), %xmm3 -; SSE2-NEXT: pavgw (%rdi), %xmm1 -; SSE2-NEXT: pavgw 16(%rdi), %xmm2 -; SSE2-NEXT: pavgw 32(%rsi), %xmm0 +; SSE2-NEXT: pavgw (%rdi), %xmm0 +; SSE2-NEXT: pavgw 16(%rdi), %xmm1 +; SSE2-NEXT: pavgw 32(%rdi), %xmm2 ; SSE2-NEXT: pavgw 48(%rdi), %xmm3 ; SSE2-NEXT: movdqu %xmm3, (%rax) -; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: movdqu %xmm2, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) +; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v32i16: @@ -766,23 +766,23 @@ ; ; AVX2-LABEL: avg_v32i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX2-NEXT: vmovdqa (%rsi), %ymm1 -; AVX2-NEXT: vpavgw (%rdi), %ymm1, %ymm1 -; AVX2-NEXT: vpavgw 32(%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vmovdqa (%rsi), %ymm0 +; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, (%rax) +; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: avg_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512F-NEXT: vpavgw (%rdi), %ymm1, %ymm1 -; AVX512F-NEXT: vpavgw 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqu %ymm0, (%rax) +; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX512F-NEXT: vpavgw (%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqu %ymm1, (%rax) +; AVX512F-NEXT: vmovdqu %ymm0, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq 
; @@ -891,9 +891,9 @@ ; SSE2-LABEL: avg_v32i8_2: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa 16(%rdi), %xmm1 ; SSE2-NEXT: pavgb (%rsi), %xmm0 -; SSE2-NEXT: pavgb 16(%rdi), %xmm1 +; SSE2-NEXT: pavgb 16(%rsi), %xmm1 ; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq @@ -1072,9 +1072,9 @@ ; SSE2-LABEL: avg_v16i16_2: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa 16(%rdi), %xmm1 ; SSE2-NEXT: pavgw (%rsi), %xmm0 -; SSE2-NEXT: pavgw 16(%rdi), %xmm1 +; SSE2-NEXT: pavgw 16(%rsi), %xmm1 ; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq @@ -1124,14 +1124,14 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1 -; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: movdqa 32(%rsi), %xmm3 +; SSE2-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-NEXT: movdqa 48(%rdi), %xmm3 ; SSE2-NEXT: pavgw (%rsi), %xmm0 ; SSE2-NEXT: pavgw 16(%rsi), %xmm1 -; SSE2-NEXT: pavgw 32(%rdi), %xmm3 -; SSE2-NEXT: pavgw 48(%rsi), %xmm2 -; SSE2-NEXT: movdqu %xmm2, (%rax) +; SSE2-NEXT: pavgw 32(%rsi), %xmm2 +; SSE2-NEXT: pavgw 48(%rsi), %xmm3 ; SSE2-NEXT: movdqu %xmm3, (%rax) +; SSE2-NEXT: movdqu %xmm2, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq @@ -1160,9 +1160,9 @@ ; AVX2-LABEL: avg_v32i16_2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1 +; AVX2-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, (%rax) ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper @@ -1171,9 +1171,9 @@ ; AVX512F-LABEL: avg_v32i16_2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 ; 
AVX512F-NEXT: vpavgw (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1 +; AVX512F-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqu %ymm1, (%rax) ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) ; AVX512F-NEXT: vzeroupper Index: llvm/test/CodeGen/X86/avx-vbroadcastf128.ll =================================================================== --- llvm/test/CodeGen/X86/avx-vbroadcastf128.ll +++ llvm/test/CodeGen/X86/avx-vbroadcastf128.ll @@ -235,18 +235,16 @@ ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: vmovaps (%ecx), %xmm0 ; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X32-NEXT: vmovaps %ymm1, (%eax) -; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: PR29088: ; X64: # %bb.0: -; X64-NEXT: vmovaps (%rdi), %xmm0 ; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X64-NEXT: vmovaps %ymm1, (%rsi) -; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-NEXT: retq %ld = load <4 x i32>, <4 x i32>* %p0 store <8 x float> zeroinitializer, <8 x float>* %p1 Index: llvm/test/CodeGen/X86/avx2-vbroadcast.ll =================================================================== --- llvm/test/CodeGen/X86/avx2-vbroadcast.ll +++ llvm/test/CodeGen/X86/avx2-vbroadcast.ll @@ -1065,9 +1065,7 @@ ; X64: ## %bb.0: ## %eintry ; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movb (%rdi), %al -; X64-NEXT: vmovd %eax, %xmm1 -; X64-NEXT: vpbroadcastb %xmm1, %xmm1 +; X64-NEXT: vpbroadcastb (%rdi), %xmm1 ; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) ; X64-NEXT: retq @@ -1118,9 +1116,7 @@ ; X64-NEXT: subq $128, %rsp ; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; X64-NEXT: vmovaps %ymm0, (%rsp) -; X64-NEXT: movb (%rdi), %al -; X64-NEXT: vmovd %eax, %xmm1 -; X64-NEXT: vpbroadcastb %xmm1, %ymm1 +; X64-NEXT: vpbroadcastb 
(%rdi), %ymm1 ; X64-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) ; X64-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) ; X64-NEXT: movq %rbp, %rsp @@ -1160,9 +1156,7 @@ ; X64: ## %bb.0: ## %entry ; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzwl (%rdi), %eax -; X64-NEXT: vmovd %eax, %xmm1 -; X64-NEXT: vpbroadcastw %xmm1, %xmm1 +; X64-NEXT: vpbroadcastw (%rdi), %xmm1 ; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) ; X64-NEXT: retq @@ -1213,9 +1207,7 @@ ; X64-NEXT: subq $128, %rsp ; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; X64-NEXT: vmovaps %ymm0, (%rsp) -; X64-NEXT: movzwl (%rdi), %eax -; X64-NEXT: vmovd %eax, %xmm1 -; X64-NEXT: vpbroadcastw %xmm1, %ymm1 +; X64-NEXT: vpbroadcastw (%rdi), %ymm1 ; X64-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) ; X64-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) ; X64-NEXT: movq %rbp, %rsp @@ -1251,26 +1243,14 @@ ; X32-NEXT: addl $60, %esp ; X32-NEXT: retl ; -; X64-AVX2-LABEL: isel_crash_4d: -; X64-AVX2: ## %bb.0: ## %entry -; X64-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX2-NEXT: movl (%rdi), %eax -; X64-AVX2-NEXT: vmovd %eax, %xmm1 -; X64-AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 -; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX2-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) -; X64-AVX2-NEXT: retq -; -; X64-AVX512VL-LABEL: isel_crash_4d: -; X64-AVX512VL: ## %bb.0: ## %entry -; X64-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-AVX512VL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX512VL-NEXT: movl (%rdi), %eax -; X64-AVX512VL-NEXT: vpbroadcastd %eax, %xmm1 -; X64-AVX512VL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX512VL-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) -; X64-AVX512VL-NEXT: retq +; X64-LABEL: isel_crash_4d: +; X64: ## %bb.0: ## %entry +; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: vbroadcastss (%rdi), %xmm1 +; X64-NEXT: vmovaps %xmm0, 
-{{[0-9]+}}(%rsp) +; X64-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT: retq entry: %__a.addr.i = alloca <2 x i64>, align 16 %__b.addr.i = alloca <2 x i64>, align 16 @@ -1307,46 +1287,24 @@ ; X32-NEXT: vzeroupper ; X32-NEXT: retl ; -; X64-AVX2-LABEL: isel_crash_8d: -; X64-AVX2: ## %bb.0: ## %eintry -; X64-AVX2-NEXT: pushq %rbp -; X64-AVX2-NEXT: .cfi_def_cfa_offset 16 -; X64-AVX2-NEXT: .cfi_offset %rbp, -16 -; X64-AVX2-NEXT: movq %rsp, %rbp -; X64-AVX2-NEXT: .cfi_def_cfa_register %rbp -; X64-AVX2-NEXT: andq $-32, %rsp -; X64-AVX2-NEXT: subq $128, %rsp -; X64-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-AVX2-NEXT: vmovaps %ymm0, (%rsp) -; X64-AVX2-NEXT: movl (%rdi), %eax -; X64-AVX2-NEXT: vmovd %eax, %xmm1 -; X64-AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 -; X64-AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; X64-AVX2-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) -; X64-AVX2-NEXT: movq %rbp, %rsp -; X64-AVX2-NEXT: popq %rbp -; X64-AVX2-NEXT: vzeroupper -; X64-AVX2-NEXT: retq -; -; X64-AVX512VL-LABEL: isel_crash_8d: -; X64-AVX512VL: ## %bb.0: ## %eintry -; X64-AVX512VL-NEXT: pushq %rbp -; X64-AVX512VL-NEXT: .cfi_def_cfa_offset 16 -; X64-AVX512VL-NEXT: .cfi_offset %rbp, -16 -; X64-AVX512VL-NEXT: movq %rsp, %rbp -; X64-AVX512VL-NEXT: .cfi_def_cfa_register %rbp -; X64-AVX512VL-NEXT: andq $-32, %rsp -; X64-AVX512VL-NEXT: subq $128, %rsp -; X64-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-AVX512VL-NEXT: vmovaps %ymm0, (%rsp) -; X64-AVX512VL-NEXT: movl (%rdi), %eax -; X64-AVX512VL-NEXT: vpbroadcastd %eax, %ymm1 -; X64-AVX512VL-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; X64-AVX512VL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) -; X64-AVX512VL-NEXT: movq %rbp, %rsp -; X64-AVX512VL-NEXT: popq %rbp -; X64-AVX512VL-NEXT: vzeroupper -; X64-AVX512VL-NEXT: retq +; X64-LABEL: isel_crash_8d: +; X64: ## %bb.0: ## %eintry +; X64-NEXT: pushq %rbp +; X64-NEXT: .cfi_def_cfa_offset 16 +; X64-NEXT: .cfi_offset %rbp, -16 +; X64-NEXT: movq %rsp, %rbp +; X64-NEXT: .cfi_def_cfa_register %rbp +; X64-NEXT: andq 
$-32, %rsp +; X64-NEXT: subq $128, %rsp +; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X64-NEXT: vmovaps %ymm0, (%rsp) +; X64-NEXT: vbroadcastss (%rdi), %ymm1 +; X64-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) +; X64-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; X64-NEXT: movq %rbp, %rsp +; X64-NEXT: popq %rbp +; X64-NEXT: vzeroupper +; X64-NEXT: retq eintry: %__a.addr.i = alloca <4 x i64>, align 16 %__b.addr.i = alloca <4 x i64>, align 16 @@ -1370,33 +1328,20 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; X32-NEXT: vmovaps %xmm0, (%esp) -; X32-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; X32-NEXT: vpbroadcastq %xmm1, %xmm1 +; X32-NEXT: vpbroadcastq (%eax), %xmm1 ; X32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ; X32-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp) ; X32-NEXT: addl $60, %esp ; X32-NEXT: retl ; -; X64-AVX2-LABEL: isel_crash_2q: -; X64-AVX2: ## %bb.0: ## %entry -; X64-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX2-NEXT: movq (%rdi), %rax -; X64-AVX2-NEXT: vmovq %rax, %xmm1 -; X64-AVX2-NEXT: vpbroadcastq %xmm1, %xmm1 -; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX2-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) -; X64-AVX2-NEXT: retq -; -; X64-AVX512VL-LABEL: isel_crash_2q: -; X64-AVX512VL: ## %bb.0: ## %entry -; X64-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-AVX512VL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX512VL-NEXT: movq (%rdi), %rax -; X64-AVX512VL-NEXT: vpbroadcastq %rax, %xmm1 -; X64-AVX512VL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX512VL-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) -; X64-AVX512VL-NEXT: retq +; X64-LABEL: isel_crash_2q: +; X64: ## %bb.0: ## %entry +; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: vpbroadcastq (%rdi), %xmm1 +; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT: retq entry: %__a.addr.i = alloca <2 x i64>, align 16 %__b.addr.i = alloca 
<2 x i64>, align 16 @@ -1433,46 +1378,24 @@ ; X32-NEXT: vzeroupper ; X32-NEXT: retl ; -; X64-AVX2-LABEL: isel_crash_4q: -; X64-AVX2: ## %bb.0: ## %eintry -; X64-AVX2-NEXT: pushq %rbp -; X64-AVX2-NEXT: .cfi_def_cfa_offset 16 -; X64-AVX2-NEXT: .cfi_offset %rbp, -16 -; X64-AVX2-NEXT: movq %rsp, %rbp -; X64-AVX2-NEXT: .cfi_def_cfa_register %rbp -; X64-AVX2-NEXT: andq $-32, %rsp -; X64-AVX2-NEXT: subq $128, %rsp -; X64-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-AVX2-NEXT: vmovaps %ymm0, (%rsp) -; X64-AVX2-NEXT: movq (%rdi), %rax -; X64-AVX2-NEXT: vmovq %rax, %xmm1 -; X64-AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 -; X64-AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; X64-AVX2-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) -; X64-AVX2-NEXT: movq %rbp, %rsp -; X64-AVX2-NEXT: popq %rbp -; X64-AVX2-NEXT: vzeroupper -; X64-AVX2-NEXT: retq -; -; X64-AVX512VL-LABEL: isel_crash_4q: -; X64-AVX512VL: ## %bb.0: ## %eintry -; X64-AVX512VL-NEXT: pushq %rbp -; X64-AVX512VL-NEXT: .cfi_def_cfa_offset 16 -; X64-AVX512VL-NEXT: .cfi_offset %rbp, -16 -; X64-AVX512VL-NEXT: movq %rsp, %rbp -; X64-AVX512VL-NEXT: .cfi_def_cfa_register %rbp -; X64-AVX512VL-NEXT: andq $-32, %rsp -; X64-AVX512VL-NEXT: subq $128, %rsp -; X64-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-AVX512VL-NEXT: vmovaps %ymm0, (%rsp) -; X64-AVX512VL-NEXT: movq (%rdi), %rax -; X64-AVX512VL-NEXT: vpbroadcastq %rax, %ymm1 -; X64-AVX512VL-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; X64-AVX512VL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) -; X64-AVX512VL-NEXT: movq %rbp, %rsp -; X64-AVX512VL-NEXT: popq %rbp -; X64-AVX512VL-NEXT: vzeroupper -; X64-AVX512VL-NEXT: retq +; X64-LABEL: isel_crash_4q: +; X64: ## %bb.0: ## %eintry +; X64-NEXT: pushq %rbp +; X64-NEXT: .cfi_def_cfa_offset 16 +; X64-NEXT: .cfi_offset %rbp, -16 +; X64-NEXT: movq %rsp, %rbp +; X64-NEXT: .cfi_def_cfa_register %rbp +; X64-NEXT: andq $-32, %rsp +; X64-NEXT: subq $128, %rsp +; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X64-NEXT: vmovaps %ymm0, (%rsp) +; X64-NEXT: vbroadcastsd (%rdi), 
%ymm1 +; X64-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) +; X64-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; X64-NEXT: movq %rbp, %rsp +; X64-NEXT: popq %rbp +; X64-NEXT: vzeroupper +; X64-NEXT: retq eintry: %__a.addr.i = alloca <4 x i64>, align 16 %__b.addr.i = alloca <4 x i64>, align 16 Index: llvm/test/CodeGen/X86/avx2-vbroadcasti128.ll =================================================================== --- llvm/test/CodeGen/X86/avx2-vbroadcasti128.ll +++ llvm/test/CodeGen/X86/avx2-vbroadcasti128.ll @@ -271,18 +271,16 @@ ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: vmovaps (%ecx), %xmm0 ; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X32-NEXT: vmovaps %ymm1, (%eax) -; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: PR29088: ; X64: # %bb.0: -; X64-NEXT: vmovaps (%rdi), %xmm0 ; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X64-NEXT: vmovaps %ymm1, (%rsi) -; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-NEXT: retq %ld = load <4 x i32>, <4 x i32>* %p0 store <8 x float> zeroinitializer, <8 x float>* %p1 Index: llvm/test/CodeGen/X86/avx512-vbroadcasti128.ll =================================================================== --- llvm/test/CodeGen/X86/avx512-vbroadcasti128.ll +++ llvm/test/CodeGen/X86/avx512-vbroadcasti128.ll @@ -186,26 +186,23 @@ define <8 x i32> @PR29088(<4 x i32>* %p0, <8 x float>* %p1) { ; X64-AVX512VL-LABEL: PR29088: ; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vmovaps (%rdi), %xmm0 ; X64-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X64-AVX512VL-NEXT: vmovdqa %ymm1, (%rsi) -; X64-AVX512VL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512VL-NEXT: retq ; ; X64-AVX512BWVL-LABEL: PR29088: ; X64-AVX512BWVL: ## %bb.0: -; X64-AVX512BWVL-NEXT: vmovaps (%rdi), %xmm0 ; X64-AVX512BWVL-NEXT: vpxor 
%xmm1, %xmm1, %xmm1 +; X64-AVX512BWVL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X64-AVX512BWVL-NEXT: vmovdqa %ymm1, (%rsi) -; X64-AVX512BWVL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512BWVL-NEXT: retq ; ; X64-AVX512DQVL-LABEL: PR29088: ; X64-AVX512DQVL: ## %bb.0: -; X64-AVX512DQVL-NEXT: vmovaps (%rdi), %xmm0 ; X64-AVX512DQVL-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-AVX512DQVL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X64-AVX512DQVL-NEXT: vmovaps %ymm1, (%rsi) -; X64-AVX512DQVL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512DQVL-NEXT: retq %ld = load <4 x i32>, <4 x i32>* %p0 store <8 x float> zeroinitializer, <8 x float>* %p1 Index: llvm/test/CodeGen/X86/foldmem_cycle.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/foldmem_cycle.ll @@ -0,0 +1,34 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=X64 + +; The load should not be merged with the and as it causes a cycle in the DAG. 
+ +define void @foo() { +; X64-LABEL: foo: +; X64: # %bb.0: # %entry +; X64-NEXT: pushq %rbx +; X64-NEXT: .cfi_def_cfa_offset 16 +; X64-NEXT: .cfi_offset %rbx, -16 +; X64-NEXT: movl (%rax), %ebx +; X64-NEXT: callq bar +; X64-NEXT: testl %ebx, %eax +; X64-NEXT: jne .LBB0_2 +; X64-NEXT: # %bb.1: # %if.then +; X64-NEXT: popq %rbx +; X64-NEXT: retq +; X64-NEXT: .LBB0_2: # %if.end +entry: + %0 = load i32, i32* undef + %call = tail call i32 @bar() + %and = and i32 %call, %0 + %tobool = icmp eq i32 %and, 0 + br i1 %tobool, label %if.then, label %if.end + +if.then: + ret void + +if.end: + unreachable +} + +declare i32 @bar() Index: llvm/test/CodeGen/X86/i256-add.ll =================================================================== --- llvm/test/CodeGen/X86/i256-add.ll +++ llvm/test/CodeGen/X86/i256-add.ll @@ -9,40 +9,30 @@ ; X32-NEXT: pushl %ebx ; X32-NEXT: pushl %edi ; X32-NEXT: pushl %esi -; X32-NEXT: subl $12, %esp +; X32-NEXT: subl $8, %esp +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl 28(%eax), %ecx +; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill +; X32-NEXT: movl 24(%eax), %ecx +; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X32-NEXT: movl 20(%eax), %esi +; X32-NEXT: movl 16(%eax), %edi +; X32-NEXT: movl 12(%eax), %ebx +; X32-NEXT: movl 8(%eax), %ebp +; X32-NEXT: movl (%eax), %ecx +; X32-NEXT: movl 4(%eax), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 8(%ecx), %edi -; X32-NEXT: movl (%ecx), %edx -; X32-NEXT: movl 4(%ecx), %ebx -; X32-NEXT: movl 28(%eax), %esi -; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill -; X32-NEXT: movl 24(%eax), %ebp -; X32-NEXT: addl (%eax), %edx -; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill -; X32-NEXT: adcl 4(%eax), %ebx -; X32-NEXT: adcl 8(%eax), %edi -; X32-NEXT: movl %edi, (%esp) # 4-byte Spill -; X32-NEXT: movl 20(%eax), %edi -; X32-NEXT: movl 12(%eax), %edx -; X32-NEXT: movl 16(%eax), %esi -; X32-NEXT: adcl 12(%ecx), %edx -; X32-NEXT: adcl 
16(%ecx), %esi -; X32-NEXT: adcl 20(%ecx), %edi -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: adcl 24(%ecx), %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload -; X32-NEXT: adcl %ebp, 28(%ecx) -; X32-NEXT: movl (%esp), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, 8(%ecx) -; X32-NEXT: movl %ebx, 4(%ecx) -; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, (%ecx) -; X32-NEXT: movl %edx, 12(%ecx) -; X32-NEXT: movl %esi, 16(%ecx) -; X32-NEXT: movl %edi, 20(%ecx) -; X32-NEXT: movl %eax, 24(%ecx) -; X32-NEXT: addl $12, %esp +; X32-NEXT: addl %ecx, (%eax) +; X32-NEXT: adcl %edx, 4(%eax) +; X32-NEXT: adcl %ebp, 8(%eax) +; X32-NEXT: adcl %ebx, 12(%eax) +; X32-NEXT: adcl %edi, 16(%eax) +; X32-NEXT: adcl %esi, 20(%eax) +; X32-NEXT: movl (%esp), %ecx # 4-byte Reload +; X32-NEXT: adcl %ecx, 24(%eax) +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload +; X32-NEXT: adcl %ecx, 28(%eax) +; X32-NEXT: addl $8, %esp ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: popl %ebx @@ -51,17 +41,14 @@ ; ; X64-LABEL: add: ; X64: # %bb.0: -; X64-NEXT: movq 16(%rdi), %rax -; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq 8(%rdi), %rdx -; X64-NEXT: movq 24(%rsi), %r8 -; X64-NEXT: addq (%rsi), %rcx -; X64-NEXT: adcq 8(%rsi), %rdx -; X64-NEXT: adcq 16(%rsi), %rax -; X64-NEXT: adcq %r8, 24(%rdi) -; X64-NEXT: movq %rax, 16(%rdi) -; X64-NEXT: movq %rdx, 8(%rdi) -; X64-NEXT: movq %rcx, (%rdi) +; X64-NEXT: movq 24(%rsi), %rax +; X64-NEXT: movq 16(%rsi), %rcx +; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: movq 8(%rsi), %rsi +; X64-NEXT: addq %rdx, (%rdi) +; X64-NEXT: adcq %rsi, 8(%rdi) +; X64-NEXT: adcq %rcx, 16(%rdi) +; X64-NEXT: adcq %rax, 24(%rdi) ; X64-NEXT: retq %a = load i256, i256* %p %b = load i256, i256* %q @@ -77,35 +64,28 @@ ; X32-NEXT: pushl %edi ; X32-NEXT: pushl %esi ; X32-NEXT: subl $8, %esp -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 16(%ecx), %eax -; X32-NEXT: movl 12(%ecx), %edx -; 
X32-NEXT: movl 8(%ecx), %edi -; X32-NEXT: movl (%ecx), %ebx -; X32-NEXT: movl 4(%ecx), %ebp -; X32-NEXT: subl (%esi), %ebx -; X32-NEXT: sbbl 4(%esi), %ebp -; X32-NEXT: sbbl 8(%esi), %edi -; X32-NEXT: sbbl 12(%esi), %edx -; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill -; X32-NEXT: sbbl 16(%esi), %eax -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill -; X32-NEXT: movl 20(%ecx), %edx -; X32-NEXT: sbbl 20(%esi), %edx -; X32-NEXT: movl 24(%ecx), %eax -; X32-NEXT: sbbl 24(%esi), %eax -; X32-NEXT: movl 28(%esi), %esi -; X32-NEXT: sbbl %esi, 28(%ecx) -; X32-NEXT: movl %edi, 8(%ecx) -; X32-NEXT: movl %ebp, 4(%ecx) -; X32-NEXT: movl %ebx, (%ecx) -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload -; X32-NEXT: movl %esi, 12(%ecx) -; X32-NEXT: movl (%esp), %esi # 4-byte Reload -; X32-NEXT: movl %esi, 16(%ecx) -; X32-NEXT: movl %edx, 20(%ecx) -; X32-NEXT: movl %eax, 24(%ecx) +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl 28(%eax), %ecx +; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill +; X32-NEXT: movl 24(%eax), %ecx +; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X32-NEXT: movl 20(%eax), %esi +; X32-NEXT: movl 16(%eax), %edi +; X32-NEXT: movl 12(%eax), %ebx +; X32-NEXT: movl 8(%eax), %ebp +; X32-NEXT: movl (%eax), %ecx +; X32-NEXT: movl 4(%eax), %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: subl %ecx, (%eax) +; X32-NEXT: sbbl %edx, 4(%eax) +; X32-NEXT: sbbl %ebp, 8(%eax) +; X32-NEXT: sbbl %ebx, 12(%eax) +; X32-NEXT: sbbl %edi, 16(%eax) +; X32-NEXT: sbbl %esi, 20(%eax) +; X32-NEXT: movl (%esp), %ecx # 4-byte Reload +; X32-NEXT: sbbl %ecx, 24(%eax) +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload +; X32-NEXT: sbbl %ecx, 28(%eax) ; X32-NEXT: addl $8, %esp ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi @@ -115,17 +95,14 @@ ; ; X64-LABEL: sub: ; X64: # %bb.0: -; X64-NEXT: movq 16(%rdi), %rax -; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq 8(%rdi), %rdx -; X64-NEXT: movq 24(%rsi), %r8 -; X64-NEXT: subq (%rsi), %rcx -; X64-NEXT: sbbq 
8(%rsi), %rdx -; X64-NEXT: sbbq 16(%rsi), %rax -; X64-NEXT: sbbq %r8, 24(%rdi) -; X64-NEXT: movq %rax, 16(%rdi) -; X64-NEXT: movq %rdx, 8(%rdi) -; X64-NEXT: movq %rcx, (%rdi) +; X64-NEXT: movq 24(%rsi), %rax +; X64-NEXT: movq 16(%rsi), %rcx +; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: movq 8(%rsi), %rsi +; X64-NEXT: subq %rdx, (%rdi) +; X64-NEXT: sbbq %rsi, 8(%rdi) +; X64-NEXT: sbbq %rcx, 16(%rdi) +; X64-NEXT: sbbq %rax, 24(%rdi) ; X64-NEXT: retq %a = load i256, i256* %p %b = load i256, i256* %q Index: llvm/test/CodeGen/X86/load-op-store-fusion.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/load-op-store-fusion.ll @@ -0,0 +1,32 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i386-unknown | FileCheck %s --check-prefix=X32 +; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64 + +; This test makes sure we do not merge both load-op-store pairs here as it causes a cycle. 
+ +define i8* @fn(i32 %i.015.i, [64 x i64]* %data.i) { +; X32-LABEL: fn: +; X32: # %bb.0: # %entry +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl (%ecx,%eax,8), %edx +; X32-NEXT: addl $1, %edx +; X32-NEXT: adcl $0, 4(%ecx,%eax,8) +; X32-NEXT: movl %edx, (%ecx,%eax,8) +; X32-NEXT: xorl %eax, %eax +; X32-NEXT: retl +; +; X64-LABEL: fn: +; X64: # %bb.0: # %entry +; X64-NEXT: movslq %edi, %rax +; X64-NEXT: incq (%rsi,%rax,8) +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq +entry: + %arrayidx.i6 = getelementptr inbounds [64 x i64], [64 x i64]* %data.i, i32 0, i32 %i.015.i + %x8 = load volatile i64, i64* %arrayidx.i6, align 8 + %inc.i7 = add i64 %x8, 1 + store volatile i64 %inc.i7, i64* %arrayidx.i6, align 8 + ret i8* null +} + Index: llvm/test/CodeGen/X86/masked_memop.ll =================================================================== --- llvm/test/CodeGen/X86/masked_memop.ll +++ llvm/test/CodeGen/X86/masked_memop.ll @@ -1280,8 +1280,7 @@ ; AVX-LABEL: load_one_mask_bit_set5: ; AVX: ## %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; AVX-NEXT: vmovhpd {{.*#+}} xmm2 = xmm2[0],mem[0] ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX-NEXT: retq ; Index: llvm/test/CodeGen/X86/merge-consecutive-stores.ll =================================================================== --- llvm/test/CodeGen/X86/merge-consecutive-stores.ll +++ llvm/test/CodeGen/X86/merge-consecutive-stores.ll @@ -10,12 +10,11 @@ ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl $0, 28(%eax) ; CHECK-NEXT: movl $0, 24(%eax) -; CHECK-NEXT: movl 20(%eax), %ecx -; CHECK-NEXT: movl $0, 20(%eax) -; CHECK-NEXT: xorl %edx, %edx -; CHECK-NEXT: cmpl 16(%eax), %edx +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: cmpl 16(%eax), %ecx ; CHECK-NEXT: movl $0, 16(%eax) -; CHECK-NEXT: sbbl %ecx, %edx +; CHECK-NEXT: sbbl 20(%eax), %ecx +; CHECK-NEXT: movl 
$0, 20(%eax) ; CHECK-NEXT: setl %al ; CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: negl %eax Index: llvm/test/CodeGen/X86/nontemporal.ll =================================================================== --- llvm/test/CodeGen/X86/nontemporal.ll +++ llvm/test/CodeGen/X86/nontemporal.ll @@ -13,36 +13,35 @@ ; X32-SSE-NEXT: andl $-16, %esp ; X32-SSE-NEXT: subl $16, %esp ; X32-SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero -; X32-SSE-NEXT: movl 12(%ebp), %eax +; X32-SSE-NEXT: movl 12(%ebp), %ecx ; X32-SSE-NEXT: movdqa 56(%ebp), %xmm4 ; X32-SSE-NEXT: movdqa 40(%ebp), %xmm5 ; X32-SSE-NEXT: movdqa 24(%ebp), %xmm6 -; X32-SSE-NEXT: movl 8(%ebp), %edx -; X32-SSE-NEXT: movl 80(%ebp), %ecx -; X32-SSE-NEXT: movl (%ecx), %esi +; X32-SSE-NEXT: movl 8(%ebp), %esi +; X32-SSE-NEXT: movl 80(%ebp), %edx +; X32-SSE-NEXT: movl (%edx), %eax ; X32-SSE-NEXT: addps {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: movntps %xmm0, (%edx) +; X32-SSE-NEXT: movntps %xmm0, (%esi) ; X32-SSE-NEXT: paddq {{\.LCPI.*}}, %xmm2 -; X32-SSE-NEXT: addl (%ecx), %esi -; X32-SSE-NEXT: movntdq %xmm2, (%edx) +; X32-SSE-NEXT: addl (%edx), %eax +; X32-SSE-NEXT: movntdq %xmm2, (%esi) ; X32-SSE-NEXT: addpd {{\.LCPI.*}}, %xmm1 -; X32-SSE-NEXT: addl (%ecx), %esi -; X32-SSE-NEXT: movntpd %xmm1, (%edx) +; X32-SSE-NEXT: addl (%edx), %eax +; X32-SSE-NEXT: movntpd %xmm1, (%esi) ; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm6 -; X32-SSE-NEXT: addl (%ecx), %esi -; X32-SSE-NEXT: movntdq %xmm6, (%edx) +; X32-SSE-NEXT: addl (%edx), %eax +; X32-SSE-NEXT: movntdq %xmm6, (%esi) ; X32-SSE-NEXT: paddw {{\.LCPI.*}}, %xmm5 -; X32-SSE-NEXT: addl (%ecx), %esi -; X32-SSE-NEXT: movntdq %xmm5, (%edx) +; X32-SSE-NEXT: addl (%edx), %eax +; X32-SSE-NEXT: movntdq %xmm5, (%esi) ; X32-SSE-NEXT: paddb {{\.LCPI.*}}, %xmm4 -; X32-SSE-NEXT: addl (%ecx), %esi -; X32-SSE-NEXT: movntdq %xmm4, (%edx) -; X32-SSE-NEXT: addl (%ecx), %esi -; X32-SSE-NEXT: movntil %eax, (%edx) -; X32-SSE-NEXT: movl (%ecx), %eax -; X32-SSE-NEXT: addl %esi, %eax -; X32-SSE-NEXT: movsd %xmm3, (%edx) 
-; X32-SSE-NEXT: addl (%ecx), %eax +; X32-SSE-NEXT: addl (%edx), %eax +; X32-SSE-NEXT: movntdq %xmm4, (%esi) +; X32-SSE-NEXT: addl (%edx), %eax +; X32-SSE-NEXT: movntil %ecx, (%esi) +; X32-SSE-NEXT: addl (%edx), %eax +; X32-SSE-NEXT: movsd %xmm3, (%esi) +; X32-SSE-NEXT: addl (%edx), %eax ; X32-SSE-NEXT: leal -4(%ebp), %esp ; X32-SSE-NEXT: popl %esi ; X32-SSE-NEXT: popl %ebp @@ -56,36 +55,35 @@ ; X32-AVX-NEXT: andl $-16, %esp ; X32-AVX-NEXT: subl $16, %esp ; X32-AVX-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; X32-AVX-NEXT: movl 12(%ebp), %eax +; X32-AVX-NEXT: movl 12(%ebp), %ecx ; X32-AVX-NEXT: vmovdqa 56(%ebp), %xmm4 ; X32-AVX-NEXT: vmovdqa 40(%ebp), %xmm5 ; X32-AVX-NEXT: vmovdqa 24(%ebp), %xmm6 -; X32-AVX-NEXT: movl 8(%ebp), %ecx -; X32-AVX-NEXT: movl 80(%ebp), %edx -; X32-AVX-NEXT: movl (%edx), %esi +; X32-AVX-NEXT: movl 8(%ebp), %edx +; X32-AVX-NEXT: movl 80(%ebp), %esi +; X32-AVX-NEXT: movl (%esi), %eax ; X32-AVX-NEXT: vaddps {{\.LCPI.*}}, %xmm0, %xmm0 -; X32-AVX-NEXT: vmovntps %xmm0, (%ecx) +; X32-AVX-NEXT: vmovntps %xmm0, (%edx) ; X32-AVX-NEXT: vpaddq {{\.LCPI.*}}, %xmm2, %xmm0 -; X32-AVX-NEXT: addl (%edx), %esi -; X32-AVX-NEXT: vmovntdq %xmm0, (%ecx) +; X32-AVX-NEXT: addl (%esi), %eax +; X32-AVX-NEXT: vmovntdq %xmm0, (%edx) ; X32-AVX-NEXT: vaddpd {{\.LCPI.*}}, %xmm1, %xmm0 -; X32-AVX-NEXT: addl (%edx), %esi -; X32-AVX-NEXT: vmovntpd %xmm0, (%ecx) +; X32-AVX-NEXT: addl (%esi), %eax +; X32-AVX-NEXT: vmovntpd %xmm0, (%edx) ; X32-AVX-NEXT: vpaddd {{\.LCPI.*}}, %xmm6, %xmm0 -; X32-AVX-NEXT: addl (%edx), %esi -; X32-AVX-NEXT: vmovntdq %xmm0, (%ecx) +; X32-AVX-NEXT: addl (%esi), %eax +; X32-AVX-NEXT: vmovntdq %xmm0, (%edx) ; X32-AVX-NEXT: vpaddw {{\.LCPI.*}}, %xmm5, %xmm0 -; X32-AVX-NEXT: addl (%edx), %esi -; X32-AVX-NEXT: vmovntdq %xmm0, (%ecx) +; X32-AVX-NEXT: addl (%esi), %eax +; X32-AVX-NEXT: vmovntdq %xmm0, (%edx) ; X32-AVX-NEXT: vpaddb {{\.LCPI.*}}, %xmm4, %xmm0 -; X32-AVX-NEXT: addl (%edx), %esi -; X32-AVX-NEXT: vmovntdq %xmm0, (%ecx) -; X32-AVX-NEXT: addl 
(%edx), %esi -; X32-AVX-NEXT: movntil %eax, (%ecx) -; X32-AVX-NEXT: movl (%edx), %eax -; X32-AVX-NEXT: addl %esi, %eax -; X32-AVX-NEXT: vmovsd %xmm3, (%ecx) -; X32-AVX-NEXT: addl (%edx), %eax +; X32-AVX-NEXT: addl (%esi), %eax +; X32-AVX-NEXT: vmovntdq %xmm0, (%edx) +; X32-AVX-NEXT: addl (%esi), %eax +; X32-AVX-NEXT: movntil %ecx, (%edx) +; X32-AVX-NEXT: addl (%esi), %eax +; X32-AVX-NEXT: vmovsd %xmm3, (%edx) +; X32-AVX-NEXT: addl (%esi), %eax ; X32-AVX-NEXT: leal -4(%ebp), %esp ; X32-AVX-NEXT: popl %esi ; X32-AVX-NEXT: popl %ebp Index: llvm/test/CodeGen/X86/store_op_load_fold.ll =================================================================== --- llvm/test/CodeGen/X86/store_op_load_fold.ll +++ llvm/test/CodeGen/X86/store_op_load_fold.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=i686-darwin | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64 ; ; Test the add and load are folded into the store instruction. Index: llvm/test/CodeGen/X86/store_op_load_fold2.ll =================================================================== --- llvm/test/CodeGen/X86/store_op_load_fold2.ll +++ llvm/test/CodeGen/X86/store_op_load_fold2.ll @@ -17,14 +17,14 @@ store i64 %tmp2676.us.us, i64* %tmp2666 ret i32 0 -; INTEL: and {{e..}}, dword ptr [360] -; INTEL: and dword ptr [356], {{e..}} -; FIXME: mov dword ptr [360], {{e..}} +; INTEL: and {{e..}}, dword ptr [356] +; INTEL: and dword ptr [360], {{e..}} +; FIXME: mov dword ptr [356], {{e..}} ; The above line comes out as 'mov 360, eax', but when the register is ecx it works? 
-; ATT: andl 360, %{{e..}} -; ATT: andl %{{e..}}, 356 -; ATT: movl %{{e..}}, 360 +; ATT: andl 356, %{{e..}} +; ATT: andl %{{e..}}, 360 +; ATT: movl %{{e..}}, 356 } Index: llvm/test/CodeGen/X86/subvector-broadcast.ll =================================================================== --- llvm/test/CodeGen/X86/subvector-broadcast.ll +++ llvm/test/CodeGen/X86/subvector-broadcast.ll @@ -751,72 +751,64 @@ ; X32-AVX: # %bb.0: ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-AVX-NEXT: vmovaps (%ecx), %xmm0 ; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X32-AVX-NEXT: vmovaps %xmm1, (%eax) -; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX-NEXT: retl ; ; X32-AVX512F-LABEL: test_broadcast_4i32_8i32_chain: ; X32-AVX512F: # %bb.0: ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-AVX512F-NEXT: vmovaps (%ecx), %xmm0 ; X32-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X32-AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X32-AVX512F-NEXT: vmovdqa %xmm1, (%eax) -; X32-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX512F-NEXT: retl ; ; X32-AVX512BW-LABEL: test_broadcast_4i32_8i32_chain: ; X32-AVX512BW: # %bb.0: ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-AVX512BW-NEXT: vmovaps (%ecx), %xmm0 ; X32-AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X32-AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X32-AVX512BW-NEXT: vmovdqa %xmm1, (%eax) -; X32-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX512BW-NEXT: retl ; ; X32-AVX512DQ-LABEL: test_broadcast_4i32_8i32_chain: ; X32-AVX512DQ: # %bb.0: ; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-AVX512DQ-NEXT: vmovaps (%ecx), %xmm0 ; X32-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X32-AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} 
ymm0 = mem[0,1,0,1] ; X32-AVX512DQ-NEXT: vmovaps %xmm1, (%eax) -; X32-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX512DQ-NEXT: retl ; ; X64-AVX-LABEL: test_broadcast_4i32_8i32_chain: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps (%rdi), %xmm0 ; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X64-AVX-NEXT: vmovaps %xmm1, (%rsi) -; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX-NEXT: retq ; ; X64-AVX512F-LABEL: test_broadcast_4i32_8i32_chain: ; X64-AVX512F: # %bb.0: -; X64-AVX512F-NEXT: vmovaps (%rdi), %xmm0 ; X64-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X64-AVX512F-NEXT: vmovdqa %xmm1, (%rsi) -; X64-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512F-NEXT: retq ; ; X64-AVX512BW-LABEL: test_broadcast_4i32_8i32_chain: ; X64-AVX512BW: # %bb.0: -; X64-AVX512BW-NEXT: vmovaps (%rdi), %xmm0 ; X64-AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X64-AVX512BW-NEXT: vmovdqa %xmm1, (%rsi) -; X64-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512BW-NEXT: retq ; ; X64-AVX512DQ-LABEL: test_broadcast_4i32_8i32_chain: ; X64-AVX512DQ: # %bb.0: -; X64-AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 ; X64-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X64-AVX512DQ-NEXT: vmovaps %xmm1, (%rsi) -; X64-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512DQ-NEXT: retq %1 = load <4 x i32>, <4 x i32>* %p0 store <4 x float> zeroinitializer, <4 x float>* %p1 @@ -829,10 +821,9 @@ ; X32-AVX: # %bb.0: ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-AVX-NEXT: vmovaps (%ecx), %xmm0 ; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X32-AVX-NEXT: vmovaps %xmm1, (%eax) -; X32-AVX-NEXT: vinsertf128 $1, %xmm0, 
%ymm0, %ymm0 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X32-AVX-NEXT: retl ; @@ -840,63 +831,56 @@ ; X32-AVX512F: # %bb.0: ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-AVX512F-NEXT: vmovdqa (%ecx), %xmm0 ; X32-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X32-AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X32-AVX512F-NEXT: vmovdqa %xmm1, (%eax) -; X32-AVX512F-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X32-AVX512F-NEXT: retl ; ; X32-AVX512BW-LABEL: test_broadcast_4i32_16i32_chain: ; X32-AVX512BW: # %bb.0: ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-AVX512BW-NEXT: vmovdqa (%ecx), %xmm0 ; X32-AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X32-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X32-AVX512BW-NEXT: vmovdqa %xmm1, (%eax) -; X32-AVX512BW-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X32-AVX512BW-NEXT: retl ; ; X32-AVX512DQ-LABEL: test_broadcast_4i32_16i32_chain: ; X32-AVX512DQ: # %bb.0: ; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-AVX512DQ-NEXT: vmovdqa (%ecx), %xmm0 ; X32-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X32-AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X32-AVX512DQ-NEXT: vmovaps %xmm1, (%eax) -; X32-AVX512DQ-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X32-AVX512DQ-NEXT: retl ; ; X64-AVX-LABEL: test_broadcast_4i32_16i32_chain: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps (%rdi), %xmm0 ; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X64-AVX-NEXT: vmovaps %xmm1, (%rsi) -; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X64-AVX-NEXT: retq ; ; X64-AVX512F-LABEL: 
test_broadcast_4i32_16i32_chain: ; X64-AVX512F: # %bb.0: -; X64-AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; X64-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X64-AVX512F-NEXT: vmovdqa %xmm1, (%rsi) -; X64-AVX512F-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X64-AVX512F-NEXT: retq ; ; X64-AVX512BW-LABEL: test_broadcast_4i32_16i32_chain: ; X64-AVX512BW: # %bb.0: -; X64-AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; X64-AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X64-AVX512BW-NEXT: vmovdqa %xmm1, (%rsi) -; X64-AVX512BW-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X64-AVX512BW-NEXT: retq ; ; X64-AVX512DQ-LABEL: test_broadcast_4i32_16i32_chain: ; X64-AVX512DQ: # %bb.0: -; X64-AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; X64-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X64-AVX512DQ-NEXT: vmovaps %xmm1, (%rsi) -; X64-AVX512DQ-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X64-AVX512DQ-NEXT: retq %1 = load <4 x i32>, <4 x i32>* %p0 store <4 x float> zeroinitializer, <4 x float>* %p1 Index: llvm/test/CodeGen/X86/var-permute-256.ll =================================================================== --- llvm/test/CodeGen/X86/var-permute-256.ll +++ llvm/test/CodeGen/X86/var-permute-256.ll @@ -508,64 +508,49 @@ ; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: vpextrb $1, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: movzbl (%rsp,%rax), %eax -; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $1, (%rsp,%rax), %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $2, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: movzbl (%rsp,%rax), %eax -; AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $2, (%rsp,%rax), %xmm0, %xmm0 ; AVX1-NEXT: vpextrb 
$3, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: movzbl (%rsp,%rax), %eax -; AVX1-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $3, (%rsp,%rax), %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $4, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: movzbl (%rsp,%rax), %eax -; AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $4, (%rsp,%rax), %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $5, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: movzbl (%rsp,%rax), %eax -; AVX1-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $5, (%rsp,%rax), %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $6, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: movzbl (%rsp,%rax), %eax -; AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $7, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: movzbl (%rsp,%rax), %eax -; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $8, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: movzbl (%rsp,%rax), %eax -; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $8, (%rsp,%rax), %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $9, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: movzbl (%rsp,%rax), %eax -; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $10, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: movzbl (%rsp,%rax), %eax -; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $10, (%rsp,%rax), %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $11, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: movzbl (%rsp,%rax), %eax -; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $12, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: movzbl (%rsp,%rax), %eax -; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $12, (%rsp,%rax), %xmm0, %xmm0 ; 
AVX1-NEXT: vpextrb $13, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: movzbl (%rsp,%rax), %eax -; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $13, (%rsp,%rax), %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $14, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: movzbl (%rsp,%rax), %eax -; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $14, (%rsp,%rax), %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $15, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: movzbl (%rsp,%rax), %eax -; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $0, %xmm1, %eax ; AVX1-NEXT: andl $31, %eax ; AVX1-NEXT: movzbl (%rsp,%rax), %eax @@ -635,64 +620,49 @@ ; AVX2-NEXT: vmovd %eax, %xmm0 ; AVX2-NEXT: vpextrb $1, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: movzbl (%rsp,%rax), %eax -; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $1, (%rsp,%rax), %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $2, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: movzbl (%rsp,%rax), %eax -; AVX2-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $2, (%rsp,%rax), %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $3, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: movzbl (%rsp,%rax), %eax -; AVX2-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $3, (%rsp,%rax), %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $4, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: movzbl (%rsp,%rax), %eax -; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $4, (%rsp,%rax), %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $5, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: movzbl (%rsp,%rax), %eax -; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $5, (%rsp,%rax), %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $6, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: movzbl (%rsp,%rax), %eax -; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $7, 
%xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: movzbl (%rsp,%rax), %eax -; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $8, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: movzbl (%rsp,%rax), %eax -; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $8, (%rsp,%rax), %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $9, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: movzbl (%rsp,%rax), %eax -; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $10, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: movzbl (%rsp,%rax), %eax -; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $10, (%rsp,%rax), %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $11, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: movzbl (%rsp,%rax), %eax -; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $12, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: movzbl (%rsp,%rax), %eax -; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $12, (%rsp,%rax), %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $13, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: movzbl (%rsp,%rax), %eax -; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $13, (%rsp,%rax), %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $14, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: movzbl (%rsp,%rax), %eax -; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $14, (%rsp,%rax), %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $15, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: movzbl (%rsp,%rax), %eax -; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $0, %xmm1, %eax ; AVX2-NEXT: andl $31, %eax ; AVX2-NEXT: movzbl (%rsp,%rax), %eax @@ -762,64 +732,49 @@ ; AVX512F-NEXT: vmovd %eax, %xmm0 ; AVX512F-NEXT: vpextrb $1, %xmm2, 
%eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $2, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $3, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $4, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $5, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $6, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $7, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $8, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $9, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $10, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: 
movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $11, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $12, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $12, (%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $13, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $13, (%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $14, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $14, (%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $15, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $0, %xmm1, %eax ; AVX512F-NEXT: andl $31, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax @@ -889,64 +844,49 @@ ; AVX512VL-NEXT: vmovd %eax, %xmm0 ; AVX512VL-NEXT: vpextrb $1, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax -; AVX512VL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpinsrb $1, (%rsp,%rax), %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $2, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax -; AVX512VL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpinsrb $2, (%rsp,%rax), %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $3, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax -; AVX512VL-NEXT: vpinsrb $3, %eax, %xmm0, 
%xmm0 +; AVX512VL-NEXT: vpinsrb $3, (%rsp,%rax), %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $4, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax -; AVX512VL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpinsrb $4, (%rsp,%rax), %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $5, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax -; AVX512VL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpinsrb $5, (%rsp,%rax), %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $6, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax -; AVX512VL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $7, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax -; AVX512VL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $8, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax -; AVX512VL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpinsrb $8, (%rsp,%rax), %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $9, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax -; AVX512VL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $10, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax -; AVX512VL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpinsrb $10, (%rsp,%rax), %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $11, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax -; AVX512VL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $12, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax -; AVX512VL-NEXT: vpinsrb $12, %eax, %xmm0, 
%xmm0 +; AVX512VL-NEXT: vpinsrb $12, (%rsp,%rax), %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $13, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax -; AVX512VL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpinsrb $13, (%rsp,%rax), %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $14, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax -; AVX512VL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpinsrb $14, (%rsp,%rax), %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $15, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax -; AVX512VL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $0, %xmm1, %eax ; AVX512VL-NEXT: andl $31, %eax ; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax @@ -1737,64 +1677,49 @@ ; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: vpextrb $1, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $1, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $2, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $2, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $3, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX1-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $3, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $4, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $4, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $5, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX1-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $5, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $6, %xmm2, %eax ; AVX1-NEXT: andl $15, 
%eax -; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $6, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $7, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $7, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $8, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $8, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $9, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $9, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $10, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $10, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $11, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $11, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $12, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $12, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $13, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $13, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $14, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $14, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $15, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $15, 
-24(%rsp,%rax), %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $0, %xmm1, %eax ; AVX1-NEXT: andl $15, %eax ; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax @@ -1858,64 +1783,49 @@ ; AVX2-NEXT: vmovd %eax, %xmm0 ; AVX2-NEXT: vpextrb $1, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $1, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $2, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX2-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $2, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $3, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX2-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $3, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $4, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $4, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $5, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $5, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $6, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $6, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $7, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $7, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $8, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $8, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $9, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 
+; AVX2-NEXT: vpinsrb $9, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $10, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $10, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $11, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $11, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $12, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $12, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $13, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $13, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $14, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $14, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $15, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $15, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $0, %xmm1, %eax ; AVX2-NEXT: andl $15, %eax ; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax @@ -1979,64 +1889,49 @@ ; AVX512F-NEXT: vmovd %eax, %xmm0 ; AVX512F-NEXT: vpextrb $1, %xmm2, %eax ; AVX512F-NEXT: andl $15, %eax -; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $1, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $2, %xmm2, %eax ; AVX512F-NEXT: andl $15, %eax -; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $2, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $3, %xmm2, %eax ; AVX512F-NEXT: andl $15, %eax -; 
AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $3, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $4, %xmm2, %eax ; AVX512F-NEXT: andl $15, %eax -; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $4, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $5, %xmm2, %eax ; AVX512F-NEXT: andl $15, %eax -; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $5, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $6, %xmm2, %eax ; AVX512F-NEXT: andl $15, %eax -; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $6, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $7, %xmm2, %eax ; AVX512F-NEXT: andl $15, %eax -; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $7, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $8, %xmm2, %eax ; AVX512F-NEXT: andl $15, %eax -; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $8, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $9, %xmm2, %eax ; AVX512F-NEXT: andl $15, %eax -; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $9, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $10, %xmm2, %eax ; AVX512F-NEXT: andl $15, %eax -; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $10, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $11, %xmm2, %eax ; AVX512F-NEXT: andl $15, %eax -; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $11, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $12, %xmm2, %eax ; AVX512F-NEXT: andl $15, 
%eax -; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $12, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $13, %xmm2, %eax ; AVX512F-NEXT: andl $15, %eax -; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $13, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $14, %xmm2, %eax ; AVX512F-NEXT: andl $15, %eax -; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $14, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $15, %xmm2, %eax ; AVX512F-NEXT: andl $15, %eax -; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $15, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $0, %xmm1, %eax ; AVX512F-NEXT: andl $15, %eax ; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax @@ -2100,64 +1995,49 @@ ; AVX512VL-NEXT: vmovd %eax, %xmm0 ; AVX512VL-NEXT: vpextrb $1, %xmm2, %eax ; AVX512VL-NEXT: andl $15, %eax -; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX512VL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpinsrb $1, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $2, %xmm2, %eax ; AVX512VL-NEXT: andl $15, %eax -; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX512VL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpinsrb $2, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $3, %xmm2, %eax ; AVX512VL-NEXT: andl $15, %eax -; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX512VL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpinsrb $3, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $4, %xmm2, %eax ; AVX512VL-NEXT: andl $15, %eax -; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX512VL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpinsrb $4, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $5, %xmm2, %eax ; AVX512VL-NEXT: andl $15, %eax -; 
AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX512VL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpinsrb $5, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $6, %xmm2, %eax ; AVX512VL-NEXT: andl $15, %eax -; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX512VL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpinsrb $6, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $7, %xmm2, %eax ; AVX512VL-NEXT: andl $15, %eax -; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX512VL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpinsrb $7, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $8, %xmm2, %eax ; AVX512VL-NEXT: andl $15, %eax -; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX512VL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpinsrb $8, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $9, %xmm2, %eax ; AVX512VL-NEXT: andl $15, %eax -; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX512VL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpinsrb $9, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $10, %xmm2, %eax ; AVX512VL-NEXT: andl $15, %eax -; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX512VL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpinsrb $10, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $11, %xmm2, %eax ; AVX512VL-NEXT: andl $15, %eax -; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX512VL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpinsrb $11, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $12, %xmm2, %eax ; AVX512VL-NEXT: andl $15, %eax -; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX512VL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpinsrb $12, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $13, %xmm2, %eax ; AVX512VL-NEXT: andl $15, %eax -; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX512VL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpinsrb $13, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX512VL-NEXT: 
vpextrb $14, %xmm2, %eax ; AVX512VL-NEXT: andl $15, %eax -; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX512VL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpinsrb $14, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $15, %xmm2, %eax ; AVX512VL-NEXT: andl $15, %eax -; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax -; AVX512VL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpinsrb $15, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $0, %xmm1, %eax ; AVX512VL-NEXT: andl $15, %eax ; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax Index: llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll +++ llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll @@ -47,8 +47,7 @@ ; ALL-NEXT: andl $3, %edx ; ALL-NEXT: andl $3, %esi ; ALL-NEXT: vmovaps %ymm0, (%rsp) -; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; ALL-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; ALL-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] ; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; ALL-NEXT: movq %rbp, %rsp