Index: include/llvm/CodeGen/SelectionDAGAddressAnalysis.h =================================================================== --- include/llvm/CodeGen/SelectionDAGAddressAnalysis.h +++ include/llvm/CodeGen/SelectionDAGAddressAnalysis.h @@ -57,7 +57,7 @@ int64_t &Off); /// Parses tree in Ptr for base, index, offset addresses. - static BaseIndexOffset match(SDValue Ptr); + static BaseIndexOffset match(SDValue Ptr, const SelectionDAG &DAG); }; } // namespace llvm Index: include/llvm/Target/TargetLowering.h =================================================================== --- include/llvm/Target/TargetLowering.h +++ include/llvm/Target/TargetLowering.h @@ -2691,6 +2691,9 @@ bool foldBooleans, DAGCombinerInfo &DCI, const SDLoc &dl) const; + // For targets which wrap address, unwrap for analysis. + virtual SDValue unwrapAddress(SDValue N) const { return N; } + /// Returns true (and the GlobalValue and the offset) if the node is a /// GlobalAddress + offset. virtual bool Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -4915,7 +4915,7 @@ return SDValue(); // Loads must share the same base address - BaseIndexOffset Ptr = BaseIndexOffset::match(L->getBasePtr()); + BaseIndexOffset Ptr = BaseIndexOffset::match(L->getBasePtr(), DAG); int64_t ByteOffsetFromBase = 0; if (!Base) Base = Ptr; @@ -12319,8 +12319,21 @@ StoreSDNode *St = cast(StoreNodes[i].MemNode); SDValue Val = St->getValue(); // All operands of BUILD_VECTOR / CONCAT_VECTOR must have the same type. + while (Val.getValueType() != MemVT && Val->getOpcode() == ISD::BITCAST) + Val = Val.getOperand(0); + // If we see an extract of a Bitcast, and we could remove both + // bitcasts, do so. + if (Val.getOpcode() == ISD::EXTRACT_VECTOR_ELT && + Val.getOperand(0).getOpcode() == ISD::BITCAST) { + SDValue Vec = Val.getOperand(0).getOperand(0); + if (Vec.getValueType().getVectorElementType() == MemVT) + Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Val), MemVT, Vec, + Val.getOperand(1)); + } + // TODO: add this check for EXTRACT_SUBVECTOR as well. if (Val.getValueType() != MemVT) return false; + Ops.push_back(Val); } @@ -12394,9 +12407,14 @@ StoreSDNode *St, SmallVectorImpl &StoreNodes) { // This holds the base pointer, index, and the offset in bytes from the base // pointer. - BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr()); + BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr(), DAG); + EVT MemVT = St->getMemoryVT(); + SDValue Val = St->getValue(); + while (Val.getValueType() != MemVT && Val->getOpcode() == ISD::BITCAST) + Val = Val.getOperand(0); + // We must have a base and an offset. if (!BasePtr.getBase().getNode()) return; @@ -12407,43 +12425,54 @@ bool IsConstantSrc = isa(St->getValue()) || isa(St->getValue()); - bool IsExtractVecSrc = - (St->getValue().getOpcode() == ISD::EXTRACT_VECTOR_ELT || - St->getValue().getOpcode() == ISD::EXTRACT_SUBVECTOR); + + bool IsExtractVecSrc = (Val.getOpcode() == ISD::EXTRACT_VECTOR_ELT || + Val.getOpcode() == ISD::EXTRACT_SUBVECTOR); bool IsLoadSrc = isa(St->getValue()); BaseIndexOffset LBasePtr; // Match on loadbaseptr if relevant. 
  if (IsLoadSrc)
-    LBasePtr =
-        BaseIndexOffset::match(cast<LoadSDNode>(St->getValue())->getBasePtr());
+    LBasePtr = BaseIndexOffset::match(
+        cast<LoadSDNode>(St->getValue())->getBasePtr(), DAG);
  auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr,
                            int64_t &Offset) -> bool {
+    bool EquivType = (Other->getMemoryVT() == MemVT) ||
+                     (MemVT.isInteger() && MemVT.bitsEq(Other->getMemoryVT()));
    if (Other->isVolatile() || Other->isIndexed())
      return false;
-    // We can merge constant floats to equivalent integers
-    if (Other->getMemoryVT() != MemVT)
-      if (!(MemVT.isInteger() && MemVT.bitsEq(Other->getMemoryVT()) &&
-            isa<ConstantFPSDNode>(Other->getValue())))
-        return false;
    if (IsLoadSrc) {
      // The Load's Base Ptr must also match
+      if (!EquivType)
+        return false;
      if (LoadSDNode *OtherLd = dyn_cast<LoadSDNode>(Other->getValue())) {
-        auto LPtr = BaseIndexOffset::match(OtherLd->getBasePtr());
+        auto LPtr = BaseIndexOffset::match(OtherLd->getBasePtr(), DAG);
        if (!(LBasePtr.equalBaseIndex(LPtr, DAG)))
          return false;
      } else
        return false;
    }
-    if (IsConstantSrc)
-      if (!(isa<ConstantSDNode>(Other->getValue()) ||
-            isa<ConstantFPSDNode>(Other->getValue())))
+    if (IsConstantSrc) {
+      // May truncate things converted to integers
+      if (!EquivType || !(isa<ConstantSDNode>(Other->getValue()) ||
+                          isa<ConstantFPSDNode>(Other->getValue())))
        return false;
-    if (IsExtractVecSrc)
-      if (!(Other->getValue().getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
-            Other->getValue().getOpcode() == ISD::EXTRACT_SUBVECTOR))
+    }
+    if (IsExtractVecSrc) {
+      SDValue Val = Other->getValue();
+      // Peel off bitcasts.
+      bool MatchVT = false;
+      while (Val.getOpcode() == ISD::BITCAST) {
+        if (Val.getValueType() == MemVT)
+          MatchVT = true;
+        Val = Val.getOperand(0);
+      }
+      if (!(Val.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+            Val.getOpcode() == ISD::EXTRACT_SUBVECTOR))
        return false;
-    Ptr = BaseIndexOffset::match(Other->getBasePtr());
+    }
+
+    Ptr = BaseIndexOffset::match(Other->getBasePtr(), DAG);
    return (BasePtr.equalBaseIndex(Ptr, DAG, Offset));
  };
  // We looking for a root node which is an ancestor to all mergable
@@ -12714,7 +12743,13 @@
  bool IsVec = MemVT.isVector();
  for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
    StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
-    unsigned StoreValOpcode = St->getValue().getOpcode();
+    SDValue StVal = St->getValue();
+    // Peek through bitcasts.
+    while (StVal->getOpcode() == ISD::BITCAST)
+      StVal = StVal.getOperand(0);
+
+    unsigned StoreValOpcode = StVal.getOpcode();
+    // This restriction could be loosened.
    // Bail out if any stored values are not elements extracted from a
    // vector. It should be possible to handle mixed sources, but load
@@ -12786,7 +12821,7 @@
      if (Ld->getMemoryVT() != MemVT)
        break;
-      BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld->getBasePtr());
+      BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld->getBasePtr(), DAG);
      // If this is not the first ptr that we check.
      int64_t LdOffset = 0;
      if (LdBasePtr.getBase().getNode()) {
@@ -15470,12 +15505,42 @@
  if (N1.isUndef())
    return N0;
+  // If N0 is also an insert_subvector and it has only one use, fix it first.
+  if (N0.hasOneUse() && N0->getOpcode() == ISD::INSERT_SUBVECTOR)
+    if (SDValue NN0 = visitINSERT_SUBVECTOR(N0.getNode()))
+      return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, NN0, N1, N2);
+
  // If this is an insert of an extracted vector into an undef vector, we can
  // just use the input to the extract.
  if (N0.isUndef() && N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
      N1.getOperand(1) == N2 && N1.getOperand(0).getValueType() == VT)
    return N1.getOperand(0);
+  // If we are inserting bitcast values into an undef vector with the same
+  // number of elements, we can just use the bitcast input of the extract.
+  if (N0.isUndef() && N1.getOpcode() == ISD::BITCAST &&
+      N1.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+      N1.getOperand(0).getOperand(1) == N2 &&
+      N1.getOperand(0).getOperand(0).getValueType().getVectorNumElements() ==
+          VT.getVectorNumElements()) {
+    return DAG.getBitcast(VT, N1.getOperand(0).getOperand(0));
+  }
+
+  // If both N0 and N1 are bitcast values on which insert_subvector
+  // makes sense, pull the bitcast through the insert.
+  if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) {
+    SDValue CN0 = N0.getOperand(0);
+    SDValue CN1 = N1.getOperand(0);
+    if (CN0.getValueType().getVectorElementType() ==
+            CN1.getValueType().getVectorElementType() &&
+        CN0.getValueType().getVectorNumElements() ==
+            VT.getVectorNumElements()) {
+      SDValue NewINSERT = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N),
+                                      CN0.getValueType(), CN0, CN1, N2);
+      return DAG.getBitcast(VT, NewINSERT);
+    }
+  }
+
  // Combine INSERT_SUBVECTORs where we are inserting to the same index.
  // INSERT_SUBVECTOR( INSERT_SUBVECTOR( Vec, SubOld, Idx ), SubNew, Idx )
  // --> INSERT_SUBVECTOR( Vec, SubNew, Idx )
@@ -16540,8 +16605,8 @@
  unsigned NumBytes1 = Op1->getMemoryVT().getSizeInBits() >> 3;
  // Check for BaseIndexOffset matching.
-  BaseIndexOffset BasePtr0 = BaseIndexOffset::match(Op0->getBasePtr());
-  BaseIndexOffset BasePtr1 = BaseIndexOffset::match(Op1->getBasePtr());
+  BaseIndexOffset BasePtr0 = BaseIndexOffset::match(Op0->getBasePtr(), DAG);
+  BaseIndexOffset BasePtr1 = BaseIndexOffset::match(Op1->getBasePtr(), DAG);
  int64_t PtrDiff;
  if (BasePtr0.equalBaseIndex(BasePtr1, DAG, PtrDiff))
    return !((NumBytes0 <= PtrDiff) || (PtrDiff + NumBytes1 <= 0));
@@ -16751,7 +16816,7 @@
bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {
  // This holds the base pointer, index, and the offset in bytes from the base
  // pointer.
-  BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr());
+  BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr(), DAG);
  // We must have a base and an offset.
  if (!BasePtr.getBase().getNode())
@@ -16777,7 +16842,7 @@
      break;
    // Find the base pointer and offset for this memory node.
-    BaseIndexOffset Ptr = BaseIndexOffset::match(Index->getBasePtr());
+    BaseIndexOffset Ptr = BaseIndexOffset::match(Index->getBasePtr(), DAG);
    // Check that the base pointer is the same as the original one.
    if (!BasePtr.equalBaseIndex(Ptr, DAG))
Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -34,6 +34,7 @@
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
+#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
@@ -7613,45 +7614,13 @@
  SDValue Loc = LD->getOperand(1);
  SDValue BaseLoc = Base->getOperand(1);
-  if (Loc.getOpcode() == ISD::FrameIndex) {
-    if (BaseLoc.getOpcode() != ISD::FrameIndex)
-      return false;
-    const MachineFrameInfo &MFI = getMachineFunction().getFrameInfo();
-    int FI = cast<FrameIndexSDNode>(Loc)->getIndex();
-    int BFI = cast<FrameIndexSDNode>(BaseLoc)->getIndex();
-    int FS = MFI.getObjectSize(FI);
-    int BFS = MFI.getObjectSize(BFI);
-    if (FS != BFS || FS != (int)Bytes) return false;
-    return MFI.getObjectOffset(FI) == (MFI.getObjectOffset(BFI) + Dist*Bytes);
-  }
-
-  // Handle X + C.
-  if (isBaseWithConstantOffset(Loc)) {
-    int64_t LocOffset = cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue();
-    if (Loc.getOperand(0) == BaseLoc) {
-      // If the base location is a simple address with no offset itself, then
-      // the second load's first add operand should be the base address.
-      if (LocOffset == Dist * (int)Bytes)
-        return true;
-    } else if (isBaseWithConstantOffset(BaseLoc)) {
-      // The base location itself has an offset, so subtract that value from the
-      // second load's offset before comparing to distance * size.
-      int64_t BOffset =
-          cast<ConstantSDNode>(BaseLoc.getOperand(1))->getSExtValue();
-      if (Loc.getOperand(0) == BaseLoc.getOperand(0)) {
-        if ((LocOffset - BOffset) == Dist * (int)Bytes)
-          return true;
-      }
-    }
-  }
-  const GlobalValue *GV1 = nullptr;
-  const GlobalValue *GV2 = nullptr;
-  int64_t Offset1 = 0;
-  int64_t Offset2 = 0;
-  bool isGA1 = TLI->isGAPlusOffset(Loc.getNode(), GV1, Offset1);
-  bool isGA2 = TLI->isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2);
-  if (isGA1 && isGA2 && GV1 == GV2)
-    return Offset1 == (Offset2 + Dist*Bytes);
+
+  auto BaseLocDecomp = BaseIndexOffset::match(BaseLoc, *this);
+  auto LocDecomp = BaseIndexOffset::match(Loc, *this);
+
+  int64_t Offset = 0;
+  if (BaseLocDecomp.equalBaseIndex(LocDecomp, *this, Offset))
+    return (Dist * Bytes == Offset);
  return false;
}
Index: lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
@@ -11,8 +11,10 @@
#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/Target/TargetLowering.h"
namespace llvm {
@@ -26,32 +28,57 @@
  // Match GlobalAddresses
  if (Index == Other.Index)
-    if (GlobalAddressSDNode *A = dyn_cast<GlobalAddressSDNode>(Base))
-      if (GlobalAddressSDNode *B = dyn_cast<GlobalAddressSDNode>(Other.Base))
+    if (auto *A = dyn_cast<GlobalAddressSDNode>(Base))
+      if (auto *B = dyn_cast<GlobalAddressSDNode>(Other.Base))
        if (A->getGlobal() == B->getGlobal()) {
          Off += B->getOffset() - A->getOffset();
          return true;
        }
-  // TODO: we should be able to add FrameIndex analysis improvements here.
+  // Match FrameIndexes
+  if (Index == Other.Index)
+    if (auto *A = dyn_cast<FrameIndexSDNode>(Base))
+      if (auto *B = dyn_cast<FrameIndexSDNode>(Other.Base)) {
+        const MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+        Off += MFI.getObjectOffset(B->getIndex()) -
+               MFI.getObjectOffset(A->getIndex());
+        return true;
+      }
  return false;
}
/// Parses tree in Ptr for base, index, offset addresses.
-BaseIndexOffset BaseIndexOffset::match(SDValue Ptr) {
+BaseIndexOffset BaseIndexOffset::match(SDValue Ptr, const SelectionDAG &DAG) {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
  // (((B + I*M) + c)) + c ...
-  SDValue Base = Ptr;
+  SDValue Base = TLI.unwrapAddress(Ptr);
  SDValue Index = SDValue();
  int64_t Offset = 0;
  bool IsIndexSignExt = false;
-  // Consume constant adds
-  while (Base->getOpcode() == ISD::ADD &&
-         isa<ConstantSDNode>(Base->getOperand(1))) {
-    int64_t POffset = cast<ConstantSDNode>(Base->getOperand(1))->getSExtValue();
-    Offset += POffset;
-    Base = Base->getOperand(0);
+  // Consume constant adds & ors with appropriate masking.
+  while (Base->getOpcode() == ISD::ADD || Base->getOpcode() == ISD::OR) {
+    if (auto *C = dyn_cast<ConstantSDNode>(Base->getOperand(1))) {
+      if (Base->getOpcode() == ISD::ADD ||
+          (Base->getOpcode() == ISD::OR &&
+           DAG.MaskedValueIsZero(Base->getOperand(0), C->getAPIntValue()))) {
+        Offset += C->getSExtValue();
+        Base = Base->getOperand(0);
+        continue;
+      }
+    }
+    if (auto *C = dyn_cast<ConstantSDNode>(Base->getOperand(0))) {
+      if (Base->getOpcode() == ISD::ADD ||
+          (Base->getOpcode() == ISD::OR &&
+           DAG.MaskedValueIsZero(Base->getOperand(1), C->getAPIntValue()))) {
+        Offset += C->getSExtValue();
+        Base = Base->getOperand(1);
+        continue;
+      }
+    }
+    break;
  }
  if (Base->getOpcode() == ISD::ADD) {
Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -786,6 +786,8 @@
  /// This method returns the name of a target specific DAG node.
  const char *getTargetNodeName(unsigned Opcode) const override;
+  bool mergeStoresAfterLegalization() const override { return true; }
+
  bool isCheapToSpeculateCttz() const override;
  bool isCheapToSpeculateCtlz() const override;
@@ -841,6 +843,8 @@
                                     const SelectionDAG &DAG,
                                     unsigned Depth) const override;
+  SDValue unwrapAddress(SDValue N) const override;
+
  bool isGAPlusOffset(SDNode *N, const GlobalValue* &GA,
                      int64_t &Offset) const override;
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -26865,6 +26865,12 @@
  return 1;
}
+SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
+  if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
+    return N->getOperand(0);
+  return N;
+}
+
/// Returns true (and the GlobalValue and the offset) if the node is a
/// GlobalAddress + offset.
bool X86TargetLowering::isGAPlusOffset(SDNode *N, Index: test/CodeGen/BPF/undef.ll =================================================================== --- test/CodeGen/BPF/undef.ll +++ test/CodeGen/BPF/undef.ll @@ -13,36 +13,30 @@ ; Function Attrs: nounwind uwtable define i32 @ebpf_filter(%struct.__sk_buff* nocapture readnone %ebpf_packet) #0 section "socket1" { -; CHECK: r2 = r10 -; CHECK: r2 += -2 -; CHECK: r1 = 0 -; CHECK: *(u16 *)(r2 + 6) = r1 -; CHECK: *(u16 *)(r2 + 4) = r1 -; CHECK: *(u16 *)(r2 + 2) = r1 -; CHECK: r2 = 6 -; CHECK: *(u8 *)(r10 - 7) = r2 -; CHECK: r2 = 5 -; CHECK: *(u8 *)(r10 - 8) = r2 -; CHECK: r2 = 7 -; CHECK: *(u8 *)(r10 - 6) = r2 -; CHECK: r2 = 8 -; CHECK: *(u8 *)(r10 - 5) = r2 -; CHECK: r2 = 9 -; CHECK: *(u8 *)(r10 - 4) = r2 -; CHECK: r2 = 10 -; CHECK: *(u8 *)(r10 - 3) = r2 -; CHECK: *(u16 *)(r10 + 24) = r1 -; CHECK: *(u16 *)(r10 + 22) = r1 -; CHECK: *(u16 *)(r10 + 20) = r1 -; CHECK: *(u16 *)(r10 + 18) = r1 -; CHECK: *(u16 *)(r10 + 16) = r1 -; CHECK: *(u16 *)(r10 + 14) = r1 -; CHECK: *(u16 *)(r10 + 12) = r1 -; CHECK: *(u16 *)(r10 + 10) = r1 -; CHECK: *(u16 *)(r10 + 8) = r1 -; CHECK: *(u16 *)(r10 + 6) = r1 -; CHECK: *(u16 *)(r10 - 2) = r1 -; CHECK: *(u16 *)(r10 + 26) = r1 +; CHECK: r1 = r10 +; CHECK: r1 += -2 +; CHECK: r2 = 0 +; CHECK: *(u16 *)(r1 + 6) = r2 +; CHECK: *(u16 *)(r1 + 4) = r2 +; CHECK: *(u16 *)(r1 + 2) = r2 +; CHECK: r1 = 134678021 +; CHECK: *(u32 *)(r10 - 8) = r1 +; CHECK: r1 = 9 +; CHECK: *(u8 *)(r10 - 4) = r1 +; CHECK: r1 = 10 +; CHECK: *(u8 *)(r10 - 3) = r1 +; CHECK: *(u16 *)(r10 + 24) = r2 +; CHECK: *(u16 *)(r10 + 22) = r2 +; CHECK: *(u16 *)(r10 + 20) = r2 +; CHECK: *(u16 *)(r10 + 18) = r2 +; CHECK: *(u16 *)(r10 + 16) = r2 +; CHECK: *(u16 *)(r10 + 14) = r2 +; CHECK: *(u16 *)(r10 + 12) = r2 +; CHECK: *(u16 *)(r10 + 10) = r2 +; CHECK: *(u16 *)(r10 + 8) = r2 +; CHECK: *(u16 *)(r10 + 6) = r2 +; CHECK: *(u16 *)(r10 - 2) = r2 +; CHECK: *(u16 *)(r10 + 26) = r2 ; CHECK: r2 = r10 ; CHECK: r2 += -8 ; CHECK: r1 = ll Index: test/CodeGen/MSP430/Inst16mm.ll =================================================================== --- test/CodeGen/MSP430/Inst16mm.ll +++ test/CodeGen/MSP430/Inst16mm.ll @@ -64,6 +64,6 @@ %0 = load i16, i16* %retval ; [#uses=1] ret i16 %0 ; CHECK-LABEL: mov2: -; CHECK: mov.w 2(r1), 6(r1) -; CHECK: mov.w 0(r1), 4(r1) +; CHECK-DAG: mov.w 2(r1), 6(r1) +; CHECK-DAG: mov.w 0(r1), 4(r1) } Index: test/CodeGen/PowerPC/complex-return.ll =================================================================== --- test/CodeGen/PowerPC/complex-return.ll +++ test/CodeGen/PowerPC/complex-return.ll @@ -9,7 +9,7 @@ %x = alloca { ppc_fp128, ppc_fp128 }, align 16 %real = getelementptr inbounds { ppc_fp128, ppc_fp128 }, { ppc_fp128, ppc_fp128 }* %x, i32 0, i32 0 %imag = getelementptr inbounds { ppc_fp128, ppc_fp128 }, { ppc_fp128, ppc_fp128 }* %x, i32 0, i32 1 - store ppc_fp128 0xM400C0000000000000000000000000000, ppc_fp128* %real + store ppc_fp128 0xM400C0000000000300000000010000000, ppc_fp128* %real store ppc_fp128 0xMC00547AE147AE1483CA47AE147AE147A, ppc_fp128* %imag %x.realp = getelementptr inbounds { ppc_fp128, ppc_fp128 }, { ppc_fp128, ppc_fp128 }* %x, i32 0, i32 0 %x.real = load ppc_fp128, ppc_fp128* %x.realp Index: test/CodeGen/X86/MergeConsecutiveStores.ll =================================================================== --- test/CodeGen/X86/MergeConsecutiveStores.ll +++ test/CodeGen/X86/MergeConsecutiveStores.ll @@ -558,8 +558,7 @@ } ; This is a minimized test based on real code that was failing. -; We could merge stores (and loads) like this... 
- +; This should now be merged. define void @merge_vec_element_and_scalar_load([6 x i64]* %array) { %idx0 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 0 %idx1 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 1 @@ -576,15 +575,11 @@ ret void ; CHECK-LABEL: merge_vec_element_and_scalar_load -; CHECK: movq (%rdi), %rax -; CHECK-NEXT: movq 8(%rdi), %rcx -; CHECK-NEXT: movq %rax, 32(%rdi) -; CHECK-NEXT: movq %rcx, 40(%rdi) +; CHECK: vmovups (%rdi), %xmm0 +; CHECK-NEXT: vmovups %xmm0, 32(%rdi) ; CHECK-NEXT: retq } - - ; Don't let a non-consecutive store thwart merging of the last two. define void @almost_consecutive_stores(i8* %p) { store i8 0, i8* %p @@ -601,3 +596,29 @@ ; CHECK-DAG: movw $770, 2(%rdi) ; CHECK: retq } + +; We should be able to merge these. +define void @merge_bitcast(<4 x i32> %v, float* %ptr) { + %fv = bitcast <4 x i32> %v to <4 x float> + + %vecext1 = extractelement <4 x i32> %v, i32 1 + %vecext2 = extractelement <4 x i32> %v, i32 2 + %vecext3 = extractelement <4 x i32> %v, i32 3 + %f0 = extractelement <4 x float> %fv, i32 0 + %f1 = bitcast i32 %vecext1 to float + %f2 = bitcast i32 %vecext2 to float + %f3 = bitcast i32 %vecext3 to float + %idx0 = getelementptr inbounds float, float* %ptr, i64 0 + %idx1 = getelementptr inbounds float, float* %ptr, i64 1 + %idx2 = getelementptr inbounds float, float* %ptr, i64 2 + %idx3 = getelementptr inbounds float, float* %ptr, i64 3 + store float %f0, float* %idx0, align 4 + store float %f1, float* %idx1, align 4 + store float %f2, float* %idx2, align 4 + store float %f3, float* %idx3, align 4 + ret void + +; CHECK-LABEL: merge_bitcast +; CHECK: vmovups %xmm0, (%rdi) +; CHECK-NEXT: retq +} Index: test/CodeGen/X86/bigstructret.ll =================================================================== --- test/CodeGen/X86/bigstructret.ll +++ test/CodeGen/X86/bigstructret.ll @@ -19,10 +19,9 @@ } ; CHECK: ReturnBigStruct2 -; CHECK: movl $48, 4(%ecx) -; CHECK: movb $1, 2(%ecx) -; CHECK: movb $1, 1(%ecx) -; CHECK: movb $0, (%ecx) +; CHECK-DAG: movl $48, 4(%ecx) +; CHECK-DAG: movb $1, 2(%ecx) +; CHECK-DAG: movw $256, (%ecx) define fastcc %1 @ReturnBigStruct2() nounwind readnone { entry: Index: test/CodeGen/X86/bitcast-i256.ll =================================================================== --- test/CodeGen/X86/bitcast-i256.ll +++ test/CodeGen/X86/bitcast-i256.ll @@ -5,7 +5,6 @@ ret i256 %r ; CHECK: foo ; CHECK: vextractf128 -; CHECK: vpextrq -; CHECK: vpextrq +; CHECK: vmovups ; CHECK: ret } Index: test/CodeGen/X86/build-vector-128.ll =================================================================== --- test/CodeGen/X86/build-vector-128.ll +++ test/CodeGen/X86/build-vector-128.ll @@ -72,12 +72,10 @@ } define <2 x i64> @test_buildvector_v2i64(i64 %a0, i64 %a1) { -; SSE2-32-LABEL: test_buildvector_v2i64: -; SSE2-32: # BB#0: -; SSE2-32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE2-32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-32-NEXT: retl +; SSE-32-LABEL: test_buildvector_v2i64: +; SSE-32: # BB#0: +; SSE-32-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; SSE-32-NEXT: retl ; ; SSE-64-LABEL: test_buildvector_v2i64: ; SSE-64: # BB#0: @@ -86,20 +84,9 @@ ; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-64-NEXT: retq ; -; SSE41-32-LABEL: test_buildvector_v2i64: -; SSE41-32: # BB#0: -; SSE41-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE41-32-NEXT: pinsrd $1, {{[0-9]+}}(%esp), %xmm0 -; SSE41-32-NEXT: pinsrd $2, {{[0-9]+}}(%esp), %xmm0 
-; SSE41-32-NEXT: pinsrd $3, {{[0-9]+}}(%esp), %xmm0 -; SSE41-32-NEXT: retl -; ; AVX-32-LABEL: test_buildvector_v2i64: ; AVX-32: # BB#0: -; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 ; AVX-32-NEXT: retl ; ; AVX-64-LABEL: test_buildvector_v2i64: Index: test/CodeGen/X86/build-vector-256.ll =================================================================== --- test/CodeGen/X86/build-vector-256.ll +++ test/CodeGen/X86/build-vector-256.ll @@ -51,18 +51,10 @@ } define <4 x i64> @test_buildvector_v4i64(i64 %a0, i64 %a1, i64 %a2, i64 %a3) { -; AVX1-32-LABEL: test_buildvector_v4i64: -; AVX1-32: # BB#0: -; AVX1-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX1-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX1-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX1-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX1-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX1-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX1-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-32-NEXT: retl +; AVX-32-LABEL: test_buildvector_v4i64: +; AVX-32: # BB#0: +; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %ymm0 +; AVX-32-NEXT: retl ; ; AVX1-64-LABEL: test_buildvector_v4i64: ; AVX1-64: # BB#0: @@ -75,19 +67,6 @@ ; AVX1-64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-64-NEXT: retq ; -; AVX2-32-LABEL: test_buildvector_v4i64: -; AVX2-32: # BB#0: -; AVX2-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX2-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX2-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX2-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX2-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX2-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX2-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX2-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX2-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-32-NEXT: retl -; ; AVX2-64-LABEL: test_buildvector_v4i64: ; AVX2-64: # BB#0: ; AVX2-64-NEXT: vmovq %rcx, %xmm0 Index: test/CodeGen/X86/build-vector-512.ll =================================================================== --- test/CodeGen/X86/build-vector-512.ll +++ test/CodeGen/X86/build-vector-512.ll @@ -79,25 +79,7 @@ define <8 x i64> @test_buildvector_v8i64(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, i64 %a7) { ; AVX-32-LABEL: test_buildvector_v8i64: ; AVX-32: # BB#0: -; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; 
AVX-32-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX-32-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %zmm0 ; AVX-32-NEXT: retl ; ; AVX-64-LABEL: test_buildvector_v8i64: Index: test/CodeGen/X86/clear_upper_vector_element_bits.ll =================================================================== --- test/CodeGen/X86/clear_upper_vector_element_bits.ll +++ test/CodeGen/X86/clear_upper_vector_element_bits.ll @@ -1063,87 +1063,89 @@ ; ; AVX1-LABEL: _clearupper32xi8b: ; AVX1: # BB#0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: pushq %r15 ; AVX1-NEXT: pushq %r14 +; AVX1-NEXT: pushq %r13 +; AVX1-NEXT: pushq %r12 ; AVX1-NEXT: pushq %rbx -; AVX1-NEXT: vpextrq $1, %xmm0, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %r14 +; AVX1-NEXT: vmovq %xmm0, %rcx +; AVX1-NEXT: movq %rcx, %r8 +; AVX1-NEXT: movq %rcx, %r9 +; AVX1-NEXT: movq %rcx, %r10 +; AVX1-NEXT: movq %rcx, %r11 +; AVX1-NEXT: movq %rcx, %r14 +; AVX1-NEXT: movq %rcx, %r15 ; AVX1-NEXT: vpextrq $1, %xmm0, %rdx -; AVX1-NEXT: movq %rdx, %r8 -; AVX1-NEXT: movq %rdx, %r9 -; AVX1-NEXT: movq %rdx, %r11 -; AVX1-NEXT: movq %rdx, %rsi -; AVX1-NEXT: movq %rdx, %rdi -; AVX1-NEXT: movq %rdx, %rcx +; AVX1-NEXT: movq %rdx, %r12 +; AVX1-NEXT: movq %rdx, %r13 +; AVX1-NEXT: movq %rdx, %rbx ; AVX1-NEXT: movq %rdx, %rax +; AVX1-NEXT: movq %rdx, %rdi +; AVX1-NEXT: movq %rdx, %rsi +; AVX1-NEXT: movq %rdx, %rbp ; AVX1-NEXT: andb $15, %dl ; AVX1-NEXT: movb %dl, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: shrq $56, %rax -; AVX1-NEXT: andb $15, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movq %r14, %r10 -; AVX1-NEXT: shrq $48, %rcx +; AVX1-NEXT: movq %rcx, %rdx ; AVX1-NEXT: andb $15, %cl ; AVX1-NEXT: movb %cl, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movq %r14, %rdx -; AVX1-NEXT: shrq $40, %rdi -; AVX1-NEXT: andb $15, %dil -; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movq %r14, %rax -; AVX1-NEXT: shrq $32, %rsi +; AVX1-NEXT: shrq $56, %rbp +; AVX1-NEXT: andb $15, %bpl +; AVX1-NEXT: movb %bpl, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: shrq $48, %rsi ; AVX1-NEXT: andb $15, %sil ; AVX1-NEXT: movb %sil, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movq %r14, %rcx -; AVX1-NEXT: shrq $24, %r11 -; AVX1-NEXT: andb $15, %r11b -; AVX1-NEXT: movb %r11b, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movq %r14, %rsi -; AVX1-NEXT: shrq $16, %r9 -; AVX1-NEXT: andb $15, %r9b -; AVX1-NEXT: movb %r9b, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movq %r14, %rdi -; AVX1-NEXT: shrq $8, %r8 -; AVX1-NEXT: andb $15, %r8b -; AVX1-NEXT: movb %r8b, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movq %r14, %rbx -; AVX1-NEXT: andb $15, %r14b -; AVX1-NEXT: movb %r14b, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: shrq $8, %r10 -; AVX1-NEXT: shrq $16, %rdx -; AVX1-NEXT: shrq $24, %rax -; AVX1-NEXT: shrq $32, %rcx -; AVX1-NEXT: shrq $40, %rsi -; AVX1-NEXT: shrq $48, %rdi -; AVX1-NEXT: shrq $56, %rbx -; AVX1-NEXT: andb $15, %bl -; AVX1-NEXT: movb %bl, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: shrq $40, %rdi ; AVX1-NEXT: andb $15, %dil ; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: andb $15, %sil -; AVX1-NEXT: movb %sil, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: andb $15, %cl -; AVX1-NEXT: movb %cl, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: shrq $32, %rax ; AVX1-NEXT: andb $15, %al ; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: shrq $24, %rbx +; AVX1-NEXT: andb 
$15, %bl +; AVX1-NEXT: movb %bl, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: shrq $16, %r13 +; AVX1-NEXT: andb $15, %r13b +; AVX1-NEXT: movb %r13b, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: shrq $8, %r12 +; AVX1-NEXT: andb $15, %r12b +; AVX1-NEXT: movb %r12b, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: shrq $8, %r8 +; AVX1-NEXT: shrq $16, %r9 +; AVX1-NEXT: shrq $24, %r10 +; AVX1-NEXT: shrq $32, %r11 +; AVX1-NEXT: shrq $40, %r14 +; AVX1-NEXT: shrq $48, %r15 +; AVX1-NEXT: shrq $56, %rdx ; AVX1-NEXT: andb $15, %dl ; AVX1-NEXT: movb %dl, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: andb $15, %r15b +; AVX1-NEXT: movb %r15b, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: andb $15, %r14b +; AVX1-NEXT: movb %r14b, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: andb $15, %r11b +; AVX1-NEXT: movb %r11b, -{{[0-9]+}}(%rsp) ; AVX1-NEXT: andb $15, %r10b ; AVX1-NEXT: movb %r10b, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: andb $15, %r9b +; AVX1-NEXT: movb %r9b, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: andb $15, %r8b +; AVX1-NEXT: movb %r8b, -{{[0-9]+}}(%rsp) ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: movq %rax, %r8 +; AVX1-NEXT: movq %rax, %rcx ; AVX1-NEXT: movq %rax, %rdx ; AVX1-NEXT: movq %rax, %rsi ; AVX1-NEXT: movq %rax, %rdi +; AVX1-NEXT: movl %eax, %ebp ; AVX1-NEXT: movl %eax, %ebx -; AVX1-NEXT: movl %eax, %ecx ; AVX1-NEXT: vmovd %eax, %xmm1 ; AVX1-NEXT: shrl $8, %eax ; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 -; AVX1-NEXT: shrl $24, %ebx -; AVX1-NEXT: vpinsrb $3, %ebx, %xmm1, %xmm1 +; AVX1-NEXT: shrl $16, %ebx +; AVX1-NEXT: vpinsrb $2, %ebx, %xmm1, %xmm1 +; AVX1-NEXT: shrl $24, %ebp +; AVX1-NEXT: vpinsrb $3, %ebp, %xmm1, %xmm1 ; AVX1-NEXT: shrq $32, %rdi ; AVX1-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1 ; AVX1-NEXT: shrq $40, %rsi @@ -1153,8 +1155,8 @@ ; AVX1-NEXT: shrq $48, %rdx ; AVX1-NEXT: vpinsrb $6, %edx, %xmm1, %xmm1 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: shrq $56, %r8 -; AVX1-NEXT: vpinsrb $7, %r8d, %xmm1, %xmm0 +; AVX1-NEXT: shrq $56, %rcx +; AVX1-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm0 ; AVX1-NEXT: movl %eax, %ecx ; AVX1-NEXT: shrl $8, %ecx ; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 @@ -1222,92 +1224,98 @@ ; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: popq %rbx +; AVX1-NEXT: popq %r12 +; AVX1-NEXT: popq %r13 ; AVX1-NEXT: popq %r14 +; AVX1-NEXT: popq %r15 +; AVX1-NEXT: popq %rbp ; AVX1-NEXT: retq ; ; AVX2-LABEL: _clearupper32xi8b: ; AVX2: # BB#0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 ; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 ; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: vpextrq $1, %xmm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %r14 +; AVX2-NEXT: vmovq %xmm0, %rcx +; AVX2-NEXT: movq %rcx, %r8 +; AVX2-NEXT: movq %rcx, %r9 +; AVX2-NEXT: movq %rcx, %r10 +; AVX2-NEXT: movq %rcx, %r11 +; AVX2-NEXT: movq %rcx, %r14 +; AVX2-NEXT: movq %rcx, %r15 ; AVX2-NEXT: vpextrq $1, %xmm0, %rdx -; AVX2-NEXT: movq %rdx, %r8 -; AVX2-NEXT: movq %rdx, %r9 -; AVX2-NEXT: movq %rdx, %r11 -; AVX2-NEXT: movq %rdx, %rsi -; AVX2-NEXT: movq %rdx, %rdi -; AVX2-NEXT: movq %rdx, %rcx +; AVX2-NEXT: movq %rdx, %r12 +; AVX2-NEXT: movq %rdx, %r13 +; AVX2-NEXT: movq %rdx, %rbx ; AVX2-NEXT: movq %rdx, %rax +; AVX2-NEXT: movq %rdx, %rdi +; AVX2-NEXT: movq %rdx, %rsi +; AVX2-NEXT: movq %rdx, %rbp ; AVX2-NEXT: andb $15, %dl ; AVX2-NEXT: movb %dl, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: shrq $56, %rax -; AVX2-NEXT: andb $15, %al -; AVX2-NEXT: movb %al, 
-{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %r14, %r10 -; AVX2-NEXT: shrq $48, %rcx +; AVX2-NEXT: movq %rcx, %rdx ; AVX2-NEXT: andb $15, %cl ; AVX2-NEXT: movb %cl, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %r14, %rdx -; AVX2-NEXT: shrq $40, %rdi -; AVX2-NEXT: andb $15, %dil -; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %r14, %rax -; AVX2-NEXT: shrq $32, %rsi +; AVX2-NEXT: shrq $56, %rbp +; AVX2-NEXT: andb $15, %bpl +; AVX2-NEXT: movb %bpl, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: shrq $48, %rsi ; AVX2-NEXT: andb $15, %sil ; AVX2-NEXT: movb %sil, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %r14, %rcx -; AVX2-NEXT: shrq $24, %r11 -; AVX2-NEXT: andb $15, %r11b -; AVX2-NEXT: movb %r11b, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %r14, %rsi -; AVX2-NEXT: shrq $16, %r9 -; AVX2-NEXT: andb $15, %r9b -; AVX2-NEXT: movb %r9b, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %r14, %rdi -; AVX2-NEXT: shrq $8, %r8 -; AVX2-NEXT: andb $15, %r8b -; AVX2-NEXT: movb %r8b, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %r14, %rbx -; AVX2-NEXT: andb $15, %r14b -; AVX2-NEXT: movb %r14b, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: shrq $8, %r10 -; AVX2-NEXT: shrq $16, %rdx -; AVX2-NEXT: shrq $24, %rax -; AVX2-NEXT: shrq $32, %rcx -; AVX2-NEXT: shrq $40, %rsi -; AVX2-NEXT: shrq $48, %rdi -; AVX2-NEXT: shrq $56, %rbx -; AVX2-NEXT: andb $15, %bl -; AVX2-NEXT: movb %bl, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: shrq $40, %rdi ; AVX2-NEXT: andb $15, %dil ; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: andb $15, %sil -; AVX2-NEXT: movb %sil, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: andb $15, %cl -; AVX2-NEXT: movb %cl, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: shrq $32, %rax ; AVX2-NEXT: andb $15, %al ; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: shrq $24, %rbx +; AVX2-NEXT: andb $15, %bl +; AVX2-NEXT: movb %bl, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: shrq $16, %r13 +; AVX2-NEXT: andb $15, %r13b +; AVX2-NEXT: movb %r13b, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: shrq $8, %r12 +; AVX2-NEXT: andb $15, %r12b +; AVX2-NEXT: movb %r12b, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: shrq $8, %r8 +; AVX2-NEXT: shrq $16, %r9 +; AVX2-NEXT: shrq $24, %r10 +; AVX2-NEXT: shrq $32, %r11 +; AVX2-NEXT: shrq $40, %r14 +; AVX2-NEXT: shrq $48, %r15 +; AVX2-NEXT: shrq $56, %rdx ; AVX2-NEXT: andb $15, %dl ; AVX2-NEXT: movb %dl, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: andb $15, %r15b +; AVX2-NEXT: movb %r15b, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: andb $15, %r14b +; AVX2-NEXT: movb %r14b, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: andb $15, %r11b +; AVX2-NEXT: movb %r11b, -{{[0-9]+}}(%rsp) ; AVX2-NEXT: andb $15, %r10b ; AVX2-NEXT: movb %r10b, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: andb $15, %r9b +; AVX2-NEXT: movb %r9b, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: andb $15, %r8b +; AVX2-NEXT: movb %r8b, -{{[0-9]+}}(%rsp) ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: movq %rax, %r8 +; AVX2-NEXT: movq %rax, %rcx ; AVX2-NEXT: movq %rax, %rdx ; AVX2-NEXT: movq %rax, %rsi ; AVX2-NEXT: movq %rax, %rdi +; AVX2-NEXT: movl %eax, %ebp ; AVX2-NEXT: movl %eax, %ebx -; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: vmovd %eax, %xmm1 ; AVX2-NEXT: shrl $8, %eax ; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 -; AVX2-NEXT: shrl $16, %ecx -; AVX2-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 -; AVX2-NEXT: shrl $24, %ebx -; AVX2-NEXT: vpinsrb $3, %ebx, %xmm1, %xmm1 +; AVX2-NEXT: shrl $16, %ebx +; AVX2-NEXT: vpinsrb $2, %ebx, %xmm1, %xmm1 +; AVX2-NEXT: shrl $24, %ebp +; AVX2-NEXT: vpinsrb $3, %ebp, %xmm1, %xmm1 ; AVX2-NEXT: shrq $32, %rdi ; AVX2-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1 ; AVX2-NEXT: shrq $40, %rsi @@ -1317,8 +1325,8 @@ ; AVX2-NEXT: shrq $48, %rdx ; AVX2-NEXT: 
vpinsrb $6, %edx, %xmm1, %xmm1 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: shrq $56, %r8 -; AVX2-NEXT: vpinsrb $7, %r8d, %xmm1, %xmm0 +; AVX2-NEXT: shrq $56, %rcx +; AVX2-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm0 ; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: shrl $8, %ecx ; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 @@ -1386,7 +1394,11 @@ ; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 ; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq %x4 = bitcast <32 x i8> %0 to <64 x i4> %r0 = insertelement <64 x i4> %x4, i4 zeroinitializer, i32 1 Index: test/CodeGen/X86/constant-combines.ll =================================================================== --- test/CodeGen/X86/constant-combines.ll +++ test/CodeGen/X86/constant-combines.ll @@ -15,12 +15,11 @@ ; ; CHECK-LABEL: PR22524: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: movl $0, 4(%rdi) ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: xorps %xmm1, %xmm1 ; CHECK-NEXT: mulss %xmm0, %xmm1 -; CHECK-NEXT: movl $0, (%rdi) +; CHECK-NEXT: movq $0, (%rdi) ; CHECK-NEXT: movss %xmm1, 4(%rdi) ; CHECK-NEXT: retq entry: Index: test/CodeGen/X86/fold-vector-sext-crash2.ll =================================================================== --- test/CodeGen/X86/fold-vector-sext-crash2.ll +++ test/CodeGen/X86/fold-vector-sext-crash2.ll @@ -53,8 +53,10 @@ ret <2 x i256> %Shuff ; X64-LABEL: test_zext1 - ; X64: movq $0 - ; X64-NEXT: movq $0 + ; X64: xorps %xmm0, %xmm0 + ; X64: movaps %xmm0 + ; X64: movaps %xmm0 + ; X64: movaps %xmm0 ; X64-NEXT: movq $0 ; X64-NEXT: movq $254 @@ -75,8 +77,10 @@ ret <2 x i256> %Shuff ; X64-LABEL: test_zext2 - ; X64: movq $0 - ; X64-NEXT: movq $0 + ; X64: xorps %xmm0, %xmm0 + ; X64-NEXT: movaps %xmm0 + ; X64-NEXT: movaps %xmm0 + ; X64-NEXT: movaps %xmm0 ; X64-NEXT: movq $-1 ; X64-NEXT: movq $-2 Index: test/CodeGen/X86/legalize-shl-vec.ll =================================================================== --- test/CodeGen/X86/legalize-shl-vec.ll +++ test/CodeGen/X86/legalize-shl-vec.ll @@ -26,14 +26,11 @@ ; ; X64-LABEL: test_shl: ; X64: # BB#0: -; X64-NEXT: movq $0, 56(%rdi) -; X64-NEXT: movq $0, 48(%rdi) -; X64-NEXT: movq $0, 40(%rdi) -; X64-NEXT: movq $0, 32(%rdi) -; X64-NEXT: movq $0, 24(%rdi) -; X64-NEXT: movq $0, 16(%rdi) -; X64-NEXT: movq $0, 8(%rdi) -; X64-NEXT: movq $0, (%rdi) +; X64-NEXT: xorps %xmm0, %xmm0 +; X64-NEXT: movaps %xmm0, 48(%rdi) +; X64-NEXT: movaps %xmm0, 32(%rdi) +; X64-NEXT: movaps %xmm0, 16(%rdi) +; X64-NEXT: movaps %xmm0, (%rdi) ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: retq %Amt = insertelement <2 x i256> undef, i256 -1, i32 0 @@ -65,14 +62,11 @@ ; ; X64-LABEL: test_srl: ; X64: # BB#0: -; X64-NEXT: movq $0, 56(%rdi) -; X64-NEXT: movq $0, 48(%rdi) -; X64-NEXT: movq $0, 40(%rdi) -; X64-NEXT: movq $0, 32(%rdi) -; X64-NEXT: movq $0, 24(%rdi) -; X64-NEXT: movq $0, 16(%rdi) -; X64-NEXT: movq $0, 8(%rdi) -; X64-NEXT: movq $0, (%rdi) +; X64-NEXT: xorps %xmm0, %xmm0 +; X64-NEXT: movaps %xmm0, 48(%rdi) +; X64-NEXT: movaps %xmm0, 32(%rdi) +; X64-NEXT: movaps %xmm0, 16(%rdi) +; X64-NEXT: movaps %xmm0, (%rdi) ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: retq %Amt = insertelement <2 x i256> undef, i256 -1, i32 0 Index: test/CodeGen/X86/merge-consecutive-loads-128.ll =================================================================== --- test/CodeGen/X86/merge-consecutive-loads-128.ll +++ test/CodeGen/X86/merge-consecutive-loads-128.ll @@ -526,44 +526,28 @@ ; 
; X32-SSE1-LABEL: merge_8i16_i16_23u567u9: ; X32-SSE1: # BB#0: -; X32-SSE1-NEXT: pushl %ebp +; X32-SSE1-NEXT: pushl %edi ; X32-SSE1-NEXT: .Lcfi6: ; X32-SSE1-NEXT: .cfi_def_cfa_offset 8 -; X32-SSE1-NEXT: pushl %ebx +; X32-SSE1-NEXT: pushl %esi ; X32-SSE1-NEXT: .Lcfi7: ; X32-SSE1-NEXT: .cfi_def_cfa_offset 12 -; X32-SSE1-NEXT: pushl %edi ; X32-SSE1-NEXT: .Lcfi8: -; X32-SSE1-NEXT: .cfi_def_cfa_offset 16 -; X32-SSE1-NEXT: pushl %esi +; X32-SSE1-NEXT: .cfi_offset %esi, -12 ; X32-SSE1-NEXT: .Lcfi9: -; X32-SSE1-NEXT: .cfi_def_cfa_offset 20 -; X32-SSE1-NEXT: .Lcfi10: -; X32-SSE1-NEXT: .cfi_offset %esi, -20 -; X32-SSE1-NEXT: .Lcfi11: -; X32-SSE1-NEXT: .cfi_offset %edi, -16 -; X32-SSE1-NEXT: .Lcfi12: -; X32-SSE1-NEXT: .cfi_offset %ebx, -12 -; X32-SSE1-NEXT: .Lcfi13: -; X32-SSE1-NEXT: .cfi_offset %ebp, -8 +; X32-SSE1-NEXT: .cfi_offset %edi, -8 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-SSE1-NEXT: movzwl 4(%ecx), %edx -; X32-SSE1-NEXT: movzwl 6(%ecx), %esi -; X32-SSE1-NEXT: movzwl 10(%ecx), %edi -; X32-SSE1-NEXT: movzwl 12(%ecx), %ebx -; X32-SSE1-NEXT: movzwl 14(%ecx), %ebp +; X32-SSE1-NEXT: movl 4(%ecx), %edx +; X32-SSE1-NEXT: movl 10(%ecx), %esi +; X32-SSE1-NEXT: movzwl 14(%ecx), %edi ; X32-SSE1-NEXT: movzwl 18(%ecx), %ecx -; X32-SSE1-NEXT: movw %bp, 10(%eax) -; X32-SSE1-NEXT: movw %bx, 8(%eax) +; X32-SSE1-NEXT: movw %di, 10(%eax) ; X32-SSE1-NEXT: movw %cx, 14(%eax) -; X32-SSE1-NEXT: movw %si, 2(%eax) -; X32-SSE1-NEXT: movw %dx, (%eax) -; X32-SSE1-NEXT: movw %di, 6(%eax) +; X32-SSE1-NEXT: movl %edx, (%eax) +; X32-SSE1-NEXT: movl %esi, 6(%eax) ; X32-SSE1-NEXT: popl %esi ; X32-SSE1-NEXT: popl %edi -; X32-SSE1-NEXT: popl %ebx -; X32-SSE1-NEXT: popl %ebp ; X32-SSE1-NEXT: retl $4 ; ; X32-SSE41-LABEL: merge_8i16_i16_23u567u9: @@ -607,10 +591,8 @@ ; X32-SSE1: # BB#0: ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-SSE1-NEXT: movzwl 6(%ecx), %edx -; X32-SSE1-NEXT: movzwl 8(%ecx), %ecx -; X32-SSE1-NEXT: movw %cx, 2(%eax) -; X32-SSE1-NEXT: movw %dx, (%eax) +; X32-SSE1-NEXT: movl 6(%ecx), %ecx +; X32-SSE1-NEXT: movl %ecx, (%eax) ; X32-SSE1-NEXT: retl $4 ; ; X32-SSE41-LABEL: merge_8i16_i16_34uuuuuu: @@ -640,24 +622,14 @@ ; ; X32-SSE1-LABEL: merge_8i16_i16_45u7zzzz: ; X32-SSE1: # BB#0: -; X32-SSE1-NEXT: pushl %esi -; X32-SSE1-NEXT: .Lcfi14: -; X32-SSE1-NEXT: .cfi_def_cfa_offset 8 -; X32-SSE1-NEXT: .Lcfi15: -; X32-SSE1-NEXT: .cfi_offset %esi, -8 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-SSE1-NEXT: movzwl 8(%ecx), %edx -; X32-SSE1-NEXT: movzwl 10(%ecx), %esi +; X32-SSE1-NEXT: movl 8(%ecx), %edx ; X32-SSE1-NEXT: movzwl 14(%ecx), %ecx -; X32-SSE1-NEXT: movw %si, 2(%eax) -; X32-SSE1-NEXT: movw %dx, (%eax) +; X32-SSE1-NEXT: movl %edx, (%eax) ; X32-SSE1-NEXT: movw %cx, 6(%eax) -; X32-SSE1-NEXT: movw $0, 14(%eax) -; X32-SSE1-NEXT: movw $0, 12(%eax) -; X32-SSE1-NEXT: movw $0, 10(%eax) -; X32-SSE1-NEXT: movw $0, 8(%eax) -; X32-SSE1-NEXT: popl %esi +; X32-SSE1-NEXT: movl $0, 12(%eax) +; X32-SSE1-NEXT: movl $0, 8(%eax) ; X32-SSE1-NEXT: retl $4 ; ; X32-SSE41-LABEL: merge_8i16_i16_45u7zzzz: @@ -694,64 +666,44 @@ ; ; X32-SSE1-LABEL: merge_16i8_i8_01u3456789ABCDuF: ; X32-SSE1: # BB#0: +; X32-SSE1-NEXT: pushl %ebp +; X32-SSE1-NEXT: .Lcfi10: +; X32-SSE1-NEXT: .cfi_def_cfa_offset 8 ; X32-SSE1-NEXT: pushl %ebx +; X32-SSE1-NEXT: .Lcfi11: +; X32-SSE1-NEXT: .cfi_def_cfa_offset 12 +; X32-SSE1-NEXT: pushl %edi +; X32-SSE1-NEXT: .Lcfi12: +; X32-SSE1-NEXT: .cfi_def_cfa_offset 16 +; 
X32-SSE1-NEXT: pushl %esi +; X32-SSE1-NEXT: .Lcfi13: +; X32-SSE1-NEXT: .cfi_def_cfa_offset 20 +; X32-SSE1-NEXT: .Lcfi14: +; X32-SSE1-NEXT: .cfi_offset %esi, -20 +; X32-SSE1-NEXT: .Lcfi15: +; X32-SSE1-NEXT: .cfi_offset %edi, -16 ; X32-SSE1-NEXT: .Lcfi16: -; X32-SSE1-NEXT: .cfi_def_cfa_offset 8 -; X32-SSE1-NEXT: subl $12, %esp +; X32-SSE1-NEXT: .cfi_offset %ebx, -12 ; X32-SSE1-NEXT: .Lcfi17: -; X32-SSE1-NEXT: .cfi_def_cfa_offset 20 -; X32-SSE1-NEXT: .Lcfi18: -; X32-SSE1-NEXT: .cfi_offset %ebx, -8 +; X32-SSE1-NEXT: .cfi_offset %ebp, -8 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-SSE1-NEXT: movb (%ecx), %dl -; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill -; X32-SSE1-NEXT: movb 1(%ecx), %dl -; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill -; X32-SSE1-NEXT: movb 3(%ecx), %dl -; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill -; X32-SSE1-NEXT: movb 4(%ecx), %dl -; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill -; X32-SSE1-NEXT: movb 5(%ecx), %dl -; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill -; X32-SSE1-NEXT: movb 6(%ecx), %dl -; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill -; X32-SSE1-NEXT: movb 7(%ecx), %dl -; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill -; X32-SSE1-NEXT: movb 8(%ecx), %dl -; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill -; X32-SSE1-NEXT: movb 9(%ecx), %dl -; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill -; X32-SSE1-NEXT: movb 10(%ecx), %bh -; X32-SSE1-NEXT: movb 11(%ecx), %bl -; X32-SSE1-NEXT: movb 12(%ecx), %dh +; X32-SSE1-NEXT: movzwl (%ecx), %ebp +; X32-SSE1-NEXT: movl 3(%ecx), %esi +; X32-SSE1-NEXT: movl 7(%ecx), %edi +; X32-SSE1-NEXT: movzwl 11(%ecx), %ebx ; X32-SSE1-NEXT: movb 13(%ecx), %dl ; X32-SSE1-NEXT: movb 15(%ecx), %cl ; X32-SSE1-NEXT: movb %dl, 13(%eax) -; X32-SSE1-NEXT: movb %dh, 12(%eax) ; X32-SSE1-NEXT: movb %cl, 15(%eax) -; X32-SSE1-NEXT: movb %bl, 11(%eax) -; X32-SSE1-NEXT: movb %bh, 10(%eax) -; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload -; X32-SSE1-NEXT: movb %cl, 9(%eax) -; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload -; X32-SSE1-NEXT: movb %cl, 8(%eax) -; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload -; X32-SSE1-NEXT: movb %cl, 7(%eax) -; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload -; X32-SSE1-NEXT: movb %cl, 6(%eax) -; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload -; X32-SSE1-NEXT: movb %cl, 5(%eax) -; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload -; X32-SSE1-NEXT: movb %cl, 4(%eax) -; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload -; X32-SSE1-NEXT: movb %cl, 1(%eax) -; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload -; X32-SSE1-NEXT: movb %cl, (%eax) -; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload -; X32-SSE1-NEXT: movb %cl, 3(%eax) -; X32-SSE1-NEXT: addl $12, %esp +; X32-SSE1-NEXT: movw %bx, 11(%eax) +; X32-SSE1-NEXT: movl %edi, 7(%eax) +; X32-SSE1-NEXT: movw %bp, (%eax) +; X32-SSE1-NEXT: movl %esi, 3(%eax) +; X32-SSE1-NEXT: popl %esi +; X32-SSE1-NEXT: popl %edi ; X32-SSE1-NEXT: popl %ebx +; X32-SSE1-NEXT: popl %ebp ; X32-SSE1-NEXT: retl $4 ; ; X32-SSE41-LABEL: merge_16i8_i8_01u3456789ABCDuF: @@ -819,17 +771,13 @@ ; X32-SSE1: # BB#0: ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-SSE1-NEXT: movb (%ecx), %dl -; X32-SSE1-NEXT: movb 1(%ecx), %dh +; X32-SSE1-NEXT: movzwl (%ecx), %edx ; X32-SSE1-NEXT: movb 3(%ecx), %cl -; X32-SSE1-NEXT: movb %dh, 
1(%eax) -; X32-SSE1-NEXT: movb %dl, (%eax) +; X32-SSE1-NEXT: movw %dx, (%eax) ; X32-SSE1-NEXT: movb %cl, 3(%eax) ; X32-SSE1-NEXT: movb $0, 15(%eax) -; X32-SSE1-NEXT: movb $0, 14(%eax) -; X32-SSE1-NEXT: movb $0, 13(%eax) -; X32-SSE1-NEXT: movb $0, 7(%eax) -; X32-SSE1-NEXT: movb $0, 6(%eax) +; X32-SSE1-NEXT: movw $0, 13(%eax) +; X32-SSE1-NEXT: movw $0, 6(%eax) ; X32-SSE1-NEXT: retl $4 ; ; X32-SSE41-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz: @@ -867,35 +815,14 @@ ; ; X32-SSE1-LABEL: merge_16i8_i8_0123uu67uuuuuzzz: ; X32-SSE1: # BB#0: -; X32-SSE1-NEXT: pushl %ebx -; X32-SSE1-NEXT: .Lcfi19: -; X32-SSE1-NEXT: .cfi_def_cfa_offset 8 -; X32-SSE1-NEXT: pushl %eax -; X32-SSE1-NEXT: .Lcfi20: -; X32-SSE1-NEXT: .cfi_def_cfa_offset 12 -; X32-SSE1-NEXT: .Lcfi21: -; X32-SSE1-NEXT: .cfi_offset %ebx, -8 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-SSE1-NEXT: movb (%ecx), %dl -; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill -; X32-SSE1-NEXT: movb 1(%ecx), %dh -; X32-SSE1-NEXT: movb 2(%ecx), %bl -; X32-SSE1-NEXT: movb 3(%ecx), %bh -; X32-SSE1-NEXT: movb 6(%ecx), %dl -; X32-SSE1-NEXT: movb 7(%ecx), %cl -; X32-SSE1-NEXT: movb %cl, 7(%eax) -; X32-SSE1-NEXT: movb %dl, 6(%eax) -; X32-SSE1-NEXT: movb %bh, 3(%eax) -; X32-SSE1-NEXT: movb %bl, 2(%eax) -; X32-SSE1-NEXT: movb %dh, 1(%eax) -; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload -; X32-SSE1-NEXT: movb %cl, (%eax) +; X32-SSE1-NEXT: movl (%ecx), %edx +; X32-SSE1-NEXT: movzwl 6(%ecx), %ecx +; X32-SSE1-NEXT: movw %cx, 6(%eax) +; X32-SSE1-NEXT: movl %edx, (%eax) ; X32-SSE1-NEXT: movb $0, 15(%eax) -; X32-SSE1-NEXT: movb $0, 14(%eax) -; X32-SSE1-NEXT: movb $0, 13(%eax) -; X32-SSE1-NEXT: addl $4, %esp -; X32-SSE1-NEXT: popl %ebx +; X32-SSE1-NEXT: movw $0, 13(%eax) ; X32-SSE1-NEXT: retl $4 ; ; X32-SSE41-LABEL: merge_16i8_i8_0123uu67uuuuuzzz: @@ -990,14 +917,14 @@ ; X32-SSE1-LABEL: merge_2i64_i64_12_volatile: ; X32-SSE1: # BB#0: ; X32-SSE1-NEXT: pushl %edi -; X32-SSE1-NEXT: .Lcfi22: +; X32-SSE1-NEXT: .Lcfi18: ; X32-SSE1-NEXT: .cfi_def_cfa_offset 8 ; X32-SSE1-NEXT: pushl %esi -; X32-SSE1-NEXT: .Lcfi23: +; X32-SSE1-NEXT: .Lcfi19: ; X32-SSE1-NEXT: .cfi_def_cfa_offset 12 -; X32-SSE1-NEXT: .Lcfi24: +; X32-SSE1-NEXT: .Lcfi20: ; X32-SSE1-NEXT: .cfi_offset %esi, -12 -; X32-SSE1-NEXT: .Lcfi25: +; X32-SSE1-NEXT: .Lcfi21: ; X32-SSE1-NEXT: .cfi_offset %edi, -8 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx Index: test/CodeGen/X86/merge-store-partially-alias-loads.ll =================================================================== --- test/CodeGen/X86/merge-store-partially-alias-loads.ll +++ test/CodeGen/X86/merge-store-partially-alias-loads.ll @@ -13,7 +13,7 @@ ; X86-NEXT: movb [[HI1]], 3([[BASEREG]]) ; X86-NEXT: retq -; DBGDAG-LABEL: Optimized lowered selection DAG: BB#0 'merge_store_partial_overlap_load:' +; DBGDAG-LABEL: Optimized legalized selection DAG: BB#0 'merge_store_partial_overlap_load:' ; DBGDAG: [[ENTRYTOKEN:t[0-9]+]]: ch = EntryToken ; DBGDAG-DAG: [[BASEPTR:t[0-9]+]]: i64,ch = CopyFromReg [[ENTRYTOKEN]], ; DBGDAG-DAG: [[ADDPTR:t[0-9]+]]: i64 = add [[BASEPTR]], Constant:i64<2> @@ -27,7 +27,7 @@ ; DBGDAG: X86ISD::RET_FLAG t{{[0-9]+}}, -; DBGDAG: Type-legalized selection DAG: BB#0 'merge_store_partial_overlap_load:' +; DBGDAG-LABEL: Instruction selection begins define void @merge_store_partial_overlap_load([4 x i8]* %tmp) { %tmp8 = getelementptr inbounds [4 x i8], [4 x i8]* %tmp, i32 0, i8 0 %tmp10 = getelementptr inbounds [4 x i8], [4 x i8]* %tmp, i32 0, i8 
1 Index: test/CodeGen/X86/no-sse2-avg.ll =================================================================== --- test/CodeGen/X86/no-sse2-avg.ll +++ test/CodeGen/X86/no-sse2-avg.ll @@ -5,22 +5,8 @@ define <16 x i8> @PR27973() { ; CHECK-LABEL: PR27973: ; CHECK: # BB#0: -; CHECK-NEXT: movb $0, 15(%rdi) -; CHECK-NEXT: movb $0, 14(%rdi) -; CHECK-NEXT: movb $0, 13(%rdi) -; CHECK-NEXT: movb $0, 12(%rdi) -; CHECK-NEXT: movb $0, 11(%rdi) -; CHECK-NEXT: movb $0, 10(%rdi) -; CHECK-NEXT: movb $0, 9(%rdi) -; CHECK-NEXT: movb $0, 8(%rdi) -; CHECK-NEXT: movb $0, 7(%rdi) -; CHECK-NEXT: movb $0, 6(%rdi) -; CHECK-NEXT: movb $0, 5(%rdi) -; CHECK-NEXT: movb $0, 4(%rdi) -; CHECK-NEXT: movb $0, 3(%rdi) -; CHECK-NEXT: movb $0, 2(%rdi) -; CHECK-NEXT: movb $0, 1(%rdi) -; CHECK-NEXT: movb $0, (%rdi) +; CHECK-NEXT: movq $0, 8(%rdi) +; CHECK-NEXT: movq $0, (%rdi) ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: retq %t0 = zext <16 x i8> zeroinitializer to <16 x i32> Index: test/CodeGen/X86/oddshuffles.ll =================================================================== --- test/CodeGen/X86/oddshuffles.ll +++ test/CodeGen/X86/oddshuffles.ll @@ -83,24 +83,21 @@ ; SSE42: # BB#0: ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] -; SSE42-NEXT: pextrd $2, %xmm0, 8(%rdi) -; SSE42-NEXT: movq %xmm1, (%rdi) +; SSE42-NEXT: movdqa %xmm1, (%rdi) ; SSE42-NEXT: retq ; ; AVX1-LABEL: v3i32: ; AVX1: # BB#0: ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] -; AVX1-NEXT: vpextrd $2, %xmm0, 8(%rdi) -; AVX1-NEXT: vmovq %xmm1, (%rdi) +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] +; AVX1-NEXT: vmovdqa %xmm0, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: v3i32: ; AVX2: # BB#0: ; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-NEXT: vpextrd $2, %xmm0, 8(%rdi) -; AVX2-NEXT: vmovq %xmm1, (%rdi) +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-NEXT: vmovdqa %xmm0, (%rdi) ; AVX2-NEXT: retq %r = shufflevector <2 x i32> %a, <2 x i32> %b, <3 x i32> store <3 x i32> %r, <3 x i32>* %p Index: test/CodeGen/X86/stdarg.ll =================================================================== --- test/CodeGen/X86/stdarg.ll +++ test/CodeGen/X86/stdarg.ll @@ -14,8 +14,7 @@ ; ; CHECK-DAG: movq {{.*}}, 192(%rsp) ; CHECK-DAG: movq {{.*}}, 184(%rsp) -; CHECK-DAG: movl {{.*}}, 180(%rsp) -; CHECK-DAG: movl {{.*}}, 176(%rsp) +; CHECK-DAG: movq {{.*}}, 176(%rsp) %ap3 = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %ap, i64 0, i64 0; <%struct.__va_list_tag*> [#uses=1] call void @bar(%struct.__va_list_tag* %ap3) nounwind call void @llvm.va_end(i8* %ap12) Index: test/CodeGen/X86/stores-merging.ll =================================================================== --- test/CodeGen/X86/stores-merging.ll +++ test/CodeGen/X86/stores-merging.ll @@ -13,9 +13,8 @@ ;; the same result in memory in the end. ; CHECK-LABEL: redundant_stores_merging: -; CHECK: movabsq $528280977409, %rax -; CHECK: movq %rax, e+4(%rip) -; CHECK: movl $456, e+8(%rip) +; CHECK: movabsq $1958505086977, %rax +; CHECK: movq %rax, e+4(%rip) define void @redundant_stores_merging() { entry: store i32 1, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 1), align 4 @@ -26,9 +25,8 @@ ;; This variant tests PR25154. 
; CHECK-LABEL: redundant_stores_merging_reverse: -; CHECK: movabsq $528280977409, %rax -; CHECK: movq %rax, e+4(%rip) -; CHECK: movl $456, e+8(%rip) +; CHECK: movabsq $1958505086977, %rax +; CHECK: movq %rax, e+4(%rip) define void @redundant_stores_merging_reverse() { entry: store i32 123, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 2), align 4 Index: test/CodeGen/X86/vector-shuffle-256-v4.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-256-v4.ll +++ test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -807,10 +807,10 @@ define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) { ; AVX1-LABEL: shuffle_v4i64_0412: ; AVX1: # BB#0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3] ; AVX1-NEXT: retq ; Index: test/CodeGen/X86/wide-integer-cmp.ll =================================================================== --- test/CodeGen/X86/wide-integer-cmp.ll +++ test/CodeGen/X86/wide-integer-cmp.ll @@ -101,8 +101,8 @@ ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi ; CHECK-NEXT: cmpl {{[0-9]+}}(%esp), %edx ; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: jge .LBB4_2 ; CHECK-NEXT: # BB#1: # %bb1 ; CHECK-NEXT: movl $1, %eax Index: test/CodeGen/X86/widen_arith-6.ll =================================================================== --- test/CodeGen/X86/widen_arith-6.ll +++ test/CodeGen/X86/widen_arith-6.ll @@ -10,9 +10,9 @@ ; CHECK-NEXT: movl %esp, %ebp ; CHECK-NEXT: andl $-16, %esp ; CHECK-NEXT: subl $48, %esp +; CHECK-NEXT: movl $1065353216, {{[0-9]+}}(%esp) # imm = 0x3F800000 ; CHECK-NEXT: movl $1077936128, {{[0-9]+}}(%esp) # imm = 0x40400000 ; CHECK-NEXT: movl $1073741824, {{[0-9]+}}(%esp) # imm = 0x40000000 -; CHECK-NEXT: movl $1065353216, {{[0-9]+}}(%esp) # imm = 0x3F800000 ; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp) ; CHECK-NEXT: movaps {{.*#+}} xmm0 = <1976.04004,1976.04004,1976.04004,u> ; CHECK-NEXT: jmp .LBB0_1