diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -599,7 +599,19 @@ /// Allow store merging for the specified type after legalization in addition /// to before legalization. This may transform stores that do not exist /// earlier (for example, stores created from intrinsics). - virtual bool mergeStoresAfterLegalization(EVT MemVT) const { + /// The answer may depend on the origin of the stored value; passing + /// `std::nullopt` as the source asks the general question, i.e. whether + /// *any* such merging is allowed post-legalization. + enum class StoreSource { + Unknown, + Constant, + ExtractVectorElt, + ExtractSubvector, + Load + }; + virtual bool + mergeStoresAfterLegalization(EVT MemVT, + std::optional<StoreSource> StoreSrc) const { return true; } diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -662,19 +662,19 @@ }; // Classify the origin of a stored value. - enum class StoreSource { Unknown, Constant, Extract, Load }; - StoreSource getStoreSource(SDValue StoreVal) { + TargetLoweringBase::StoreSource getStoreSource(SDValue StoreVal) { switch (StoreVal.getOpcode()) { case ISD::Constant: case ISD::ConstantFP: - return StoreSource::Constant; + return TargetLoweringBase::StoreSource::Constant; case ISD::EXTRACT_VECTOR_ELT: + return TargetLoweringBase::StoreSource::ExtractVectorElt; case ISD::EXTRACT_SUBVECTOR: - return StoreSource::Extract; + return TargetLoweringBase::StoreSource::ExtractSubvector; case ISD::LOAD: - return StoreSource::Load; + return TargetLoweringBase::StoreSource::Load; default: - return StoreSource::Unknown; + return TargetLoweringBase::StoreSource::Unknown; } } @@ -18658,14 +18658,15 @@ return; SDValue Val = peekThroughBitcasts(St->getValue()); - StoreSource StoreSrc = getStoreSource(Val); - assert(StoreSrc != StoreSource::Unknown && "Expected known source for store"); + TargetLoweringBase::StoreSource StoreSrc = getStoreSource(Val); + assert(StoreSrc != TargetLoweringBase::StoreSource::Unknown && + "Expected known source for store"); // Match on loadbaseptr if relevant. EVT MemVT = St->getMemoryVT(); BaseIndexOffset LBasePtr; EVT LoadVT; - if (StoreSrc == StoreSource::Load) { + if (StoreSrc == TargetLoweringBase::StoreSource::Load) { auto *Ld = cast<LoadSDNode>(Val); LBasePtr = BaseIndexOffset::match(Ld, DAG); LoadVT = Ld->getMemoryVT(); @@ -18694,7 +18695,7 @@ bool NoTypeMatch = (MemVT.isInteger()) ? !MemVT.bitsEq(Other->getMemoryVT()) : Other->getMemoryVT() != MemVT; switch (StoreSrc) { - case StoreSource::Load: { + case TargetLoweringBase::StoreSource::Load: { if (NoTypeMatch) return false; // The Load's Base Ptr must also match. @@ -18718,13 +18719,14 @@ return false; break; } - case StoreSource::Constant: + case TargetLoweringBase::StoreSource::Constant: if (NoTypeMatch) return false; if (!isIntOrFPConstant(OtherBC)) return false; break; - case StoreSource::Extract: + case TargetLoweringBase::StoreSource::ExtractVectorElt: + case TargetLoweringBase::StoreSource::ExtractSubvector: // Do not merge truncated stores here. if (Other->isTruncatingStore()) return false; @@ -19416,10 +19418,14 @@ return false; // Do not bother looking at stored values that are not constants, loads, or - // extracted vector elements. + // extracted vector elements/subvectors.
SDValue StoredVal = peekThroughBitcasts(St->getValue()); - const StoreSource StoreSrc = getStoreSource(StoredVal); - if (StoreSrc == StoreSource::Unknown) + const TargetLoweringBase::StoreSource StoreSrc = getStoreSource(StoredVal); + if (StoreSrc == TargetLoweringBase::StoreSource::Unknown) + return false; + + if (LegalTypes && + !TLI.mergeStoresAfterLegalization(St->getMemoryVT(), StoreSrc)) return false; SmallVector<MemOpLink, 8> StoreNodes; @@ -19440,7 +19446,7 @@ bool AllowVectors = !DAG.getMachineFunction().getFunction().hasFnAttribute( Attribute::NoImplicitFloat); bool IsNonTemporalStore = St->isNonTemporal(); - bool IsNonTemporalLoad = StoreSrc == StoreSource::Load && + bool IsNonTemporalLoad = StoreSrc == TargetLoweringBase::StoreSource::Load && cast<LoadSDNode>(StoredVal)->isNonTemporal(); // Store Merge attempts to merge the lowest stores. This generally @@ -19461,17 +19467,18 @@ // We have at least 2 consecutive stores. Try to merge them. assert(NumConsecutiveStores >= 2 && "Expected at least 2 stores"); switch (StoreSrc) { - case StoreSource::Constant: + case TargetLoweringBase::StoreSource::Constant: MadeChange |= tryStoreMergeOfConstants(StoreNodes, NumConsecutiveStores, MemVT, RootNode, AllowVectors); break; - case StoreSource::Extract: + case TargetLoweringBase::StoreSource::ExtractVectorElt: + case TargetLoweringBase::StoreSource::ExtractSubvector: MadeChange |= tryStoreMergeOfExtracts(StoreNodes, NumConsecutiveStores, MemVT, RootNode); break; - case StoreSource::Load: + case TargetLoweringBase::StoreSource::Load: MadeChange |= tryStoreMergeOfLoads(StoreNodes, NumConsecutiveStores, MemVT, RootNode, AllowVectors, IsNonTemporalStore, IsNonTemporalLoad); @@ -19771,7 +19778,8 @@ // Always perform this optimization before types are legal. If the target // prefers, also try this after legalization to catch stores that were created // by intrinsics or other nodes. - if (!LegalTypes || (TLI.mergeStoresAfterLegalization(ST->getMemoryVT()))) { + if (!LegalTypes || + (TLI.mergeStoresAfterLegalization(ST->getMemoryVT(), std::nullopt))) { while (true) { // There can be multiple store sequences on the same chain. // Keep trying to merge store sequences until we are unable to do so diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -883,7 +883,8 @@ /// illegal as the original, thus leading to an infinite legalisation loop. /// NOTE: Once BUILD_VECTOR is legal or can be custom lowered for all legal /// vector types this override can be removed. - bool mergeStoresAfterLegalization(EVT VT) const override; + bool mergeStoresAfterLegalization(EVT, + std::optional<StoreSource>) const override; // If the platform/function should have a redzone, return the size in bytes.
unsigned getRedZoneSize(const Function &F) const { diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -6107,7 +6107,8 @@ } } -bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const { +bool AArch64TargetLowering::mergeStoresAfterLegalization( + EVT, std::optional<StoreSource>) const { return !Subtarget->useSVEForFixedLengthVectors(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -243,7 +243,10 @@ // MergeConsecutiveStores() merges two stores; LegalizeStoreOps() un-merges; // MergeConsecutiveStores() re-merges, etc. ) to warrant turning it off for // now. - bool mergeStoresAfterLegalization(EVT) const override { return false; } + bool mergeStoresAfterLegalization(EVT, + std::optional<StoreSource>) const override { + return false; + } bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override { return true; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -736,7 +736,8 @@ /// illegal as the original, thus leading to an infinite legalisation loop. /// NOTE: Once BUILD_VECTOR can be custom lowered for all legal vector types, /// this override can be removed. - bool mergeStoresAfterLegalization(EVT VT) const override; + bool mergeStoresAfterLegalization(EVT VT, + std::optional<StoreSource>) const override; /// Disable normalizing /// select(N0&N1, X, Y) => select(N0, select(N1, X, Y), Y) and diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1699,7 +1699,8 @@ // Permit combining of mask vectors as BUILD_VECTOR never expands to scalar // stores for those types. -bool RISCVTargetLowering::mergeStoresAfterLegalization(EVT VT) const { +bool RISCVTargetLowering::mergeStoresAfterLegalization( + EVT VT, std::optional<StoreSource>) const { return !Subtarget.useRVVForFixedLengthVectors() || (VT.isFixedLengthVector() && VT.getVectorElementType() == MVT::i1); } diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1067,10 +1067,19 @@ /// This method returns the name of a target specific DAG node. const char *getTargetNodeName(unsigned Opcode) const override; - /// Do not merge vector stores after legalization because that may conflict - /// with x86-specific store splitting optimizations. - bool mergeStoresAfterLegalization(EVT MemVT) const override { - return !MemVT.isVector(); + /// In general, we still want to merge stores after legalization. + /// However, we wish to avoid producing stores of vector concatenations, + /// so do not merge stores of subvector extractions, + /// because that may conflict with x86-specific store splitting + /// optimizations. + bool mergeStoresAfterLegalization( + EVT MemVT, std::optional<StoreSource> StoreSrc) const override { + // In general, we *DO* want to merge stores post-legalization. + if (!StoreSrc) + return true; + assert(StoreSrc != StoreSource::Unknown && "Can't get that here."); + // Post-legalization, don't merge stores of subvector extractions.
+ return StoreSrc != StoreSource::ExtractSubvector; } bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll --- a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll +++ b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll @@ -1121,10 +1121,8 @@ ; X64-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: andl $31, %eax -; X64-AVX-NEXT: vmovups -64(%rsp,%rax), %xmm0 -; X64-AVX-NEXT: vmovups -48(%rsp,%rax), %xmm1 -; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx) -; X64-AVX-NEXT: vmovups %xmm0, (%rdx) +; X64-AVX-NEXT: vmovups -64(%rsp,%rax), %ymm0 +; X64-AVX-NEXT: vmovups %ymm0, (%rdx) ; X64-AVX-NEXT: vzeroupper ; X64-AVX-NEXT: retq ; @@ -1229,10 +1227,8 @@ ; X32-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; X32-AVX-NEXT: vmovups %ymm0, (%esp) ; X32-AVX-NEXT: andl $31, %ecx -; X32-AVX-NEXT: vmovups (%esp,%ecx), %xmm0 -; X32-AVX-NEXT: vmovups 16(%esp,%ecx), %xmm1 -; X32-AVX-NEXT: vmovups %xmm1, 16(%eax) -; X32-AVX-NEXT: vmovups %xmm0, (%eax) +; X32-AVX-NEXT: vmovups (%esp,%ecx), %ymm0 +; X32-AVX-NEXT: vmovups %ymm0, (%eax) ; X32-AVX-NEXT: addl $64, %esp ; X32-AVX-NEXT: vzeroupper ; X32-AVX-NEXT: retl @@ -1301,10 +1297,8 @@ ; X64-AVX-NEXT: andb $31, %al ; X64-AVX-NEXT: negb %al ; X64-AVX-NEXT: movsbq %al, %rax -; X64-AVX-NEXT: vmovups -32(%rsp,%rax), %xmm0 -; X64-AVX-NEXT: vmovups -16(%rsp,%rax), %xmm1 -; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx) -; X64-AVX-NEXT: vmovups %xmm0, (%rdx) +; X64-AVX-NEXT: vmovups -32(%rsp,%rax), %ymm0 +; X64-AVX-NEXT: vmovups %ymm0, (%rdx) ; X64-AVX-NEXT: vzeroupper ; X64-AVX-NEXT: retq ; @@ -1415,10 +1409,8 @@ ; X32-AVX-NEXT: andb $31, %cl ; X32-AVX-NEXT: negb %cl ; X32-AVX-NEXT: movsbl %cl, %ecx -; X32-AVX-NEXT: vmovups 32(%esp,%ecx), %xmm0 -; X32-AVX-NEXT: vmovups 48(%esp,%ecx), %xmm1 -; X32-AVX-NEXT: vmovups %xmm1, 16(%eax) -; X32-AVX-NEXT: vmovups %xmm0, (%eax) +; X32-AVX-NEXT: vmovups 32(%esp,%ecx), %ymm0 +; X32-AVX-NEXT: vmovups %ymm0, (%eax) ; X32-AVX-NEXT: addl $64, %esp ; X32-AVX-NEXT: vzeroupper ; X32-AVX-NEXT: retl @@ -1493,10 +1485,9 @@ ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: andl $31, %esi -; X64-AVX-NEXT: vmovups -64(%rsp,%rsi), %xmm0 -; X64-AVX-NEXT: vmovups -48(%rsp,%rsi), %xmm1 -; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx) -; X64-AVX-NEXT: vmovups %xmm0, (%rdx) +; X64-AVX-NEXT: vmovups -64(%rsp,%rsi), %ymm0 +; X64-AVX-NEXT: vmovups %ymm0, (%rdx) +; X64-AVX-NEXT: vzeroupper ; X64-AVX-NEXT: retq ; ; X32-SSE2-LABEL: ashr_32bytes: @@ -1637,14 +1628,13 @@ ; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X32-AVX-NEXT: andl $31, %ecx -; X32-AVX-NEXT: vmovups (%esp,%ecx), %xmm0 -; X32-AVX-NEXT: vmovups 16(%esp,%ecx), %xmm1 -; X32-AVX-NEXT: vmovups %xmm1, 16(%eax) -; X32-AVX-NEXT: vmovups %xmm0, (%eax) +; X32-AVX-NEXT: vmovups (%esp,%ecx), %ymm0 +; X32-AVX-NEXT: vmovups %ymm0, (%eax) ; X32-AVX-NEXT: addl $64, %esp ; X32-AVX-NEXT: popl %esi ; X32-AVX-NEXT: popl %edi ; X32-AVX-NEXT: popl %ebx +; X32-AVX-NEXT: vzeroupper ; X32-AVX-NEXT: retl %src = load i256, ptr %src.ptr, align 1 %byteOff = load i256, ptr %byteOff.ptr, align 1 @@ -1741,14 +1731,10 @@ ; X64-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; X64-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; X64-AVX1-NEXT: andl $63, %eax -; X64-AVX1-NEXT: vmovups 
-128(%rsp,%rax), %xmm0 -; X64-AVX1-NEXT: vmovups -112(%rsp,%rax), %xmm1 -; X64-AVX1-NEXT: vmovups -96(%rsp,%rax), %xmm2 -; X64-AVX1-NEXT: vmovups -80(%rsp,%rax), %xmm3 -; X64-AVX1-NEXT: vmovups %xmm3, 48(%rdx) -; X64-AVX1-NEXT: vmovups %xmm1, 16(%rdx) -; X64-AVX1-NEXT: vmovups %xmm2, 32(%rdx) -; X64-AVX1-NEXT: vmovups %xmm0, (%rdx) +; X64-AVX1-NEXT: vmovups -128(%rsp,%rax), %ymm0 +; X64-AVX1-NEXT: vmovups -96(%rsp,%rax), %ymm1 +; X64-AVX1-NEXT: vmovups %ymm1, 32(%rdx) +; X64-AVX1-NEXT: vmovups %ymm0, (%rdx) ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -1760,14 +1746,8 @@ ; X64-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; X64-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) ; X64-AVX512-NEXT: andl $63, %eax -; X64-AVX512-NEXT: vmovups -128(%rsp,%rax), %xmm0 -; X64-AVX512-NEXT: vmovups -112(%rsp,%rax), %xmm1 -; X64-AVX512-NEXT: vmovups -96(%rsp,%rax), %xmm2 -; X64-AVX512-NEXT: vmovups -80(%rsp,%rax), %xmm3 -; X64-AVX512-NEXT: vmovups %xmm3, 48(%rdx) -; X64-AVX512-NEXT: vmovups %xmm1, 16(%rdx) -; X64-AVX512-NEXT: vmovups %xmm2, 32(%rdx) -; X64-AVX512-NEXT: vmovups %xmm0, (%rdx) +; X64-AVX512-NEXT: vmovups -128(%rsp,%rax), %zmm0 +; X64-AVX512-NEXT: vmovups %zmm0, (%rdx) ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq ; @@ -1957,14 +1937,10 @@ ; X32-AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; X32-AVX1-NEXT: vmovups %ymm0, (%esp) ; X32-AVX1-NEXT: andl $63, %ecx -; X32-AVX1-NEXT: vmovups (%esp,%ecx), %xmm0 -; X32-AVX1-NEXT: vmovups 16(%esp,%ecx), %xmm1 -; X32-AVX1-NEXT: vmovups 32(%esp,%ecx), %xmm2 -; X32-AVX1-NEXT: vmovups 48(%esp,%ecx), %xmm3 -; X32-AVX1-NEXT: vmovups %xmm3, 48(%eax) -; X32-AVX1-NEXT: vmovups %xmm2, 32(%eax) -; X32-AVX1-NEXT: vmovups %xmm1, 16(%eax) -; X32-AVX1-NEXT: vmovups %xmm0, (%eax) +; X32-AVX1-NEXT: vmovups (%esp,%ecx), %ymm0 +; X32-AVX1-NEXT: vmovups 32(%esp,%ecx), %ymm1 +; X32-AVX1-NEXT: vmovups %ymm1, 32(%eax) +; X32-AVX1-NEXT: vmovups %ymm0, (%eax) ; X32-AVX1-NEXT: addl $128, %esp ; X32-AVX1-NEXT: vzeroupper ; X32-AVX1-NEXT: retl @@ -1981,14 +1957,8 @@ ; X32-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) ; X32-AVX512-NEXT: vmovups %zmm0, (%esp) ; X32-AVX512-NEXT: andl $63, %ecx -; X32-AVX512-NEXT: vmovups (%esp,%ecx), %xmm0 -; X32-AVX512-NEXT: vmovups 16(%esp,%ecx), %xmm1 -; X32-AVX512-NEXT: vmovups 32(%esp,%ecx), %xmm2 -; X32-AVX512-NEXT: vmovups 48(%esp,%ecx), %xmm3 -; X32-AVX512-NEXT: vmovups %xmm3, 48(%eax) -; X32-AVX512-NEXT: vmovups %xmm2, 32(%eax) -; X32-AVX512-NEXT: vmovups %xmm1, 16(%eax) -; X32-AVX512-NEXT: vmovups %xmm0, (%eax) +; X32-AVX512-NEXT: vmovups (%esp,%ecx), %zmm0 +; X32-AVX512-NEXT: vmovups %zmm0, (%eax) ; X32-AVX512-NEXT: addl $128, %esp ; X32-AVX512-NEXT: vzeroupper ; X32-AVX512-NEXT: retl @@ -2092,14 +2062,10 @@ ; X64-AVX1-NEXT: andl $63, %eax ; X64-AVX1-NEXT: negl %eax ; X64-AVX1-NEXT: cltq -; X64-AVX1-NEXT: vmovups -64(%rsp,%rax), %xmm0 -; X64-AVX1-NEXT: vmovups -48(%rsp,%rax), %xmm1 -; X64-AVX1-NEXT: vmovups -32(%rsp,%rax), %xmm2 -; X64-AVX1-NEXT: vmovups -16(%rsp,%rax), %xmm3 -; X64-AVX1-NEXT: vmovups %xmm3, 48(%rdx) -; X64-AVX1-NEXT: vmovups %xmm1, 16(%rdx) -; X64-AVX1-NEXT: vmovups %xmm2, 32(%rdx) -; X64-AVX1-NEXT: vmovups %xmm0, (%rdx) +; X64-AVX1-NEXT: vmovups -64(%rsp,%rax), %ymm0 +; X64-AVX1-NEXT: vmovups -32(%rsp,%rax), %ymm1 +; X64-AVX1-NEXT: vmovups %ymm1, 32(%rdx) +; X64-AVX1-NEXT: vmovups %ymm0, (%rdx) ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -2113,14 +2079,8 @@ ; X64-AVX512-NEXT: andl $63, %eax ; X64-AVX512-NEXT: negl %eax ; X64-AVX512-NEXT: cltq -; X64-AVX512-NEXT: vmovups -64(%rsp,%rax), %xmm0 -; 
X64-AVX512-NEXT: vmovups -48(%rsp,%rax), %xmm1 -; X64-AVX512-NEXT: vmovups -32(%rsp,%rax), %xmm2 -; X64-AVX512-NEXT: vmovups -16(%rsp,%rax), %xmm3 -; X64-AVX512-NEXT: vmovups %xmm3, 48(%rdx) -; X64-AVX512-NEXT: vmovups %xmm1, 16(%rdx) -; X64-AVX512-NEXT: vmovups %xmm2, 32(%rdx) -; X64-AVX512-NEXT: vmovups %xmm0, (%rdx) +; X64-AVX512-NEXT: vmovups -64(%rsp,%rax), %zmm0 +; X64-AVX512-NEXT: vmovups %zmm0, (%rdx) ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq ; @@ -2318,15 +2278,11 @@ ; X32-AVX1-NEXT: andl $63, %ecx ; X32-AVX1-NEXT: leal {{[0-9]+}}(%esp), %edx ; X32-AVX1-NEXT: subl %ecx, %edx -; X32-AVX1-NEXT: vmovups (%edx), %xmm0 -; X32-AVX1-NEXT: vmovups 16(%edx), %xmm1 -; X32-AVX1-NEXT: vmovups 32(%edx), %xmm2 +; X32-AVX1-NEXT: vmovups (%edx), %ymm0 ; X32-AVX1-NEXT: negl %ecx -; X32-AVX1-NEXT: vmovups 112(%esp,%ecx), %xmm3 -; X32-AVX1-NEXT: vmovups %xmm3, 48(%eax) -; X32-AVX1-NEXT: vmovups %xmm2, 32(%eax) -; X32-AVX1-NEXT: vmovups %xmm1, 16(%eax) -; X32-AVX1-NEXT: vmovups %xmm0, (%eax) +; X32-AVX1-NEXT: vmovups 96(%esp,%ecx), %ymm1 +; X32-AVX1-NEXT: vmovups %ymm1, 32(%eax) +; X32-AVX1-NEXT: vmovups %ymm0, (%eax) ; X32-AVX1-NEXT: addl $128, %esp ; X32-AVX1-NEXT: vzeroupper ; X32-AVX1-NEXT: retl @@ -2343,17 +2299,9 @@ ; X32-AVX512-NEXT: vmovups %zmm1, (%esp) ; X32-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) ; X32-AVX512-NEXT: andl $63, %ecx -; X32-AVX512-NEXT: leal {{[0-9]+}}(%esp), %edx -; X32-AVX512-NEXT: subl %ecx, %edx -; X32-AVX512-NEXT: vmovups (%edx), %xmm0 -; X32-AVX512-NEXT: vmovups 16(%edx), %xmm1 -; X32-AVX512-NEXT: vmovups 32(%edx), %xmm2 ; X32-AVX512-NEXT: negl %ecx -; X32-AVX512-NEXT: vmovups 112(%esp,%ecx), %xmm3 -; X32-AVX512-NEXT: vmovups %xmm3, 48(%eax) -; X32-AVX512-NEXT: vmovups %xmm2, 32(%eax) -; X32-AVX512-NEXT: vmovups %xmm1, 16(%eax) -; X32-AVX512-NEXT: vmovups %xmm0, (%eax) +; X32-AVX512-NEXT: vmovups 64(%esp,%ecx), %zmm0 +; X32-AVX512-NEXT: vmovups %zmm0, (%eax) ; X32-AVX512-NEXT: addl $128, %esp ; X32-AVX512-NEXT: vzeroupper ; X32-AVX512-NEXT: retl @@ -2449,37 +2397,59 @@ ; X64-SSE42-NEXT: movups %xmm0, (%rdx) ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: ashr_64bytes: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovups (%rdi), %ymm0 -; X64-AVX-NEXT: vmovups 32(%rdi), %xmm1 -; X64-AVX-NEXT: movq 48(%rdi), %rax -; X64-AVX-NEXT: movq 56(%rdi), %rcx -; X64-AVX-NEXT: movl (%rsi), %esi -; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: sarq $63, %rcx -; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: andl $63, %esi -; X64-AVX-NEXT: vmovups -128(%rsp,%rsi), %xmm0 -; X64-AVX-NEXT: vmovups -112(%rsp,%rsi), %xmm1 -; X64-AVX-NEXT: vmovups -96(%rsp,%rsi), %xmm2 -; X64-AVX-NEXT: vmovups -80(%rsp,%rsi), %xmm3 -; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx) -; X64-AVX-NEXT: vmovups %xmm2, 32(%rdx) -; X64-AVX-NEXT: vmovups %xmm3, 48(%rdx) -; X64-AVX-NEXT: vmovups %xmm0, (%rdx) -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: ashr_64bytes: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 +; X64-AVX1-NEXT: vmovups 32(%rdi), %xmm1 +; X64-AVX1-NEXT: movq 48(%rdi), %rax +; 
X64-AVX1-NEXT: movq 56(%rdi), %rcx +; X64-AVX1-NEXT: movl (%rsi), %esi +; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: sarq $63, %rcx +; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: andl $63, %esi +; X64-AVX1-NEXT: vmovups -128(%rsp,%rsi), %ymm0 +; X64-AVX1-NEXT: vmovups -96(%rsp,%rsi), %ymm1 +; X64-AVX1-NEXT: vmovups %ymm1, 32(%rdx) +; X64-AVX1-NEXT: vmovups %ymm0, (%rdx) +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: ashr_64bytes: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vmovups (%rdi), %ymm0 +; X64-AVX512-NEXT: vmovups 32(%rdi), %xmm1 +; X64-AVX512-NEXT: movq 48(%rdi), %rax +; X64-AVX512-NEXT: movq 56(%rdi), %rcx +; X64-AVX512-NEXT: movl (%rsi), %esi +; X64-AVX512-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: sarq $63, %rcx +; X64-AVX512-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: andl $63, %esi +; X64-AVX512-NEXT: vmovups -128(%rsp,%rsi), %zmm0 +; X64-AVX512-NEXT: vmovups %zmm0, (%rdx) +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq ; ; X32-SSE2-LABEL: ashr_64bytes: ; X32-SSE2: # %bb.0: @@ -2677,60 +2647,105 @@ ; X32-SSE42-NEXT: popl %ebx ; X32-SSE42-NEXT: retl ; -; X32-AVX-LABEL: ashr_64bytes: -; X32-AVX: # %bb.0: -; X32-AVX-NEXT: pushl %ebx -; X32-AVX-NEXT: pushl %edi -; X32-AVX-NEXT: pushl %esi -; X32-AVX-NEXT: subl $128, %esp -; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-AVX-NEXT: vmovups (%edx), %ymm0 -; X32-AVX-NEXT: vmovups 32(%edx), %xmm1 -; X32-AVX-NEXT: movl 48(%edx), %esi -; X32-AVX-NEXT: movl 52(%edx), %edi -; X32-AVX-NEXT: movl 56(%edx), %ebx -; X32-AVX-NEXT: movl 60(%edx), %edx -; X32-AVX-NEXT: movl (%ecx), %ecx -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: vmovups %xmm1, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: vmovups %ymm0, (%esp) -; X32-AVX-NEXT: sarl $31, %edx -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edx, 
{{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: andl $63, %ecx -; X32-AVX-NEXT: vmovups (%esp,%ecx), %xmm0 -; X32-AVX-NEXT: vmovups 16(%esp,%ecx), %xmm1 -; X32-AVX-NEXT: vmovups 32(%esp,%ecx), %xmm2 -; X32-AVX-NEXT: vmovups 48(%esp,%ecx), %xmm3 -; X32-AVX-NEXT: vmovups %xmm3, 48(%eax) -; X32-AVX-NEXT: vmovups %xmm2, 32(%eax) -; X32-AVX-NEXT: vmovups %xmm1, 16(%eax) -; X32-AVX-NEXT: vmovups %xmm0, (%eax) -; X32-AVX-NEXT: addl $128, %esp -; X32-AVX-NEXT: popl %esi -; X32-AVX-NEXT: popl %edi -; X32-AVX-NEXT: popl %ebx -; X32-AVX-NEXT: vzeroupper -; X32-AVX-NEXT: retl +; X32-AVX1-LABEL: ashr_64bytes: +; X32-AVX1: # %bb.0: +; X32-AVX1-NEXT: pushl %ebx +; X32-AVX1-NEXT: pushl %edi +; X32-AVX1-NEXT: pushl %esi +; X32-AVX1-NEXT: subl $128, %esp +; X32-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-AVX1-NEXT: vmovups (%edx), %ymm0 +; X32-AVX1-NEXT: vmovups 32(%edx), %xmm1 +; X32-AVX1-NEXT: movl 48(%edx), %esi +; X32-AVX1-NEXT: movl 52(%edx), %edi +; X32-AVX1-NEXT: movl 56(%edx), %ebx +; X32-AVX1-NEXT: movl 60(%edx), %edx +; X32-AVX1-NEXT: movl (%ecx), %ecx +; X32-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: vmovups %xmm1, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: vmovups %ymm0, (%esp) +; X32-AVX1-NEXT: sarl $31, %edx +; X32-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: andl $63, %ecx +; X32-AVX1-NEXT: vmovups (%esp,%ecx), %ymm0 +; X32-AVX1-NEXT: vmovups 32(%esp,%ecx), %ymm1 +; X32-AVX1-NEXT: vmovups %ymm1, 32(%eax) +; X32-AVX1-NEXT: vmovups %ymm0, (%eax) +; X32-AVX1-NEXT: addl $128, %esp +; X32-AVX1-NEXT: popl %esi +; X32-AVX1-NEXT: popl %edi +; X32-AVX1-NEXT: popl %ebx +; X32-AVX1-NEXT: vzeroupper +; X32-AVX1-NEXT: retl +; +; X32-AVX512-LABEL: ashr_64bytes: +; X32-AVX512: # %bb.0: +; X32-AVX512-NEXT: pushl %ebx +; X32-AVX512-NEXT: pushl %edi +; X32-AVX512-NEXT: pushl %esi +; X32-AVX512-NEXT: subl $128, %esp +; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-AVX512-NEXT: vmovups (%edx), %ymm0 +; X32-AVX512-NEXT: vmovups 32(%edx), %xmm1 +; X32-AVX512-NEXT: movl 48(%edx), %esi +; X32-AVX512-NEXT: movl 52(%edx), %edi +; X32-AVX512-NEXT: movl 56(%edx), %ebx +; X32-AVX512-NEXT: movl 60(%edx), %edx +; X32-AVX512-NEXT: movl (%ecx), %ecx +; X32-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %esi, {{[0-9]+}}(%esp) +; 
X32-AVX512-NEXT: vmovups %xmm1, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: vmovups %ymm0, (%esp) +; X32-AVX512-NEXT: sarl $31, %edx +; X32-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: andl $63, %ecx +; X32-AVX512-NEXT: vmovups (%esp,%ecx), %zmm0 +; X32-AVX512-NEXT: vmovups %zmm0, (%eax) +; X32-AVX512-NEXT: addl $128, %esp +; X32-AVX512-NEXT: popl %esi +; X32-AVX512-NEXT: popl %edi +; X32-AVX512-NEXT: popl %ebx +; X32-AVX512-NEXT: vzeroupper +; X32-AVX512-NEXT: retl %src = load i512, ptr %src.ptr, align 1 %byteOff = load i512, ptr %byteOff.ptr, align 1 %bitOff = shl i512 %byteOff, 3
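For out-of-tree backends, a minimal sketch of an override under the new hook signature is shown below; `MyTargetLowering` is hypothetical and not part of this patch. It only illustrates the convention established above: a `std::nullopt` source is the general query ("is any post-legalization merging allowed?"), while a concrete source lets the target veto specific kinds of merges, here subvector extractions as in the X86 override.

#include "llvm/CodeGen/TargetLowering.h"
#include <optional>

using namespace llvm;

// Hypothetical backend, for illustration only: keeps post-legalization store
// merging enabled in general, but declines to re-merge stores whose value
// comes from an EXTRACT_SUBVECTOR, mirroring the X86 override in this patch.
class MyTargetLowering : public TargetLowering {
public:
  explicit MyTargetLowering(const TargetMachine &TM) : TargetLowering(TM) {}

  bool mergeStoresAfterLegalization(
      EVT, std::optional<StoreSource> StoreSrc) const override {
    // General query (no known source): allow merging after legalization.
    if (!StoreSrc)
      return true;
    // Source-specific query: avoid stores of subvector extractions.
    return *StoreSrc != StoreSource::ExtractSubvector;
  }
};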