diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -599,7 +599,19 @@ /// Allow store merging for the specified type after legalization in addition /// to before legalization. This may transform stores that do not exist /// earlier (for example, stores created from intrinsics). - virtual bool mergeStoresAfterLegalization(EVT MemVT) const { + /// The answer may depend on the origin of the stored value; passing + /// `std::nullopt` as the source asks the general question, i.e. whether + /// *any* such merging is allowed post-legalization. + enum class StoreSource { + Unknown, + Constant, + ExtractVectorElt, + ExtractSubvector, + Load + }; + virtual bool + mergeStoresAfterLegalization(EVT MemVT, + std::optional<StoreSource> StoreSrc) const { return true; } diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -662,19 +662,19 @@ }; // Classify the origin of a stored value. - enum class StoreSource { Unknown, Constant, Extract, Load }; - StoreSource getStoreSource(SDValue StoreVal) { + TargetLoweringBase::StoreSource getStoreSource(SDValue StoreVal) { switch (StoreVal.getOpcode()) { case ISD::Constant: case ISD::ConstantFP: - return StoreSource::Constant; + return TargetLoweringBase::StoreSource::Constant; case ISD::EXTRACT_VECTOR_ELT: + return TargetLoweringBase::StoreSource::ExtractVectorElt; case ISD::EXTRACT_SUBVECTOR: - return StoreSource::Extract; + return TargetLoweringBase::StoreSource::ExtractSubvector; case ISD::LOAD: - return StoreSource::Load; + return TargetLoweringBase::StoreSource::Load; default: - return StoreSource::Unknown; + return TargetLoweringBase::StoreSource::Unknown; } } @@ -18658,14 +18658,15 @@ return; SDValue Val = peekThroughBitcasts(St->getValue()); - StoreSource StoreSrc = getStoreSource(Val); - assert(StoreSrc != StoreSource::Unknown && "Expected known source for store"); + TargetLoweringBase::StoreSource StoreSrc = getStoreSource(Val); + assert(StoreSrc != TargetLoweringBase::StoreSource::Unknown && + "Expected known source for store"); // Match on loadbaseptr if relevant. EVT MemVT = St->getMemoryVT(); BaseIndexOffset LBasePtr; EVT LoadVT; - if (StoreSrc == StoreSource::Load) { + if (StoreSrc == TargetLoweringBase::StoreSource::Load) { auto *Ld = cast<LoadSDNode>(Val); LBasePtr = BaseIndexOffset::match(Ld, DAG); LoadVT = Ld->getMemoryVT(); @@ -18694,7 +18695,7 @@ bool NoTypeMatch = (MemVT.isInteger()) ? !MemVT.bitsEq(Other->getMemoryVT()) : Other->getMemoryVT() != MemVT; switch (StoreSrc) { - case StoreSource::Load: { + case TargetLoweringBase::StoreSource::Load: { if (NoTypeMatch) return false; // The Load's Base Ptr must also match. @@ -18718,13 +18719,14 @@ return false; break; } - case StoreSource::Constant: + case TargetLoweringBase::StoreSource::Constant: if (NoTypeMatch) return false; if (!isIntOrFPConstant(OtherBC)) return false; break; - case StoreSource::Extract: + case TargetLoweringBase::StoreSource::ExtractVectorElt: + case TargetLoweringBase::StoreSource::ExtractSubvector: // Do not merge truncated stores here. if (Other->isTruncatingStore()) return false; @@ -19416,10 +19418,14 @@ return false; // Do not bother looking at stored values that are not constants, loads, or - // extracted vector elements. + // extracted vector elements/subvectors.
SDValue StoredVal = peekThroughBitcasts(St->getValue()); - const StoreSource StoreSrc = getStoreSource(StoredVal); - if (StoreSrc == StoreSource::Unknown) + const TargetLoweringBase::StoreSource StoreSrc = getStoreSource(StoredVal); + if (StoreSrc == TargetLoweringBase::StoreSource::Unknown) + return false; + + if (LegalTypes && + !TLI.mergeStoresAfterLegalization(St->getMemoryVT(), StoreSrc)) return false; SmallVector<MemOpLink, 8> StoreNodes; @@ -19440,7 +19446,7 @@ bool AllowVectors = !DAG.getMachineFunction().getFunction().hasFnAttribute( Attribute::NoImplicitFloat); bool IsNonTemporalStore = St->isNonTemporal(); - bool IsNonTemporalLoad = StoreSrc == StoreSource::Load && + bool IsNonTemporalLoad = StoreSrc == TargetLoweringBase::StoreSource::Load && cast<LoadSDNode>(StoredVal)->isNonTemporal(); // Store Merge attempts to merge the lowest stores. This generally @@ -19461,17 +19467,18 @@ // We have at least 2 consecutive stores. Try to merge them. assert(NumConsecutiveStores >= 2 && "Expected at least 2 stores"); switch (StoreSrc) { - case StoreSource::Constant: + case TargetLoweringBase::StoreSource::Constant: MadeChange |= tryStoreMergeOfConstants(StoreNodes, NumConsecutiveStores, MemVT, RootNode, AllowVectors); break; - case StoreSource::Extract: + case TargetLoweringBase::StoreSource::ExtractVectorElt: + case TargetLoweringBase::StoreSource::ExtractSubvector: MadeChange |= tryStoreMergeOfExtracts(StoreNodes, NumConsecutiveStores, MemVT, RootNode); break; - case StoreSource::Load: + case TargetLoweringBase::StoreSource::Load: MadeChange |= tryStoreMergeOfLoads(StoreNodes, NumConsecutiveStores, MemVT, RootNode, AllowVectors, IsNonTemporalStore, IsNonTemporalLoad); @@ -19771,7 +19778,8 @@ // Always perform this optimization before types are legal. If the target // prefers, also try this after legalization to catch stores that were created // by intrinsics or other nodes. - if (!LegalTypes || (TLI.mergeStoresAfterLegalization(ST->getMemoryVT()))) { + if (!LegalTypes || + (TLI.mergeStoresAfterLegalization(ST->getMemoryVT(), std::nullopt))) { while (true) { // There can be multiple store sequences on the same chain. // Keep trying to merge store sequences until we are unable to do so diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -883,7 +883,8 @@ /// illegal as the original, thus leading to an infinite legalisation loop. /// NOTE: Once BUILD_VECTOR is legal or can be custom lowered for all legal /// vector types this override can be removed. - bool mergeStoresAfterLegalization(EVT VT) const override; + bool mergeStoresAfterLegalization(EVT, + std::optional<StoreSource>) const override; // If the platform/function should have a redzone, return the size in bytes.
unsigned getRedZoneSize(const Function &F) const { diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -6107,7 +6107,8 @@ } } -bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const { +bool AArch64TargetLowering::mergeStoresAfterLegalization( + EVT, std::optional<StoreSource>) const { return !Subtarget->useSVEForFixedLengthVectors(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -243,7 +243,10 @@ // MergeConsecutiveStores() merges two stores; LegalizeStoreOps() un-merges; // MergeConsecutiveStores() re-merges, etc. ) to warrant turning it off for // now. - bool mergeStoresAfterLegalization(EVT) const override { return false; } + bool mergeStoresAfterLegalization(EVT, + std::optional<StoreSource>) const override { + return false; + } bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override { return true; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -736,7 +736,8 @@ /// illegal as the original, thus leading to an infinite legalisation loop. /// NOTE: Once BUILD_VECTOR can be custom lowered for all legal vector types, /// this override can be removed. - bool mergeStoresAfterLegalization(EVT VT) const override; + bool mergeStoresAfterLegalization(EVT VT, + std::optional<StoreSource>) const override; /// Disable normalizing /// select(N0&N1, X, Y) => select(N0, select(N1, X, Y), Y) and diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1699,7 +1699,8 @@ // Permit combining of mask vectors as BUILD_VECTOR never expands to scalar // stores for those types. -bool RISCVTargetLowering::mergeStoresAfterLegalization(EVT VT) const { +bool RISCVTargetLowering::mergeStoresAfterLegalization( + EVT VT, std::optional<StoreSource>) const { return !Subtarget.useRVVForFixedLengthVectors() || (VT.isFixedLengthVector() && VT.getVectorElementType() == MVT::i1); } diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1067,10 +1067,19 @@ /// This method returns the name of a target specific DAG node. const char *getTargetNodeName(unsigned Opcode) const override; - /// Do not merge vector stores after legalization because that may conflict - /// with x86-specific store splitting optimizations. - bool mergeStoresAfterLegalization(EVT MemVT) const override { - return !MemVT.isVector(); + /// In general, we still want to merge stores after legalization. + /// However, we wish to avoid producing stores of vector concatenations, + /// so do not merge stores of subvector extractions, + /// because that may conflict with x86-specific store splitting + /// optimizations. + bool mergeStoresAfterLegalization( + EVT MemVT, std::optional<StoreSource> StoreSrc) const override { + // In general, we *DO* want to merge stores post-legalization. + if (!StoreSrc) + return true; + assert(StoreSrc != StoreSource::Unknown && "Can't get that here."); + // Post-legalization, don't merge stores of subvector extractions.
+ return StoreSrc != StoreSource::ExtractSubvector; } bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll --- a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll +++ b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll @@ -1121,10 +1121,8 @@ ; X64-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: andl $31, %eax -; X64-AVX-NEXT: vmovups -64(%rsp,%rax), %xmm0 -; X64-AVX-NEXT: vmovups -48(%rsp,%rax), %xmm1 -; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx) -; X64-AVX-NEXT: vmovups %xmm0, (%rdx) +; X64-AVX-NEXT: vmovups -64(%rsp,%rax), %ymm0 +; X64-AVX-NEXT: vmovups %ymm0, (%rdx) ; X64-AVX-NEXT: vzeroupper ; X64-AVX-NEXT: retq ; @@ -1229,10 +1227,8 @@ ; X32-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; X32-AVX-NEXT: vmovups %ymm0, (%esp) ; X32-AVX-NEXT: andl $31, %ecx -; X32-AVX-NEXT: vmovups (%esp,%ecx), %xmm0 -; X32-AVX-NEXT: vmovups 16(%esp,%ecx), %xmm1 -; X32-AVX-NEXT: vmovups %xmm1, 16(%eax) -; X32-AVX-NEXT: vmovups %xmm0, (%eax) +; X32-AVX-NEXT: vmovups (%esp,%ecx), %ymm0 +; X32-AVX-NEXT: vmovups %ymm0, (%eax) ; X32-AVX-NEXT: addl $64, %esp ; X32-AVX-NEXT: vzeroupper ; X32-AVX-NEXT: retl @@ -1301,10 +1297,8 @@ ; X64-AVX-NEXT: andb $31, %al ; X64-AVX-NEXT: negb %al ; X64-AVX-NEXT: movsbq %al, %rax -; X64-AVX-NEXT: vmovups -32(%rsp,%rax), %xmm0 -; X64-AVX-NEXT: vmovups -16(%rsp,%rax), %xmm1 -; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx) -; X64-AVX-NEXT: vmovups %xmm0, (%rdx) +; X64-AVX-NEXT: vmovups -32(%rsp,%rax), %ymm0 +; X64-AVX-NEXT: vmovups %ymm0, (%rdx) ; X64-AVX-NEXT: vzeroupper ; X64-AVX-NEXT: retq ; @@ -1415,10 +1409,8 @@ ; X32-AVX-NEXT: andb $31, %cl ; X32-AVX-NEXT: negb %cl ; X32-AVX-NEXT: movsbl %cl, %ecx -; X32-AVX-NEXT: vmovups 32(%esp,%ecx), %xmm0 -; X32-AVX-NEXT: vmovups 48(%esp,%ecx), %xmm1 -; X32-AVX-NEXT: vmovups %xmm1, 16(%eax) -; X32-AVX-NEXT: vmovups %xmm0, (%eax) +; X32-AVX-NEXT: vmovups 32(%esp,%ecx), %ymm0 +; X32-AVX-NEXT: vmovups %ymm0, (%eax) ; X32-AVX-NEXT: addl $64, %esp ; X32-AVX-NEXT: vzeroupper ; X32-AVX-NEXT: retl @@ -1493,10 +1485,9 @@ ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: andl $31, %esi -; X64-AVX-NEXT: vmovups -64(%rsp,%rsi), %xmm0 -; X64-AVX-NEXT: vmovups -48(%rsp,%rsi), %xmm1 -; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx) -; X64-AVX-NEXT: vmovups %xmm0, (%rdx) +; X64-AVX-NEXT: vmovups -64(%rsp,%rsi), %ymm0 +; X64-AVX-NEXT: vmovups %ymm0, (%rdx) +; X64-AVX-NEXT: vzeroupper ; X64-AVX-NEXT: retq ; ; X32-SSE2-LABEL: ashr_32bytes: @@ -1637,14 +1628,13 @@ ; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X32-AVX-NEXT: andl $31, %ecx -; X32-AVX-NEXT: vmovups (%esp,%ecx), %xmm0 -; X32-AVX-NEXT: vmovups 16(%esp,%ecx), %xmm1 -; X32-AVX-NEXT: vmovups %xmm1, 16(%eax) -; X32-AVX-NEXT: vmovups %xmm0, (%eax) +; X32-AVX-NEXT: vmovups (%esp,%ecx), %ymm0 +; X32-AVX-NEXT: vmovups %ymm0, (%eax) ; X32-AVX-NEXT: addl $64, %esp ; X32-AVX-NEXT: popl %esi ; X32-AVX-NEXT: popl %edi ; X32-AVX-NEXT: popl %ebx +; X32-AVX-NEXT: vzeroupper ; X32-AVX-NEXT: retl %src = load i256, ptr %src.ptr, align 1 %byteOff = load i256, ptr %byteOff.ptr, align 1 @@ -1741,14 +1731,10 @@ ; X64-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; X64-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; X64-AVX1-NEXT: andl $63, %eax -; X64-AVX1-NEXT: vmovups 
-128(%rsp,%rax), %xmm0 -; X64-AVX1-NEXT: vmovups -112(%rsp,%rax), %xmm1 -; X64-AVX1-NEXT: vmovups -96(%rsp,%rax), %xmm2 -; X64-AVX1-NEXT: vmovups -80(%rsp,%rax), %xmm3 -; X64-AVX1-NEXT: vmovups %xmm3, 48(%rdx) -; X64-AVX1-NEXT: vmovups %xmm1, 16(%rdx) -; X64-AVX1-NEXT: vmovups %xmm2, 32(%rdx) -; X64-AVX1-NEXT: vmovups %xmm0, (%rdx) +; X64-AVX1-NEXT: vmovups -128(%rsp,%rax), %ymm0 +; X64-AVX1-NEXT: vmovups -96(%rsp,%rax), %ymm1 +; X64-AVX1-NEXT: vmovups %ymm1, 32(%rdx) +; X64-AVX1-NEXT: vmovups %ymm0, (%rdx) ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -1760,14 +1746,8 @@ ; X64-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; X64-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) ; X64-AVX512-NEXT: andl $63, %eax -; X64-AVX512-NEXT: vmovups -128(%rsp,%rax), %xmm0 -; X64-AVX512-NEXT: vmovups -112(%rsp,%rax), %xmm1 -; X64-AVX512-NEXT: vmovups -96(%rsp,%rax), %xmm2 -; X64-AVX512-NEXT: vmovups -80(%rsp,%rax), %xmm3 -; X64-AVX512-NEXT: vmovups %xmm3, 48(%rdx) -; X64-AVX512-NEXT: vmovups %xmm1, 16(%rdx) -; X64-AVX512-NEXT: vmovups %xmm2, 32(%rdx) -; X64-AVX512-NEXT: vmovups %xmm0, (%rdx) +; X64-AVX512-NEXT: vmovups -128(%rsp,%rax), %zmm0 +; X64-AVX512-NEXT: vmovups %zmm0, (%rdx) ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq ; @@ -1957,14 +1937,10 @@ ; X32-AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; X32-AVX1-NEXT: vmovups %ymm0, (%esp) ; X32-AVX1-NEXT: andl $63, %ecx -; X32-AVX1-NEXT: vmovups (%esp,%ecx), %xmm0 -; X32-AVX1-NEXT: vmovups 16(%esp,%ecx), %xmm1 -; X32-AVX1-NEXT: vmovups 32(%esp,%ecx), %xmm2 -; X32-AVX1-NEXT: vmovups 48(%esp,%ecx), %xmm3 -; X32-AVX1-NEXT: vmovups %xmm3, 48(%eax) -; X32-AVX1-NEXT: vmovups %xmm2, 32(%eax) -; X32-AVX1-NEXT: vmovups %xmm1, 16(%eax) -; X32-AVX1-NEXT: vmovups %xmm0, (%eax) +; X32-AVX1-NEXT: vmovups (%esp,%ecx), %ymm0 +; X32-AVX1-NEXT: vmovups 32(%esp,%ecx), %ymm1 +; X32-AVX1-NEXT: vmovups %ymm1, 32(%eax) +; X32-AVX1-NEXT: vmovups %ymm0, (%eax) ; X32-AVX1-NEXT: addl $128, %esp ; X32-AVX1-NEXT: vzeroupper ; X32-AVX1-NEXT: retl @@ -1981,14 +1957,8 @@ ; X32-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) ; X32-AVX512-NEXT: vmovups %zmm0, (%esp) ; X32-AVX512-NEXT: andl $63, %ecx -; X32-AVX512-NEXT: vmovups (%esp,%ecx), %xmm0 -; X32-AVX512-NEXT: vmovups 16(%esp,%ecx), %xmm1 -; X32-AVX512-NEXT: vmovups 32(%esp,%ecx), %xmm2 -; X32-AVX512-NEXT: vmovups 48(%esp,%ecx), %xmm3 -; X32-AVX512-NEXT: vmovups %xmm3, 48(%eax) -; X32-AVX512-NEXT: vmovups %xmm2, 32(%eax) -; X32-AVX512-NEXT: vmovups %xmm1, 16(%eax) -; X32-AVX512-NEXT: vmovups %xmm0, (%eax) +; X32-AVX512-NEXT: vmovups (%esp,%ecx), %zmm0 +; X32-AVX512-NEXT: vmovups %zmm0, (%eax) ; X32-AVX512-NEXT: addl $128, %esp ; X32-AVX512-NEXT: vzeroupper ; X32-AVX512-NEXT: retl @@ -2092,14 +2062,10 @@ ; X64-AVX1-NEXT: andl $63, %eax ; X64-AVX1-NEXT: negl %eax ; X64-AVX1-NEXT: cltq -; X64-AVX1-NEXT: vmovups -64(%rsp,%rax), %xmm0 -; X64-AVX1-NEXT: vmovups -48(%rsp,%rax), %xmm1 -; X64-AVX1-NEXT: vmovups -32(%rsp,%rax), %xmm2 -; X64-AVX1-NEXT: vmovups -16(%rsp,%rax), %xmm3 -; X64-AVX1-NEXT: vmovups %xmm3, 48(%rdx) -; X64-AVX1-NEXT: vmovups %xmm1, 16(%rdx) -; X64-AVX1-NEXT: vmovups %xmm2, 32(%rdx) -; X64-AVX1-NEXT: vmovups %xmm0, (%rdx) +; X64-AVX1-NEXT: vmovups -64(%rsp,%rax), %ymm0 +; X64-AVX1-NEXT: vmovups -32(%rsp,%rax), %ymm1 +; X64-AVX1-NEXT: vmovups %ymm1, 32(%rdx) +; X64-AVX1-NEXT: vmovups %ymm0, (%rdx) ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -2113,14 +2079,8 @@ ; X64-AVX512-NEXT: andl $63, %eax ; X64-AVX512-NEXT: negl %eax ; X64-AVX512-NEXT: cltq -; X64-AVX512-NEXT: vmovups -64(%rsp,%rax), %xmm0 -; 
X64-AVX512-NEXT: vmovups -48(%rsp,%rax), %xmm1 -; X64-AVX512-NEXT: vmovups -32(%rsp,%rax), %xmm2 -; X64-AVX512-NEXT: vmovups -16(%rsp,%rax), %xmm3 -; X64-AVX512-NEXT: vmovups %xmm3, 48(%rdx) -; X64-AVX512-NEXT: vmovups %xmm1, 16(%rdx) -; X64-AVX512-NEXT: vmovups %xmm2, 32(%rdx) -; X64-AVX512-NEXT: vmovups %xmm0, (%rdx) +; X64-AVX512-NEXT: vmovups -64(%rsp,%rax), %zmm0 +; X64-AVX512-NEXT: vmovups %zmm0, (%rdx) ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq ; @@ -2318,15 +2278,11 @@ ; X32-AVX1-NEXT: andl $63, %ecx ; X32-AVX1-NEXT: leal {{[0-9]+}}(%esp), %edx ; X32-AVX1-NEXT: subl %ecx, %edx -; X32-AVX1-NEXT: vmovups (%edx), %xmm0 -; X32-AVX1-NEXT: vmovups 16(%edx), %xmm1 -; X32-AVX1-NEXT: vmovups 32(%edx), %xmm2 +; X32-AVX1-NEXT: vmovups (%edx), %ymm0 ; X32-AVX1-NEXT: negl %ecx -; X32-AVX1-NEXT: vmovups 112(%esp,%ecx), %xmm3 -; X32-AVX1-NEXT: vmovups %xmm3, 48(%eax) -; X32-AVX1-NEXT: vmovups %xmm2, 32(%eax) -; X32-AVX1-NEXT: vmovups %xmm1, 16(%eax) -; X32-AVX1-NEXT: vmovups %xmm0, (%eax) +; X32-AVX1-NEXT: vmovups 96(%esp,%ecx), %ymm1 +; X32-AVX1-NEXT: vmovups %ymm1, 32(%eax) +; X32-AVX1-NEXT: vmovups %ymm0, (%eax) ; X32-AVX1-NEXT: addl $128, %esp ; X32-AVX1-NEXT: vzeroupper ; X32-AVX1-NEXT: retl @@ -2343,17 +2299,9 @@ ; X32-AVX512-NEXT: vmovups %zmm1, (%esp) ; X32-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) ; X32-AVX512-NEXT: andl $63, %ecx -; X32-AVX512-NEXT: leal {{[0-9]+}}(%esp), %edx -; X32-AVX512-NEXT: subl %ecx, %edx -; X32-AVX512-NEXT: vmovups (%edx), %xmm0 -; X32-AVX512-NEXT: vmovups 16(%edx), %xmm1 -; X32-AVX512-NEXT: vmovups 32(%edx), %xmm2 ; X32-AVX512-NEXT: negl %ecx -; X32-AVX512-NEXT: vmovups 112(%esp,%ecx), %xmm3 -; X32-AVX512-NEXT: vmovups %xmm3, 48(%eax) -; X32-AVX512-NEXT: vmovups %xmm2, 32(%eax) -; X32-AVX512-NEXT: vmovups %xmm1, 16(%eax) -; X32-AVX512-NEXT: vmovups %xmm0, (%eax) +; X32-AVX512-NEXT: vmovups 64(%esp,%ecx), %zmm0 +; X32-AVX512-NEXT: vmovups %zmm0, (%eax) ; X32-AVX512-NEXT: addl $128, %esp ; X32-AVX512-NEXT: vzeroupper ; X32-AVX512-NEXT: retl @@ -2449,37 +2397,59 @@ ; X64-SSE42-NEXT: movups %xmm0, (%rdx) ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: ashr_64bytes: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovups (%rdi), %ymm0 -; X64-AVX-NEXT: vmovups 32(%rdi), %xmm1 -; X64-AVX-NEXT: movq 48(%rdi), %rax -; X64-AVX-NEXT: movq 56(%rdi), %rcx -; X64-AVX-NEXT: movl (%rsi), %esi -; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: sarq $63, %rcx -; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: andl $63, %esi -; X64-AVX-NEXT: vmovups -128(%rsp,%rsi), %xmm0 -; X64-AVX-NEXT: vmovups -112(%rsp,%rsi), %xmm1 -; X64-AVX-NEXT: vmovups -96(%rsp,%rsi), %xmm2 -; X64-AVX-NEXT: vmovups -80(%rsp,%rsi), %xmm3 -; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx) -; X64-AVX-NEXT: vmovups %xmm2, 32(%rdx) -; X64-AVX-NEXT: vmovups %xmm3, 48(%rdx) -; X64-AVX-NEXT: vmovups %xmm0, (%rdx) -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: ashr_64bytes: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 +; X64-AVX1-NEXT: vmovups 32(%rdi), %xmm1 +; X64-AVX1-NEXT: movq 48(%rdi), %rax +; 
X64-AVX1-NEXT: movq 56(%rdi), %rcx +; X64-AVX1-NEXT: movl (%rsi), %esi +; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: sarq $63, %rcx +; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: andl $63, %esi +; X64-AVX1-NEXT: vmovups -128(%rsp,%rsi), %ymm0 +; X64-AVX1-NEXT: vmovups -96(%rsp,%rsi), %ymm1 +; X64-AVX1-NEXT: vmovups %ymm1, 32(%rdx) +; X64-AVX1-NEXT: vmovups %ymm0, (%rdx) +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: ashr_64bytes: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vmovups (%rdi), %ymm0 +; X64-AVX512-NEXT: vmovups 32(%rdi), %xmm1 +; X64-AVX512-NEXT: movq 48(%rdi), %rax +; X64-AVX512-NEXT: movq 56(%rdi), %rcx +; X64-AVX512-NEXT: movl (%rsi), %esi +; X64-AVX512-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: sarq $63, %rcx +; X64-AVX512-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: andl $63, %esi +; X64-AVX512-NEXT: vmovups -128(%rsp,%rsi), %zmm0 +; X64-AVX512-NEXT: vmovups %zmm0, (%rdx) +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq ; ; X32-SSE2-LABEL: ashr_64bytes: ; X32-SSE2: # %bb.0: @@ -2677,60 +2647,105 @@ ; X32-SSE42-NEXT: popl %ebx ; X32-SSE42-NEXT: retl ; -; X32-AVX-LABEL: ashr_64bytes: -; X32-AVX: # %bb.0: -; X32-AVX-NEXT: pushl %ebx -; X32-AVX-NEXT: pushl %edi -; X32-AVX-NEXT: pushl %esi -; X32-AVX-NEXT: subl $128, %esp -; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-AVX-NEXT: vmovups (%edx), %ymm0 -; X32-AVX-NEXT: vmovups 32(%edx), %xmm1 -; X32-AVX-NEXT: movl 48(%edx), %esi -; X32-AVX-NEXT: movl 52(%edx), %edi -; X32-AVX-NEXT: movl 56(%edx), %ebx -; X32-AVX-NEXT: movl 60(%edx), %edx -; X32-AVX-NEXT: movl (%ecx), %ecx -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: vmovups %xmm1, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: vmovups %ymm0, (%esp) -; X32-AVX-NEXT: sarl $31, %edx -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edx, 
{{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: andl $63, %ecx -; X32-AVX-NEXT: vmovups (%esp,%ecx), %xmm0 -; X32-AVX-NEXT: vmovups 16(%esp,%ecx), %xmm1 -; X32-AVX-NEXT: vmovups 32(%esp,%ecx), %xmm2 -; X32-AVX-NEXT: vmovups 48(%esp,%ecx), %xmm3 -; X32-AVX-NEXT: vmovups %xmm3, 48(%eax) -; X32-AVX-NEXT: vmovups %xmm2, 32(%eax) -; X32-AVX-NEXT: vmovups %xmm1, 16(%eax) -; X32-AVX-NEXT: vmovups %xmm0, (%eax) -; X32-AVX-NEXT: addl $128, %esp -; X32-AVX-NEXT: popl %esi -; X32-AVX-NEXT: popl %edi -; X32-AVX-NEXT: popl %ebx -; X32-AVX-NEXT: vzeroupper -; X32-AVX-NEXT: retl +; X32-AVX1-LABEL: ashr_64bytes: +; X32-AVX1: # %bb.0: +; X32-AVX1-NEXT: pushl %ebx +; X32-AVX1-NEXT: pushl %edi +; X32-AVX1-NEXT: pushl %esi +; X32-AVX1-NEXT: subl $128, %esp +; X32-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-AVX1-NEXT: vmovups (%edx), %ymm0 +; X32-AVX1-NEXT: vmovups 32(%edx), %xmm1 +; X32-AVX1-NEXT: movl 48(%edx), %esi +; X32-AVX1-NEXT: movl 52(%edx), %edi +; X32-AVX1-NEXT: movl 56(%edx), %ebx +; X32-AVX1-NEXT: movl 60(%edx), %edx +; X32-AVX1-NEXT: movl (%ecx), %ecx +; X32-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: vmovups %xmm1, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: vmovups %ymm0, (%esp) +; X32-AVX1-NEXT: sarl $31, %edx +; X32-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: andl $63, %ecx +; X32-AVX1-NEXT: vmovups (%esp,%ecx), %ymm0 +; X32-AVX1-NEXT: vmovups 32(%esp,%ecx), %ymm1 +; X32-AVX1-NEXT: vmovups %ymm1, 32(%eax) +; X32-AVX1-NEXT: vmovups %ymm0, (%eax) +; X32-AVX1-NEXT: addl $128, %esp +; X32-AVX1-NEXT: popl %esi +; X32-AVX1-NEXT: popl %edi +; X32-AVX1-NEXT: popl %ebx +; X32-AVX1-NEXT: vzeroupper +; X32-AVX1-NEXT: retl +; +; X32-AVX512-LABEL: ashr_64bytes: +; X32-AVX512: # %bb.0: +; X32-AVX512-NEXT: pushl %ebx +; X32-AVX512-NEXT: pushl %edi +; X32-AVX512-NEXT: pushl %esi +; X32-AVX512-NEXT: subl $128, %esp +; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-AVX512-NEXT: vmovups (%edx), %ymm0 +; X32-AVX512-NEXT: vmovups 32(%edx), %xmm1 +; X32-AVX512-NEXT: movl 48(%edx), %esi +; X32-AVX512-NEXT: movl 52(%edx), %edi +; X32-AVX512-NEXT: movl 56(%edx), %ebx +; X32-AVX512-NEXT: movl 60(%edx), %edx +; X32-AVX512-NEXT: movl (%ecx), %ecx +; X32-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %esi, {{[0-9]+}}(%esp) +; 
X32-AVX512-NEXT: vmovups %xmm1, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: vmovups %ymm0, (%esp) +; X32-AVX512-NEXT: sarl $31, %edx +; X32-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: andl $63, %ecx +; X32-AVX512-NEXT: vmovups (%esp,%ecx), %zmm0 +; X32-AVX512-NEXT: vmovups %zmm0, (%eax) +; X32-AVX512-NEXT: addl $128, %esp +; X32-AVX512-NEXT: popl %esi +; X32-AVX512-NEXT: popl %edi +; X32-AVX512-NEXT: popl %ebx +; X32-AVX512-NEXT: vzeroupper +; X32-AVX512-NEXT: retl %src = load i512, ptr %src.ptr, align 1 %byteOff = load i512, ptr %byteOff.ptr, align 1 %bitOff = shl i512 %byteOff, 3
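For out-of-tree backends, a minimal sketch of an override under the new hook signature is shown below; `MyTargetLowering` is hypothetical and not part of this patch. It only illustrates the convention established above: a `std::nullopt` source is the general query ("is any post-legalization merging allowed?"), while a concrete source lets the target veto specific kinds of merges, here subvector extractions as in the X86 override.

#include "llvm/CodeGen/TargetLowering.h"
#include <optional>

using namespace llvm;

// Hypothetical backend, for illustration only: keeps post-legalization store
// merging enabled in general, but declines to re-merge stores whose value
// comes from an EXTRACT_SUBVECTOR, mirroring the X86 override in this patch.
class MyTargetLowering : public TargetLowering {
public:
  explicit MyTargetLowering(const TargetMachine &TM) : TargetLowering(TM) {}

  bool mergeStoresAfterLegalization(
      EVT, std::optional<StoreSource> StoreSrc) const override {
    // General query (no known source): allow merging after legalization.
    if (!StoreSrc)
      return true;
    // Source-specific query: avoid stores of subvector extractions.
    return *StoreSrc != StoreSource::ExtractSubvector;
  }
};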