Index: llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -4964,19 +4964,6 @@
                       LD->getPointerInfo().getWithOffset(Offset),
                       LD->getOriginalAlign(), MMOFlags, AAInfo);
       LdChain.push_back(L.getValue(1));
-      if (L->getValueType(0).isVector() && NewVTWidth >= LdWidth) {
-        // Later code assumes the vector loads produced will be mergeable, so we
-        // must pad the final entry up to the previous width. Scalars are
-        // combined separately.
-        SmallVector<SDValue, 16> Loads;
-        Loads.push_back(L);
-        unsigned size = L->getValueSizeInBits(0);
-        while (size < LdOp->getValueSizeInBits(0)) {
-          Loads.push_back(DAG.getUNDEF(L->getValueType(0)));
-          size += L->getValueSizeInBits(0);
-        }
-        L = DAG.getNode(ISD::CONCAT_VECTORS, dl, LdOp->getValueType(0), Loads);
-      }
     } else {
       L = DAG.getLoad(NewVT, dl, Chain, BasePtr,
                       LD->getPointerInfo().getWithOffset(Offset),
@@ -5017,8 +5004,17 @@
       EVT NewLdTy = LdOps[i].getValueType();
       if (NewLdTy != LdTy) {
         // Create a larger vector.
+        unsigned NumOps = NewLdTy.getSizeInBits() / LdTy.getSizeInBits();
+        assert(NewLdTy.getSizeInBits() % LdTy.getSizeInBits() == 0);
+        SmallVector<SDValue, 16> WidenOps(NumOps);
+        unsigned j = 0;
+        for (; j != End-Idx; ++j)
+          WidenOps[j] = ConcatOps[Idx+j];
+        for (; j != NumOps; ++j)
+          WidenOps[j] = DAG.getUNDEF(LdTy);
+
         ConcatOps[End-1] = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewLdTy,
-                                       makeArrayRef(&ConcatOps[Idx], End - Idx));
+                                       WidenOps);
         Idx = End - 1;
         LdTy = NewLdTy;
       }
Index: llvm/test/CodeGen/X86/pr46820.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/X86/pr46820.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx512f | FileCheck %s
+
+; The alignment of 16 causes type legalization to split this as 3 loads,
+; v16f32, v4f32, and v4f32. There was an issue with type legalization building
+; the proper concat_vectors for this because the two v4f32s don't add up to
+; v16f32 and require padding.
+
+define <23 x float> @load23(<23 x float>* %p) {
  %t0 = load <23 x float>, <23 x float>* %p, align 16
  ret <23 x float> %t0
}
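
For reference, a minimal standalone sketch of the padding arithmetic the second hunk performs (plain C++, not the SelectionDAG API; the names and string placeholders are illustrative only). Widening two v4f32 pieces up to the v16f32 slot needs 512/128 = 4 operands, so the two missing slots are filled with undef, which is what the new WidenOps loop does before building the CONCAT_VECTORS node:

// Standalone sketch of the padding step above; plain C++, not LLVM's API.
// "v4f32 load" / "undef v4f32" stand in for real SDValues.
#include <cassert>
#include <iostream>
#include <string>
#include <vector>

int main() {
  const unsigned NewLdTyBits = 512; // v16f32: the widest piece already loaded
  const unsigned LdTyBits = 128;    // v4f32: the smaller pieces being merged
  assert(NewLdTyBits % LdTyBits == 0 && "pieces must tile the wide type");
  const unsigned NumOps = NewLdTyBits / LdTyBits; // 4 v4f32 operands needed

  // For <23 x float> at align 16 only two real v4f32 loads exist, so the
  // trailing operands are padded with undef, mirroring DAG.getUNDEF(LdTy).
  std::vector<std::string> WidenOps = {"v4f32 load", "v4f32 load"};
  while (WidenOps.size() < NumOps)
    WidenOps.push_back("undef v4f32");

  for (const std::string &Op : WidenOps)
    std::cout << Op << '\n'; // prints two loads followed by two undefs
}

The design point of the patch: instead of padding each small load up to the previous width as it is created (the deleted first hunk), the padding now happens once, at the point where the pieces are concatenated, so two v4f32s that don't add up to a v16f32 still produce a correctly sized CONCAT_VECTORS.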