diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2901,13 +2901,20 @@
 }
 
-static bool isInterleaveShuffle(ArrayRef<int> Mask, MVT VT, bool &SwapSources,
-                                const RISCVSubtarget &Subtarget) {
+/// Is this shuffle interleaving contiguous elements from one vector into the
+/// even elements and contiguous elements from another vector into the odd
+/// elements. \p EvenSrc will contain the index of the element that should be
+/// in the first even element. \p OddSrc will contain the index of the element
+/// that should be in the first odd element. These can be the first element in
+/// a source or the element half way through the source.
+static bool isInterleaveShuffle(ArrayRef<int> Mask, MVT VT, int &EvenSrc,
+                                int &OddSrc, const RISCVSubtarget &Subtarget) {
   // We need to be able to widen elements to the next larger integer type.
   if (VT.getScalarSizeInBits() >= Subtarget.getELEN())
     return false;
 
   int Size = Mask.size();
+  int HalfSize = Size / 2;
   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
 
   int Srcs[] = {-1, -1};
@@ -2919,8 +2926,8 @@
     // Is this an even or odd element.
     int Pol = i % 2;
 
-    // Ensure we consistently use the same source for this element polarity.
-    int Src = Mask[i] / Size;
+    // Ensure we consistently use the same half source for this polarity.
+    int Src = alignDown(Mask[i], HalfSize);
     if (Srcs[Pol] < 0)
       Srcs[Pol] = Src;
     if (Srcs[Pol] != Src)
@@ -2928,17 +2935,24 @@
     // Make sure the element within the source is appropriate for this element
     // in the destination.
-    int Elt = Mask[i] % Size;
+    int Elt = Mask[i] % HalfSize;
     if (Elt != i / 2)
       return false;
   }
 
-  // We need to find a source for each polarity and they can't be the same.
-  if (Srcs[0] < 0 || Srcs[1] < 0 || Srcs[0] == Srcs[1])
+  // One source should be low half of first vector.
+  if (Srcs[0] != 0 && Srcs[1] != 0)
+    return false;
+
+  // Other source should be the upper half of the first source or the lower
+  // half of the second source.
+  // FIXME: This is only a heuristic to avoid regressions.
+  if (Srcs[0] != HalfSize && Srcs[0] != Size && Srcs[1] != HalfSize &&
+      Srcs[1] != Size)
     return false;
 
-  // Swap the sources if the second source was in the even polarity.
-  SwapSources = Srcs[0] > Srcs[1];
+  EvenSrc = Srcs[0];
+  OddSrc = Srcs[1];
 
   return true;
 }
@@ -3338,18 +3352,22 @@
 
   // Detect an interleave shuffle and lower to
   // (vmaccu.vx (vwaddu.vx lohalf(V1), lohalf(V2)), lohalf(V2), (2^eltbits - 1))
-  bool SwapSources;
-  if (isInterleaveShuffle(Mask, VT, SwapSources, Subtarget)) {
-    // Swap sources if needed.
-    if (SwapSources)
-      std::swap(V1, V2);
-
-    // Extract the lower half of the vectors.
+  int EvenSrc, OddSrc;
+  if (isInterleaveShuffle(Mask, VT, EvenSrc, OddSrc, Subtarget)) {
+    // Extract the halves of the vectors.
     MVT HalfVT = VT.getHalfNumVectorElementsVT();
-    V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
-                     DAG.getConstant(0, DL, XLenVT));
-    V2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V2,
-                     DAG.getConstant(0, DL, XLenVT));
+
+    int Size = Mask.size();
+    SDValue EvenV, OddV;
+    assert(EvenSrc >= 0 && "Undef source?");
+    EvenV = (EvenSrc / Size) == 0 ? V1 : V2;
+    EvenV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, EvenV,
+                        DAG.getConstant(EvenSrc % Size, DL, XLenVT));
+
+    assert(OddSrc >= 0 && "Undef source?");
+    OddV = (OddSrc / Size) == 0 ? V1 : V2;
+    OddV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, OddV,
+                       DAG.getConstant(OddSrc % Size, DL, XLenVT));
 
     // Double the element width and halve the number of elements in an int type.
     unsigned EltBits = VT.getScalarSizeInBits();
@@ -3365,36 +3383,37 @@
     // larger type.
     MVT HalfContainerVT = MVT::getVectorVT(
         VT.getVectorElementType(), WideIntContainerVT.getVectorElementCount());
-    V1 = convertToScalableVector(HalfContainerVT, V1, DAG, Subtarget);
-    V2 = convertToScalableVector(HalfContainerVT, V2, DAG, Subtarget);
+    EvenV = convertToScalableVector(HalfContainerVT, EvenV, DAG, Subtarget);
+    OddV = convertToScalableVector(HalfContainerVT, OddV, DAG, Subtarget);
 
     // Cast sources to integer.
     MVT IntEltVT = MVT::getIntegerVT(EltBits);
     MVT IntHalfVT =
         MVT::getVectorVT(IntEltVT, HalfContainerVT.getVectorElementCount());
-    V1 = DAG.getBitcast(IntHalfVT, V1);
-    V2 = DAG.getBitcast(IntHalfVT, V2);
+    EvenV = DAG.getBitcast(IntHalfVT, EvenV);
+    OddV = DAG.getBitcast(IntHalfVT, OddV);
 
-    // Freeze V2 since we use it twice and we need to be sure that the add and
+    // Freeze OddV since we use it twice and we need to be sure that the add and
     // multiply see the same value.
-    V2 = DAG.getFreeze(V2);
+    OddV = DAG.getFreeze(OddV);
 
     // Recreate TrueMask using the widened type's element count.
    TrueMask = getAllOnesMask(HalfContainerVT, VL, DL, DAG);
 
-    // Widen V1 and V2 with 0s and add one copy of V2 to V1.
+    // Widen EvenV and OddV with 0s and add one copy of OddV to EvenV.
     SDValue Add =
-        DAG.getNode(RISCVISD::VWADDU_VL, DL, WideIntContainerVT, V1, V2,
+        DAG.getNode(RISCVISD::VWADDU_VL, DL, WideIntContainerVT, EvenV, OddV,
                     DAG.getUNDEF(WideIntContainerVT), TrueMask, VL);
-    // Create 2^eltbits - 1 copies of V2 by multiplying by the largest integer.
+    // Create 2^eltbits - 1 copies of OddV by multiplying by the largest
+    // integer.
     SDValue Multiplier = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, IntHalfVT,
                                      DAG.getUNDEF(IntHalfVT),
                                      DAG.getAllOnesConstant(DL, XLenVT), VL);
     SDValue WidenMul =
-        DAG.getNode(RISCVISD::VWMULU_VL, DL, WideIntContainerVT, V2, Multiplier,
-                    DAG.getUNDEF(WideIntContainerVT), TrueMask, VL);
+        DAG.getNode(RISCVISD::VWMULU_VL, DL, WideIntContainerVT, OddV,
+                    Multiplier, DAG.getUNDEF(WideIntContainerVT), TrueMask, VL);
     // Add the new copies to our previous addition giving us 2^eltbits copies of
-    // V2. This is equivalent to shifting V2 left by eltbits. This should
+    // OddV. This is equivalent to shifting OddV left by eltbits. This should
     // combine with the vwmulu.vv above to form vwmaccu.vv.
     Add = DAG.getNode(RISCVISD::ADD_VL, DL, WideIntContainerVT, Add, WidenMul,
                       DAG.getUNDEF(WideIntContainerVT), TrueMask, VL);
@@ -3555,10 +3574,9 @@
   MVT SVT = VT.getSimpleVT();
 
-  bool SwapSources;
-  int LoSrc, HiSrc;
-  return (isElementRotate(LoSrc, HiSrc, M) > 0) ||
-         isInterleaveShuffle(M, SVT, SwapSources, Subtarget);
+  int Dummy1, Dummy2;
+  return (isElementRotate(Dummy1, Dummy2, M) > 0) ||
+         isInterleaveShuffle(M, SVT, Dummy1, Dummy2, Subtarget);
 }
 
 // Lower CTLZ_ZERO_UNDEF or CTTZ_ZERO_UNDEF by converting to FP and extracting
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
@@ -370,3 +370,152 @@
   %a = shufflevector <32 x float> %x, <32 x float> %y, <64 x i32>
   ret <64 x float> %a
 }
+
+define <4 x half> @unary_interleave_v4f16(<4 x half> %x) {
+; V128-LABEL: unary_interleave_v4f16:
+; V128:       # %bb.0:
+; V128-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
+; V128-NEXT:    vslidedown.vi v10, v8, 2
+; V128-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
+; V128-NEXT:    vwaddu.vv v9, v8, v10
+; V128-NEXT:    li a0, -1
+; V128-NEXT:    vwmaccu.vx v9, a0, v10
+; V128-NEXT:    vmv1r.v v8, v9
+; V128-NEXT:    ret
+;
+; V512-LABEL: unary_interleave_v4f16:
+; V512:       # %bb.0:
+; V512-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; V512-NEXT:    vslidedown.vi v10, v8, 2
+; V512-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
+; V512-NEXT:    vwaddu.vv v9, v8, v10
+; V512-NEXT:    li a0, -1
+; V512-NEXT:    vwmaccu.vx v9, a0, v10
+; V512-NEXT:    vmv1r.v v8, v9
+; V512-NEXT:    ret
+  %a = shufflevector <4 x half> %x, <4 x half> poison, <4 x i32>
+  ret <4 x half> %a
+}
+
+define <4 x float> @unary_interleave_v4f32(<4 x float> %x) {
+; V128-LABEL: unary_interleave_v4f32:
+; V128:       # %bb.0:
+; V128-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
+; V128-NEXT:    vslidedown.vi v10, v8, 2
+; V128-NEXT:    vsetivli zero, 4, e32, mf2, ta, ma
+; V128-NEXT:    vwaddu.vv v9, v8, v10
+; V128-NEXT:    li a0, -1
+; V128-NEXT:    vwmaccu.vx v9, a0, v10
+; V128-NEXT:    vmv1r.v v8, v9
+; V128-NEXT:    ret
+;
+; V512-LABEL: unary_interleave_v4f32:
+; V512:       # %bb.0:
+; V512-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; V512-NEXT:    vslidedown.vi v10, v8, 2
+; V512-NEXT:    vsetivli zero, 4, e32, mf2, ta, ma
+; V512-NEXT:    vwaddu.vv v9, v8, v10
+; V512-NEXT:    li a0, -1
+; V512-NEXT:    vwmaccu.vx v9, a0, v10
+; V512-NEXT:    vmv1r.v v8, v9
+; V512-NEXT:    ret
+  %a = shufflevector <4 x float> %x, <4 x float> poison, <4 x i32>
+  ret <4 x float> %a
+}
+
+; FIXME: Is there better codegen we can do here?
+define <4 x double> @unary_interleave_v4f64(<4 x double> %x) {
+; RV32-V128-LABEL: unary_interleave_v4f64:
+; RV32-V128:       # %bb.0:
+; RV32-V128-NEXT:    lui a0, %hi(.LCPI13_0)
+; RV32-V128-NEXT:    addi a0, a0, %lo(.LCPI13_0)
+; RV32-V128-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-V128-NEXT:    vle16.v v12, (a0)
+; RV32-V128-NEXT:    vrgatherei16.vv v10, v8, v12
+; RV32-V128-NEXT:    vmv.v.v v8, v10
+; RV32-V128-NEXT:    ret
+;
+; RV64-V128-LABEL: unary_interleave_v4f64:
+; RV64-V128:       # %bb.0:
+; RV64-V128-NEXT:    lui a0, %hi(.LCPI13_0)
+; RV64-V128-NEXT:    addi a0, a0, %lo(.LCPI13_0)
+; RV64-V128-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV64-V128-NEXT:    vle64.v v12, (a0)
+; RV64-V128-NEXT:    vrgather.vv v10, v8, v12
+; RV64-V128-NEXT:    vmv.v.v v8, v10
+; RV64-V128-NEXT:    ret
+;
+; RV32-V512-LABEL: unary_interleave_v4f64:
+; RV32-V512:       # %bb.0:
+; RV32-V512-NEXT:    lui a0, %hi(.LCPI13_0)
+; RV32-V512-NEXT:    addi a0, a0, %lo(.LCPI13_0)
+; RV32-V512-NEXT:    vsetivli zero, 4, e64, m1, ta, ma
+; RV32-V512-NEXT:    vle16.v v10, (a0)
+; RV32-V512-NEXT:    vrgatherei16.vv v9, v8, v10
+; RV32-V512-NEXT:    vmv.v.v v8, v9
+; RV32-V512-NEXT:    ret
+;
+; RV64-V512-LABEL: unary_interleave_v4f64:
+; RV64-V512:       # %bb.0:
+; RV64-V512-NEXT:    lui a0, %hi(.LCPI13_0)
+; RV64-V512-NEXT:    addi a0, a0, %lo(.LCPI13_0)
+; RV64-V512-NEXT:    vsetivli zero, 4, e64, m1, ta, ma
+; RV64-V512-NEXT:    vle64.v v10, (a0)
+; RV64-V512-NEXT:    vrgather.vv v9, v8, v10
+; RV64-V512-NEXT:    vmv.v.v v8, v9
+; RV64-V512-NEXT:    ret
+  %a = shufflevector <4 x double> %x, <4 x double> poison, <4 x i32>
+  ret <4 x double> %a
+}
+
+define <8 x half> @unary_interleave_v8f16(<8 x half> %x) {
+; V128-LABEL: unary_interleave_v8f16:
+; V128:       # %bb.0:
+; V128-NEXT:    vsetivli zero, 4, e16, m1, ta, ma
+; V128-NEXT:    vslidedown.vi v10, v8, 4
+; V128-NEXT:    vsetivli zero, 8, e16, mf2, ta, ma
+; V128-NEXT:    vwaddu.vv v9, v8, v10
+; V128-NEXT:    li a0, -1
+; V128-NEXT:    vwmaccu.vx v9, a0, v10
+; V128-NEXT:    vmv1r.v v8, v9
+; V128-NEXT:    ret
+;
+; V512-LABEL: unary_interleave_v8f16:
+; V512:       # %bb.0:
+; V512-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
+; V512-NEXT:    vslidedown.vi v10, v8, 4
+; V512-NEXT:    vsetivli zero, 8, e16, mf4, ta, ma
+; V512-NEXT:    vwaddu.vv v9, v8, v10
+; V512-NEXT:    li a0, -1
+; V512-NEXT:    vwmaccu.vx v9, a0, v10
+; V512-NEXT:    vmv1r.v v8, v9
+; V512-NEXT:    ret
+  %a = shufflevector <8 x half> %x, <8 x half> poison, <8 x i32>
+  ret <8 x half> %a
+}
+
+define <8 x float> @unary_interleave_v8f32(<8 x float> %x) {
+; V128-LABEL: unary_interleave_v8f32:
+; V128:       # %bb.0:
+; V128-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
+; V128-NEXT:    vslidedown.vi v12, v8, 4
+; V128-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
+; V128-NEXT:    vwaddu.vv v10, v12, v8
+; V128-NEXT:    li a0, -1
+; V128-NEXT:    vwmaccu.vx v10, a0, v8
+; V128-NEXT:    vmv2r.v v8, v10
+; V128-NEXT:    ret
+;
+; V512-LABEL: unary_interleave_v8f32:
+; V512:       # %bb.0:
+; V512-NEXT:    vsetivli zero, 4, e32, mf2, ta, ma
+; V512-NEXT:    vslidedown.vi v10, v8, 4
+; V512-NEXT:    vsetivli zero, 8, e32, mf2, ta, ma
+; V512-NEXT:    vwaddu.vv v9, v10, v8
+; V512-NEXT:    li a0, -1
+; V512-NEXT:    vwmaccu.vx v9, a0, v8
+; V512-NEXT:    vmv1r.v v8, v9
+; V512-NEXT:    ret
+  %a = shufflevector <8 x float> %x, <8 x float> poison, <8 x i32>
+  ret <8 x float> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
@@ -476,3 +476,204 @@
   %a = shufflevector <32 x i32> %x, <32 x i32> %y, <64 x i32>
   ret <64 x i32> %a
 }
+
+define <4 x i8> @unary_interleave_v4i8(<4 x i8> %x) {
+; V128-LABEL: unary_interleave_v4i8:
+; V128:       # %bb.0:
+; V128-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; V128-NEXT:    vslidedown.vi v10, v8, 2
+; V128-NEXT:    vsetivli zero, 4, e8, mf8, ta, ma
+; V128-NEXT:    vwaddu.vv v9, v8, v10
+; V128-NEXT:    li a0, -1
+; V128-NEXT:    vwmaccu.vx v9, a0, v10
+; V128-NEXT:    vmv1r.v v8, v9
+; V128-NEXT:    ret
+;
+; V512-LABEL: unary_interleave_v4i8:
+; V512:       # %bb.0:
+; V512-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; V512-NEXT:    vslidedown.vi v10, v8, 2
+; V512-NEXT:    vsetivli zero, 4, e8, mf8, ta, ma
+; V512-NEXT:    vwaddu.vv v9, v8, v10
+; V512-NEXT:    li a0, -1
+; V512-NEXT:    vwmaccu.vx v9, a0, v10
+; V512-NEXT:    vmv1r.v v8, v9
+; V512-NEXT:    ret
+  %a = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32>
+  ret <4 x i8> %a
+}
+
+define <4 x i16> @unary_interleave_v4i16(<4 x i16> %x) {
+; V128-LABEL: unary_interleave_v4i16:
+; V128:       # %bb.0:
+; V128-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
+; V128-NEXT:    vslidedown.vi v10, v8, 2
+; V128-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
+; V128-NEXT:    vwaddu.vv v9, v8, v10
+; V128-NEXT:    li a0, -1
+; V128-NEXT:    vwmaccu.vx v9, a0, v10
+; V128-NEXT:    vmv1r.v v8, v9
+; V128-NEXT:    ret
+;
+; V512-LABEL: unary_interleave_v4i16:
+; V512:       # %bb.0:
+; V512-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; V512-NEXT:    vslidedown.vi v10, v8, 2
+; V512-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
+; V512-NEXT:    vwaddu.vv v9, v8, v10
+; V512-NEXT:    li a0, -1
+; V512-NEXT:    vwmaccu.vx v9, a0, v10
+; V512-NEXT:    vmv1r.v v8, v9
+; V512-NEXT:    ret
+  %a = shufflevector <4 x i16> %x, <4 x i16> poison, <4 x i32>
+  ret <4 x i16> %a
+}
+
+define <4 x i32> @unary_interleave_v4i32(<4 x i32> %x) {
+; V128-LABEL: unary_interleave_v4i32:
+; V128:       # %bb.0:
+; V128-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
+; V128-NEXT:    vslidedown.vi v10, v8, 2
+; V128-NEXT:    vsetivli zero, 4, e32, mf2, ta, ma
+; V128-NEXT:    vwaddu.vv v9, v8, v10
+; V128-NEXT:    li a0, -1
+; V128-NEXT:    vwmaccu.vx v9, a0, v10
+; V128-NEXT:    vmv1r.v v8, v9
+; V128-NEXT:    ret
+;
+; V512-LABEL: unary_interleave_v4i32:
+; V512:       # %bb.0:
+; V512-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; V512-NEXT:    vslidedown.vi v10, v8, 2
+; V512-NEXT:    vsetivli zero, 4, e32, mf2, ta, ma
+; V512-NEXT:    vwaddu.vv v9, v8, v10
+; V512-NEXT:    li a0, -1
+; V512-NEXT:    vwmaccu.vx v9, a0, v10
+; V512-NEXT:    vmv1r.v v8, v9
+; V512-NEXT:    ret
+  %a = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32>
+  ret <4 x i32> %a
+}
+
+; FIXME: Is there better codegen we can do here?
+define <4 x i64> @unary_interleave_v4i64(<4 x i64> %x) {
+; RV32-V128-LABEL: unary_interleave_v4i64:
+; RV32-V128:       # %bb.0:
+; RV32-V128-NEXT:    lui a0, %hi(.LCPI19_0)
+; RV32-V128-NEXT:    addi a0, a0, %lo(.LCPI19_0)
+; RV32-V128-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-V128-NEXT:    vle16.v v12, (a0)
+; RV32-V128-NEXT:    vrgatherei16.vv v10, v8, v12
+; RV32-V128-NEXT:    vmv.v.v v8, v10
+; RV32-V128-NEXT:    ret
+;
+; RV64-V128-LABEL: unary_interleave_v4i64:
+; RV64-V128:       # %bb.0:
+; RV64-V128-NEXT:    lui a0, %hi(.LCPI19_0)
+; RV64-V128-NEXT:    addi a0, a0, %lo(.LCPI19_0)
+; RV64-V128-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV64-V128-NEXT:    vle64.v v12, (a0)
+; RV64-V128-NEXT:    vrgather.vv v10, v8, v12
+; RV64-V128-NEXT:    vmv.v.v v8, v10
+; RV64-V128-NEXT:    ret
+;
+; RV32-V512-LABEL: unary_interleave_v4i64:
+; RV32-V512:       # %bb.0:
+; RV32-V512-NEXT:    lui a0, %hi(.LCPI19_0)
+; RV32-V512-NEXT:    addi a0, a0, %lo(.LCPI19_0)
+; RV32-V512-NEXT:    vsetivli zero, 4, e64, m1, ta, ma
+; RV32-V512-NEXT:    vle16.v v10, (a0)
+; RV32-V512-NEXT:    vrgatherei16.vv v9, v8, v10
+; RV32-V512-NEXT:    vmv.v.v v8, v9
+; RV32-V512-NEXT:    ret
+;
+; RV64-V512-LABEL: unary_interleave_v4i64:
+; RV64-V512:       # %bb.0:
+; RV64-V512-NEXT:    lui a0, %hi(.LCPI19_0)
+; RV64-V512-NEXT:    addi a0, a0, %lo(.LCPI19_0)
+; RV64-V512-NEXT:    vsetivli zero, 4, e64, m1, ta, ma
+; RV64-V512-NEXT:    vle64.v v10, (a0)
+; RV64-V512-NEXT:    vrgather.vv v9, v8, v10
+; RV64-V512-NEXT:    vmv.v.v v8, v9
+; RV64-V512-NEXT:    ret
+  %a = shufflevector <4 x i64> %x, <4 x i64> poison, <4 x i32>
+  ret <4 x i64> %a
+}
+
+define <8 x i8> @unary_interleave_v8i8(<8 x i8> %x) {
+; V128-LABEL: unary_interleave_v8i8:
+; V128:       # %bb.0:
+; V128-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
+; V128-NEXT:    vslidedown.vi v10, v8, 4
+; V128-NEXT:    vsetivli zero, 8, e8, mf4, ta, ma
+; V128-NEXT:    vwaddu.vv v9, v8, v10
+; V128-NEXT:    li a0, -1
+; V128-NEXT:    vwmaccu.vx v9, a0, v10
+; V128-NEXT:    vmv1r.v v8, v9
+; V128-NEXT:    ret
+;
+; V512-LABEL: unary_interleave_v8i8:
+; V512:       # %bb.0:
+; V512-NEXT:    vsetivli zero, 4, e8, mf8, ta, ma
+; V512-NEXT:    vslidedown.vi v10, v8, 4
+; V512-NEXT:    vsetivli zero, 8, e8, mf8, ta, ma
+; V512-NEXT:    vwaddu.vv v9, v8, v10
+; V512-NEXT:    li a0, -1
+; V512-NEXT:    vwmaccu.vx v9, a0, v10
+; V512-NEXT:    vmv1r.v v8, v9
+; V512-NEXT:    ret
+  %a = shufflevector <8 x i8> %x, <8 x i8> poison, <8 x i32>
+  ret <8 x i8> %a
+}
+
+define <8 x i16> @unary_interleave_v8i16(<8 x i16> %x) {
+; V128-LABEL: unary_interleave_v8i16:
+; V128:       # %bb.0:
+; V128-NEXT:    vsetivli zero, 4, e16, m1, ta, ma
+; V128-NEXT:    vslidedown.vi v10, v8, 4
+; V128-NEXT:    vsetivli zero, 8, e16, mf2, ta, ma
+; V128-NEXT:    vwaddu.vv v9, v10, v8
+; V128-NEXT:    li a0, -1
+; V128-NEXT:    vwmaccu.vx v9, a0, v8
+; V128-NEXT:    vmv1r.v v8, v9
+; V128-NEXT:    ret
+;
+; V512-LABEL: unary_interleave_v8i16:
+; V512:       # %bb.0:
+; V512-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
+; V512-NEXT:    vslidedown.vi v10, v8, 4
+; V512-NEXT:    vsetivli zero, 8, e16, mf4, ta, ma
+; V512-NEXT:    vwaddu.vv v9, v10, v8
+; V512-NEXT:    li a0, -1
+; V512-NEXT:    vwmaccu.vx v9, a0, v8
+; V512-NEXT:    vmv1r.v v8, v9
+; V512-NEXT:    ret
+  %a = shufflevector <8 x i16> %x, <8 x i16> poison, <8 x i32>
+  ret <8 x i16> %a
+}
+
+define <8 x i32> @unary_interleave_v8i32(<8 x i32> %x) {
+; V128-LABEL: unary_interleave_v8i32:
+; V128:       # %bb.0:
+; V128-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
+; V128-NEXT:    vslidedown.vi v12, v8, 4
+; V128-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
+; V128-NEXT:    vwaddu.vv v10, v8, v12
+; V128-NEXT:    li a0, -1
+; V128-NEXT:    vwmaccu.vx v10, a0, v12
+; V128-NEXT:    vmv2r.v v8, v10
+; V128-NEXT:    ret
+;
+; V512-LABEL: unary_interleave_v8i32:
+; V512:       # %bb.0:
+; V512-NEXT:    vsetivli zero, 4, e32, mf2, ta, ma
+; V512-NEXT:    vslidedown.vi v10, v8, 4
+; V512-NEXT:    vsetivli zero, 8, e32, mf2, ta, ma
+; V512-NEXT:    vwaddu.vv v9, v8, v10
+; V512-NEXT:    li a0, -1
+; V512-NEXT:    vwmaccu.vx v9, a0, v10
+; V512-NEXT:    vmv1r.v v8, v9
+; V512-NEXT:    ret
+  %a = shufflevector <8 x i32> %x, <8 x i32> poison, <8 x i32>
+  ret <8 x i32> %a
+}
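
For reference, the even/odd-half mask check that the new isInterleaveShuffle performs can be restated as a small standalone sketch. This is illustrative only: it uses plain C++ containers instead of LLVM types, and checkInterleaveMask and its signature are hypothetical names, not part of this patch or of LLVM.

#include <cassert>
#include <cstdio>
#include <vector>

// Even result elements must come from one contiguous half-vector and odd
// result elements from another, and one of those halves must be the low half
// of the first source. On success, EvenSrc/OddSrc hold the starting element
// index of each half (0, HalfSize, or Size for the second source's low half).
static bool checkInterleaveMask(const std::vector<int> &Mask, int &EvenSrc,
                                int &OddSrc) {
  int Size = static_cast<int>(Mask.size());
  int HalfSize = Size / 2;
  int Srcs[] = {-1, -1};
  for (int i = 0; i != Size; ++i) {
    if (Mask[i] < 0)
      continue; // Undef lanes place no constraint in this sketch.
    int Pol = i % 2;
    // Round down to the start of the half-vector this element comes from.
    int Src = (Mask[i] / HalfSize) * HalfSize;
    if (Srcs[Pol] < 0)
      Srcs[Pol] = Src;
    if (Srcs[Pol] != Src)
      return false;
    // Within its half, the element must sit at position i/2.
    if (Mask[i] % HalfSize != i / 2)
      return false;
  }
  // One source must be the low half of the first vector...
  if (Srcs[0] != 0 && Srcs[1] != 0)
    return false;
  // ...and the other the high half of the first vector or the low half of the
  // second vector (the heuristic from the patch).
  if (Srcs[0] != HalfSize && Srcs[0] != Size && Srcs[1] != HalfSize &&
      Srcs[1] != Size)
    return false;
  EvenSrc = Srcs[0];
  OddSrc = Srcs[1];
  return true;
}

int main() {
  int EvenSrc, OddSrc;
  // Unary interleave of <4 x T>: low half {0,1} into even lanes, high half
  // {2,3} into odd lanes, i.e. mask <0, 2, 1, 3>.
  std::vector<int> Mask = {0, 2, 1, 3};
  bool OK = checkInterleaveMask(Mask, EvenSrc, OddSrc);
  assert(OK && EvenSrc == 0 && OddSrc == 2);
  std::printf("EvenSrc=%d OddSrc=%d\n", EvenSrc, OddSrc);
  return 0;
}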
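The vwaddu.vv/vwmaccu.vx lowering rests on the identity that an interleaved even/odd pair, viewed as one element of twice the width, equals zext(Even) + zext(Odd) * 2^eltbits: the widening add contributes one copy of Odd and the widening multiply-accumulate by (2^eltbits - 1) contributes the remaining copies. A scalar sketch of that arithmetic, assuming 8-bit elements purely as an example:

#include <cassert>
#include <cstdint>

int main() {
  // One even/odd element pair, 8-bit elements widened to 16 bits.
  uint8_t Even = 0x12, Odd = 0x34;

  // The interleaved pair viewed as a single 16-bit element (Even in the low
  // byte, Odd in the high byte).
  uint16_t Expected = static_cast<uint16_t>(Even) |
                      static_cast<uint16_t>(static_cast<uint16_t>(Odd) << 8);

  // vwaddu: widen both operands and add one copy of Odd.
  unsigned Add = static_cast<unsigned>(Even) + static_cast<unsigned>(Odd);
  // vwmaccu by (2^8 - 1): add the remaining 255 copies of Odd, for 256 total,
  // which is the same as shifting Odd left by the element width.
  unsigned Result = Add + static_cast<unsigned>(Odd) * 255u;

  assert(Result == Expected);
  return 0;
}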