diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2504,14 +2504,16 @@
   // (in our case undisturbed). This means we can set up a subvector insertion
   // where OFFSET is the insertion offset, and the VL is the OFFSET plus the
   // size of the subvector.
-  MVT InterSubVT = getLMUL1VT(VecVT);
-
-  // Extract a subvector equal to the nearest full vector register type. This
-  // should resolve to a EXTRACT_SUBREG instruction.
+  MVT InterSubVT = VecVT;
+  SDValue AlignedExtract = Vec;
   unsigned AlignedIdx = OrigIdx - RemIdx;
-  SDValue AlignedExtract =
-      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InterSubVT, Vec,
-                  DAG.getConstant(AlignedIdx, DL, XLenVT));
+  if (VecVT.bitsGT(getLMUL1VT(VecVT))) {
+    InterSubVT = getLMUL1VT(VecVT);
+    // Extract a subvector equal to the nearest full vector register type. This
+    // should resolve to a EXTRACT_SUBREG instruction.
+    AlignedExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InterSubVT, Vec,
+                                 DAG.getConstant(AlignedIdx, DL, XLenVT));
+  }
 
   SDValue SlideupAmt = DAG.getConstant(RemIdx, DL, XLenVT);
   // For scalable vectors this must be further multiplied by vscale.
@@ -2532,10 +2534,12 @@
   SDValue Slideup = DAG.getNode(RISCVISD::VSLIDEUP_VL, DL, InterSubVT,
                                 AlignedExtract, SubVec, SlideupAmt, Mask, VL);
 
-  // Insert this subvector into the correct vector register. This should
-  // resolve to an INSERT_SUBREG instruction.
-  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT, Vec, Slideup,
-                     DAG.getConstant(AlignedIdx, DL, XLenVT));
+  // If required, insert this subvector back into the correct vector register.
+  // This should resolve to an INSERT_SUBREG instruction.
+  if (VecVT.bitsGT(InterSubVT))
+    Slideup = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT, Vec, Slideup,
+                          DAG.getConstant(AlignedIdx, DL, XLenVT));
+  return Slideup;
 }
 
 SDValue RISCVTargetLowering::lowerEXTRACT_SUBVECTOR(SDValue Op,
@@ -2630,13 +2634,15 @@
   // Else we must shift our vector register directly to extract the subvector.
   // Do this using VSLIDEDOWN.
 
-  // Extract a subvector equal to the nearest full vector register type. This
-  // should resolve to a EXTRACT_SUBREG instruction.
-  unsigned AlignedIdx = OrigIdx - RemIdx;
-  MVT InterSubVT = getLMUL1VT(VecVT);
-  SDValue AlignedExtract =
-      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InterSubVT, Vec,
-                  DAG.getConstant(AlignedIdx, DL, XLenVT));
+  // If the vector type is an LMUL-group type, extract a subvector equal to the
+  // nearest full vector register type. This should resolve to a EXTRACT_SUBREG
+  // instruction.
+  MVT InterSubVT = VecVT;
+  if (VecVT.bitsGT(getLMUL1VT(VecVT))) {
+    InterSubVT = getLMUL1VT(VecVT);
+    Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InterSubVT, Vec,
+                      DAG.getConstant(OrigIdx - RemIdx, DL, XLenVT));
+  }
 
   // Slide this vector register down by the desired number of elements in order
   // to place the desired subvector starting at element 0.
@@ -2646,14 +2652,16 @@
   SDValue Mask, VL;
   std::tie(Mask, VL) = getDefaultScalableVLOps(InterSubVT, DL, DAG, Subtarget);
 
-  SDValue Slidedown = DAG.getNode(RISCVISD::VSLIDEDOWN_VL, DL, InterSubVT,
-                                  DAG.getUNDEF(InterSubVT), AlignedExtract,
-                                  SlidedownAmt, Mask, VL);
-
-  // Now the vector is in the right position, extract our final subvector. This
-  // should resolve to a COPY.
-  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, Slidedown,
-                     DAG.getConstant(0, DL, XLenVT));
+  SDValue Slidedown =
+      DAG.getNode(RISCVISD::VSLIDEDOWN_VL, DL, InterSubVT,
+                  DAG.getUNDEF(InterSubVT), Vec, SlidedownAmt, Mask, VL);
+
+  // Now the vector is in the right position, extract our final subvector if
+  // required. This should resolve to a COPY.
+  if (VecVT.bitsGT(InterSubVT))
+    Slidedown = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, Slidedown,
+                            DAG.getConstant(0, DL, XLenVT));
+  return Slidedown;
 }
 
 SDValue
diff --git a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
--- a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
@@ -327,6 +327,20 @@
   ret %c
 }
 
+define <vscale x 1 x i8> @extract_nxv4i8_nxv1i8_3(<vscale x 4 x i8> %vec) {
+; CHECK-LABEL: extract_nxv4i8_nxv1i8_3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    slli a1, a0, 1
+; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    vsetvli a1, zero, e8,mf2,ta,mu
+; CHECK-NEXT:    vslidedown.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %c = call <vscale x 1 x i8> @llvm.experimental.vector.extract.nxv1i8.nxv4i8(<vscale x 4 x i8> %vec, i64 3)
+  ret <vscale x 1 x i8> %c
+}
+
 define <vscale x 2 x half> @extract_nxv2f16_nxv16f16_0(<vscale x 16 x half> %vec) {
 ; CHECK-LABEL: extract_nxv2f16_nxv16f16_0:
 ; CHECK:       # %bb.0:
@@ -402,6 +416,52 @@
   ret %c
 }
 
+define <vscale x 4 x i1> @extract_nxv4i1_nxv32i1_0(<vscale x 32 x i1> %x) {
+; CHECK-LABEL: extract_nxv4i1_nxv32i1_0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ret
+  %c = call <vscale x 4 x i1> @llvm.experimental.vector.extract.nxv4i1(<vscale x 32 x i1> %x, i64 0)
+  ret <vscale x 4 x i1> %c
+}
+
+define <vscale x 4 x i1> @extract_nxv4i1_nxv32i1_4(<vscale x 32 x i1> %x) {
+; CHECK-LABEL: extract_nxv4i1_nxv32i1_4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e8,m4,ta,mu
+; CHECK-NEXT:    vmv.v.i v28, 0
+; CHECK-NEXT:    vmerge.vim v28, v28, 1, v0
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 1
+; CHECK-NEXT:    vsetvli a1, zero, e8,m1,ta,mu
+; CHECK-NEXT:    vslidedown.vx v25, v28, a0
+; CHECK-NEXT:    vsetvli a0, zero, e8,mf2,ta,mu
+; CHECK-NEXT:    vmsne.vi v0, v25, 0
+; CHECK-NEXT:    ret
+  %c = call <vscale x 4 x i1> @llvm.experimental.vector.extract.nxv4i1(<vscale x 32 x i1> %x, i64 4)
+  ret <vscale x 4 x i1> %c
+}
+
+define <vscale x 16 x i1> @extract_nxv16i1_nxv32i1_0(<vscale x 32 x i1> %x) {
+; CHECK-LABEL: extract_nxv16i1_nxv32i1_0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ret
+  %c = call <vscale x 16 x i1> @llvm.experimental.vector.extract.nxv16i1(<vscale x 32 x i1> %x, i64 0)
+  ret <vscale x 16 x i1> %c
+}
+
+define <vscale x 16 x i1> @extract_nxv16i1_nxv32i1_16(<vscale x 32 x i1> %x) {
+; CHECK-LABEL: extract_nxv16i1_nxv32i1_16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    vsetvli a1, zero, e8,mf2,ta,mu
+; CHECK-NEXT:    vslidedown.vx v0, v0, a0
+; CHECK-NEXT:    ret
+  %c = call <vscale x 16 x i1> @llvm.experimental.vector.extract.nxv16i1(<vscale x 32 x i1> %x, i64 16)
+  ret <vscale x 16 x i1> %c
+}
+
+declare <vscale x 1 x i8> @llvm.experimental.vector.extract.nxv1i8.nxv4i8(<vscale x 4 x i8> %vec, i64 %idx)
 declare <vscale x 1 x i8> @llvm.experimental.vector.extract.nxv1i8.nxv8i8(<vscale x 8 x i8> %vec, i64 %idx)
 
 declare <vscale x 2 x i8> @llvm.experimental.vector.extract.nxv2i8.nxv32i8(<vscale x 32 x i8> %vec, i64 %idx)
@@ -418,5 +478,8 @@
 declare <vscale x 2 x half> @llvm.experimental.vector.extract.nxv2f16.nxv16f16(<vscale x 16 x half> %vec, i64 %idx)
 
+declare <vscale x 4 x i1> @llvm.experimental.vector.extract.nxv4i1(<vscale x 32 x i1> %vec, i64 %idx)
+declare <vscale x 16 x i1> @llvm.experimental.vector.extract.nxv16i1(<vscale x 32 x i1> %vec, i64 %idx)
+
 declare <vscale x 2 x i1> @llvm.experimental.vector.extract.nxv2i1( %vec, i64 %idx)
 declare <vscale x 8 x i1> @llvm.experimental.vector.extract.nxv8i1( %vec, i64 %idx)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll
@@ -2,6 +2,35 @@
 ; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2
 ; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX1
 
+define void @extract_v2i8_v4i8_0(<4 x i8>* %x, <2 x i8>* %y) {
+; CHECK-LABEL: extract_v2i8_v4i8_0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli a2, 4, e8,m1,ta,mu
+; CHECK-NEXT:    vle8.v v25, (a0)
+; CHECK-NEXT:    vsetivli a0, 2, e8,m1,ta,mu
+; CHECK-NEXT:    vse8.v v25, (a1)
+; CHECK-NEXT:    ret
+  %a = load <4 x i8>, <4 x i8>* %x
+  %c = call <2 x i8> @llvm.experimental.vector.extract.v2i8.v4i8(<4 x i8> %a, i64 0)
+  store <2 x i8> %c, <2 x i8>* %y
+  ret void
+}
+
+define void @extract_v2i8_v4i8_2(<4 x i8>* %x, <2 x i8>* %y) {
+; CHECK-LABEL: extract_v2i8_v4i8_2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli a2, 4, e8,m1,ta,mu
+; CHECK-NEXT:    vle8.v v25, (a0)
+; CHECK-NEXT:    vsetivli a0, 2, e8,m1,ta,mu
+; CHECK-NEXT:    vslidedown.vi v25, v25, 2
+; CHECK-NEXT:    vse8.v v25, (a1)
+; CHECK-NEXT:    ret
+  %a = load <4 x i8>, <4 x i8>* %x
+  %c = call <2 x i8> @llvm.experimental.vector.extract.v2i8.v4i8(<4 x i8> %a, i64 2)
+  store <2 x i8> %c, <2 x i8>* %y
+  ret void
+}
+
 define void @extract_v2i8_v8i8_0(<8 x i8>* %x, <2 x i8>* %y) {
 ; CHECK-LABEL: extract_v2i8_v8i8_0:
 ; CHECK:       # %bb.0:
@@ -128,6 +157,30 @@
   ret void
 }
 
+define void @extract_v2i8_nxv2i8_0(<vscale x 2 x i8> %x, <2 x i8>* %y) {
+; CHECK-LABEL: extract_v2i8_nxv2i8_0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli a1, 2, e8,m1,ta,mu
+; CHECK-NEXT:    vse8.v v8, (a0)
+; CHECK-NEXT:    ret
+  %c = call <2 x i8> @llvm.experimental.vector.extract.v2i8.nxv2i8(<vscale x 2 x i8> %x, i64 0)
+  store <2 x i8> %c, <2 x i8>* %y
+  ret void
+}
+
+define void @extract_v2i8_nxv2i8_2(<vscale x 2 x i8> %x, <2 x i8>* %y) {
+; CHECK-LABEL: extract_v2i8_nxv2i8_2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli a1, 2, e8,mf4,ta,mu
+; CHECK-NEXT:    vslidedown.vi v25, v8, 2
+; CHECK-NEXT:    vsetivli a1, 2, e8,m1,ta,mu
+; CHECK-NEXT:    vse8.v v25, (a0)
+; CHECK-NEXT:    ret
+  %c = call <2 x i8> @llvm.experimental.vector.extract.v2i8.nxv2i8(<vscale x 2 x i8> %x, i64 2)
+  store <2 x i8> %c, <2 x i8>* %y
+  ret void
+}
+
 define void @extract_v8i32_nxv16i32_8(<vscale x 16 x i32> %x, <8 x i32>* %y) {
 ; LMULMAX2-LABEL: extract_v8i32_nxv16i32_8:
 ; LMULMAX2:       # %bb.0:
@@ -458,17 +511,53 @@
   ret void
 }
 
+define void @extract_v2i1_nxv32i1_26(<vscale x 32 x i1> %x, <2 x i1>* %y) {
+; CHECK-LABEL: extract_v2i1_nxv32i1_26:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a1, zero, e8,m4,ta,mu
+; CHECK-NEXT:    vmv.v.i v28, 0
+; CHECK-NEXT:    vmerge.vim v28, v28, 1, v0
+; CHECK-NEXT:    vsetivli a1, 2, e8,m4,ta,mu
+; CHECK-NEXT:    vslidedown.vi v28, v28, 26
+; CHECK-NEXT:    vsetivli a1, 2, e8,m1,ta,mu
+; CHECK-NEXT:    vmsne.vi v25, v28, 0
+; CHECK-NEXT:    vse1.v v25, (a0)
+; CHECK-NEXT:    ret
+  %c = call <2 x i1> @llvm.experimental.vector.extract.v2i1.nxv32i1(<vscale x 32 x i1> %x, i64 26)
+  store <2 x i1> %c, <2 x i1>* %y
+  ret void
+}
+
+define void @extract_v8i1_nxv32i1_16(<vscale x 32 x i1> %x, <8 x i1>* %y) {
+; CHECK-LABEL: extract_v8i1_nxv32i1_16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli a1, 1, e8,mf2,ta,mu
+; CHECK-NEXT:    vslidedown.vi v25, v0, 2
+; CHECK-NEXT:    vsetivli a1, 8, e8,m1,ta,mu
+; CHECK-NEXT:    vse1.v v25, (a0)
+; CHECK-NEXT:    ret
+  %c = call <8 x i1> @llvm.experimental.vector.extract.v8i1.nxv32i1(<vscale x 32 x i1> %x, i64 16)
+  store <8 x i1> %c, <8 x i1>* %y
+  ret void
+}
+
 declare <2 x i1> @llvm.experimental.vector.extract.v2i1.v64i1(<64 x i1> %vec, i64 %idx)
 declare <8 x i1> @llvm.experimental.vector.extract.v8i1.v64i1(<64 x i1> %vec, i64 %idx)
 
 declare <2 x i1> @llvm.experimental.vector.extract.v2i1.nxv2i1(<vscale x 2 x i1> %vec, i64 %idx)
 declare <8 x i1> @llvm.experimental.vector.extract.v8i1.nxv2i1(<vscale x 2 x i1> %vec, i64 %idx)
 
+declare <2 x i1> @llvm.experimental.vector.extract.v2i1.nxv32i1(<vscale x 32 x i1> %vec, i64 %idx)
+declare <8 x i1> @llvm.experimental.vector.extract.v8i1.nxv32i1(<vscale x 32 x i1> %vec, i64 %idx)
+
 declare <2 x i1> @llvm.experimental.vector.extract.v2i1.nxv64i1(<vscale x 64 x i1> %vec, i64 %idx)
 declare <8 x i1> @llvm.experimental.vector.extract.v8i1.nxv64i1(<vscale x 64 x i1> %vec, i64 %idx)
 
+declare <2 x i8> @llvm.experimental.vector.extract.v2i8.v4i8(<4 x i8> %vec, i64 %idx)
 declare <2 x i8> @llvm.experimental.vector.extract.v2i8.v8i8(<8 x i8> %vec, i64 %idx)
 declare <2 x i32> @llvm.experimental.vector.extract.v2i32.v8i32(<8 x i32> %vec, i64 %idx)
 
+declare <2 x i8> @llvm.experimental.vector.extract.v2i8.nxv2i8(<vscale x 2 x i8> %vec, i64 %idx)
+
 declare <2 x i32> @llvm.experimental.vector.extract.v2i32.nxv16i32(<vscale x 16 x i32> %vec, i64 %idx)
 declare <8 x i32> @llvm.experimental.vector.extract.v8i32.nxv16i32(<vscale x 16 x i32> %vec, i64 %idx)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
@@ -323,9 +323,77 @@
   ret void
 }
 
+define void @insert_v4i16_v2i16_0(<4 x i16>* %vp, <2 x i16>* %svp) {
+; CHECK-LABEL: insert_v4i16_v2i16_0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli a2, 4, e16,m1,ta,mu
+; CHECK-NEXT:    vle16.v v25, (a0)
+; CHECK-NEXT:    vsetivli a2, 2, e16,m1,ta,mu
+; CHECK-NEXT:    vle16.v v26, (a1)
+; CHECK-NEXT:    vsetivli a1, 2, e16,m1,tu,mu
+; CHECK-NEXT:    vslideup.vi v25, v26, 0
+; CHECK-NEXT:    vsetivli a1, 4, e16,m1,ta,mu
+; CHECK-NEXT:    vse16.v v25, (a0)
+; CHECK-NEXT:    ret
+  %v = load <4 x i16>, <4 x i16>* %vp
+  %sv = load <2 x i16>, <2 x i16>* %svp
+  %c = call <4 x i16> @llvm.experimental.vector.insert.v2i16.v4i16(<4 x i16> %v, <2 x i16> %sv, i64 0)
+  store <4 x i16> %c, <4 x i16>* %vp
+  ret void
+}
+
+define void @insert_v4i16_v2i16_2(<4 x i16>* %vp, <2 x i16>* %svp) {
+; CHECK-LABEL: insert_v4i16_v2i16_2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli a2, 4, e16,m1,ta,mu
+; CHECK-NEXT:    vle16.v v25, (a0)
+; CHECK-NEXT:    vsetivli a2, 2, e16,m1,ta,mu
+; CHECK-NEXT:    vle16.v v26, (a1)
+; CHECK-NEXT:    vsetivli a1, 4, e16,m1,tu,mu
+; CHECK-NEXT:    vslideup.vi v25, v26, 2
+; CHECK-NEXT:    vsetivli a1, 4, e16,m1,ta,mu
+; CHECK-NEXT:    vse16.v v25, (a0)
+; CHECK-NEXT:    ret
+  %v = load <4 x i16>, <4 x i16>* %vp
+  %sv = load <2 x i16>, <2 x i16>* %svp
+  %c = call <4 x i16> @llvm.experimental.vector.insert.v2i16.v4i16(<4 x i16> %v, <2 x i16> %sv, i64 2)
+  store <4 x i16> %c, <4 x i16>* %vp
+  ret void
+}
+
+define <vscale x 2 x i16> @insert_nxv2i16_v2i16_0(<vscale x 2 x i16> %v, <2 x i16>* %svp) {
+; CHECK-LABEL: insert_nxv2i16_v2i16_0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli a1, 2, e16,m1,ta,mu
+; CHECK-NEXT:    vle16.v v25, (a0)
+; CHECK-NEXT:    vsetivli a0, 2, e16,mf2,tu,mu
+; CHECK-NEXT:    vslideup.vi v8, v25, 0
+; CHECK-NEXT:    ret
+  %sv = load <2 x i16>, <2 x i16>* %svp
+  %c = call <vscale x 2 x i16> @llvm.experimental.vector.insert.v2i16.nxv2i16(<vscale x 2 x i16> %v, <2 x i16> %sv, i64 0)
+  ret <vscale x 2 x i16> %c
+}
+
+define <vscale x 2 x i16> @insert_nxv2i16_v2i16_2(<vscale x 2 x i16> %v, <2 x i16>* %svp) {
+; CHECK-LABEL: insert_nxv2i16_v2i16_2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli a1, 2, e16,m1,ta,mu
+; CHECK-NEXT:    vle16.v v25, (a0)
+; CHECK-NEXT:    vsetivli a0, 6, e16,mf2,tu,mu
+; CHECK-NEXT:    vslideup.vi v8, v25, 4
+; CHECK-NEXT:    ret
+  %sv = load <2 x i16>, <2 x i16>* %svp
+  %c = call <vscale x 2 x i16> @llvm.experimental.vector.insert.v2i16.nxv2i16(<vscale x 2 x i16> %v, <2 x i16> %sv, i64 4)
+  ret <vscale x 2 x i16> %c
+}
+
+declare <4 x i16> @llvm.experimental.vector.insert.v2i16.v4i16(<4 x i16>, <2 x i16>, i64)
+
 declare <4 x i32> @llvm.experimental.vector.insert.v2i32.v4i32(<4 x i32>, <2 x i32>, i64)
 declare <8 x i32> @llvm.experimental.vector.insert.v2i32.v8i32(<8 x i32>, <2 x i32>, i64)
 
+declare <vscale x 2 x i16> @llvm.experimental.vector.insert.v2i16.nxv2i16(<vscale x 2 x i16>, <2 x i16>, i64)
+
 declare <vscale x 8 x i32> @llvm.experimental.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32>, <2 x i32>, i64)
 declare <vscale x 8 x i32> @llvm.experimental.vector.insert.v4i32.nxv8i32(<vscale x 8 x i32>, <4 x i32>, i64)
 declare <vscale x 8 x i32> @llvm.experimental.vector.insert.v8i32.nxv8i32(<vscale x 8 x i32>, <8 x i32>, i64)
diff --git a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
--- a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
@@ -55,6 +55,33 @@
   ret %v
 }
 
+define <vscale x 4 x i8> @insert_nxv1i8_nxv4i8_0(<vscale x 4 x i8> %vec, <vscale x 1 x i8> %subvec) {
+; CHECK-LABEL: insert_nxv1i8_nxv4i8_0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    vsetvli a0, a0, e8,mf2,tu,mu
+; CHECK-NEXT:    vslideup.vi v8, v9, 0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 4 x i8> @llvm.experimental.vector.insert.nxv1i8.nxv4i8(<vscale x 4 x i8> %vec, <vscale x 1 x i8> %subvec, i64 0)
+  ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @insert_nxv1i8_nxv4i8_3(<vscale x 4 x i8> %vec, <vscale x 1 x i8> %subvec) {
+; CHECK-LABEL: insert_nxv1i8_nxv4i8_3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    slli a1, a0, 1
+; CHECK-NEXT:    add a1, a1, a0
+; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    vsetvli a0, a0, e8,mf2,tu,mu
+; CHECK-NEXT:    vslideup.vx v8, v9, a1
+; CHECK-NEXT:    ret
+  %v = call <vscale x 4 x i8> @llvm.experimental.vector.insert.nxv1i8.nxv4i8(<vscale x 4 x i8> %vec, <vscale x 1 x i8> %subvec, i64 3)
+  ret <vscale x 4 x i8> %v
+}
+
 define <vscale x 16 x i32> @insert_nxv16i32_nxv8i32_0(<vscale x 16 x i32> %vec, <vscale x 8 x i32> %subvec) {
 ; CHECK-LABEL: insert_nxv16i32_nxv8i32_0:
 ; CHECK:       # %bb.0:
@@ -367,6 +394,8 @@
 declare <vscale x 32 x half> @llvm.experimental.vector.insert.nxv1f16.nxv32f16(<vscale x 32 x half>, <vscale x 1 x half>, i64)
 declare <vscale x 32 x half> @llvm.experimental.vector.insert.nxv2f16.nxv32f16(<vscale x 32 x half>, <vscale x 2 x half>, i64)
 
+declare <vscale x 4 x i8> @llvm.experimental.vector.insert.nxv1i8.nxv4i8(<vscale x 4 x i8>, <vscale x 1 x i8>, i64 %idx)
+
 declare <vscale x 8 x i32> @llvm.experimental.vector.insert.nxv2i32.nxv8i32(<vscale x 8 x i32>, <vscale x 2 x i32>, i64 %idx)
 declare <vscale x 8 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv8i32(<vscale x 8 x i32>, <vscale x 4 x i32>, i64 %idx)