diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -7588,6 +7588,22 @@
     }
   }
 
+  // If after narrowing, the required slide is still greater than LMUL2,
+  // fall back to generic expansion and go through the stack. This is done
+  // for a subtle reason: extracting *all* elements out of a vector is
+  // widely expected to be linear in vector size, but because vslidedown
+  // is linear in LMUL, performing N extracts using vslidedown becomes
+  // O(N^2 / (VLEN/ETYPE)) work. On the surface, going through the stack
+  // seems to have the same problem (the store is linear in LMUL), but the
+  // generic expansion *memoizes* the store, and thus for many extracts of
+  // the same vector we end up with one store and a bunch of loads.
+  // TODO: We don't have the same code for insert_vector_elt because we
+  // have BUILD_VECTOR and handle the degenerate case there. Should we
+  // consider adding an inverse BUILD_VECTOR node?
+  MVT LMUL2VT = getLMUL1VT(ContainerVT).getDoubleNumVectorElementsVT();
+  if (ContainerVT.bitsGT(LMUL2VT) && VecVT.isFixedLengthVector())
+    return SDValue();
+
   // If the index is 0, the vector is already in the right position.
   if (!isNullConstant(Idx)) {
     // Use a VL of 1 to avoid processing more elements than we need.
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
@@ -244,15 +244,43 @@
 
 ; A LMUL8 type
 define i32 @extractelt_v32i32(ptr %x) nounwind {
-; CHECK-LABEL: extractelt_v32i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: li a1, 32
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vsetivli zero, 1, e32, m8, ta, ma
-; CHECK-NEXT: vslidedown.vi v8, v8, 31
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: extractelt_v32i32:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -256
+; RV32-NEXT: sw ra, 252(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 248(sp) # 4-byte Folded Spill
+; RV32-NEXT: addi s0, sp, 256
+; RV32-NEXT: andi sp, sp, -128
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: vle32.v v8, (a0)
+; RV32-NEXT: mv a0, sp
+; RV32-NEXT: vse32.v v8, (a0)
+; RV32-NEXT: lw a0, 124(sp)
+; RV32-NEXT: addi sp, s0, -256
+; RV32-NEXT: lw ra, 252(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 248(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 256
+; RV32-NEXT: ret
+;
+; RV64-LABEL: extractelt_v32i32:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -256
+; RV64-NEXT: sd ra, 248(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 240(sp) # 8-byte Folded Spill
+; RV64-NEXT: addi s0, sp, 256
+; RV64-NEXT: andi sp, sp, -128
+; RV64-NEXT: li a1, 32
+; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV64-NEXT: vle32.v v8, (a0)
+; RV64-NEXT: mv a0, sp
+; RV64-NEXT: vse32.v v8, (a0)
+; RV64-NEXT: lw a0, 124(sp)
+; RV64-NEXT: addi sp, s0, -256
+; RV64-NEXT: ld ra, 248(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 240(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 256
+; RV64-NEXT: ret
   %a = load <32 x i32>, ptr %x
   %b = extractelement <32 x i32> %a, i32 31
   ret i32 %b
@@ -260,16 +288,45 @@
 
 ; Exercise type legalization for type beyond LMUL8
 define i32 @extractelt_v64i32(ptr %x) nounwind {
-; CHECK-LABEL: extractelt_v64i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi a0, a0, 128
-; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 1, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v8, 31 -; CHECK-NEXT: vmv.x.s a0, v8 -; CHECK-NEXT: ret +; RV32-LABEL: extractelt_v64i32: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -256 +; RV32-NEXT: sw ra, 252(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 248(sp) # 4-byte Folded Spill +; RV32-NEXT: addi s0, sp, 256 +; RV32-NEXT: andi sp, sp, -128 +; RV32-NEXT: addi a0, a0, 128 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: mv a0, sp +; RV32-NEXT: vse32.v v8, (a0) +; RV32-NEXT: lw a0, 124(sp) +; RV32-NEXT: addi sp, s0, -256 +; RV32-NEXT: lw ra, 252(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 248(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 256 +; RV32-NEXT: ret +; +; RV64-LABEL: extractelt_v64i32: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -256 +; RV64-NEXT: sd ra, 248(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 240(sp) # 8-byte Folded Spill +; RV64-NEXT: addi s0, sp, 256 +; RV64-NEXT: andi sp, sp, -128 +; RV64-NEXT: addi a0, a0, 128 +; RV64-NEXT: li a1, 32 +; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: mv a0, sp +; RV64-NEXT: vse32.v v8, (a0) +; RV64-NEXT: lw a0, 124(sp) +; RV64-NEXT: addi sp, s0, -256 +; RV64-NEXT: ld ra, 248(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 240(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 256 +; RV64-NEXT: ret %a = load <64 x i32>, ptr %x %b = extractelement <64 x i32> %a, i32 63 ret i32 %b @@ -548,16 +605,105 @@ } define i32 @extractelt_v32i32_idx(ptr %x, i32 zeroext %idx) nounwind { -; CHECK-LABEL: extractelt_v32i32_idx: -; CHECK: # %bb.0: -; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vsetivli zero, 1, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v8, a1 -; CHECK-NEXT: vmv.x.s a0, v8 -; CHECK-NEXT: ret +; RV32NOM-LABEL: extractelt_v32i32_idx: +; RV32NOM: # %bb.0: +; RV32NOM-NEXT: addi sp, sp, -256 +; RV32NOM-NEXT: sw ra, 252(sp) # 4-byte Folded Spill +; RV32NOM-NEXT: sw s0, 248(sp) # 4-byte Folded Spill +; RV32NOM-NEXT: sw s2, 244(sp) # 4-byte Folded Spill +; RV32NOM-NEXT: addi s0, sp, 256 +; RV32NOM-NEXT: andi sp, sp, -128 +; RV32NOM-NEXT: mv s2, a0 +; RV32NOM-NEXT: andi a0, a1, 31 +; RV32NOM-NEXT: li a1, 4 +; RV32NOM-NEXT: call __mulsi3@plt +; RV32NOM-NEXT: li a1, 32 +; RV32NOM-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32NOM-NEXT: vle32.v v8, (s2) +; RV32NOM-NEXT: mv a1, sp +; RV32NOM-NEXT: add a0, a1, a0 +; RV32NOM-NEXT: vadd.vv v8, v8, v8 +; RV32NOM-NEXT: vse32.v v8, (a1) +; RV32NOM-NEXT: lw a0, 0(a0) +; RV32NOM-NEXT: addi sp, s0, -256 +; RV32NOM-NEXT: lw ra, 252(sp) # 4-byte Folded Reload +; RV32NOM-NEXT: lw s0, 248(sp) # 4-byte Folded Reload +; RV32NOM-NEXT: lw s2, 244(sp) # 4-byte Folded Reload +; RV32NOM-NEXT: addi sp, sp, 256 +; RV32NOM-NEXT: ret +; +; RV32M-LABEL: extractelt_v32i32_idx: +; RV32M: # %bb.0: +; RV32M-NEXT: addi sp, sp, -256 +; RV32M-NEXT: sw ra, 252(sp) # 4-byte Folded Spill +; RV32M-NEXT: sw s0, 248(sp) # 4-byte Folded Spill +; RV32M-NEXT: addi s0, sp, 256 +; RV32M-NEXT: andi sp, sp, -128 +; RV32M-NEXT: andi a1, a1, 31 +; RV32M-NEXT: li a2, 32 +; RV32M-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32M-NEXT: vle32.v v8, (a0) +; RV32M-NEXT: slli a1, a1, 2 +; RV32M-NEXT: mv a0, sp +; RV32M-NEXT: or a1, a0, a1 +; RV32M-NEXT: vadd.vv v8, v8, 
v8 +; RV32M-NEXT: vse32.v v8, (a0) +; RV32M-NEXT: lw a0, 0(a1) +; RV32M-NEXT: addi sp, s0, -256 +; RV32M-NEXT: lw ra, 252(sp) # 4-byte Folded Reload +; RV32M-NEXT: lw s0, 248(sp) # 4-byte Folded Reload +; RV32M-NEXT: addi sp, sp, 256 +; RV32M-NEXT: ret +; +; RV64NOM-LABEL: extractelt_v32i32_idx: +; RV64NOM: # %bb.0: +; RV64NOM-NEXT: addi sp, sp, -256 +; RV64NOM-NEXT: sd ra, 248(sp) # 8-byte Folded Spill +; RV64NOM-NEXT: sd s0, 240(sp) # 8-byte Folded Spill +; RV64NOM-NEXT: sd s2, 232(sp) # 8-byte Folded Spill +; RV64NOM-NEXT: addi s0, sp, 256 +; RV64NOM-NEXT: andi sp, sp, -128 +; RV64NOM-NEXT: mv s2, a0 +; RV64NOM-NEXT: andi a0, a1, 31 +; RV64NOM-NEXT: li a1, 4 +; RV64NOM-NEXT: call __muldi3@plt +; RV64NOM-NEXT: li a1, 32 +; RV64NOM-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV64NOM-NEXT: vle32.v v8, (s2) +; RV64NOM-NEXT: mv a1, sp +; RV64NOM-NEXT: add a0, a1, a0 +; RV64NOM-NEXT: vadd.vv v8, v8, v8 +; RV64NOM-NEXT: vse32.v v8, (a1) +; RV64NOM-NEXT: lw a0, 0(a0) +; RV64NOM-NEXT: addi sp, s0, -256 +; RV64NOM-NEXT: ld ra, 248(sp) # 8-byte Folded Reload +; RV64NOM-NEXT: ld s0, 240(sp) # 8-byte Folded Reload +; RV64NOM-NEXT: ld s2, 232(sp) # 8-byte Folded Reload +; RV64NOM-NEXT: addi sp, sp, 256 +; RV64NOM-NEXT: ret +; +; RV64M-LABEL: extractelt_v32i32_idx: +; RV64M: # %bb.0: +; RV64M-NEXT: addi sp, sp, -256 +; RV64M-NEXT: sd ra, 248(sp) # 8-byte Folded Spill +; RV64M-NEXT: sd s0, 240(sp) # 8-byte Folded Spill +; RV64M-NEXT: addi s0, sp, 256 +; RV64M-NEXT: andi sp, sp, -128 +; RV64M-NEXT: andi a1, a1, 31 +; RV64M-NEXT: li a2, 32 +; RV64M-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV64M-NEXT: vle32.v v8, (a0) +; RV64M-NEXT: slli a1, a1, 2 +; RV64M-NEXT: mv a0, sp +; RV64M-NEXT: or a1, a0, a1 +; RV64M-NEXT: vadd.vv v8, v8, v8 +; RV64M-NEXT: vse32.v v8, (a0) +; RV64M-NEXT: lw a0, 0(a1) +; RV64M-NEXT: addi sp, s0, -256 +; RV64M-NEXT: ld ra, 248(sp) # 8-byte Folded Reload +; RV64M-NEXT: ld s0, 240(sp) # 8-byte Folded Reload +; RV64M-NEXT: addi sp, sp, 256 +; RV64M-NEXT: ret %a = load <32 x i32>, ptr %x %b = add <32 x i32> %a, %a %c = extractelement <32 x i32> %b, i32 %idx diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll @@ -310,11 +310,22 @@ ; ; RV32-LABEL: fp2si_v8f64_v8i8: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -128 +; RV32-NEXT: .cfi_def_cfa_offset 128 +; RV32-NEXT: sw ra, 124(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 120(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset ra, -4 +; RV32-NEXT: .cfi_offset s0, -8 +; RV32-NEXT: addi s0, sp, 128 +; RV32-NEXT: .cfi_def_cfa s0, 0 +; RV32-NEXT: andi sp, sp, -64 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: mv a0, sp +; RV32-NEXT: vse64.v v8, (a0) ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vslidedown.vi v12, v8, 1 -; RV32-NEXT: vfmv.f.s fa3, v12 +; RV32-NEXT: vslidedown.vi v10, v8, 1 +; RV32-NEXT: vfmv.f.s fa3, v10 ; RV32-NEXT: lui a0, %hi(.LCPI12_0) ; RV32-NEXT: fld fa5, %lo(.LCPI12_0)(a0) ; RV32-NEXT: lui a0, %hi(.LCPI12_1) @@ -333,11 +344,11 @@ ; RV32-NEXT: fcvt.w.d a3, fa3, rtz ; RV32-NEXT: and a2, a2, a3 ; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vslide1down.vx v12, v8, a2 -; RV32-NEXT: vslide1down.vx v12, v12, a0 +; RV32-NEXT: vslide1down.vx v10, v8, a2 +; RV32-NEXT: vslide1down.vx v10, v10, a0 ; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma -; RV32-NEXT: vslidedown.vi 
v14, v8, 2 -; RV32-NEXT: vfmv.f.s fa3, v14 +; RV32-NEXT: vslidedown.vi v12, v8, 2 +; RV32-NEXT: vfmv.f.s fa3, v12 ; RV32-NEXT: feq.d a0, fa3, fa3 ; RV32-NEXT: neg a0, a0 ; RV32-NEXT: fmax.d fa3, fa3, fa5 @@ -345,72 +356,75 @@ ; RV32-NEXT: fcvt.w.d a2, fa3, rtz ; RV32-NEXT: and a0, a0, a2 ; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vslide1down.vx v12, v12, a0 +; RV32-NEXT: vslide1down.vx v10, v10, a0 ; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma -; RV32-NEXT: vslidedown.vi v14, v8, 3 -; RV32-NEXT: vfmv.f.s fa3, v14 +; RV32-NEXT: vslidedown.vi v8, v8, 3 +; RV32-NEXT: vfmv.f.s fa3, v8 ; RV32-NEXT: feq.d a0, fa3, fa3 -; RV32-NEXT: neg a0, a0 ; RV32-NEXT: fmax.d fa3, fa3, fa5 ; RV32-NEXT: fmin.d fa3, fa3, fa4 ; RV32-NEXT: fcvt.w.d a2, fa3, rtz -; RV32-NEXT: and a0, a0, a2 ; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vslide1down.vx v12, v12, a0 -; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma -; RV32-NEXT: vslidedown.vi v16, v8, 4 -; RV32-NEXT: vfmv.f.s fa3, v16 -; RV32-NEXT: feq.d a0, fa3, fa3 +; RV32-NEXT: fld fa3, 32(sp) ; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: vslide1down.vx v8, v10, a0 +; RV32-NEXT: feq.d a0, fa3, fa3 ; RV32-NEXT: fmax.d fa3, fa3, fa5 ; RV32-NEXT: fmin.d fa3, fa3, fa4 ; RV32-NEXT: fcvt.w.d a2, fa3, rtz +; RV32-NEXT: fld fa3, 40(sp) +; RV32-NEXT: neg a0, a0 ; RV32-NEXT: and a0, a0, a2 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vslide1down.vx v12, v12, a0 -; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma -; RV32-NEXT: vslidedown.vi v16, v8, 5 -; RV32-NEXT: vfmv.f.s fa3, v16 +; RV32-NEXT: vslide1down.vx v8, v8, a0 ; RV32-NEXT: feq.d a0, fa3, fa3 -; RV32-NEXT: neg a0, a0 ; RV32-NEXT: fmax.d fa3, fa3, fa5 ; RV32-NEXT: fmin.d fa3, fa3, fa4 ; RV32-NEXT: fcvt.w.d a2, fa3, rtz +; RV32-NEXT: fld fa3, 48(sp) +; RV32-NEXT: neg a0, a0 ; RV32-NEXT: and a0, a0, a2 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vslide1down.vx v12, v12, a0 -; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma -; RV32-NEXT: vslidedown.vi v16, v8, 6 -; RV32-NEXT: vfmv.f.s fa3, v16 +; RV32-NEXT: vslide1down.vx v8, v8, a0 ; RV32-NEXT: feq.d a0, fa3, fa3 -; RV32-NEXT: neg a0, a0 ; RV32-NEXT: fmax.d fa3, fa3, fa5 ; RV32-NEXT: fmin.d fa3, fa3, fa4 ; RV32-NEXT: fcvt.w.d a2, fa3, rtz +; RV32-NEXT: fld fa3, 56(sp) +; RV32-NEXT: neg a0, a0 ; RV32-NEXT: and a0, a0, a2 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vslide1down.vx v12, v12, a0 -; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma -; RV32-NEXT: vslidedown.vi v8, v8, 7 -; RV32-NEXT: vfmv.f.s fa3, v8 +; RV32-NEXT: vslide1down.vx v8, v8, a0 ; RV32-NEXT: feq.d a0, fa3, fa3 ; RV32-NEXT: neg a0, a0 ; RV32-NEXT: fmax.d fa5, fa3, fa5 ; RV32-NEXT: fmin.d fa5, fa5, fa4 ; RV32-NEXT: fcvt.w.d a2, fa5, rtz ; RV32-NEXT: and a0, a0, a2 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vslide1down.vx v8, v12, a0 +; RV32-NEXT: vslide1down.vx v8, v8, a0 ; RV32-NEXT: vse8.v v8, (a1) +; RV32-NEXT: addi sp, s0, -128 +; RV32-NEXT: lw ra, 124(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 120(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 128 ; RV32-NEXT: ret ; ; RV64-LABEL: fp2si_v8f64_v8i8: ; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -128 +; RV64-NEXT: .cfi_def_cfa_offset 128 +; RV64-NEXT: sd ra, 120(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 112(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset ra, -8 +; RV64-NEXT: .cfi_offset s0, -16 +; RV64-NEXT: addi s0, sp, 128 +; RV64-NEXT: .cfi_def_cfa s0, 0 +; RV64-NEXT: andi sp, sp, -64 ; RV64-NEXT: vsetivli zero, 
8, e64, m4, ta, ma ; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: mv a0, sp +; RV64-NEXT: vse64.v v8, (a0) ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-NEXT: vslidedown.vi v12, v8, 1 -; RV64-NEXT: vfmv.f.s fa3, v12 +; RV64-NEXT: vslidedown.vi v10, v8, 1 +; RV64-NEXT: vfmv.f.s fa3, v10 ; RV64-NEXT: lui a0, %hi(.LCPI12_0) ; RV64-NEXT: fld fa5, %lo(.LCPI12_0)(a0) ; RV64-NEXT: lui a0, %hi(.LCPI12_1) @@ -429,11 +443,11 @@ ; RV64-NEXT: fcvt.l.d a3, fa3, rtz ; RV64-NEXT: and a2, a2, a3 ; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vslide1down.vx v12, v8, a2 -; RV64-NEXT: vslide1down.vx v12, v12, a0 +; RV64-NEXT: vslide1down.vx v10, v8, a2 +; RV64-NEXT: vslide1down.vx v10, v10, a0 ; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, ma -; RV64-NEXT: vslidedown.vi v14, v8, 2 -; RV64-NEXT: vfmv.f.s fa3, v14 +; RV64-NEXT: vslidedown.vi v12, v8, 2 +; RV64-NEXT: vfmv.f.s fa3, v12 ; RV64-NEXT: feq.d a0, fa3, fa3 ; RV64-NEXT: neg a0, a0 ; RV64-NEXT: fmax.d fa3, fa3, fa5 @@ -441,63 +455,55 @@ ; RV64-NEXT: fcvt.l.d a2, fa3, rtz ; RV64-NEXT: and a0, a0, a2 ; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vslide1down.vx v12, v12, a0 +; RV64-NEXT: vslide1down.vx v10, v10, a0 ; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, ma -; RV64-NEXT: vslidedown.vi v14, v8, 3 -; RV64-NEXT: vfmv.f.s fa3, v14 +; RV64-NEXT: vslidedown.vi v8, v8, 3 +; RV64-NEXT: vfmv.f.s fa3, v8 ; RV64-NEXT: feq.d a0, fa3, fa3 -; RV64-NEXT: neg a0, a0 ; RV64-NEXT: fmax.d fa3, fa3, fa5 ; RV64-NEXT: fmin.d fa3, fa3, fa4 ; RV64-NEXT: fcvt.l.d a2, fa3, rtz -; RV64-NEXT: and a0, a0, a2 ; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vslide1down.vx v12, v12, a0 -; RV64-NEXT: vsetivli zero, 1, e64, m4, ta, ma -; RV64-NEXT: vslidedown.vi v16, v8, 4 -; RV64-NEXT: vfmv.f.s fa3, v16 -; RV64-NEXT: feq.d a0, fa3, fa3 +; RV64-NEXT: fld fa3, 32(sp) ; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: vslide1down.vx v8, v10, a0 +; RV64-NEXT: feq.d a0, fa3, fa3 ; RV64-NEXT: fmax.d fa3, fa3, fa5 ; RV64-NEXT: fmin.d fa3, fa3, fa4 ; RV64-NEXT: fcvt.l.d a2, fa3, rtz +; RV64-NEXT: fld fa3, 40(sp) +; RV64-NEXT: neg a0, a0 ; RV64-NEXT: and a0, a0, a2 -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vslide1down.vx v12, v12, a0 -; RV64-NEXT: vsetivli zero, 1, e64, m4, ta, ma -; RV64-NEXT: vslidedown.vi v16, v8, 5 -; RV64-NEXT: vfmv.f.s fa3, v16 +; RV64-NEXT: vslide1down.vx v8, v8, a0 ; RV64-NEXT: feq.d a0, fa3, fa3 -; RV64-NEXT: neg a0, a0 ; RV64-NEXT: fmax.d fa3, fa3, fa5 ; RV64-NEXT: fmin.d fa3, fa3, fa4 ; RV64-NEXT: fcvt.l.d a2, fa3, rtz +; RV64-NEXT: fld fa3, 48(sp) +; RV64-NEXT: neg a0, a0 ; RV64-NEXT: and a0, a0, a2 -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vslide1down.vx v12, v12, a0 -; RV64-NEXT: vsetivli zero, 1, e64, m4, ta, ma -; RV64-NEXT: vslidedown.vi v16, v8, 6 -; RV64-NEXT: vfmv.f.s fa3, v16 +; RV64-NEXT: vslide1down.vx v8, v8, a0 ; RV64-NEXT: feq.d a0, fa3, fa3 -; RV64-NEXT: neg a0, a0 ; RV64-NEXT: fmax.d fa3, fa3, fa5 ; RV64-NEXT: fmin.d fa3, fa3, fa4 ; RV64-NEXT: fcvt.l.d a2, fa3, rtz +; RV64-NEXT: fld fa3, 56(sp) +; RV64-NEXT: neg a0, a0 ; RV64-NEXT: and a0, a0, a2 -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vslide1down.vx v12, v12, a0 -; RV64-NEXT: vsetivli zero, 1, e64, m4, ta, ma -; RV64-NEXT: vslidedown.vi v8, v8, 7 -; RV64-NEXT: vfmv.f.s fa3, v8 +; RV64-NEXT: vslide1down.vx v8, v8, a0 ; RV64-NEXT: feq.d a0, fa3, fa3 ; RV64-NEXT: neg a0, a0 ; RV64-NEXT: fmax.d fa5, fa3, fa5 ; RV64-NEXT: fmin.d fa5, fa5, fa4 ; RV64-NEXT: fcvt.l.d a2, fa5, rtz ; 
RV64-NEXT: and a0, a0, a2 -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vslide1down.vx v8, v12, a0 +; RV64-NEXT: vslide1down.vx v8, v8, a0 ; RV64-NEXT: vse8.v v8, (a1) +; RV64-NEXT: addi sp, s0, -128 +; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 128 ; RV64-NEXT: ret %a = load <8 x double>, ptr %x %d = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f64(<8 x double> %a) @@ -510,8 +516,19 @@ ; ; RV32-LABEL: fp2ui_v8f64_v8i8: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -128 +; RV32-NEXT: .cfi_def_cfa_offset 128 +; RV32-NEXT: sw ra, 124(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 120(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset ra, -4 +; RV32-NEXT: .cfi_offset s0, -8 +; RV32-NEXT: addi s0, sp, 128 +; RV32-NEXT: .cfi_def_cfa s0, 0 +; RV32-NEXT: andi sp, sp, -64 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: mv a0, sp +; RV32-NEXT: vse64.v v8, (a0) ; RV32-NEXT: lui a0, %hi(.LCPI13_0) ; RV32-NEXT: fld fa5, %lo(.LCPI13_0)(a0) ; RV32-NEXT: vfmv.f.s fa4, v8 @@ -520,70 +537,73 @@ ; RV32-NEXT: fmin.d fa4, fa4, fa5 ; RV32-NEXT: fcvt.wu.d a0, fa4, rtz ; RV32-NEXT: vsetvli zero, zero, e8, mf2, ta, ma -; RV32-NEXT: vslide1down.vx v12, v8, a0 +; RV32-NEXT: vslide1down.vx v10, v8, a0 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vslidedown.vi v13, v8, 1 -; RV32-NEXT: vfmv.f.s fa4, v13 +; RV32-NEXT: vslidedown.vi v11, v8, 1 +; RV32-NEXT: vfmv.f.s fa4, v11 ; RV32-NEXT: fmax.d fa4, fa4, fa3 ; RV32-NEXT: fmin.d fa4, fa4, fa5 ; RV32-NEXT: fcvt.wu.d a0, fa4, rtz ; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vslide1down.vx v12, v12, a0 +; RV32-NEXT: vslide1down.vx v10, v10, a0 ; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma -; RV32-NEXT: vslidedown.vi v14, v8, 2 -; RV32-NEXT: vfmv.f.s fa4, v14 +; RV32-NEXT: vslidedown.vi v12, v8, 2 +; RV32-NEXT: vfmv.f.s fa4, v12 ; RV32-NEXT: fmax.d fa4, fa4, fa3 ; RV32-NEXT: fmin.d fa4, fa4, fa5 ; RV32-NEXT: fcvt.wu.d a0, fa4, rtz ; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vslide1down.vx v12, v12, a0 +; RV32-NEXT: vslide1down.vx v10, v10, a0 ; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma -; RV32-NEXT: vslidedown.vi v14, v8, 3 -; RV32-NEXT: vfmv.f.s fa4, v14 +; RV32-NEXT: vslidedown.vi v8, v8, 3 +; RV32-NEXT: vfmv.f.s fa4, v8 ; RV32-NEXT: fmax.d fa4, fa4, fa3 -; RV32-NEXT: fmin.d fa4, fa4, fa5 -; RV32-NEXT: fcvt.wu.d a0, fa4, rtz ; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vslide1down.vx v12, v12, a0 -; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma -; RV32-NEXT: vslidedown.vi v16, v8, 4 -; RV32-NEXT: vfmv.f.s fa4, v16 -; RV32-NEXT: fmax.d fa4, fa4, fa3 +; RV32-NEXT: fld fa2, 32(sp) ; RV32-NEXT: fmin.d fa4, fa4, fa5 ; RV32-NEXT: fcvt.wu.d a0, fa4, rtz -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vslide1down.vx v12, v12, a0 -; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma -; RV32-NEXT: vslidedown.vi v16, v8, 5 -; RV32-NEXT: vfmv.f.s fa4, v16 +; RV32-NEXT: fld fa4, 40(sp) +; RV32-NEXT: fmax.d fa2, fa2, fa3 +; RV32-NEXT: fmin.d fa2, fa2, fa5 +; RV32-NEXT: fcvt.wu.d a2, fa2, rtz ; RV32-NEXT: fmax.d fa4, fa4, fa3 +; RV32-NEXT: fld fa2, 48(sp) ; RV32-NEXT: fmin.d fa4, fa4, fa5 -; RV32-NEXT: fcvt.wu.d a0, fa4, rtz -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vslide1down.vx v12, v12, a0 -; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma -; RV32-NEXT: vslidedown.vi v16, v8, 6 -; RV32-NEXT: vfmv.f.s fa4, v16 -; RV32-NEXT: fmax.d fa4, fa4, fa3 +; RV32-NEXT: 
fcvt.wu.d a3, fa4, rtz +; RV32-NEXT: vslide1down.vx v8, v10, a0 +; RV32-NEXT: fmax.d fa4, fa2, fa3 ; RV32-NEXT: fmin.d fa4, fa4, fa5 ; RV32-NEXT: fcvt.wu.d a0, fa4, rtz -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vslide1down.vx v12, v12, a0 -; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma -; RV32-NEXT: vslidedown.vi v8, v8, 7 -; RV32-NEXT: vfmv.f.s fa4, v8 +; RV32-NEXT: fld fa4, 56(sp) +; RV32-NEXT: vslide1down.vx v8, v8, a2 +; RV32-NEXT: vslide1down.vx v8, v8, a3 +; RV32-NEXT: vslide1down.vx v8, v8, a0 ; RV32-NEXT: fmax.d fa4, fa4, fa3 ; RV32-NEXT: fmin.d fa5, fa4, fa5 ; RV32-NEXT: fcvt.wu.d a0, fa5, rtz -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vslide1down.vx v8, v12, a0 +; RV32-NEXT: vslide1down.vx v8, v8, a0 ; RV32-NEXT: vse8.v v8, (a1) +; RV32-NEXT: addi sp, s0, -128 +; RV32-NEXT: lw ra, 124(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 120(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 128 ; RV32-NEXT: ret ; ; RV64-LABEL: fp2ui_v8f64_v8i8: ; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -128 +; RV64-NEXT: .cfi_def_cfa_offset 128 +; RV64-NEXT: sd ra, 120(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 112(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset ra, -8 +; RV64-NEXT: .cfi_offset s0, -16 +; RV64-NEXT: addi s0, sp, 128 +; RV64-NEXT: .cfi_def_cfa s0, 0 +; RV64-NEXT: andi sp, sp, -64 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: mv a0, sp +; RV64-NEXT: vse64.v v8, (a0) ; RV64-NEXT: lui a0, %hi(.LCPI13_0) ; RV64-NEXT: fld fa5, %lo(.LCPI13_0)(a0) ; RV64-NEXT: vfmv.f.s fa4, v8 @@ -592,64 +612,56 @@ ; RV64-NEXT: fmin.d fa4, fa4, fa5 ; RV64-NEXT: fcvt.lu.d a0, fa4, rtz ; RV64-NEXT: vsetvli zero, zero, e8, mf2, ta, ma -; RV64-NEXT: vslide1down.vx v12, v8, a0 +; RV64-NEXT: vslide1down.vx v10, v8, a0 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-NEXT: vslidedown.vi v13, v8, 1 -; RV64-NEXT: vfmv.f.s fa4, v13 +; RV64-NEXT: vslidedown.vi v11, v8, 1 +; RV64-NEXT: vfmv.f.s fa4, v11 ; RV64-NEXT: fmax.d fa4, fa4, fa3 ; RV64-NEXT: fmin.d fa4, fa4, fa5 ; RV64-NEXT: fcvt.lu.d a0, fa4, rtz ; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vslide1down.vx v12, v12, a0 +; RV64-NEXT: vslide1down.vx v10, v10, a0 ; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, ma -; RV64-NEXT: vslidedown.vi v14, v8, 2 -; RV64-NEXT: vfmv.f.s fa4, v14 +; RV64-NEXT: vslidedown.vi v12, v8, 2 +; RV64-NEXT: vfmv.f.s fa4, v12 ; RV64-NEXT: fmax.d fa4, fa4, fa3 ; RV64-NEXT: fmin.d fa4, fa4, fa5 ; RV64-NEXT: fcvt.lu.d a0, fa4, rtz ; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vslide1down.vx v12, v12, a0 +; RV64-NEXT: vslide1down.vx v10, v10, a0 ; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, ma -; RV64-NEXT: vslidedown.vi v14, v8, 3 -; RV64-NEXT: vfmv.f.s fa4, v14 +; RV64-NEXT: vslidedown.vi v8, v8, 3 +; RV64-NEXT: vfmv.f.s fa4, v8 ; RV64-NEXT: fmax.d fa4, fa4, fa3 -; RV64-NEXT: fmin.d fa4, fa4, fa5 -; RV64-NEXT: fcvt.lu.d a0, fa4, rtz ; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vslide1down.vx v12, v12, a0 -; RV64-NEXT: vsetivli zero, 1, e64, m4, ta, ma -; RV64-NEXT: vslidedown.vi v16, v8, 4 -; RV64-NEXT: vfmv.f.s fa4, v16 -; RV64-NEXT: fmax.d fa4, fa4, fa3 +; RV64-NEXT: fld fa2, 32(sp) ; RV64-NEXT: fmin.d fa4, fa4, fa5 ; RV64-NEXT: fcvt.lu.d a0, fa4, rtz -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vslide1down.vx v12, v12, a0 -; RV64-NEXT: vsetivli zero, 1, e64, m4, ta, ma -; RV64-NEXT: vslidedown.vi v16, v8, 5 -; RV64-NEXT: vfmv.f.s fa4, v16 +; RV64-NEXT: fld fa4, 40(sp) +; 
RV64-NEXT: fmax.d fa2, fa2, fa3 +; RV64-NEXT: fmin.d fa2, fa2, fa5 +; RV64-NEXT: fcvt.lu.d a2, fa2, rtz ; RV64-NEXT: fmax.d fa4, fa4, fa3 +; RV64-NEXT: fld fa2, 48(sp) ; RV64-NEXT: fmin.d fa4, fa4, fa5 -; RV64-NEXT: fcvt.lu.d a0, fa4, rtz -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vslide1down.vx v12, v12, a0 -; RV64-NEXT: vsetivli zero, 1, e64, m4, ta, ma -; RV64-NEXT: vslidedown.vi v16, v8, 6 -; RV64-NEXT: vfmv.f.s fa4, v16 -; RV64-NEXT: fmax.d fa4, fa4, fa3 +; RV64-NEXT: fcvt.lu.d a3, fa4, rtz +; RV64-NEXT: vslide1down.vx v8, v10, a0 +; RV64-NEXT: fmax.d fa4, fa2, fa3 ; RV64-NEXT: fmin.d fa4, fa4, fa5 ; RV64-NEXT: fcvt.lu.d a0, fa4, rtz -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vslide1down.vx v12, v12, a0 -; RV64-NEXT: vsetivli zero, 1, e64, m4, ta, ma -; RV64-NEXT: vslidedown.vi v8, v8, 7 -; RV64-NEXT: vfmv.f.s fa4, v8 +; RV64-NEXT: fld fa4, 56(sp) +; RV64-NEXT: vslide1down.vx v8, v8, a2 +; RV64-NEXT: vslide1down.vx v8, v8, a3 +; RV64-NEXT: vslide1down.vx v8, v8, a0 ; RV64-NEXT: fmax.d fa4, fa4, fa3 ; RV64-NEXT: fmin.d fa5, fa4, fa5 ; RV64-NEXT: fcvt.lu.d a0, fa5, rtz -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vslide1down.vx v8, v12, a0 +; RV64-NEXT: vslide1down.vx v8, v8, a0 ; RV64-NEXT: vse8.v v8, (a1) +; RV64-NEXT: addi sp, s0, -128 +; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 128 ; RV64-NEXT: ret %a = load <8 x double>, ptr %x %d = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f64(<8 x double> %a) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll @@ -614,10 +614,17 @@ define i32 @explode_16xi32(<16 x i32> %v) { ; RV32-LABEL: explode_16xi32: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: .cfi_offset s0, -4 +; RV32-NEXT: addi sp, sp, -128 +; RV32-NEXT: .cfi_def_cfa_offset 128 +; RV32-NEXT: sw ra, 124(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 120(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s2, 116(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset ra, -4 +; RV32-NEXT: .cfi_offset s0, -8 +; RV32-NEXT: .cfi_offset s2, -12 +; RV32-NEXT: addi s0, sp, 128 +; RV32-NEXT: .cfi_def_cfa s0, 0 +; RV32-NEXT: andi sp, sp, -64 ; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: vslidedown.vi v12, v8, 1 @@ -635,23 +642,17 @@ ; RV32-NEXT: vmv.x.s a6, v12 ; RV32-NEXT: vslidedown.vi v12, v8, 7 ; RV32-NEXT: vmv.x.s a7, v12 -; RV32-NEXT: vsetivli zero, 1, e32, m4, ta, ma -; RV32-NEXT: vslidedown.vi v12, v8, 8 -; RV32-NEXT: vmv.x.s t0, v12 -; RV32-NEXT: vslidedown.vi v12, v8, 9 -; RV32-NEXT: vmv.x.s t1, v12 -; RV32-NEXT: vslidedown.vi v12, v8, 10 -; RV32-NEXT: vmv.x.s t2, v12 -; RV32-NEXT: vslidedown.vi v12, v8, 11 -; RV32-NEXT: vmv.x.s t3, v12 -; RV32-NEXT: vslidedown.vi v12, v8, 12 -; RV32-NEXT: vmv.x.s t4, v12 -; RV32-NEXT: vslidedown.vi v12, v8, 13 -; RV32-NEXT: vmv.x.s t5, v12 -; RV32-NEXT: vslidedown.vi v12, v8, 14 -; RV32-NEXT: vmv.x.s t6, v12 -; RV32-NEXT: vslidedown.vi v8, v8, 15 -; RV32-NEXT: vmv.x.s s0, v8 +; RV32-NEXT: mv t0, sp +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vse32.v v8, (t0) +; RV32-NEXT: lw t0, 32(sp) +; RV32-NEXT: lw t1, 36(sp) +; RV32-NEXT: lw t2, 40(sp) +; 
RV32-NEXT: lw t3, 44(sp) +; RV32-NEXT: lw t4, 48(sp) +; RV32-NEXT: lw t5, 52(sp) +; RV32-NEXT: lw t6, 56(sp) +; RV32-NEXT: lw s2, 60(sp) ; RV32-NEXT: add a0, a0, a1 ; RV32-NEXT: add a2, a2, a3 ; RV32-NEXT: add a0, a0, a2 @@ -659,24 +660,34 @@ ; RV32-NEXT: add a4, a4, a6 ; RV32-NEXT: add a0, a0, a4 ; RV32-NEXT: add a7, a7, t0 -; RV32-NEXT: add a7, a7, t1 -; RV32-NEXT: add a7, a7, t2 ; RV32-NEXT: add a0, a0, a7 -; RV32-NEXT: add t3, t3, t4 -; RV32-NEXT: add t3, t3, t5 -; RV32-NEXT: add t3, t3, t6 -; RV32-NEXT: add t3, t3, s0 -; RV32-NEXT: add a0, a0, t3 -; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: add t1, t1, t2 +; RV32-NEXT: add t1, t1, t3 +; RV32-NEXT: add a0, a0, t1 +; RV32-NEXT: add t4, t4, t5 +; RV32-NEXT: add t4, t4, t6 +; RV32-NEXT: add t4, t4, s2 +; RV32-NEXT: add a0, a0, t4 +; RV32-NEXT: addi sp, s0, -128 +; RV32-NEXT: lw ra, 124(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 120(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s2, 116(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 128 ; RV32-NEXT: ret ; ; RV64-LABEL: explode_16xi32: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: .cfi_def_cfa_offset 16 -; RV64-NEXT: sd s0, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: .cfi_offset s0, -8 +; RV64-NEXT: addi sp, sp, -128 +; RV64-NEXT: .cfi_def_cfa_offset 128 +; RV64-NEXT: sd ra, 120(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 112(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s2, 104(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset ra, -8 +; RV64-NEXT: .cfi_offset s0, -16 +; RV64-NEXT: .cfi_offset s2, -24 +; RV64-NEXT: addi s0, sp, 128 +; RV64-NEXT: .cfi_def_cfa s0, 0 +; RV64-NEXT: andi sp, sp, -64 ; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: vslidedown.vi v12, v8, 1 @@ -694,23 +705,17 @@ ; RV64-NEXT: vmv.x.s a6, v12 ; RV64-NEXT: vslidedown.vi v12, v8, 7 ; RV64-NEXT: vmv.x.s a7, v12 -; RV64-NEXT: vsetivli zero, 1, e32, m4, ta, ma -; RV64-NEXT: vslidedown.vi v12, v8, 8 -; RV64-NEXT: vmv.x.s t0, v12 -; RV64-NEXT: vslidedown.vi v12, v8, 9 -; RV64-NEXT: vmv.x.s t1, v12 -; RV64-NEXT: vslidedown.vi v12, v8, 10 -; RV64-NEXT: vmv.x.s t2, v12 -; RV64-NEXT: vslidedown.vi v12, v8, 11 -; RV64-NEXT: vmv.x.s t3, v12 -; RV64-NEXT: vslidedown.vi v12, v8, 12 -; RV64-NEXT: vmv.x.s t4, v12 -; RV64-NEXT: vslidedown.vi v12, v8, 13 -; RV64-NEXT: vmv.x.s t5, v12 -; RV64-NEXT: vslidedown.vi v12, v8, 14 -; RV64-NEXT: vmv.x.s t6, v12 -; RV64-NEXT: vslidedown.vi v8, v8, 15 -; RV64-NEXT: vmv.x.s s0, v8 +; RV64-NEXT: mv t0, sp +; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV64-NEXT: vse32.v v8, (t0) +; RV64-NEXT: lw t0, 32(sp) +; RV64-NEXT: lw t1, 36(sp) +; RV64-NEXT: lw t2, 40(sp) +; RV64-NEXT: lw t3, 44(sp) +; RV64-NEXT: lw t4, 48(sp) +; RV64-NEXT: lw t5, 52(sp) +; RV64-NEXT: lw t6, 56(sp) +; RV64-NEXT: lw s2, 60(sp) ; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: add a2, a2, a3 ; RV64-NEXT: add a0, a0, a2 @@ -718,16 +723,19 @@ ; RV64-NEXT: add a4, a4, a6 ; RV64-NEXT: add a0, a0, a4 ; RV64-NEXT: add a7, a7, t0 -; RV64-NEXT: add a7, a7, t1 -; RV64-NEXT: add a7, a7, t2 ; RV64-NEXT: add a0, a0, a7 -; RV64-NEXT: add t3, t3, t4 -; RV64-NEXT: add t3, t3, t5 -; RV64-NEXT: add t3, t3, t6 -; RV64-NEXT: add t3, t3, s0 -; RV64-NEXT: addw a0, a0, t3 -; RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: add t1, t1, t2 +; RV64-NEXT: add t1, t1, t3 +; RV64-NEXT: add a0, a0, t1 +; RV64-NEXT: add t4, t4, t5 +; RV64-NEXT: add t4, t4, t6 +; RV64-NEXT: add t4, t4, s2 +; RV64-NEXT: addw 
a0, a0, t4 +; RV64-NEXT: addi sp, s0, -128 +; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s2, 104(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 128 ; RV64-NEXT: ret %e0 = extractelement <16 x i32> %v, i32 0 %e1 = extractelement <16 x i32> %v, i32 1 @@ -929,6 +937,15 @@ ; ; RV64-LABEL: explode_8xi64: ; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -128 +; RV64-NEXT: .cfi_def_cfa_offset 128 +; RV64-NEXT: sd ra, 120(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 112(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset ra, -8 +; RV64-NEXT: .cfi_offset s0, -16 +; RV64-NEXT: addi s0, sp, 128 +; RV64-NEXT: .cfi_def_cfa s0, 0 +; RV64-NEXT: andi sp, sp, -64 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: vslidedown.vi v12, v8, 1 @@ -938,22 +955,24 @@ ; RV64-NEXT: vmv.x.s a2, v12 ; RV64-NEXT: vslidedown.vi v12, v8, 3 ; RV64-NEXT: vmv.x.s a3, v12 -; RV64-NEXT: vsetivli zero, 1, e64, m4, ta, ma -; RV64-NEXT: vslidedown.vi v12, v8, 4 -; RV64-NEXT: vmv.x.s a4, v12 -; RV64-NEXT: vslidedown.vi v12, v8, 5 -; RV64-NEXT: vmv.x.s a5, v12 -; RV64-NEXT: vslidedown.vi v12, v8, 6 -; RV64-NEXT: vmv.x.s a6, v12 -; RV64-NEXT: vslidedown.vi v8, v8, 7 -; RV64-NEXT: vmv.x.s a7, v8 +; RV64-NEXT: mv a4, sp +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-NEXT: vse64.v v8, (a4) +; RV64-NEXT: ld a4, 32(sp) +; RV64-NEXT: ld a5, 40(sp) +; RV64-NEXT: ld a6, 48(sp) +; RV64-NEXT: ld a7, 56(sp) ; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: add a2, a2, a3 ; RV64-NEXT: add a0, a0, a2 -; RV64-NEXT: add a4, a4, a5 -; RV64-NEXT: add a4, a4, a6 ; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: add a5, a5, a6 +; RV64-NEXT: add a0, a0, a5 ; RV64-NEXT: add a0, a0, a7 +; RV64-NEXT: addi sp, s0, -128 +; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 128 ; RV64-NEXT: ret %e0 = extractelement <8 x i64> %v, i32 0 %e1 = extractelement <8 x i64> %v, i32 1 @@ -1149,10 +1168,17 @@ ; ; RV64-LABEL: explode_16xi64: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: .cfi_def_cfa_offset 16 -; RV64-NEXT: sd s0, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: .cfi_offset s0, -8 +; RV64-NEXT: addi sp, sp, -256 +; RV64-NEXT: .cfi_def_cfa_offset 256 +; RV64-NEXT: sd ra, 248(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 240(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s2, 232(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset ra, -8 +; RV64-NEXT: .cfi_offset s0, -16 +; RV64-NEXT: .cfi_offset s2, -24 +; RV64-NEXT: addi s0, sp, 256 +; RV64-NEXT: .cfi_def_cfa s0, 0 +; RV64-NEXT: andi sp, sp, -128 ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: vslidedown.vi v16, v8, 1 @@ -1162,49 +1188,41 @@ ; RV64-NEXT: vmv.x.s a2, v16 ; RV64-NEXT: vslidedown.vi v16, v8, 3 ; RV64-NEXT: vmv.x.s a3, v16 -; RV64-NEXT: vsetivli zero, 1, e64, m4, ta, ma -; RV64-NEXT: vslidedown.vi v16, v8, 4 -; RV64-NEXT: vmv.x.s a4, v16 -; RV64-NEXT: vslidedown.vi v16, v8, 5 -; RV64-NEXT: vmv.x.s a5, v16 -; RV64-NEXT: vslidedown.vi v16, v8, 6 -; RV64-NEXT: vmv.x.s a6, v16 -; RV64-NEXT: vslidedown.vi v16, v8, 7 -; RV64-NEXT: vmv.x.s a7, v16 -; RV64-NEXT: vsetivli zero, 1, e64, m8, ta, ma -; RV64-NEXT: vslidedown.vi v16, v8, 8 -; RV64-NEXT: vmv.x.s t0, v16 -; RV64-NEXT: vslidedown.vi v16, v8, 9 -; RV64-NEXT: vmv.x.s t1, v16 -; RV64-NEXT: vslidedown.vi v16, v8, 10 -; RV64-NEXT: vmv.x.s t2, v16 -; RV64-NEXT: vslidedown.vi v16, v8, 11 -; RV64-NEXT: vmv.x.s t3, v16 -; 
RV64-NEXT: vslidedown.vi v16, v8, 12 -; RV64-NEXT: vmv.x.s t4, v16 -; RV64-NEXT: vslidedown.vi v16, v8, 13 -; RV64-NEXT: vmv.x.s t5, v16 -; RV64-NEXT: vslidedown.vi v16, v8, 14 -; RV64-NEXT: vmv.x.s t6, v16 -; RV64-NEXT: vslidedown.vi v8, v8, 15 -; RV64-NEXT: vmv.x.s s0, v8 +; RV64-NEXT: mv a4, sp +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vse64.v v8, (a4) +; RV64-NEXT: ld a4, 32(sp) +; RV64-NEXT: ld a5, 40(sp) +; RV64-NEXT: ld a6, 48(sp) +; RV64-NEXT: ld a7, 56(sp) +; RV64-NEXT: ld t0, 64(sp) +; RV64-NEXT: ld t1, 72(sp) +; RV64-NEXT: ld t2, 80(sp) +; RV64-NEXT: ld t3, 88(sp) +; RV64-NEXT: ld t4, 96(sp) +; RV64-NEXT: ld t5, 104(sp) +; RV64-NEXT: ld t6, 112(sp) +; RV64-NEXT: ld s2, 120(sp) ; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: add a2, a2, a3 ; RV64-NEXT: add a0, a0, a2 -; RV64-NEXT: add a4, a4, a5 -; RV64-NEXT: add a4, a4, a6 ; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: add a5, a5, a6 +; RV64-NEXT: add a0, a0, a5 ; RV64-NEXT: add a7, a7, t0 ; RV64-NEXT: add a7, a7, t1 -; RV64-NEXT: add a7, a7, t2 ; RV64-NEXT: add a0, a0, a7 -; RV64-NEXT: add t3, t3, t4 -; RV64-NEXT: add t3, t3, t5 -; RV64-NEXT: add t3, t3, t6 -; RV64-NEXT: add t3, t3, s0 -; RV64-NEXT: add a0, a0, t3 -; RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: add t2, t2, t3 +; RV64-NEXT: add t2, t2, t4 +; RV64-NEXT: add t2, t2, t5 +; RV64-NEXT: add a0, a0, t2 +; RV64-NEXT: add t6, t6, s2 +; RV64-NEXT: add a0, a0, t6 +; RV64-NEXT: addi sp, s0, -256 +; RV64-NEXT: ld ra, 248(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 240(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s2, 232(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 256 ; RV64-NEXT: ret %e0 = extractelement <16 x i64> %v, i32 0 %e1 = extractelement <16 x i64> %v, i32 1