diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -1631,7 +1631,8 @@ unsigned ViaEltSizeInBits = ViaEltVT.getSizeInBits(); // For scalable vectors, try to use a SPLAT_VECTOR_PARTS node. - if (VT.isScalableVector()) { + if (VT.isScalableVector() || + TLI->isOperationLegal(ISD::SPLAT_VECTOR, VT)) { assert(EltVT.getSizeInBits() % ViaEltSizeInBits == 0 && "Can only handle an even split!"); unsigned Parts = EltVT.getSizeInBits() / ViaEltSizeInBits; diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll @@ -1385,72 +1385,82 @@ define <2 x i64> @vp_bitreverse_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_bitreverse_v2i64: ; RV32: # %bb.0: -; RV32-NEXT: vmv1r.v v9, v0 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw zero, 4(sp) +; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vsrl.vx v10, v8, a1, v0.t -; RV32-NEXT: li a2, 40 -; RV32-NEXT: vsrl.vx v11, v8, a2, v0.t -; RV32-NEXT: lui a3, 16 -; RV32-NEXT: addi a3, a3, -256 -; RV32-NEXT: vand.vx v11, v11, a3, v0.t -; RV32-NEXT: vor.vv v10, v11, v10, v0.t -; RV32-NEXT: vsrl.vi v11, v8, 8, v0.t -; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; RV32-NEXT: vmv.v.i v0, 5 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.i v12, 0 -; RV32-NEXT: lui a4, 1044480 -; RV32-NEXT: vmerge.vxm v12, v12, a4, v0 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vmv1r.v v0, v9 -; RV32-NEXT: vand.vv v11, v11, v12, v0.t -; RV32-NEXT: vsrl.vi v13, v8, 24, v0.t +; RV32-NEXT: vsll.vx v9, v8, a1, v0.t +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, -256 +; RV32-NEXT: vand.vx v10, v8, a2, v0.t +; RV32-NEXT: li a3, 40 +; RV32-NEXT: vsll.vx v10, v10, a3, v0.t +; RV32-NEXT: vor.vv v9, v9, v10, v0.t ; RV32-NEXT: lui a4, 4080 -; RV32-NEXT: vand.vx v13, v13, a4, v0.t -; RV32-NEXT: vor.vv v11, v11, v13, v0.t -; RV32-NEXT: vor.vv v10, v11, v10, v0.t -; RV32-NEXT: vsll.vx v11, v8, a1, v0.t -; RV32-NEXT: vand.vx v13, v8, a3, v0.t -; RV32-NEXT: vsll.vx v13, v13, a2, v0.t -; RV32-NEXT: vor.vv v11, v11, v13, v0.t -; RV32-NEXT: vand.vx v13, v8, a4, v0.t -; RV32-NEXT: vsll.vi v13, v13, 24, v0.t -; RV32-NEXT: vand.vv v8, v8, v12, v0.t -; RV32-NEXT: vsll.vi v8, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v13, v8, v0.t -; RV32-NEXT: vor.vv v8, v11, v8, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v11, a1 +; RV32-NEXT: vand.vx v10, v8, a4, v0.t +; RV32-NEXT: vsll.vi v10, v10, 24, v0.t +; RV32-NEXT: mv a5, sp +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v11, (a5), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; 
RV32-NEXT: vand.vv v10, v10, v11, v0.t +; RV32-NEXT: vand.vv v12, v8, v11, v0.t +; RV32-NEXT: vsll.vi v12, v12, 8, v0.t +; RV32-NEXT: vor.vv v10, v10, v12, v0.t +; RV32-NEXT: vor.vv v9, v9, v10, v0.t +; RV32-NEXT: vsrl.vx v10, v8, a1, v0.t +; RV32-NEXT: vsrl.vx v12, v8, a3, v0.t +; RV32-NEXT: vand.vx v12, v12, a2, v0.t +; RV32-NEXT: vor.vv v10, v12, v10, v0.t +; RV32-NEXT: vsrl.vi v12, v8, 24, v0.t +; RV32-NEXT: vand.vx v12, v12, a4, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t ; RV32-NEXT: vand.vv v8, v8, v11, v0.t +; RV32-NEXT: vor.vv v8, v8, v12, v0.t +; RV32-NEXT: vor.vv v8, v8, v10, v0.t +; RV32-NEXT: vor.vv v8, v9, v8, v0.t +; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v10, (a1), zero +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vand.vv v9, v9, v10, v0.t +; RV32-NEXT: vand.vv v8, v8, v10, v0.t ; RV32-NEXT: vsll.vi v8, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v10, v8, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 2, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v11, a1 +; RV32-NEXT: vor.vv v8, v9, v8, v0.t +; RV32-NEXT: vsrl.vi v9, v8, 2, v0.t +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v10, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v10, v10, v11, v0.t -; RV32-NEXT: vand.vv v8, v8, v11, v0.t +; RV32-NEXT: vand.vv v9, v9, v10, v0.t +; RV32-NEXT: vand.vv v8, v8, v10, v0.t ; RV32-NEXT: vsll.vi v8, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v10, v8, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v11, a1 +; RV32-NEXT: vor.vv v8, v9, v8, v0.t +; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v10, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v10, v10, v11, v0.t -; RV32-NEXT: vand.vv v8, v8, v11, v0.t +; RV32-NEXT: vand.vv v9, v9, v10, v0.t +; RV32-NEXT: vand.vv v8, v8, v10, v0.t ; RV32-NEXT: vsll.vi v8, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v10, v8, v0.t +; RV32-NEXT: vor.vv v8, v9, v8, v0.t +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_bitreverse_v2i64: @@ -1519,6 +1529,23 @@ define <2 x i64> @vp_bitreverse_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_bitreverse_v2i64_unmasked: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw zero, 4(sp) +; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vsll.vx v9, v8, a1 @@ -1531,12 +1558,9 @@ ; RV32-NEXT: lui a4, 4080 ; RV32-NEXT: vand.vx v10, v8, a4 ; RV32-NEXT: vsll.vi v10, v10, 24 -; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; RV32-NEXT: vmv.v.i v0, 5 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.i v11, 0 -; RV32-NEXT: lui a5, 1044480 -; RV32-NEXT: vmerge.vxm v11, v11, a5, v0 +; RV32-NEXT: mv 
a5, sp +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v11, (a5), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vand.vv v12, v8, v11 ; RV32-NEXT: vsll.vi v12, v12, 8 @@ -1546,43 +1570,41 @@ ; RV32-NEXT: vsrl.vx v12, v8, a3 ; RV32-NEXT: vand.vx v12, v12, a2 ; RV32-NEXT: vor.vv v10, v12, v10 -; RV32-NEXT: vsrl.vi v12, v8, 8 -; RV32-NEXT: vand.vv v11, v12, v11 -; RV32-NEXT: vsrl.vi v8, v8, 24 -; RV32-NEXT: vand.vx v8, v8, a4 -; RV32-NEXT: vor.vv v8, v11, v8 +; RV32-NEXT: vsrl.vi v12, v8, 24 +; RV32-NEXT: vand.vx v12, v12, a4 +; RV32-NEXT: vsrl.vi v8, v8, 8 +; RV32-NEXT: vand.vv v8, v8, v11 +; RV32-NEXT: vor.vv v8, v8, v12 ; RV32-NEXT: vor.vv v8, v8, v10 ; RV32-NEXT: vor.vv v8, v9, v8 ; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v10, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vand.vv v9, v9, v10 ; RV32-NEXT: vand.vv v8, v8, v10 ; RV32-NEXT: vsll.vi v8, v8, 4 ; RV32-NEXT: vor.vv v8, v9, v8 ; RV32-NEXT: vsrl.vi v9, v8, 2 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v10, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vand.vv v9, v9, v10 ; RV32-NEXT: vand.vv v8, v8, v10 ; RV32-NEXT: vsll.vi v8, v8, 2 ; RV32-NEXT: vor.vv v8, v9, v8 ; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v10, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vand.vv v9, v9, v10 ; RV32-NEXT: vand.vv v8, v8, v10 ; RV32-NEXT: vadd.vv v8, v8, v8 ; RV32-NEXT: vor.vv v8, v9, v8 +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_bitreverse_v2i64_unmasked: @@ -1655,73 +1677,82 @@ define <4 x i64> @vp_bitreverse_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_bitreverse_v4i64: ; RV32: # %bb.0: -; RV32-NEXT: vmv1r.v v10, v0 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw zero, 4(sp) +; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vsrl.vx v12, v8, a1, v0.t -; RV32-NEXT: li a2, 40 -; RV32-NEXT: vsrl.vx v14, v8, a2, v0.t -; RV32-NEXT: lui a3, 16 -; RV32-NEXT: addi a3, a3, -256 -; RV32-NEXT: vand.vx v14, v14, a3, v0.t -; RV32-NEXT: vor.vv v12, v14, v12, v0.t -; RV32-NEXT: vsrl.vi v14, v8, 8, v0.t -; RV32-NEXT: li a4, 85 -; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; RV32-NEXT: vmv.v.x v0, a4 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.i v16, 0 -; RV32-NEXT: lui a4, 1044480 -; RV32-NEXT: vmerge.vxm v16, v16, a4, v0 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma 
-; RV32-NEXT: vmv1r.v v0, v10 -; RV32-NEXT: vand.vv v14, v14, v16, v0.t -; RV32-NEXT: vsrl.vi v18, v8, 24, v0.t +; RV32-NEXT: vsll.vx v10, v8, a1, v0.t +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, -256 +; RV32-NEXT: vand.vx v12, v8, a2, v0.t +; RV32-NEXT: li a3, 40 +; RV32-NEXT: vsll.vx v12, v12, a3, v0.t +; RV32-NEXT: vor.vv v10, v10, v12, v0.t ; RV32-NEXT: lui a4, 4080 -; RV32-NEXT: vand.vx v18, v18, a4, v0.t -; RV32-NEXT: vor.vv v14, v14, v18, v0.t -; RV32-NEXT: vor.vv v12, v14, v12, v0.t -; RV32-NEXT: vsll.vx v14, v8, a1, v0.t -; RV32-NEXT: vand.vx v18, v8, a3, v0.t -; RV32-NEXT: vsll.vx v18, v18, a2, v0.t -; RV32-NEXT: vor.vv v14, v14, v18, v0.t -; RV32-NEXT: vand.vx v18, v8, a4, v0.t -; RV32-NEXT: vsll.vi v18, v18, 24, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vsll.vi v8, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v18, v8, v0.t -; RV32-NEXT: vor.vv v8, v14, v8, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v14, a1 +; RV32-NEXT: vand.vx v12, v8, a4, v0.t +; RV32-NEXT: vsll.vi v12, v12, 24, v0.t +; RV32-NEXT: mv a5, sp +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v14, (a5), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v12, v12, v14, v0.t +; RV32-NEXT: vand.vv v16, v8, v14, v0.t +; RV32-NEXT: vsll.vi v16, v16, 8, v0.t +; RV32-NEXT: vor.vv v12, v12, v16, v0.t +; RV32-NEXT: vor.vv v10, v10, v12, v0.t +; RV32-NEXT: vsrl.vx v12, v8, a1, v0.t +; RV32-NEXT: vsrl.vx v16, v8, a3, v0.t +; RV32-NEXT: vand.vx v16, v16, a2, v0.t +; RV32-NEXT: vor.vv v12, v16, v12, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 24, v0.t +; RV32-NEXT: vand.vx v16, v16, a4, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t ; RV32-NEXT: vand.vv v8, v8, v14, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vor.vv v8, v8, v12, v0.t +; RV32-NEXT: vor.vv v8, v10, v8, v0.t +; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v12, (a1), zero +; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vand.vv v10, v10, v12, v0.t +; RV32-NEXT: vand.vv v8, v8, v12, v0.t ; RV32-NEXT: vsll.vi v8, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v12, v8, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 2, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v14, a1 +; RV32-NEXT: vor.vv v8, v10, v8, v0.t +; RV32-NEXT: vsrl.vi v10, v8, 2, v0.t +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v12, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v12, v12, v14, v0.t -; RV32-NEXT: vand.vv v8, v8, v14, v0.t +; RV32-NEXT: vand.vv v10, v10, v12, v0.t +; RV32-NEXT: vand.vv v8, v8, v12, v0.t ; RV32-NEXT: vsll.vi v8, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v12, v8, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v14, a1 +; RV32-NEXT: vor.vv v8, v10, v8, v0.t +; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v12, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v12, v12, v14, v0.t -; RV32-NEXT: vand.vv v8, v8, v14, v0.t +; RV32-NEXT: vand.vv v10, v10, 
v12, v0.t +; RV32-NEXT: vand.vv v8, v8, v12, v0.t ; RV32-NEXT: vsll.vi v8, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v12, v8, v0.t +; RV32-NEXT: vor.vv v8, v10, v8, v0.t +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_bitreverse_v4i64: @@ -1790,6 +1821,23 @@ define <4 x i64> @vp_bitreverse_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_bitreverse_v4i64_unmasked: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw zero, 4(sp) +; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vsll.vx v10, v8, a1 @@ -1802,13 +1850,9 @@ ; RV32-NEXT: lui a4, 4080 ; RV32-NEXT: vand.vx v12, v8, a4 ; RV32-NEXT: vsll.vi v12, v12, 24 -; RV32-NEXT: li a5, 85 -; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; RV32-NEXT: vmv.v.x v0, a5 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.i v14, 0 -; RV32-NEXT: lui a5, 1044480 -; RV32-NEXT: vmerge.vxm v14, v14, a5, v0 +; RV32-NEXT: mv a5, sp +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v14, (a5), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vand.vv v16, v8, v14 ; RV32-NEXT: vsll.vi v16, v16, 8 @@ -1818,43 +1862,41 @@ ; RV32-NEXT: vsrl.vx v16, v8, a3 ; RV32-NEXT: vand.vx v16, v16, a2 ; RV32-NEXT: vor.vv v12, v16, v12 -; RV32-NEXT: vsrl.vi v16, v8, 8 -; RV32-NEXT: vand.vv v14, v16, v14 -; RV32-NEXT: vsrl.vi v8, v8, 24 -; RV32-NEXT: vand.vx v8, v8, a4 -; RV32-NEXT: vor.vv v8, v14, v8 +; RV32-NEXT: vsrl.vi v16, v8, 24 +; RV32-NEXT: vand.vx v16, v16, a4 +; RV32-NEXT: vsrl.vi v8, v8, 8 +; RV32-NEXT: vand.vv v8, v8, v14 +; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: vor.vv v8, v8, v12 ; RV32-NEXT: vor.vv v8, v10, v8 ; RV32-NEXT: vsrl.vi v10, v8, 4 -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v12, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vand.vv v10, v10, v12 ; RV32-NEXT: vand.vv v8, v8, v12 ; RV32-NEXT: vsll.vi v8, v8, 4 ; RV32-NEXT: vor.vv v8, v10, v8 ; RV32-NEXT: vsrl.vi v10, v8, 2 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v12, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vand.vv v10, v10, v12 ; RV32-NEXT: vand.vv v8, v8, v12 ; RV32-NEXT: vsll.vi v8, v8, 2 ; RV32-NEXT: vor.vv v8, v10, v8 ; RV32-NEXT: vsrl.vi v10, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v12, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vand.vv v10, v10, v12 ; RV32-NEXT: vand.vv v8, v8, v12 ; RV32-NEXT: vadd.vv v8, v8, v8 ; RV32-NEXT: vor.vv v8, v10, v8 +; RV32-NEXT: addi sp, 
sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_bitreverse_v4i64_unmasked: @@ -1927,74 +1969,82 @@ define <8 x i64> @vp_bitreverse_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_bitreverse_v8i64: ; RV32: # %bb.0: -; RV32-NEXT: vmv1r.v v12, v0 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw zero, 4(sp) +; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t -; RV32-NEXT: li a2, 40 -; RV32-NEXT: vsrl.vx v20, v8, a2, v0.t -; RV32-NEXT: lui a3, 16 -; RV32-NEXT: addi a3, a3, -256 -; RV32-NEXT: vand.vx v20, v20, a3, v0.t -; RV32-NEXT: vor.vv v20, v20, v16, v0.t -; RV32-NEXT: vsrl.vi v24, v8, 8, v0.t -; RV32-NEXT: lui a4, 5 -; RV32-NEXT: addi a4, a4, 1365 -; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; RV32-NEXT: vmv.v.x v0, a4 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.i v16, 0 -; RV32-NEXT: lui a4, 1044480 -; RV32-NEXT: vmerge.vxm v16, v16, a4, v0 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vmv1r.v v0, v12 -; RV32-NEXT: vand.vv v24, v24, v16, v0.t -; RV32-NEXT: vsrl.vi v28, v8, 24, v0.t +; RV32-NEXT: vsll.vx v12, v8, a1, v0.t +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, -256 +; RV32-NEXT: vand.vx v16, v8, a2, v0.t +; RV32-NEXT: li a3, 40 +; RV32-NEXT: vsll.vx v16, v16, a3, v0.t +; RV32-NEXT: vor.vv v16, v12, v16, v0.t ; RV32-NEXT: lui a4, 4080 -; RV32-NEXT: vand.vx v28, v28, a4, v0.t -; RV32-NEXT: vor.vv v24, v24, v28, v0.t +; RV32-NEXT: vand.vx v12, v8, a4, v0.t +; RV32-NEXT: vsll.vi v20, v12, 24, v0.t +; RV32-NEXT: mv a5, sp +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a5), zero +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vand.vv v24, v8, v12, v0.t +; RV32-NEXT: vsll.vi v24, v24, 8, v0.t +; RV32-NEXT: vor.vv v20, v20, v24, v0.t +; RV32-NEXT: vor.vv v16, v16, v20, v0.t +; RV32-NEXT: vsrl.vx v20, v8, a1, v0.t +; RV32-NEXT: vsrl.vx v24, v8, a3, v0.t +; RV32-NEXT: vand.vx v24, v24, a2, v0.t ; RV32-NEXT: vor.vv v20, v24, v20, v0.t -; RV32-NEXT: vsll.vx v24, v8, a1, v0.t -; RV32-NEXT: vand.vx v28, v8, a3, v0.t -; RV32-NEXT: vsll.vx v28, v28, a2, v0.t -; RV32-NEXT: vor.vv v24, v24, v28, v0.t -; RV32-NEXT: vand.vx v28, v8, a4, v0.t -; RV32-NEXT: vsll.vi v28, v28, 24, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vsll.vi v8, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v28, v8, v0.t -; RV32-NEXT: vor.vv v8, v24, v8, v0.t +; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t +; RV32-NEXT: vand.vx v24, v24, a4, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV32-NEXT: vand.vv v8, v8, v12, v0.t +; RV32-NEXT: vor.vv v8, v8, v24, v0.t ; RV32-NEXT: vor.vv v8, v8, v20, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v20, a1 +; RV32-NEXT: vor.vv v8, v16, v8, v0.t +; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv 
v16, v16, v20, v0.t -; RV32-NEXT: vand.vv v8, v8, v20, v0.t +; RV32-NEXT: vand.vv v12, v12, v16, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: vsll.vi v8, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v16, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v20, a1 +; RV32-NEXT: vor.vv v8, v12, v8, v0.t +; RV32-NEXT: vsrl.vi v12, v8, 2, v0.t +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v16, v16, v20, v0.t -; RV32-NEXT: vand.vv v8, v8, v20, v0.t +; RV32-NEXT: vand.vv v12, v12, v16, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: vsll.vi v8, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v16, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v20, a1 +; RV32-NEXT: vor.vv v8, v12, v8, v0.t +; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v16, v16, v20, v0.t -; RV32-NEXT: vand.vv v8, v8, v20, v0.t +; RV32-NEXT: vand.vv v12, v12, v16, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: vsll.vi v8, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v16, v8, v0.t +; RV32-NEXT: vor.vv v8, v12, v8, v0.t +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_bitreverse_v8i64: @@ -2063,6 +2113,23 @@ define <8 x i64> @vp_bitreverse_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_bitreverse_v8i64_unmasked: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw zero, 4(sp) +; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vsll.vx v12, v8, a1 @@ -2075,14 +2142,9 @@ ; RV32-NEXT: lui a4, 4080 ; RV32-NEXT: vand.vx v16, v8, a4 ; RV32-NEXT: vsll.vi v16, v16, 24 -; RV32-NEXT: lui a5, 5 -; RV32-NEXT: addi a5, a5, 1365 -; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; RV32-NEXT: vmv.v.x v0, a5 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.i v20, 0 -; RV32-NEXT: lui a5, 1044480 -; RV32-NEXT: vmerge.vxm v20, v20, a5, v0 +; RV32-NEXT: mv a5, sp +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v20, (a5), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vand.vv v24, v8, v20 ; RV32-NEXT: vsll.vi v24, v24, 8 @@ -2092,43 +2154,41 @@ ; RV32-NEXT: vsrl.vx v24, v8, a3 ; RV32-NEXT: vand.vx v24, v24, a2 ; RV32-NEXT: vor.vv v16, v24, v16 -; RV32-NEXT: vsrl.vi v24, v8, 8 -; RV32-NEXT: vand.vv v20, v24, v20 -; RV32-NEXT: vsrl.vi v8, v8, 24 -; RV32-NEXT: vand.vx v8, v8, a4 -; RV32-NEXT: vor.vv v8, v20, v8 +; RV32-NEXT: vsrl.vi v24, v8, 24 +; RV32-NEXT: vand.vx v24, v24, a4 +; RV32-NEXT: vsrl.vi v8, v8, 8 +; RV32-NEXT: vand.vv v8, v8, v20 +; RV32-NEXT: vor.vv v8, v8, v24 ; RV32-NEXT: vor.vv v8, v8, v16 ; 
RV32-NEXT: vor.vv v8, v12, v8 ; RV32-NEXT: vsrl.vi v12, v8, 4 -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vand.vv v12, v12, v16 ; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vsll.vi v8, v8, 4 ; RV32-NEXT: vor.vv v8, v12, v8 ; RV32-NEXT: vsrl.vi v12, v8, 2 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vand.vv v12, v12, v16 ; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vsll.vi v8, v8, 2 ; RV32-NEXT: vor.vv v8, v12, v8 ; RV32-NEXT: vsrl.vi v12, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vand.vv v12, v12, v16 ; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vadd.vv v8, v8, v8 ; RV32-NEXT: vor.vv v8, v12, v8 +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_bitreverse_v8i64_unmasked: @@ -2201,250 +2261,129 @@ define <15 x i64> @vp_bitreverse_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_bitreverse_v15i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: addi sp, sp, -48 +; RV32-NEXT: .cfi_def_cfa_offset 48 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: li a2, 24 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb -; RV32-NEXT: vmv1r.v v1, v0 -; RV32-NEXT: li a3, 56 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 24 * vlenb +; RV32-NEXT: sw zero, 20(sp) +; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 44(sp) +; RV32-NEXT: sw a1, 40(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 36(sp) +; RV32-NEXT: sw a1, 32(sp) +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsll.vx v24, v8, a3, v0.t -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: lui a1, 16 -; RV32-NEXT: addi a4, a1, -256 -; RV32-NEXT: vand.vx v24, v8, a4, v0.t -; RV32-NEXT: li a5, 40 -; RV32-NEXT: vsll.vx v24, v24, a5, v0.t -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsll.vx v16, v8, a1, v0.t +; 
RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, -256 +; RV32-NEXT: vand.vx v24, v8, a2, v0.t +; RV32-NEXT: li a3, 40 +; RV32-NEXT: vsll.vx v24, v24, a3, v0.t +; RV32-NEXT: vor.vv v16, v16, v24, v0.t +; RV32-NEXT: addi a4, sp, 48 +; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vand.vx v16, v8, a4, v0.t +; RV32-NEXT: vsll.vi v16, v16, 24, v0.t +; RV32-NEXT: csrr a5, vlenb +; RV32-NEXT: slli a5, a5, 4 +; RV32-NEXT: add a5, sp, a5 +; RV32-NEXT: addi a5, a5, 48 +; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill +; RV32-NEXT: addi a5, sp, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a5), zero +; RV32-NEXT: csrr a5, vlenb +; RV32-NEXT: slli a5, a5, 3 +; RV32-NEXT: add a5, sp, a5 +; RV32-NEXT: addi a5, a5, 48 +; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vand.vv v16, v8, v16, v0.t +; RV32-NEXT: vsll.vi v16, v16, 8, v0.t +; RV32-NEXT: csrr a5, vlenb +; RV32-NEXT: slli a5, a5, 4 +; RV32-NEXT: add a5, sp, a5 +; RV32-NEXT: addi a5, a5, 48 +; RV32-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v16, v24, v16, v0.t +; RV32-NEXT: addi a5, sp, 48 +; RV32-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v16, v24, v16, v0.t +; RV32-NEXT: csrr a5, vlenb +; RV32-NEXT: slli a5, a5, 4 +; RV32-NEXT: add a5, sp, a5 +; RV32-NEXT: addi a5, a5, 48 +; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vx v24, v8, a1, v0.t +; RV32-NEXT: vsrl.vx v16, v8, a3, v0.t +; RV32-NEXT: vand.vx v16, v16, a2, v0.t +; RV32-NEXT: vor.vv v16, v16, v24, v0.t +; RV32-NEXT: addi a1, sp, 48 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t +; RV32-NEXT: vand.vx v24, v24, a4, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: addi a1, a1, 48 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vor.vv v8, v8, v24, v0.t +; RV32-NEXT: addi a1, sp, 48 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v24, v24, v16, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: lui a6, 4080 -; RV32-NEXT: vand.vx v24, v8, a6, v0.t -; RV32-NEXT: vsll.vi v24, v24, 24, v0.t -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: lui a2, 349525 -; RV32-NEXT: addi a2, a2, 1365 -; RV32-NEXT: vmv.v.i v24, 0 -; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV32-NEXT: lui a7, 1044480 -; RV32-NEXT: vmv.v.x v0, a2 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmerge.vxm v24, v24, a7, v0 -; RV32-NEXT: addi a7, sp, 16 -; RV32-NEXT: vs8r.v v24, (a7) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vmv1r.v v0, v1 -; RV32-NEXT: vand.vv v16, v8, v24, v0.t -; RV32-NEXT: vsll.vi v16, v16, 8, v0.t -; RV32-NEXT: csrr a7, vlenb -; RV32-NEXT: slli a7, a7, 3 -; RV32-NEXT: add a7, sp, a7 -; RV32-NEXT: addi a7, a7, 16 -; RV32-NEXT: vl8r.v v24, (a7) 
# Unknown-size Folded Reload -; RV32-NEXT: vor.vv v16, v24, v16, v0.t -; RV32-NEXT: csrr a7, vlenb -; RV32-NEXT: slli a7, a7, 4 -; RV32-NEXT: add a7, sp, a7 -; RV32-NEXT: addi a7, a7, 16 -; RV32-NEXT: vl8r.v v24, (a7) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v24, v24, v16, v0.t -; RV32-NEXT: csrr a7, vlenb -; RV32-NEXT: slli a7, a7, 4 -; RV32-NEXT: add a7, sp, a7 -; RV32-NEXT: addi a7, a7, 16 -; RV32-NEXT: vs8r.v v24, (a7) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vx v24, v8, a3, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vx v24, v8, a5, v0.t -; RV32-NEXT: vand.vx v16, v24, a4, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v24, v16, v24, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v16, v8, 8, v0.t -; RV32-NEXT: addi a3, sp, 16 -; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v16, v24, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 24, v0.t -; RV32-NEXT: vand.vx v8, v8, a6, v0.t -; RV32-NEXT: vor.vv v8, v24, v8, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v8, v24, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v24, v8, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v8, v8, 4, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: lui a3, 61681 -; RV32-NEXT: addi a3, a3, -241 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v8, a3 +; RV32-NEXT: addi a1, a1, 48 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v8, v16, v8, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t +; RV32-NEXT: addi a1, sp, 40 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v24, v8, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v24, v8, v0.t +; RV32-NEXT: vand.vv v16, v16, v24, v0.t +; RV32-NEXT: vand.vv v8, v8, v24, v0.t ; RV32-NEXT: vsll.vi v8, v8, 4, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; 
RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v24, v8, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: lui a3, 209715 -; RV32-NEXT: addi a3, a3, 819 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v8, a3 +; RV32-NEXT: vor.vv v8, v16, v8, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t +; RV32-NEXT: addi a1, sp, 32 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v24, v8, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v24, v8, v0.t +; RV32-NEXT: vand.vv v16, v16, v24, v0.t +; RV32-NEXT: vand.vv v8, v8, v24, v0.t ; RV32-NEXT: vsll.vi v8, v8, 2, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v24, v8, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v8, a2 +; RV32-NEXT: vor.vv v8, v16, v8, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v24, v8, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v24, v8, v0.t +; RV32-NEXT: vand.vv v16, v16, v24, v0.t +; RV32-NEXT: vand.vv v8, v8, v24, v0.t ; RV32-NEXT: vsll.vi v8, v8, 1, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vor.vv v8, v16, v8, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a1, 24 ; 
RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: addi sp, sp, 48 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_bitreverse_v15i64: @@ -2530,80 +2469,87 @@ define <15 x i64> @vp_bitreverse_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_bitreverse_v15i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: addi sp, sp, -48 +; RV32-NEXT: .cfi_def_cfa_offset 48 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb +; RV32-NEXT: sw zero, 20(sp) +; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 44(sp) +; RV32-NEXT: sw a1, 40(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 36(sp) +; RV32-NEXT: sw a1, 32(sp) +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) ; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vx v16, v8, a1 -; RV32-NEXT: li a2, 40 -; RV32-NEXT: vsrl.vx v24, v8, a2 -; RV32-NEXT: lui a3, 16 -; RV32-NEXT: addi a3, a3, -256 -; RV32-NEXT: vand.vx v24, v24, a3 -; RV32-NEXT: vor.vv v16, v24, v16 -; RV32-NEXT: addi a4, sp, 16 +; RV32-NEXT: vsll.vx v16, v8, a1 +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, -256 +; RV32-NEXT: vand.vx v24, v8, a2 +; RV32-NEXT: li a3, 40 +; RV32-NEXT: vsll.vx v24, v24, a3 +; RV32-NEXT: vor.vv v16, v16, v24 +; RV32-NEXT: addi a4, sp, 48 ; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: li a4, 32 -; RV32-NEXT: vsetvli zero, a4, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, 0 -; RV32-NEXT: lui a5, 349525 -; RV32-NEXT: addi a5, a5, 1365 -; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV32-NEXT: vmv.v.x v0, a5 -; RV32-NEXT: lui a6, 1044480 -; RV32-NEXT: vsetvli zero, a4, e32, m8, ta, ma -; RV32-NEXT: vmerge.vxm v24, v24, a6, v0 +; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vand.vx v16, v8, a4 +; RV32-NEXT: vsll.vi v0, v16, 24 +; RV32-NEXT: addi a5, sp, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a5), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v0, v8, 8 -; RV32-NEXT: vand.vv v0, v0, v24 -; RV32-NEXT: lui a6, 4080 -; RV32-NEXT: vsrl.vi v16, v8, 24 -; RV32-NEXT: vand.vx v16, v16, a6 -; RV32-NEXT: vor.vv v16, v0, v16 -; RV32-NEXT: addi a7, sp, 16 -; RV32-NEXT: vl8r.v v0, (a7) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v16, v16, v0 -; RV32-NEXT: vs8r.v v16, (a7) # Unknown-size Folded Spill -; RV32-NEXT: vand.vx v0, v8, a3 -; RV32-NEXT: vsll.vx v0, v0, a2 -; RV32-NEXT: vsll.vx v16, v8, a1 -; RV32-NEXT: vor.vv v16, v16, v0 -; RV32-NEXT: vand.vv v24, v8, v24 +; RV32-NEXT: vand.vv v24, v8, v16 ; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vand.vx v8, v8, a6 -; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: addi a5, sp, 48 +; RV32-NEXT: vl8r.v v0, (a5) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vx v0, v8, a3 +; RV32-NEXT: vand.vx v0, v0, a2 +; RV32-NEXT: vsrl.vx v24, v8, a1 +; 
RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: vsrl.vi v0, v8, 8 +; RV32-NEXT: vand.vv v16, v0, v16 +; RV32-NEXT: vsrl.vi v8, v8, 24 +; RV32-NEXT: vand.vx v8, v8, a4 ; RV32-NEXT: vor.vv v8, v16, v8 -; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: addi a1, sp, 48 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetvli zero, a4, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a1 +; RV32-NEXT: addi a1, sp, 40 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vand.vv v8, v8, v24 ; RV32-NEXT: vsll.vi v8, v8, 4 ; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: vsrl.vi v16, v8, 2 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetvli zero, a4, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a1 +; RV32-NEXT: addi a1, sp, 32 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vand.vv v8, v8, v24 ; RV32-NEXT: vsll.vi v8, v8, 2 ; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: vsetvli zero, a4, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a5 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vand.vv v8, v8, v24 @@ -2612,7 +2558,7 @@ ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: addi sp, sp, 48 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_bitreverse_v15i64_unmasked: @@ -2685,250 +2631,129 @@ define <16 x i64> @vp_bitreverse_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_bitreverse_v16i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: addi sp, sp, -48 +; RV32-NEXT: .cfi_def_cfa_offset 48 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: li a2, 24 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb -; RV32-NEXT: vmv1r.v v1, v0 -; RV32-NEXT: li a3, 56 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 24 * vlenb +; RV32-NEXT: sw zero, 20(sp) +; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 44(sp) +; RV32-NEXT: sw a1, 40(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 36(sp) +; RV32-NEXT: sw a1, 32(sp) +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsll.vx v24, v8, a3, v0.t -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: lui a1, 16 -; RV32-NEXT: addi a4, a1, -256 -; RV32-NEXT: vand.vx v24, v8, a4, v0.t -; RV32-NEXT: li a5, 40 -; RV32-NEXT: vsll.vx v24, v24, a5, v0.t -; 
RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsll.vx v16, v8, a1, v0.t +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, -256 +; RV32-NEXT: vand.vx v24, v8, a2, v0.t +; RV32-NEXT: li a3, 40 +; RV32-NEXT: vsll.vx v24, v24, a3, v0.t +; RV32-NEXT: vor.vv v16, v16, v24, v0.t +; RV32-NEXT: addi a4, sp, 48 +; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vand.vx v16, v8, a4, v0.t +; RV32-NEXT: vsll.vi v16, v16, 24, v0.t +; RV32-NEXT: csrr a5, vlenb +; RV32-NEXT: slli a5, a5, 4 +; RV32-NEXT: add a5, sp, a5 +; RV32-NEXT: addi a5, a5, 48 +; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill +; RV32-NEXT: addi a5, sp, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a5), zero +; RV32-NEXT: csrr a5, vlenb +; RV32-NEXT: slli a5, a5, 3 +; RV32-NEXT: add a5, sp, a5 +; RV32-NEXT: addi a5, a5, 48 +; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vand.vv v16, v8, v16, v0.t +; RV32-NEXT: vsll.vi v16, v16, 8, v0.t +; RV32-NEXT: csrr a5, vlenb +; RV32-NEXT: slli a5, a5, 4 +; RV32-NEXT: add a5, sp, a5 +; RV32-NEXT: addi a5, a5, 48 +; RV32-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v16, v24, v16, v0.t +; RV32-NEXT: addi a5, sp, 48 +; RV32-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v16, v24, v16, v0.t +; RV32-NEXT: csrr a5, vlenb +; RV32-NEXT: slli a5, a5, 4 +; RV32-NEXT: add a5, sp, a5 +; RV32-NEXT: addi a5, a5, 48 +; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vx v24, v8, a1, v0.t +; RV32-NEXT: vsrl.vx v16, v8, a3, v0.t +; RV32-NEXT: vand.vx v16, v16, a2, v0.t +; RV32-NEXT: vor.vv v16, v16, v24, v0.t +; RV32-NEXT: addi a1, sp, 48 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t +; RV32-NEXT: vand.vx v24, v24, a4, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: addi a1, a1, 48 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v24, v24, v16, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vor.vv v8, v8, v24, v0.t +; RV32-NEXT: addi a1, sp, 48 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: lui a6, 4080 -; RV32-NEXT: vand.vx v24, v8, a6, v0.t -; RV32-NEXT: vsll.vi v24, v24, 24, v0.t -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: lui a2, 349525 -; RV32-NEXT: addi a2, a2, 1365 -; RV32-NEXT: vmv.v.i v24, 0 -; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV32-NEXT: lui a7, 1044480 -; RV32-NEXT: vmv.v.x v0, a2 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmerge.vxm v24, v24, a7, v0 
-; RV32-NEXT: addi a7, sp, 16 -; RV32-NEXT: vs8r.v v24, (a7) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vmv1r.v v0, v1 -; RV32-NEXT: vand.vv v16, v8, v24, v0.t -; RV32-NEXT: vsll.vi v16, v16, 8, v0.t -; RV32-NEXT: csrr a7, vlenb -; RV32-NEXT: slli a7, a7, 3 -; RV32-NEXT: add a7, sp, a7 -; RV32-NEXT: addi a7, a7, 16 -; RV32-NEXT: vl8r.v v24, (a7) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v16, v24, v16, v0.t -; RV32-NEXT: csrr a7, vlenb -; RV32-NEXT: slli a7, a7, 4 -; RV32-NEXT: add a7, sp, a7 -; RV32-NEXT: addi a7, a7, 16 -; RV32-NEXT: vl8r.v v24, (a7) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v24, v24, v16, v0.t -; RV32-NEXT: csrr a7, vlenb -; RV32-NEXT: slli a7, a7, 4 -; RV32-NEXT: add a7, sp, a7 -; RV32-NEXT: addi a7, a7, 16 -; RV32-NEXT: vs8r.v v24, (a7) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vx v24, v8, a3, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vx v24, v8, a5, v0.t -; RV32-NEXT: vand.vx v16, v24, a4, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v24, v16, v24, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v16, v8, 8, v0.t -; RV32-NEXT: addi a3, sp, 16 -; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v16, v24, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 24, v0.t -; RV32-NEXT: vand.vx v8, v8, a6, v0.t -; RV32-NEXT: vor.vv v8, v24, v8, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v8, v24, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v24, v8, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v8, v8, 4, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: lui a3, 61681 -; RV32-NEXT: addi a3, a3, -241 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v8, a3 +; RV32-NEXT: addi a1, a1, 48 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v8, v16, v8, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t +; RV32-NEXT: addi a1, sp, 40 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v24, v8, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; RV32-NEXT: csrr a3, vlenb -; 
RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v24, v8, v0.t +; RV32-NEXT: vand.vv v16, v16, v24, v0.t +; RV32-NEXT: vand.vv v8, v8, v24, v0.t ; RV32-NEXT: vsll.vi v8, v8, 4, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v24, v8, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: lui a3, 209715 -; RV32-NEXT: addi a3, a3, 819 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v8, a3 +; RV32-NEXT: vor.vv v8, v16, v8, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t +; RV32-NEXT: addi a1, sp, 32 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v24, v8, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v24, v8, v0.t +; RV32-NEXT: vand.vv v16, v16, v24, v0.t +; RV32-NEXT: vand.vv v8, v8, v24, v0.t ; RV32-NEXT: vsll.vi v8, v8, 2, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vl8r.v v24, (a3) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v24, v8, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v8, a2 +; RV32-NEXT: vor.vv v8, v16, v8, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v24, v8, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v24, v8, v0.t 
+; RV32-NEXT: vand.vv v16, v16, v24, v0.t +; RV32-NEXT: vand.vv v8, v8, v24, v0.t ; RV32-NEXT: vsll.vi v8, v8, 1, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vor.vv v8, v16, v8, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a1, 24 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: addi sp, sp, 48 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_bitreverse_v16i64: @@ -3014,80 +2839,87 @@ define <16 x i64> @vp_bitreverse_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_bitreverse_v16i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: addi sp, sp, -48 +; RV32-NEXT: .cfi_def_cfa_offset 48 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb +; RV32-NEXT: sw zero, 20(sp) +; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 44(sp) +; RV32-NEXT: sw a1, 40(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 36(sp) +; RV32-NEXT: sw a1, 32(sp) +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) ; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vx v16, v8, a1 -; RV32-NEXT: li a2, 40 -; RV32-NEXT: vsrl.vx v24, v8, a2 -; RV32-NEXT: lui a3, 16 -; RV32-NEXT: addi a3, a3, -256 -; RV32-NEXT: vand.vx v24, v24, a3 -; RV32-NEXT: vor.vv v16, v24, v16 -; RV32-NEXT: addi a4, sp, 16 +; RV32-NEXT: vsll.vx v16, v8, a1 +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, -256 +; RV32-NEXT: vand.vx v24, v8, a2 +; RV32-NEXT: li a3, 40 +; RV32-NEXT: vsll.vx v24, v24, a3 +; RV32-NEXT: vor.vv v16, v16, v24 +; RV32-NEXT: addi a4, sp, 48 ; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: li a4, 32 -; RV32-NEXT: vsetvli zero, a4, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, 0 -; RV32-NEXT: lui a5, 349525 -; RV32-NEXT: addi a5, a5, 1365 -; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV32-NEXT: vmv.v.x v0, a5 -; RV32-NEXT: lui a6, 1044480 -; RV32-NEXT: vsetvli zero, a4, e32, m8, ta, ma -; RV32-NEXT: vmerge.vxm v24, v24, a6, v0 +; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vand.vx v16, v8, a4 +; RV32-NEXT: vsll.vi v0, v16, 24 +; RV32-NEXT: addi a5, sp, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a5), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v0, v8, 8 -; RV32-NEXT: vand.vv v0, v0, v24 -; RV32-NEXT: lui a6, 4080 -; RV32-NEXT: vsrl.vi v16, v8, 24 -; RV32-NEXT: vand.vx v16, v16, a6 -; RV32-NEXT: vor.vv v16, v0, v16 -; RV32-NEXT: addi a7, sp, 16 -; RV32-NEXT: vl8r.v v0, (a7) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v16, v16, v0 -; RV32-NEXT: vs8r.v v16, (a7) # Unknown-size Folded Spill -; RV32-NEXT: vand.vx v0, v8, a3 -; RV32-NEXT: vsll.vx v0, v0, a2 -; RV32-NEXT: vsll.vx v16, v8, a1 -; RV32-NEXT: vor.vv v16, v16, v0 -; RV32-NEXT: vand.vv v24, v8, v24 +; RV32-NEXT: vand.vv v24, v8, v16 ; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vand.vx v8, v8, a6 -; 
RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: addi a5, sp, 48 +; RV32-NEXT: vl8r.v v0, (a5) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vx v0, v8, a3 +; RV32-NEXT: vand.vx v0, v0, a2 +; RV32-NEXT: vsrl.vx v24, v8, a1 +; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: vsrl.vi v0, v8, 8 +; RV32-NEXT: vand.vv v16, v0, v16 +; RV32-NEXT: vsrl.vi v8, v8, 24 +; RV32-NEXT: vand.vx v8, v8, a4 ; RV32-NEXT: vor.vv v8, v16, v8 -; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: addi a1, sp, 48 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetvli zero, a4, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a1 +; RV32-NEXT: addi a1, sp, 40 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vand.vv v8, v8, v24 ; RV32-NEXT: vsll.vi v8, v8, 4 ; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: vsrl.vi v16, v8, 2 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetvli zero, a4, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a1 +; RV32-NEXT: addi a1, sp, 32 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vand.vv v8, v8, v24 ; RV32-NEXT: vsll.vi v8, v8, 2 ; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: vsetvli zero, a4, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a5 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vand.vv v8, v8, v24 @@ -3096,7 +2928,7 @@ ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: addi sp, sp, 48 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_bitreverse_v16i64_unmasked: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll @@ -181,8 +181,25 @@ define void @bitreverse_v2i64(ptr %x, ptr %y) { ; RV32-LABEL: bitreverse_v2i64: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: sw zero, 4(sp) +; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsrl.vx v9, v8, a1 ; RV32-NEXT: li a2, 40 @@ -191,61 +208,49 @@ ; RV32-NEXT: addi a3, a3, -256 ; RV32-NEXT: vand.vx v10, v10, a3 ; RV32-NEXT: vor.vv v9, v10, v9 -; RV32-NEXT: vmv.v.i v0, 5 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.i v10, 0 -; 
RV32-NEXT: lui a4, 1044480 -; RV32-NEXT: vmerge.vxm v10, v10, a4, v0 -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vsrl.vi v11, v8, 8 -; RV32-NEXT: vand.vv v11, v11, v10 -; RV32-NEXT: vsrl.vi v12, v8, 24 +; RV32-NEXT: vsrl.vi v10, v8, 24 +; RV32-NEXT: mv a4, sp +; RV32-NEXT: vlse64.v v11, (a4), zero ; RV32-NEXT: lui a4, 4080 -; RV32-NEXT: vand.vx v12, v12, a4 -; RV32-NEXT: vor.vv v11, v11, v12 -; RV32-NEXT: vor.vv v9, v11, v9 -; RV32-NEXT: vsll.vx v11, v8, a1 +; RV32-NEXT: vand.vx v10, v10, a4 +; RV32-NEXT: vsrl.vi v12, v8, 8 +; RV32-NEXT: vand.vv v12, v12, v11 +; RV32-NEXT: vor.vv v10, v12, v10 +; RV32-NEXT: vor.vv v9, v10, v9 +; RV32-NEXT: vsll.vx v10, v8, a1 ; RV32-NEXT: vand.vx v12, v8, a3 ; RV32-NEXT: vsll.vx v12, v12, a2 -; RV32-NEXT: vor.vv v11, v11, v12 -; RV32-NEXT: vand.vv v10, v8, v10 -; RV32-NEXT: vsll.vi v10, v10, 8 -; RV32-NEXT: vand.vx v8, v8, a4 -; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: vor.vv v8, v11, v8 +; RV32-NEXT: vor.vv v10, v10, v12 +; RV32-NEXT: vand.vx v12, v8, a4 +; RV32-NEXT: vsll.vi v12, v12, 24 +; RV32-NEXT: vand.vv v8, v8, v11 +; RV32-NEXT: vsll.vi v8, v8, 8 +; RV32-NEXT: vor.vv v8, v12, v8 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vlse64.v v11, (a1), zero +; RV32-NEXT: vor.vv v8, v10, v8 ; RV32-NEXT: vor.vv v8, v8, v9 ; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10 -; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vand.vv v9, v9, v11 +; RV32-NEXT: vand.vv v8, v8, v11 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vlse64.v v10, (a1), zero ; RV32-NEXT: vsll.vi v8, v8, 4 ; RV32-NEXT: vor.vv v8, v9, v8 ; RV32-NEXT: vsrl.vi v9, v8, 2 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vand.vv v9, v9, v10 ; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vlse64.v v10, (a1), zero ; RV32-NEXT: vsll.vi v8, v8, 2 ; RV32-NEXT: vor.vv v8, v9, v8 ; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vand.vv v9, v9, v10 ; RV32-NEXT: vand.vv v8, v8, v10 ; RV32-NEXT: vadd.vv v8, v8, v8 ; RV32-NEXT: vor.vv v8, v9, v8 ; RV32-NEXT: vse64.v v8, (a0) +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: bitreverse_v2i64: @@ -736,8 +741,25 @@ define void @bitreverse_v4i64(ptr %x, ptr %y) { ; LMULMAX2-RV32-LABEL: bitreverse_v4i64: ; LMULMAX2-RV32: # %bb.0: +; LMULMAX2-RV32-NEXT: addi sp, sp, -32 +; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; LMULMAX2-RV32-NEXT: vle64.v v8, (a0) +; LMULMAX2-RV32-NEXT: sw zero, 4(sp) +; LMULMAX2-RV32-NEXT: lui a1, 1044480 +; LMULMAX2-RV32-NEXT: sw a1, 0(sp) +; LMULMAX2-RV32-NEXT: lui a1, 61681 +; LMULMAX2-RV32-NEXT: addi a1, a1, -241 +; LMULMAX2-RV32-NEXT: sw a1, 28(sp) +; LMULMAX2-RV32-NEXT: sw a1, 24(sp) +; LMULMAX2-RV32-NEXT: lui a1, 209715 +; LMULMAX2-RV32-NEXT: addi a1, a1, 819 +; LMULMAX2-RV32-NEXT: sw a1, 20(sp) +; LMULMAX2-RV32-NEXT: sw a1, 16(sp) +; LMULMAX2-RV32-NEXT: lui a1, 349525 +; LMULMAX2-RV32-NEXT: addi a1, a1, 1365 +; LMULMAX2-RV32-NEXT: sw a1, 12(sp) +; LMULMAX2-RV32-NEXT: 
sw a1, 8(sp) ; LMULMAX2-RV32-NEXT: li a1, 56 ; LMULMAX2-RV32-NEXT: vsrl.vx v10, v8, a1 ; LMULMAX2-RV32-NEXT: li a2, 40 @@ -746,63 +768,49 @@ ; LMULMAX2-RV32-NEXT: addi a3, a3, -256 ; LMULMAX2-RV32-NEXT: vand.vx v12, v12, a3 ; LMULMAX2-RV32-NEXT: vor.vv v10, v12, v10 -; LMULMAX2-RV32-NEXT: li a4, 85 -; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; LMULMAX2-RV32-NEXT: vmv.v.x v0, a4 -; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32-NEXT: vmv.v.i v12, 0 -; LMULMAX2-RV32-NEXT: lui a4, 1044480 -; LMULMAX2-RV32-NEXT: vmerge.vxm v12, v12, a4, v0 -; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32-NEXT: vsrl.vi v14, v8, 8 -; LMULMAX2-RV32-NEXT: vand.vv v14, v14, v12 -; LMULMAX2-RV32-NEXT: vsrl.vi v16, v8, 24 +; LMULMAX2-RV32-NEXT: vsrl.vi v12, v8, 24 +; LMULMAX2-RV32-NEXT: mv a4, sp +; LMULMAX2-RV32-NEXT: vlse64.v v14, (a4), zero ; LMULMAX2-RV32-NEXT: lui a4, 4080 -; LMULMAX2-RV32-NEXT: vand.vx v16, v16, a4 -; LMULMAX2-RV32-NEXT: vor.vv v14, v14, v16 -; LMULMAX2-RV32-NEXT: vor.vv v10, v14, v10 -; LMULMAX2-RV32-NEXT: vsll.vx v14, v8, a1 +; LMULMAX2-RV32-NEXT: vand.vx v12, v12, a4 +; LMULMAX2-RV32-NEXT: vsrl.vi v16, v8, 8 +; LMULMAX2-RV32-NEXT: vand.vv v16, v16, v14 +; LMULMAX2-RV32-NEXT: vor.vv v12, v16, v12 +; LMULMAX2-RV32-NEXT: vor.vv v10, v12, v10 +; LMULMAX2-RV32-NEXT: vsll.vx v12, v8, a1 ; LMULMAX2-RV32-NEXT: vand.vx v16, v8, a3 ; LMULMAX2-RV32-NEXT: vsll.vx v16, v16, a2 -; LMULMAX2-RV32-NEXT: vor.vv v14, v14, v16 -; LMULMAX2-RV32-NEXT: vand.vv v12, v8, v12 -; LMULMAX2-RV32-NEXT: vsll.vi v12, v12, 8 -; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a4 -; LMULMAX2-RV32-NEXT: vsll.vi v8, v8, 24 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v12 -; LMULMAX2-RV32-NEXT: vor.vv v8, v14, v8 +; LMULMAX2-RV32-NEXT: vor.vv v12, v12, v16 +; LMULMAX2-RV32-NEXT: vand.vx v16, v8, a4 +; LMULMAX2-RV32-NEXT: vsll.vi v16, v16, 24 +; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v14 +; LMULMAX2-RV32-NEXT: vsll.vi v8, v8, 8 +; LMULMAX2-RV32-NEXT: vor.vv v8, v16, v8 +; LMULMAX2-RV32-NEXT: addi a1, sp, 24 +; LMULMAX2-RV32-NEXT: vlse64.v v14, (a1), zero +; LMULMAX2-RV32-NEXT: vor.vv v8, v12, v8 ; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10 ; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX2-RV32-NEXT: lui a1, 61681 -; LMULMAX2-RV32-NEXT: addi a1, a1, -241 -; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32-NEXT: vmv.v.x v12, a1 -; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32-NEXT: vand.vv v10, v10, v12 -; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v12 +; LMULMAX2-RV32-NEXT: vand.vv v10, v10, v14 +; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v14 +; LMULMAX2-RV32-NEXT: addi a1, sp, 16 +; LMULMAX2-RV32-NEXT: vlse64.v v12, (a1), zero ; LMULMAX2-RV32-NEXT: vsll.vi v8, v8, 4 ; LMULMAX2-RV32-NEXT: vor.vv v8, v10, v8 ; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 2 -; LMULMAX2-RV32-NEXT: lui a1, 209715 -; LMULMAX2-RV32-NEXT: addi a1, a1, 819 -; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32-NEXT: vmv.v.x v12, a1 -; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; LMULMAX2-RV32-NEXT: vand.vv v10, v10, v12 ; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v12 +; LMULMAX2-RV32-NEXT: addi a1, sp, 8 +; LMULMAX2-RV32-NEXT: vlse64.v v12, (a1), zero ; LMULMAX2-RV32-NEXT: vsll.vi v8, v8, 2 ; LMULMAX2-RV32-NEXT: vor.vv v8, v10, v8 ; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX2-RV32-NEXT: lui a1, 349525 -; LMULMAX2-RV32-NEXT: addi a1, a1, 1365 -; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32-NEXT: vmv.v.x v12, a1 -; LMULMAX2-RV32-NEXT: 
vsetivli zero, 4, e64, m2, ta, ma ; LMULMAX2-RV32-NEXT: vand.vv v10, v10, v12 ; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v12 ; LMULMAX2-RV32-NEXT: vadd.vv v8, v8, v8 ; LMULMAX2-RV32-NEXT: vor.vv v8, v10, v8 ; LMULMAX2-RV32-NEXT: vse64.v v8, (a0) +; LMULMAX2-RV32-NEXT: addi sp, sp, 32 ; LMULMAX2-RV32-NEXT: ret ; ; LMULMAX2-RV64-LABEL: bitreverse_v4i64: @@ -869,110 +877,115 @@ ; ; LMULMAX1-RV32-LABEL: bitreverse_v4i64: ; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi sp, sp, -32 +; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-RV32-NEXT: addi a1, a0, 16 -; LMULMAX1-RV32-NEXT: vle64.v v10, (a1) ; LMULMAX1-RV32-NEXT: vle64.v v8, (a0) +; LMULMAX1-RV32-NEXT: addi a1, a0, 16 +; LMULMAX1-RV32-NEXT: vle64.v v9, (a1) +; LMULMAX1-RV32-NEXT: sw zero, 4(sp) +; LMULMAX1-RV32-NEXT: lui a2, 1044480 +; LMULMAX1-RV32-NEXT: sw a2, 0(sp) +; LMULMAX1-RV32-NEXT: lui a2, 61681 +; LMULMAX1-RV32-NEXT: addi a2, a2, -241 +; LMULMAX1-RV32-NEXT: sw a2, 28(sp) +; LMULMAX1-RV32-NEXT: sw a2, 24(sp) +; LMULMAX1-RV32-NEXT: lui a2, 209715 +; LMULMAX1-RV32-NEXT: addi a2, a2, 819 +; LMULMAX1-RV32-NEXT: sw a2, 20(sp) +; LMULMAX1-RV32-NEXT: sw a2, 16(sp) +; LMULMAX1-RV32-NEXT: lui a2, 349525 +; LMULMAX1-RV32-NEXT: addi a2, a2, 1365 +; LMULMAX1-RV32-NEXT: sw a2, 12(sp) +; LMULMAX1-RV32-NEXT: sw a2, 8(sp) ; LMULMAX1-RV32-NEXT: li a2, 56 -; LMULMAX1-RV32-NEXT: vsrl.vx v9, v10, a2 +; LMULMAX1-RV32-NEXT: vsrl.vx v10, v9, a2 ; LMULMAX1-RV32-NEXT: li a3, 40 -; LMULMAX1-RV32-NEXT: vsrl.vx v11, v10, a3 +; LMULMAX1-RV32-NEXT: vsrl.vx v11, v9, a3 ; LMULMAX1-RV32-NEXT: lui a4, 16 ; LMULMAX1-RV32-NEXT: addi a4, a4, -256 ; LMULMAX1-RV32-NEXT: vand.vx v11, v11, a4 -; LMULMAX1-RV32-NEXT: vor.vv v11, v11, v9 -; LMULMAX1-RV32-NEXT: vmv.v.i v0, 5 -; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-RV32-NEXT: vmv.v.i v9, 0 -; LMULMAX1-RV32-NEXT: lui a5, 1044480 -; LMULMAX1-RV32-NEXT: vmerge.vxm v9, v9, a5, v0 -; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-RV32-NEXT: vsrl.vi v12, v10, 8 -; LMULMAX1-RV32-NEXT: vand.vv v12, v12, v9 -; LMULMAX1-RV32-NEXT: vsrl.vi v13, v10, 24 +; LMULMAX1-RV32-NEXT: vor.vv v10, v11, v10 +; LMULMAX1-RV32-NEXT: vsrl.vi v11, v9, 24 +; LMULMAX1-RV32-NEXT: mv a5, sp +; LMULMAX1-RV32-NEXT: vlse64.v v12, (a5), zero ; LMULMAX1-RV32-NEXT: lui a5, 4080 -; LMULMAX1-RV32-NEXT: vand.vx v13, v13, a5 -; LMULMAX1-RV32-NEXT: vor.vv v12, v12, v13 -; LMULMAX1-RV32-NEXT: vor.vv v11, v12, v11 -; LMULMAX1-RV32-NEXT: vsll.vx v12, v10, a2 -; LMULMAX1-RV32-NEXT: vand.vx v13, v10, a4 -; LMULMAX1-RV32-NEXT: vsll.vx v13, v13, a3 -; LMULMAX1-RV32-NEXT: vor.vv v12, v12, v13 -; LMULMAX1-RV32-NEXT: vand.vv v13, v10, v9 -; LMULMAX1-RV32-NEXT: vsll.vi v13, v13, 8 -; LMULMAX1-RV32-NEXT: vand.vx v10, v10, a5 -; LMULMAX1-RV32-NEXT: vsll.vi v10, v10, 24 -; LMULMAX1-RV32-NEXT: vor.vv v10, v10, v13 -; LMULMAX1-RV32-NEXT: vor.vv v10, v12, v10 -; LMULMAX1-RV32-NEXT: vor.vv v10, v10, v11 -; LMULMAX1-RV32-NEXT: vsrl.vi v11, v10, 4 -; LMULMAX1-RV32-NEXT: lui a6, 61681 -; LMULMAX1-RV32-NEXT: addi a6, a6, -241 -; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-RV32-NEXT: vmv.v.x v12, a6 -; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-RV32-NEXT: vand.vv v11, v11, v12 -; LMULMAX1-RV32-NEXT: vand.vv v10, v10, v12 -; LMULMAX1-RV32-NEXT: vsll.vi v10, v10, 4 +; LMULMAX1-RV32-NEXT: vand.vx v11, v11, a5 +; LMULMAX1-RV32-NEXT: vsrl.vi v13, v9, 8 +; LMULMAX1-RV32-NEXT: vand.vv v13, v13, v12 +; LMULMAX1-RV32-NEXT: vor.vv v11, v13, v11 
; LMULMAX1-RV32-NEXT: vor.vv v10, v11, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v11, v10, 2 -; LMULMAX1-RV32-NEXT: lui a6, 209715 -; LMULMAX1-RV32-NEXT: addi a6, a6, 819 -; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-RV32-NEXT: vmv.v.x v13, a6 -; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-RV32-NEXT: vand.vv v11, v11, v13 +; LMULMAX1-RV32-NEXT: vand.vv v11, v9, v12 +; LMULMAX1-RV32-NEXT: vsll.vi v11, v11, 8 +; LMULMAX1-RV32-NEXT: vand.vx v13, v9, a5 +; LMULMAX1-RV32-NEXT: vsll.vi v13, v13, 24 +; LMULMAX1-RV32-NEXT: vor.vv v11, v13, v11 +; LMULMAX1-RV32-NEXT: vsll.vx v13, v9, a2 +; LMULMAX1-RV32-NEXT: vand.vx v9, v9, a4 +; LMULMAX1-RV32-NEXT: vsll.vx v9, v9, a3 +; LMULMAX1-RV32-NEXT: vor.vv v9, v13, v9 +; LMULMAX1-RV32-NEXT: addi a6, sp, 24 +; LMULMAX1-RV32-NEXT: vlse64.v v13, (a6), zero +; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v11 +; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v10 +; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 4 ; LMULMAX1-RV32-NEXT: vand.vv v10, v10, v13 -; LMULMAX1-RV32-NEXT: vsll.vi v10, v10, 2 -; LMULMAX1-RV32-NEXT: vor.vv v10, v11, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v11, v10, 1 -; LMULMAX1-RV32-NEXT: lui a6, 349525 -; LMULMAX1-RV32-NEXT: addi a6, a6, 1365 -; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-RV32-NEXT: vmv.v.x v14, a6 -; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-RV32-NEXT: vand.vv v11, v11, v14 +; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v13 +; LMULMAX1-RV32-NEXT: addi a6, sp, 16 +; LMULMAX1-RV32-NEXT: vlse64.v v11, (a6), zero +; LMULMAX1-RV32-NEXT: vsll.vi v9, v9, 4 +; LMULMAX1-RV32-NEXT: vor.vv v9, v10, v9 +; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 2 +; LMULMAX1-RV32-NEXT: vand.vv v10, v10, v11 +; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v11 +; LMULMAX1-RV32-NEXT: addi a6, sp, 8 +; LMULMAX1-RV32-NEXT: vlse64.v v14, (a6), zero +; LMULMAX1-RV32-NEXT: vsll.vi v9, v9, 2 +; LMULMAX1-RV32-NEXT: vor.vv v9, v10, v9 +; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 1 ; LMULMAX1-RV32-NEXT: vand.vv v10, v10, v14 -; LMULMAX1-RV32-NEXT: vadd.vv v10, v10, v10 -; LMULMAX1-RV32-NEXT: vor.vv v10, v11, v10 -; LMULMAX1-RV32-NEXT: vsrl.vx v11, v8, a2 +; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v14 +; LMULMAX1-RV32-NEXT: vadd.vv v9, v9, v9 +; LMULMAX1-RV32-NEXT: vor.vv v9, v10, v9 +; LMULMAX1-RV32-NEXT: vsrl.vx v10, v8, a2 ; LMULMAX1-RV32-NEXT: vsrl.vx v15, v8, a3 ; LMULMAX1-RV32-NEXT: vand.vx v15, v15, a4 -; LMULMAX1-RV32-NEXT: vor.vv v11, v15, v11 -; LMULMAX1-RV32-NEXT: vsrl.vi v15, v8, 8 -; LMULMAX1-RV32-NEXT: vand.vv v15, v15, v9 -; LMULMAX1-RV32-NEXT: vsrl.vi v16, v8, 24 -; LMULMAX1-RV32-NEXT: vand.vx v16, v16, a5 -; LMULMAX1-RV32-NEXT: vor.vv v15, v15, v16 -; LMULMAX1-RV32-NEXT: vor.vv v11, v15, v11 +; LMULMAX1-RV32-NEXT: vor.vv v10, v15, v10 +; LMULMAX1-RV32-NEXT: vsrl.vi v15, v8, 24 +; LMULMAX1-RV32-NEXT: vand.vx v15, v15, a5 +; LMULMAX1-RV32-NEXT: vsrl.vi v16, v8, 8 +; LMULMAX1-RV32-NEXT: vand.vv v16, v16, v12 +; LMULMAX1-RV32-NEXT: vor.vv v15, v16, v15 +; LMULMAX1-RV32-NEXT: vor.vv v10, v15, v10 ; LMULMAX1-RV32-NEXT: vsll.vx v15, v8, a2 ; LMULMAX1-RV32-NEXT: vand.vx v16, v8, a4 ; LMULMAX1-RV32-NEXT: vsll.vx v16, v16, a3 ; LMULMAX1-RV32-NEXT: vor.vv v15, v15, v16 -; LMULMAX1-RV32-NEXT: vand.vv v9, v8, v9 -; LMULMAX1-RV32-NEXT: vsll.vi v9, v9, 8 -; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a5 -; LMULMAX1-RV32-NEXT: vsll.vi v8, v8, 24 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: vor.vv v8, v15, v8 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v11 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v12 
+; LMULMAX1-RV32-NEXT: vand.vx v16, v8, a5 +; LMULMAX1-RV32-NEXT: vsll.vi v16, v16, 24 ; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v12 -; LMULMAX1-RV32-NEXT: vsll.vi v8, v8, 4 -; LMULMAX1-RV32-NEXT: vor.vv v8, v9, v8 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 2 -; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v13 +; LMULMAX1-RV32-NEXT: vsll.vi v8, v8, 8 +; LMULMAX1-RV32-NEXT: vor.vv v8, v16, v8 +; LMULMAX1-RV32-NEXT: vor.vv v8, v15, v8 +; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10 +; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX1-RV32-NEXT: vand.vv v10, v10, v13 ; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v13 +; LMULMAX1-RV32-NEXT: vsll.vi v8, v8, 4 +; LMULMAX1-RV32-NEXT: vor.vv v8, v10, v8 +; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 2 +; LMULMAX1-RV32-NEXT: vand.vv v10, v10, v11 +; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v11 ; LMULMAX1-RV32-NEXT: vsll.vi v8, v8, 2 -; LMULMAX1-RV32-NEXT: vor.vv v8, v9, v8 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v14 +; LMULMAX1-RV32-NEXT: vor.vv v8, v10, v8 +; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX1-RV32-NEXT: vand.vv v10, v10, v14 ; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v14 ; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v8 -; LMULMAX1-RV32-NEXT: vor.vv v8, v9, v8 +; LMULMAX1-RV32-NEXT: vor.vv v8, v10, v8 ; LMULMAX1-RV32-NEXT: vse64.v v8, (a0) -; LMULMAX1-RV32-NEXT: vse64.v v10, (a1) +; LMULMAX1-RV32-NEXT: vse64.v v9, (a1) +; LMULMAX1-RV32-NEXT: addi sp, sp, 32 ; LMULMAX1-RV32-NEXT: ret ; ; LMULMAX1-RV64-LABEL: bitreverse_v4i64: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll @@ -425,42 +425,43 @@ define <2 x i64> @vp_bswap_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_bswap_v2i64: ; RV32: # %bb.0: -; RV32-NEXT: vmv1r.v v9, v0 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vsrl.vx v10, v8, a1, v0.t -; RV32-NEXT: li a2, 40 -; RV32-NEXT: vsrl.vx v11, v8, a2, v0.t -; RV32-NEXT: lui a3, 16 -; RV32-NEXT: addi a3, a3, -256 -; RV32-NEXT: vand.vx v11, v11, a3, v0.t -; RV32-NEXT: vor.vv v10, v11, v10, v0.t -; RV32-NEXT: vsrl.vi v11, v8, 8, v0.t -; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; RV32-NEXT: vmv.v.i v0, 5 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.i v12, 0 -; RV32-NEXT: lui a4, 1044480 -; RV32-NEXT: vmerge.vxm v12, v12, a4, v0 +; RV32-NEXT: vsll.vx v9, v8, a1, v0.t +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, -256 +; RV32-NEXT: vand.vx v10, v8, a2, v0.t +; RV32-NEXT: li a3, 40 +; RV32-NEXT: vsll.vx v10, v10, a3, v0.t +; RV32-NEXT: vor.vv v9, v9, v10, v0.t +; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vand.vx v10, v8, a4, v0.t +; RV32-NEXT: vsll.vi v10, v10, 24, v0.t +; RV32-NEXT: addi a5, sp, 8 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v11, (a5), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vmv1r.v v0, v9 -; RV32-NEXT: vand.vv v11, v11, v12, v0.t -; RV32-NEXT: vsrl.vi v13, v8, 24, v0.t -; RV32-NEXT: lui a0, 4080 -; RV32-NEXT: vand.vx v13, v13, a0, v0.t -; RV32-NEXT: vor.vv v11, v11, v13, v0.t -; RV32-NEXT: vor.vv v10, v11, v10, v0.t -; RV32-NEXT: vsll.vx v11, v8, a1, v0.t -; RV32-NEXT: vand.vx v13, v8, a3, v0.t -; RV32-NEXT: vsll.vx v13, v13, a2, v0.t -; RV32-NEXT: 
vor.vv v11, v11, v13, v0.t -; RV32-NEXT: vand.vx v13, v8, a0, v0.t -; RV32-NEXT: vsll.vi v13, v13, 24, v0.t -; RV32-NEXT: vand.vv v8, v8, v12, v0.t -; RV32-NEXT: vsll.vi v8, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v13, v8, v0.t -; RV32-NEXT: vor.vv v8, v11, v8, v0.t +; RV32-NEXT: vand.vv v12, v8, v11, v0.t +; RV32-NEXT: vsll.vi v12, v12, 8, v0.t +; RV32-NEXT: vor.vv v10, v10, v12, v0.t +; RV32-NEXT: vor.vv v9, v9, v10, v0.t +; RV32-NEXT: vsrl.vx v10, v8, a1, v0.t +; RV32-NEXT: vsrl.vx v12, v8, a3, v0.t +; RV32-NEXT: vand.vx v12, v12, a2, v0.t +; RV32-NEXT: vor.vv v10, v12, v10, v0.t +; RV32-NEXT: vsrl.vi v12, v8, 24, v0.t +; RV32-NEXT: vand.vx v12, v12, a4, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV32-NEXT: vand.vv v8, v8, v11, v0.t +; RV32-NEXT: vor.vv v8, v8, v12, v0.t ; RV32-NEXT: vor.vv v8, v8, v10, v0.t +; RV32-NEXT: vor.vv v8, v9, v8, v0.t +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_bswap_v2i64: @@ -502,6 +503,11 @@ define <2 x i64> @vp_bswap_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_bswap_v2i64_unmasked: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vsll.vx v9, v8, a1 @@ -514,12 +520,9 @@ ; RV32-NEXT: lui a4, 4080 ; RV32-NEXT: vand.vx v10, v8, a4 ; RV32-NEXT: vsll.vi v10, v10, 24 -; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; RV32-NEXT: vmv.v.i v0, 5 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.i v11, 0 -; RV32-NEXT: lui a5, 1044480 -; RV32-NEXT: vmerge.vxm v11, v11, a5, v0 +; RV32-NEXT: addi a5, sp, 8 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v11, (a5), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vand.vv v12, v8, v11 ; RV32-NEXT: vsll.vi v12, v12, 8 @@ -529,13 +532,14 @@ ; RV32-NEXT: vsrl.vx v12, v8, a3 ; RV32-NEXT: vand.vx v12, v12, a2 ; RV32-NEXT: vor.vv v10, v12, v10 -; RV32-NEXT: vsrl.vi v12, v8, 8 -; RV32-NEXT: vand.vv v11, v12, v11 -; RV32-NEXT: vsrl.vi v8, v8, 24 -; RV32-NEXT: vand.vx v8, v8, a4 -; RV32-NEXT: vor.vv v8, v11, v8 +; RV32-NEXT: vsrl.vi v12, v8, 24 +; RV32-NEXT: vand.vx v12, v12, a4 +; RV32-NEXT: vsrl.vi v8, v8, 8 +; RV32-NEXT: vand.vv v8, v8, v11 +; RV32-NEXT: vor.vv v8, v8, v12 ; RV32-NEXT: vor.vv v8, v8, v10 ; RV32-NEXT: vor.vv v8, v9, v8 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_bswap_v2i64_unmasked: @@ -581,43 +585,43 @@ define <4 x i64> @vp_bswap_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_bswap_v4i64: ; RV32: # %bb.0: -; RV32-NEXT: vmv1r.v v10, v0 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vsrl.vx v12, v8, a1, v0.t -; RV32-NEXT: li a2, 40 -; RV32-NEXT: vsrl.vx v14, v8, a2, v0.t -; RV32-NEXT: lui a3, 16 -; RV32-NEXT: addi a3, a3, -256 -; RV32-NEXT: vand.vx v14, v14, a3, v0.t -; RV32-NEXT: vor.vv v12, v14, v12, v0.t -; RV32-NEXT: vsrl.vi v14, v8, 8, v0.t -; RV32-NEXT: li a4, 85 -; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; RV32-NEXT: vmv.v.x v0, a4 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.i v16, 0 -; RV32-NEXT: lui a4, 1044480 -; RV32-NEXT: vmerge.vxm v16, v16, a4, v0 +; RV32-NEXT: vsll.vx v10, v8, a1, v0.t +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, 
a2, -256 +; RV32-NEXT: vand.vx v12, v8, a2, v0.t +; RV32-NEXT: li a3, 40 +; RV32-NEXT: vsll.vx v12, v12, a3, v0.t +; RV32-NEXT: vor.vv v10, v10, v12, v0.t +; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vand.vx v12, v8, a4, v0.t +; RV32-NEXT: vsll.vi v12, v12, 24, v0.t +; RV32-NEXT: addi a5, sp, 8 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v14, (a5), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vmv1r.v v0, v10 -; RV32-NEXT: vand.vv v14, v14, v16, v0.t -; RV32-NEXT: vsrl.vi v18, v8, 24, v0.t -; RV32-NEXT: lui a0, 4080 -; RV32-NEXT: vand.vx v18, v18, a0, v0.t -; RV32-NEXT: vor.vv v14, v14, v18, v0.t -; RV32-NEXT: vor.vv v12, v14, v12, v0.t -; RV32-NEXT: vsll.vx v14, v8, a1, v0.t -; RV32-NEXT: vand.vx v18, v8, a3, v0.t -; RV32-NEXT: vsll.vx v18, v18, a2, v0.t -; RV32-NEXT: vor.vv v14, v14, v18, v0.t -; RV32-NEXT: vand.vx v18, v8, a0, v0.t -; RV32-NEXT: vsll.vi v18, v18, 24, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vsll.vi v8, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v18, v8, v0.t -; RV32-NEXT: vor.vv v8, v14, v8, v0.t +; RV32-NEXT: vand.vv v16, v8, v14, v0.t +; RV32-NEXT: vsll.vi v16, v16, 8, v0.t +; RV32-NEXT: vor.vv v12, v12, v16, v0.t +; RV32-NEXT: vor.vv v10, v10, v12, v0.t +; RV32-NEXT: vsrl.vx v12, v8, a1, v0.t +; RV32-NEXT: vsrl.vx v16, v8, a3, v0.t +; RV32-NEXT: vand.vx v16, v16, a2, v0.t +; RV32-NEXT: vor.vv v12, v16, v12, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 24, v0.t +; RV32-NEXT: vand.vx v16, v16, a4, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV32-NEXT: vand.vv v8, v8, v14, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vor.vv v8, v8, v12, v0.t +; RV32-NEXT: vor.vv v8, v10, v8, v0.t +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_bswap_v4i64: @@ -659,6 +663,11 @@ define <4 x i64> @vp_bswap_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_bswap_v4i64_unmasked: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vsll.vx v10, v8, a1 @@ -671,13 +680,9 @@ ; RV32-NEXT: lui a4, 4080 ; RV32-NEXT: vand.vx v12, v8, a4 ; RV32-NEXT: vsll.vi v12, v12, 24 -; RV32-NEXT: li a5, 85 -; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; RV32-NEXT: vmv.v.x v0, a5 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.i v14, 0 -; RV32-NEXT: lui a5, 1044480 -; RV32-NEXT: vmerge.vxm v14, v14, a5, v0 +; RV32-NEXT: addi a5, sp, 8 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v14, (a5), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vand.vv v16, v8, v14 ; RV32-NEXT: vsll.vi v16, v16, 8 @@ -687,13 +692,14 @@ ; RV32-NEXT: vsrl.vx v16, v8, a3 ; RV32-NEXT: vand.vx v16, v16, a2 ; RV32-NEXT: vor.vv v12, v16, v12 -; RV32-NEXT: vsrl.vi v16, v8, 8 -; RV32-NEXT: vand.vv v14, v16, v14 -; RV32-NEXT: vsrl.vi v8, v8, 24 -; RV32-NEXT: vand.vx v8, v8, a4 -; RV32-NEXT: vor.vv v8, v14, v8 +; RV32-NEXT: vsrl.vi v16, v8, 24 +; RV32-NEXT: vand.vx v16, v16, a4 +; RV32-NEXT: vsrl.vi v8, v8, 8 +; RV32-NEXT: vand.vv v8, v8, v14 +; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: vor.vv v8, v8, v12 ; RV32-NEXT: vor.vv v8, v10, v8 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_bswap_v4i64_unmasked: @@ -739,44 +745,43 @@ define <8 x i64> @vp_bswap_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_bswap_v8i64: ; RV32: # %bb.0: -; RV32-NEXT: vmv1r.v 
v12, v0 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t -; RV32-NEXT: li a2, 40 -; RV32-NEXT: vsrl.vx v20, v8, a2, v0.t -; RV32-NEXT: lui a3, 16 -; RV32-NEXT: addi a3, a3, -256 -; RV32-NEXT: vand.vx v20, v20, a3, v0.t -; RV32-NEXT: vor.vv v20, v20, v16, v0.t -; RV32-NEXT: vsrl.vi v24, v8, 8, v0.t -; RV32-NEXT: lui a4, 5 -; RV32-NEXT: addi a4, a4, 1365 -; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; RV32-NEXT: vmv.v.x v0, a4 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.i v16, 0 -; RV32-NEXT: lui a4, 1044480 -; RV32-NEXT: vmerge.vxm v16, v16, a4, v0 +; RV32-NEXT: vsll.vx v12, v8, a1, v0.t +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, -256 +; RV32-NEXT: vand.vx v16, v8, a2, v0.t +; RV32-NEXT: li a3, 40 +; RV32-NEXT: vsll.vx v16, v16, a3, v0.t +; RV32-NEXT: vor.vv v16, v12, v16, v0.t +; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vand.vx v12, v8, a4, v0.t +; RV32-NEXT: vsll.vi v20, v12, 24, v0.t +; RV32-NEXT: addi a5, sp, 8 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a5), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vmv1r.v v0, v12 -; RV32-NEXT: vand.vv v24, v24, v16, v0.t -; RV32-NEXT: vsrl.vi v28, v8, 24, v0.t -; RV32-NEXT: lui a0, 4080 -; RV32-NEXT: vand.vx v28, v28, a0, v0.t -; RV32-NEXT: vor.vv v24, v24, v28, v0.t +; RV32-NEXT: vand.vv v24, v8, v12, v0.t +; RV32-NEXT: vsll.vi v24, v24, 8, v0.t +; RV32-NEXT: vor.vv v20, v20, v24, v0.t +; RV32-NEXT: vor.vv v16, v16, v20, v0.t +; RV32-NEXT: vsrl.vx v20, v8, a1, v0.t +; RV32-NEXT: vsrl.vx v24, v8, a3, v0.t +; RV32-NEXT: vand.vx v24, v24, a2, v0.t ; RV32-NEXT: vor.vv v20, v24, v20, v0.t -; RV32-NEXT: vsll.vx v24, v8, a1, v0.t -; RV32-NEXT: vand.vx v28, v8, a3, v0.t -; RV32-NEXT: vsll.vx v28, v28, a2, v0.t -; RV32-NEXT: vor.vv v24, v24, v28, v0.t -; RV32-NEXT: vand.vx v28, v8, a0, v0.t -; RV32-NEXT: vsll.vi v28, v28, 24, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vsll.vi v8, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v28, v8, v0.t -; RV32-NEXT: vor.vv v8, v24, v8, v0.t +; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t +; RV32-NEXT: vand.vx v24, v24, a4, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV32-NEXT: vand.vv v8, v8, v12, v0.t +; RV32-NEXT: vor.vv v8, v8, v24, v0.t ; RV32-NEXT: vor.vv v8, v8, v20, v0.t +; RV32-NEXT: vor.vv v8, v16, v8, v0.t +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_bswap_v8i64: @@ -818,6 +823,11 @@ define <8 x i64> @vp_bswap_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_bswap_v8i64_unmasked: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vsll.vx v12, v8, a1 @@ -830,14 +840,9 @@ ; RV32-NEXT: lui a4, 4080 ; RV32-NEXT: vand.vx v16, v8, a4 ; RV32-NEXT: vsll.vi v16, v16, 24 -; RV32-NEXT: lui a5, 5 -; RV32-NEXT: addi a5, a5, 1365 -; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; RV32-NEXT: vmv.v.x v0, a5 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.i v20, 0 -; RV32-NEXT: lui a5, 1044480 -; RV32-NEXT: vmerge.vxm v20, v20, a5, v0 +; RV32-NEXT: addi a5, sp, 8 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v20, (a5), zero ; RV32-NEXT: 
vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vand.vv v24, v8, v20 ; RV32-NEXT: vsll.vi v24, v24, 8 @@ -847,13 +852,14 @@ ; RV32-NEXT: vsrl.vx v24, v8, a3 ; RV32-NEXT: vand.vx v24, v24, a2 ; RV32-NEXT: vor.vv v16, v24, v16 -; RV32-NEXT: vsrl.vi v24, v8, 8 -; RV32-NEXT: vand.vv v20, v24, v20 -; RV32-NEXT: vsrl.vi v8, v8, 24 -; RV32-NEXT: vand.vx v8, v8, a4 -; RV32-NEXT: vor.vv v8, v20, v8 +; RV32-NEXT: vsrl.vi v24, v8, 24 +; RV32-NEXT: vand.vx v24, v24, a4 +; RV32-NEXT: vsrl.vi v8, v8, 8 +; RV32-NEXT: vand.vv v8, v8, v20 +; RV32-NEXT: vor.vv v8, v8, v24 ; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: vor.vv v8, v12, v8 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_bswap_v8i64_unmasked: @@ -906,114 +912,72 @@ ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: sub sp, sp, a1 ; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb -; RV32-NEXT: vmv1r.v v1, v0 +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsll.vx v24, v8, a1, v0.t -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 4 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 16 -; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vsll.vx v16, v8, a1, v0.t ; RV32-NEXT: lui a2, 16 ; RV32-NEXT: addi a2, a2, -256 ; RV32-NEXT: vand.vx v24, v8, a2, v0.t ; RV32-NEXT: li a3, 40 ; RV32-NEXT: vsll.vx v24, v24, a3, v0.t -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 3 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 4 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 3 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v24, v24, v16, v0.t -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 4 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vor.vv v16, v16, v24, v0.t +; RV32-NEXT: addi a4, sp, 16 +; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill ; RV32-NEXT: lui a4, 4080 -; RV32-NEXT: vand.vx v24, v8, a4, v0.t -; RV32-NEXT: vsll.vi v24, v24, 24, v0.t +; RV32-NEXT: vand.vx v16, v8, a4, v0.t +; RV32-NEXT: vsll.vi v16, v16, 24, v0.t +; RV32-NEXT: csrr a5, vlenb +; RV32-NEXT: slli a5, a5, 4 +; RV32-NEXT: add a5, sp, a5 +; RV32-NEXT: addi a5, a5, 16 +; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill +; RV32-NEXT: addi a5, sp, 8 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a5), zero ; RV32-NEXT: csrr a5, vlenb ; RV32-NEXT: slli a5, a5, 3 ; RV32-NEXT: add a5, sp, a5 ; RV32-NEXT: addi a5, a5, 16 -; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill -; RV32-NEXT: li a5, 32 -; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma -; RV32-NEXT: lui a6, 349525 -; RV32-NEXT: addi a6, a6, 1365 -; RV32-NEXT: vmv.v.i v24, 0 -; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV32-NEXT: lui a7, 1044480 -; RV32-NEXT: vmv.v.x v0, a6 -; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma -; RV32-NEXT: vmerge.vxm v24, v24, a7, v0 -; RV32-NEXT: addi a5, sp, 16 -; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size 
Folded Spill ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vmv1r.v v0, v1 -; RV32-NEXT: vand.vv v16, v8, v24, v0.t +; RV32-NEXT: vand.vv v16, v8, v16, v0.t ; RV32-NEXT: vsll.vi v16, v16, 8, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vor.vv v16, v24, v16, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, sp, 16 ; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v24, v24, v16, v0.t +; RV32-NEXT: vor.vv v16, v24, v16, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vx v24, v8, a1, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vx v24, v8, a3, v0.t -; RV32-NEXT: vand.vx v16, v24, a2, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v24, v16, v24, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v16, v8, 8, v0.t +; RV32-NEXT: vsrl.vx v16, v8, a3, v0.t +; RV32-NEXT: vand.vx v16, v16, a2, v0.t +; RV32-NEXT: vor.vv v16, v16, v24, v0.t ; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v16, v24, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 24, v0.t -; RV32-NEXT: vand.vx v8, v8, a4, v0.t -; RV32-NEXT: vor.vv v8, v24, v8, v0.t +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t +; RV32-NEXT: vand.vx v24, v24, a4, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: vor.vv v8, v8, v24, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 @@ -1089,51 +1053,46 @@ ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: sub sp, sp, a1 ; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vx v16, v8, a1 -; RV32-NEXT: li a2, 40 -; RV32-NEXT: vsrl.vx v24, v8, a2 -; RV32-NEXT: lui a3, 16 -; RV32-NEXT: addi a3, a3, -256 -; RV32-NEXT: vand.vx v24, v24, a3 -; RV32-NEXT: vor.vv v16, v24, v16 +; RV32-NEXT: vsll.vx v16, v8, a1 +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, -256 +; RV32-NEXT: vand.vx v24, v8, a2 +; RV32-NEXT: li a3, 40 +; RV32-NEXT: vsll.vx v24, v24, a3 +; 
RV32-NEXT: vor.vv v16, v16, v24 ; RV32-NEXT: addi a4, sp, 16 ; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: li a4, 32 -; RV32-NEXT: vsetvli zero, a4, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, 0 -; RV32-NEXT: lui a5, 349525 -; RV32-NEXT: addi a5, a5, 1365 -; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV32-NEXT: vmv.v.x v0, a5 -; RV32-NEXT: lui a5, 1044480 -; RV32-NEXT: vsetvli zero, a4, e32, m8, ta, ma -; RV32-NEXT: vmerge.vxm v24, v24, a5, v0 +; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vand.vx v16, v8, a4 +; RV32-NEXT: vsll.vi v0, v16, 24 +; RV32-NEXT: addi a5, sp, 8 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a5), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v0, v8, 8 -; RV32-NEXT: vand.vv v0, v0, v24 -; RV32-NEXT: lui a0, 4080 -; RV32-NEXT: vsrl.vi v16, v8, 24 -; RV32-NEXT: vand.vx v16, v16, a0 -; RV32-NEXT: vor.vv v16, v0, v16 -; RV32-NEXT: addi a4, sp, 16 -; RV32-NEXT: vl8r.v v0, (a4) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v16, v16, v0 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vand.vx v0, v8, a3 -; RV32-NEXT: vsll.vx v0, v0, a2 -; RV32-NEXT: vsll.vx v16, v8, a1 -; RV32-NEXT: vor.vv v16, v16, v0 -; RV32-NEXT: vand.vv v24, v8, v24 +; RV32-NEXT: vand.vv v24, v8, v16 ; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vor.vv v8, v8, v24 -; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: vor.vv v24, v0, v24 ; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vx v0, v8, a3 +; RV32-NEXT: vand.vx v0, v0, a2 +; RV32-NEXT: vsrl.vx v24, v8, a1 +; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: vsrl.vi v0, v8, 8 +; RV32-NEXT: vand.vv v16, v0, v16 +; RV32-NEXT: vsrl.vi v8, v8, 24 +; RV32-NEXT: vand.vx v8, v8, a4 +; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: vor.vv v8, v8, v24 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add sp, sp, a0 @@ -1190,114 +1149,72 @@ ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: sub sp, sp, a1 ; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb -; RV32-NEXT: vmv1r.v v1, v0 +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsll.vx v24, v8, a1, v0.t -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 4 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 16 -; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; RV32-NEXT: vsll.vx v16, v8, a1, v0.t ; RV32-NEXT: lui a2, 16 ; RV32-NEXT: addi a2, a2, -256 ; RV32-NEXT: vand.vx v24, v8, a2, v0.t ; RV32-NEXT: li a3, 40 ; RV32-NEXT: vsll.vx v24, v24, a3, v0.t -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 3 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 4 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vl8r.v v24, (a4) # Unknown-size Folded Reload -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 3 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vl8r.v v16, 
(a4) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v24, v24, v16, v0.t -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 4 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vor.vv v16, v16, v24, v0.t +; RV32-NEXT: addi a4, sp, 16 +; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill ; RV32-NEXT: lui a4, 4080 -; RV32-NEXT: vand.vx v24, v8, a4, v0.t -; RV32-NEXT: vsll.vi v24, v24, 24, v0.t +; RV32-NEXT: vand.vx v16, v8, a4, v0.t +; RV32-NEXT: vsll.vi v16, v16, 24, v0.t +; RV32-NEXT: csrr a5, vlenb +; RV32-NEXT: slli a5, a5, 4 +; RV32-NEXT: add a5, sp, a5 +; RV32-NEXT: addi a5, a5, 16 +; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill +; RV32-NEXT: addi a5, sp, 8 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a5), zero ; RV32-NEXT: csrr a5, vlenb ; RV32-NEXT: slli a5, a5, 3 ; RV32-NEXT: add a5, sp, a5 ; RV32-NEXT: addi a5, a5, 16 -; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill -; RV32-NEXT: li a5, 32 -; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma -; RV32-NEXT: lui a6, 349525 -; RV32-NEXT: addi a6, a6, 1365 -; RV32-NEXT: vmv.v.i v24, 0 -; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV32-NEXT: lui a7, 1044480 -; RV32-NEXT: vmv.v.x v0, a6 -; RV32-NEXT: vsetvli zero, a5, e32, m8, ta, ma -; RV32-NEXT: vmerge.vxm v24, v24, a7, v0 -; RV32-NEXT: addi a5, sp, 16 -; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vmv1r.v v0, v1 -; RV32-NEXT: vand.vv v16, v8, v24, v0.t +; RV32-NEXT: vand.vv v16, v8, v16, v0.t ; RV32-NEXT: vsll.vi v16, v16, 8, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vor.vv v16, v24, v16, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, sp, 16 ; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v24, v24, v16, v0.t +; RV32-NEXT: vor.vv v16, v24, v16, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vx v24, v8, a1, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vx v24, v8, a3, v0.t -; RV32-NEXT: vand.vx v16, v24, a2, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v24, v16, v24, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v16, v8, 8, v0.t +; RV32-NEXT: vsrl.vx v16, v8, a3, v0.t +; RV32-NEXT: vand.vx v16, v16, a2, v0.t +; RV32-NEXT: vor.vv v16, v16, v24, v0.t ; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v16, v24, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 24, v0.t -; RV32-NEXT: vand.vx v8, v8, a4, 
v0.t -; RV32-NEXT: vor.vv v8, v24, v8, v0.t +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t +; RV32-NEXT: vand.vx v24, v24, a4, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: vor.vv v8, v8, v24, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 @@ -1373,51 +1290,46 @@ ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: sub sp, sp, a1 ; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vx v16, v8, a1 -; RV32-NEXT: li a2, 40 -; RV32-NEXT: vsrl.vx v24, v8, a2 -; RV32-NEXT: lui a3, 16 -; RV32-NEXT: addi a3, a3, -256 -; RV32-NEXT: vand.vx v24, v24, a3 -; RV32-NEXT: vor.vv v16, v24, v16 +; RV32-NEXT: vsll.vx v16, v8, a1 +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, -256 +; RV32-NEXT: vand.vx v24, v8, a2 +; RV32-NEXT: li a3, 40 +; RV32-NEXT: vsll.vx v24, v24, a3 +; RV32-NEXT: vor.vv v16, v16, v24 ; RV32-NEXT: addi a4, sp, 16 ; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: li a4, 32 -; RV32-NEXT: vsetvli zero, a4, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, 0 -; RV32-NEXT: lui a5, 349525 -; RV32-NEXT: addi a5, a5, 1365 -; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV32-NEXT: vmv.v.x v0, a5 -; RV32-NEXT: lui a5, 1044480 -; RV32-NEXT: vsetvli zero, a4, e32, m8, ta, ma -; RV32-NEXT: vmerge.vxm v24, v24, a5, v0 +; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vand.vx v16, v8, a4 +; RV32-NEXT: vsll.vi v0, v16, 24 +; RV32-NEXT: addi a5, sp, 8 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a5), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v0, v8, 8 -; RV32-NEXT: vand.vv v0, v0, v24 -; RV32-NEXT: lui a0, 4080 -; RV32-NEXT: vsrl.vi v16, v8, 24 -; RV32-NEXT: vand.vx v16, v16, a0 -; RV32-NEXT: vor.vv v16, v0, v16 -; RV32-NEXT: addi a4, sp, 16 -; RV32-NEXT: vl8r.v v0, (a4) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v16, v16, v0 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vand.vx v0, v8, a3 -; RV32-NEXT: vsll.vx v0, v0, a2 -; RV32-NEXT: vsll.vx v16, v8, a1 -; RV32-NEXT: vor.vv v16, v16, v0 -; RV32-NEXT: vand.vv v24, v8, v24 +; RV32-NEXT: vand.vv v24, v8, v16 ; RV32-NEXT: vsll.vi v24, v24, 8 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vor.vv v8, v8, v24 -; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: vor.vv v24, v0, v24 ; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vx v0, v8, a3 +; RV32-NEXT: vand.vx v0, v0, a2 +; RV32-NEXT: vsrl.vx v24, v8, a1 +; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: vsrl.vi v0, v8, 8 +; RV32-NEXT: vand.vv v16, v0, v16 +; RV32-NEXT: vsrl.vi v8, v8, 24 +; RV32-NEXT: vand.vx v8, v8, a4 +; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: vor.vv v8, v8, 
v24 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vor.vv v8, v16, v8 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add sp, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll @@ -87,8 +87,13 @@ define void @bswap_v2i64(ptr %x, ptr %y) { ; RV32-LABEL: bswap_v2i64: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsrl.vx v9, v8, a1 ; RV32-NEXT: li a2, 40 @@ -97,31 +102,28 @@ ; RV32-NEXT: addi a3, a3, -256 ; RV32-NEXT: vand.vx v10, v10, a3 ; RV32-NEXT: vor.vv v9, v10, v9 -; RV32-NEXT: vmv.v.i v0, 5 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.i v10, 0 -; RV32-NEXT: lui a4, 1044480 -; RV32-NEXT: vmerge.vxm v10, v10, a4, v0 -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vsrl.vi v11, v8, 8 -; RV32-NEXT: vand.vv v11, v11, v10 -; RV32-NEXT: vsrl.vi v12, v8, 24 +; RV32-NEXT: vsrl.vi v10, v8, 24 +; RV32-NEXT: addi a4, sp, 8 +; RV32-NEXT: vlse64.v v11, (a4), zero ; RV32-NEXT: lui a4, 4080 -; RV32-NEXT: vand.vx v12, v12, a4 -; RV32-NEXT: vor.vv v11, v11, v12 -; RV32-NEXT: vor.vv v9, v11, v9 -; RV32-NEXT: vsll.vx v11, v8, a1 +; RV32-NEXT: vand.vx v10, v10, a4 +; RV32-NEXT: vsrl.vi v12, v8, 8 +; RV32-NEXT: vand.vv v12, v12, v11 +; RV32-NEXT: vor.vv v10, v12, v10 +; RV32-NEXT: vor.vv v9, v10, v9 +; RV32-NEXT: vsll.vx v10, v8, a1 ; RV32-NEXT: vand.vx v12, v8, a3 ; RV32-NEXT: vsll.vx v12, v12, a2 -; RV32-NEXT: vor.vv v11, v11, v12 -; RV32-NEXT: vand.vv v10, v8, v10 -; RV32-NEXT: vsll.vi v10, v10, 8 -; RV32-NEXT: vand.vx v8, v8, a4 -; RV32-NEXT: vsll.vi v8, v8, 24 -; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: vor.vv v8, v11, v8 +; RV32-NEXT: vor.vv v10, v10, v12 +; RV32-NEXT: vand.vx v12, v8, a4 +; RV32-NEXT: vsll.vi v12, v12, 24 +; RV32-NEXT: vand.vv v8, v8, v11 +; RV32-NEXT: vsll.vi v8, v8, 8 +; RV32-NEXT: vor.vv v8, v12, v8 +; RV32-NEXT: vor.vv v8, v10, v8 ; RV32-NEXT: vor.vv v8, v8, v9 ; RV32-NEXT: vse64.v v8, (a0) +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: bswap_v2i64: @@ -357,8 +359,13 @@ define void @bswap_v4i64(ptr %x, ptr %y) { ; LMULMAX2-RV32-LABEL: bswap_v4i64: ; LMULMAX2-RV32: # %bb.0: +; LMULMAX2-RV32-NEXT: addi sp, sp, -16 +; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 16 ; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; LMULMAX2-RV32-NEXT: vle64.v v8, (a0) +; LMULMAX2-RV32-NEXT: sw zero, 12(sp) +; LMULMAX2-RV32-NEXT: lui a1, 1044480 +; LMULMAX2-RV32-NEXT: sw a1, 8(sp) ; LMULMAX2-RV32-NEXT: li a1, 56 ; LMULMAX2-RV32-NEXT: vsrl.vx v10, v8, a1 ; LMULMAX2-RV32-NEXT: li a2, 40 @@ -367,33 +374,28 @@ ; LMULMAX2-RV32-NEXT: addi a3, a3, -256 ; LMULMAX2-RV32-NEXT: vand.vx v12, v12, a3 ; LMULMAX2-RV32-NEXT: vor.vv v10, v12, v10 -; LMULMAX2-RV32-NEXT: li a4, 85 -; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; LMULMAX2-RV32-NEXT: vmv.v.x v0, a4 -; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32-NEXT: vmv.v.i v12, 0 -; LMULMAX2-RV32-NEXT: lui a4, 1044480 -; LMULMAX2-RV32-NEXT: vmerge.vxm v12, v12, a4, v0 -; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32-NEXT: vsrl.vi v14, v8, 8 -; 
LMULMAX2-RV32-NEXT: vand.vv v14, v14, v12 -; LMULMAX2-RV32-NEXT: vsrl.vi v16, v8, 24 +; LMULMAX2-RV32-NEXT: vsrl.vi v12, v8, 24 +; LMULMAX2-RV32-NEXT: addi a4, sp, 8 +; LMULMAX2-RV32-NEXT: vlse64.v v14, (a4), zero ; LMULMAX2-RV32-NEXT: lui a4, 4080 -; LMULMAX2-RV32-NEXT: vand.vx v16, v16, a4 -; LMULMAX2-RV32-NEXT: vor.vv v14, v14, v16 -; LMULMAX2-RV32-NEXT: vor.vv v10, v14, v10 -; LMULMAX2-RV32-NEXT: vsll.vx v14, v8, a1 +; LMULMAX2-RV32-NEXT: vand.vx v12, v12, a4 +; LMULMAX2-RV32-NEXT: vsrl.vi v16, v8, 8 +; LMULMAX2-RV32-NEXT: vand.vv v16, v16, v14 +; LMULMAX2-RV32-NEXT: vor.vv v12, v16, v12 +; LMULMAX2-RV32-NEXT: vor.vv v10, v12, v10 +; LMULMAX2-RV32-NEXT: vsll.vx v12, v8, a1 ; LMULMAX2-RV32-NEXT: vand.vx v16, v8, a3 ; LMULMAX2-RV32-NEXT: vsll.vx v16, v16, a2 -; LMULMAX2-RV32-NEXT: vor.vv v14, v14, v16 -; LMULMAX2-RV32-NEXT: vand.vv v12, v8, v12 -; LMULMAX2-RV32-NEXT: vsll.vi v12, v12, 8 -; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a4 -; LMULMAX2-RV32-NEXT: vsll.vi v8, v8, 24 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v12 -; LMULMAX2-RV32-NEXT: vor.vv v8, v14, v8 +; LMULMAX2-RV32-NEXT: vor.vv v12, v12, v16 +; LMULMAX2-RV32-NEXT: vand.vx v16, v8, a4 +; LMULMAX2-RV32-NEXT: vsll.vi v16, v16, 24 +; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v14 +; LMULMAX2-RV32-NEXT: vsll.vi v8, v8, 8 +; LMULMAX2-RV32-NEXT: vor.vv v8, v16, v8 +; LMULMAX2-RV32-NEXT: vor.vv v8, v12, v8 ; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10 ; LMULMAX2-RV32-NEXT: vse64.v v8, (a0) +; LMULMAX2-RV32-NEXT: addi sp, sp, 16 ; LMULMAX2-RV32-NEXT: ret ; ; LMULMAX2-RV64-LABEL: bswap_v4i64: @@ -433,65 +435,67 @@ ; ; LMULMAX1-RV32-LABEL: bswap_v4i64: ; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi sp, sp, -16 +; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 16 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX1-RV32-NEXT: vle64.v v8, (a0) ; LMULMAX1-RV32-NEXT: addi a1, a0, 16 -; LMULMAX1-RV32-NEXT: vle64.v v8, (a1) -; LMULMAX1-RV32-NEXT: vle64.v v9, (a0) +; LMULMAX1-RV32-NEXT: vle64.v v9, (a1) +; LMULMAX1-RV32-NEXT: sw zero, 12(sp) +; LMULMAX1-RV32-NEXT: lui a2, 1044480 +; LMULMAX1-RV32-NEXT: sw a2, 8(sp) ; LMULMAX1-RV32-NEXT: li a2, 56 -; LMULMAX1-RV32-NEXT: vsrl.vx v10, v8, a2 +; LMULMAX1-RV32-NEXT: vsrl.vx v10, v9, a2 ; LMULMAX1-RV32-NEXT: li a3, 40 -; LMULMAX1-RV32-NEXT: vsrl.vx v11, v8, a3 +; LMULMAX1-RV32-NEXT: vsrl.vx v11, v9, a3 ; LMULMAX1-RV32-NEXT: lui a4, 16 ; LMULMAX1-RV32-NEXT: addi a4, a4, -256 ; LMULMAX1-RV32-NEXT: vand.vx v11, v11, a4 ; LMULMAX1-RV32-NEXT: vor.vv v10, v11, v10 -; LMULMAX1-RV32-NEXT: vmv.v.i v0, 5 -; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-RV32-NEXT: vmv.v.i v11, 0 -; LMULMAX1-RV32-NEXT: lui a5, 1044480 -; LMULMAX1-RV32-NEXT: vmerge.vxm v11, v11, a5, v0 -; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-RV32-NEXT: vsrl.vi v12, v8, 8 -; LMULMAX1-RV32-NEXT: vand.vv v12, v12, v11 -; LMULMAX1-RV32-NEXT: vsrl.vi v13, v8, 24 +; LMULMAX1-RV32-NEXT: vsrl.vi v11, v9, 24 +; LMULMAX1-RV32-NEXT: addi a5, sp, 8 +; LMULMAX1-RV32-NEXT: vlse64.v v12, (a5), zero ; LMULMAX1-RV32-NEXT: lui a5, 4080 -; LMULMAX1-RV32-NEXT: vand.vx v13, v13, a5 -; LMULMAX1-RV32-NEXT: vor.vv v12, v12, v13 -; LMULMAX1-RV32-NEXT: vor.vv v10, v12, v10 -; LMULMAX1-RV32-NEXT: vsll.vx v12, v8, a2 -; LMULMAX1-RV32-NEXT: vand.vx v13, v8, a4 -; LMULMAX1-RV32-NEXT: vsll.vx v13, v13, a3 -; LMULMAX1-RV32-NEXT: vor.vv v12, v12, v13 -; LMULMAX1-RV32-NEXT: vand.vv v13, v8, v11 -; LMULMAX1-RV32-NEXT: vsll.vi v13, v13, 8 -; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a5 -; LMULMAX1-RV32-NEXT: vsll.vi v8, v8, 24 -; 
LMULMAX1-RV32-NEXT: vor.vv v8, v8, v13 -; LMULMAX1-RV32-NEXT: vor.vv v8, v12, v8 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: vsrl.vx v10, v9, a2 -; LMULMAX1-RV32-NEXT: vsrl.vx v12, v9, a3 -; LMULMAX1-RV32-NEXT: vand.vx v12, v12, a4 -; LMULMAX1-RV32-NEXT: vor.vv v10, v12, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v12, v9, 8 -; LMULMAX1-RV32-NEXT: vand.vv v12, v12, v11 -; LMULMAX1-RV32-NEXT: vsrl.vi v13, v9, 24 -; LMULMAX1-RV32-NEXT: vand.vx v13, v13, a5 -; LMULMAX1-RV32-NEXT: vor.vv v12, v12, v13 -; LMULMAX1-RV32-NEXT: vor.vv v10, v12, v10 -; LMULMAX1-RV32-NEXT: vsll.vx v12, v9, a2 -; LMULMAX1-RV32-NEXT: vand.vx v13, v9, a4 -; LMULMAX1-RV32-NEXT: vsll.vx v13, v13, a3 -; LMULMAX1-RV32-NEXT: vor.vv v12, v12, v13 -; LMULMAX1-RV32-NEXT: vand.vv v11, v9, v11 +; LMULMAX1-RV32-NEXT: vand.vx v11, v11, a5 +; LMULMAX1-RV32-NEXT: vsrl.vi v13, v9, 8 +; LMULMAX1-RV32-NEXT: vand.vv v13, v13, v12 +; LMULMAX1-RV32-NEXT: vor.vv v11, v13, v11 +; LMULMAX1-RV32-NEXT: vor.vv v10, v11, v10 +; LMULMAX1-RV32-NEXT: vand.vv v11, v9, v12 ; LMULMAX1-RV32-NEXT: vsll.vi v11, v11, 8 -; LMULMAX1-RV32-NEXT: vand.vx v9, v9, a5 -; LMULMAX1-RV32-NEXT: vsll.vi v9, v9, 24 +; LMULMAX1-RV32-NEXT: vand.vx v13, v9, a5 +; LMULMAX1-RV32-NEXT: vsll.vi v13, v13, 24 +; LMULMAX1-RV32-NEXT: vor.vv v11, v13, v11 +; LMULMAX1-RV32-NEXT: vsll.vx v13, v9, a2 +; LMULMAX1-RV32-NEXT: vand.vx v9, v9, a4 +; LMULMAX1-RV32-NEXT: vsll.vx v9, v9, a3 +; LMULMAX1-RV32-NEXT: vor.vv v9, v13, v9 ; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v11 -; LMULMAX1-RV32-NEXT: vor.vv v9, v12, v9 ; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vse64.v v9, (a0) -; LMULMAX1-RV32-NEXT: vse64.v v8, (a1) +; LMULMAX1-RV32-NEXT: vsrl.vx v10, v8, a2 +; LMULMAX1-RV32-NEXT: vsrl.vx v11, v8, a3 +; LMULMAX1-RV32-NEXT: vand.vx v11, v11, a4 +; LMULMAX1-RV32-NEXT: vor.vv v10, v11, v10 +; LMULMAX1-RV32-NEXT: vsrl.vi v11, v8, 24 +; LMULMAX1-RV32-NEXT: vand.vx v11, v11, a5 +; LMULMAX1-RV32-NEXT: vsrl.vi v13, v8, 8 +; LMULMAX1-RV32-NEXT: vand.vv v13, v13, v12 +; LMULMAX1-RV32-NEXT: vor.vv v11, v13, v11 +; LMULMAX1-RV32-NEXT: vor.vv v10, v11, v10 +; LMULMAX1-RV32-NEXT: vsll.vx v11, v8, a2 +; LMULMAX1-RV32-NEXT: vand.vx v13, v8, a4 +; LMULMAX1-RV32-NEXT: vsll.vx v13, v13, a3 +; LMULMAX1-RV32-NEXT: vor.vv v11, v11, v13 +; LMULMAX1-RV32-NEXT: vand.vx v13, v8, a5 +; LMULMAX1-RV32-NEXT: vsll.vi v13, v13, 24 +; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v12 +; LMULMAX1-RV32-NEXT: vsll.vi v8, v8, 8 +; LMULMAX1-RV32-NEXT: vor.vv v8, v13, v8 +; LMULMAX1-RV32-NEXT: vor.vv v8, v11, v8 +; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10 +; LMULMAX1-RV32-NEXT: vse64.v v8, (a0) +; LMULMAX1-RV32-NEXT: vse64.v v9, (a1) +; LMULMAX1-RV32-NEXT: addi sp, sp, 16 ; LMULMAX1-RV32-NEXT: ret ; ; LMULMAX1-RV64-LABEL: bswap_v4i64: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll @@ -1449,6 +1449,24 @@ define <2 x i64> @vp_ctlz_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_v2i64: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw 
a1, 8(sp) +; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t ; RV32-NEXT: vor.vv v8, v8, v9, v0.t @@ -1463,22 +1481,17 @@ ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v9, v8, a1, v0.t ; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.i v9, -1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v9, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v10, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vand.vv v9, v9, v10, v0.t ; RV32-NEXT: vsub.vv v8, v8, v9, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vand.vv v10, v8, v9, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t @@ -1486,20 +1499,19 @@ ; RV32-NEXT: vadd.vv v8, v10, v8, v0.t ; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v9, v0.t -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 +; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v9, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_v2i64: @@ -1556,6 +1568,24 @@ define <2 x i64> @vp_ctlz_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_v2i64_unmasked: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vsrl.vi v9, v8, 1 ; RV32-NEXT: vor.vv v8, v8, v9 @@ -1570,22 +1600,17 @@ ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v9, v8, a1 ; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.i v9, -1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v9 +; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, 
a1 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v10, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vand.vv v9, v9, v10 ; RV32-NEXT: vsub.vv v8, v8, v9 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vand.vv v10, v8, v9 ; RV32-NEXT: vsrl.vi v8, v8, 2 @@ -1593,20 +1618,19 @@ ; RV32-NEXT: vadd.vv v8, v10, v8 ; RV32-NEXT: vsrl.vi v9, v8, 4 ; RV32-NEXT: vadd.vv v8, v8, v9 -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 +; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v9 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_v2i64_unmasked: @@ -1667,6 +1691,24 @@ define <4 x i64> @vp_ctlz_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_v4i64: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t ; RV32-NEXT: vor.vv v8, v8, v10, v0.t @@ -1681,22 +1723,17 @@ ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v10, v8, a1, v0.t ; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.i v10, -1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v10, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v12, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vand.vv v10, v10, v12, v0.t ; RV32-NEXT: vsub.vv v8, v8, v10, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vand.vv v12, v8, v10, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t @@ -1704,20 +1741,19 @@ ; RV32-NEXT: vadd.vv v8, v12, v8, v0.t ; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t ; RV32-NEXT: 
vadd.vv v8, v8, v10, v0.t -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vand.vv v8, v8, v10, v0.t -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v10, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_v4i64: @@ -1774,6 +1810,24 @@ define <4 x i64> @vp_ctlz_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_v4i64_unmasked: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vsrl.vi v10, v8, 1 ; RV32-NEXT: vor.vv v8, v8, v10 @@ -1788,22 +1842,17 @@ ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v10, v8, a1 ; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.i v10, -1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v10 +; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: vsrl.vi v10, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v12, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vand.vv v10, v10, v12 ; RV32-NEXT: vsub.vv v8, v8, v10 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vand.vv v12, v8, v10 ; RV32-NEXT: vsrl.vi v8, v8, 2 @@ -1811,20 +1860,19 @@ ; RV32-NEXT: vadd.vv v8, v12, v8 ; RV32-NEXT: vsrl.vi v10, v8, 4 ; RV32-NEXT: vadd.vv v8, v8, v10 -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v10 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 +; RV32-NEXT: 
addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_v4i64_unmasked: @@ -1885,6 +1933,24 @@ define <8 x i64> @vp_ctlz_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_v8i64: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t ; RV32-NEXT: vor.vv v8, v8, v12, v0.t @@ -1899,22 +1965,17 @@ ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v12, v8, a1, v0.t ; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.i v12, -1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v12, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vand.vv v12, v12, v16, v0.t ; RV32-NEXT: vsub.vv v8, v8, v12, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vand.vv v16, v8, v12, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t @@ -1922,20 +1983,19 @@ ; RV32-NEXT: vadd.vv v8, v16, v8, v0.t ; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v12, v0.t -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vand.vv v8, v8, v12, v0.t -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v12, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_v8i64: @@ -1992,6 +2052,24 @@ define <8 x i64> @vp_ctlz_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_v8i64_unmasked: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: lui a1, 4112 +; 
RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vsrl.vi v12, v8, 1 ; RV32-NEXT: vor.vv v8, v8, v12 @@ -2006,22 +2084,17 @@ ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v12, v8, a1 ; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.i v12, -1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v12 +; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: vsrl.vi v12, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vand.vv v12, v12, v16 ; RV32-NEXT: vsub.vv v8, v8, v12 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vand.vv v16, v8, v12 ; RV32-NEXT: vsrl.vi v8, v8, 2 @@ -2029,20 +2102,19 @@ ; RV32-NEXT: vadd.vv v8, v16, v8 ; RV32-NEXT: vsrl.vi v12, v8, 4 ; RV32-NEXT: vadd.vv v8, v8, v12 -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v12 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_v8i64_unmasked: @@ -2103,6 +2175,24 @@ define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_v15i64: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t @@ -2117,22 +2207,17 @@ ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v16, -1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v16, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV32-NEXT: lui a2, 349525 -; RV32-NEXT: addi a2, a2, 1365 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a2 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli 
zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v16, v24, v0.t ; RV32-NEXT: vsub.vv v8, v8, v16, v0.t -; RV32-NEXT: lui a2, 209715 -; RV32-NEXT: addi a2, a2, 819 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v24, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t @@ -2140,20 +2225,19 @@ ; RV32-NEXT: vadd.vv v8, v24, v8, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: lui a2, 61681 -; RV32-NEXT: addi a2, a2, -241 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: lui a2, 4112 -; RV32-NEXT: addi a2, a2, 257 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_v15i64: @@ -2210,6 +2294,24 @@ define <15 x i64> @vp_ctlz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_v15i64_unmasked: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsrl.vi v16, v8, 1 ; RV32-NEXT: vor.vv v8, v8, v16 @@ -2224,22 +2326,17 @@ ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v16, v8, a1 ; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v16, -1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v16 +; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: lui a2, 349525 -; RV32-NEXT: addi a2, a2, 1365 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a2 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: lui a2, 209715 -; RV32-NEXT: addi a2, a2, 819 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v24, v8, v16 ; RV32-NEXT: vsrl.vi v8, v8, 2 @@ -2247,20 +2344,19 @@ ; RV32-NEXT: vadd.vv v8, v24, v8 ; RV32-NEXT: vsrl.vi v16, v8, 4 ; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: lui a2, 61681 -; 
RV32-NEXT: addi a2, a2, -241 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: lui a2, 4112 -; RV32-NEXT: addi a2, a2, 257 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_v15i64_unmasked: @@ -2321,6 +2417,24 @@ define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_v16i64: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t @@ -2335,22 +2449,17 @@ ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v16, -1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v16, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV32-NEXT: lui a2, 349525 -; RV32-NEXT: addi a2, a2, 1365 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a2 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v16, v24, v0.t ; RV32-NEXT: vsub.vv v8, v8, v16, v0.t -; RV32-NEXT: lui a2, 209715 -; RV32-NEXT: addi a2, a2, 819 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v24, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t @@ -2358,20 +2467,19 @@ ; RV32-NEXT: vadd.vv v8, v24, v8, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: lui a2, 61681 -; RV32-NEXT: addi a2, a2, -241 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: lui a2, 4112 -; RV32-NEXT: addi a2, a2, 257 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: li a0, 56 ; 
RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_v16i64: @@ -2428,6 +2536,24 @@ define <16 x i64> @vp_ctlz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_v16i64_unmasked: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsrl.vi v16, v8, 1 ; RV32-NEXT: vor.vv v8, v8, v16 @@ -2442,22 +2568,17 @@ ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v16, v8, a1 ; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v16, -1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v16 +; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: lui a2, 349525 -; RV32-NEXT: addi a2, a2, 1365 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a2 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: lui a2, 209715 -; RV32-NEXT: addi a2, a2, 819 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v24, v8, v16 ; RV32-NEXT: vsrl.vi v8, v8, 2 @@ -2465,20 +2586,19 @@ ; RV32-NEXT: vadd.vv v8, v24, v8 ; RV32-NEXT: vsrl.vi v16, v8, 4 ; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: lui a2, 61681 -; RV32-NEXT: addi a2, a2, -241 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: lui a2, 4112 -; RV32-NEXT: addi a2, a2, 257 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_v16i64_unmasked: @@ -2539,23 +2659,39 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_v32i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: addi sp, sp, -48 +; RV32-NEXT: .cfi_def_cfa_offset 48 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: li a2, 56 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 56 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 
0x1e, 0x22 # sp + 48 + 56 * vlenb ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: addi a1, a1, 48 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: li a1, 16 ; RV32-NEXT: vslidedown.vi v24, v0, 2 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 44(sp) +; RV32-NEXT: sw a1, 40(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 36(sp) +; RV32-NEXT: sw a1, 32(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: li a3, 16 +; RV32-NEXT: sw a1, 16(sp) ; RV32-NEXT: mv a2, a0 -; RV32-NEXT: bltu a0, a1, .LBB34_2 +; RV32-NEXT: bltu a0, a3, .LBB34_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a2, 16 ; RV32-NEXT: .LBB34_2: @@ -2573,124 +2709,115 @@ ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v16, -1 +; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 +; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v16, v0.t +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 40 +; RV32-NEXT: li a4, 48 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: lui a3, 349525 -; RV32-NEXT: addi a3, a3, 1365 -; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 48 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v8, a3 +; RV32-NEXT: addi a3, sp, 40 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 +; RV32-NEXT: li a4, 24 +; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 48 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vsub.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: lui a3, 209715 -; RV32-NEXT: addi a3, a3, 819 -; RV32-NEXT: vsetvli zero, 
a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v8, a3 +; RV32-NEXT: addi a3, sp, 32 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 48 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t ; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v16, v8, v16, v0.t -; RV32-NEXT: lui a3, 61681 -; RV32-NEXT: addi a3, a3, -241 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v8, a3 +; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: addi a3, sp, 24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a3), zero ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v16, v8, v0.t -; RV32-NEXT: lui a3, 4112 -; RV32-NEXT: addi a3, a3, 257 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a3 +; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a3), zero +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: li a2, 56 ; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: addi a3, a0, -16 ; RV32-NEXT: sltu a0, a0, a3 @@ -2701,7 +2828,7 @@ ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t ; RV32-NEXT: vor.vv v8, v16, v8, v0.t @@ -2715,67 +2842,54 @@ ; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vx v16, v8, 
a1, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 24 -; RV32-NEXT: mul a0, a0, a1 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vxor.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 24 -; RV32-NEXT: mul a0, a0, a1 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vnot.v v8, v8, v0.t +; RV32-NEXT: addi a0, sp, 48 ; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v16, v8, v16, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 24 -; RV32-NEXT: mul a0, a0, a1 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, sp, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vsub.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a1, 48 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v16, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a1, 48 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vadd.vv v8, v16, v8, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t @@ -2784,23 +2898,26 @@ ; RV32-NEXT: li a1, 40 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a1, 56 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: addi sp, sp, 48 ; 
RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_v32i64: @@ -2920,21 +3037,36 @@ define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_v32i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 16 -; RV32-NEXT: vmv8r.v v0, v16 -; RV32-NEXT: mv a2, a0 -; RV32-NEXT: bltu a0, a1, .LBB35_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a2, 16 -; RV32-NEXT: .LBB35_2: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: addi sp, sp, -48 +; RV32-NEXT: .cfi_def_cfa_offset 48 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 40 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 32 * vlenb +; RV32-NEXT: vmv8r.v v24, v16 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 44(sp) +; RV32-NEXT: sw a1, 40(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 36(sp) +; RV32-NEXT: sw a1, 32(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: li a2, 16 +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: bltu a0, a2, .LBB35_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a1, 16 +; RV32-NEXT: .LBB35_2: +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vsrl.vi v16, v8, 1 ; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: vsrl.vi v16, v8, 2 @@ -2945,127 +3077,109 @@ ; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: vsrl.vi v16, v8, 16 ; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsrl.vx v16, v8, a1 +; RV32-NEXT: li a2, 32 +; RV32-NEXT: vsrl.vx v16, v8, a2 ; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v16, -1 -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 5 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v16 +; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: lui a3, 349525 -; RV32-NEXT: addi a3, a3, 1365 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a3 +; RV32-NEXT: addi a3, sp, 40 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v0, (a3), zero ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 24 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v0, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vand.vv v16, v16, v0 ; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: lui a3, 209715 -; RV32-NEXT: addi a3, a3, 819 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a3 -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16 +; RV32-NEXT: addi a3, sp, 32 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v0, (a3), zero +; RV32-NEXT: vsetvli zero, 
a1, e64, m8, ta, ma +; RV32-NEXT: vand.vv v16, v8, v0 ; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vadd.vv v8, v24, v8 -; RV32-NEXT: vsrl.vi v24, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v24 -; RV32-NEXT: lui a3, 61681 -; RV32-NEXT: addi a3, a3, -241 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a3 +; RV32-NEXT: vand.vv v8, v8, v0 +; RV32-NEXT: vadd.vv v8, v16, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 +; RV32-NEXT: addi a3, sp, 24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a3), zero ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v24 -; RV32-NEXT: lui a3, 4112 -; RV32-NEXT: addi a3, a3, 257 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: addi a3, sp, 16 -; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v24 -; RV32-NEXT: li a2, 56 -; RV32-NEXT: vsrl.vx v8, v8, a2 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a3), zero +; RV32-NEXT: addi a3, sp, 48 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vmul.vv v8, v8, v16 +; RV32-NEXT: li a1, 56 +; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 3 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: addi a3, a0, -16 ; RV32-NEXT: sltu a0, a0, a3 ; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: and a0, a0, a3 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v0, 1 -; RV32-NEXT: vor.vv v8, v0, v8 -; RV32-NEXT: vsrl.vi v0, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v0 -; RV32-NEXT: vsrl.vi v0, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v0 -; RV32-NEXT: vsrl.vi v0, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v0 -; RV32-NEXT: vsrl.vi v0, v8, 16 -; RV32-NEXT: vor.vv v8, v8, v0 -; RV32-NEXT: vsrl.vx v0, v8, a1 -; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: vsrl.vi v8, v24, 1 +; RV32-NEXT: vor.vv v8, v24, v8 +; RV32-NEXT: vsrl.vi v24, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vsrl.vi v24, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vsrl.vi v24, v8, 16 +; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vsrl.vx v24, v8, a2 +; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vsrl.vi v24, v8, 1 ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vxor.vv v8, v8, v0 -; RV32-NEXT: vsrl.vi v0, v8, 1 -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 24 -; RV32-NEXT: mul a0, a0, a1 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v0, v0, v24 -; RV32-NEXT: vsub.vv v8, v8, v0 -; RV32-NEXT: vand.vv v0, v8, v16 +; RV32-NEXT: addi a0, a0, 48 +; 
RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v24, v24, v16 +; RV32-NEXT: vsub.vv v8, v8, v24 +; RV32-NEXT: vand.vv v24, v8, v0 ; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vadd.vv v8, v0, v8 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 +; RV32-NEXT: vand.vv v8, v8, v0 +; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: addi a0, sp, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vmul.vv v8, v8, v16 -; RV32-NEXT: vsrl.vx v16, v8, a2 +; RV32-NEXT: vsrl.vx v16, v8, a1 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: addi sp, sp, 48 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_v32i64_unmasked: @@ -4576,6 +4690,24 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_zero_undef_v2i64: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t ; RV32-NEXT: vor.vv v8, v8, v9, v0.t @@ -4590,22 +4722,17 @@ ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v9, v8, a1, v0.t ; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.i v9, -1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v9, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v10, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vand.vv v9, v9, v10, v0.t ; RV32-NEXT: vsub.vv v8, v8, v9, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vand.vv v10, v8, v9, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t @@ -4613,20 +4740,19 @@ ; RV32-NEXT: vadd.vv v8, v10, v8, v0.t ; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v9, v0.t -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma 
-; RV32-NEXT: vmv.v.x v9, a1 +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 +; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v9, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_zero_undef_v2i64: @@ -4683,6 +4809,24 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_zero_undef_v2i64_unmasked: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vsrl.vi v9, v8, 1 ; RV32-NEXT: vor.vv v8, v8, v9 @@ -4697,22 +4841,17 @@ ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v9, v8, a1 ; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.i v9, -1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v9 +; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v10, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vand.vv v9, v9, v10 ; RV32-NEXT: vsub.vv v8, v8, v9 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vand.vv v10, v8, v9 ; RV32-NEXT: vsrl.vi v8, v8, 2 @@ -4720,20 +4859,19 @@ ; RV32-NEXT: vadd.vv v8, v10, v8 ; RV32-NEXT: vsrl.vi v9, v8, 4 ; RV32-NEXT: vadd.vv v8, v8, v9 -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 +; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v9 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_zero_undef_v2i64_unmasked: @@ -4792,6 +4930,24 @@ define <4 x i64> 
@vp_ctlz_zero_undef_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_zero_undef_v4i64: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t ; RV32-NEXT: vor.vv v8, v8, v10, v0.t @@ -4806,22 +4962,17 @@ ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v10, v8, a1, v0.t ; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.i v10, -1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v10, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v12, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vand.vv v10, v10, v12, v0.t ; RV32-NEXT: vsub.vv v8, v8, v10, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vand.vv v12, v8, v10, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t @@ -4829,20 +4980,19 @@ ; RV32-NEXT: vadd.vv v8, v12, v8, v0.t ; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v10, v0.t -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vand.vv v8, v8, v10, v0.t -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v10, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_zero_undef_v4i64: @@ -4899,6 +5049,24 @@ define <4 x i64> @vp_ctlz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_zero_undef_v4i64_unmasked: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: sw a1, 4(sp) +; 
RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vsrl.vi v10, v8, 1 ; RV32-NEXT: vor.vv v8, v8, v10 @@ -4913,22 +5081,17 @@ ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v10, v8, a1 ; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.i v10, -1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v10 +; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: vsrl.vi v10, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v12, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vand.vv v10, v10, v12 ; RV32-NEXT: vsub.vv v8, v8, v10 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vand.vv v12, v8, v10 ; RV32-NEXT: vsrl.vi v8, v8, 2 @@ -4936,20 +5099,19 @@ ; RV32-NEXT: vadd.vv v8, v12, v8 ; RV32-NEXT: vsrl.vi v10, v8, 4 ; RV32-NEXT: vadd.vv v8, v8, v10 -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v10 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_zero_undef_v4i64_unmasked: @@ -5008,6 +5170,24 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_zero_undef_v8i64: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t ; RV32-NEXT: vor.vv v8, v8, v12, v0.t @@ -5022,22 +5202,17 @@ ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v12, v8, a1, v0.t ; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.i v12, -1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v12, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; 
RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vand.vv v12, v12, v16, v0.t ; RV32-NEXT: vsub.vv v8, v8, v12, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vand.vv v16, v8, v12, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t @@ -5045,20 +5220,19 @@ ; RV32-NEXT: vadd.vv v8, v16, v8, v0.t ; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v12, v0.t -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vand.vv v8, v8, v12, v0.t -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v12, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_zero_undef_v8i64: @@ -5115,6 +5289,24 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_zero_undef_v8i64_unmasked: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vsrl.vi v12, v8, 1 ; RV32-NEXT: vor.vv v8, v8, v12 @@ -5129,22 +5321,17 @@ ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v12, v8, a1 ; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.i v12, -1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v12 +; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: vsrl.vi v12, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vand.vv v12, v12, v16 ; RV32-NEXT: vsub.vv v8, v8, v12 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vand.vv v16, v8, v12 ; RV32-NEXT: vsrl.vi v8, v8, 2 @@ -5152,20 +5339,19 @@ ; RV32-NEXT: vadd.vv v8, v16, v8 ; RV32-NEXT: vsrl.vi v12, v8, 4 ; RV32-NEXT: vadd.vv v8, v8, v12 -; RV32-NEXT: lui a1, 61681 -; 
RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v12 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_zero_undef_v8i64_unmasked: @@ -5224,6 +5410,24 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_zero_undef_v15i64: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t @@ -5238,22 +5442,17 @@ ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v16, -1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v16, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV32-NEXT: lui a2, 349525 -; RV32-NEXT: addi a2, a2, 1365 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a2 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v16, v24, v0.t ; RV32-NEXT: vsub.vv v8, v8, v16, v0.t -; RV32-NEXT: lui a2, 209715 -; RV32-NEXT: addi a2, a2, 819 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v24, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t @@ -5261,20 +5460,19 @@ ; RV32-NEXT: vadd.vv v8, v24, v8, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: lui a2, 61681 -; RV32-NEXT: addi a2, a2, -241 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: lui a2, 4112 -; RV32-NEXT: addi a2, a2, 257 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; 
RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_zero_undef_v15i64: @@ -5331,6 +5529,24 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_zero_undef_v15i64_unmasked: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsrl.vi v16, v8, 1 ; RV32-NEXT: vor.vv v8, v8, v16 @@ -5345,22 +5561,17 @@ ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v16, v8, a1 ; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v16, -1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v16 +; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: lui a2, 349525 -; RV32-NEXT: addi a2, a2, 1365 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a2 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: lui a2, 209715 -; RV32-NEXT: addi a2, a2, 819 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v24, v8, v16 ; RV32-NEXT: vsrl.vi v8, v8, 2 @@ -5368,20 +5579,19 @@ ; RV32-NEXT: vadd.vv v8, v24, v8 ; RV32-NEXT: vsrl.vi v16, v8, 4 ; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: lui a2, 61681 -; RV32-NEXT: addi a2, a2, -241 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: lui a2, 4112 -; RV32-NEXT: addi a2, a2, 257 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_zero_undef_v15i64_unmasked: @@ -5440,6 +5650,24 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_zero_undef_v16i64: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 12(sp) +; 
RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t @@ -5454,22 +5682,17 @@ ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v16, -1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v16, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV32-NEXT: lui a2, 349525 -; RV32-NEXT: addi a2, a2, 1365 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a2 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v16, v24, v0.t ; RV32-NEXT: vsub.vv v8, v8, v16, v0.t -; RV32-NEXT: lui a2, 209715 -; RV32-NEXT: addi a2, a2, 819 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v24, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t @@ -5477,20 +5700,19 @@ ; RV32-NEXT: vadd.vv v8, v24, v8, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: lui a2, 61681 -; RV32-NEXT: addi a2, a2, -241 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: lui a2, 4112 -; RV32-NEXT: addi a2, a2, 257 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_zero_undef_v16i64: @@ -5547,6 +5769,24 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_zero_undef_v16i64_unmasked: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsrl.vi v16, v8, 1 ; RV32-NEXT: vor.vv v8, v8, v16 @@ -5561,22 +5801,17 @@ ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v16, v8, a1 ; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v16, -1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v16 +; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: lui a2, 349525 -; RV32-NEXT: addi a2, 
a2, 1365 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a2 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: lui a2, 209715 -; RV32-NEXT: addi a2, a2, 819 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v24, v8, v16 ; RV32-NEXT: vsrl.vi v8, v8, 2 @@ -5584,20 +5819,19 @@ ; RV32-NEXT: vadd.vv v8, v24, v8 ; RV32-NEXT: vsrl.vi v16, v8, 4 ; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: lui a2, 61681 -; RV32-NEXT: addi a2, a2, -241 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: lui a2, 4112 -; RV32-NEXT: addi a2, a2, 257 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_zero_undef_v16i64_unmasked: @@ -5656,23 +5890,39 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_zero_undef_v32i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: addi sp, sp, -48 +; RV32-NEXT: .cfi_def_cfa_offset 48 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: li a2, 56 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 56 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: addi a1, a1, 48 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: li a1, 16 ; RV32-NEXT: vslidedown.vi v24, v0, 2 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 44(sp) +; RV32-NEXT: sw a1, 40(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 36(sp) +; RV32-NEXT: sw a1, 32(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: li a3, 16 +; RV32-NEXT: sw a1, 16(sp) ; RV32-NEXT: mv a2, a0 -; RV32-NEXT: bltu a0, a1, .LBB70_2 +; RV32-NEXT: bltu a0, a3, .LBB70_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a2, 16 ; RV32-NEXT: .LBB70_2: @@ -5690,124 +5940,115 @@ ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v16, -1 +; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: csrr a3, vlenb -; 
RV32-NEXT: li a4, 24 +; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v16, v0.t +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 40 +; RV32-NEXT: li a4, 48 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: lui a3, 349525 -; RV32-NEXT: addi a3, a3, 1365 -; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 48 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v8, a3 +; RV32-NEXT: addi a3, sp, 40 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 4 +; RV32-NEXT: li a4, 24 +; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 48 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vsub.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: lui a3, 209715 -; RV32-NEXT: addi a3, a3, 819 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v8, a3 +; RV32-NEXT: addi a3, sp, 32 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v8, (a3), zero ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 48 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t ; RV32-NEXT: vand.vv v16, v16, v8, v0.t ; RV32-NEXT: csrr a3, 
vlenb -; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v16, v8, v16, v0.t -; RV32-NEXT: lui a3, 61681 -; RV32-NEXT: addi a3, a3, -241 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v8, a3 +; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: addi a3, sp, 24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a3), zero ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v16, v8, v0.t -; RV32-NEXT: lui a3, 4112 -; RV32-NEXT: addi a3, a3, 257 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a3 +; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a3), zero +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: li a2, 56 ; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 3 +; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: addi a3, a0, -16 ; RV32-NEXT: sltu a0, a0, a3 @@ -5818,7 +6059,7 @@ ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t ; RV32-NEXT: vor.vv v8, v16, v8, v0.t @@ -5832,67 +6073,54 @@ ; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 24 -; RV32-NEXT: mul a0, a0, a1 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vxor.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 24 -; RV32-NEXT: mul a0, a0, a1 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vnot.v v8, v8, v0.t +; RV32-NEXT: addi a0, sp, 48 ; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: 
vand.vv v16, v8, v16, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 24 -; RV32-NEXT: mul a0, a0, a1 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, sp, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vsub.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a1, 48 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v16, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a1, 48 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vadd.vv v8, v16, v8, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t @@ -5901,23 +6129,26 @@ ; RV32-NEXT: li a1, 40 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a1, 56 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: addi sp, sp, 48 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_zero_undef_v32i64: @@ -6037,21 +6268,36 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_zero_undef_v32i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 16 -; RV32-NEXT: vmv8r.v v0, v16 -; RV32-NEXT: mv a2, a0 -; RV32-NEXT: bltu a0, a1, .LBB71_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a2, 16 -; RV32-NEXT: .LBB71_2: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: addi sp, sp, -48 +; RV32-NEXT: .cfi_def_cfa_offset 48 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 40 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 32 * vlenb +; RV32-NEXT: vmv8r.v v24, v16 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 44(sp) +; RV32-NEXT: sw a1, 40(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 36(sp) +; 
RV32-NEXT: sw a1, 32(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: li a2, 16 +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: bltu a0, a2, .LBB71_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a1, 16 +; RV32-NEXT: .LBB71_2: +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vsrl.vi v16, v8, 1 ; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: vsrl.vi v16, v8, 2 @@ -6062,127 +6308,109 @@ ; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: vsrl.vi v16, v8, 16 ; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsrl.vx v16, v8, a1 +; RV32-NEXT: li a2, 32 +; RV32-NEXT: vsrl.vx v16, v8, a2 ; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v16, -1 -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 5 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v16 +; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: lui a3, 349525 -; RV32-NEXT: addi a3, a3, 1365 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a3 +; RV32-NEXT: addi a3, sp, 40 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v0, (a3), zero ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: li a4, 24 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v0, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vand.vv v16, v16, v0 ; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: lui a3, 209715 -; RV32-NEXT: addi a3, a3, 819 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a3 -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16 +; RV32-NEXT: addi a3, sp, 32 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v0, (a3), zero +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vand.vv v16, v8, v0 ; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vadd.vv v8, v24, v8 -; RV32-NEXT: vsrl.vi v24, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v24 -; RV32-NEXT: lui a3, 61681 -; RV32-NEXT: addi a3, a3, -241 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a3 +; RV32-NEXT: vand.vv v8, v8, v0 +; RV32-NEXT: vadd.vv v8, v16, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 +; RV32-NEXT: addi a3, sp, 24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a3), zero ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v24 -; RV32-NEXT: lui a3, 4112 -; RV32-NEXT: addi a3, a3, 257 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: addi a3, sp, 16 -; RV32-NEXT: vs8r.v v24, 
(a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v24 -; RV32-NEXT: li a2, 56 -; RV32-NEXT: vsrl.vx v8, v8, a2 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a3), zero +; RV32-NEXT: addi a3, sp, 48 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vmul.vv v8, v8, v16 +; RV32-NEXT: li a1, 56 +; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 3 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: addi a3, a0, -16 ; RV32-NEXT: sltu a0, a0, a3 ; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: and a0, a0, a3 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v8, v0, 1 -; RV32-NEXT: vor.vv v8, v0, v8 -; RV32-NEXT: vsrl.vi v0, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v0 -; RV32-NEXT: vsrl.vi v0, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v0 -; RV32-NEXT: vsrl.vi v0, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v0 -; RV32-NEXT: vsrl.vi v0, v8, 16 -; RV32-NEXT: vor.vv v8, v8, v0 -; RV32-NEXT: vsrl.vx v0, v8, a1 -; RV32-NEXT: vor.vv v8, v8, v0 -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vxor.vv v8, v8, v0 -; RV32-NEXT: vsrl.vi v0, v8, 1 +; RV32-NEXT: vsrl.vi v8, v24, 1 +; RV32-NEXT: vor.vv v8, v24, v8 +; RV32-NEXT: vsrl.vi v24, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vsrl.vi v24, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vsrl.vi v24, v8, 16 +; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vsrl.vx v24, v8, a2 +; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vsrl.vi v24, v8, 1 ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 24 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v0, v0, v24 -; RV32-NEXT: vsub.vv v8, v8, v0 -; RV32-NEXT: vand.vv v0, v8, v16 +; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v24, v24, v16 +; RV32-NEXT: vsub.vv v8, v8, v24 +; RV32-NEXT: vand.vv v24, v8, v0 ; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vadd.vv v8, v0, v8 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 +; RV32-NEXT: vand.vv v8, v8, v0 +; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: addi a0, sp, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vmul.vv v8, v8, v16 -; RV32-NEXT: vsrl.vx v16, v8, a2 +; RV32-NEXT: vsrl.vx v16, v8, a1 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; 
RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: addi sp, sp, 48 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_zero_undef_v32i64_unmasked: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll @@ -460,8 +460,25 @@ define void @ctlz_v2i64(ptr %x, ptr %y) nounwind { ; LMULMAX2-RV32I-LABEL: ctlz_v2i64: ; LMULMAX2-RV32I: # %bb.0: +; LMULMAX2-RV32I-NEXT: addi sp, sp, -32 ; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX2-RV32I-NEXT: vle64.v v8, (a0) +; LMULMAX2-RV32I-NEXT: lui a1, 349525 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 1365 +; LMULMAX2-RV32I-NEXT: sw a1, 28(sp) +; LMULMAX2-RV32I-NEXT: sw a1, 24(sp) +; LMULMAX2-RV32I-NEXT: lui a1, 209715 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 819 +; LMULMAX2-RV32I-NEXT: sw a1, 20(sp) +; LMULMAX2-RV32I-NEXT: sw a1, 16(sp) +; LMULMAX2-RV32I-NEXT: lui a1, 61681 +; LMULMAX2-RV32I-NEXT: addi a1, a1, -241 +; LMULMAX2-RV32I-NEXT: sw a1, 12(sp) +; LMULMAX2-RV32I-NEXT: sw a1, 8(sp) +; LMULMAX2-RV32I-NEXT: lui a1, 4112 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 257 +; LMULMAX2-RV32I-NEXT: sw a1, 4(sp) +; LMULMAX2-RV32I-NEXT: sw a1, 0(sp) ; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 1 ; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9 ; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 2 @@ -475,44 +492,30 @@ ; LMULMAX2-RV32I-NEXT: li a1, 32 ; LMULMAX2-RV32I-NEXT: vsrl.vx v9, v8, a1 ; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX2-RV32I-NEXT: vmv.v.i v9, -1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX2-RV32I-NEXT: vxor.vv v8, v8, v9 -; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX2-RV32I-NEXT: lui a1, 349525 -; LMULMAX2-RV32I-NEXT: addi a1, a1, 1365 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX2-RV32I-NEXT: vmv.v.x v10, a1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX2-RV32I-NEXT: vand.vv v9, v9, v10 +; LMULMAX2-RV32I-NEXT: vnot.v v8, v8 +; LMULMAX2-RV32I-NEXT: addi a1, sp, 24 +; LMULMAX2-RV32I-NEXT: vlse64.v v9, (a1), zero +; LMULMAX2-RV32I-NEXT: addi a1, sp, 16 +; LMULMAX2-RV32I-NEXT: vlse64.v v10, (a1), zero +; LMULMAX2-RV32I-NEXT: vsrl.vi v11, v8, 1 +; LMULMAX2-RV32I-NEXT: vand.vv v9, v11, v9 ; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v9 -; LMULMAX2-RV32I-NEXT: lui a1, 209715 -; LMULMAX2-RV32I-NEXT: addi a1, a1, 819 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX2-RV32I-NEXT: vmv.v.x v9, a1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX2-RV32I-NEXT: vand.vv v10, v8, v9 +; LMULMAX2-RV32I-NEXT: vand.vv v9, v8, v10 ; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v9, v8 +; LMULMAX2-RV32I-NEXT: addi a1, sp, 8 +; LMULMAX2-RV32I-NEXT: vlse64.v v9, (a1), zero +; LMULMAX2-RV32I-NEXT: mv a1, sp +; LMULMAX2-RV32I-NEXT: vlse64.v v10, (a1), zero +; LMULMAX2-RV32I-NEXT: vsrl.vi v11, v8, 4 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v11 ; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v9 -; LMULMAX2-RV32I-NEXT: vadd.vv v8, v10, v8 -; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v9 -; LMULMAX2-RV32I-NEXT: lui a1, 61681 -; LMULMAX2-RV32I-NEXT: addi a1, a1, -241 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX2-RV32I-NEXT: vmv.v.x v9, a1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, 
v9 -; LMULMAX2-RV32I-NEXT: lui a1, 4112 -; LMULMAX2-RV32I-NEXT: addi a1, a1, 257 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX2-RV32I-NEXT: vmv.v.x v9, a1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX2-RV32I-NEXT: vmul.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vmul.vv v8, v8, v10 ; LMULMAX2-RV32I-NEXT: li a1, 56 ; LMULMAX2-RV32I-NEXT: vsrl.vx v8, v8, a1 ; LMULMAX2-RV32I-NEXT: vse64.v v8, (a0) +; LMULMAX2-RV32I-NEXT: addi sp, sp, 32 ; LMULMAX2-RV32I-NEXT: ret ; ; LMULMAX2-RV64I-LABEL: ctlz_v2i64: @@ -1157,8 +1160,25 @@ define void @ctlz_v4i64(ptr %x, ptr %y) nounwind { ; LMULMAX2-RV32I-LABEL: ctlz_v4i64: ; LMULMAX2-RV32I: # %bb.0: +; LMULMAX2-RV32I-NEXT: addi sp, sp, -32 ; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; LMULMAX2-RV32I-NEXT: vle64.v v8, (a0) +; LMULMAX2-RV32I-NEXT: lui a1, 349525 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 1365 +; LMULMAX2-RV32I-NEXT: sw a1, 28(sp) +; LMULMAX2-RV32I-NEXT: sw a1, 24(sp) +; LMULMAX2-RV32I-NEXT: lui a1, 209715 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 819 +; LMULMAX2-RV32I-NEXT: sw a1, 20(sp) +; LMULMAX2-RV32I-NEXT: sw a1, 16(sp) +; LMULMAX2-RV32I-NEXT: lui a1, 61681 +; LMULMAX2-RV32I-NEXT: addi a1, a1, -241 +; LMULMAX2-RV32I-NEXT: sw a1, 12(sp) +; LMULMAX2-RV32I-NEXT: sw a1, 8(sp) +; LMULMAX2-RV32I-NEXT: lui a1, 4112 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 257 +; LMULMAX2-RV32I-NEXT: sw a1, 4(sp) +; LMULMAX2-RV32I-NEXT: sw a1, 0(sp) ; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 1 ; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v10 ; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 2 @@ -1172,44 +1192,30 @@ ; LMULMAX2-RV32I-NEXT: li a1, 32 ; LMULMAX2-RV32I-NEXT: vsrl.vx v10, v8, a1 ; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32I-NEXT: vmv.v.i v10, -1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32I-NEXT: vxor.vv v8, v8, v10 -; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX2-RV32I-NEXT: lui a1, 349525 -; LMULMAX2-RV32I-NEXT: addi a1, a1, 1365 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32I-NEXT: vmv.v.x v12, a1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32I-NEXT: vand.vv v10, v10, v12 +; LMULMAX2-RV32I-NEXT: vnot.v v8, v8 +; LMULMAX2-RV32I-NEXT: addi a1, sp, 24 +; LMULMAX2-RV32I-NEXT: vlse64.v v10, (a1), zero +; LMULMAX2-RV32I-NEXT: addi a1, sp, 16 +; LMULMAX2-RV32I-NEXT: vlse64.v v12, (a1), zero +; LMULMAX2-RV32I-NEXT: vsrl.vi v14, v8, 1 +; LMULMAX2-RV32I-NEXT: vand.vv v10, v14, v10 ; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v10 -; LMULMAX2-RV32I-NEXT: lui a1, 209715 -; LMULMAX2-RV32I-NEXT: addi a1, a1, 819 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32I-NEXT: vmv.v.x v10, a1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32I-NEXT: vand.vv v12, v8, v10 +; LMULMAX2-RV32I-NEXT: vand.vv v10, v8, v12 ; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v12 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v10, v8 +; LMULMAX2-RV32I-NEXT: addi a1, sp, 8 +; LMULMAX2-RV32I-NEXT: vlse64.v v10, (a1), zero +; LMULMAX2-RV32I-NEXT: mv a1, sp +; LMULMAX2-RV32I-NEXT: vlse64.v v12, (a1), zero +; LMULMAX2-RV32I-NEXT: vsrl.vi v14, v8, 4 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v14 ; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v10 -; LMULMAX2-RV32I-NEXT: vadd.vv v8, v12, v8 -; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v10 -; LMULMAX2-RV32I-NEXT: lui a1, 61681 -; LMULMAX2-RV32I-NEXT: addi a1, a1, -241 -; 
LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32I-NEXT: vmv.v.x v10, a1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v10 -; LMULMAX2-RV32I-NEXT: lui a1, 4112 -; LMULMAX2-RV32I-NEXT: addi a1, a1, 257 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32I-NEXT: vmv.v.x v10, a1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32I-NEXT: vmul.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vmul.vv v8, v8, v12 ; LMULMAX2-RV32I-NEXT: li a1, 56 ; LMULMAX2-RV32I-NEXT: vsrl.vx v8, v8, a1 ; LMULMAX2-RV32I-NEXT: vse64.v v8, (a0) +; LMULMAX2-RV32I-NEXT: addi sp, sp, 32 ; LMULMAX2-RV32I-NEXT: ret ; ; LMULMAX2-RV64I-LABEL: ctlz_v4i64: @@ -1782,8 +1788,25 @@ define void @ctlz_zero_undef_v2i64(ptr %x, ptr %y) nounwind { ; LMULMAX2-RV32I-LABEL: ctlz_zero_undef_v2i64: ; LMULMAX2-RV32I: # %bb.0: +; LMULMAX2-RV32I-NEXT: addi sp, sp, -32 ; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX2-RV32I-NEXT: vle64.v v8, (a0) +; LMULMAX2-RV32I-NEXT: lui a1, 349525 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 1365 +; LMULMAX2-RV32I-NEXT: sw a1, 28(sp) +; LMULMAX2-RV32I-NEXT: sw a1, 24(sp) +; LMULMAX2-RV32I-NEXT: lui a1, 209715 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 819 +; LMULMAX2-RV32I-NEXT: sw a1, 20(sp) +; LMULMAX2-RV32I-NEXT: sw a1, 16(sp) +; LMULMAX2-RV32I-NEXT: lui a1, 61681 +; LMULMAX2-RV32I-NEXT: addi a1, a1, -241 +; LMULMAX2-RV32I-NEXT: sw a1, 12(sp) +; LMULMAX2-RV32I-NEXT: sw a1, 8(sp) +; LMULMAX2-RV32I-NEXT: lui a1, 4112 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 257 +; LMULMAX2-RV32I-NEXT: sw a1, 4(sp) +; LMULMAX2-RV32I-NEXT: sw a1, 0(sp) ; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 1 ; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9 ; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 2 @@ -1797,44 +1820,30 @@ ; LMULMAX2-RV32I-NEXT: li a1, 32 ; LMULMAX2-RV32I-NEXT: vsrl.vx v9, v8, a1 ; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX2-RV32I-NEXT: vmv.v.i v9, -1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX2-RV32I-NEXT: vxor.vv v8, v8, v9 -; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX2-RV32I-NEXT: lui a1, 349525 -; LMULMAX2-RV32I-NEXT: addi a1, a1, 1365 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX2-RV32I-NEXT: vmv.v.x v10, a1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX2-RV32I-NEXT: vand.vv v9, v9, v10 +; LMULMAX2-RV32I-NEXT: vnot.v v8, v8 +; LMULMAX2-RV32I-NEXT: addi a1, sp, 24 +; LMULMAX2-RV32I-NEXT: vlse64.v v9, (a1), zero +; LMULMAX2-RV32I-NEXT: addi a1, sp, 16 +; LMULMAX2-RV32I-NEXT: vlse64.v v10, (a1), zero +; LMULMAX2-RV32I-NEXT: vsrl.vi v11, v8, 1 +; LMULMAX2-RV32I-NEXT: vand.vv v9, v11, v9 ; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v9 -; LMULMAX2-RV32I-NEXT: lui a1, 209715 -; LMULMAX2-RV32I-NEXT: addi a1, a1, 819 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX2-RV32I-NEXT: vmv.v.x v9, a1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX2-RV32I-NEXT: vand.vv v10, v8, v9 +; LMULMAX2-RV32I-NEXT: vand.vv v9, v8, v10 ; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v9, v8 +; LMULMAX2-RV32I-NEXT: addi a1, sp, 8 +; LMULMAX2-RV32I-NEXT: vlse64.v v9, (a1), zero +; LMULMAX2-RV32I-NEXT: mv a1, sp +; LMULMAX2-RV32I-NEXT: vlse64.v v10, (a1), zero +; LMULMAX2-RV32I-NEXT: vsrl.vi v11, v8, 4 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v11 ; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v9 -; 
LMULMAX2-RV32I-NEXT: vadd.vv v8, v10, v8 -; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v9 -; LMULMAX2-RV32I-NEXT: lui a1, 61681 -; LMULMAX2-RV32I-NEXT: addi a1, a1, -241 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX2-RV32I-NEXT: vmv.v.x v9, a1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v9 -; LMULMAX2-RV32I-NEXT: lui a1, 4112 -; LMULMAX2-RV32I-NEXT: addi a1, a1, 257 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX2-RV32I-NEXT: vmv.v.x v9, a1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX2-RV32I-NEXT: vmul.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vmul.vv v8, v8, v10 ; LMULMAX2-RV32I-NEXT: li a1, 56 ; LMULMAX2-RV32I-NEXT: vsrl.vx v8, v8, a1 ; LMULMAX2-RV32I-NEXT: vse64.v v8, (a0) +; LMULMAX2-RV32I-NEXT: addi sp, sp, 32 ; LMULMAX2-RV32I-NEXT: ret ; ; LMULMAX2-RV64I-LABEL: ctlz_zero_undef_v2i64: @@ -2449,8 +2458,25 @@ define void @ctlz_zero_undef_v4i64(ptr %x, ptr %y) nounwind { ; LMULMAX2-RV32I-LABEL: ctlz_zero_undef_v4i64: ; LMULMAX2-RV32I: # %bb.0: +; LMULMAX2-RV32I-NEXT: addi sp, sp, -32 ; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; LMULMAX2-RV32I-NEXT: vle64.v v8, (a0) +; LMULMAX2-RV32I-NEXT: lui a1, 349525 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 1365 +; LMULMAX2-RV32I-NEXT: sw a1, 28(sp) +; LMULMAX2-RV32I-NEXT: sw a1, 24(sp) +; LMULMAX2-RV32I-NEXT: lui a1, 209715 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 819 +; LMULMAX2-RV32I-NEXT: sw a1, 20(sp) +; LMULMAX2-RV32I-NEXT: sw a1, 16(sp) +; LMULMAX2-RV32I-NEXT: lui a1, 61681 +; LMULMAX2-RV32I-NEXT: addi a1, a1, -241 +; LMULMAX2-RV32I-NEXT: sw a1, 12(sp) +; LMULMAX2-RV32I-NEXT: sw a1, 8(sp) +; LMULMAX2-RV32I-NEXT: lui a1, 4112 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 257 +; LMULMAX2-RV32I-NEXT: sw a1, 4(sp) +; LMULMAX2-RV32I-NEXT: sw a1, 0(sp) ; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 1 ; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v10 ; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 2 @@ -2464,44 +2490,30 @@ ; LMULMAX2-RV32I-NEXT: li a1, 32 ; LMULMAX2-RV32I-NEXT: vsrl.vx v10, v8, a1 ; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32I-NEXT: vmv.v.i v10, -1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32I-NEXT: vxor.vv v8, v8, v10 -; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX2-RV32I-NEXT: lui a1, 349525 -; LMULMAX2-RV32I-NEXT: addi a1, a1, 1365 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32I-NEXT: vmv.v.x v12, a1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32I-NEXT: vand.vv v10, v10, v12 +; LMULMAX2-RV32I-NEXT: vnot.v v8, v8 +; LMULMAX2-RV32I-NEXT: addi a1, sp, 24 +; LMULMAX2-RV32I-NEXT: vlse64.v v10, (a1), zero +; LMULMAX2-RV32I-NEXT: addi a1, sp, 16 +; LMULMAX2-RV32I-NEXT: vlse64.v v12, (a1), zero +; LMULMAX2-RV32I-NEXT: vsrl.vi v14, v8, 1 +; LMULMAX2-RV32I-NEXT: vand.vv v10, v14, v10 ; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v10 -; LMULMAX2-RV32I-NEXT: lui a1, 209715 -; LMULMAX2-RV32I-NEXT: addi a1, a1, 819 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32I-NEXT: vmv.v.x v10, a1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32I-NEXT: vand.vv v12, v8, v10 +; LMULMAX2-RV32I-NEXT: vand.vv v10, v8, v12 ; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v12 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v10, v8 +; LMULMAX2-RV32I-NEXT: addi a1, sp, 8 +; LMULMAX2-RV32I-NEXT: vlse64.v v10, (a1), 
zero +; LMULMAX2-RV32I-NEXT: mv a1, sp +; LMULMAX2-RV32I-NEXT: vlse64.v v12, (a1), zero +; LMULMAX2-RV32I-NEXT: vsrl.vi v14, v8, 4 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v14 ; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v10 -; LMULMAX2-RV32I-NEXT: vadd.vv v8, v12, v8 -; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v10 -; LMULMAX2-RV32I-NEXT: lui a1, 61681 -; LMULMAX2-RV32I-NEXT: addi a1, a1, -241 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32I-NEXT: vmv.v.x v10, a1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v10 -; LMULMAX2-RV32I-NEXT: lui a1, 4112 -; LMULMAX2-RV32I-NEXT: addi a1, a1, 257 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32I-NEXT: vmv.v.x v10, a1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32I-NEXT: vmul.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vmul.vv v8, v8, v12 ; LMULMAX2-RV32I-NEXT: li a1, 56 ; LMULMAX2-RV32I-NEXT: vsrl.vx v8, v8, a1 ; LMULMAX2-RV32I-NEXT: vse64.v v8, (a0) +; LMULMAX2-RV32I-NEXT: addi sp, sp, 32 ; LMULMAX2-RV32I-NEXT: ret ; ; LMULMAX2-RV64I-LABEL: ctlz_zero_undef_v4i64: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll @@ -1073,19 +1073,35 @@ define <2 x i64> @vp_ctpop_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctpop_v2i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v10, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vand.vv v9, v9, v10, v0.t ; RV32-NEXT: vsub.vv v8, v8, v9, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vand.vv v10, v8, v9, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t @@ -1093,20 +1109,19 @@ ; RV32-NEXT: vadd.vv v8, v10, v8, v0.t ; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v9, v0.t -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; 
RV32-NEXT: vmv.v.x v9, a1 +; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v9, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctpop_v2i64: @@ -1149,19 +1164,35 @@ define <2 x i64> @vp_ctpop_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctpop_v2i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vsrl.vi v9, v8, 1 +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vsrl.vi v9, v8, 1 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v10, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vand.vv v9, v9, v10 ; RV32-NEXT: vsub.vv v8, v8, v9 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vand.vv v10, v8, v9 ; RV32-NEXT: vsrl.vi v8, v8, 2 @@ -1169,20 +1200,19 @@ ; RV32-NEXT: vadd.vv v8, v10, v8 ; RV32-NEXT: vsrl.vi v9, v8, 4 ; RV32-NEXT: vadd.vv v8, v8, v9 -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 +; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v9 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctpop_v2i64_unmasked: @@ -1229,19 +1259,35 @@ define <4 x i64> @vp_ctpop_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_ctpop_v4i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: lui a1, 4112 +; 
RV32-NEXT: addi a1, a1, 257
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
+; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t
+; RV32-NEXT: addi a1, sp, 24
+; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vand.vv v10, v10, v12, v0.t
; RV32-NEXT: vsub.vv v8, v8, v10, v0.t
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vand.vv v12, v8, v10, v0.t
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
@@ -1249,20 +1295,19 @@
; RV32-NEXT: vadd.vv v8, v12, v8, v0.t
; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v10, v0.t
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
+; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vand.vv v8, v8, v10, v0.t
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
+; RV32-NEXT: mv a1, sp
+; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v10, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctpop_v4i64:
@@ -1305,19 +1350,35 @@ define <4 x i64> @vp_ctpop_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_v4i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vsrl.vi v10, v8, 1
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
+; RV32-NEXT: lui a1, 209715
+; RV32-NEXT: addi a1, a1, 819
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: lui a1, 4112
+; RV32-NEXT: addi a1, a1, 257
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
+; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT: vsrl.vi v10, v8, 1
+; RV32-NEXT: addi a1, sp, 24
+; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vand.vv v10, v10, v12
; RV32-NEXT: vsub.vv v8, v8, v10
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vand.vv v12, v8, v10
; RV32-NEXT: vsrl.vi v8, v8, 2
@@ -1325,20 +1386,19 @@
; RV32-NEXT: vadd.vv v8, v12, v8
; RV32-NEXT: vsrl.vi v10, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v10
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
+; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
+; RV32-NEXT: mv a1, sp
+; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v10
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctpop_v4i64_unmasked:
@@ -1385,19 +1445,35 @@ define <8 x i64> @vp_ctpop_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_v8i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
+; RV32-NEXT: lui a1, 209715
+; RV32-NEXT: addi a1, a1, 819
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: lui a1, 4112
+; RV32-NEXT: addi a1, a1, 257
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
+; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t
+; RV32-NEXT: addi a1, sp, 24
+; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vand.vv v12, v12, v16, v0.t
; RV32-NEXT: vsub.vv v8, v8, v12, v0.t
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vand.vv v16, v8, v12, v0.t
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
@@ -1405,20 +1481,19 @@
; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v12, v0.t
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
+; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vand.vv v8, v8, v12, v0.t
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
+; RV32-NEXT: mv a1, sp
+; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v12, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctpop_v8i64:
@@ -1461,19 +1536,35 @@ define <8 x i64> @vp_ctpop_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_v8i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vsrl.vi v12, v8, 1
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
+; RV32-NEXT: lui a1, 209715
+; RV32-NEXT: addi a1, a1, 819
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: lui a1, 4112
+; RV32-NEXT: addi a1, a1, 257
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
+; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT: vsrl.vi v12, v8, 1
+; RV32-NEXT: addi a1, sp, 24
+; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vand.vv v12, v12, v16
; RV32-NEXT: vsub.vv v8, v8, v12
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vand.vv v16, v8, v12
; RV32-NEXT: vsrl.vi v8, v8, 2
@@ -1481,20 +1572,19 @@
; RV32-NEXT: vadd.vv v8, v16, v8
; RV32-NEXT: vsrl.vi v12, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v12
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
+; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vand.vv v8, v8, v12
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
+; RV32-NEXT: mv a1, sp
+; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v12
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctpop_v8i64_unmasked:
@@ -1541,20 +1631,35 @@ define <15 x i64> @vp_ctpop_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_v15i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: li a2, 32
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a1
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
+; RV32-NEXT: lui a1, 209715
+; RV32-NEXT: addi a1, a1, 819
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: lui a1, 4112
+; RV32-NEXT: addi a1, a1, 257
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
+; RV32-NEXT: addi a1, sp, 24
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v24, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24, v0.t
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v24, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
@@ -1562,20 +1667,19 @@
; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
+; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
+; RV32-NEXT: mv a1, sp
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctpop_v15i64:
@@ -1618,20 +1722,35 @@ define <15 x i64> @vp_ctpop_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_v15i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 1
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: li a2, 32
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a1
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
+; RV32-NEXT: lui a1, 209715
+; RV32-NEXT: addi a1, a1, 819
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: lui a1, 4112
+; RV32-NEXT: addi a1, a1, 257
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v16, v8, 1
+; RV32-NEXT: addi a1, sp, 24
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v24, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vsub.vv v8, v8, v16
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v24, v8, v16
; RV32-NEXT: vsrl.vi v8, v8, 2
@@ -1639,20 +1758,19 @@
; RV32-NEXT: vadd.vv v8, v24, v8
; RV32-NEXT: vsrl.vi v16, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v16
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
+; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
+; RV32-NEXT: mv a1, sp
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctpop_v15i64_unmasked:
@@ -1699,20 +1817,35 @@ define <16 x i64> @vp_ctpop_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_v16i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: li a2, 32
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a1
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
+; RV32-NEXT: lui a1, 209715
+; RV32-NEXT: addi a1, a1, 819
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: lui a1, 4112
+; RV32-NEXT: addi a1, a1, 257
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
+; RV32-NEXT: addi a1, sp, 24
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v24, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24, v0.t
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v24, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
@@ -1720,20 +1853,19 @@
; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
+; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
+; RV32-NEXT: mv a1, sp
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctpop_v16i64:
@@ -1776,20 +1908,35 @@ define <16 x i64> @vp_ctpop_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_v16i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 1
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: li a2, 32
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a1
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
+; RV32-NEXT: lui a1, 209715
+; RV32-NEXT: addi a1, a1, 819
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: lui a1, 4112
+; RV32-NEXT: addi a1, a1, 257
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vi v16, v8, 1
+; RV32-NEXT: addi a1, sp, 24
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v24, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vsub.vv v8, v8, v16
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v24, v8, v16
; RV32-NEXT: vsrl.vi v8, v8, 2
@@ -1797,20 +1944,19 @@
; RV32-NEXT: vadd.vv v8, v24, v8
; RV32-NEXT: vsrl.vi v16, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v16
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
+; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
+; RV32-NEXT: mv a1, sp
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctpop_v16i64_unmasked:
@@ -1857,22 +2003,38 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_v32i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -48
+; RV32-NEXT: .cfi_def_cfa_offset 48
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a2, 56
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 56 * vlenb
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a2, 48
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: addi a1, a1, 48
; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT: li a2, 16
; RV32-NEXT: vslidedown.vi v24, v0, 2
+; RV32-NEXT: lui a1, 349525
+; RV32-NEXT: addi a1, a1, 1365
+; RV32-NEXT: sw a1, 44(sp)
+; RV32-NEXT: sw a1, 40(sp)
+; RV32-NEXT: lui a1, 209715
+; RV32-NEXT: addi a1, a1, 819
+; RV32-NEXT: sw a1, 36(sp)
+; RV32-NEXT: sw a1, 32(sp)
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
+; RV32-NEXT: lui a1, 4112
+; RV32-NEXT: addi a1, a1, 257
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: li a2, 16
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: mv a1, a0
; RV32-NEXT: bltu a0, a2, .LBB34_2
; RV32-NEXT: # %bb.1:
@@ -1880,93 +2042,88 @@
; RV32-NEXT: .LBB34_2:
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT: lui a2, 349525
-; RV32-NEXT: addi a2, a2, 1365
-; RV32-NEXT: li a3, 32
-; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT: csrr a4, vlenb
-; RV32-NEXT: li a5, 40
-; RV32-NEXT: mul a4, a4, a5
-; RV32-NEXT: add a4, sp, a4
-; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT: vmv.v.x v8, a2
+; RV32-NEXT: addi a2, sp, 40
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: li a4, 40
+; RV32-NEXT: mul a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 48
+; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT: vlse64.v v8, (a2), zero
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: li a4, 24
-; RV32-NEXT: mul a2, a2, a4
+; RV32-NEXT: li a3, 24
+; RV32-NEXT: mul a2, a2, a3
; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: addi a2, a2, 48
; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v8, v0.t
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: li a4, 40
-; RV32-NEXT: mul a2, a2, a4
+; RV32-NEXT: li a3, 40
+; RV32-NEXT: mul a2, a2, a3
; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: addi a2, a2, 48
; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a2, vlenb
; RV32-NEXT: slli a2, a2, 5
; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: addi a2, a2, 48
; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: lui a2, 209715
-; RV32-NEXT: addi a2, a2, 819
-; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a2
+; RV32-NEXT: addi a2, sp, 32
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v8, (a2), zero
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: li a4, 40
-; RV32-NEXT: mul a2, a2, a4
+; RV32-NEXT: li a3, 40
+; RV32-NEXT: mul a2, a2, a3
; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: addi a2, a2, 48
; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: csrr a2, vlenb
; RV32-NEXT: slli a2, a2, 5
; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: addi a2, a2, 48
; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v16, v16, v8, v0.t
; RV32-NEXT: csrr a2, vlenb
; RV32-NEXT: slli a2, a2, 4
; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: addi a2, a2, 48
; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; RV32-NEXT: csrr a2, vlenb
; RV32-NEXT: slli a2, a2, 5
; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: addi a2, a2, 48
; RV32-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t
; RV32-NEXT: vand.vv v16, v16, v8, v0.t
; RV32-NEXT: csrr a2, vlenb
; RV32-NEXT: slli a2, a2, 4
; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: addi a2, a2, 48
; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
; RV32-NEXT: vadd.vv v16, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v8, v16, 4, v0.t
-; RV32-NEXT: vadd.vv v16, v16, v8, v0.t
-; RV32-NEXT: lui a2, 61681
-; RV32-NEXT: addi a2, a2, -241
-; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a2
+; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT: addi a2, sp, 24
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a2), zero
; RV32-NEXT: csrr a2, vlenb
; RV32-NEXT: slli a2, a2, 5
; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: addi a2, a2, 48
+; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v16, v8, v0.t
-; RV32-NEXT: lui a2, 4112
-; RV32-NEXT: addi a2, a2, 257
-; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a2
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a2), zero
; RV32-NEXT: csrr a2, vlenb
; RV32-NEXT: slli a2, a2, 3
; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: addi a2, a2, 48
; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
@@ -1975,7 +2132,7 @@
; RV32-NEXT: csrr a2, vlenb
; RV32-NEXT: slli a2, a2, 4
; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: addi a2, a2, 48
; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; RV32-NEXT: addi a2, a0, -16
; RV32-NEXT: sltu a0, a0, a2
@@ -1987,53 +2144,53 @@
; RV32-NEXT: li a2, 48
; RV32-NEXT: mul a0, a0, a2
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: addi a0, sp, 48
; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a2, 24
; RV32-NEXT: mul a0, a0, a2
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: addi a0, sp, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v16, v8, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a2, 48
; RV32-NEXT: mul a0, a0, a2
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vsub.vv v8, v16, v8, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a2, 40
; RV32-NEXT: mul a0, a0, a2
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v16, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a2, 48
; RV32-NEXT: mul a0, a0, a2
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a2, 40
; RV32-NEXT: mul a0, a0, a2
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a2, 48
; RV32-NEXT: mul a0, a0, a2
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
@@ -2041,26 +2198,26 @@
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 56
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 48
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctpop_v32i64:
@@ -2153,19 +2310,35 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_ctpop_v32i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: addi sp, sp, -48
+; RV32-NEXT: .cfi_def_cfa_offset 48
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a2, 40
; RV32-NEXT: mul a1, a1, a2
; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
-; RV32-NEXT: li a2, 16
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 40 * vlenb
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: slli a1, a1, 5
; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: addi a1, a1, 48
; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: lui a1, 349525
+; RV32-NEXT: addi a1, a1, 1365
+; RV32-NEXT: sw a1, 44(sp)
+; RV32-NEXT: sw a1, 40(sp)
+; RV32-NEXT: lui a1, 209715
+; RV32-NEXT: addi a1, a1, 819
+; RV32-NEXT: sw a1, 36(sp)
+; RV32-NEXT: sw a1, 32(sp)
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
+; RV32-NEXT: lui a1, 4112
+; RV32-NEXT: addi a1, a1, 257
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: li a2, 16
+; RV32-NEXT: sw a1, 16(sp)
; RV32-NEXT: mv a1, a0
; RV32-NEXT: bltu a0, a2, .LBB35_2
; RV32-NEXT: # %bb.1:
@@ -2173,24 +2346,21 @@
; RV32-NEXT: .LBB35_2:
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: lui a2, 349525
-; RV32-NEXT: addi a2, a2, 1365
-; RV32-NEXT: li a3, 32
-; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: addi a2, sp, 40
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v24, (a2), zero
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: li a4, 24
-; RV32-NEXT: mul a2, a2, a4
+; RV32-NEXT: li a3, 24
+; RV32-NEXT: mul a2, a2, a3
; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: addi a2, a2, 48
; RV32-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vsub.vv v8, v8, v16
-; RV32-NEXT: lui a2, 209715
-; RV32-NEXT: addi a2, a2, 819
-; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v0, a2
+; RV32-NEXT: addi a2, sp, 32
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v0, (a2), zero
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v8, v0
; RV32-NEXT: vsrl.vi v8, v8, 2
@@ -2198,31 +2368,29 @@
; RV32-NEXT: vadd.vv v8, v16, v8
; RV32-NEXT: vsrl.vi v16, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v16
-; RV32-NEXT: lui a2, 61681
-; RV32-NEXT: addi a2, a2, -241
-; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a2
+; RV32-NEXT: addi a2, sp, 24
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a2), zero
; RV32-NEXT: csrr a2, vlenb
; RV32-NEXT: slli a2, a2, 4
; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: addi a2, a2, 48
; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v24, v8, v16
-; RV32-NEXT: lui a2, 4112
-; RV32-NEXT: addi a2, a2, 257
-; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v8, a2
+; RV32-NEXT: vand.vv v16, v8, v16
; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v8, (a2), zero
+; RV32-NEXT: addi a2, sp, 48
; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vmul.vv v24, v24, v8
+; RV32-NEXT: vmul.vv v16, v16, v8
; RV32-NEXT: li a1, 56
-; RV32-NEXT: vsrl.vx v8, v24, a1
+; RV32-NEXT: vsrl.vx v8, v16, a1
; RV32-NEXT: csrr a2, vlenb
; RV32-NEXT: slli a2, a2, 3
; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: addi a2, a2, 48
; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; RV32-NEXT: addi a2, a0, -16
; RV32-NEXT: sltu a0, a0, a2
@@ -2232,43 +2400,43 @@
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vsrl.vi v24, v8, 1
+; RV32-NEXT: vsrl.vi v16, v8, 1
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a2, 24
; RV32-NEXT: mul a0, a0, a2
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vand.vv v24, v24, v16
-; RV32-NEXT: vsub.vv v24, v8, v24
-; RV32-NEXT: vand.vv v8, v24, v0
-; RV32-NEXT: vsrl.vi v24, v24, 2
-; RV32-NEXT: vand.vv v24, v24, v0
-; RV32-NEXT: vadd.vv v8, v8, v24
-; RV32-NEXT: vsrl.vi v24, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v24
+; RV32-NEXT: addi a0, a0, 48
+; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v16, v24
+; RV32-NEXT: vsub.vv v16, v8, v16
+; RV32-NEXT: vand.vv v8, v16, v0
+; RV32-NEXT: vsrl.vi v16, v16, 2
+; RV32-NEXT: vand.vv v16, v16, v0
+; RV32-NEXT: vadd.vv v8, v8, v16
+; RV32-NEXT: vsrl.vi v16, v8, 4
+; RV32-NEXT: vadd.vv v8, v8, v16
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: addi a0, sp, 48
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: vsrl.vx v16, v8, a1
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: addi a0, a0, 48
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a1, 40
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: addi sp, sp, 48
; RV32-NEXT: ret
;
; RV64-LABEL: vp_ctpop_v32i64_unmasked:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll
@@ -288,42 +288,49 @@ define void @ctpop_v2i64(ptr %x, ptr %y) {
; LMULMAX2-RV32-LABEL: ctpop_v2i64:
; LMULMAX2-RV32: # %bb.0:
+; LMULMAX2-RV32-NEXT: addi sp, sp, -32
+; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 32
; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; LMULMAX2-RV32-NEXT: vle64.v v8, (a0)
-; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 1
; LMULMAX2-RV32-NEXT: lui a1, 349525
; LMULMAX2-RV32-NEXT: addi a1, a1, 1365
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.x v10, a1
-; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32-NEXT: vand.vv v9, v9, v10
-; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v9
+; LMULMAX2-RV32-NEXT: sw a1, 28(sp)
+; LMULMAX2-RV32-NEXT: sw a1, 24(sp)
; LMULMAX2-RV32-NEXT: lui a1, 209715
; LMULMAX2-RV32-NEXT: addi a1, a1, 819
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.x v9, a1
-; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32-NEXT: vand.vv v10, v8, v9
-; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT: vadd.vv v8, v10, v8
-; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 4
-; LMULMAX2-RV32-NEXT: vadd.vv v8, v8, v9
+; LMULMAX2-RV32-NEXT: sw a1, 20(sp)
+; LMULMAX2-RV32-NEXT: sw a1, 16(sp)
; LMULMAX2-RV32-NEXT: lui a1, 61681
; LMULMAX2-RV32-NEXT: addi a1, a1, -241
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.x v9, a1
-; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v9
+; LMULMAX2-RV32-NEXT: sw a1, 12(sp)
+; LMULMAX2-RV32-NEXT: sw a1, 8(sp)
; LMULMAX2-RV32-NEXT: lui a1, 4112
; LMULMAX2-RV32-NEXT: addi a1, a1, 257
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.x v9, a1
-; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32-NEXT: vmul.vv v8, v8, v9
+; LMULMAX2-RV32-NEXT: sw a1, 4(sp)
+; LMULMAX2-RV32-NEXT: sw a1, 0(sp)
+; LMULMAX2-RV32-NEXT: addi a1, sp, 24
+; LMULMAX2-RV32-NEXT: vlse64.v v9, (a1), zero
+; LMULMAX2-RV32-NEXT: addi a1, sp, 16
+; LMULMAX2-RV32-NEXT: vlse64.v v10, (a1), zero
+; LMULMAX2-RV32-NEXT: vsrl.vi v11, v8, 1
+; LMULMAX2-RV32-NEXT: vand.vv v9, v11, v9
+; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v9
+; LMULMAX2-RV32-NEXT: vand.vv v9, v8, v10
+; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 2
+; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v10
+; LMULMAX2-RV32-NEXT: vadd.vv v8, v9, v8
+; LMULMAX2-RV32-NEXT: addi a1, sp, 8
+; LMULMAX2-RV32-NEXT: vlse64.v v9, (a1), zero
+; LMULMAX2-RV32-NEXT: mv a1, sp
+; LMULMAX2-RV32-NEXT: vlse64.v v10, (a1), zero
+; LMULMAX2-RV32-NEXT: vsrl.vi v11, v8, 4
+; LMULMAX2-RV32-NEXT: vadd.vv v8, v8, v11
+; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v9
+; LMULMAX2-RV32-NEXT: vmul.vv v8, v8, v10
; LMULMAX2-RV32-NEXT: li a1, 56
; LMULMAX2-RV32-NEXT: vsrl.vx v8, v8, a1
; LMULMAX2-RV32-NEXT: vse64.v v8, (a0)
+; LMULMAX2-RV32-NEXT: addi sp, sp, 32
; LMULMAX2-RV32-NEXT: ret
;
; LMULMAX2-RV64-LABEL: ctpop_v2i64:
@@ -364,42 +371,49 @@
;
; LMULMAX1-RV32-LABEL: ctpop_v2i64:
; LMULMAX1-RV32: # %bb.0:
+; LMULMAX1-RV32-NEXT: addi sp, sp, -32
+; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 32
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; LMULMAX1-RV32-NEXT: vle64.v v8, (a0)
-; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 1
; LMULMAX1-RV32-NEXT: lui a1, 349525
; LMULMAX1-RV32-NEXT: addi a1, a1, 1365
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.x v10, a1
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT: vsub.vv v8, v8, v9
+; LMULMAX1-RV32-NEXT: sw a1, 28(sp)
+; LMULMAX1-RV32-NEXT: sw a1, 24(sp)
; LMULMAX1-RV32-NEXT: lui a1, 209715
; LMULMAX1-RV32-NEXT: addi a1, a1, 819
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.x v9, a1
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vand.vv v10, v8, v9
-; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT: vadd.vv v8, v10, v8
-; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 4
-; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v9
+; LMULMAX1-RV32-NEXT: sw a1, 20(sp)
+; LMULMAX1-RV32-NEXT: sw a1, 16(sp)
; LMULMAX1-RV32-NEXT: lui a1, 61681
; LMULMAX1-RV32-NEXT: addi a1, a1, -241
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.x v9, a1
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v9
+; LMULMAX1-RV32-NEXT: sw a1, 12(sp)
+; LMULMAX1-RV32-NEXT: sw a1, 8(sp)
; LMULMAX1-RV32-NEXT: lui a1, 4112
; LMULMAX1-RV32-NEXT: addi a1, a1, 257
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.x v9, a1
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmul.vv v8, v8, v9
+; LMULMAX1-RV32-NEXT: sw a1, 4(sp)
+; LMULMAX1-RV32-NEXT: sw a1, 0(sp)
+; LMULMAX1-RV32-NEXT: addi a1, sp, 24
+; LMULMAX1-RV32-NEXT: vlse64.v v9, (a1), zero
+; LMULMAX1-RV32-NEXT: addi a1, sp, 16
+; LMULMAX1-RV32-NEXT: vlse64.v v10, (a1), zero
+; LMULMAX1-RV32-NEXT: vsrl.vi v11, v8, 1
+; LMULMAX1-RV32-NEXT: vand.vv v9, v11, v9
+; LMULMAX1-RV32-NEXT: vsub.vv v8, v8, v9
+; LMULMAX1-RV32-NEXT: vand.vv v9, v8, v10
+; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 2
+; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v10
+; LMULMAX1-RV32-NEXT: vadd.vv v8, v9, v8
+; LMULMAX1-RV32-NEXT: addi a1, sp, 8
+; LMULMAX1-RV32-NEXT: vlse64.v v9, (a1), zero
+; LMULMAX1-RV32-NEXT: mv a1, sp
+; LMULMAX1-RV32-NEXT: vlse64.v v10, (a1), zero
+; LMULMAX1-RV32-NEXT: vsrl.vi v11, v8, 4
+; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v11
+; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v9
+; LMULMAX1-RV32-NEXT: vmul.vv v8, v8, v10
; LMULMAX1-RV32-NEXT: li a1, 56
; LMULMAX1-RV32-NEXT: vsrl.vx v8, v8, a1
; LMULMAX1-RV32-NEXT: vse64.v v8, (a0)
+; LMULMAX1-RV32-NEXT: addi sp, sp, 32
; LMULMAX1-RV32-NEXT: ret
;
; LMULMAX1-RV64-LABEL: ctpop_v2i64:
@@ -829,42 +843,49 @@ define void @ctpop_v4i64(ptr %x, ptr %y) {
; LMULMAX2-RV32-LABEL: ctpop_v4i64:
; LMULMAX2-RV32: # %bb.0:
+; LMULMAX2-RV32-NEXT: addi sp, sp, -32
+; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 32
; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; LMULMAX2-RV32-NEXT: vle64.v v8, (a0)
-; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 1
; LMULMAX2-RV32-NEXT: lui a1, 349525
; LMULMAX2-RV32-NEXT: addi a1, a1, 1365
-; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.x v12, a1
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vand.vv v10, v10, v12
-; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v10
+; LMULMAX2-RV32-NEXT: sw a1, 28(sp)
+; LMULMAX2-RV32-NEXT: sw a1, 24(sp)
; LMULMAX2-RV32-NEXT: lui a1, 209715
; LMULMAX2-RV32-NEXT: addi a1, a1, 819
-; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.x v10, a1
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vand.vv v12, v8, v10
-; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT: vadd.vv v8, v12, v8
-; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 4
-; LMULMAX2-RV32-NEXT: vadd.vv v8, v8, v10
+; LMULMAX2-RV32-NEXT: sw a1, 20(sp)
+; LMULMAX2-RV32-NEXT: sw a1, 16(sp)
; LMULMAX2-RV32-NEXT: lui a1, 61681
; LMULMAX2-RV32-NEXT: addi a1, a1, -241
-; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.x v10, a1
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v10
+; LMULMAX2-RV32-NEXT: sw a1, 12(sp)
+; LMULMAX2-RV32-NEXT: sw a1, 8(sp)
; LMULMAX2-RV32-NEXT: lui a1, 4112
; LMULMAX2-RV32-NEXT: addi a1, a1, 257
-; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.x v10, a1
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vmul.vv v8, v8, v10
+; LMULMAX2-RV32-NEXT: sw a1, 4(sp)
+; LMULMAX2-RV32-NEXT: sw a1, 0(sp)
+; LMULMAX2-RV32-NEXT: addi a1, sp, 24
+; LMULMAX2-RV32-NEXT: vlse64.v v10, (a1), zero
+; LMULMAX2-RV32-NEXT: addi a1, sp, 16
+; LMULMAX2-RV32-NEXT: vlse64.v v12, (a1), zero
+; LMULMAX2-RV32-NEXT: vsrl.vi v14, v8, 1
+; LMULMAX2-RV32-NEXT: vand.vv v10, v14, v10
+; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v10
+; LMULMAX2-RV32-NEXT: vand.vv v10, v8, v12
+; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 2
+; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v12
+; LMULMAX2-RV32-NEXT: vadd.vv v8, v10, v8
+; LMULMAX2-RV32-NEXT: addi a1, sp, 8
+; LMULMAX2-RV32-NEXT: vlse64.v v10, (a1), zero
+; LMULMAX2-RV32-NEXT: mv a1, sp
+; LMULMAX2-RV32-NEXT: vlse64.v v12, (a1), zero
+; LMULMAX2-RV32-NEXT: vsrl.vi v14, v8, 4
+; LMULMAX2-RV32-NEXT: vadd.vv v8, v8, v14
+; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v10
+; LMULMAX2-RV32-NEXT: vmul.vv v8, v8, v12
; LMULMAX2-RV32-NEXT: li a1, 56
; LMULMAX2-RV32-NEXT: vsrl.vx v8, v8, a1
; LMULMAX2-RV32-NEXT: vse64.v v8, (a0)
+; LMULMAX2-RV32-NEXT: addi sp, sp, 32
; LMULMAX2-RV32-NEXT: ret
;
; LMULMAX2-RV64-LABEL: ctpop_v4i64:
@@ -905,57 +926,64 @@
;
; LMULMAX1-RV32-LABEL: ctpop_v4i64:
; LMULMAX1-RV32: # %bb.0:
+; LMULMAX1-RV32-NEXT: addi sp, sp, -32
+; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 32
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vle64.v v8, (a0)
; LMULMAX1-RV32-NEXT: addi a1, a0, 16
-; LMULMAX1-RV32-NEXT: vle64.v v8, (a1)
-; LMULMAX1-RV32-NEXT: vle64.v v9, (a0)
-; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 1
+; LMULMAX1-RV32-NEXT: vle64.v v9, (a1)
; LMULMAX1-RV32-NEXT: lui a2, 349525
; LMULMAX1-RV32-NEXT: addi a2, a2, 1365
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.x v11, a2
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vand.vv v10, v10, v11
-; LMULMAX1-RV32-NEXT: vsub.vv v8, v8, v10
+; LMULMAX1-RV32-NEXT: sw a2, 28(sp)
+; LMULMAX1-RV32-NEXT: sw a2, 24(sp)
; LMULMAX1-RV32-NEXT: lui a2, 209715
; LMULMAX1-RV32-NEXT: addi a2, a2, 819
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.x v10, a2
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vand.vv v12, v8, v10
-; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT: vadd.vv v8, v12, v8
-; LMULMAX1-RV32-NEXT: vsrl.vi v12, v8, 4
-; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v12
+; LMULMAX1-RV32-NEXT: sw a2, 20(sp)
+; LMULMAX1-RV32-NEXT: sw a2, 16(sp)
; LMULMAX1-RV32-NEXT: lui a2, 61681
; LMULMAX1-RV32-NEXT: addi a2, a2, -241
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.x v12, a2
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v12
+; LMULMAX1-RV32-NEXT: sw a2, 12(sp)
+; LMULMAX1-RV32-NEXT: sw a2, 8(sp)
; LMULMAX1-RV32-NEXT: lui a2, 4112
; LMULMAX1-RV32-NEXT: addi a2, a2, 257
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.x v13, a2
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmul.vv v8, v8, v13
-; LMULMAX1-RV32-NEXT: li a2, 56
-; LMULMAX1-RV32-NEXT: vsrl.vx v8, v8, a2
-; LMULMAX1-RV32-NEXT: vsrl.vi v14, v9, 1
-; LMULMAX1-RV32-NEXT: vand.vv v11, v14, v11
-; LMULMAX1-RV32-NEXT: vsub.vv v9, v9, v11
-; LMULMAX1-RV32-NEXT: vand.vv v11, v9, v10
+; LMULMAX1-RV32-NEXT: sw a2, 4(sp)
+; LMULMAX1-RV32-NEXT: sw a2, 0(sp)
+; LMULMAX1-RV32-NEXT: addi a2, sp, 24
+; LMULMAX1-RV32-NEXT: vlse64.v v10, (a2), zero
+; LMULMAX1-RV32-NEXT: addi a2, sp, 16
+; LMULMAX1-RV32-NEXT: vlse64.v v11, (a2), zero
+; LMULMAX1-RV32-NEXT: vsrl.vi v12, v9, 1
+; LMULMAX1-RV32-NEXT: vand.vv v12, v12, v10
+; LMULMAX1-RV32-NEXT: vsub.vv v9, v9, v12
+; LMULMAX1-RV32-NEXT: vand.vv v12, v9, v11
; LMULMAX1-RV32-NEXT: vsrl.vi v9, v9, 2
-; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT: vadd.vv v9, v11, v9
-; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 4
-; LMULMAX1-RV32-NEXT: vadd.vv v9, v9, v10
+; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v11
+; LMULMAX1-RV32-NEXT: vadd.vv v9, v12, v9
+; LMULMAX1-RV32-NEXT: addi a2, sp, 8
+; LMULMAX1-RV32-NEXT: vlse64.v v12, (a2), zero
+; LMULMAX1-RV32-NEXT: mv a2, sp
+; LMULMAX1-RV32-NEXT: vlse64.v v13, (a2), zero
+; LMULMAX1-RV32-NEXT: vsrl.vi v14, v9, 4
+; LMULMAX1-RV32-NEXT: vadd.vv v9, v9, v14
; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v12
; LMULMAX1-RV32-NEXT: vmul.vv v9, v9, v13
+; LMULMAX1-RV32-NEXT: li a2, 56
; LMULMAX1-RV32-NEXT: vsrl.vx v9, v9, a2
-; LMULMAX1-RV32-NEXT: vse64.v v9, (a0)
-; LMULMAX1-RV32-NEXT: vse64.v v8, (a1)
+; LMULMAX1-RV32-NEXT: vsrl.vi v14, v8, 1
+; LMULMAX1-RV32-NEXT: vand.vv v10, v14, v10
+; LMULMAX1-RV32-NEXT: vsub.vv v8, v8, v10
+; LMULMAX1-RV32-NEXT: vand.vv v10, v8, v11
+; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 2
+; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v11
+; LMULMAX1-RV32-NEXT: vadd.vv v8, v10, v8
+; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 4
+; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v10
+; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v12
+; LMULMAX1-RV32-NEXT: vmul.vv v8, v8, v13
+; LMULMAX1-RV32-NEXT: vsrl.vx v8, v8, a2
+; LMULMAX1-RV32-NEXT: vse64.v v8, (a0)
+; LMULMAX1-RV32-NEXT: vse64.v v9, (a1)
+; LMULMAX1-RV32-NEXT: addi sp, sp, 32
; LMULMAX1-RV32-NEXT: ret
;
; LMULMAX1-RV64-LABEL: ctpop_v4i64:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
@@ -1233,26 +1233,39 @@ define <2 x i64> @vp_cttz_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_v2i64:
; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
+; RV32-NEXT: lui a1, 349525
+; RV32-NEXT: addi a1, a1, 1365
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
+; RV32-NEXT: lui a1, 209715
+; RV32-NEXT: addi a1, a1, 819
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: lui a1, 4112
+; RV32-NEXT: addi a1, a1, 257
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: li a1, 1
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vsub.vx v9, v8, a1, v0.t
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.i v10, -1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vxor.vv v8, v8, v10, v0.t
+; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: vand.vv v8, v8, v9, v0.t
; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
+; RV32-NEXT: addi a1, sp, 24
+; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vand.vv v9, v9, v10, v0.t
; RV32-NEXT: vsub.vv v8, v8, v9, v0.t
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a1
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT: vlse64.v v9, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vand.vv v10, v8, v9, v0.t
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
@@ -1260,20 +1273,19 @@
; RV32-NEXT: vadd.vv v8, v10, v8, v0.t
; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v9, v0.t
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a1
+; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT: vlse64.v v9, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vand.vv v8, v8, v9, v0.t
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a1
+; RV32-NEXT: mv a1, sp
+; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT: vlse64.v v9, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v9, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_v2i64:
@@ -1320,26 +1332,39 @@ define <2 x i64> @vp_cttz_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_v2i64_unmasked:
; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
+; RV32-NEXT: lui a1, 349525
+; RV32-NEXT: addi a1, a1, 1365
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
+; RV32-NEXT: lui a1, 209715
+; RV32-NEXT: addi a1, a1, 819
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: lui a1, 4112
+; RV32-NEXT: addi a1, a1, 257
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: li a1, 1
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vsub.vx v9, v8, a1
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.i v10, -1
-; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT: vxor.vv v8, v8, v10
+; RV32-NEXT: vnot.v v8, v8
; RV32-NEXT: vand.vv v8, v8, v9
; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
+; RV32-NEXT: addi a1, sp, 24
+; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vand.vv v9, v9, v10
; RV32-NEXT: vsub.vv v8, v8, v9
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a1
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT: vlse64.v v9, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vand.vv v10, v8, v9
; RV32-NEXT: vsrl.vi v8, v8, 2
@@ -1347,20 +1372,19 @@
; RV32-NEXT: vadd.vv v8, v10, v8
; RV32-NEXT: vsrl.vi v9, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v9
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a1
+; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT: vlse64.v v9, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v9, a1
+; RV32-NEXT: mv a1, sp
+; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT: vlse64.v v9, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v9
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_v2i64_unmasked:
@@ -1411,26 +1435,39 @@ define <4 x i64> @vp_cttz_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_v4i64:
; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
+; RV32-NEXT: lui a1, 349525
+; RV32-NEXT: addi a1, a1, 1365
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
+; RV32-NEXT: lui a1, 209715
+; RV32-NEXT: addi a1, a1, 819
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: lui a1, 4112
+; RV32-NEXT: addi a1, a1, 257
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: li a1, 1
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vsub.vx v10, v8, a1, v0.t
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.i v12, -1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vxor.vv v8, v8, v12, v0.t
+; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: vand.vv v8, v8, v10, v0.t
; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
+; RV32-NEXT: addi a1, sp, 24
+; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vand.vv v10, v10, v12, v0.t
; RV32-NEXT: vsub.vv v8, v8, v10, v0.t
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vand.vv v12, v8, v10, v0.t
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
@@ -1438,20 +1475,19 @@
; RV32-NEXT: vadd.vv v8, v12, v8, v0.t
; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v10, v0.t
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
+; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vand.vv v8, v8, v10, v0.t
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
+; RV32-NEXT: mv a1, sp
+; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v10, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_v4i64:
@@ -1498,26 +1534,39 @@ define <4 x i64> @vp_cttz_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_v4i64_unmasked:
; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
+; RV32-NEXT: lui a1, 349525
+; RV32-NEXT: addi a1, a1, 1365
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
+; RV32-NEXT: lui a1, 209715
+; RV32-NEXT: addi a1, a1, 819
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: lui a1, 4112
+; RV32-NEXT: addi a1, a1, 257
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: li a1, 1
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vsub.vx v10, v8, a1
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.i v12, -1
-; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT: vxor.vv v8, v8, v12
+; RV32-NEXT: vnot.v v8, v8
; RV32-NEXT: vand.vv v8, v8, v10
; RV32-NEXT: vsrl.vi v10, v8, 1
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
+; RV32-NEXT: addi a1, sp, 24
+; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vand.vv v10, v10, v12
; RV32-NEXT: vsub.vv v8, v8, v10
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vand.vv v12, v8, v10
; RV32-NEXT: vsrl.vi v8, v8, 2
@@ -1525,20 +1574,19 @@
; RV32-NEXT: vadd.vv v8, v12, v8
; RV32-NEXT: vsrl.vi v10, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v10
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
+; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a1
+; RV32-NEXT: mv a1, sp
+; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v10, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v10
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_v4i64_unmasked:
@@ -1589,26 +1637,39 @@ define <8 x i64> @vp_cttz_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_v8i64:
; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
+; RV32-NEXT: lui a1, 349525
+; RV32-NEXT: addi a1, a1, 1365
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
+; RV32-NEXT: lui a1, 209715
+; RV32-NEXT: addi a1, a1, 819
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: lui a1, 4112
+; RV32-NEXT: addi a1, a1, 257
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: li a1, 1
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vsub.vx v12, v8, a1, v0.t
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.i v16, -1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vxor.vv v8, v8, v16, v0.t
+; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: vand.vv v8, v8, v12, v0.t
; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
+; RV32-NEXT: addi a1, sp, 24
+; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vand.vv v12, v12, v16, v0.t
; RV32-NEXT: vsub.vv v8, v8, v12, v0.t
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vand.vv v16, v8, v12, v0.t
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
@@ -1616,20 +1677,19 @@
; RV32-NEXT: vadd.vv v8, v16, v8, v0.t
; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v12, v0.t
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
+; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vand.vv v8, v8, v12, v0.t
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
+; RV32-NEXT: mv a1, sp
+; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v12, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_v8i64:
@@ -1676,26 +1736,39 @@ define <8 x i64> @vp_cttz_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_v8i64_unmasked:
; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
+; RV32-NEXT: lui a1, 349525
+; RV32-NEXT: addi a1, a1, 1365
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
+; RV32-NEXT: lui a1, 209715
+; RV32-NEXT: addi a1, a1, 819
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: lui a1, 4112
+; RV32-NEXT: addi a1, a1, 257
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: li a1, 1
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vsub.vx v12, v8, a1
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.i v16, -1
-; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT: vxor.vv v8, v8, v16
+; RV32-NEXT: vnot.v v8, v8
; RV32-NEXT: vand.vv v8, v8, v12
; RV32-NEXT: vsrl.vi v12, v8, 1
-; RV32-NEXT: lui a1, 349525
-; RV32-NEXT: addi a1, a1, 1365
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
+; RV32-NEXT: addi a1, sp, 24
+; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vand.vv v12, v12, v16
; RV32-NEXT: vsub.vv v8, v8, v12
-; RV32-NEXT: lui a1, 209715
-; RV32-NEXT: addi a1, a1, 819
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vand.vv v16, v8, v12
; RV32-NEXT: vsrl.vi v8, v8, 2
@@ -1703,20 +1776,19 @@
; RV32-NEXT: vadd.vv v8, v16, v8
; RV32-NEXT: vsrl.vi v12, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v12
-; RV32-NEXT: lui a1, 61681
-; RV32-NEXT: addi a1, a1, -241
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
+; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vand.vv v8, v8, v12
-; RV32-NEXT: lui a1, 4112
-; RV32-NEXT: addi a1, a1, 257
-; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT: vmv.v.x v12, a1
+; RV32-NEXT: mv a1, sp
+; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v12, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v12
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_v8i64_unmasked:
@@ -1767,27 +1839,39 @@ define <15 x i64> @vp_cttz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_v15i64:
; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
+; RV32-NEXT: lui a1, 349525
+; RV32-NEXT: addi a1, a1, 1365
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
+; RV32-NEXT: lui a1, 209715
+; RV32-NEXT: addi a1, a1, 819
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: lui a1, 4112
+; RV32-NEXT: addi a1, a1, 257
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: li a1, 1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vx v16, v8, a1, v0.t
-; RV32-NEXT: li a1, 32
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.i v24, -1
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vxor.vv v8, v8, v24, v0.t
+; RV32-NEXT: vnot.v v8, v8, v0.t
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT: lui a2, 349525
-; RV32-NEXT: addi a2, a2, 1365
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: addi a1, sp, 24
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v24, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24, v0.t
; RV32-NEXT: vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT: lui a2, 209715
-; RV32-NEXT: addi a2, a2, 819
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a2
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v24, v8, v16, v0.t
; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t
@@ -1795,20 +1879,19 @@
; RV32-NEXT: vadd.vv v8, v24, v8, v0.t
; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t
; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT: lui a2, 61681
-; RV32-NEXT: addi a2, a2, -241
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a2
+; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v16, v0.t
-; RV32-NEXT: lui a2, 4112
-; RV32-NEXT: addi a2, a2, 257
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a2
+; RV32-NEXT: mv a1, sp
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v16, v0.t
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_v15i64:
@@ -1855,27 +1938,39 @@ define <15 x i64> @vp_cttz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_v15i64_unmasked:
; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
+; RV32-NEXT: lui a1, 349525
+; RV32-NEXT: addi a1, a1, 1365
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
+; RV32-NEXT: lui a1, 209715
+; RV32-NEXT: addi a1, a1, 819
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: lui a1, 4112
+; RV32-NEXT: addi a1, a1, 257
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a1, 0(sp)
; RV32-NEXT: li a1, 1
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vsub.vx v16, v8, a1
-; RV32-NEXT: li a1, 32
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.i v24, -1
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vxor.vv v8, v8, v24
+; RV32-NEXT: vnot.v v8, v8
; RV32-NEXT: vand.vv v8, v8, v16
; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: lui a2, 349525
-; RV32-NEXT: addi a2, a2, 1365
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a2
+; RV32-NEXT: addi a1, sp, 24
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v24, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v16, v16, v24
; RV32-NEXT: vsub.vv v8, v8, v16
-; RV32-NEXT: lui a2, 209715
-; RV32-NEXT: addi a2, a2, 819
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a2
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v24, v8, v16
; RV32-NEXT: vsrl.vi v8, v8, 2
@@ -1883,20 +1978,19 @@
; RV32-NEXT: vadd.vv v8, v24, v8
; RV32-NEXT: vsrl.vi v16, v8, 4
; RV32-NEXT: vadd.vv v8, v8, v16
-; RV32-NEXT: lui a2, 61681
-; RV32-NEXT: addi a2, a2, -241
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a2
+; RV32-NEXT: addi a1, sp, 8
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: lui a2, 4112
-; RV32-NEXT: addi a2, a2, 257
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a2
+; RV32-NEXT: mv a1, sp
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a1), zero
; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT: vmul.vv v8, v8, v16
; RV32-NEXT: li a0, 56
; RV32-NEXT: vsrl.vx v8, v8, a0
+; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: vp_cttz_v15i64_unmasked:
@@ -1947,27 +2041,39 @@ define <16 x i64> @vp_cttz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_cttz_v16i64:
; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: .cfi_def_cfa_offset 32
+; RV32-NEXT: lui a1, 349525
+; RV32-NEXT: addi a1, a1, 1365
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw a1, 24(sp)
+; RV32-NEXT: lui a1, 209715
+; RV32-NEXT: addi a1, a1, 819
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: sw a1, 16(sp)
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a1, 8(sp)
+; RV32-NEXT: lui a1, 4112
+; RV32-NEXT: addi a1, a1, 257
+;
RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsub.vx v16, v8, a1, v0.t -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v24, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV32-NEXT: lui a2, 349525 -; RV32-NEXT: addi a2, a2, 1365 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a2 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v16, v24, v0.t ; RV32-NEXT: vsub.vv v8, v8, v16, v0.t -; RV32-NEXT: lui a2, 209715 -; RV32-NEXT: addi a2, a2, 819 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v24, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t @@ -1975,20 +2081,19 @@ ; RV32-NEXT: vadd.vv v8, v24, v8, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: lui a2, 61681 -; RV32-NEXT: addi a2, a2, -241 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: lui a2, 4112 -; RV32-NEXT: addi a2, a2, 257 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_v16i64: @@ -2035,27 +2140,39 @@ define <16 x i64> @vp_cttz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_v16i64_unmasked: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsub.vx v16, v8, a1 -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v24 +; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: lui a2, 349525 -; RV32-NEXT: addi a2, a2, 1365 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a2 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; 
RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: lui a2, 209715 -; RV32-NEXT: addi a2, a2, 819 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v24, v8, v16 ; RV32-NEXT: vsrl.vi v8, v8, 2 @@ -2063,20 +2180,19 @@ ; RV32-NEXT: vadd.vv v8, v24, v8 ; RV32-NEXT: vsrl.vi v16, v8, 4 ; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: lui a2, 61681 -; RV32-NEXT: addi a2, a2, -241 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: lui a2, 4112 -; RV32-NEXT: addi a2, a2, 257 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_v16i64_unmasked: @@ -2127,165 +2243,155 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_v32i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: addi sp, sp, -48 +; RV32-NEXT: .cfi_def_cfa_offset 48 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: li a2, 56 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 64 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 40 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: addi a1, a1, 48 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv8r.v v16, v8 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: li a1, 16 ; RV32-NEXT: vslidedown.vi v24, v0, 2 -; RV32-NEXT: mv a2, a0 -; RV32-NEXT: bltu a0, a1, .LBB34_2 -; RV32-NEXT: # %bb.1: +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 44(sp) +; RV32-NEXT: sw a1, 40(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 36(sp) +; RV32-NEXT: sw a1, 32(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: sw a1, 20(sp) ; RV32-NEXT: li a2, 16 +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: bltu a0, a2, .LBB34_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a1, 16 ; RV32-NEXT: .LBB34_2: -; RV32-NEXT: li a1, 1 -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v8, v16, a1, v0.t +; RV32-NEXT: li a2, 1 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsub.vx v16, v8, a2, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 56 +; RV32-NEXT: li a4, 40 ; RV32-NEXT: 
mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: li a3, 32 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v8, -1 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vxor.vv v16, v16, v8, v0.t -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 56 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v16, v8, v0.t -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 48 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; RV32-NEXT: lui a4, 349525 -; RV32-NEXT: addi a4, a4, 1365 ; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t -; RV32-NEXT: csrr a5, vlenb -; RV32-NEXT: li a6, 56 -; RV32-NEXT: mul a5, a5, a6 -; RV32-NEXT: add a5, sp, a5 -; RV32-NEXT: addi a5, a5, 16 -; RV32-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v8, a4 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 24 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 56 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 48 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: addi a3, sp, 40 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v8, (a3), zero +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 24 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 48 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v16, v16, v8, v0.t -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 48 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 40 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vsub.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 48 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; RV32-NEXT: lui a4, 209715 -; RV32-NEXT: addi a4, a4, 819 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v8, a4 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 56 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size 
Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 48 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 40 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: addi a3, sp, 32 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v8, (a3), zero +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 48 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 40 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v16, v16, v8, v0.t -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 4 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 48 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 40 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t ; RV32-NEXT: vand.vv v16, v16, v8, v0.t -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 4 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vadd.vv v16, v8, v16, v0.t -; RV32-NEXT: lui a4, 61681 -; RV32-NEXT: addi a4, a4, -241 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v8, a4 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 48 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v16, v8, v0.t -; RV32-NEXT: lui a4, 4112 -; RV32-NEXT: addi a4, a4, 257 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a4 +; RV32-NEXT: vsrl.vi v8, v16, 4, v0.t +; RV32-NEXT: vadd.vv v8, v16, v8, v0.t +; RV32-NEXT: addi a3, sp, 24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a3), zero +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 40 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a3), zero ; RV32-NEXT: csrr a3, 
vlenb ; RV32-NEXT: slli a3, a3, 3 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: li a2, 56 -; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t +; RV32-NEXT: li a1, 56 +; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: addi a3, a0, -16 ; RV32-NEXT: sltu a0, a0, a3 @@ -2294,107 +2400,88 @@ ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vmv1r.v v0, v24 ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a3, 40 -; RV32-NEXT: mul a0, a0, a3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vx v8, v16, a1, v0.t -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vxor.vv v16, v16, v8, v0.t -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v16, v8, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vsub.vx v16, v8, a2, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: addi a0, sp, 48 ; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 24 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v16, v8, v16, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, sp, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vsub.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 56 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: li a2, 48 +; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v16, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t ; 
RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 56 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: li a2, 48 +; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vadd.vv v8, v16, v8, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 48 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: li a2, 40 +; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t +; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 6 +; RV32-NEXT: li a1, 56 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: addi sp, sp, 48 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_v32i64: @@ -2494,133 +2581,129 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_v32i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: li a2, 16 -; RV32-NEXT: vmv8r.v v0, v16 +; RV32-NEXT: addi sp, sp, -48 +; RV32-NEXT: .cfi_def_cfa_offset 48 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 32 * vlenb +; RV32-NEXT: vmv8r.v v24, v16 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 44(sp) +; RV32-NEXT: sw a1, 40(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 36(sp) +; RV32-NEXT: sw a1, 32(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: li a2, 16 +; RV32-NEXT: sw a1, 16(sp) ; RV32-NEXT: mv a1, a0 ; RV32-NEXT: bltu a0, a2, .LBB35_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a1, 16 ; RV32-NEXT: .LBB35_2: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 40 -; RV32-NEXT: mul a2, a2, a3 -; RV32-NEXT: sub sp, sp, a2 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb ; RV32-NEXT: li a2, 1 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vsub.vx v16, v8, a2 -; RV32-NEXT: li a3, 32 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 5 -; RV32-NEXT: add a4, sp, a4 -; 
RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v24 +; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: lui a4, 349525 -; RV32-NEXT: addi a4, a4, 1365 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a4 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 24 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill +; RV32-NEXT: addi a3, sp, 40 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v0, (a3), zero +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 24 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v0, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: vand.vv v16, v16, v0 ; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: lui a4, 209715 -; RV32-NEXT: addi a4, a4, 819 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a4 +; RV32-NEXT: addi a3, sp, 32 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v0, (a3), zero ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16 +; RV32-NEXT: vand.vv v16, v8, v0 ; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vadd.vv v8, v24, v8 -; RV32-NEXT: vsrl.vi v24, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v24 -; RV32-NEXT: lui a4, 61681 -; RV32-NEXT: addi a4, a4, -241 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a4 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 4 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vand.vv v8, v8, v0 +; RV32-NEXT: vadd.vv v8, v16, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 +; RV32-NEXT: addi a3, sp, 24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a3), zero +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v24 -; RV32-NEXT: lui a4, 4112 -; RV32-NEXT: addi a4, a4, 257 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a4 +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: addi a3, sp, 16 -; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a3), zero +; RV32-NEXT: addi a3, sp, 48 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v24 +; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 3 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: addi a3, a0, -16 ; RV32-NEXT: sltu a0, a0, a3 ; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: and a0, a0, a3 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v8, (a0) # 
Unknown-size Folded Reload -; RV32-NEXT: vxor.vv v8, v0, v8 -; RV32-NEXT: vsub.vx v0, v0, a2 -; RV32-NEXT: vand.vv v8, v8, v0 -; RV32-NEXT: vsrl.vi v0, v8, 1 +; RV32-NEXT: vsub.vx v8, v24, a2 +; RV32-NEXT: vnot.v v24, v24 +; RV32-NEXT: vand.vv v8, v24, v8 +; RV32-NEXT: vsrl.vi v24, v8, 1 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a2, 24 ; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v0, v0, v24 -; RV32-NEXT: vsub.vv v8, v8, v0 -; RV32-NEXT: vand.vv v0, v8, v16 +; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v24, v24, v16 +; RV32-NEXT: vsub.vv v8, v8, v24 +; RV32-NEXT: vand.vv v24, v8, v0 ; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vadd.vv v8, v0, v8 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 +; RV32-NEXT: vand.vv v8, v8, v0 +; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: addi a0, sp, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: vsrl.vx v16, v8, a1 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: addi sp, sp, 48 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_v32i64_unmasked: @@ -3895,26 +3978,39 @@ define <2 x i64> @vp_cttz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_zero_undef_v2i64: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vsub.vx v9, v8, a1, v0.t -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.i v10, -1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v10, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vand.vv v8, v8, v9, v0.t ; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v10, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vand.vv v9, v9, v10, v0.t ; RV32-NEXT: vsub.vv v8, v8, v9, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, 
a1 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vand.vv v10, v8, v9, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t @@ -3922,20 +4018,19 @@ ; RV32-NEXT: vadd.vv v8, v10, v8, v0.t ; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v9, v0.t -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 +; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v9, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_zero_undef_v2i64: @@ -3982,26 +4077,39 @@ define <2 x i64> @vp_cttz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_zero_undef_v2i64_unmasked: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vsub.vx v9, v8, a1 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.i v10, -1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v10 +; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: vand.vv v8, v8, v9 ; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v10, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vand.vv v9, v9, v10 ; RV32-NEXT: vsub.vv v8, v8, v9 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vand.vv v10, v8, v9 ; RV32-NEXT: vsrl.vi v8, v8, 2 @@ -4009,20 +4117,19 @@ ; RV32-NEXT: vadd.vv v8, v10, v8 ; RV32-NEXT: vsrl.vi v9, v8, 4 ; RV32-NEXT: vadd.vv v8, v8, v9 -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a1 +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; 
RV32-NEXT: vmv.v.x v9, a1 +; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v9 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_zero_undef_v2i64_unmasked: @@ -4071,26 +4178,39 @@ define <4 x i64> @vp_cttz_zero_undef_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_zero_undef_v4i64: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vsub.vx v10, v8, a1, v0.t -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.i v12, -1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v12, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vand.vv v8, v8, v10, v0.t ; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v12, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vand.vv v10, v10, v12, v0.t ; RV32-NEXT: vsub.vv v8, v8, v10, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vand.vv v12, v8, v10, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t @@ -4098,20 +4218,19 @@ ; RV32-NEXT: vadd.vv v8, v12, v8, v0.t ; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v10, v0.t -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vand.vv v8, v8, v10, v0.t -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v10, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_zero_undef_v4i64: @@ -4158,26 +4277,39 @@ define <4 x i64> @vp_cttz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_zero_undef_v4i64_unmasked: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: 
lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vsub.vx v10, v8, a1 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.i v12, -1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v12 +; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: vand.vv v8, v8, v10 ; RV32-NEXT: vsrl.vi v10, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v12, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vand.vv v10, v10, v12 ; RV32-NEXT: vsub.vv v8, v8, v10 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vand.vv v12, v8, v10 ; RV32-NEXT: vsrl.vi v8, v8, 2 @@ -4185,20 +4317,19 @@ ; RV32-NEXT: vadd.vv v8, v12, v8 ; RV32-NEXT: vsrl.vi v10, v8, 4 ; RV32-NEXT: vadd.vv v8, v8, v10 -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v10 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_zero_undef_v4i64_unmasked: @@ -4247,26 +4378,39 @@ define <8 x i64> @vp_cttz_zero_undef_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_zero_undef_v8i64: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vsub.vx v12, v8, a1, v0.t -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.i v16, -1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v16, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vand.vv v8, v8, v12, v0.t ; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma 
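
[Note] Every RV32 check block in these hunks encodes the same branchless recipe: cttz(x) = popcount(~x & (x - 1)), with the popcount done SWAR-style on 64-bit lanes. (vnot.v, which the new output uses in place of materializing an all-ones splat for vxor.vv, is the standard alias for vxor.vi vd, vs, -1.) A minimal scalar C sketch of that recipe follows; the masks are exactly the splats the RV32 code now builds on the stack, e.g. lui 349525 + addi 1365 = 0x55555555 per 32-bit half:

    #include <stdint.h>

    /* Scalar sketch of what the vector checks verify lane-by-lane:
     * cttz(x) = popcount(~x & (x - 1)); cttz(0) falls out as 64. */
    static uint64_t cttz64(uint64_t x) {
      x = ~x & (x - 1);                            /* vnot.v, vsub.vx, vand.vv */
      x -= (x >> 1) & 0x5555555555555555ULL;       /* lui 349525, addi 1365    */
      x = (x & 0x3333333333333333ULL) +
          ((x >> 2) & 0x3333333333333333ULL);      /* lui 209715, addi 819     */
      x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0FULL;  /* lui 61681, addi -241     */
      return (x * 0x0101010101010101ULL) >> 56;    /* lui 4112, addi 257;
                                                      vmul.vv + vsrl.vx 56     */
    }
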
-; RV32-NEXT: vmv.v.x v16, a1 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vand.vv v12, v12, v16, v0.t ; RV32-NEXT: vsub.vv v8, v8, v12, v0.t -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vand.vv v16, v8, v12, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t @@ -4274,20 +4418,19 @@ ; RV32-NEXT: vadd.vv v8, v16, v8, v0.t ; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v12, v0.t -; RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vand.vv v8, v8, v12, v0.t -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v12, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_zero_undef_v8i64: @@ -4334,26 +4477,39 @@ define <8 x i64> @vp_cttz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_zero_undef_v8i64_unmasked: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vsub.vx v12, v8, a1 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.i v16, -1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v16 +; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: vand.vv v8, v8, v12 ; RV32-NEXT: vsrl.vi v12, v8, 1 -; RV32-NEXT: lui a1, 349525 -; RV32-NEXT: addi a1, a1, 1365 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v16, a1 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vand.vv v12, v12, v16 ; RV32-NEXT: vsub.vv v8, v8, v12 -; RV32-NEXT: lui a1, 209715 -; RV32-NEXT: addi a1, a1, 819 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vand.vv v16, v8, v12 ; RV32-NEXT: vsrl.vi v8, v8, 2 @@ -4361,20 +4517,19 @@ ; RV32-NEXT: vadd.vv v8, v16, v8 ; RV32-NEXT: vsrl.vi v12, v8, 4 ; RV32-NEXT: vadd.vv v8, v8, v12 -; 
RV32-NEXT: lui a1, 61681 -; RV32-NEXT: addi a1, a1, -241 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: lui a1, 4112 -; RV32-NEXT: addi a1, a1, 257 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v12 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_zero_undef_v8i64_unmasked: @@ -4423,27 +4578,39 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_zero_undef_v15i64: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsub.vx v16, v8, a1, v0.t -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v24, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV32-NEXT: lui a2, 349525 -; RV32-NEXT: addi a2, a2, 1365 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a2 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v16, v24, v0.t ; RV32-NEXT: vsub.vv v8, v8, v16, v0.t -; RV32-NEXT: lui a2, 209715 -; RV32-NEXT: addi a2, a2, 819 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v24, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t @@ -4451,20 +4618,19 @@ ; RV32-NEXT: vadd.vv v8, v24, v8, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: lui a2, 61681 -; RV32-NEXT: addi a2, a2, -241 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: lui a2, 4112 -; RV32-NEXT: addi a2, a2, 257 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, 
v8, a0, v0.t +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_zero_undef_v15i64: @@ -4511,27 +4677,39 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_zero_undef_v15i64_unmasked: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsub.vx v16, v8, a1 -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v24 +; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: lui a2, 349525 -; RV32-NEXT: addi a2, a2, 1365 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a2 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: lui a2, 209715 -; RV32-NEXT: addi a2, a2, 819 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v24, v8, v16 ; RV32-NEXT: vsrl.vi v8, v8, 2 @@ -4539,20 +4717,19 @@ ; RV32-NEXT: vadd.vv v8, v24, v8 ; RV32-NEXT: vsrl.vi v16, v8, 4 ; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: lui a2, 61681 -; RV32-NEXT: addi a2, a2, -241 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: lui a2, 4112 -; RV32-NEXT: addi a2, a2, 257 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_zero_undef_v15i64_unmasked: @@ -4601,27 +4778,39 @@ define <16 x i64> @vp_cttz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_zero_undef_v16i64: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: sw a1, 4(sp) +; 
RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsub.vx v16, v8, a1, v0.t -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v24, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t ; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV32-NEXT: lui a2, 349525 -; RV32-NEXT: addi a2, a2, 1365 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a2 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v16, v24, v0.t ; RV32-NEXT: vsub.vv v8, v8, v16, v0.t -; RV32-NEXT: lui a2, 209715 -; RV32-NEXT: addi a2, a2, 819 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v24, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t @@ -4629,20 +4818,19 @@ ; RV32-NEXT: vadd.vv v8, v24, v8, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: lui a2, 61681 -; RV32-NEXT: addi a2, a2, -241 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: lui a2, 4112 -; RV32-NEXT: addi a2, a2, 257 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_zero_undef_v16i64: @@ -4689,27 +4877,39 @@ define <16 x i64> @vp_cttz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_zero_undef_v16i64_unmasked: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 20(sp) +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: sw a1, 4(sp) +; RV32-NEXT: sw a1, 0(sp) ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vsub.vx v16, v8, a1 -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v24 +; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: lui a2, 349525 -; RV32-NEXT: addi a2, a2, 1365 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a2 +; RV32-NEXT: addi a1, sp, 24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma 
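
[Note] The other behavior these updated checks pin down is how the 64-bit splat constants reach the vector unit on RV32: instead of repeated e32 vmv.v.x splats with vsetvli toggles, both 32-bit halves are stored to a stack slot once and a zero-stride vlse64.v broadcasts the combined doubleword at e64. A rough equivalent, assuming the v1.0 RVV C intrinsic naming (splat_u64 itself is an illustrative helper, not part of the test):

    #include <riscv_vector.h>
    #include <stdint.h>

    /* Hypothetical helper: broadcast a 64-bit constant on rv32gcv, where
     * no 64-bit GPR exists for vmv.v.x. Compiled for RV32 this lowers to
     * two sw stores plus a zero-stride vlse64.v, as in the checks above. */
    static vuint64m8_t splat_u64(uint32_t lo, uint32_t hi, size_t vl) {
      uint64_t c = ((uint64_t)hi << 32) | lo;      /* lands in a stack slot */
      return __riscv_vlse64_v_u64m8(&c, /*byte stride=*/0, vl);
    }
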
; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: lui a2, 209715 -; RV32-NEXT: addi a2, a2, 819 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v24, v8, v16 ; RV32-NEXT: vsrl.vi v8, v8, 2 @@ -4717,20 +4917,19 @@ ; RV32-NEXT: vadd.vv v8, v24, v8 ; RV32-NEXT: vsrl.vi v16, v8, 4 ; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: lui a2, 61681 -; RV32-NEXT: addi a2, a2, -241 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: addi a1, sp, 8 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: lui a2, 4112 -; RV32-NEXT: addi a2, a2, 257 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: mv a1, sp +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: li a0, 56 ; RV32-NEXT: vsrl.vx v8, v8, a0 +; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_zero_undef_v16i64_unmasked: @@ -4779,165 +4978,155 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_zero_undef_v32i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: addi sp, sp, -48 +; RV32-NEXT: .cfi_def_cfa_offset 48 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: li a2, 56 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 64 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 40 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: addi a1, a1, 48 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv8r.v v16, v8 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: li a1, 16 ; RV32-NEXT: vslidedown.vi v24, v0, 2 -; RV32-NEXT: mv a2, a0 -; RV32-NEXT: bltu a0, a1, .LBB70_2 -; RV32-NEXT: # %bb.1: +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 44(sp) +; RV32-NEXT: sw a1, 40(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 36(sp) +; RV32-NEXT: sw a1, 32(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: sw a1, 20(sp) ; RV32-NEXT: li a2, 16 +; RV32-NEXT: sw a1, 16(sp) +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: bltu a0, a2, .LBB70_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a1, 16 ; RV32-NEXT: .LBB70_2: -; RV32-NEXT: li a1, 1 -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v8, v16, a1, v0.t +; RV32-NEXT: li a2, 1 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsub.vx v16, v8, a2, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 56 +; 
RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: li a3, 32 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v8, -1 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vxor.vv v16, v16, v8, v0.t -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 56 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v16, v8, v0.t -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 48 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; RV32-NEXT: lui a4, 349525 -; RV32-NEXT: addi a4, a4, 1365 ; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t -; RV32-NEXT: csrr a5, vlenb -; RV32-NEXT: li a6, 56 -; RV32-NEXT: mul a5, a5, a6 -; RV32-NEXT: add a5, sp, a5 -; RV32-NEXT: addi a5, a5, 16 -; RV32-NEXT: vs8r.v v8, (a5) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v8, a4 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 24 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 56 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 48 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: addi a3, sp, 40 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v8, (a3), zero +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 24 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 48 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v16, v16, v8, v0.t -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 48 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 40 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vsub.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 48 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; RV32-NEXT: lui a4, 209715 -; RV32-NEXT: addi a4, a4, 819 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v8, a4 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 56 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: 
vs8r.v v8, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 48 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 40 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: addi a3, sp, 32 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v8, (a3), zero +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 48 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 40 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v16, v16, v8, v0.t -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 4 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 48 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vl8r.v v16, (a4) # Unknown-size Folded Reload +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 40 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vl8r.v v16, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t ; RV32-NEXT: vand.vv v16, v16, v8, v0.t -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 4 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vl8r.v v8, (a4) # Unknown-size Folded Reload -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vl8r.v v8, (a3) # Unknown-size Folded Reload ; RV32-NEXT: vadd.vv v16, v8, v16, v0.t -; RV32-NEXT: lui a4, 61681 -; RV32-NEXT: addi a4, a4, -241 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v8, a4 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 48 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v16, v8, v0.t -; RV32-NEXT: lui a4, 4112 -; RV32-NEXT: addi a4, a4, 257 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a4 +; RV32-NEXT: vsrl.vi v8, v16, 4, v0.t +; RV32-NEXT: vadd.vv v8, v16, v8, v0.t +; RV32-NEXT: addi a3, sp, 24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a3), zero +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 40 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, 
(a3), zero ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 3 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: li a2, 56 -; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t +; RV32-NEXT: li a1, 56 +; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: addi a3, a0, -16 ; RV32-NEXT: sltu a0, a0, a3 @@ -4946,107 +5135,88 @@ ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vmv1r.v v0, v24 ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a3, 40 -; RV32-NEXT: mul a0, a0, a3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vx v8, v16, a1, v0.t -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vxor.vv v16, v16, v8, v0.t -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v16, v8, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vsub.vx v16, v8, a2, v0.t +; RV32-NEXT: vnot.v v8, v8, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: addi a0, sp, 48 ; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 24 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v16, v8, v16, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, sp, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vsub.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 56 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: li a2, 48 +; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v16, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill ; 
RV32-NEXT: vsrl.vi v8, v8, 2, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 56 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: li a2, 48 +; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vadd.vv v8, v16, v8, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 48 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: li a2, 40 +; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t +; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 6 +; RV32-NEXT: li a1, 56 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: addi sp, sp, 48 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_zero_undef_v32i64: @@ -5146,133 +5316,129 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_zero_undef_v32i64_unmasked: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -48 +; RV32-NEXT: .cfi_def_cfa_offset 48 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 32 * vlenb +; RV32-NEXT: vmv8r.v v24, v16 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a1, a1, 1365 +; RV32-NEXT: sw a1, 44(sp) +; RV32-NEXT: sw a1, 40(sp) +; RV32-NEXT: lui a1, 209715 +; RV32-NEXT: addi a1, a1, 819 +; RV32-NEXT: sw a1, 36(sp) +; RV32-NEXT: sw a1, 32(sp) +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: sw a1, 28(sp) +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: lui a1, 4112 +; RV32-NEXT: addi a1, a1, 257 +; RV32-NEXT: sw a1, 20(sp) ; RV32-NEXT: li a2, 16 -; RV32-NEXT: vmv8r.v v0, v16 +; RV32-NEXT: sw a1, 16(sp) ; RV32-NEXT: mv a1, a0 ; RV32-NEXT: bltu a0, a2, .LBB71_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a1, 16 ; RV32-NEXT: .LBB71_2: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 40 -; RV32-NEXT: mul a2, a2, a3 -; RV32-NEXT: sub sp, sp, a2 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb ; RV32-NEXT: li a2, 1 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vsub.vx v16, v8, a2 -; RV32-NEXT: li a3, 32 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v24, -1 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, 
a4, 5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v24 +; RV32-NEXT: vnot.v v8, v8 ; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: lui a4, 349525 -; RV32-NEXT: addi a4, a4, 1365 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a4 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 24 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill +; RV32-NEXT: addi a3, sp, 40 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v0, (a3), zero +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 24 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v0, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: vand.vv v16, v16, v0 ; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: lui a4, 209715 -; RV32-NEXT: addi a4, a4, 819 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a4 +; RV32-NEXT: addi a3, sp, 32 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v0, (a3), zero ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16 +; RV32-NEXT: vand.vv v16, v8, v0 ; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vadd.vv v8, v24, v8 -; RV32-NEXT: vsrl.vi v24, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v24 -; RV32-NEXT: lui a4, 61681 -; RV32-NEXT: addi a4, a4, -241 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a4 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 4 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vand.vv v8, v8, v0 +; RV32-NEXT: vadd.vv v8, v16, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 +; RV32-NEXT: addi a3, sp, 24 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a3), zero +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v24 -; RV32-NEXT: lui a4, 4112 -; RV32-NEXT: addi a4, a4, 257 -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a4 +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: addi a3, sp, 16 -; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a3), zero +; RV32-NEXT: addi a3, sp, 48 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v24 +; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 3 ; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: addi a3, a0, -16 ; RV32-NEXT: sltu a0, a0, a3 ; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: and a0, a0, a3 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; 
RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vxor.vv v8, v0, v8 -; RV32-NEXT: vsub.vx v0, v0, a2 -; RV32-NEXT: vand.vv v8, v8, v0 -; RV32-NEXT: vsrl.vi v0, v8, 1 +; RV32-NEXT: vsub.vx v8, v24, a2 +; RV32-NEXT: vnot.v v24, v24 +; RV32-NEXT: vand.vv v8, v24, v8 +; RV32-NEXT: vsrl.vi v24, v8, 1 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a2, 24 ; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v0, v0, v24 -; RV32-NEXT: vsub.vv v8, v8, v0 -; RV32-NEXT: vand.vv v0, v8, v16 +; RV32-NEXT: addi a0, a0, 48 +; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v24, v24, v16 +; RV32-NEXT: vsub.vv v8, v8, v24 +; RV32-NEXT: vand.vv v24, v8, v0 ; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vadd.vv v8, v0, v8 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 +; RV32-NEXT: vand.vv v8, v8, v0 +; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: addi a0, sp, 48 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: vsrl.vx v16, v8, a1 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 48 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: addi sp, sp, 48 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_zero_undef_v32i64_unmasked: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll @@ -456,49 +456,52 @@ define void @cttz_v2i64(ptr %x, ptr %y) nounwind { ; LMULMAX2-RV32I-LABEL: cttz_v2i64: ; LMULMAX2-RV32I: # %bb.0: +; LMULMAX2-RV32I-NEXT: addi sp, sp, -32 ; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX2-RV32I-NEXT: vle64.v v8, (a0) -; LMULMAX2-RV32I-NEXT: li a1, 1 -; LMULMAX2-RV32I-NEXT: vsub.vx v9, v8, a1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX2-RV32I-NEXT: vmv.v.i v10, -1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX2-RV32I-NEXT: vxor.vv v8, v8, v10 -; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v9 -; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 1 ; LMULMAX2-RV32I-NEXT: lui a1, 349525 ; LMULMAX2-RV32I-NEXT: addi a1, a1, 1365 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX2-RV32I-NEXT: vmv.v.x v10, a1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX2-RV32I-NEXT: vand.vv v9, v9, v10 -; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: sw a1, 28(sp) +; LMULMAX2-RV32I-NEXT: sw a1, 24(sp) ; LMULMAX2-RV32I-NEXT: lui a1, 209715 ; LMULMAX2-RV32I-NEXT: addi a1, a1, 819 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX2-RV32I-NEXT: vmv.v.x v9, a1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX2-RV32I-NEXT: vand.vv v10, v8, v9 -; 
LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v9 -; LMULMAX2-RV32I-NEXT: vadd.vv v8, v10, v8 -; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: sw a1, 20(sp) +; LMULMAX2-RV32I-NEXT: sw a1, 16(sp) ; LMULMAX2-RV32I-NEXT: lui a1, 61681 ; LMULMAX2-RV32I-NEXT: addi a1, a1, -241 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX2-RV32I-NEXT: vmv.v.x v9, a1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: sw a1, 12(sp) +; LMULMAX2-RV32I-NEXT: sw a1, 8(sp) ; LMULMAX2-RV32I-NEXT: lui a1, 4112 ; LMULMAX2-RV32I-NEXT: addi a1, a1, 257 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX2-RV32I-NEXT: vmv.v.x v9, a1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX2-RV32I-NEXT: vmul.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: sw a1, 4(sp) +; LMULMAX2-RV32I-NEXT: sw a1, 0(sp) +; LMULMAX2-RV32I-NEXT: li a1, 1 +; LMULMAX2-RV32I-NEXT: vsub.vx v9, v8, a1 +; LMULMAX2-RV32I-NEXT: vnot.v v8, v8 +; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: addi a1, sp, 24 +; LMULMAX2-RV32I-NEXT: vlse64.v v9, (a1), zero +; LMULMAX2-RV32I-NEXT: addi a1, sp, 16 +; LMULMAX2-RV32I-NEXT: vlse64.v v10, (a1), zero +; LMULMAX2-RV32I-NEXT: vsrl.vi v11, v8, 1 +; LMULMAX2-RV32I-NEXT: vand.vv v9, v11, v9 +; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vand.vv v9, v8, v10 +; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v9, v8 +; LMULMAX2-RV32I-NEXT: addi a1, sp, 8 +; LMULMAX2-RV32I-NEXT: vlse64.v v9, (a1), zero +; LMULMAX2-RV32I-NEXT: mv a1, sp +; LMULMAX2-RV32I-NEXT: vlse64.v v10, (a1), zero +; LMULMAX2-RV32I-NEXT: vsrl.vi v11, v8, 4 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v11 +; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vmul.vv v8, v8, v10 ; LMULMAX2-RV32I-NEXT: li a1, 56 ; LMULMAX2-RV32I-NEXT: vsrl.vx v8, v8, a1 ; LMULMAX2-RV32I-NEXT: vse64.v v8, (a0) +; LMULMAX2-RV32I-NEXT: addi sp, sp, 32 ; LMULMAX2-RV32I-NEXT: ret ; ; LMULMAX2-RV64I-LABEL: cttz_v2i64: @@ -545,23 +548,20 @@ ; LMULMAX2-RV32F: # %bb.0: ; LMULMAX2-RV32F-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX2-RV32F-NEXT: vle64.v v8, (a0) -; LMULMAX2-RV32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX2-RV32F-NEXT: vmv.v.i v9, 0 -; LMULMAX2-RV32F-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX2-RV32F-NEXT: vmseq.vv v0, v8, v9 -; LMULMAX2-RV32F-NEXT: vsub.vv v9, v9, v8 -; LMULMAX2-RV32F-NEXT: vand.vv v8, v8, v9 +; LMULMAX2-RV32F-NEXT: vrsub.vi v9, v8, 0 +; LMULMAX2-RV32F-NEXT: vand.vv v9, v8, v9 ; LMULMAX2-RV32F-NEXT: fsrmi a1, 1 ; LMULMAX2-RV32F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; LMULMAX2-RV32F-NEXT: vfncvt.f.xu.w v9, v8 +; LMULMAX2-RV32F-NEXT: vfncvt.f.xu.w v10, v9 ; LMULMAX2-RV32F-NEXT: fsrm a1 -; LMULMAX2-RV32F-NEXT: vsrl.vi v8, v9, 23 +; LMULMAX2-RV32F-NEXT: vsrl.vi v9, v10, 23 ; LMULMAX2-RV32F-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; LMULMAX2-RV32F-NEXT: vzext.vf2 v9, v8 +; LMULMAX2-RV32F-NEXT: vzext.vf2 v10, v9 ; LMULMAX2-RV32F-NEXT: li a1, 127 -; LMULMAX2-RV32F-NEXT: vsub.vx v8, v9, a1 +; LMULMAX2-RV32F-NEXT: vsub.vx v9, v10, a1 +; LMULMAX2-RV32F-NEXT: vmseq.vi v0, v8, 0 ; LMULMAX2-RV32F-NEXT: li a1, 64 -; LMULMAX2-RV32F-NEXT: vmerge.vxm v8, v8, a1, v0 +; LMULMAX2-RV32F-NEXT: vmerge.vxm v8, v9, a1, v0 ; LMULMAX2-RV32F-NEXT: vse64.v v8, (a0) ; LMULMAX2-RV32F-NEXT: ret ; @@ -589,21 +589,18 @@ ; LMULMAX2-RV32D: 
# %bb.0: ; LMULMAX2-RV32D-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX2-RV32D-NEXT: vle64.v v8, (a0) -; LMULMAX2-RV32D-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX2-RV32D-NEXT: vmv.v.i v9, 0 -; LMULMAX2-RV32D-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX2-RV32D-NEXT: vmseq.vv v0, v8, v9 -; LMULMAX2-RV32D-NEXT: vsub.vv v9, v9, v8 -; LMULMAX2-RV32D-NEXT: vand.vv v8, v8, v9 +; LMULMAX2-RV32D-NEXT: vrsub.vi v9, v8, 0 +; LMULMAX2-RV32D-NEXT: vand.vv v9, v8, v9 ; LMULMAX2-RV32D-NEXT: fsrmi a1, 1 -; LMULMAX2-RV32D-NEXT: vfcvt.f.xu.v v8, v8 +; LMULMAX2-RV32D-NEXT: vfcvt.f.xu.v v9, v9 ; LMULMAX2-RV32D-NEXT: fsrm a1 ; LMULMAX2-RV32D-NEXT: li a1, 52 -; LMULMAX2-RV32D-NEXT: vsrl.vx v8, v8, a1 +; LMULMAX2-RV32D-NEXT: vsrl.vx v9, v9, a1 ; LMULMAX2-RV32D-NEXT: li a1, 1023 -; LMULMAX2-RV32D-NEXT: vsub.vx v8, v8, a1 +; LMULMAX2-RV32D-NEXT: vsub.vx v9, v9, a1 +; LMULMAX2-RV32D-NEXT: vmseq.vi v0, v8, 0 ; LMULMAX2-RV32D-NEXT: li a1, 64 -; LMULMAX2-RV32D-NEXT: vmerge.vxm v8, v8, a1, v0 +; LMULMAX2-RV32D-NEXT: vmerge.vxm v8, v9, a1, v0 ; LMULMAX2-RV32D-NEXT: vse64.v v8, (a0) ; LMULMAX2-RV32D-NEXT: ret ; @@ -626,46 +623,24 @@ ; LMULMAX2-RV64D-NEXT: vse64.v v8, (a0) ; LMULMAX2-RV64D-NEXT: ret ; -; LMULMAX8-RV32-LABEL: cttz_v2i64: -; LMULMAX8-RV32: # %bb.0: -; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX8-RV32-NEXT: vle64.v v8, (a0) -; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX8-RV32-NEXT: vmv.v.i v9, 0 -; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX8-RV32-NEXT: vmseq.vv v0, v8, v9 -; LMULMAX8-RV32-NEXT: vsub.vv v9, v9, v8 -; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v9 -; LMULMAX8-RV32-NEXT: fsrmi a1, 1 -; LMULMAX8-RV32-NEXT: vfcvt.f.xu.v v8, v8 -; LMULMAX8-RV32-NEXT: fsrm a1 -; LMULMAX8-RV32-NEXT: li a1, 52 -; LMULMAX8-RV32-NEXT: vsrl.vx v8, v8, a1 -; LMULMAX8-RV32-NEXT: li a1, 1023 -; LMULMAX8-RV32-NEXT: vsub.vx v8, v8, a1 -; LMULMAX8-RV32-NEXT: li a1, 64 -; LMULMAX8-RV32-NEXT: vmerge.vxm v8, v8, a1, v0 -; LMULMAX8-RV32-NEXT: vse64.v v8, (a0) -; LMULMAX8-RV32-NEXT: ret -; -; LMULMAX8-RV64-LABEL: cttz_v2i64: -; LMULMAX8-RV64: # %bb.0: -; LMULMAX8-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX8-RV64-NEXT: vle64.v v8, (a0) -; LMULMAX8-RV64-NEXT: vrsub.vi v9, v8, 0 -; LMULMAX8-RV64-NEXT: vand.vv v9, v8, v9 -; LMULMAX8-RV64-NEXT: fsrmi a1, 1 -; LMULMAX8-RV64-NEXT: vfcvt.f.xu.v v9, v9 -; LMULMAX8-RV64-NEXT: fsrm a1 -; LMULMAX8-RV64-NEXT: li a1, 52 -; LMULMAX8-RV64-NEXT: vsrl.vx v9, v9, a1 -; LMULMAX8-RV64-NEXT: li a1, 1023 -; LMULMAX8-RV64-NEXT: vsub.vx v9, v9, a1 -; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0 -; LMULMAX8-RV64-NEXT: li a1, 64 -; LMULMAX8-RV64-NEXT: vmerge.vxm v8, v9, a1, v0 -; LMULMAX8-RV64-NEXT: vse64.v v8, (a0) -; LMULMAX8-RV64-NEXT: ret +; LMULMAX8-LABEL: cttz_v2i64: +; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX8-NEXT: vle64.v v8, (a0) +; LMULMAX8-NEXT: vrsub.vi v9, v8, 0 +; LMULMAX8-NEXT: vand.vv v9, v8, v9 +; LMULMAX8-NEXT: fsrmi a1, 1 +; LMULMAX8-NEXT: vfcvt.f.xu.v v9, v9 +; LMULMAX8-NEXT: fsrm a1 +; LMULMAX8-NEXT: li a1, 52 +; LMULMAX8-NEXT: vsrl.vx v9, v9, a1 +; LMULMAX8-NEXT: li a1, 1023 +; LMULMAX8-NEXT: vsub.vx v9, v9, a1 +; LMULMAX8-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-NEXT: li a1, 64 +; LMULMAX8-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX8-NEXT: vse64.v v8, (a0) +; LMULMAX8-NEXT: ret ; ; ZVBB-LABEL: cttz_v2i64: ; ZVBB: # %bb.0: @@ -1140,49 +1115,52 @@ define void @cttz_v4i64(ptr %x, ptr %y) nounwind { ; LMULMAX2-RV32I-LABEL: cttz_v4i64: ; LMULMAX2-RV32I: # %bb.0: +; 
LMULMAX2-RV32I-NEXT: addi sp, sp, -32 ; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; LMULMAX2-RV32I-NEXT: vle64.v v8, (a0) -; LMULMAX2-RV32I-NEXT: li a1, 1 -; LMULMAX2-RV32I-NEXT: vsub.vx v10, v8, a1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32I-NEXT: vmv.v.i v12, -1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32I-NEXT: vxor.vv v8, v8, v12 -; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v10 -; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 1 ; LMULMAX2-RV32I-NEXT: lui a1, 349525 ; LMULMAX2-RV32I-NEXT: addi a1, a1, 1365 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32I-NEXT: vmv.v.x v12, a1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32I-NEXT: vand.vv v10, v10, v12 -; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: sw a1, 28(sp) +; LMULMAX2-RV32I-NEXT: sw a1, 24(sp) ; LMULMAX2-RV32I-NEXT: lui a1, 209715 ; LMULMAX2-RV32I-NEXT: addi a1, a1, 819 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32I-NEXT: vmv.v.x v10, a1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32I-NEXT: vand.vv v12, v8, v10 -; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v10 -; LMULMAX2-RV32I-NEXT: vadd.vv v8, v12, v8 -; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: sw a1, 20(sp) +; LMULMAX2-RV32I-NEXT: sw a1, 16(sp) ; LMULMAX2-RV32I-NEXT: lui a1, 61681 ; LMULMAX2-RV32I-NEXT: addi a1, a1, -241 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32I-NEXT: vmv.v.x v10, a1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: sw a1, 12(sp) +; LMULMAX2-RV32I-NEXT: sw a1, 8(sp) ; LMULMAX2-RV32I-NEXT: lui a1, 4112 ; LMULMAX2-RV32I-NEXT: addi a1, a1, 257 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32I-NEXT: vmv.v.x v10, a1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32I-NEXT: vmul.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: sw a1, 4(sp) +; LMULMAX2-RV32I-NEXT: sw a1, 0(sp) +; LMULMAX2-RV32I-NEXT: li a1, 1 +; LMULMAX2-RV32I-NEXT: vsub.vx v10, v8, a1 +; LMULMAX2-RV32I-NEXT: vnot.v v8, v8 +; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: addi a1, sp, 24 +; LMULMAX2-RV32I-NEXT: vlse64.v v10, (a1), zero +; LMULMAX2-RV32I-NEXT: addi a1, sp, 16 +; LMULMAX2-RV32I-NEXT: vlse64.v v12, (a1), zero +; LMULMAX2-RV32I-NEXT: vsrl.vi v14, v8, 1 +; LMULMAX2-RV32I-NEXT: vand.vv v10, v14, v10 +; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vand.vv v10, v8, v12 +; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v12 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v10, v8 +; LMULMAX2-RV32I-NEXT: addi a1, sp, 8 +; LMULMAX2-RV32I-NEXT: vlse64.v v10, (a1), zero +; LMULMAX2-RV32I-NEXT: mv a1, sp +; LMULMAX2-RV32I-NEXT: vlse64.v v12, (a1), zero +; LMULMAX2-RV32I-NEXT: vsrl.vi v14, v8, 4 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v14 +; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vmul.vv v8, v8, v12 ; LMULMAX2-RV32I-NEXT: li a1, 56 ; LMULMAX2-RV32I-NEXT: vsrl.vx v8, v8, a1 ; LMULMAX2-RV32I-NEXT: vse64.v v8, (a0) +; LMULMAX2-RV32I-NEXT: addi sp, sp, 32 ; LMULMAX2-RV32I-NEXT: ret ; ; LMULMAX2-RV64I-LABEL: cttz_v4i64: @@ -1229,23 +1207,20 @@ ; LMULMAX2-RV32F: # %bb.0: ; LMULMAX2-RV32F-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; LMULMAX2-RV32F-NEXT: vle64.v v8, (a0) -; 
LMULMAX2-RV32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32F-NEXT: vmv.v.i v10, 0 -; LMULMAX2-RV32F-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32F-NEXT: vmseq.vv v0, v8, v10 -; LMULMAX2-RV32F-NEXT: vsub.vv v10, v10, v8 -; LMULMAX2-RV32F-NEXT: vand.vv v8, v8, v10 +; LMULMAX2-RV32F-NEXT: vrsub.vi v10, v8, 0 +; LMULMAX2-RV32F-NEXT: vand.vv v10, v8, v10 ; LMULMAX2-RV32F-NEXT: fsrmi a1, 1 ; LMULMAX2-RV32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; LMULMAX2-RV32F-NEXT: vfncvt.f.xu.w v10, v8 +; LMULMAX2-RV32F-NEXT: vfncvt.f.xu.w v12, v10 ; LMULMAX2-RV32F-NEXT: fsrm a1 -; LMULMAX2-RV32F-NEXT: vsrl.vi v8, v10, 23 +; LMULMAX2-RV32F-NEXT: vsrl.vi v10, v12, 23 ; LMULMAX2-RV32F-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; LMULMAX2-RV32F-NEXT: vzext.vf2 v10, v8 +; LMULMAX2-RV32F-NEXT: vzext.vf2 v12, v10 ; LMULMAX2-RV32F-NEXT: li a1, 127 -; LMULMAX2-RV32F-NEXT: vsub.vx v8, v10, a1 +; LMULMAX2-RV32F-NEXT: vsub.vx v10, v12, a1 +; LMULMAX2-RV32F-NEXT: vmseq.vi v0, v8, 0 ; LMULMAX2-RV32F-NEXT: li a1, 64 -; LMULMAX2-RV32F-NEXT: vmerge.vxm v8, v8, a1, v0 +; LMULMAX2-RV32F-NEXT: vmerge.vxm v8, v10, a1, v0 ; LMULMAX2-RV32F-NEXT: vse64.v v8, (a0) ; LMULMAX2-RV32F-NEXT: ret ; @@ -1273,21 +1248,18 @@ ; LMULMAX2-RV32D: # %bb.0: ; LMULMAX2-RV32D-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; LMULMAX2-RV32D-NEXT: vle64.v v8, (a0) -; LMULMAX2-RV32D-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32D-NEXT: vmv.v.i v10, 0 -; LMULMAX2-RV32D-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32D-NEXT: vmseq.vv v0, v8, v10 -; LMULMAX2-RV32D-NEXT: vsub.vv v10, v10, v8 -; LMULMAX2-RV32D-NEXT: vand.vv v8, v8, v10 +; LMULMAX2-RV32D-NEXT: vrsub.vi v10, v8, 0 +; LMULMAX2-RV32D-NEXT: vand.vv v10, v8, v10 ; LMULMAX2-RV32D-NEXT: fsrmi a1, 1 -; LMULMAX2-RV32D-NEXT: vfcvt.f.xu.v v8, v8 +; LMULMAX2-RV32D-NEXT: vfcvt.f.xu.v v10, v10 ; LMULMAX2-RV32D-NEXT: fsrm a1 ; LMULMAX2-RV32D-NEXT: li a1, 52 -; LMULMAX2-RV32D-NEXT: vsrl.vx v8, v8, a1 +; LMULMAX2-RV32D-NEXT: vsrl.vx v10, v10, a1 ; LMULMAX2-RV32D-NEXT: li a1, 1023 -; LMULMAX2-RV32D-NEXT: vsub.vx v8, v8, a1 +; LMULMAX2-RV32D-NEXT: vsub.vx v10, v10, a1 +; LMULMAX2-RV32D-NEXT: vmseq.vi v0, v8, 0 ; LMULMAX2-RV32D-NEXT: li a1, 64 -; LMULMAX2-RV32D-NEXT: vmerge.vxm v8, v8, a1, v0 +; LMULMAX2-RV32D-NEXT: vmerge.vxm v8, v10, a1, v0 ; LMULMAX2-RV32D-NEXT: vse64.v v8, (a0) ; LMULMAX2-RV32D-NEXT: ret ; @@ -1310,46 +1282,24 @@ ; LMULMAX2-RV64D-NEXT: vse64.v v8, (a0) ; LMULMAX2-RV64D-NEXT: ret ; -; LMULMAX8-RV32-LABEL: cttz_v4i64: -; LMULMAX8-RV32: # %bb.0: -; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX8-RV32-NEXT: vle64.v v8, (a0) -; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX8-RV32-NEXT: vmv.v.i v10, 0 -; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX8-RV32-NEXT: vmseq.vv v0, v8, v10 -; LMULMAX8-RV32-NEXT: vsub.vv v10, v10, v8 -; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v10 -; LMULMAX8-RV32-NEXT: fsrmi a1, 1 -; LMULMAX8-RV32-NEXT: vfcvt.f.xu.v v8, v8 -; LMULMAX8-RV32-NEXT: fsrm a1 -; LMULMAX8-RV32-NEXT: li a1, 52 -; LMULMAX8-RV32-NEXT: vsrl.vx v8, v8, a1 -; LMULMAX8-RV32-NEXT: li a1, 1023 -; LMULMAX8-RV32-NEXT: vsub.vx v8, v8, a1 -; LMULMAX8-RV32-NEXT: li a1, 64 -; LMULMAX8-RV32-NEXT: vmerge.vxm v8, v8, a1, v0 -; LMULMAX8-RV32-NEXT: vse64.v v8, (a0) -; LMULMAX8-RV32-NEXT: ret -; -; LMULMAX8-RV64-LABEL: cttz_v4i64: -; LMULMAX8-RV64: # %bb.0: -; LMULMAX8-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX8-RV64-NEXT: vle64.v v8, (a0) -; LMULMAX8-RV64-NEXT: vrsub.vi v10, v8, 0 -; LMULMAX8-RV64-NEXT: vand.vv v10, 
v8, v10 -; LMULMAX8-RV64-NEXT: fsrmi a1, 1 -; LMULMAX8-RV64-NEXT: vfcvt.f.xu.v v10, v10 -; LMULMAX8-RV64-NEXT: fsrm a1 -; LMULMAX8-RV64-NEXT: li a1, 52 -; LMULMAX8-RV64-NEXT: vsrl.vx v10, v10, a1 -; LMULMAX8-RV64-NEXT: li a1, 1023 -; LMULMAX8-RV64-NEXT: vsub.vx v10, v10, a1 -; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0 -; LMULMAX8-RV64-NEXT: li a1, 64 -; LMULMAX8-RV64-NEXT: vmerge.vxm v8, v10, a1, v0 -; LMULMAX8-RV64-NEXT: vse64.v v8, (a0) -; LMULMAX8-RV64-NEXT: ret +; LMULMAX8-LABEL: cttz_v4i64: +; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; LMULMAX8-NEXT: vle64.v v8, (a0) +; LMULMAX8-NEXT: vrsub.vi v10, v8, 0 +; LMULMAX8-NEXT: vand.vv v10, v8, v10 +; LMULMAX8-NEXT: fsrmi a1, 1 +; LMULMAX8-NEXT: vfcvt.f.xu.v v10, v10 +; LMULMAX8-NEXT: fsrm a1 +; LMULMAX8-NEXT: li a1, 52 +; LMULMAX8-NEXT: vsrl.vx v10, v10, a1 +; LMULMAX8-NEXT: li a1, 1023 +; LMULMAX8-NEXT: vsub.vx v10, v10, a1 +; LMULMAX8-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-NEXT: li a1, 64 +; LMULMAX8-NEXT: vmerge.vxm v8, v10, a1, v0 +; LMULMAX8-NEXT: vse64.v v8, (a0) +; LMULMAX8-NEXT: ret ; ; ZVBB-LABEL: cttz_v4i64: ; ZVBB: # %bb.0: @@ -1773,49 +1723,52 @@ define void @cttz_zero_undef_v2i64(ptr %x, ptr %y) nounwind { ; LMULMAX2-RV32I-LABEL: cttz_zero_undef_v2i64: ; LMULMAX2-RV32I: # %bb.0: +; LMULMAX2-RV32I-NEXT: addi sp, sp, -32 ; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX2-RV32I-NEXT: vle64.v v8, (a0) -; LMULMAX2-RV32I-NEXT: li a1, 1 -; LMULMAX2-RV32I-NEXT: vsub.vx v9, v8, a1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX2-RV32I-NEXT: vmv.v.i v10, -1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX2-RV32I-NEXT: vxor.vv v8, v8, v10 -; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v9 -; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 1 ; LMULMAX2-RV32I-NEXT: lui a1, 349525 ; LMULMAX2-RV32I-NEXT: addi a1, a1, 1365 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX2-RV32I-NEXT: vmv.v.x v10, a1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX2-RV32I-NEXT: vand.vv v9, v9, v10 -; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: sw a1, 28(sp) +; LMULMAX2-RV32I-NEXT: sw a1, 24(sp) ; LMULMAX2-RV32I-NEXT: lui a1, 209715 ; LMULMAX2-RV32I-NEXT: addi a1, a1, 819 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX2-RV32I-NEXT: vmv.v.x v9, a1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX2-RV32I-NEXT: vand.vv v10, v8, v9 -; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v9 -; LMULMAX2-RV32I-NEXT: vadd.vv v8, v10, v8 -; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: sw a1, 20(sp) +; LMULMAX2-RV32I-NEXT: sw a1, 16(sp) ; LMULMAX2-RV32I-NEXT: lui a1, 61681 ; LMULMAX2-RV32I-NEXT: addi a1, a1, -241 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX2-RV32I-NEXT: vmv.v.x v9, a1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: sw a1, 12(sp) +; LMULMAX2-RV32I-NEXT: sw a1, 8(sp) ; LMULMAX2-RV32I-NEXT: lui a1, 4112 ; LMULMAX2-RV32I-NEXT: addi a1, a1, 257 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX2-RV32I-NEXT: vmv.v.x v9, a1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX2-RV32I-NEXT: vmul.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: sw a1, 4(sp) +; LMULMAX2-RV32I-NEXT: sw a1, 0(sp) +; LMULMAX2-RV32I-NEXT: li a1, 1 +; LMULMAX2-RV32I-NEXT: vsub.vx v9, v8, a1 +; LMULMAX2-RV32I-NEXT: 
vnot.v v8, v8 +; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: addi a1, sp, 24 +; LMULMAX2-RV32I-NEXT: vlse64.v v9, (a1), zero +; LMULMAX2-RV32I-NEXT: addi a1, sp, 16 +; LMULMAX2-RV32I-NEXT: vlse64.v v10, (a1), zero +; LMULMAX2-RV32I-NEXT: vsrl.vi v11, v8, 1 +; LMULMAX2-RV32I-NEXT: vand.vv v9, v11, v9 +; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vand.vv v9, v8, v10 +; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v9, v8 +; LMULMAX2-RV32I-NEXT: addi a1, sp, 8 +; LMULMAX2-RV32I-NEXT: vlse64.v v9, (a1), zero +; LMULMAX2-RV32I-NEXT: mv a1, sp +; LMULMAX2-RV32I-NEXT: vlse64.v v10, (a1), zero +; LMULMAX2-RV32I-NEXT: vsrl.vi v11, v8, 4 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v11 +; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vmul.vv v8, v8, v10 ; LMULMAX2-RV32I-NEXT: li a1, 56 ; LMULMAX2-RV32I-NEXT: vsrl.vx v8, v8, a1 ; LMULMAX2-RV32I-NEXT: vse64.v v8, (a0) +; LMULMAX2-RV32I-NEXT: addi sp, sp, 32 ; LMULMAX2-RV32I-NEXT: ret ; ; LMULMAX2-RV64I-LABEL: cttz_zero_undef_v2i64: @@ -1862,10 +1815,7 @@ ; LMULMAX2-RV32F: # %bb.0: ; LMULMAX2-RV32F-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX2-RV32F-NEXT: vle64.v v8, (a0) -; LMULMAX2-RV32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX2-RV32F-NEXT: vmv.v.i v9, 0 -; LMULMAX2-RV32F-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX2-RV32F-NEXT: vsub.vv v9, v9, v8 +; LMULMAX2-RV32F-NEXT: vrsub.vi v9, v8, 0 ; LMULMAX2-RV32F-NEXT: vand.vv v8, v8, v9 ; LMULMAX2-RV32F-NEXT: fsrmi a1, 1 ; LMULMAX2-RV32F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -1899,10 +1849,7 @@ ; LMULMAX2-RV32D: # %bb.0: ; LMULMAX2-RV32D-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX2-RV32D-NEXT: vle64.v v8, (a0) -; LMULMAX2-RV32D-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX2-RV32D-NEXT: vmv.v.i v9, 0 -; LMULMAX2-RV32D-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX2-RV32D-NEXT: vsub.vv v9, v9, v8 +; LMULMAX2-RV32D-NEXT: vrsub.vi v9, v8, 0 ; LMULMAX2-RV32D-NEXT: vand.vv v8, v8, v9 ; LMULMAX2-RV32D-NEXT: fsrmi a1, 1 ; LMULMAX2-RV32D-NEXT: vfcvt.f.xu.v v8, v8 @@ -1930,40 +1877,21 @@ ; LMULMAX2-RV64D-NEXT: vse64.v v8, (a0) ; LMULMAX2-RV64D-NEXT: ret ; -; LMULMAX8-RV32-LABEL: cttz_zero_undef_v2i64: -; LMULMAX8-RV32: # %bb.0: -; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX8-RV32-NEXT: vle64.v v8, (a0) -; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX8-RV32-NEXT: vmv.v.i v9, 0 -; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX8-RV32-NEXT: vsub.vv v9, v9, v8 -; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v9 -; LMULMAX8-RV32-NEXT: fsrmi a1, 1 -; LMULMAX8-RV32-NEXT: vfcvt.f.xu.v v8, v8 -; LMULMAX8-RV32-NEXT: fsrm a1 -; LMULMAX8-RV32-NEXT: li a1, 52 -; LMULMAX8-RV32-NEXT: vsrl.vx v8, v8, a1 -; LMULMAX8-RV32-NEXT: li a1, 1023 -; LMULMAX8-RV32-NEXT: vsub.vx v8, v8, a1 -; LMULMAX8-RV32-NEXT: vse64.v v8, (a0) -; LMULMAX8-RV32-NEXT: ret -; -; LMULMAX8-RV64-LABEL: cttz_zero_undef_v2i64: -; LMULMAX8-RV64: # %bb.0: -; LMULMAX8-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX8-RV64-NEXT: vle64.v v8, (a0) -; LMULMAX8-RV64-NEXT: vrsub.vi v9, v8, 0 -; LMULMAX8-RV64-NEXT: vand.vv v8, v8, v9 -; LMULMAX8-RV64-NEXT: fsrmi a1, 1 -; LMULMAX8-RV64-NEXT: vfcvt.f.xu.v v8, v8 -; LMULMAX8-RV64-NEXT: fsrm a1 -; LMULMAX8-RV64-NEXT: li a1, 52 -; LMULMAX8-RV64-NEXT: vsrl.vx v8, v8, a1 -; LMULMAX8-RV64-NEXT: li a1, 1023 -; LMULMAX8-RV64-NEXT: vsub.vx v8, v8, a1 -; LMULMAX8-RV64-NEXT: vse64.v v8, (a0) -; LMULMAX8-RV64-NEXT: 
ret +; LMULMAX8-LABEL: cttz_zero_undef_v2i64: +; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX8-NEXT: vle64.v v8, (a0) +; LMULMAX8-NEXT: vrsub.vi v9, v8, 0 +; LMULMAX8-NEXT: vand.vv v8, v8, v9 +; LMULMAX8-NEXT: fsrmi a1, 1 +; LMULMAX8-NEXT: vfcvt.f.xu.v v8, v8 +; LMULMAX8-NEXT: fsrm a1 +; LMULMAX8-NEXT: li a1, 52 +; LMULMAX8-NEXT: vsrl.vx v8, v8, a1 +; LMULMAX8-NEXT: li a1, 1023 +; LMULMAX8-NEXT: vsub.vx v8, v8, a1 +; LMULMAX8-NEXT: vse64.v v8, (a0) +; LMULMAX8-NEXT: ret ; ; ZVBB-LABEL: cttz_zero_undef_v2i64: ; ZVBB: # %bb.0: @@ -2414,49 +2342,52 @@ define void @cttz_zero_undef_v4i64(ptr %x, ptr %y) nounwind { ; LMULMAX2-RV32I-LABEL: cttz_zero_undef_v4i64: ; LMULMAX2-RV32I: # %bb.0: +; LMULMAX2-RV32I-NEXT: addi sp, sp, -32 ; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; LMULMAX2-RV32I-NEXT: vle64.v v8, (a0) -; LMULMAX2-RV32I-NEXT: li a1, 1 -; LMULMAX2-RV32I-NEXT: vsub.vx v10, v8, a1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32I-NEXT: vmv.v.i v12, -1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32I-NEXT: vxor.vv v8, v8, v12 -; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v10 -; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 1 ; LMULMAX2-RV32I-NEXT: lui a1, 349525 ; LMULMAX2-RV32I-NEXT: addi a1, a1, 1365 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32I-NEXT: vmv.v.x v12, a1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32I-NEXT: vand.vv v10, v10, v12 -; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: sw a1, 28(sp) +; LMULMAX2-RV32I-NEXT: sw a1, 24(sp) ; LMULMAX2-RV32I-NEXT: lui a1, 209715 ; LMULMAX2-RV32I-NEXT: addi a1, a1, 819 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32I-NEXT: vmv.v.x v10, a1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32I-NEXT: vand.vv v12, v8, v10 -; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v10 -; LMULMAX2-RV32I-NEXT: vadd.vv v8, v12, v8 -; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: sw a1, 20(sp) +; LMULMAX2-RV32I-NEXT: sw a1, 16(sp) ; LMULMAX2-RV32I-NEXT: lui a1, 61681 ; LMULMAX2-RV32I-NEXT: addi a1, a1, -241 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32I-NEXT: vmv.v.x v10, a1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: sw a1, 12(sp) +; LMULMAX2-RV32I-NEXT: sw a1, 8(sp) ; LMULMAX2-RV32I-NEXT: lui a1, 4112 ; LMULMAX2-RV32I-NEXT: addi a1, a1, 257 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32I-NEXT: vmv.v.x v10, a1 -; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32I-NEXT: vmul.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: sw a1, 4(sp) +; LMULMAX2-RV32I-NEXT: sw a1, 0(sp) +; LMULMAX2-RV32I-NEXT: li a1, 1 +; LMULMAX2-RV32I-NEXT: vsub.vx v10, v8, a1 +; LMULMAX2-RV32I-NEXT: vnot.v v8, v8 +; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: addi a1, sp, 24 +; LMULMAX2-RV32I-NEXT: vlse64.v v10, (a1), zero +; LMULMAX2-RV32I-NEXT: addi a1, sp, 16 +; LMULMAX2-RV32I-NEXT: vlse64.v v12, (a1), zero +; LMULMAX2-RV32I-NEXT: vsrl.vi v14, v8, 1 +; LMULMAX2-RV32I-NEXT: vand.vv v10, v14, v10 +; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vand.vv v10, v8, v12 +; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v12 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v10, v8 +; 
LMULMAX2-RV32I-NEXT: addi a1, sp, 8 +; LMULMAX2-RV32I-NEXT: vlse64.v v10, (a1), zero +; LMULMAX2-RV32I-NEXT: mv a1, sp +; LMULMAX2-RV32I-NEXT: vlse64.v v12, (a1), zero +; LMULMAX2-RV32I-NEXT: vsrl.vi v14, v8, 4 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v14 +; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vmul.vv v8, v8, v12 ; LMULMAX2-RV32I-NEXT: li a1, 56 ; LMULMAX2-RV32I-NEXT: vsrl.vx v8, v8, a1 ; LMULMAX2-RV32I-NEXT: vse64.v v8, (a0) +; LMULMAX2-RV32I-NEXT: addi sp, sp, 32 ; LMULMAX2-RV32I-NEXT: ret ; ; LMULMAX2-RV64I-LABEL: cttz_zero_undef_v4i64: @@ -2503,10 +2434,7 @@ ; LMULMAX2-RV32F: # %bb.0: ; LMULMAX2-RV32F-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; LMULMAX2-RV32F-NEXT: vle64.v v8, (a0) -; LMULMAX2-RV32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32F-NEXT: vmv.v.i v10, 0 -; LMULMAX2-RV32F-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32F-NEXT: vsub.vv v10, v10, v8 +; LMULMAX2-RV32F-NEXT: vrsub.vi v10, v8, 0 ; LMULMAX2-RV32F-NEXT: vand.vv v8, v8, v10 ; LMULMAX2-RV32F-NEXT: fsrmi a1, 1 ; LMULMAX2-RV32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma @@ -2540,10 +2468,7 @@ ; LMULMAX2-RV32D: # %bb.0: ; LMULMAX2-RV32D-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; LMULMAX2-RV32D-NEXT: vle64.v v8, (a0) -; LMULMAX2-RV32D-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-RV32D-NEXT: vmv.v.i v10, 0 -; LMULMAX2-RV32D-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV32D-NEXT: vsub.vv v10, v10, v8 +; LMULMAX2-RV32D-NEXT: vrsub.vi v10, v8, 0 ; LMULMAX2-RV32D-NEXT: vand.vv v8, v8, v10 ; LMULMAX2-RV32D-NEXT: fsrmi a1, 1 ; LMULMAX2-RV32D-NEXT: vfcvt.f.xu.v v8, v8 @@ -2571,40 +2496,21 @@ ; LMULMAX2-RV64D-NEXT: vse64.v v8, (a0) ; LMULMAX2-RV64D-NEXT: ret ; -; LMULMAX8-RV32-LABEL: cttz_zero_undef_v4i64: -; LMULMAX8-RV32: # %bb.0: -; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX8-RV32-NEXT: vle64.v v8, (a0) -; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX8-RV32-NEXT: vmv.v.i v10, 0 -; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX8-RV32-NEXT: vsub.vv v10, v10, v8 -; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v10 -; LMULMAX8-RV32-NEXT: fsrmi a1, 1 -; LMULMAX8-RV32-NEXT: vfcvt.f.xu.v v8, v8 -; LMULMAX8-RV32-NEXT: fsrm a1 -; LMULMAX8-RV32-NEXT: li a1, 52 -; LMULMAX8-RV32-NEXT: vsrl.vx v8, v8, a1 -; LMULMAX8-RV32-NEXT: li a1, 1023 -; LMULMAX8-RV32-NEXT: vsub.vx v8, v8, a1 -; LMULMAX8-RV32-NEXT: vse64.v v8, (a0) -; LMULMAX8-RV32-NEXT: ret -; -; LMULMAX8-RV64-LABEL: cttz_zero_undef_v4i64: -; LMULMAX8-RV64: # %bb.0: -; LMULMAX8-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX8-RV64-NEXT: vle64.v v8, (a0) -; LMULMAX8-RV64-NEXT: vrsub.vi v10, v8, 0 -; LMULMAX8-RV64-NEXT: vand.vv v8, v8, v10 -; LMULMAX8-RV64-NEXT: fsrmi a1, 1 -; LMULMAX8-RV64-NEXT: vfcvt.f.xu.v v8, v8 -; LMULMAX8-RV64-NEXT: fsrm a1 -; LMULMAX8-RV64-NEXT: li a1, 52 -; LMULMAX8-RV64-NEXT: vsrl.vx v8, v8, a1 -; LMULMAX8-RV64-NEXT: li a1, 1023 -; LMULMAX8-RV64-NEXT: vsub.vx v8, v8, a1 -; LMULMAX8-RV64-NEXT: vse64.v v8, (a0) -; LMULMAX8-RV64-NEXT: ret +; LMULMAX8-LABEL: cttz_zero_undef_v4i64: +; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; LMULMAX8-NEXT: vle64.v v8, (a0) +; LMULMAX8-NEXT: vrsub.vi v10, v8, 0 +; LMULMAX8-NEXT: vand.vv v8, v8, v10 +; LMULMAX8-NEXT: fsrmi a1, 1 +; LMULMAX8-NEXT: vfcvt.f.xu.v v8, v8 +; LMULMAX8-NEXT: fsrm a1 +; LMULMAX8-NEXT: li a1, 52 +; LMULMAX8-NEXT: vsrl.vx v8, v8, a1 +; LMULMAX8-NEXT: li a1, 1023 +; LMULMAX8-NEXT: vsub.vx v8, v8, a1 +; LMULMAX8-NEXT: vse64.v v8, (a0) +; LMULMAX8-NEXT: ret ; ; 
ZVBB-LABEL: cttz_zero_undef_v4i64: ; ZVBB: # %bb.0: @@ -2619,3 +2525,6 @@ store <4 x i64> %c, ptr %x ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; LMULMAX8-RV32: {{.*}} +; LMULMAX8-RV64: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fshr-fshl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fshr-fshl-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fshr-fshl-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fshr-fshl-vp.ll @@ -522,408 +522,216 @@ declare <2 x i64> @llvm.vp.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i1>, i32) define <2 x i64> @fshr_v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i1> %m, i32 zeroext %evl) { -; RV32-LABEL: fshr_v2i64: -; RV32: # %bb.0: -; RV32-NEXT: li a1, 63 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vx v11, v10, a1, v0.t -; RV32-NEXT: vsrl.vv v9, v9, v11, v0.t -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.i v11, -1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vxor.vv v10, v10, v11, v0.t -; RV32-NEXT: vand.vx v10, v10, a1, v0.t -; RV32-NEXT: vsll.vi v8, v8, 1, v0.t -; RV32-NEXT: vsll.vv v8, v8, v10, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: fshr_v2i64: -; RV64: # %bb.0: -; RV64-NEXT: li a1, 63 -; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV64-NEXT: vand.vx v11, v10, a1, v0.t -; RV64-NEXT: vsrl.vv v9, v9, v11, v0.t -; RV64-NEXT: vnot.v v10, v10, v0.t -; RV64-NEXT: vand.vx v10, v10, a1, v0.t -; RV64-NEXT: vsll.vi v8, v8, 1, v0.t -; RV64-NEXT: vsll.vv v8, v8, v10, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: fshr_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 63 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vand.vx v11, v10, a1, v0.t +; CHECK-NEXT: vsrl.vv v9, v9, v11, v0.t +; CHECK-NEXT: vnot.v v10, v10, v0.t +; CHECK-NEXT: vand.vx v10, v10, a1, v0.t +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t +; CHECK-NEXT: vsll.vv v8, v8, v10, v0.t +; CHECK-NEXT: vor.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret %res = call <2 x i64> @llvm.vp.fshr.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i1> %m, i32 %evl) ret <2 x i64> %res } declare <2 x i64> @llvm.vp.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i1>, i32) define <2 x i64> @fshl_v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i1> %m, i32 zeroext %evl) { -; RV32-LABEL: fshl_v2i64: -; RV32: # %bb.0: -; RV32-NEXT: li a1, 63 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vx v11, v10, a1, v0.t -; RV32-NEXT: vsll.vv v8, v8, v11, v0.t -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.i v11, -1 -; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vxor.vv v10, v10, v11, v0.t -; RV32-NEXT: vand.vx v10, v10, a1, v0.t -; RV32-NEXT: vsrl.vi v9, v9, 1, v0.t -; RV32-NEXT: vsrl.vv v9, v9, v10, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: fshl_v2i64: -; RV64: # %bb.0: -; RV64-NEXT: li a1, 63 -; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV64-NEXT: vand.vx v11, v10, a1, v0.t -; RV64-NEXT: vsll.vv v8, v8, v11, v0.t -; RV64-NEXT: vnot.v v10, v10, v0.t -; RV64-NEXT: vand.vx v10, v10, a1, v0.t -; RV64-NEXT: vsrl.vi v9, v9, 1, v0.t -; RV64-NEXT: vsrl.vv v9, v9, v10, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: fshl_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 63 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vand.vx v11, v10, a1, v0.t +; 
CHECK-NEXT: vsll.vv v8, v8, v11, v0.t +; CHECK-NEXT: vnot.v v10, v10, v0.t +; CHECK-NEXT: vand.vx v10, v10, a1, v0.t +; CHECK-NEXT: vsrl.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsrl.vv v9, v9, v10, v0.t +; CHECK-NEXT: vor.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret %res = call <2 x i64> @llvm.vp.fshl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i1> %m, i32 %evl) ret <2 x i64> %res } declare <4 x i64> @llvm.vp.fshr.v4i64(<4 x i64>, <4 x i64>, <4 x i64>, <4 x i1>, i32) define <4 x i64> @fshr_v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i1> %m, i32 zeroext %evl) { -; RV32-LABEL: fshr_v4i64: -; RV32: # %bb.0: -; RV32-NEXT: li a1, 63 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vx v14, v12, a1, v0.t -; RV32-NEXT: vsrl.vv v10, v10, v14, v0.t -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.i v14, -1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vxor.vv v12, v12, v14, v0.t -; RV32-NEXT: vand.vx v12, v12, a1, v0.t -; RV32-NEXT: vsll.vi v8, v8, 1, v0.t -; RV32-NEXT: vsll.vv v8, v8, v12, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: fshr_v4i64: -; RV64: # %bb.0: -; RV64-NEXT: li a1, 63 -; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV64-NEXT: vand.vx v14, v12, a1, v0.t -; RV64-NEXT: vsrl.vv v10, v10, v14, v0.t -; RV64-NEXT: vnot.v v12, v12, v0.t -; RV64-NEXT: vand.vx v12, v12, a1, v0.t -; RV64-NEXT: vsll.vi v8, v8, 1, v0.t -; RV64-NEXT: vsll.vv v8, v8, v12, v0.t -; RV64-NEXT: vor.vv v8, v8, v10, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: fshr_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 63 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vand.vx v14, v12, a1, v0.t +; CHECK-NEXT: vsrl.vv v10, v10, v14, v0.t +; CHECK-NEXT: vnot.v v12, v12, v0.t +; CHECK-NEXT: vand.vx v12, v12, a1, v0.t +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t +; CHECK-NEXT: vsll.vv v8, v8, v12, v0.t +; CHECK-NEXT: vor.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret %res = call <4 x i64> @llvm.vp.fshr.v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i1> %m, i32 %evl) ret <4 x i64> %res } declare <4 x i64> @llvm.vp.fshl.v4i64(<4 x i64>, <4 x i64>, <4 x i64>, <4 x i1>, i32) define <4 x i64> @fshl_v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i1> %m, i32 zeroext %evl) { -; RV32-LABEL: fshl_v4i64: -; RV32: # %bb.0: -; RV32-NEXT: li a1, 63 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vx v14, v12, a1, v0.t -; RV32-NEXT: vsll.vv v8, v8, v14, v0.t -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.i v14, -1 -; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vxor.vv v12, v12, v14, v0.t -; RV32-NEXT: vand.vx v12, v12, a1, v0.t -; RV32-NEXT: vsrl.vi v10, v10, 1, v0.t -; RV32-NEXT: vsrl.vv v10, v10, v12, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: fshl_v4i64: -; RV64: # %bb.0: -; RV64-NEXT: li a1, 63 -; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV64-NEXT: vand.vx v14, v12, a1, v0.t -; RV64-NEXT: vsll.vv v8, v8, v14, v0.t -; RV64-NEXT: vnot.v v12, v12, v0.t -; RV64-NEXT: vand.vx v12, v12, a1, v0.t -; RV64-NEXT: vsrl.vi v10, v10, 1, v0.t -; RV64-NEXT: vsrl.vv v10, v10, v12, v0.t -; RV64-NEXT: vor.vv v8, v8, v10, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: fshl_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 63 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vand.vx v14, v12, a1, v0.t +; CHECK-NEXT: vsll.vv v8, v8, v14, v0.t +; CHECK-NEXT: vnot.v v12, v12, v0.t +; CHECK-NEXT: vand.vx v12, v12, a1, v0.t +; CHECK-NEXT: vsrl.vi v10, v10, 
1, v0.t +; CHECK-NEXT: vsrl.vv v10, v10, v12, v0.t +; CHECK-NEXT: vor.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret %res = call <4 x i64> @llvm.vp.fshl.v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i1> %m, i32 %evl) ret <4 x i64> %res } declare <7 x i64> @llvm.vp.fshr.v7i64(<7 x i64>, <7 x i64>, <7 x i64>, <7 x i1>, i32) define <7 x i64> @fshr_v7i64(<7 x i64> %a, <7 x i64> %b, <7 x i64> %c, <7 x i1> %m, i32 zeroext %evl) { -; RV32-LABEL: fshr_v7i64: -; RV32: # %bb.0: -; RV32-NEXT: li a1, 63 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vx v20, v16, a1, v0.t -; RV32-NEXT: vsrl.vv v12, v12, v20, v0.t -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.i v20, -1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vxor.vv v16, v16, v20, v0.t -; RV32-NEXT: vand.vx v16, v16, a1, v0.t -; RV32-NEXT: vsll.vi v8, v8, 1, v0.t -; RV32-NEXT: vsll.vv v8, v8, v16, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: fshr_v7i64: -; RV64: # %bb.0: -; RV64-NEXT: li a1, 63 -; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV64-NEXT: vand.vx v20, v16, a1, v0.t -; RV64-NEXT: vsrl.vv v12, v12, v20, v0.t -; RV64-NEXT: vnot.v v16, v16, v0.t -; RV64-NEXT: vand.vx v16, v16, a1, v0.t -; RV64-NEXT: vsll.vi v8, v8, 1, v0.t -; RV64-NEXT: vsll.vv v8, v8, v16, v0.t -; RV64-NEXT: vor.vv v8, v8, v12, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: fshr_v7i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 63 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vand.vx v20, v16, a1, v0.t +; CHECK-NEXT: vsrl.vv v12, v12, v20, v0.t +; CHECK-NEXT: vnot.v v16, v16, v0.t +; CHECK-NEXT: vand.vx v16, v16, a1, v0.t +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t +; CHECK-NEXT: vsll.vv v8, v8, v16, v0.t +; CHECK-NEXT: vor.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret %res = call <7 x i64> @llvm.vp.fshr.v7i64(<7 x i64> %a, <7 x i64> %b, <7 x i64> %c, <7 x i1> %m, i32 %evl) ret <7 x i64> %res } declare <7 x i64> @llvm.vp.fshl.v7i64(<7 x i64>, <7 x i64>, <7 x i64>, <7 x i1>, i32) define <7 x i64> @fshl_v7i64(<7 x i64> %a, <7 x i64> %b, <7 x i64> %c, <7 x i1> %m, i32 zeroext %evl) { -; RV32-LABEL: fshl_v7i64: -; RV32: # %bb.0: -; RV32-NEXT: li a1, 63 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vx v20, v16, a1, v0.t -; RV32-NEXT: vsll.vv v8, v8, v20, v0.t -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.i v20, -1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vxor.vv v16, v16, v20, v0.t -; RV32-NEXT: vand.vx v16, v16, a1, v0.t -; RV32-NEXT: vsrl.vi v12, v12, 1, v0.t -; RV32-NEXT: vsrl.vv v12, v12, v16, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: fshl_v7i64: -; RV64: # %bb.0: -; RV64-NEXT: li a1, 63 -; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV64-NEXT: vand.vx v20, v16, a1, v0.t -; RV64-NEXT: vsll.vv v8, v8, v20, v0.t -; RV64-NEXT: vnot.v v16, v16, v0.t -; RV64-NEXT: vand.vx v16, v16, a1, v0.t -; RV64-NEXT: vsrl.vi v12, v12, 1, v0.t -; RV64-NEXT: vsrl.vv v12, v12, v16, v0.t -; RV64-NEXT: vor.vv v8, v8, v12, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: fshl_v7i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 63 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vand.vx v20, v16, a1, v0.t +; CHECK-NEXT: vsll.vv v8, v8, v20, v0.t +; CHECK-NEXT: vnot.v v16, v16, v0.t +; CHECK-NEXT: vand.vx v16, v16, a1, v0.t +; CHECK-NEXT: vsrl.vi v12, v12, 1, v0.t +; CHECK-NEXT: vsrl.vv v12, v12, v16, v0.t +; CHECK-NEXT: vor.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret %res = call <7 x i64> 
@llvm.vp.fshl.v7i64(<7 x i64> %a, <7 x i64> %b, <7 x i64> %c, <7 x i1> %m, i32 %evl) ret <7 x i64> %res } declare <8 x i64> @llvm.vp.fshr.v8i64(<8 x i64>, <8 x i64>, <8 x i64>, <8 x i1>, i32) define <8 x i64> @fshr_v8i64(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c, <8 x i1> %m, i32 zeroext %evl) { -; RV32-LABEL: fshr_v8i64: -; RV32: # %bb.0: -; RV32-NEXT: li a1, 63 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vx v20, v16, a1, v0.t -; RV32-NEXT: vsrl.vv v12, v12, v20, v0.t -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.i v20, -1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vxor.vv v16, v16, v20, v0.t -; RV32-NEXT: vand.vx v16, v16, a1, v0.t -; RV32-NEXT: vsll.vi v8, v8, 1, v0.t -; RV32-NEXT: vsll.vv v8, v8, v16, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: fshr_v8i64: -; RV64: # %bb.0: -; RV64-NEXT: li a1, 63 -; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV64-NEXT: vand.vx v20, v16, a1, v0.t -; RV64-NEXT: vsrl.vv v12, v12, v20, v0.t -; RV64-NEXT: vnot.v v16, v16, v0.t -; RV64-NEXT: vand.vx v16, v16, a1, v0.t -; RV64-NEXT: vsll.vi v8, v8, 1, v0.t -; RV64-NEXT: vsll.vv v8, v8, v16, v0.t -; RV64-NEXT: vor.vv v8, v8, v12, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: fshr_v8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 63 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vand.vx v20, v16, a1, v0.t +; CHECK-NEXT: vsrl.vv v12, v12, v20, v0.t +; CHECK-NEXT: vnot.v v16, v16, v0.t +; CHECK-NEXT: vand.vx v16, v16, a1, v0.t +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t +; CHECK-NEXT: vsll.vv v8, v8, v16, v0.t +; CHECK-NEXT: vor.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret %res = call <8 x i64> @llvm.vp.fshr.v8i64(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c, <8 x i1> %m, i32 %evl) ret <8 x i64> %res } declare <8 x i64> @llvm.vp.fshl.v8i64(<8 x i64>, <8 x i64>, <8 x i64>, <8 x i1>, i32) define <8 x i64> @fshl_v8i64(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c, <8 x i1> %m, i32 zeroext %evl) { -; RV32-LABEL: fshl_v8i64: -; RV32: # %bb.0: -; RV32-NEXT: li a1, 63 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vx v20, v16, a1, v0.t -; RV32-NEXT: vsll.vv v8, v8, v20, v0.t -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv.v.i v20, -1 -; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vxor.vv v16, v16, v20, v0.t -; RV32-NEXT: vand.vx v16, v16, a1, v0.t -; RV32-NEXT: vsrl.vi v12, v12, 1, v0.t -; RV32-NEXT: vsrl.vv v12, v12, v16, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: fshl_v8i64: -; RV64: # %bb.0: -; RV64-NEXT: li a1, 63 -; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV64-NEXT: vand.vx v20, v16, a1, v0.t -; RV64-NEXT: vsll.vv v8, v8, v20, v0.t -; RV64-NEXT: vnot.v v16, v16, v0.t -; RV64-NEXT: vand.vx v16, v16, a1, v0.t -; RV64-NEXT: vsrl.vi v12, v12, 1, v0.t -; RV64-NEXT: vsrl.vv v12, v12, v16, v0.t -; RV64-NEXT: vor.vv v8, v8, v12, v0.t -; RV64-NEXT: ret +; CHECK-LABEL: fshl_v8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 63 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vand.vx v20, v16, a1, v0.t +; CHECK-NEXT: vsll.vv v8, v8, v20, v0.t +; CHECK-NEXT: vnot.v v16, v16, v0.t +; CHECK-NEXT: vand.vx v16, v16, a1, v0.t +; CHECK-NEXT: vsrl.vi v12, v12, 1, v0.t +; CHECK-NEXT: vsrl.vv v12, v12, v16, v0.t +; CHECK-NEXT: vor.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret %res = call <8 x i64> @llvm.vp.fshl.v8i64(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c, <8 x i1> %m, i32 %evl) ret <8 x i64> %res } declare <16 x i64> 
@llvm.vp.fshr.v16i64(<16 x i64>, <16 x i64>, <16 x i64>, <16 x i1>, i32) define <16 x i64> @fshr_v16i64(<16 x i64> %a, <16 x i64> %b, <16 x i64> %c, <16 x i1> %m, i32 zeroext %evl) { -; RV32-LABEL: fshr_v16i64: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 3 -; RV32-NEXT: sub sp, sp, a2 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vle64.v v24, (a0) -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; RV32-NEXT: li a0, 63 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vx v8, v24, a0, v0.t -; RV32-NEXT: vsrl.vv v16, v16, v8, v0.t -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v8, -1 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vxor.vv v8, v24, v8, v0.t -; RV32-NEXT: vand.vx v8, v8, a0, v0.t -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vsll.vi v24, v24, 1, v0.t -; RV32-NEXT: vsll.vv v8, v24, v8, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret -; -; RV64-LABEL: fshr_v16i64: -; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: .cfi_def_cfa_offset 16 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: slli a2, a2, 3 -; RV64-NEXT: sub sp, sp, a2 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vle64.v v24, (a0) -; RV64-NEXT: addi a0, sp, 16 -; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; RV64-NEXT: li a0, 63 -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vand.vx v8, v24, a0, v0.t -; RV64-NEXT: vsrl.vv v16, v16, v8, v0.t -; RV64-NEXT: vnot.v v8, v24, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: addi a0, sp, 16 -; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV64-NEXT: vsll.vi v24, v24, 1, v0.t -; RV64-NEXT: vsll.vv v8, v24, v8, v0.t -; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 3 -; RV64-NEXT: add sp, sp, a0 -; RV64-NEXT: addi sp, sp, 16 -; RV64-NEXT: ret +; CHECK-LABEL: fshr_v16i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: sub sp, sp, a2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vle64.v v24, (a0) +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vand.vx v8, v24, a0, v0.t +; CHECK-NEXT: vsrl.vv v16, v16, v8, v0.t +; CHECK-NEXT: vnot.v v8, v24, v0.t +; CHECK-NEXT: vand.vx v8, v8, a0, v0.t +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsll.vi v24, v24, 1, v0.t +; CHECK-NEXT: vsll.vv v8, v24, v8, v0.t +; CHECK-NEXT: vor.vv v8, v8, v16, v0.t +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; 
CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret %res = call <16 x i64> @llvm.vp.fshr.v16i64(<16 x i64> %a, <16 x i64> %b, <16 x i64> %c, <16 x i1> %m, i32 %evl) ret <16 x i64> %res } declare <16 x i64> @llvm.vp.fshl.v16i64(<16 x i64>, <16 x i64>, <16 x i64>, <16 x i1>, i32) define <16 x i64> @fshl_v16i64(<16 x i64> %a, <16 x i64> %b, <16 x i64> %c, <16 x i1> %m, i32 zeroext %evl) { -; RV32-LABEL: fshl_v16i64: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 3 -; RV32-NEXT: sub sp, sp, a2 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vle64.v v24, (a0) -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; RV32-NEXT: vmv8r.v v16, v8 -; RV32-NEXT: li a0, 63 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vx v8, v24, a0, v0.t -; RV32-NEXT: vsll.vv v8, v16, v8, v0.t -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmv.v.i v16, -1 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vxor.vv v16, v24, v16, v0.t -; RV32-NEXT: vand.vx v16, v16, a0, v0.t -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vsrl.vi v24, v24, 1, v0.t -; RV32-NEXT: vsrl.vv v16, v24, v16, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret -; -; RV64-LABEL: fshl_v16i64: -; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: .cfi_def_cfa_offset 16 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: slli a2, a2, 3 -; RV64-NEXT: sub sp, sp, a2 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vle64.v v24, (a0) -; RV64-NEXT: addi a0, sp, 16 -; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; RV64-NEXT: vmv8r.v v16, v8 -; RV64-NEXT: li a0, 63 -; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vand.vx v8, v24, a0, v0.t -; RV64-NEXT: vsll.vv v8, v16, v8, v0.t -; RV64-NEXT: vnot.v v16, v24, v0.t -; RV64-NEXT: vand.vx v16, v16, a0, v0.t -; RV64-NEXT: addi a0, sp, 16 -; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV64-NEXT: vsrl.vi v24, v24, 1, v0.t -; RV64-NEXT: vsrl.vv v16, v24, v16, v0.t -; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 3 -; RV64-NEXT: add sp, sp, a0 -; RV64-NEXT: addi sp, sp, 16 -; RV64-NEXT: ret +; CHECK-LABEL: fshl_v16i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: sub sp, sp, a2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vle64.v v24, (a0) +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmv8r.v v16, v8 +; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vand.vx v8, v24, a0, v0.t +; CHECK-NEXT: vsll.vv v8, v16, v8, v0.t +; CHECK-NEXT: vnot.v v16, v24, v0.t +; CHECK-NEXT: vand.vx v16, v16, a0, v0.t 
+; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsrl.vi v24, v24, 1, v0.t +; CHECK-NEXT: vsrl.vv v16, v24, v16, v0.t +; CHECK-NEXT: vor.vv v8, v8, v16, v0.t +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret %res = call <16 x i64> @llvm.vp.fshl.v16i64(<16 x i64> %a, <16 x i64> %b, <16 x i64> %c, <16 x i1> %m, i32 %evl) ret <16 x i64> %res } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; RV32: {{.*}} +; RV64: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrol.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrol.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrol.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrol.ll @@ -875,34 +875,17 @@ declare <1 x i64> @llvm.fshl.v1i64(<1 x i64>, <1 x i64>, <1 x i64>) define <1 x i64> @vrol_vv_v1i64(<1 x i64> %a, <1 x i64> %b) { -; CHECK-RV32-LABEL: vrol_vv_v1i64: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-RV32-NEXT: vmv.v.i v10, 0 -; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-RV32-NEXT: vsub.vv v11, v10, v9 -; CHECK-RV32-NEXT: li a0, 63 -; CHECK-RV32-NEXT: vsetvli zero, zero, e32, mf2, tu, ma -; CHECK-RV32-NEXT: vmv.s.x v10, a0 -; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-RV32-NEXT: vand.vv v11, v11, v10 -; CHECK-RV32-NEXT: vsrl.vv v11, v8, v11 -; CHECK-RV32-NEXT: vand.vv v9, v9, v10 -; CHECK-RV32-NEXT: vsll.vv v8, v8, v9 -; CHECK-RV32-NEXT: vor.vv v8, v8, v11 -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: vrol_vv_v1i64: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: li a0, 63 -; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-RV64-NEXT: vand.vx v10, v9, a0 -; CHECK-RV64-NEXT: vsll.vv v10, v8, v10 -; CHECK-RV64-NEXT: vrsub.vi v9, v9, 0 -; CHECK-RV64-NEXT: vand.vx v9, v9, a0 -; CHECK-RV64-NEXT: vsrl.vv v8, v8, v9 -; CHECK-RV64-NEXT: vor.vv v8, v10, v8 -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: vrol_vv_v1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-NEXT: vand.vx v10, v9, a0 +; CHECK-NEXT: vsll.vv v10, v8, v10 +; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: vsrl.vv v8, v8, v9 +; CHECK-NEXT: vor.vv v8, v10, v8 +; CHECK-NEXT: ret ; ; CHECK-ZVBB-LABEL: vrol_vv_v1i64: ; CHECK-ZVBB: # %bb.0: @@ -914,35 +897,18 @@ } define <1 x i64> @vrol_vx_v1i64(<1 x i64> %a, i64 %b) { -; CHECK-RV32-LABEL: vrol_vx_v1i64: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-RV32-NEXT: vmv.v.i v9, 0 -; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-RV32-NEXT: vsub.vx v10, v9, a0 -; CHECK-RV32-NEXT: li a1, 63 -; CHECK-RV32-NEXT: vsetvli zero, zero, e32, mf2, tu, ma -; CHECK-RV32-NEXT: vmv.s.x v9, a1 -; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-RV32-NEXT: vand.vv v10, v10, v9 -; CHECK-RV32-NEXT: vsrl.vv v10, v8, v10 -; CHECK-RV32-NEXT: vand.vx v9, v9, a0 -; CHECK-RV32-NEXT: vsll.vv v8, v8, v9 -; CHECK-RV32-NEXT: vor.vv v8, v8, v10 -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: vrol_vx_v1i64: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-RV64-NEXT: vmv.v.x v9, a0 -; CHECK-RV64-NEXT: li a0, 63 -; CHECK-RV64-NEXT: vand.vx v10, v9, a0 -; CHECK-RV64-NEXT: vsll.vv v10, v8, v10 -; CHECK-RV64-NEXT: vrsub.vi v9, v9, 0 -; CHECK-RV64-NEXT: vand.vx v9, v9, a0 -; CHECK-RV64-NEXT: 
vsrl.vv v8, v8, v9 -; CHECK-RV64-NEXT: vor.vv v8, v10, v8 -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: vrol_vx_v1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vand.vx v10, v9, a0 +; CHECK-NEXT: vsll.vv v10, v8, v10 +; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: vsrl.vv v8, v8, v9 +; CHECK-NEXT: vor.vv v8, v10, v8 +; CHECK-NEXT: ret ; ; CHECK-ZVBB-LABEL: vrol_vx_v1i64: ; CHECK-ZVBB: # %bb.0: @@ -958,32 +924,17 @@ declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>) define <2 x i64> @vrol_vv_v2i64(<2 x i64> %a, <2 x i64> %b) { -; CHECK-RV32-LABEL: vrol_vv_v2i64: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: li a0, 63 -; CHECK-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-RV32-NEXT: vand.vx v10, v9, a0 -; CHECK-RV32-NEXT: vsll.vv v10, v8, v10 -; CHECK-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-RV32-NEXT: vmv.v.i v11, 0 -; CHECK-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-RV32-NEXT: vsub.vv v9, v11, v9 -; CHECK-RV32-NEXT: vand.vx v9, v9, a0 -; CHECK-RV32-NEXT: vsrl.vv v8, v8, v9 -; CHECK-RV32-NEXT: vor.vv v8, v10, v8 -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: vrol_vv_v2i64: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: li a0, 63 -; CHECK-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-RV64-NEXT: vand.vx v10, v9, a0 -; CHECK-RV64-NEXT: vsll.vv v10, v8, v10 -; CHECK-RV64-NEXT: vrsub.vi v9, v9, 0 -; CHECK-RV64-NEXT: vand.vx v9, v9, a0 -; CHECK-RV64-NEXT: vsrl.vv v8, v8, v9 -; CHECK-RV64-NEXT: vor.vv v8, v10, v8 -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: vrol_vv_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vand.vx v10, v9, a0 +; CHECK-NEXT: vsll.vv v10, v8, v10 +; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: vsrl.vv v8, v8, v9 +; CHECK-NEXT: vor.vv v8, v10, v8 +; CHECK-NEXT: ret ; ; CHECK-ZVBB-LABEL: vrol_vv_v2i64: ; CHECK-ZVBB: # %bb.0: @@ -995,34 +946,18 @@ } define <2 x i64> @vrol_vx_v2i64(<2 x i64> %a, i64 %b) { -; CHECK-RV32-LABEL: vrol_vx_v2i64: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-RV32-NEXT: vmv.v.x v9, a0 -; CHECK-RV32-NEXT: li a1, 63 -; CHECK-RV32-NEXT: vand.vx v9, v9, a1 -; CHECK-RV32-NEXT: vsll.vv v9, v8, v9 -; CHECK-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-RV32-NEXT: vmv.v.i v10, 0 -; CHECK-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-RV32-NEXT: vsub.vx v10, v10, a0 -; CHECK-RV32-NEXT: vand.vx v10, v10, a1 -; CHECK-RV32-NEXT: vsrl.vv v8, v8, v10 -; CHECK-RV32-NEXT: vor.vv v8, v9, v8 -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: vrol_vx_v2i64: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-RV64-NEXT: vmv.v.x v9, a0 -; CHECK-RV64-NEXT: li a0, 63 -; CHECK-RV64-NEXT: vand.vx v10, v9, a0 -; CHECK-RV64-NEXT: vsll.vv v10, v8, v10 -; CHECK-RV64-NEXT: vrsub.vi v9, v9, 0 -; CHECK-RV64-NEXT: vand.vx v9, v9, a0 -; CHECK-RV64-NEXT: vsrl.vv v8, v8, v9 -; CHECK-RV64-NEXT: vor.vv v8, v10, v8 -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: vrol_vx_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vand.vx v10, v9, a0 +; CHECK-NEXT: vsll.vv v10, v8, v10 +; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: vsrl.vv v8, v8, v9 +; CHECK-NEXT: vor.vv v8, v10, v8 +; CHECK-NEXT: ret ; ; CHECK-ZVBB-LABEL: 
vrol_vx_v2i64: ; CHECK-ZVBB: # %bb.0: @@ -1038,32 +973,17 @@ declare <4 x i64> @llvm.fshl.v4i64(<4 x i64>, <4 x i64>, <4 x i64>) define <4 x i64> @vrol_vv_v4i64(<4 x i64> %a, <4 x i64> %b) { -; CHECK-RV32-LABEL: vrol_vv_v4i64: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: li a0, 63 -; CHECK-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-RV32-NEXT: vand.vx v12, v10, a0 -; CHECK-RV32-NEXT: vsll.vv v12, v8, v12 -; CHECK-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-RV32-NEXT: vmv.v.i v14, 0 -; CHECK-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-RV32-NEXT: vsub.vv v10, v14, v10 -; CHECK-RV32-NEXT: vand.vx v10, v10, a0 -; CHECK-RV32-NEXT: vsrl.vv v8, v8, v10 -; CHECK-RV32-NEXT: vor.vv v8, v12, v8 -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: vrol_vv_v4i64: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: li a0, 63 -; CHECK-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-RV64-NEXT: vand.vx v12, v10, a0 -; CHECK-RV64-NEXT: vsll.vv v12, v8, v12 -; CHECK-RV64-NEXT: vrsub.vi v10, v10, 0 -; CHECK-RV64-NEXT: vand.vx v10, v10, a0 -; CHECK-RV64-NEXT: vsrl.vv v8, v8, v10 -; CHECK-RV64-NEXT: vor.vv v8, v12, v8 -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: vrol_vv_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vand.vx v12, v10, a0 +; CHECK-NEXT: vsll.vv v12, v8, v12 +; CHECK-NEXT: vrsub.vi v10, v10, 0 +; CHECK-NEXT: vand.vx v10, v10, a0 +; CHECK-NEXT: vsrl.vv v8, v8, v10 +; CHECK-NEXT: vor.vv v8, v12, v8 +; CHECK-NEXT: ret ; ; CHECK-ZVBB-LABEL: vrol_vv_v4i64: ; CHECK-ZVBB: # %bb.0: @@ -1075,34 +995,18 @@ } define <4 x i64> @vrol_vx_v4i64(<4 x i64> %a, i64 %b) { -; CHECK-RV32-LABEL: vrol_vx_v4i64: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-RV32-NEXT: vmv.v.x v10, a0 -; CHECK-RV32-NEXT: li a1, 63 -; CHECK-RV32-NEXT: vand.vx v10, v10, a1 -; CHECK-RV32-NEXT: vsll.vv v10, v8, v10 -; CHECK-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-RV32-NEXT: vmv.v.i v12, 0 -; CHECK-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-RV32-NEXT: vsub.vx v12, v12, a0 -; CHECK-RV32-NEXT: vand.vx v12, v12, a1 -; CHECK-RV32-NEXT: vsrl.vv v8, v8, v12 -; CHECK-RV32-NEXT: vor.vv v8, v10, v8 -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: vrol_vx_v4i64: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-RV64-NEXT: vmv.v.x v10, a0 -; CHECK-RV64-NEXT: li a0, 63 -; CHECK-RV64-NEXT: vand.vx v12, v10, a0 -; CHECK-RV64-NEXT: vsll.vv v12, v8, v12 -; CHECK-RV64-NEXT: vrsub.vi v10, v10, 0 -; CHECK-RV64-NEXT: vand.vx v10, v10, a0 -; CHECK-RV64-NEXT: vsrl.vv v8, v8, v10 -; CHECK-RV64-NEXT: vor.vv v8, v12, v8 -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: vrol_vx_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vand.vx v12, v10, a0 +; CHECK-NEXT: vsll.vv v12, v8, v12 +; CHECK-NEXT: vrsub.vi v10, v10, 0 +; CHECK-NEXT: vand.vx v10, v10, a0 +; CHECK-NEXT: vsrl.vv v8, v8, v10 +; CHECK-NEXT: vor.vv v8, v12, v8 +; CHECK-NEXT: ret ; ; CHECK-ZVBB-LABEL: vrol_vx_v4i64: ; CHECK-ZVBB: # %bb.0: @@ -1118,32 +1022,17 @@ declare <8 x i64> @llvm.fshl.v8i64(<8 x i64>, <8 x i64>, <8 x i64>) define <8 x i64> @vrol_vv_v8i64(<8 x i64> %a, <8 x i64> %b) { -; CHECK-RV32-LABEL: vrol_vv_v8i64: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: li a0, 63 -; CHECK-RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-RV32-NEXT: vand.vx v16, v12, a0 -; CHECK-RV32-NEXT: vsll.vv v16, v8, v16 -; CHECK-RV32-NEXT: vsetivli 
zero, 16, e32, m4, ta, ma -; CHECK-RV32-NEXT: vmv.v.i v20, 0 -; CHECK-RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-RV32-NEXT: vsub.vv v12, v20, v12 -; CHECK-RV32-NEXT: vand.vx v12, v12, a0 -; CHECK-RV32-NEXT: vsrl.vv v8, v8, v12 -; CHECK-RV32-NEXT: vor.vv v8, v16, v8 -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: vrol_vv_v8i64: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: li a0, 63 -; CHECK-RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-RV64-NEXT: vand.vx v16, v12, a0 -; CHECK-RV64-NEXT: vsll.vv v16, v8, v16 -; CHECK-RV64-NEXT: vrsub.vi v12, v12, 0 -; CHECK-RV64-NEXT: vand.vx v12, v12, a0 -; CHECK-RV64-NEXT: vsrl.vv v8, v8, v12 -; CHECK-RV64-NEXT: vor.vv v8, v16, v8 -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: vrol_vv_v8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; CHECK-NEXT: vand.vx v16, v12, a0 +; CHECK-NEXT: vsll.vv v16, v8, v16 +; CHECK-NEXT: vrsub.vi v12, v12, 0 +; CHECK-NEXT: vand.vx v12, v12, a0 +; CHECK-NEXT: vsrl.vv v8, v8, v12 +; CHECK-NEXT: vor.vv v8, v16, v8 +; CHECK-NEXT: ret ; ; CHECK-ZVBB-LABEL: vrol_vv_v8i64: ; CHECK-ZVBB: # %bb.0: @@ -1155,34 +1044,18 @@ } define <8 x i64> @vrol_vx_v8i64(<8 x i64> %a, i64 %b) { -; CHECK-RV32-LABEL: vrol_vx_v8i64: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-RV32-NEXT: vmv.v.x v12, a0 -; CHECK-RV32-NEXT: li a1, 63 -; CHECK-RV32-NEXT: vand.vx v12, v12, a1 -; CHECK-RV32-NEXT: vsll.vv v12, v8, v12 -; CHECK-RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-RV32-NEXT: vmv.v.i v16, 0 -; CHECK-RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-RV32-NEXT: vsub.vx v16, v16, a0 -; CHECK-RV32-NEXT: vand.vx v16, v16, a1 -; CHECK-RV32-NEXT: vsrl.vv v8, v8, v16 -; CHECK-RV32-NEXT: vor.vv v8, v12, v8 -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: vrol_vx_v8i64: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-RV64-NEXT: vmv.v.x v12, a0 -; CHECK-RV64-NEXT: li a0, 63 -; CHECK-RV64-NEXT: vand.vx v16, v12, a0 -; CHECK-RV64-NEXT: vsll.vv v16, v8, v16 -; CHECK-RV64-NEXT: vrsub.vi v12, v12, 0 -; CHECK-RV64-NEXT: vand.vx v12, v12, a0 -; CHECK-RV64-NEXT: vsrl.vv v8, v8, v12 -; CHECK-RV64-NEXT: vor.vv v8, v16, v8 -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: vrol_vx_v8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; CHECK-NEXT: vmv.v.x v12, a0 +; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vand.vx v16, v12, a0 +; CHECK-NEXT: vsll.vv v16, v8, v16 +; CHECK-NEXT: vrsub.vi v12, v12, 0 +; CHECK-NEXT: vand.vx v12, v12, a0 +; CHECK-NEXT: vsrl.vv v8, v8, v12 +; CHECK-NEXT: vor.vv v8, v16, v8 +; CHECK-NEXT: ret ; ; CHECK-ZVBB-LABEL: vrol_vx_v8i64: ; CHECK-ZVBB: # %bb.0: @@ -1196,5 +1069,7 @@ } ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK-RV32: {{.*}} +; CHECK-RV64: {{.*}} ; CHECK-ZVBB32: {{.*}} ; CHECK-ZVBB64: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vror.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vror.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vror.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vror.ll @@ -1554,34 +1554,17 @@ declare <1 x i64> @llvm.fshl.v1i64(<1 x i64>, <1 x i64>, <1 x i64>) define <1 x i64> @vror_vv_v1i64(<1 x i64> %a, <1 x i64> %b) { -; CHECK-RV32-LABEL: vror_vv_v1i64: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-RV32-NEXT: vmv.v.i v10, 0 -; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-RV32-NEXT: vsub.vv v11, v10, v9 -; CHECK-RV32-NEXT: li a0, 63 -; CHECK-RV32-NEXT: vsetvli zero, zero, e32, mf2, tu, ma -; CHECK-RV32-NEXT: vmv.s.x v10, a0 -; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-RV32-NEXT: vand.vv v11, v11, v10 -; CHECK-RV32-NEXT: vsll.vv v11, v8, v11 -; CHECK-RV32-NEXT: vand.vv v9, v9, v10 -; CHECK-RV32-NEXT: vsrl.vv v8, v8, v9 -; CHECK-RV32-NEXT: vor.vv v8, v8, v11 -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: vror_vv_v1i64: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: li a0, 63 -; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-RV64-NEXT: vand.vx v10, v9, a0 -; CHECK-RV64-NEXT: vsrl.vv v10, v8, v10 -; CHECK-RV64-NEXT: vrsub.vi v9, v9, 0 -; CHECK-RV64-NEXT: vand.vx v9, v9, a0 -; CHECK-RV64-NEXT: vsll.vv v8, v8, v9 -; CHECK-RV64-NEXT: vor.vv v8, v10, v8 -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: vror_vv_v1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-NEXT: vand.vx v10, v9, a0 +; CHECK-NEXT: vsrl.vv v10, v8, v10 +; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: vsll.vv v8, v8, v9 +; CHECK-NEXT: vor.vv v8, v10, v8 +; CHECK-NEXT: ret ; ; CHECK-ZVBB-LABEL: vror_vv_v1i64: ; CHECK-ZVBB: # %bb.0: @@ -1593,35 +1576,18 @@ } define <1 x i64> @vror_vx_v1i64(<1 x i64> %a, i64 %b) { -; CHECK-RV32-LABEL: vror_vx_v1i64: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-RV32-NEXT: vmv.v.i v9, 0 -; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-RV32-NEXT: vsub.vx v10, v9, a0 -; CHECK-RV32-NEXT: li a1, 63 -; CHECK-RV32-NEXT: vsetvli zero, zero, e32, mf2, tu, ma -; CHECK-RV32-NEXT: vmv.s.x v9, a1 -; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-RV32-NEXT: vand.vv v10, v10, v9 -; CHECK-RV32-NEXT: vsll.vv v10, v8, v10 -; CHECK-RV32-NEXT: vand.vx v9, v9, a0 -; CHECK-RV32-NEXT: vsrl.vv v8, v8, v9 -; CHECK-RV32-NEXT: vor.vv v8, v8, v10 -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: vror_vx_v1i64: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-RV64-NEXT: vmv.v.x v9, a0 -; CHECK-RV64-NEXT: li a0, 63 -; CHECK-RV64-NEXT: vand.vx v10, v9, a0 -; CHECK-RV64-NEXT: vsrl.vv v10, v8, v10 -; CHECK-RV64-NEXT: vrsub.vi v9, v9, 0 -; CHECK-RV64-NEXT: vand.vx v9, v9, a0 -; CHECK-RV64-NEXT: vsll.vv v8, v8, v9 -; CHECK-RV64-NEXT: vor.vv v8, v10, v8 -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: vror_vx_v1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vand.vx v10, v9, a0 +; CHECK-NEXT: vsrl.vv v10, v8, v10 +; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: vsll.vv v8, v8, v9 +; CHECK-NEXT: vor.vv v8, v10, v8 +; CHECK-NEXT: ret ; ; CHECK-ZVBB-LABEL: 
vror_vx_v1i64: ; CHECK-ZVBB: # %bb.0: @@ -1637,20 +1603,16 @@ define <1 x i64> @vror_vi_v1i64(<1 x i64> %a) { ; CHECK-RV32-LABEL: vror_vi_v1i64: ; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-RV32-NEXT: vmv.v.i v9, 0 -; CHECK-RV32-NEXT: li a0, 1 ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-RV32-NEXT: vsub.vx v10, v9, a0 +; CHECK-RV32-NEXT: vmv.v.i v9, 1 +; CHECK-RV32-NEXT: vrsub.vi v9, v9, 0 ; CHECK-RV32-NEXT: li a0, 63 -; CHECK-RV32-NEXT: vsetvli zero, zero, e32, mf2, tu, ma -; CHECK-RV32-NEXT: vmv.s.x v9, a0 -; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-RV32-NEXT: vand.vv v10, v10, v9 -; CHECK-RV32-NEXT: vsll.vv v10, v8, v10 -; CHECK-RV32-NEXT: vand.vi v9, v9, 1 -; CHECK-RV32-NEXT: vsrl.vv v8, v8, v9 -; CHECK-RV32-NEXT: vor.vv v8, v8, v10 +; CHECK-RV32-NEXT: vand.vx v9, v9, a0 +; CHECK-RV32-NEXT: vsll.vv v9, v8, v9 +; CHECK-RV32-NEXT: vmv.v.x v10, a0 +; CHECK-RV32-NEXT: vand.vi v10, v10, 1 +; CHECK-RV32-NEXT: vsrl.vv v8, v8, v10 +; CHECK-RV32-NEXT: vor.vv v8, v8, v9 ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: vror_vi_v1i64: @@ -1674,20 +1636,16 @@ define <1 x i64> @vror_vi_rotl_v1i64(<1 x i64> %a) { ; CHECK-RV32-LABEL: vror_vi_rotl_v1i64: ; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-RV32-NEXT: vmv.v.i v9, 0 -; CHECK-RV32-NEXT: li a0, 1 ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-RV32-NEXT: vsub.vx v10, v9, a0 +; CHECK-RV32-NEXT: vmv.v.i v9, 1 +; CHECK-RV32-NEXT: vrsub.vi v9, v9, 0 ; CHECK-RV32-NEXT: li a0, 63 -; CHECK-RV32-NEXT: vsetvli zero, zero, e32, mf2, tu, ma -; CHECK-RV32-NEXT: vmv.s.x v9, a0 -; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-RV32-NEXT: vand.vv v10, v10, v9 -; CHECK-RV32-NEXT: vsrl.vv v10, v8, v10 -; CHECK-RV32-NEXT: vand.vi v9, v9, 1 -; CHECK-RV32-NEXT: vsll.vv v8, v8, v9 -; CHECK-RV32-NEXT: vor.vv v8, v8, v10 +; CHECK-RV32-NEXT: vand.vx v9, v9, a0 +; CHECK-RV32-NEXT: vsrl.vv v9, v8, v9 +; CHECK-RV32-NEXT: vmv.v.x v10, a0 +; CHECK-RV32-NEXT: vand.vi v10, v10, 1 +; CHECK-RV32-NEXT: vsll.vv v8, v8, v10 +; CHECK-RV32-NEXT: vor.vv v8, v8, v9 ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: vror_vi_rotl_v1i64: @@ -1712,32 +1670,17 @@ declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>) define <2 x i64> @vror_vv_v2i64(<2 x i64> %a, <2 x i64> %b) { -; CHECK-RV32-LABEL: vror_vv_v2i64: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: li a0, 63 -; CHECK-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-RV32-NEXT: vand.vx v10, v9, a0 -; CHECK-RV32-NEXT: vsrl.vv v10, v8, v10 -; CHECK-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-RV32-NEXT: vmv.v.i v11, 0 -; CHECK-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-RV32-NEXT: vsub.vv v9, v11, v9 -; CHECK-RV32-NEXT: vand.vx v9, v9, a0 -; CHECK-RV32-NEXT: vsll.vv v8, v8, v9 -; CHECK-RV32-NEXT: vor.vv v8, v10, v8 -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: vror_vv_v2i64: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: li a0, 63 -; CHECK-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-RV64-NEXT: vand.vx v10, v9, a0 -; CHECK-RV64-NEXT: vsrl.vv v10, v8, v10 -; CHECK-RV64-NEXT: vrsub.vi v9, v9, 0 -; CHECK-RV64-NEXT: vand.vx v9, v9, a0 -; CHECK-RV64-NEXT: vsll.vv v8, v8, v9 -; CHECK-RV64-NEXT: vor.vv v8, v10, v8 -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: vror_vv_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vand.vx v10, v9, a0 +; CHECK-NEXT: vsrl.vv v10, v8, v10 +; CHECK-NEXT: vrsub.vi v9, v9, 0 
+; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: vsll.vv v8, v8, v9 +; CHECK-NEXT: vor.vv v8, v10, v8 +; CHECK-NEXT: ret ; ; CHECK-ZVBB-LABEL: vror_vv_v2i64: ; CHECK-ZVBB: # %bb.0: @@ -1749,34 +1692,18 @@ } define <2 x i64> @vror_vx_v2i64(<2 x i64> %a, i64 %b) { -; CHECK-RV32-LABEL: vror_vx_v2i64: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-RV32-NEXT: vmv.v.x v9, a0 -; CHECK-RV32-NEXT: li a1, 63 -; CHECK-RV32-NEXT: vand.vx v9, v9, a1 -; CHECK-RV32-NEXT: vsrl.vv v9, v8, v9 -; CHECK-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-RV32-NEXT: vmv.v.i v10, 0 -; CHECK-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-RV32-NEXT: vsub.vx v10, v10, a0 -; CHECK-RV32-NEXT: vand.vx v10, v10, a1 -; CHECK-RV32-NEXT: vsll.vv v8, v8, v10 -; CHECK-RV32-NEXT: vor.vv v8, v9, v8 -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: vror_vx_v2i64: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-RV64-NEXT: vmv.v.x v9, a0 -; CHECK-RV64-NEXT: li a0, 63 -; CHECK-RV64-NEXT: vand.vx v10, v9, a0 -; CHECK-RV64-NEXT: vsrl.vv v10, v8, v10 -; CHECK-RV64-NEXT: vrsub.vi v9, v9, 0 -; CHECK-RV64-NEXT: vand.vx v9, v9, a0 -; CHECK-RV64-NEXT: vsll.vv v8, v8, v9 -; CHECK-RV64-NEXT: vor.vv v8, v10, v8 -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: vror_vx_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vand.vx v10, v9, a0 +; CHECK-NEXT: vsrl.vv v10, v8, v10 +; CHECK-NEXT: vrsub.vi v9, v9, 0 +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: vsll.vv v8, v8, v9 +; CHECK-NEXT: vor.vv v8, v10, v8 +; CHECK-NEXT: ret ; ; CHECK-ZVBB-LABEL: vror_vx_v2i64: ; CHECK-ZVBB: # %bb.0: @@ -1792,19 +1719,16 @@ define <2 x i64> @vror_vi_v2i64(<2 x i64> %a) { ; CHECK-RV32-LABEL: vror_vi_v2i64: ; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: li a0, 63 ; CHECK-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-RV32-NEXT: vmv.v.x v9, a0 -; CHECK-RV32-NEXT: vand.vi v9, v9, 1 -; CHECK-RV32-NEXT: vsrl.vv v9, v8, v9 -; CHECK-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-RV32-NEXT: vmv.v.i v10, 0 -; CHECK-RV32-NEXT: li a1, 1 -; CHECK-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-RV32-NEXT: vsub.vx v10, v10, a1 -; CHECK-RV32-NEXT: vand.vx v10, v10, a0 -; CHECK-RV32-NEXT: vsll.vv v8, v8, v10 -; CHECK-RV32-NEXT: vor.vv v8, v9, v8 +; CHECK-RV32-NEXT: vmv.v.i v9, 1 +; CHECK-RV32-NEXT: vrsub.vi v9, v9, 0 +; CHECK-RV32-NEXT: li a0, 63 +; CHECK-RV32-NEXT: vand.vx v9, v9, a0 +; CHECK-RV32-NEXT: vsll.vv v9, v8, v9 +; CHECK-RV32-NEXT: vmv.v.x v10, a0 +; CHECK-RV32-NEXT: vand.vi v10, v10, 1 +; CHECK-RV32-NEXT: vsrl.vv v8, v8, v10 +; CHECK-RV32-NEXT: vor.vv v8, v8, v9 ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: vror_vi_v2i64: @@ -1828,19 +1752,16 @@ define <2 x i64> @vror_vi_rotl_v2i64(<2 x i64> %a) { ; CHECK-RV32-LABEL: vror_vi_rotl_v2i64: ; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: li a0, 63 ; CHECK-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-RV32-NEXT: vmv.v.x v9, a0 -; CHECK-RV32-NEXT: vand.vi v9, v9, 1 -; CHECK-RV32-NEXT: vsll.vv v9, v8, v9 -; CHECK-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-RV32-NEXT: vmv.v.i v10, 0 -; CHECK-RV32-NEXT: li a1, 1 -; CHECK-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-RV32-NEXT: vsub.vx v10, v10, a1 -; CHECK-RV32-NEXT: vand.vx v10, v10, a0 -; CHECK-RV32-NEXT: vsrl.vv v8, v8, v10 -; CHECK-RV32-NEXT: vor.vv v8, v9, v8 +; CHECK-RV32-NEXT: vmv.v.i v9, 1 +; CHECK-RV32-NEXT: vrsub.vi v9, v9, 0 +; CHECK-RV32-NEXT: li a0, 63 +; 
CHECK-RV32-NEXT: vand.vx v9, v9, a0 +; CHECK-RV32-NEXT: vsrl.vv v9, v8, v9 +; CHECK-RV32-NEXT: vmv.v.x v10, a0 +; CHECK-RV32-NEXT: vand.vi v10, v10, 1 +; CHECK-RV32-NEXT: vsll.vv v8, v8, v10 +; CHECK-RV32-NEXT: vor.vv v8, v8, v9 ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: vror_vi_rotl_v2i64: @@ -1865,32 +1786,17 @@ declare <4 x i64> @llvm.fshl.v4i64(<4 x i64>, <4 x i64>, <4 x i64>) define <4 x i64> @vror_vv_v4i64(<4 x i64> %a, <4 x i64> %b) { -; CHECK-RV32-LABEL: vror_vv_v4i64: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: li a0, 63 -; CHECK-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-RV32-NEXT: vand.vx v12, v10, a0 -; CHECK-RV32-NEXT: vsrl.vv v12, v8, v12 -; CHECK-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-RV32-NEXT: vmv.v.i v14, 0 -; CHECK-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-RV32-NEXT: vsub.vv v10, v14, v10 -; CHECK-RV32-NEXT: vand.vx v10, v10, a0 -; CHECK-RV32-NEXT: vsll.vv v8, v8, v10 -; CHECK-RV32-NEXT: vor.vv v8, v12, v8 -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: vror_vv_v4i64: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: li a0, 63 -; CHECK-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-RV64-NEXT: vand.vx v12, v10, a0 -; CHECK-RV64-NEXT: vsrl.vv v12, v8, v12 -; CHECK-RV64-NEXT: vrsub.vi v10, v10, 0 -; CHECK-RV64-NEXT: vand.vx v10, v10, a0 -; CHECK-RV64-NEXT: vsll.vv v8, v8, v10 -; CHECK-RV64-NEXT: vor.vv v8, v12, v8 -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: vror_vv_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vand.vx v12, v10, a0 +; CHECK-NEXT: vsrl.vv v12, v8, v12 +; CHECK-NEXT: vrsub.vi v10, v10, 0 +; CHECK-NEXT: vand.vx v10, v10, a0 +; CHECK-NEXT: vsll.vv v8, v8, v10 +; CHECK-NEXT: vor.vv v8, v12, v8 +; CHECK-NEXT: ret ; ; CHECK-ZVBB-LABEL: vror_vv_v4i64: ; CHECK-ZVBB: # %bb.0: @@ -1902,34 +1808,18 @@ } define <4 x i64> @vror_vx_v4i64(<4 x i64> %a, i64 %b) { -; CHECK-RV32-LABEL: vror_vx_v4i64: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-RV32-NEXT: vmv.v.x v10, a0 -; CHECK-RV32-NEXT: li a1, 63 -; CHECK-RV32-NEXT: vand.vx v10, v10, a1 -; CHECK-RV32-NEXT: vsrl.vv v10, v8, v10 -; CHECK-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-RV32-NEXT: vmv.v.i v12, 0 -; CHECK-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-RV32-NEXT: vsub.vx v12, v12, a0 -; CHECK-RV32-NEXT: vand.vx v12, v12, a1 -; CHECK-RV32-NEXT: vsll.vv v8, v8, v12 -; CHECK-RV32-NEXT: vor.vv v8, v10, v8 -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: vror_vx_v4i64: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-RV64-NEXT: vmv.v.x v10, a0 -; CHECK-RV64-NEXT: li a0, 63 -; CHECK-RV64-NEXT: vand.vx v12, v10, a0 -; CHECK-RV64-NEXT: vsrl.vv v12, v8, v12 -; CHECK-RV64-NEXT: vrsub.vi v10, v10, 0 -; CHECK-RV64-NEXT: vand.vx v10, v10, a0 -; CHECK-RV64-NEXT: vsll.vv v8, v8, v10 -; CHECK-RV64-NEXT: vor.vv v8, v12, v8 -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: vror_vx_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vand.vx v12, v10, a0 +; CHECK-NEXT: vsrl.vv v12, v8, v12 +; CHECK-NEXT: vrsub.vi v10, v10, 0 +; CHECK-NEXT: vand.vx v10, v10, a0 +; CHECK-NEXT: vsll.vv v8, v8, v10 +; CHECK-NEXT: vor.vv v8, v12, v8 +; CHECK-NEXT: ret ; ; CHECK-ZVBB-LABEL: vror_vx_v4i64: ; CHECK-ZVBB: # %bb.0: @@ -1945,19 +1835,16 @@ define <4 x i64> @vror_vi_v4i64(<4 x i64> %a) { ; CHECK-RV32-LABEL: vror_vi_v4i64: ; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: li 
a0, 63 -; CHECK-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-RV32-NEXT: vmv.v.x v10, a0 -; CHECK-RV32-NEXT: vand.vi v10, v10, 1 -; CHECK-RV32-NEXT: vsrl.vv v10, v8, v10 -; CHECK-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-RV32-NEXT: vmv.v.i v12, 0 -; CHECK-RV32-NEXT: li a1, 1 ; CHECK-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-RV32-NEXT: vsub.vx v12, v12, a1 -; CHECK-RV32-NEXT: vand.vx v12, v12, a0 -; CHECK-RV32-NEXT: vsll.vv v8, v8, v12 -; CHECK-RV32-NEXT: vor.vv v8, v10, v8 +; CHECK-RV32-NEXT: vmv.v.i v10, 1 +; CHECK-RV32-NEXT: vrsub.vi v10, v10, 0 +; CHECK-RV32-NEXT: li a0, 63 +; CHECK-RV32-NEXT: vand.vx v10, v10, a0 +; CHECK-RV32-NEXT: vsll.vv v10, v8, v10 +; CHECK-RV32-NEXT: vmv.v.x v12, a0 +; CHECK-RV32-NEXT: vand.vi v12, v12, 1 +; CHECK-RV32-NEXT: vsrl.vv v8, v8, v12 +; CHECK-RV32-NEXT: vor.vv v8, v8, v10 ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: vror_vi_v4i64: @@ -1981,19 +1868,16 @@ define <4 x i64> @vror_vi_rotl_v4i64(<4 x i64> %a) { ; CHECK-RV32-LABEL: vror_vi_rotl_v4i64: ; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: li a0, 63 -; CHECK-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-RV32-NEXT: vmv.v.x v10, a0 -; CHECK-RV32-NEXT: vand.vi v10, v10, 1 -; CHECK-RV32-NEXT: vsll.vv v10, v8, v10 -; CHECK-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-RV32-NEXT: vmv.v.i v12, 0 -; CHECK-RV32-NEXT: li a1, 1 ; CHECK-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-RV32-NEXT: vsub.vx v12, v12, a1 -; CHECK-RV32-NEXT: vand.vx v12, v12, a0 -; CHECK-RV32-NEXT: vsrl.vv v8, v8, v12 -; CHECK-RV32-NEXT: vor.vv v8, v10, v8 +; CHECK-RV32-NEXT: vmv.v.i v10, 1 +; CHECK-RV32-NEXT: vrsub.vi v10, v10, 0 +; CHECK-RV32-NEXT: li a0, 63 +; CHECK-RV32-NEXT: vand.vx v10, v10, a0 +; CHECK-RV32-NEXT: vsrl.vv v10, v8, v10 +; CHECK-RV32-NEXT: vmv.v.x v12, a0 +; CHECK-RV32-NEXT: vand.vi v12, v12, 1 +; CHECK-RV32-NEXT: vsll.vv v8, v8, v12 +; CHECK-RV32-NEXT: vor.vv v8, v8, v10 ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: vror_vi_rotl_v4i64: @@ -2018,32 +1902,17 @@ declare <8 x i64> @llvm.fshl.v8i64(<8 x i64>, <8 x i64>, <8 x i64>) define <8 x i64> @vror_vv_v8i64(<8 x i64> %a, <8 x i64> %b) { -; CHECK-RV32-LABEL: vror_vv_v8i64: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: li a0, 63 -; CHECK-RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-RV32-NEXT: vand.vx v16, v12, a0 -; CHECK-RV32-NEXT: vsrl.vv v16, v8, v16 -; CHECK-RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-RV32-NEXT: vmv.v.i v20, 0 -; CHECK-RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-RV32-NEXT: vsub.vv v12, v20, v12 -; CHECK-RV32-NEXT: vand.vx v12, v12, a0 -; CHECK-RV32-NEXT: vsll.vv v8, v8, v12 -; CHECK-RV32-NEXT: vor.vv v8, v16, v8 -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: vror_vv_v8i64: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: li a0, 63 -; CHECK-RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-RV64-NEXT: vand.vx v16, v12, a0 -; CHECK-RV64-NEXT: vsrl.vv v16, v8, v16 -; CHECK-RV64-NEXT: vrsub.vi v12, v12, 0 -; CHECK-RV64-NEXT: vand.vx v12, v12, a0 -; CHECK-RV64-NEXT: vsll.vv v8, v8, v12 -; CHECK-RV64-NEXT: vor.vv v8, v16, v8 -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: vror_vv_v8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; CHECK-NEXT: vand.vx v16, v12, a0 +; CHECK-NEXT: vsrl.vv v16, v8, v16 +; CHECK-NEXT: vrsub.vi v12, v12, 0 +; CHECK-NEXT: vand.vx v12, v12, a0 +; CHECK-NEXT: vsll.vv v8, v8, v12 +; CHECK-NEXT: vor.vv v8, v16, v8 +; CHECK-NEXT: ret ; ; CHECK-ZVBB-LABEL: vror_vv_v8i64: ; CHECK-ZVBB: # %bb.0: @@ -2055,34 +1924,18 
@@ } define <8 x i64> @vror_vx_v8i64(<8 x i64> %a, i64 %b) { -; CHECK-RV32-LABEL: vror_vx_v8i64: -; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-RV32-NEXT: vmv.v.x v12, a0 -; CHECK-RV32-NEXT: li a1, 63 -; CHECK-RV32-NEXT: vand.vx v12, v12, a1 -; CHECK-RV32-NEXT: vsrl.vv v12, v8, v12 -; CHECK-RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-RV32-NEXT: vmv.v.i v16, 0 -; CHECK-RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-RV32-NEXT: vsub.vx v16, v16, a0 -; CHECK-RV32-NEXT: vand.vx v16, v16, a1 -; CHECK-RV32-NEXT: vsll.vv v8, v8, v16 -; CHECK-RV32-NEXT: vor.vv v8, v12, v8 -; CHECK-RV32-NEXT: ret -; -; CHECK-RV64-LABEL: vror_vx_v8i64: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-RV64-NEXT: vmv.v.x v12, a0 -; CHECK-RV64-NEXT: li a0, 63 -; CHECK-RV64-NEXT: vand.vx v16, v12, a0 -; CHECK-RV64-NEXT: vsrl.vv v16, v8, v16 -; CHECK-RV64-NEXT: vrsub.vi v12, v12, 0 -; CHECK-RV64-NEXT: vand.vx v12, v12, a0 -; CHECK-RV64-NEXT: vsll.vv v8, v8, v12 -; CHECK-RV64-NEXT: vor.vv v8, v16, v8 -; CHECK-RV64-NEXT: ret +; CHECK-LABEL: vror_vx_v8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; CHECK-NEXT: vmv.v.x v12, a0 +; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vand.vx v16, v12, a0 +; CHECK-NEXT: vsrl.vv v16, v8, v16 +; CHECK-NEXT: vrsub.vi v12, v12, 0 +; CHECK-NEXT: vand.vx v12, v12, a0 +; CHECK-NEXT: vsll.vv v8, v8, v12 +; CHECK-NEXT: vor.vv v8, v16, v8 +; CHECK-NEXT: ret ; ; CHECK-ZVBB-LABEL: vror_vx_v8i64: ; CHECK-ZVBB: # %bb.0: @@ -2098,19 +1951,16 @@ define <8 x i64> @vror_vi_v8i64(<8 x i64> %a) { ; CHECK-RV32-LABEL: vror_vi_v8i64: ; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: li a0, 63 -; CHECK-RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-RV32-NEXT: vmv.v.x v12, a0 -; CHECK-RV32-NEXT: vand.vi v12, v12, 1 -; CHECK-RV32-NEXT: vsrl.vv v12, v8, v12 -; CHECK-RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-RV32-NEXT: vmv.v.i v16, 0 -; CHECK-RV32-NEXT: li a1, 1 ; CHECK-RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-RV32-NEXT: vsub.vx v16, v16, a1 -; CHECK-RV32-NEXT: vand.vx v16, v16, a0 -; CHECK-RV32-NEXT: vsll.vv v8, v8, v16 -; CHECK-RV32-NEXT: vor.vv v8, v12, v8 +; CHECK-RV32-NEXT: vmv.v.i v12, 1 +; CHECK-RV32-NEXT: vrsub.vi v12, v12, 0 +; CHECK-RV32-NEXT: li a0, 63 +; CHECK-RV32-NEXT: vand.vx v12, v12, a0 +; CHECK-RV32-NEXT: vsll.vv v12, v8, v12 +; CHECK-RV32-NEXT: vmv.v.x v16, a0 +; CHECK-RV32-NEXT: vand.vi v16, v16, 1 +; CHECK-RV32-NEXT: vsrl.vv v8, v8, v16 +; CHECK-RV32-NEXT: vor.vv v8, v8, v12 ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: vror_vi_v8i64: @@ -2134,19 +1984,16 @@ define <8 x i64> @vror_vi_rotl_v8i64(<8 x i64> %a) { ; CHECK-RV32-LABEL: vror_vi_rotl_v8i64: ; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: li a0, 63 ; CHECK-RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-RV32-NEXT: vmv.v.x v12, a0 -; CHECK-RV32-NEXT: vand.vi v12, v12, 1 -; CHECK-RV32-NEXT: vsll.vv v12, v8, v12 -; CHECK-RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-RV32-NEXT: vmv.v.i v16, 0 -; CHECK-RV32-NEXT: li a1, 1 -; CHECK-RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-RV32-NEXT: vsub.vx v16, v16, a1 -; CHECK-RV32-NEXT: vand.vx v16, v16, a0 -; CHECK-RV32-NEXT: vsrl.vv v8, v8, v16 -; CHECK-RV32-NEXT: vor.vv v8, v12, v8 +; CHECK-RV32-NEXT: vmv.v.i v12, 1 +; CHECK-RV32-NEXT: vrsub.vi v12, v12, 0 +; CHECK-RV32-NEXT: li a0, 63 +; CHECK-RV32-NEXT: vand.vx v12, v12, a0 +; CHECK-RV32-NEXT: vsrl.vv v12, v8, v12 +; CHECK-RV32-NEXT: vmv.v.x v16, a0 +; CHECK-RV32-NEXT: vand.vi v16, v16, 1 
+; CHECK-RV32-NEXT: vsll.vv v8, v8, v16 +; CHECK-RV32-NEXT: vor.vv v8, v8, v12 ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: vror_vi_rotl_v8i64: