diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -1267,6 +1267,12 @@ /// Whether this is a vector-predicated Opcode. bool isVPOpcode(unsigned Opcode); +/// Whether this is a vector-predicated binary operation opcode. +bool isVPBinaryOp(unsigned Opcode); + +/// Whether this is a vector-predicated reduction opcode. +bool isVPReduction(unsigned Opcode); + /// The operand position of the vector mask. Optional<unsigned> getVPMaskIdx(unsigned Opcode); diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def --- a/llvm/include/llvm/IR/VPIntrinsics.def +++ b/llvm/include/llvm/IR/VPIntrinsics.def @@ -116,6 +116,16 @@ #define HANDLE_VP_REDUCTION(ID, STARTPOS, VECTORPOS) #endif +// A property to infer VP binary-op SDNode opcodes automatically. +#ifndef PROPERTY_VP_BINARYOP_SDNODE +#define PROPERTY_VP_BINARYOP_SDNODE(ID) +#endif + +// A property to infer VP reduction SDNode opcodes automatically. 
+#ifndef PROPERTY_VP_REDUCTION_SDNODE +#define PROPERTY_VP_REDUCTION_SDNODE(ID) +#endif + /// } Property Macros ///// Integer Arithmetic { @@ -127,6 +137,7 @@ #define HELPER_REGISTER_BINARY_INT_VP(INTRIN, SDOPC, OPC) \ BEGIN_REGISTER_VP(INTRIN, 2, 3, SDOPC, -1) \ HANDLE_VP_TO_OPC(OPC) \ +PROPERTY_VP_BINARYOP_SDNODE(SDOPC) \ END_REGISTER_VP(INTRIN, SDOPC) @@ -186,6 +197,7 @@ BEGIN_REGISTER_VP(vp_##OPSUFFIX, 2, 3, SDOPC, -1) \ HANDLE_VP_TO_OPC(OPC) \ HANDLE_VP_TO_CONSTRAINEDFP(1, 1, experimental_constrained_##OPSUFFIX) \ + PROPERTY_VP_BINARYOP_SDNODE(SDOPC) \ END_REGISTER_VP(vp_##OPSUFFIX, SDOPC) // llvm.vp.fadd(x,y,mask,vlen) @@ -254,6 +266,7 @@ BEGIN_REGISTER_VP(VPINTRIN, 2, 3, SDOPC, -1) \ HANDLE_VP_TO_INTRIN(INTRIN) \ HANDLE_VP_REDUCTION(VPINTRIN, 0, 1) \ +PROPERTY_VP_REDUCTION_SDNODE(SDOPC) \ END_REGISTER_VP(VPINTRIN, SDOPC) // llvm.vp.reduce.add(start,x,mask,vlen) @@ -317,6 +330,8 @@ END_REGISTER_VP_SDNODE(SEQ_SDOPC) \ HANDLE_VP_TO_INTRIN(INTRIN) \ HANDLE_VP_REDUCTION(VPINTRIN, 0, 1) \ +PROPERTY_VP_REDUCTION_SDNODE(SDOPC) \ +PROPERTY_VP_REDUCTION_SDNODE(SEQ_SDOPC) \ END_REGISTER_VP_INTRINSIC(VPINTRIN) // llvm.vp.reduce.fadd(start,x,mask,vlen) @@ -354,3 +369,5 @@ #undef HANDLE_VP_TO_INTRIN #undef HANDLE_VP_IS_MEMOP #undef HANDLE_VP_REDUCTION +#undef PROPERTY_VP_BINARYOP_SDNODE +#undef PROPERTY_VP_REDUCTION_SDNODE diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -515,6 +515,7 @@ SDValue visitFP_TO_FP16(SDNode *N); SDValue visitFP16_TO_FP(SDNode *N); SDValue visitVECREDUCE(SDNode *N); + SDValue visitVPOp(SDNode *N); SDValue visitFADDForFMACombine(SDNode *N); SDValue visitFSUBForFMACombine(SDNode *N); @@ -1738,6 +1739,9 @@ case ISD::VECREDUCE_UMIN: case ISD::VECREDUCE_FMAX: case ISD::VECREDUCE_FMIN: return visitVECREDUCE(N); +#define BEGIN_REGISTER_VP_SDNODE(SDOPC, ...) 
case ISD::SDOPC: +#include "llvm/IR/VPIntrinsics.def" + return visitVPOp(N); } return SDValue(); } @@ -22038,6 +22042,40 @@ return SDValue(); } +SDValue DAGCombiner::visitVPOp(SDNode *N) { + // VP operations in which all vector elements are disabled - either by + // determining that the mask is all false or that the EVL is 0 - can be + // eliminated. + bool AreAllEltsDisabled = false; + if (auto EVLIdx = ISD::getVPExplicitVectorLengthIdx(N->getOpcode())) + AreAllEltsDisabled |= isNullConstant(N->getOperand(*EVLIdx)); + if (auto MaskIdx = ISD::getVPMaskIdx(N->getOpcode())) + AreAllEltsDisabled |= + ISD::isConstantSplatVectorAllZeros(N->getOperand(*MaskIdx).getNode()); + + // This is the only generic VP combine we support for now. + if (!AreAllEltsDisabled) + return SDValue(); + + // Binary operations can be replaced by UNDEF. + if (ISD::isVPBinaryOp(N->getOpcode())) + return DAG.getUNDEF(N->getValueType(0)); + + // VP Memory operations can be replaced by either the chain (stores) or the + // chain + undef (loads). + if (const auto *MemSD = dyn_cast<MemSDNode>(N)) { + if (MemSD->writeMem()) + return MemSD->getChain(); + return CombineTo(N, DAG.getUNDEF(N->getValueType(0)), MemSD->getChain()); + } + + // Reduction operations return the start operand when no elements are active. + if (ISD::isVPReduction(N->getOpcode())) + return N->getOperand(0); + + return SDValue(); +} + /// Returns a vector_shuffle if it able to transform an AND to a vector_shuffle /// with the destination vector and a zero vector. /// e.g. AND V, <0xffffffff, 0, 0xffffffff, 0>. 
==> diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -412,6 +412,28 @@ } } +bool ISD::isVPBinaryOp(unsigned Opcode) { + switch (Opcode) { + default: + return false; +#define PROPERTY_VP_BINARYOP_SDNODE(SDOPC) \ + case ISD::SDOPC: \ + return true; +#include "llvm/IR/VPIntrinsics.def" + } +} + +bool ISD::isVPReduction(unsigned Opcode) { + switch (Opcode) { + default: + return false; +#define PROPERTY_VP_REDUCTION_SDNODE(SDOPC) \ + case ISD::SDOPC: \ + return true; +#include "llvm/IR/VPIntrinsics.def" + } +} + /// The operand position of the vector mask. Optional<unsigned> ISD::getVPMaskIdx(unsigned Opcode) { switch (Opcode) { diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll @@ -479,13 +479,9 @@ define <256 x i8> @vadd_vi_v258i8_evl128(<256 x i8> %va, <256 x i1> %m) { ; CHECK-LABEL: vadd_vi_v258i8_evl128: ; CHECK: # %bb.0: -; CHECK-NEXT: addi a1, zero, 128 -; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu -; CHECK-NEXT: vle1.v v25, (a0) +; CHECK-NEXT: addi a0, zero, 128 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu ; CHECK-NEXT: vadd.vi v8, v8, -1, v0.t -; CHECK-NEXT: vsetivli zero, 0, e8, m8, ta, mu -; CHECK-NEXT: vmv1r.v v0, v25 -; CHECK-NEXT: vadd.vi v16, v16, -1, v0.t ; CHECK-NEXT: ret %elt.head = insertelement <256 x i8> undef, i8 -1, i32 0 %vb = shufflevector <256 x i8> %elt.head, <256 x i8> undef, <256 x i32> zeroinitializer @@ -1625,33 +1621,22 @@ ret <32 x i64> %v } -; FIXME: After splitting, the "high" vadd.vv is doing nothing; could be -; replaced by undef. +; FIXME: We don't match vadd.vi on RV32. 
define <32 x i64> @vadd_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) { ; RV32-LABEL: vadd_vx_v32i64_evl12: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV32-NEXT: vslidedown.vi v1, v0, 2 ; RV32-NEXT: addi a0, zero, 32 ; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, mu -; RV32-NEXT: vmv.v.i v24, -1 +; RV32-NEXT: vmv.v.i v16, -1 ; RV32-NEXT: vsetivli zero, 12, e64, m8, ta, mu -; RV32-NEXT: vadd.vv v8, v8, v24, v0.t -; RV32-NEXT: vsetivli zero, 0, e64, m8, ta, mu -; RV32-NEXT: vmv1r.v v0, v1 -; RV32-NEXT: vadd.vv v16, v16, v24, v0.t +; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vadd_vx_v32i64_evl12: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64-NEXT: vslidedown.vi v25, v0, 2 ; RV64-NEXT: vsetivli zero, 12, e64, m8, ta, mu ; RV64-NEXT: vadd.vi v8, v8, -1, v0.t -; RV64-NEXT: vsetivli zero, 0, e64, m8, ta, mu -; RV64-NEXT: vmv1r.v v0, v25 -; RV64-NEXT: vadd.vi v16, v16, -1, v0.t ; RV64-NEXT: ret %elt.head = insertelement <32 x i64> undef, i64 -1, i32 0 %vb = shufflevector <32 x i64> %elt.head, <32 x i64> undef, <32 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/undef-vp-ops.ll b/llvm/test/CodeGen/RISCV/rvv/undef-vp-ops.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/undef-vp-ops.ll @@ -0,0 +1,633 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+d,+experimental-zfh,+experimental-v -target-abi=ilp32d -riscv-v-vector-bits-min=128 \ +; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+d,+experimental-zfh,+experimental-v -target-abi=lp64d -riscv-v-vector-bits-min=128 \ +; RUN: -verify-machineinstrs < %s | FileCheck %s + +; Test that we can remove trivially-undef VP operations of various kinds. 
+ +declare <4 x i32> @llvm.vp.load.v4i32.p0v4i32(<4 x i32>*, <4 x i1>, i32) + +define <4 x i32> @vload_v4i32_zero_evl(<4 x i32>* %ptr, <4 x i1> %m) { +; CHECK-LABEL: vload_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %v = call <4 x i32> @llvm.vp.load.v4i32.p0v4i32(<4 x i32>* %ptr, <4 x i1> %m, i32 0) + ret <4 x i32> %v +} + +define <4 x i32> @vload_v4i32_false_mask(<4 x i32>* %ptr, i32 %evl) { +; CHECK-LABEL: vload_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %v = call <4 x i32> @llvm.vp.load.v4i32.p0v4i32(<4 x i32>* %ptr, <4 x i1> zeroinitializer, i32 %evl) + ret <4 x i32> %v +} + +declare <4 x i32> @llvm.vp.gather.v4i32.v4p0i32(<4 x i32*>, <4 x i1>, i32) + +define <4 x i32> @vgather_v4i32_v4i32_zero_evl(<4 x i32*> %ptrs, <4 x i1> %m) { +; CHECK-LABEL: vgather_v4i32_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %v = call <4 x i32> @llvm.vp.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, <4 x i1> %m, i32 0) + ret <4 x i32> %v +} + +define <4 x i32> @vgather_v4i32_v4i32_false_mask(<4 x i32*> %ptrs, i32 %evl) { +; CHECK-LABEL: vgather_v4i32_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %v = call <4 x i32> @llvm.vp.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, <4 x i1> zeroinitializer, i32 %evl) + ret <4 x i32> %v +} + +declare void @llvm.vp.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, <4 x i1>, i32) + +define void @vstore_v4i32_zero_evl(<4 x i32> %val, <4 x i32>* %ptr, <4 x i1> %m) { +; CHECK-LABEL: vstore_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + call void @llvm.vp.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %ptr, <4 x i1> %m, i32 0) + ret void +} + +define void @vstore_v4i32_false_mask(<4 x i32> %val, <4 x i32>* %ptr, i32 %evl) { +; CHECK-LABEL: vstore_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + call void @llvm.vp.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %ptr, <4 x i1> zeroinitializer, i32 %evl) + ret void +} + +declare void @llvm.vp.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, <4 x i1>, i32) + 
+define void @vscatter_v4i32_zero_evl(<4 x i32> %val, <4 x i32*> %ptrs, <4 x i1> %m) { +; CHECK-LABEL: vscatter_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + call void @llvm.vp.scatter.v4i32.v4p0i32(<4 x i32> %val, <4 x i32*> %ptrs, <4 x i1> %m, i32 0) + ret void +} + +define void @vscatter_v4i32_false_mask(<4 x i32> %val, <4 x i32*> %ptrs, i32 %evl) { +; CHECK-LABEL: vscatter_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + call void @llvm.vp.scatter.v4i32.v4p0i32(<4 x i32> %val, <4 x i32*> %ptrs, <4 x i1> zeroinitializer, i32 %evl) + ret void +} + +declare <4 x i32> @llvm.vp.add.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define <4 x i32> @vadd_v4i32_zero_evl(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m) { +; CHECK-LABEL: vadd_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 0) + ret <4 x i32> %s +} + +define <4 x i32> @vadd_v4i32_false_mask(<4 x i32> %va, <4 x i32> %vb, i32 %evl) { +; CHECK-LABEL: vadd_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> zeroinitializer, i32 %evl) + ret <4 x i32> %s +} + +declare <4 x i32> @llvm.vp.and.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define <4 x i32> @vand_v4i32_zero_evl(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m) { +; CHECK-LABEL: vand_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.and.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 0) + ret <4 x i32> %s +} + +define <4 x i32> @vand_v4i32_false_mask(<4 x i32> %va, <4 x i32> %vb, i32 %evl) { +; CHECK-LABEL: vand_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.and.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> zeroinitializer, i32 %evl) + ret <4 x i32> %s +} + +declare <4 x i32> @llvm.vp.lshr.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define <4 x i32> @vlshr_v4i32_zero_evl(<4 x i32> %va, <4 x 
i32> %vb, <4 x i1> %m) { +; CHECK-LABEL: vlshr_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.lshr.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 0) + ret <4 x i32> %s +} + +define <4 x i32> @vlshr_v4i32_false_mask(<4 x i32> %va, <4 x i32> %vb, i32 %evl) { +; CHECK-LABEL: vlshr_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.lshr.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> zeroinitializer, i32 %evl) + ret <4 x i32> %s +} + +declare <4 x i32> @llvm.vp.mul.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define <4 x i32> @vmul_v4i32_zero_evl(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m) { +; CHECK-LABEL: vmul_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.mul.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 0) + ret <4 x i32> %s +} + +define <4 x i32> @vmul_v4i32_false_mask(<4 x i32> %va, <4 x i32> %vb, i32 %evl) { +; CHECK-LABEL: vmul_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.mul.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> zeroinitializer, i32 %evl) + ret <4 x i32> %s +} + +declare <4 x i32> @llvm.vp.or.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define <4 x i32> @vor_v4i32_zero_evl(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m) { +; CHECK-LABEL: vor_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.or.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 0) + ret <4 x i32> %s +} + +define <4 x i32> @vor_v4i32_false_mask(<4 x i32> %va, <4 x i32> %vb, i32 %evl) { +; CHECK-LABEL: vor_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.or.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> zeroinitializer, i32 %evl) + ret <4 x i32> %s +} + +declare <4 x i32> @llvm.vp.sdiv.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define <4 x i32> @vsdiv_v4i32_zero_evl(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m) { +; CHECK-LABEL: vsdiv_v4i32_zero_evl: +; CHECK: 
# %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.sdiv.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 0) + ret <4 x i32> %s +} + +define <4 x i32> @vsdiv_v4i32_false_mask(<4 x i32> %va, <4 x i32> %vb, i32 %evl) { +; CHECK-LABEL: vsdiv_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.sdiv.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> zeroinitializer, i32 %evl) + ret <4 x i32> %s +} + +declare <4 x i32> @llvm.vp.srem.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define <4 x i32> @vsrem_v4i32_zero_evl(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m) { +; CHECK-LABEL: vsrem_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.srem.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 0) + ret <4 x i32> %s +} + +define <4 x i32> @vsrem_v4i32_false_mask(<4 x i32> %va, <4 x i32> %vb, i32 %evl) { +; CHECK-LABEL: vsrem_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.srem.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> zeroinitializer, i32 %evl) + ret <4 x i32> %s +} + +declare <4 x i32> @llvm.vp.sub.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define <4 x i32> @vsub_v4i32_zero_evl(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m) { +; CHECK-LABEL: vsub_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.sub.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 0) + ret <4 x i32> %s +} + +define <4 x i32> @vsub_v4i32_false_mask(<4 x i32> %va, <4 x i32> %vb, i32 %evl) { +; CHECK-LABEL: vsub_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.sub.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> zeroinitializer, i32 %evl) + ret <4 x i32> %s +} + +declare <4 x i32> @llvm.vp.udiv.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define <4 x i32> @vudiv_v4i32_zero_evl(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m) { +; CHECK-LABEL: vudiv_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> 
@llvm.vp.udiv.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 0) + ret <4 x i32> %s +} + +define <4 x i32> @vudiv_v4i32_false_mask(<4 x i32> %va, <4 x i32> %vb, i32 %evl) { +; CHECK-LABEL: vudiv_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.udiv.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> zeroinitializer, i32 %evl) + ret <4 x i32> %s +} + +declare <4 x i32> @llvm.vp.urem.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define <4 x i32> @vurem_v4i32_zero_evl(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m) { +; CHECK-LABEL: vurem_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.urem.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 0) + ret <4 x i32> %s +} + +define <4 x i32> @vurem_v4i32_false_mask(<4 x i32> %va, <4 x i32> %vb, i32 %evl) { +; CHECK-LABEL: vurem_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.urem.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> zeroinitializer, i32 %evl) + ret <4 x i32> %s +} + +declare <4 x i32> @llvm.vp.xor.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define <4 x i32> @vxor_v4i32_zero_evl(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m) { +; CHECK-LABEL: vxor_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.xor.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 0) + ret <4 x i32> %s +} + +define <4 x i32> @vxor_v4i32_false_mask(<4 x i32> %va, <4 x i32> %vb, i32 %evl) { +; CHECK-LABEL: vxor_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.xor.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> zeroinitializer, i32 %evl) + ret <4 x i32> %s +} + +declare <4 x float> @llvm.vp.fadd.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32) + +define <4 x float> @vfadd_v4f32_zero_evl(<4 x float> %va, <4 x float> %vb, <4 x i1> %m) { +; CHECK-LABEL: vfadd_v4f32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x float> @llvm.vp.fadd.v4f32(<4 x float> %va, <4 x 
float> %vb, <4 x i1> %m, i32 0) + ret <4 x float> %s +} + +define <4 x float> @vfadd_v4f32_false_mask(<4 x float> %va, <4 x float> %vb, i32 %evl) { +; CHECK-LABEL: vfadd_v4f32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x float> @llvm.vp.fadd.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> zeroinitializer, i32 %evl) + ret <4 x float> %s +} + +declare <4 x float> @llvm.vp.fsub.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32) + +define <4 x float> @vfsub_v4f32_zero_evl(<4 x float> %va, <4 x float> %vb, <4 x i1> %m) { +; CHECK-LABEL: vfsub_v4f32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x float> @llvm.vp.fsub.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 0) + ret <4 x float> %s +} + +define <4 x float> @vfsub_v4f32_false_mask(<4 x float> %va, <4 x float> %vb, i32 %evl) { +; CHECK-LABEL: vfsub_v4f32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x float> @llvm.vp.fsub.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> zeroinitializer, i32 %evl) + ret <4 x float> %s +} + +declare <4 x float> @llvm.vp.fmul.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32) + +define <4 x float> @vfmul_v4f32_zero_evl(<4 x float> %va, <4 x float> %vb, <4 x i1> %m) { +; CHECK-LABEL: vfmul_v4f32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x float> @llvm.vp.fmul.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 0) + ret <4 x float> %s +} + +define <4 x float> @vfmul_v4f32_false_mask(<4 x float> %va, <4 x float> %vb, i32 %evl) { +; CHECK-LABEL: vfmul_v4f32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x float> @llvm.vp.fmul.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> zeroinitializer, i32 %evl) + ret <4 x float> %s +} + +declare <4 x float> @llvm.vp.fdiv.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32) + +define <4 x float> @vfdiv_v4f32_zero_evl(<4 x float> %va, <4 x float> %vb, <4 x i1> %m) { +; CHECK-LABEL: vfdiv_v4f32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x 
float> @llvm.vp.fdiv.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 0) + ret <4 x float> %s +} + +define <4 x float> @vfdiv_v4f32_false_mask(<4 x float> %va, <4 x float> %vb, i32 %evl) { +; CHECK-LABEL: vfdiv_v4f32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x float> @llvm.vp.fdiv.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> zeroinitializer, i32 %evl) + ret <4 x float> %s +} + +declare <4 x float> @llvm.vp.frem.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32) + +define <4 x float> @vfrem_v4f32_zero_evl(<4 x float> %va, <4 x float> %vb, <4 x i1> %m) { +; CHECK-LABEL: vfrem_v4f32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x float> @llvm.vp.frem.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 0) + ret <4 x float> %s +} + +define <4 x float> @vfrem_v4f32_false_mask(<4 x float> %va, <4 x float> %vb, i32 %evl) { +; CHECK-LABEL: vfrem_v4f32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x float> @llvm.vp.frem.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> zeroinitializer, i32 %evl) + ret <4 x float> %s +} + +declare i32 @llvm.vp.reduce.add.v4i32(i32, <4 x i32>, <4 x i1>, i32) + +define i32 @vreduce_add_v4i32_zero_evl(i32 %start, <4 x i32> %val, <4 x i1> %m) { +; CHECK-LABEL: vreduce_add_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call i32 @llvm.vp.reduce.add.v4i32(i32 %start, <4 x i32> %val, <4 x i1> %m, i32 0) + ret i32 %s +} + +define i32 @vreduce_add_v4i32_false_mask(i32 %start, <4 x i32> %val, i32 %evl) { +; CHECK-LABEL: vreduce_add_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call i32 @llvm.vp.reduce.add.v4i32(i32 %start, <4 x i32> %val, <4 x i1> zeroinitializer, i32 %evl) + ret i32 %s +} + +declare i32 @llvm.vp.reduce.mul.v4i32(i32, <4 x i32>, <4 x i1>, i32) + +define i32 @vreduce_mul_v4i32_zero_evl(i32 %start, <4 x i32> %val, <4 x i1> %m) { +; CHECK-LABEL: vreduce_mul_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call i32 
@llvm.vp.reduce.mul.v4i32(i32 %start, <4 x i32> %val, <4 x i1> %m, i32 0) + ret i32 %s +} + +define i32 @vreduce_mul_v4i32_false_mask(i32 %start, <4 x i32> %val, i32 %evl) { +; CHECK-LABEL: vreduce_mul_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call i32 @llvm.vp.reduce.mul.v4i32(i32 %start, <4 x i32> %val, <4 x i1> zeroinitializer, i32 %evl) + ret i32 %s +} + +declare i32 @llvm.vp.reduce.and.v4i32(i32, <4 x i32>, <4 x i1>, i32) + +define i32 @vreduce_and_v4i32_zero_evl(i32 %start, <4 x i32> %val, <4 x i1> %m) { +; CHECK-LABEL: vreduce_and_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call i32 @llvm.vp.reduce.and.v4i32(i32 %start, <4 x i32> %val, <4 x i1> %m, i32 0) + ret i32 %s +} + +define i32 @vreduce_and_v4i32_false_mask(i32 %start, <4 x i32> %val, i32 %evl) { +; CHECK-LABEL: vreduce_and_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call i32 @llvm.vp.reduce.and.v4i32(i32 %start, <4 x i32> %val, <4 x i1> zeroinitializer, i32 %evl) + ret i32 %s +} + +declare i32 @llvm.vp.reduce.or.v4i32(i32, <4 x i32>, <4 x i1>, i32) + +define i32 @vreduce_or_v4i32_zero_evl(i32 %start, <4 x i32> %val, <4 x i1> %m) { +; CHECK-LABEL: vreduce_or_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call i32 @llvm.vp.reduce.or.v4i32(i32 %start, <4 x i32> %val, <4 x i1> %m, i32 0) + ret i32 %s +} + +define i32 @vreduce_or_v4i32_false_mask(i32 %start, <4 x i32> %val, i32 %evl) { +; CHECK-LABEL: vreduce_or_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call i32 @llvm.vp.reduce.or.v4i32(i32 %start, <4 x i32> %val, <4 x i1> zeroinitializer, i32 %evl) + ret i32 %s +} + +declare i32 @llvm.vp.reduce.xor.v4i32(i32, <4 x i32>, <4 x i1>, i32) + +define i32 @vreduce_xor_v4i32_zero_evl(i32 %start, <4 x i32> %val, <4 x i1> %m) { +; CHECK-LABEL: vreduce_xor_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call i32 @llvm.vp.reduce.xor.v4i32(i32 %start, <4 x i32> %val, <4 x i1> %m, i32 0) + ret i32 %s +} + +define 
i32 @vreduce_xor_v4i32_false_mask(i32 %start, <4 x i32> %val, i32 %evl) { +; CHECK-LABEL: vreduce_xor_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call i32 @llvm.vp.reduce.xor.v4i32(i32 %start, <4 x i32> %val, <4 x i1> zeroinitializer, i32 %evl) + ret i32 %s +} + +declare i32 @llvm.vp.reduce.smax.v4i32(i32, <4 x i32>, <4 x i1>, i32) + +define i32 @vreduce_smax_v4i32_zero_evl(i32 %start, <4 x i32> %val, <4 x i1> %m) { +; CHECK-LABEL: vreduce_smax_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call i32 @llvm.vp.reduce.smax.v4i32(i32 %start, <4 x i32> %val, <4 x i1> %m, i32 0) + ret i32 %s +} + +define i32 @vreduce_smax_v4i32_false_mask(i32 %start, <4 x i32> %val, i32 %evl) { +; CHECK-LABEL: vreduce_smax_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call i32 @llvm.vp.reduce.smax.v4i32(i32 %start, <4 x i32> %val, <4 x i1> zeroinitializer, i32 %evl) + ret i32 %s +} + +declare i32 @llvm.vp.reduce.smin.v4i32(i32, <4 x i32>, <4 x i1>, i32) + +define i32 @vreduce_smin_v4i32_zero_evl(i32 %start, <4 x i32> %val, <4 x i1> %m) { +; CHECK-LABEL: vreduce_smin_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call i32 @llvm.vp.reduce.smin.v4i32(i32 %start, <4 x i32> %val, <4 x i1> %m, i32 0) + ret i32 %s +} + +define i32 @vreduce_smin_v4i32_false_mask(i32 %start, <4 x i32> %val, i32 %evl) { +; CHECK-LABEL: vreduce_smin_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call i32 @llvm.vp.reduce.smin.v4i32(i32 %start, <4 x i32> %val, <4 x i1> zeroinitializer, i32 %evl) + ret i32 %s +} + +declare i32 @llvm.vp.reduce.umax.v4i32(i32, <4 x i32>, <4 x i1>, i32) + +define i32 @vreduce_umax_v4i32_zero_evl(i32 %start, <4 x i32> %val, <4 x i1> %m) { +; CHECK-LABEL: vreduce_umax_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call i32 @llvm.vp.reduce.umax.v4i32(i32 %start, <4 x i32> %val, <4 x i1> %m, i32 0) + ret i32 %s +} + +define i32 @vreduce_umax_v4i32_false_mask(i32 %start, <4 x i32> %val, i32 %evl) { 
+; CHECK-LABEL: vreduce_umax_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call i32 @llvm.vp.reduce.umax.v4i32(i32 %start, <4 x i32> %val, <4 x i1> zeroinitializer, i32 %evl) + ret i32 %s +} + +declare i32 @llvm.vp.reduce.umin.v4i32(i32, <4 x i32>, <4 x i1>, i32) + +define i32 @vreduce_umin_v4i32_zero_evl(i32 %start, <4 x i32> %val, <4 x i1> %m) { +; CHECK-LABEL: vreduce_umin_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call i32 @llvm.vp.reduce.umin.v4i32(i32 %start, <4 x i32> %val, <4 x i1> %m, i32 0) + ret i32 %s +} + +define i32 @vreduce_umin_v4i32_false_mask(i32 %start, <4 x i32> %val, i32 %evl) { +; CHECK-LABEL: vreduce_umin_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call i32 @llvm.vp.reduce.umin.v4i32(i32 %start, <4 x i32> %val, <4 x i1> zeroinitializer, i32 %evl) + ret i32 %s +} + +declare float @llvm.vp.reduce.fadd.v4f32(float, <4 x float>, <4 x i1>, i32) + +define float @vreduce_seq_fadd_v4f32_zero_evl(float %start, <4 x float> %val, <4 x i1> %m) { +; CHECK-LABEL: vreduce_seq_fadd_v4f32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call float @llvm.vp.reduce.fadd.v4f32(float %start, <4 x float> %val, <4 x i1> %m, i32 0) + ret float %s +} + +define float @vreduce_seq_fadd_v4f32_false_mask(float %start, <4 x float> %val, i32 %evl) { +; CHECK-LABEL: vreduce_seq_fadd_v4f32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call float @llvm.vp.reduce.fadd.v4f32(float %start, <4 x float> %val, <4 x i1> zeroinitializer, i32 %evl) + ret float %s +} + +define float @vreduce_fadd_v4f32_zero_evl(float %start, <4 x float> %val, <4 x i1> %m) { +; CHECK-LABEL: vreduce_fadd_v4f32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call reassoc float @llvm.vp.reduce.fadd.v4f32(float %start, <4 x float> %val, <4 x i1> %m, i32 0) + ret float %s +} + +define float @vreduce_fadd_v4f32_false_mask(float %start, <4 x float> %val, i32 %evl) { +; CHECK-LABEL: vreduce_fadd_v4f32_false_mask: +; CHECK: # %bb.0: +; 
CHECK-NEXT: ret + %s = call reassoc float @llvm.vp.reduce.fadd.v4f32(float %start, <4 x float> %val, <4 x i1> zeroinitializer, i32 %evl) + ret float %s +} + +declare float @llvm.vp.reduce.fmul.v4f32(float, <4 x float>, <4 x i1>, i32) + +define float @vreduce_seq_fmul_v4f32_zero_evl(float %start, <4 x float> %val, <4 x i1> %m) { +; CHECK-LABEL: vreduce_seq_fmul_v4f32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call float @llvm.vp.reduce.fmul.v4f32(float %start, <4 x float> %val, <4 x i1> %m, i32 0) + ret float %s +} + +define float @vreduce_seq_fmul_v4f32_false_mask(float %start, <4 x float> %val, i32 %evl) { +; CHECK-LABEL: vreduce_seq_fmul_v4f32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call float @llvm.vp.reduce.fmul.v4f32(float %start, <4 x float> %val, <4 x i1> zeroinitializer, i32 %evl) + ret float %s +} + +define float @vreduce_fmul_v4f32_zero_evl(float %start, <4 x float> %val, <4 x i1> %m) { +; CHECK-LABEL: vreduce_fmul_v4f32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call reassoc float @llvm.vp.reduce.fmul.v4f32(float %start, <4 x float> %val, <4 x i1> %m, i32 0) + ret float %s +} + +define float @vreduce_fmul_v4f32_false_mask(float %start, <4 x float> %val, i32 %evl) { +; CHECK-LABEL: vreduce_fmul_v4f32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call reassoc float @llvm.vp.reduce.fmul.v4f32(float %start, <4 x float> %val, <4 x i1> zeroinitializer, i32 %evl) + ret float %s +} + +declare float @llvm.vp.reduce.fmin.v4f32(float, <4 x float>, <4 x i1>, i32) + +define float @vreduce_fmin_v4f32_zero_evl(float %start, <4 x float> %val, <4 x i1> %m) { +; CHECK-LABEL: vreduce_fmin_v4f32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call float @llvm.vp.reduce.fmin.v4f32(float %start, <4 x float> %val, <4 x i1> %m, i32 0) + ret float %s +} + +define float @vreduce_fmin_v4f32_false_mask(float %start, <4 x float> %val, i32 %evl) { +; CHECK-LABEL: vreduce_fmin_v4f32_false_mask: +; CHECK: # %bb.0: +; 
CHECK-NEXT: ret + %s = call float @llvm.vp.reduce.fmin.v4f32(float %start, <4 x float> %val, <4 x i1> zeroinitializer, i32 %evl) + ret float %s +} + +declare float @llvm.vp.reduce.fmax.v4f32(float, <4 x float>, <4 x i1>, i32) + +define float @vreduce_fmax_v4f32_zero_evl(float %start, <4 x float> %val, <4 x i1> %m) { +; CHECK-LABEL: vreduce_fmax_v4f32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call float @llvm.vp.reduce.fmax.v4f32(float %start, <4 x float> %val, <4 x i1> %m, i32 0) + ret float %s +} + +define float @vreduce_fmax_v4f32_false_mask(float %start, <4 x float> %val, i32 %evl) { +; CHECK-LABEL: vreduce_fmax_v4f32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call float @llvm.vp.reduce.fmax.v4f32(float %start, <4 x float> %val, <4 x i1> zeroinitializer, i32 %evl) + ret float %s +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll @@ -1637,22 +1637,35 @@ } ; FIXME: The first vadd.vi should be able to infer that its AVL is equivalent to VLMAX. -; FIXME: The upper half of the operation is doing nothing. +; FIXME: The upper half of the operation is doing nothing but we don't catch +; that on RV64; we issue a usubsat(and (vscale x 16), 0xffffffff, vscale x 16) +; (the "original" %evl is the "and", due to known-bits issues with legalizing +; the i32 %evl to i64) and this isn't detected as 0. +; This could be resolved in the future with more detailed KnownBits analysis +; for ISD::VSCALE. 
define <vscale x 32 x i32> @vadd_vi_nxv32i32_evl_nx16(<vscale x 32 x i32> %va, <vscale x 32 x i1> %m) { -; CHECK-LABEL: vadd_vi_nxv32i32_evl_nx16: -; CHECK: # %bb.0: -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a1, a0, 2 -; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, mu -; CHECK-NEXT: vslidedown.vx v25, v0, a1 -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, mu -; CHECK-NEXT: vadd.vi v8, v8, -1, v0.t -; CHECK-NEXT: vsetivli zero, 0, e32, m8, ta, mu -; CHECK-NEXT: vmv1r.v v0, v25 -; CHECK-NEXT: vadd.vi v16, v16, -1, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vadd_vi_nxv32i32_evl_nx16: +; RV32: # %bb.0: +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 1 +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, mu +; RV32-NEXT: vadd.vi v8, v8, -1, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vadd_vi_nxv32i32_evl_nx16: +; RV64: # %bb.0: +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: srli a1, a0, 2 +; RV64-NEXT: vsetvli a2, zero, e8, mf2, ta, mu +; RV64-NEXT: vslidedown.vx v25, v0, a1 +; RV64-NEXT: slli a0, a0, 1 +; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, mu +; RV64-NEXT: vadd.vi v8, v8, -1, v0.t +; RV64-NEXT: vsetivli zero, 0, e32, m8, ta, mu +; RV64-NEXT: vmv1r.v v0, v25 +; RV64-NEXT: vadd.vi v16, v16, -1, v0.t +; RV64-NEXT: ret %elt.head = insertelement <vscale x 32 x i32> undef, i32 -1, i32 0 %vb = shufflevector <vscale x 32 x i32> %elt.head, <vscale x 32 x i32> undef, <vscale x 32 x i32> zeroinitializer %evl = call i32 @llvm.vscale.i32()