diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -515,6 +515,7 @@
     SDValue visitFP_TO_FP16(SDNode *N);
     SDValue visitFP16_TO_FP(SDNode *N);
     SDValue visitVECREDUCE(SDNode *N);
+    SDValue visitVPOp(SDNode *N);
 
     SDValue visitFADDForFMACombine(SDNode *N);
     SDValue visitFSUBForFMACombine(SDNode *N);
@@ -1738,6 +1739,9 @@
   case ISD::VECREDUCE_UMIN:
   case ISD::VECREDUCE_FMAX:
   case ISD::VECREDUCE_FMIN: return visitVECREDUCE(N);
+#define BEGIN_REGISTER_VP_SDNODE(SDOPC, ...) case ISD::SDOPC:
+#include "llvm/IR/VPIntrinsics.def"
+    return visitVPOp(N);
   }
   return SDValue();
 }
@@ -22019,6 +22023,82 @@
   return SDValue();
 }
 
+/// Generic combine for VP (vector-predicated) nodes. The only fold so far
+/// removes nodes with no enabled elements, i.e. whose EVL operand is the
+/// constant 0 or whose mask operand is a constant all-zeros splat: binary ops
+/// become UNDEF, stores/scatters become their chain, loads/gathers become
+/// UNDEF plus their chain, and reductions become their start operand
+/// (operand 0). Returns an empty SDValue when no fold applies.
+SDValue DAGCombiner::visitVPOp(SDNode *N) {
+  // VP operations in which all vector elements are disabled - either by
+  // determining that the mask is all false or that the EVL is 0 - can be
+  // eliminated.
+  bool AreAllEltsDisabled = false;
+  if (auto EVLIdx = ISD::getVPExplicitVectorLengthIdx(N->getOpcode()))
+    AreAllEltsDisabled |= isNullConstant(N->getOperand(*EVLIdx));
+  if (auto MaskIdx = ISD::getVPMaskIdx(N->getOpcode()))
+    AreAllEltsDisabled |=
+        ISD::isConstantSplatVectorAllZeros(N->getOperand(*MaskIdx).getNode());
+
+  // This is the only generic VP combine we support for now.
+  if (!AreAllEltsDisabled)
+    return SDValue();
+
+  switch (N->getOpcode()) {
+  default:
+    break;
+  case ISD::VP_ADD:
+  case ISD::VP_AND:
+  case ISD::VP_ASHR:
+  case ISD::VP_LSHR:
+  case ISD::VP_MUL:
+  case ISD::VP_OR:
+  case ISD::VP_SDIV:
+  case ISD::VP_SHL:
+  case ISD::VP_SREM:
+  case ISD::VP_SUB:
+  case ISD::VP_UDIV:
+  case ISD::VP_UREM:
+  case ISD::VP_XOR:
+  case ISD::VP_FADD:
+  case ISD::VP_FSUB:
+  case ISD::VP_FMUL:
+  case ISD::VP_FDIV:
+  case ISD::VP_FREM:
+    // These nodes can be replaced by UNDEF.
+    return DAG.getUNDEF(N->getValueType(0));
+  case ISD::VP_STORE:
+  case ISD::VP_SCATTER:
+    // VP_STORE/VP_SCATTER can be eliminated but replaced by their chains.
+    return cast<MemSDNode>(N)->getChain();
+  case ISD::VP_LOAD:
+  case ISD::VP_GATHER:
+    // VP_LOAD/VP_GATHER can be replaced by UNDEF, forwarding on their chains.
+    return CombineTo(N, DAG.getUNDEF(N->getValueType(0)),
+                     cast<MemSDNode>(N)->getChain());
+  case ISD::VP_REDUCE_ADD:
+  case ISD::VP_REDUCE_MUL:
+  case ISD::VP_REDUCE_AND:
+  case ISD::VP_REDUCE_OR:
+  case ISD::VP_REDUCE_XOR:
+  case ISD::VP_REDUCE_SMAX:
+  case ISD::VP_REDUCE_SMIN:
+  case ISD::VP_REDUCE_UMAX:
+  case ISD::VP_REDUCE_UMIN:
+  case ISD::VP_REDUCE_FMAX:
+  case ISD::VP_REDUCE_FMIN:
+  case ISD::VP_REDUCE_FADD:
+  case ISD::VP_REDUCE_FMUL:
+  case ISD::VP_REDUCE_SEQ_FADD:
+  case ISD::VP_REDUCE_SEQ_FMUL:
+    // Reduction operations return the start operand when no elements are
+    // active.
+    return N->getOperand(0);
+  }
+
+  return SDValue();
+}
+
 /// Returns a vector_shuffle if it able to transform an AND to a vector_shuffle
 /// with the destination vector and a zero vector.
 /// e.g. AND V, <0xffffffff, 0, 0xffffffff, 0>.
==> diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll @@ -467,13 +467,9 @@ define <256 x i8> @vadd_vi_v258i8_evl128(<256 x i8> %va, <256 x i1> %m) { ; CHECK-LABEL: vadd_vi_v258i8_evl128: ; CHECK: # %bb.0: -; CHECK-NEXT: addi a1, zero, 128 -; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu -; CHECK-NEXT: vle1.v v25, (a0) +; CHECK-NEXT: addi a0, zero, 128 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu ; CHECK-NEXT: vadd.vi v8, v8, -1, v0.t -; CHECK-NEXT: vsetivli zero, 0, e8, m8, ta, mu -; CHECK-NEXT: vmv1r.v v0, v25 -; CHECK-NEXT: vadd.vi v16, v16, -1, v0.t ; CHECK-NEXT: ret %elt.head = insertelement <256 x i8> undef, i8 -1, i32 0 %vb = shufflevector <256 x i8> %elt.head, <256 x i8> undef, <256 x i32> zeroinitializer @@ -1613,33 +1609,22 @@ ret <32 x i64> %v } -; FIXME: After splitting, the "high" vadd.vv is doing nothing; could be -; replaced by undef. +; FIXME: We don't match vadd.vi on RV32. 
define <32 x i64> @vadd_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) { ; RV32-LABEL: vadd_vx_v32i64_evl12: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV32-NEXT: vslidedown.vi v1, v0, 2 ; RV32-NEXT: addi a0, zero, 32 ; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, mu -; RV32-NEXT: vmv.v.i v24, -1 +; RV32-NEXT: vmv.v.i v16, -1 ; RV32-NEXT: vsetivli zero, 12, e64, m8, ta, mu -; RV32-NEXT: vadd.vv v8, v8, v24, v0.t -; RV32-NEXT: vsetivli zero, 0, e64, m8, ta, mu -; RV32-NEXT: vmv1r.v v0, v1 -; RV32-NEXT: vadd.vv v16, v16, v24, v0.t +; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vadd_vx_v32i64_evl12: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, mu -; RV64-NEXT: vslidedown.vi v25, v0, 2 ; RV64-NEXT: vsetivli zero, 12, e64, m8, ta, mu ; RV64-NEXT: vadd.vi v8, v8, -1, v0.t -; RV64-NEXT: vsetivli zero, 0, e64, m8, ta, mu -; RV64-NEXT: vmv1r.v v0, v25 -; RV64-NEXT: vadd.vi v16, v16, -1, v0.t ; RV64-NEXT: ret %elt.head = insertelement <32 x i64> undef, i64 -1, i32 0 %vb = shufflevector <32 x i64> %elt.head, <32 x i64> undef, <32 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/undef-vp-ops.ll b/llvm/test/CodeGen/RISCV/rvv/undef-vp-ops.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/undef-vp-ops.ll @@ -0,0 +1,633 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+d,+experimental-zfh,+experimental-v -target-abi=ilp32d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+d,+experimental-zfh,+experimental-v -target-abi=lp64d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s + +; Test that we can remove trivially-undef VP operations of various kinds. 
+ +declare <4 x i32> @llvm.vp.load.v4i32.p0v4i32(<4 x i32>*, <4 x i1>, i32) + +define <4 x i32> @vload_v4i32_zero_evl(<4 x i32>* %ptr, <4 x i1> %m) { +; CHECK-LABEL: vload_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %v = call <4 x i32> @llvm.vp.load.v4i32.p0v4i32(<4 x i32>* %ptr, <4 x i1> %m, i32 0) + ret <4 x i32> %v +} + +define <4 x i32> @vload_v4i32_false_mask(<4 x i32>* %ptr, i32 %evl) { +; CHECK-LABEL: vload_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %v = call <4 x i32> @llvm.vp.load.v4i32.p0v4i32(<4 x i32>* %ptr, <4 x i1> zeroinitializer, i32 %evl) + ret <4 x i32> %v +} + +declare <4 x i32> @llvm.vp.gather.v4i32.v4p0i32(<4 x i32*>, <4 x i1>, i32) + +define <4 x i32> @vgather_v4i32_v4i32_zero_evl(<4 x i32*> %ptrs, <4 x i1> %m) { +; CHECK-LABEL: vgather_v4i32_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %v = call <4 x i32> @llvm.vp.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, <4 x i1> %m, i32 0) + ret <4 x i32> %v +} + +define <4 x i32> @vgather_v4i32_v4i32_false_mask(<4 x i32*> %ptrs, i32 %evl) { +; CHECK-LABEL: vgather_v4i32_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %v = call <4 x i32> @llvm.vp.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, <4 x i1> zeroinitializer, i32 %evl) + ret <4 x i32> %v +} + +declare void @llvm.vp.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, <4 x i1>, i32) + +define void @vstore_v4i32_zero_evl(<4 x i32> %val, <4 x i32>* %ptr, <4 x i1> %m) { +; CHECK-LABEL: vstore_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + call void @llvm.vp.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %ptr, <4 x i1> %m, i32 0) + ret void +} + +define void @vstore_v4i32_false_mask(<4 x i32> %val, <4 x i32>* %ptr, i32 %evl) { +; CHECK-LABEL: vstore_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + call void @llvm.vp.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %ptr, <4 x i1> zeroinitializer, i32 %evl) + ret void +} + +declare void @llvm.vp.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, <4 x i1>, i32) + 
+define void @vscatter_v4i32_zero_evl(<4 x i32> %val, <4 x i32*> %ptrs, <4 x i1> %m) { +; CHECK-LABEL: vscatter_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + call void @llvm.vp.scatter.v4i32.v4p0i32(<4 x i32> %val, <4 x i32*> %ptrs, <4 x i1> %m, i32 0) + ret void +} + +define void @vscatter_v4i32_false_mask(<4 x i32> %val, <4 x i32*> %ptrs, i32 %evl) { +; CHECK-LABEL: vscatter_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + call void @llvm.vp.scatter.v4i32.v4p0i32(<4 x i32> %val, <4 x i32*> %ptrs, <4 x i1> zeroinitializer, i32 %evl) + ret void +} + +declare <4 x i32> @llvm.vp.add.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define <4 x i32> @vadd_v4i32_zero_evl(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m) { +; CHECK-LABEL: vadd_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 0) + ret <4 x i32> %s +} + +define <4 x i32> @vadd_v4i32_false_mask(<4 x i32> %va, <4 x i32> %vb, i32 %evl) { +; CHECK-LABEL: vadd_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> zeroinitializer, i32 %evl) + ret <4 x i32> %s +} + +declare <4 x i32> @llvm.vp.and.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define <4 x i32> @vand_v4i32_zero_evl(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m) { +; CHECK-LABEL: vand_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.and.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 0) + ret <4 x i32> %s +} + +define <4 x i32> @vand_v4i32_false_mask(<4 x i32> %va, <4 x i32> %vb, i32 %evl) { +; CHECK-LABEL: vand_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.and.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> zeroinitializer, i32 %evl) + ret <4 x i32> %s +} + +declare <4 x i32> @llvm.vp.lshr.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define <4 x i32> @vlshr_v4i32_zero_evl(<4 x i32> %va, <4 x 
i32> %vb, <4 x i1> %m) { +; CHECK-LABEL: vlshr_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.lshr.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 0) + ret <4 x i32> %s +} + +define <4 x i32> @vlshr_v4i32_false_mask(<4 x i32> %va, <4 x i32> %vb, i32 %evl) { +; CHECK-LABEL: vlshr_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.lshr.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> zeroinitializer, i32 %evl) + ret <4 x i32> %s +} + +declare <4 x i32> @llvm.vp.mul.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define <4 x i32> @vmul_v4i32_zero_evl(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m) { +; CHECK-LABEL: vmul_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.mul.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 0) + ret <4 x i32> %s +} + +define <4 x i32> @vmul_v4i32_false_mask(<4 x i32> %va, <4 x i32> %vb, i32 %evl) { +; CHECK-LABEL: vmul_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.mul.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> zeroinitializer, i32 %evl) + ret <4 x i32> %s +} + +declare <4 x i32> @llvm.vp.or.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define <4 x i32> @vor_v4i32_zero_evl(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m) { +; CHECK-LABEL: vor_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.or.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 0) + ret <4 x i32> %s +} + +define <4 x i32> @vor_v4i32_false_mask(<4 x i32> %va, <4 x i32> %vb, i32 %evl) { +; CHECK-LABEL: vor_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.or.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> zeroinitializer, i32 %evl) + ret <4 x i32> %s +} + +declare <4 x i32> @llvm.vp.sdiv.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define <4 x i32> @vsdiv_v4i32_zero_evl(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m) { +; CHECK-LABEL: vsdiv_v4i32_zero_evl: +; CHECK: 
# %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.sdiv.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 0) + ret <4 x i32> %s +} + +define <4 x i32> @vsdiv_v4i32_false_mask(<4 x i32> %va, <4 x i32> %vb, i32 %evl) { +; CHECK-LABEL: vsdiv_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.sdiv.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> zeroinitializer, i32 %evl) + ret <4 x i32> %s +} + +declare <4 x i32> @llvm.vp.srem.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define <4 x i32> @vsrem_v4i32_zero_evl(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m) { +; CHECK-LABEL: vsrem_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.srem.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 0) + ret <4 x i32> %s +} + +define <4 x i32> @vsrem_v4i32_false_mask(<4 x i32> %va, <4 x i32> %vb, i32 %evl) { +; CHECK-LABEL: vsrem_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.srem.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> zeroinitializer, i32 %evl) + ret <4 x i32> %s +} + +declare <4 x i32> @llvm.vp.sub.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define <4 x i32> @vsub_v4i32_zero_evl(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m) { +; CHECK-LABEL: vsub_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.sub.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 0) + ret <4 x i32> %s +} + +define <4 x i32> @vsub_v4i32_false_mask(<4 x i32> %va, <4 x i32> %vb, i32 %evl) { +; CHECK-LABEL: vsub_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.sub.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> zeroinitializer, i32 %evl) + ret <4 x i32> %s +} + +declare <4 x i32> @llvm.vp.udiv.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define <4 x i32> @vudiv_v4i32_zero_evl(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m) { +; CHECK-LABEL: vudiv_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> 
@llvm.vp.udiv.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 0) + ret <4 x i32> %s +} + +define <4 x i32> @vudiv_v4i32_false_mask(<4 x i32> %va, <4 x i32> %vb, i32 %evl) { +; CHECK-LABEL: vudiv_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.udiv.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> zeroinitializer, i32 %evl) + ret <4 x i32> %s +} + +declare <4 x i32> @llvm.vp.urem.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define <4 x i32> @vurem_v4i32_zero_evl(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m) { +; CHECK-LABEL: vurem_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.urem.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 0) + ret <4 x i32> %s +} + +define <4 x i32> @vurem_v4i32_false_mask(<4 x i32> %va, <4 x i32> %vb, i32 %evl) { +; CHECK-LABEL: vurem_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.urem.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> zeroinitializer, i32 %evl) + ret <4 x i32> %s +} + +declare <4 x i32> @llvm.vp.xor.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define <4 x i32> @vxor_v4i32_zero_evl(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m) { +; CHECK-LABEL: vxor_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.xor.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 0) + ret <4 x i32> %s +} + +define <4 x i32> @vxor_v4i32_false_mask(<4 x i32> %va, <4 x i32> %vb, i32 %evl) { +; CHECK-LABEL: vxor_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x i32> @llvm.vp.xor.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> zeroinitializer, i32 %evl) + ret <4 x i32> %s +} + +declare <4 x float> @llvm.vp.fadd.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32) + +define <4 x float> @vfadd_v4f32_zero_evl(<4 x float> %va, <4 x float> %vb, <4 x i1> %m) { +; CHECK-LABEL: vfadd_v4f32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x float> @llvm.vp.fadd.v4f32(<4 x float> %va, <4 x 
float> %vb, <4 x i1> %m, i32 0) + ret <4 x float> %s +} + +define <4 x float> @vfadd_v4f32_false_mask(<4 x float> %va, <4 x float> %vb, i32 %evl) { +; CHECK-LABEL: vfadd_v4f32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x float> @llvm.vp.fadd.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> zeroinitializer, i32 %evl) + ret <4 x float> %s +} + +declare <4 x float> @llvm.vp.fsub.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32) + +define <4 x float> @vfsub_v4f32_zero_evl(<4 x float> %va, <4 x float> %vb, <4 x i1> %m) { +; CHECK-LABEL: vfsub_v4f32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x float> @llvm.vp.fsub.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 0) + ret <4 x float> %s +} + +define <4 x float> @vfsub_v4f32_false_mask(<4 x float> %va, <4 x float> %vb, i32 %evl) { +; CHECK-LABEL: vfsub_v4f32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x float> @llvm.vp.fsub.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> zeroinitializer, i32 %evl) + ret <4 x float> %s +} + +declare <4 x float> @llvm.vp.fmul.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32) + +define <4 x float> @vfmul_v4f32_zero_evl(<4 x float> %va, <4 x float> %vb, <4 x i1> %m) { +; CHECK-LABEL: vfmul_v4f32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x float> @llvm.vp.fmul.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 0) + ret <4 x float> %s +} + +define <4 x float> @vfmul_v4f32_false_mask(<4 x float> %va, <4 x float> %vb, i32 %evl) { +; CHECK-LABEL: vfmul_v4f32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x float> @llvm.vp.fmul.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> zeroinitializer, i32 %evl) + ret <4 x float> %s +} + +declare <4 x float> @llvm.vp.fdiv.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32) + +define <4 x float> @vfdiv_v4f32_zero_evl(<4 x float> %va, <4 x float> %vb, <4 x i1> %m) { +; CHECK-LABEL: vfdiv_v4f32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x 
float> @llvm.vp.fdiv.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 0) + ret <4 x float> %s +} + +define <4 x float> @vfdiv_v4f32_false_mask(<4 x float> %va, <4 x float> %vb, i32 %evl) { +; CHECK-LABEL: vfdiv_v4f32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x float> @llvm.vp.fdiv.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> zeroinitializer, i32 %evl) + ret <4 x float> %s +} + +declare <4 x float> @llvm.vp.frem.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32) + +define <4 x float> @vfrem_v4f32_zero_evl(<4 x float> %va, <4 x float> %vb, <4 x i1> %m) { +; CHECK-LABEL: vfrem_v4f32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x float> @llvm.vp.frem.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 0) + ret <4 x float> %s +} + +define <4 x float> @vfrem_v4f32_false_mask(<4 x float> %va, <4 x float> %vb, i32 %evl) { +; CHECK-LABEL: vfrem_v4f32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call <4 x float> @llvm.vp.frem.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> zeroinitializer, i32 %evl) + ret <4 x float> %s +} + +declare i32 @llvm.vp.reduce.add.v4i32(i32, <4 x i32>, <4 x i1>, i32) + +define i32 @vreduce_add_v4i32_zero_evl(i32 %start, <4 x i32> %val, <4 x i1> %m) { +; CHECK-LABEL: vreduce_add_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call i32 @llvm.vp.reduce.add.v4i32(i32 %start, <4 x i32> %val, <4 x i1> %m, i32 0) + ret i32 %s +} + +define i32 @vreduce_add_v4i32_false_mask(i32 %start, <4 x i32> %val, i32 %evl) { +; CHECK-LABEL: vreduce_add_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call i32 @llvm.vp.reduce.add.v4i32(i32 %start, <4 x i32> %val, <4 x i1> zeroinitializer, i32 %evl) + ret i32 %s +} + +declare i32 @llvm.vp.reduce.mul.v4i32(i32, <4 x i32>, <4 x i1>, i32) + +define i32 @vreduce_mul_v4i32_zero_evl(i32 %start, <4 x i32> %val, <4 x i1> %m) { +; CHECK-LABEL: vreduce_mul_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call i32 
@llvm.vp.reduce.mul.v4i32(i32 %start, <4 x i32> %val, <4 x i1> %m, i32 0) + ret i32 %s +} + +define i32 @vreduce_mul_v4i32_false_mask(i32 %start, <4 x i32> %val, i32 %evl) { +; CHECK-LABEL: vreduce_mul_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call i32 @llvm.vp.reduce.mul.v4i32(i32 %start, <4 x i32> %val, <4 x i1> zeroinitializer, i32 %evl) + ret i32 %s +} + +declare i32 @llvm.vp.reduce.and.v4i32(i32, <4 x i32>, <4 x i1>, i32) + +define i32 @vreduce_and_v4i32_zero_evl(i32 %start, <4 x i32> %val, <4 x i1> %m) { +; CHECK-LABEL: vreduce_and_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call i32 @llvm.vp.reduce.and.v4i32(i32 %start, <4 x i32> %val, <4 x i1> %m, i32 0) + ret i32 %s +} + +define i32 @vreduce_and_v4i32_false_mask(i32 %start, <4 x i32> %val, i32 %evl) { +; CHECK-LABEL: vreduce_and_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call i32 @llvm.vp.reduce.and.v4i32(i32 %start, <4 x i32> %val, <4 x i1> zeroinitializer, i32 %evl) + ret i32 %s +} + +declare i32 @llvm.vp.reduce.or.v4i32(i32, <4 x i32>, <4 x i1>, i32) + +define i32 @vreduce_or_v4i32_zero_evl(i32 %start, <4 x i32> %val, <4 x i1> %m) { +; CHECK-LABEL: vreduce_or_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call i32 @llvm.vp.reduce.or.v4i32(i32 %start, <4 x i32> %val, <4 x i1> %m, i32 0) + ret i32 %s +} + +define i32 @vreduce_or_v4i32_false_mask(i32 %start, <4 x i32> %val, i32 %evl) { +; CHECK-LABEL: vreduce_or_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call i32 @llvm.vp.reduce.or.v4i32(i32 %start, <4 x i32> %val, <4 x i1> zeroinitializer, i32 %evl) + ret i32 %s +} + +declare i32 @llvm.vp.reduce.xor.v4i32(i32, <4 x i32>, <4 x i1>, i32) + +define i32 @vreduce_xor_v4i32_zero_evl(i32 %start, <4 x i32> %val, <4 x i1> %m) { +; CHECK-LABEL: vreduce_xor_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call i32 @llvm.vp.reduce.xor.v4i32(i32 %start, <4 x i32> %val, <4 x i1> %m, i32 0) + ret i32 %s +} + +define 
i32 @vreduce_xor_v4i32_false_mask(i32 %start, <4 x i32> %val, i32 %evl) { +; CHECK-LABEL: vreduce_xor_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call i32 @llvm.vp.reduce.xor.v4i32(i32 %start, <4 x i32> %val, <4 x i1> zeroinitializer, i32 %evl) + ret i32 %s +} + +declare i32 @llvm.vp.reduce.smax.v4i32(i32, <4 x i32>, <4 x i1>, i32) + +define i32 @vreduce_smax_v4i32_zero_evl(i32 %start, <4 x i32> %val, <4 x i1> %m) { +; CHECK-LABEL: vreduce_smax_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call i32 @llvm.vp.reduce.smax.v4i32(i32 %start, <4 x i32> %val, <4 x i1> %m, i32 0) + ret i32 %s +} + +define i32 @vreduce_smax_v4i32_false_mask(i32 %start, <4 x i32> %val, i32 %evl) { +; CHECK-LABEL: vreduce_smax_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call i32 @llvm.vp.reduce.smax.v4i32(i32 %start, <4 x i32> %val, <4 x i1> zeroinitializer, i32 %evl) + ret i32 %s +} + +declare i32 @llvm.vp.reduce.smin.v4i32(i32, <4 x i32>, <4 x i1>, i32) + +define i32 @vreduce_smin_v4i32_zero_evl(i32 %start, <4 x i32> %val, <4 x i1> %m) { +; CHECK-LABEL: vreduce_smin_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call i32 @llvm.vp.reduce.smin.v4i32(i32 %start, <4 x i32> %val, <4 x i1> %m, i32 0) + ret i32 %s +} + +define i32 @vreduce_smin_v4i32_false_mask(i32 %start, <4 x i32> %val, i32 %evl) { +; CHECK-LABEL: vreduce_smin_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call i32 @llvm.vp.reduce.smin.v4i32(i32 %start, <4 x i32> %val, <4 x i1> zeroinitializer, i32 %evl) + ret i32 %s +} + +declare i32 @llvm.vp.reduce.umax.v4i32(i32, <4 x i32>, <4 x i1>, i32) + +define i32 @vreduce_umax_v4i32_zero_evl(i32 %start, <4 x i32> %val, <4 x i1> %m) { +; CHECK-LABEL: vreduce_umax_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call i32 @llvm.vp.reduce.umax.v4i32(i32 %start, <4 x i32> %val, <4 x i1> %m, i32 0) + ret i32 %s +} + +define i32 @vreduce_umax_v4i32_false_mask(i32 %start, <4 x i32> %val, i32 %evl) { 
+; CHECK-LABEL: vreduce_umax_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call i32 @llvm.vp.reduce.umax.v4i32(i32 %start, <4 x i32> %val, <4 x i1> zeroinitializer, i32 %evl) + ret i32 %s +} + +declare i32 @llvm.vp.reduce.umin.v4i32(i32, <4 x i32>, <4 x i1>, i32) + +define i32 @vreduce_umin_v4i32_zero_evl(i32 %start, <4 x i32> %val, <4 x i1> %m) { +; CHECK-LABEL: vreduce_umin_v4i32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call i32 @llvm.vp.reduce.umin.v4i32(i32 %start, <4 x i32> %val, <4 x i1> %m, i32 0) + ret i32 %s +} + +define i32 @vreduce_umin_v4i32_false_mask(i32 %start, <4 x i32> %val, i32 %evl) { +; CHECK-LABEL: vreduce_umin_v4i32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call i32 @llvm.vp.reduce.umin.v4i32(i32 %start, <4 x i32> %val, <4 x i1> zeroinitializer, i32 %evl) + ret i32 %s +} + +declare float @llvm.vp.reduce.fadd.v4f32(float, <4 x float>, <4 x i1>, i32) + +define float @vreduce_seq_fadd_v4f32_zero_evl(float %start, <4 x float> %val, <4 x i1> %m) { +; CHECK-LABEL: vreduce_seq_fadd_v4f32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call float @llvm.vp.reduce.fadd.v4f32(float %start, <4 x float> %val, <4 x i1> %m, i32 0) + ret float %s +} + +define float @vreduce_seq_fadd_v4f32_false_mask(float %start, <4 x float> %val, i32 %evl) { +; CHECK-LABEL: vreduce_seq_fadd_v4f32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call float @llvm.vp.reduce.fadd.v4f32(float %start, <4 x float> %val, <4 x i1> zeroinitializer, i32 %evl) + ret float %s +} + +define float @vreduce_fadd_v4f32_zero_evl(float %start, <4 x float> %val, <4 x i1> %m) { +; CHECK-LABEL: vreduce_fadd_v4f32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call reassoc float @llvm.vp.reduce.fadd.v4f32(float %start, <4 x float> %val, <4 x i1> %m, i32 0) + ret float %s +} + +define float @vreduce_fadd_v4f32_false_mask(float %start, <4 x float> %val, i32 %evl) { +; CHECK-LABEL: vreduce_fadd_v4f32_false_mask: +; CHECK: # %bb.0: +; 
CHECK-NEXT: ret + %s = call reassoc float @llvm.vp.reduce.fadd.v4f32(float %start, <4 x float> %val, <4 x i1> zeroinitializer, i32 %evl) + ret float %s +} + +declare float @llvm.vp.reduce.fmul.v4f32(float, <4 x float>, <4 x i1>, i32) + +define float @vreduce_seq_fmul_v4f32_zero_evl(float %start, <4 x float> %val, <4 x i1> %m) { +; CHECK-LABEL: vreduce_seq_fmul_v4f32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call float @llvm.vp.reduce.fmul.v4f32(float %start, <4 x float> %val, <4 x i1> %m, i32 0) + ret float %s +} + +define float @vreduce_seq_fmul_v4f32_false_mask(float %start, <4 x float> %val, i32 %evl) { +; CHECK-LABEL: vreduce_seq_fmul_v4f32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call float @llvm.vp.reduce.fmul.v4f32(float %start, <4 x float> %val, <4 x i1> zeroinitializer, i32 %evl) + ret float %s +} + +define float @vreduce_fmul_v4f32_zero_evl(float %start, <4 x float> %val, <4 x i1> %m) { +; CHECK-LABEL: vreduce_fmul_v4f32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call reassoc float @llvm.vp.reduce.fmul.v4f32(float %start, <4 x float> %val, <4 x i1> %m, i32 0) + ret float %s +} + +define float @vreduce_fmul_v4f32_false_mask(float %start, <4 x float> %val, i32 %evl) { +; CHECK-LABEL: vreduce_fmul_v4f32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call reassoc float @llvm.vp.reduce.fmul.v4f32(float %start, <4 x float> %val, <4 x i1> zeroinitializer, i32 %evl) + ret float %s +} + +declare float @llvm.vp.reduce.fmin.v4f32(float, <4 x float>, <4 x i1>, i32) + +define float @vreduce_fmin_v4f32_zero_evl(float %start, <4 x float> %val, <4 x i1> %m) { +; CHECK-LABEL: vreduce_fmin_v4f32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call float @llvm.vp.reduce.fmin.v4f32(float %start, <4 x float> %val, <4 x i1> %m, i32 0) + ret float %s +} + +define float @vreduce_fmin_v4f32_false_mask(float %start, <4 x float> %val, i32 %evl) { +; CHECK-LABEL: vreduce_fmin_v4f32_false_mask: +; CHECK: # %bb.0: +; 
CHECK-NEXT: ret + %s = call float @llvm.vp.reduce.fmin.v4f32(float %start, <4 x float> %val, <4 x i1> zeroinitializer, i32 %evl) + ret float %s +} + +declare float @llvm.vp.reduce.fmax.v4f32(float, <4 x float>, <4 x i1>, i32) + +define float @vreduce_fmax_v4f32_zero_evl(float %start, <4 x float> %val, <4 x i1> %m) { +; CHECK-LABEL: vreduce_fmax_v4f32_zero_evl: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call float @llvm.vp.reduce.fmax.v4f32(float %start, <4 x float> %val, <4 x i1> %m, i32 0) + ret float %s +} + +define float @vreduce_fmax_v4f32_false_mask(float %start, <4 x float> %val, i32 %evl) { +; CHECK-LABEL: vreduce_fmax_v4f32_false_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %s = call float @llvm.vp.reduce.fmax.v4f32(float %start, <4 x float> %val, <4 x i1> zeroinitializer, i32 %evl) + ret float %s +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll @@ -1624,22 +1624,35 @@ } ; FIXME: The first vadd.vi should be able to infer that its AVL is equivalent to VLMAX. -; FIXME: The upper half of the operation is doing nothing. +; FIXME: The upper half of the operation is doing nothing but we don't catch +; that on RV64; we issue a usubsat(and (vscale x 16), 0xffffffff, vscale x 16) +; (the "original" %evl is the "and", due to known-bits issues with legalizing +; the i32 %evl to i64) and this isn't detected as 0. +; This could be resolved in the future with more detailed KnownBits analysis +; for ISD::VSCALE. 
define @vadd_vi_nxv32i32_evl_nx16( %va, %m) { -; CHECK-LABEL: vadd_vi_nxv32i32_evl_nx16: -; CHECK: # %bb.0: -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a1, a0, 2 -; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, mu -; CHECK-NEXT: vslidedown.vx v25, v0, a1 -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, mu -; CHECK-NEXT: vadd.vi v8, v8, -1, v0.t -; CHECK-NEXT: vsetivli zero, 0, e32, m8, ta, mu -; CHECK-NEXT: vmv1r.v v0, v25 -; CHECK-NEXT: vadd.vi v16, v16, -1, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vadd_vi_nxv32i32_evl_nx16: +; RV32: # %bb.0: +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 1 +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, mu +; RV32-NEXT: vadd.vi v8, v8, -1, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vadd_vi_nxv32i32_evl_nx16: +; RV64: # %bb.0: +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: srli a1, a0, 2 +; RV64-NEXT: vsetvli a2, zero, e8, mf2, ta, mu +; RV64-NEXT: vslidedown.vx v25, v0, a1 +; RV64-NEXT: slli a0, a0, 1 +; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, mu +; RV64-NEXT: vadd.vi v8, v8, -1, v0.t +; RV64-NEXT: vsetivli zero, 0, e32, m8, ta, mu +; RV64-NEXT: vmv1r.v v0, v25 +; RV64-NEXT: vadd.vi v16, v16, -1, v0.t +; RV64-NEXT: ret %elt.head = insertelement undef, i32 -1, i32 0 %vb = shufflevector %elt.head, undef, zeroinitializer %evl = call i32 @llvm.vscale.i32()