diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1260,6 +1260,30 @@ switch (II->getIntrinsicID()) { case Intrinsic::fma: return Operand == 0 || Operand == 1; + // FIXME: Our patterns can only match vx/vf instructions when the splat + // it on the RHS, because TableGen doesn't recognize our VP operations + // as commutative. + case Intrinsic::vp_add: + case Intrinsic::vp_mul: + case Intrinsic::vp_and: + case Intrinsic::vp_or: + case Intrinsic::vp_xor: + case Intrinsic::vp_fadd: + case Intrinsic::vp_fsub: + case Intrinsic::vp_fmul: + case Intrinsic::vp_fdiv: + case Intrinsic::vp_shl: + case Intrinsic::vp_lshr: + case Intrinsic::vp_ashr: + case Intrinsic::vp_udiv: + case Intrinsic::vp_sdiv: + case Intrinsic::vp_urem: + case Intrinsic::vp_srem: + return Operand == 1; + // ... the one exception is vp.sub which has explicit patterns for both + // LHS and RHS (as vrsub). + case Intrinsic::vp_sub: + return Operand == 0 || Operand == 1; default: return false; } diff --git a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll --- a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll +++ b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll @@ -2961,3 +2961,774 @@ %cmp.not = icmp eq i64 %indvars.iv.next, 1024 br i1 %cmp.not, label %for.cond.cleanup, label %for.body } + +declare <4 x i32> @llvm.vp.mul.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define void @sink_splat_vp_mul(i32* nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_mul: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: .LBB46_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, mu +; CHECK-NEXT: vmul.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a3, a3, -4 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bnez a3, .LBB46_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, i32* %a, i64 %index + %1 = bitcast i32* %0 to <4 x i32>* + %wide.load = load <4 x i32>, <4 x i32>* %1, align 4 + %2 = call <4 x i32> @llvm.vp.mul.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + %3 = bitcast i32* %0 to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + %index.next = add nuw i64 %index, 4 + %4 = icmp eq i64 %index.next, 1024 + br i1 %4, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +declare <4 x i32> @llvm.vp.add.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define void @sink_splat_vp_add(i32* nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_add: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: .LBB47_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, mu +; CHECK-NEXT: vadd.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a3, a3, -4 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bnez a3, .LBB47_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, i32* %a, i64 %index + %1 = bitcast i32* %0 to <4 x i32>* + %wide.load = load <4 x i32>, <4 x i32>* %1, align 4 + %2 = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + %3 = bitcast i32* %0 to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + %index.next = add nuw i64 %index, 4 + %4 = icmp eq i64 %index.next, 1024 + br i1 %4, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +; FIXME: This doesn't match against vadd.vx because our patterns aren't +; commutative. + +define void @sink_splat_vp_add_commute(i32* nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_add_commute: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: li a1, 1024 +; CHECK-NEXT: .LBB48_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, mu +; CHECK-NEXT: vadd.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vse32.v v9, (a0) +; CHECK-NEXT: addi a1, a1, -4 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bnez a1, .LBB48_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, i32* %a, i64 %index + %1 = bitcast i32* %0 to <4 x i32>* + %wide.load = load <4 x i32>, <4 x i32>* %1, align 4 + %2 = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load, <4 x i1> %m, i32 %vl) + %3 = bitcast i32* %0 to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + %index.next = add nuw i64 %index, 4 + %4 = icmp eq i64 %index.next, 1024 + br i1 %4, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +declare <4 x i32> @llvm.vp.sub.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define void @sink_splat_vp_sub(i32* nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_sub: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: .LBB49_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, mu +; CHECK-NEXT: vsub.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a3, a3, -4 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bnez a3, .LBB49_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, i32* %a, i64 %index + %1 = bitcast i32* %0 to <4 x i32>* + %wide.load = load <4 x i32>, <4 x i32>* %1, align 4 + %2 = call <4 x i32> @llvm.vp.sub.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + %3 = bitcast i32* %0 to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + %index.next = add nuw i64 %index, 4 + %4 = icmp eq i64 %index.next, 1024 + br i1 %4, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @sink_splat_vp_rsub(i32* nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_rsub: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: .LBB50_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, mu +; CHECK-NEXT: vrsub.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a3, a3, -4 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bnez a3, .LBB50_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, i32* %a, i64 %index + %1 = bitcast i32* %0 to <4 x i32>* + %wide.load = load <4 x i32>, <4 x i32>* %1, align 4 + %2 = call <4 x i32> @llvm.vp.sub.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load, <4 x i1> %m, i32 %vl) + %3 = bitcast i32* %0 to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + %index.next = add nuw i64 %index, 4 + %4 = icmp eq i64 %index.next, 1024 + br i1 %4, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +declare <4 x i32> @llvm.vp.shl.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define void @sink_splat_vp_shl(i32* nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_shl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: .LBB51_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, mu +; CHECK-NEXT: vsll.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a3, a3, -4 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bnez a3, .LBB51_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, i32* %a, i64 %index + %1 = bitcast i32* %0 to <4 x i32>* + %wide.load = load <4 x i32>, <4 x i32>* %1, align 4 + %2 = call <4 x i32> @llvm.vp.shl.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + %3 = bitcast i32* %0 to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + %index.next = add nuw i64 %index, 4 + %4 = icmp eq i64 %index.next, 1024 + br i1 %4, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +declare <4 x i32> @llvm.vp.lshr.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define void @sink_splat_vp_lshr(i32* nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_lshr: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: .LBB52_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, mu +; CHECK-NEXT: vsrl.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a3, a3, -4 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bnez a3, .LBB52_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, i32* %a, i64 %index + %1 = bitcast i32* %0 to <4 x i32>* + %wide.load = load <4 x i32>, <4 x i32>* %1, align 4 + %2 = call <4 x i32> @llvm.vp.lshr.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + %3 = bitcast i32* %0 to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + %index.next = add nuw i64 %index, 4 + %4 = icmp eq i64 %index.next, 1024 + br i1 %4, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +declare <4 x i32> @llvm.vp.ashr.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define void @sink_splat_vp_ashr(i32* nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_ashr: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: .LBB53_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, mu +; CHECK-NEXT: vsra.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a3, a3, -4 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bnez a3, .LBB53_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, i32* %a, i64 %index + %1 = bitcast i32* %0 to <4 x i32>* + %wide.load = load <4 x i32>, <4 x i32>* %1, align 4 + %2 = call <4 x i32> @llvm.vp.ashr.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + %3 = bitcast i32* %0 to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + %index.next = add nuw i64 %index, 4 + %4 = icmp eq i64 %index.next, 1024 + br i1 %4, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +declare <4 x float> @llvm.vp.fmul.v4i32(<4 x float>, <4 x float>, <4 x i1>, i32) + +define void @sink_splat_vp_fmul(float* nocapture %a, float %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_fmul: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fmv.w.x ft0, a1 +; CHECK-NEXT: li a1, 1024 +; CHECK-NEXT: .LBB54_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, mu +; CHECK-NEXT: vfmul.vf v8, v8, ft0, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a1, a1, -4 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bnez a1, .LBB54_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 + %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds float, float* %a, i64 %index + %1 = bitcast float* %0 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %1, align 4 + %2 = call <4 x float> @llvm.vp.fmul.v4i32(<4 x float> %wide.load, <4 x float> %broadcast.splat, <4 x i1> %m, i32 %vl) + %3 = bitcast float* %0 to <4 x float>* + store <4 x float> %2, <4 x float>* %3, align 4 + %index.next = add nuw i64 %index, 4 + %4 = icmp eq i64 %index.next, 1024 + br i1 %4, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +declare <4 x float> @llvm.vp.fdiv.v4i32(<4 x float>, <4 x float>, <4 x i1>, i32) + +define void @sink_splat_vp_fdiv(float* nocapture %a, float %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_fdiv: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fmv.w.x ft0, a1 +; CHECK-NEXT: li a1, 1024 +; CHECK-NEXT: .LBB55_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, mu +; CHECK-NEXT: vfdiv.vf v8, v8, ft0, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a1, a1, -4 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bnez a1, .LBB55_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 + %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds float, float* %a, i64 %index + %1 = bitcast float* %0 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %1, align 4 + %2 = call <4 x float> @llvm.vp.fdiv.v4i32(<4 x float> %wide.load, <4 x float> %broadcast.splat, <4 x i1> %m, i32 %vl) + %3 = bitcast float* %0 to <4 x float>* + store <4 x float> %2, <4 x float>* %3, align 4 + %index.next = add nuw i64 %index, 4 + %4 = icmp eq i64 %index.next, 1024 + br i1 %4, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +; FIXME: vfrdiv.vf doesn't match against masked instructions + +define void @sink_splat_vp_frdiv(float* nocapture %a, float %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_frdiv: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fmv.w.x ft0, a1 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vfmv.v.f v8, ft0 +; CHECK-NEXT: li a1, 1024 +; CHECK-NEXT: .LBB56_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, mu +; CHECK-NEXT: vfdiv.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vse32.v v9, (a0) +; CHECK-NEXT: addi a1, a1, -4 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bnez a1, .LBB56_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 + %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds float, float* %a, i64 %index + %1 = bitcast float* %0 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %1, align 4 + %2 = call <4 x float> @llvm.vp.fdiv.v4i32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x i1> %m, i32 %vl) + %3 = bitcast float* %0 to <4 x float>* + store <4 x float> %2, <4 x float>* %3, align 4 + %index.next = add nuw i64 %index, 4 + %4 = icmp eq i64 %index.next, 1024 + br i1 %4, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +declare <4 x float> @llvm.vp.fadd.v4i32(<4 x float>, <4 x float>, <4 x i1>, i32) + +define void @sink_splat_vp_fadd(float* nocapture %a, float %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_fadd: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fmv.w.x ft0, a1 +; CHECK-NEXT: li a1, 1024 +; CHECK-NEXT: .LBB57_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, mu +; CHECK-NEXT: vfadd.vf v8, v8, ft0, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a1, a1, -4 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bnez a1, .LBB57_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 + %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds float, float* %a, i64 %index + %1 = bitcast float* %0 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %1, align 4 + %2 = call <4 x float> @llvm.vp.fadd.v4i32(<4 x float> %wide.load, <4 x float> %broadcast.splat, <4 x i1> %m, i32 %vl) + %3 = bitcast float* %0 to <4 x float>* + store <4 x float> %2, <4 x float>* %3, align 4 + %index.next = add nuw i64 %index, 4 + %4 = icmp eq i64 %index.next, 1024 + br i1 %4, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +declare <4 x float> @llvm.vp.fsub.v4i32(<4 x float>, <4 x float>, <4 x i1>, i32) + +define void @sink_splat_vp_fsub(float* nocapture %a, float %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_fsub: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fmv.w.x ft0, a1 +; CHECK-NEXT: li a1, 1024 +; CHECK-NEXT: .LBB58_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, mu +; CHECK-NEXT: vfsub.vf v8, v8, ft0, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a1, a1, -4 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bnez a1, .LBB58_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 + %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds float, float* %a, i64 %index + %1 = bitcast float* %0 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %1, align 4 + %2 = call <4 x float> @llvm.vp.fsub.v4i32(<4 x float> %wide.load, <4 x float> %broadcast.splat, <4 x i1> %m, i32 %vl) + %3 = bitcast float* %0 to <4 x float>* + store <4 x float> %2, <4 x float>* %3, align 4 + %index.next = add nuw i64 %index, 4 + %4 = icmp eq i64 %index.next, 1024 + br i1 %4, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +declare <4 x float> @llvm.vp.frsub.v4i32(<4 x float>, <4 x float>, <4 x i1>, i32) + +; FIXME: vfrsub.vf doesn't match against masked instructions + +define void @sink_splat_vp_frsub(float* nocapture %a, float %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_frsub: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fmv.w.x ft0, a1 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vfmv.v.f v8, ft0 +; CHECK-NEXT: li a1, 1024 +; CHECK-NEXT: .LBB59_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, mu +; CHECK-NEXT: vfsub.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vse32.v v9, (a0) +; CHECK-NEXT: addi a1, a1, -4 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bnez a1, .LBB59_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0 + %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds float, float* %a, i64 %index + %1 = bitcast float* %0 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %1, align 4 + %2 = call <4 x float> @llvm.vp.fsub.v4i32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x i1> %m, i32 %vl) + %3 = bitcast float* %0 to <4 x float>* + store <4 x float> %2, <4 x float>* %3, align 4 + %index.next = add nuw i64 %index, 4 + %4 = icmp eq i64 %index.next, 1024 + br i1 %4, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +declare <4 x i32> @llvm.vp.udiv.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define void @sink_splat_vp_udiv(i32* nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_udiv: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: .LBB60_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, mu +; CHECK-NEXT: vdivu.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a3, a3, -4 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bnez a3, .LBB60_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, i32* %a, i64 %index + %1 = bitcast i32* %0 to <4 x i32>* + %wide.load = load <4 x i32>, <4 x i32>* %1, align 4 + %2 = call <4 x i32> @llvm.vp.udiv.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + %3 = bitcast i32* %0 to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + %index.next = add nuw i64 %index, 4 + %4 = icmp eq i64 %index.next, 1024 + br i1 %4, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +declare <4 x i32> @llvm.vp.sdiv.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define void @sink_splat_vp_sdiv(i32* nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_sdiv: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: .LBB61_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, mu +; CHECK-NEXT: vdiv.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a3, a3, -4 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bnez a3, .LBB61_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, i32* %a, i64 %index + %1 = bitcast i32* %0 to <4 x i32>* + %wide.load = load <4 x i32>, <4 x i32>* %1, align 4 + %2 = call <4 x i32> @llvm.vp.sdiv.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + %3 = bitcast i32* %0 to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + %index.next = add nuw i64 %index, 4 + %4 = icmp eq i64 %index.next, 1024 + br i1 %4, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +declare <4 x i32> @llvm.vp.urem.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define void @sink_splat_vp_urem(i32* nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_urem: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: .LBB62_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, mu +; CHECK-NEXT: vremu.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a3, a3, -4 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bnez a3, .LBB62_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, i32* %a, i64 %index + %1 = bitcast i32* %0 to <4 x i32>* + %wide.load = load <4 x i32>, <4 x i32>* %1, align 4 + %2 = call <4 x i32> @llvm.vp.urem.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + %3 = bitcast i32* %0 to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + %index.next = add nuw i64 %index, 4 + %4 = icmp eq i64 %index.next, 1024 + br i1 %4, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +declare <4 x i32> @llvm.vp.srem.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define void @sink_splat_vp_srem(i32* nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_srem: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: .LBB63_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, mu +; CHECK-NEXT: vrem.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a3, a3, -4 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bnez a3, .LBB63_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, i32* %a, i64 %index + %1 = bitcast i32* %0 to <4 x i32>* + %wide.load = load <4 x i32>, <4 x i32>* %1, align 4 + %2 = call <4 x i32> @llvm.vp.srem.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl) + %3 = bitcast i32* %0 to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + %index.next = add nuw i64 %index, 4 + %4 = icmp eq i64 %index.next, 1024 + br i1 %4, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +; Check that we don't sink a splat operand that has no chance of being folded. + +define void @sink_splat_vp_srem_commute(i32* nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) { +; CHECK-LABEL: sink_splat_vp_srem_commute: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: li a1, 1024 +; CHECK-NEXT: .LBB64_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, mu +; CHECK-NEXT: vrem.vv v9, v8, v9, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vse32.v v9, (a0) +; CHECK-NEXT: addi a1, a1, -4 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bnez a1, .LBB64_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, i32* %a, i64 %index + %1 = bitcast i32* %0 to <4 x i32>* + %wide.load = load <4 x i32>, <4 x i32>* %1, align 4 + %2 = call <4 x i32> @llvm.vp.srem.v4i32(<4 x i32> %broadcast.splat, <4 x i32> %wide.load, <4 x i1> %m, i32 %vl) + %3 = bitcast i32* %0 to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + %index.next = add nuw i64 %index, 4 + %4 = icmp eq i64 %index.next, 1024 + br i1 %4, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +}