diff --git a/llvm/lib/Target/VE/VVPInstrPatternsVec.td b/llvm/lib/Target/VE/VVPInstrPatternsVec.td
--- a/llvm/lib/Target/VE/VVPInstrPatternsVec.td
+++ b/llvm/lib/Target/VE/VVPInstrPatternsVec.td
@@ -20,8 +20,22 @@ multiclass Binary_rv<SDPatternOperator OpNode, ValueType ScalarVT, ValueType DataVT, ValueType MaskVT, string OpBaseName> {
-  // Masked with select, broadcast.
-  // TODO
+  // Masked with passthru, broadcast.
+  def : Pat<(vvp_select
+                (OpNode
+                    (any_broadcast ScalarVT:$sx),
+                    DataVT:$vy,
+                    (MaskVT srcvalue),
+                    (i32 srcvalue)),
+                DataVT:$vfalse,
+                MaskVT:$mask,
+                i32:$pivot),
+            (!cast<Instruction>(OpBaseName#"rvml_v")
+                ScalarVT:$sx,
+                $vy,
+                $mask,
+                $pivot,
+                $vfalse)>;
 
   // Unmasked, broadcast.
   def : Pat<(OpNode
@@ -42,8 +56,22 @@ multiclass Binary_vr<SDPatternOperator OpNode, ValueType ScalarVT, ValueType DataVT, ValueType MaskVT, string OpBaseName> {
-  // Masked with select, broadcast.
-  // TODO
+  // Masked with passthru, broadcast.
+  def : Pat<(vvp_select
+                (OpNode
+                    DataVT:$vx,
+                    (any_broadcast ScalarVT:$sy),
+                    (MaskVT srcvalue),
+                    (i32 srcvalue)),
+                DataVT:$vfalse,
+                MaskVT:$mask,
+                i32:$pivot),
+            (!cast<Instruction>(OpBaseName#"vrml_v")
+                $vx,
+                ScalarVT:$sy,
+                $mask,
+                $pivot,
+                $vfalse)>;
 
   // Unmasked, broadcast.
   def : Pat<(OpNode
@@ -64,6 +92,23 @@ multiclass Binary_vv<SDPatternOperator OpNode, ValueType DataVT, ValueType MaskVT, string OpBaseName> {
+  // Masked with passthru.
+  def : Pat<(vvp_select
+                (OpNode
+                    DataVT:$vx,
+                    DataVT:$vy,
+                    (MaskVT srcvalue),
+                    (i32 srcvalue)),
+                DataVT:$vfalse,
+                MaskVT:$mask,
+                i32:$pivot),
+            (!cast<Instruction>(OpBaseName#"vvml_v")
+                $vx,
+                $vy,
+                $mask,
+                $pivot,
+                $vfalse)>;
+
   // Masked with select.
   // TODO
diff --git a/llvm/test/CodeGen/VE/Vector/vp_fadd_merge.ll b/llvm/test/CodeGen/VE/Vector/vp_fadd_merge.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vp_fadd_merge.ll
@@ -0,0 +1,89 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=ve -mattr=+vpu | FileCheck %s
+
+declare <256 x float> @llvm.vp.merge.v256f32(<256 x i1>, <256 x float>, <256 x float>, i32)
+declare <256 x float> @llvm.vp.fadd.v256f32(<256 x float>, <256 x float>, <256 x i1>, i32)
+
+define fastcc <256 x float> @test_vp_fadd_v256f32_vv_merge(<256 x float> %passthru, <256 x float> %i0, <256 x float> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fadd_v256f32_vv_merge:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    pvfadd.up %v0, %v1, %v2, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %vr = call <256 x float> @llvm.vp.fadd.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x i1> %m, i32 %n)
+  %r0 = call <256 x float> @llvm.vp.merge.v256f32(<256 x i1> %m, <256 x float> %vr, <256 x float> %passthru, i32 %n)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vp_fadd_v256f32_rv_merge(<256 x float> %passthru, float %s0, <256 x float> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fadd_v256f32_rv_merge:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    pvfadd.up %v0, %s0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x float> undef, float %s0, i32 0
+  %i0 = shufflevector <256 x float> %xins, <256 x float> undef, <256 x i32> zeroinitializer
+  %vr = call <256 x float> @llvm.vp.fadd.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x i1> %m, i32 %n)
+  %r0 = call <256 x float> @llvm.vp.merge.v256f32(<256 x i1> %m, <256 x float> %vr, <256 x float> %passthru, i32 %n)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vp_fadd_v256f32_vr_merge(<256 x float> %passthru, <256 x float> %i0, float %s1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fadd_v256f32_vr_merge:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    pvfadd.up %v0, %s0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <256 x float> undef, float %s1, i32 0
+  %i1 = shufflevector <256 x float> %yins, <256 x float> undef, <256 x i32> zeroinitializer
+  %vr = call <256 x float> @llvm.vp.fadd.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x i1> %m, i32 %n)
+  %r0 = call <256 x float> @llvm.vp.merge.v256f32(<256 x i1> %m, <256 x float> %vr, <256 x float> %passthru, i32 %n)
+  ret <256 x float> %r0
+}
+
+
+declare <256 x double> @llvm.vp.merge.v256f64(<256 x i1>, <256 x double>, <256 x double>, i32)
+declare <256 x double> @llvm.vp.fadd.v256f64(<256 x double>, <256 x double>, <256 x i1>, i32)
+
+define fastcc <256 x double> @test_vp_fadd_v256f64_vv_merge(<256 x double> %passthru, <256 x double> %i0, <256 x double> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fadd_v256f64_vv_merge:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vfadd.d %v0, %v1, %v2, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %vr = call <256 x double> @llvm.vp.fadd.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x i1> %m, i32 %n)
+  %r0 = call <256 x double> @llvm.vp.merge.v256f64(<256 x i1> %m, <256 x double> %vr, <256 x double> %passthru, i32 %n)
+  ret <256 x double> %r0
+}
+
+define fastcc <256 x double> @test_vp_fadd_v256f64_rv_merge(<256 x double> %passthru, double %s0, <256 x double> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fadd_v256f64_rv_merge:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfadd.d %v0, %s0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x double> undef, double %s0, i32 0
+  %i0 = shufflevector <256 x double> %xins, <256 x double> undef, <256 x i32> zeroinitializer
+  %vr = call <256 x double> @llvm.vp.fadd.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x i1> %m, i32 %n)
+  %r0 = call <256 x double> @llvm.vp.merge.v256f64(<256 x i1> %m, <256 x double> %vr, <256 x double> %passthru, i32 %n)
+  ret <256 x double> %r0
+}
+
+define fastcc <256 x double> @test_vp_fadd_v256f64_vr_merge(<256 x double> %passthru, <256 x double> %i0, double %s1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fadd_v256f64_vr_merge:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfadd.d %v0, %s0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <256 x double> undef, double %s1, i32 0
+  %i1 = shufflevector <256 x double> %yins, <256 x double> undef, <256 x i32> zeroinitializer
+  %vr = call <256 x double> @llvm.vp.fadd.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x i1> %m, i32 %n)
+  %r0 = call <256 x double> @llvm.vp.merge.v256f64(<256 x i1> %m, <256 x double> %vr, <256 x double> %passthru, i32 %n)
+  ret <256 x double> %r0
+}
diff --git a/llvm/test/CodeGen/VE/Vector/vp_fdiv_merge.ll b/llvm/test/CodeGen/VE/Vector/vp_fdiv_merge.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vp_fdiv_merge.ll
@@ -0,0 +1,89 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=ve -mattr=+vpu | FileCheck %s
+
+declare <256 x float> @llvm.vp.merge.v256f32(<256 x i1>, <256 x float>, <256 x float>, i32)
+declare <256 x float> @llvm.vp.fdiv.v256f32(<256 x float>, <256 x float>, <256 x i1>, i32)
+
+define fastcc <256 x float> @test_vp_fdiv_v256f32_vv_merge(<256 x float> %passthru, <256 x float> %i0, <256 x float> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fdiv_v256f32_vv_merge:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vfdiv.s %v0, %v1, %v2, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %vr = call <256 x float> @llvm.vp.fdiv.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x i1> %m, i32 %n)
+  %r0 = call <256 x float> @llvm.vp.merge.v256f32(<256 x i1> %m, <256 x float> %vr, <256 x float> %passthru, i32 %n)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vp_fdiv_v256f32_rv_merge(<256 x float> %passthru, float %s0, <256 x float> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fdiv_v256f32_rv_merge:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfdiv.s %v0, %s0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x float> undef, float %s0, i32 0
+  %i0 = shufflevector <256 x float> %xins, <256 x float> undef, <256 x i32> zeroinitializer
+  %vr = call <256 x float> @llvm.vp.fdiv.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x i1> %m, i32 %n)
+  %r0 = call <256 x float> @llvm.vp.merge.v256f32(<256 x i1> %m, <256 x float> %vr, <256 x float> %passthru, i32 %n)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vp_fdiv_v256f32_vr_merge(<256 x float> %passthru, <256 x float> %i0, float %s1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fdiv_v256f32_vr_merge:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfdiv.s %v0, %v1, %s0, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <256 x float> undef, float %s1, i32 0
+  %i1 = shufflevector <256 x float> %yins, <256 x float> undef, <256 x i32> zeroinitializer
+  %vr = call <256 x float> @llvm.vp.fdiv.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x i1> %m, i32 %n)
+  %r0 = call <256 x float> @llvm.vp.merge.v256f32(<256 x i1> %m, <256 x float> %vr, <256 x float> %passthru, i32 %n)
+  ret <256 x float> %r0
+}
+
+
+declare <256 x double> @llvm.vp.merge.v256f64(<256 x i1>, <256 x double>, <256 x double>, i32)
+declare <256 x double> @llvm.vp.fdiv.v256f64(<256 x double>, <256 x double>, <256 x i1>, i32)
+
+define fastcc <256 x double> @test_vp_fdiv_v256f64_vv_merge(<256 x double> %passthru, <256 x double> %i0, <256 x double> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fdiv_v256f64_vv_merge:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vfdiv.d %v0, %v1, %v2, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %vr = call <256 x double> @llvm.vp.fdiv.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x i1> %m, i32 %n)
+  %r0 = call <256 x double> @llvm.vp.merge.v256f64(<256 x i1> %m, <256 x double> %vr, <256 x double> %passthru, i32 %n)
+  ret <256 x double> %r0
+}
+
+define fastcc <256 x double> @test_vp_fdiv_v256f64_rv_merge(<256 x double> %passthru, double %s0, <256 x double> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fdiv_v256f64_rv_merge:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfdiv.d %v0, %s0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x double> undef, double %s0, i32 0
+  %i0 = shufflevector <256 x double> %xins, <256 x double> undef, <256 x i32> zeroinitializer
+  %vr = call <256 x double> @llvm.vp.fdiv.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x i1> %m, i32 %n)
+  %r0 = call <256 x double> @llvm.vp.merge.v256f64(<256 x i1> %m, <256 x double> %vr, <256 x double> %passthru, i32 %n)
+  ret <256 x double> %r0
+}
+
+define fastcc <256 x double> @test_vp_fdiv_v256f64_vr_merge(<256 x double> %passthru, <256 x double> %i0, double %s1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fdiv_v256f64_vr_merge:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfdiv.d %v0, %v1, %s0, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <256 x double> undef, double %s1, i32 0
+  %i1 = shufflevector <256 x double> %yins, <256 x double> undef, <256 x i32> zeroinitializer
+  %vr = call <256 x double> @llvm.vp.fdiv.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x i1> %m, i32 %n)
+  %r0 = call <256 x double> @llvm.vp.merge.v256f64(<256 x i1> %m, <256 x double> %vr, <256 x double> %passthru, i32 %n)
+  ret <256 x double> %r0
+}
diff --git a/llvm/test/CodeGen/VE/Vector/vp_fmul_merge.ll b/llvm/test/CodeGen/VE/Vector/vp_fmul_merge.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vp_fmul_merge.ll
@@ -0,0 +1,89 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=ve -mattr=+vpu | FileCheck %s
+
+declare <256 x float> @llvm.vp.merge.v256f32(<256 x i1>, <256 x float>, <256 x float>, i32)
+declare <256 x float> @llvm.vp.fmul.v256f32(<256 x float>, <256 x float>, <256 x i1>, i32)
+
+define fastcc <256 x float> @test_vp_fmul_v256f32_vv_merge(<256 x float> %passthru, <256 x float> %i0, <256 x float> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fmul_v256f32_vv_merge:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    pvfmul.up %v0, %v1, %v2, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %vr = call <256 x float> @llvm.vp.fmul.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x i1> %m, i32 %n)
+  %r0 = call <256 x float> @llvm.vp.merge.v256f32(<256 x i1> %m, <256 x float> %vr, <256 x float> %passthru, i32 %n)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vp_fmul_v256f32_rv_merge(<256 x float> %passthru, float %s0, <256 x float> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fmul_v256f32_rv_merge:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    pvfmul.up %v0, %s0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x float> undef, float %s0, i32 0
+  %i0 = shufflevector <256 x float> %xins, <256 x float> undef, <256 x i32> zeroinitializer
+  %vr = call <256 x float> @llvm.vp.fmul.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x i1> %m, i32 %n)
+  %r0 = call <256 x float> @llvm.vp.merge.v256f32(<256 x i1> %m, <256 x float> %vr, <256 x float> %passthru, i32 %n)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vp_fmul_v256f32_vr_merge(<256 x float> %passthru, <256 x float> %i0, float %s1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fmul_v256f32_vr_merge:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    pvfmul.up %v0, %s0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <256 x float> undef, float %s1, i32 0
+  %i1 = shufflevector <256 x float> %yins, <256 x float> undef, <256 x i32> zeroinitializer
+  %vr = call <256 x float> @llvm.vp.fmul.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x i1> %m, i32 %n)
+  %r0 = call <256 x float> @llvm.vp.merge.v256f32(<256 x i1> %m, <256 x float> %vr, <256 x float> %passthru, i32 %n)
+  ret <256 x float> %r0
+}
+
+
+declare <256 x double> @llvm.vp.merge.v256f64(<256 x i1>, <256 x double>, <256 x double>, i32)
+declare <256 x double> @llvm.vp.fmul.v256f64(<256 x double>, <256 x double>, <256 x i1>, i32)
+
+define fastcc <256 x double> @test_vp_fmul_v256f64_vv_merge(<256 x double> %passthru, <256 x double> %i0, <256 x double> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fmul_v256f64_vv_merge:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vfmul.d %v0, %v1, %v2, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %vr = call <256 x double> @llvm.vp.fmul.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x i1> %m, i32 %n)
+  %r0 = call <256 x double> @llvm.vp.merge.v256f64(<256 x i1> %m, <256 x double> %vr, <256 x double> %passthru, i32 %n)
+  ret <256 x double> %r0
+}
+
+define fastcc <256 x double> @test_vp_fmul_v256f64_rv_merge(<256 x double> %passthru, double %s0, <256 x double> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fmul_v256f64_rv_merge:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmul.d %v0, %s0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x double> undef, double %s0, i32 0
+  %i0 = shufflevector <256 x double> %xins, <256 x double> undef, <256 x i32> zeroinitializer
+  %vr = call <256 x double> @llvm.vp.fmul.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x i1> %m, i32 %n)
+  %r0 = call <256 x double> @llvm.vp.merge.v256f64(<256 x i1> %m, <256 x double> %vr, <256 x double> %passthru, i32 %n)
+  ret <256 x double> %r0
+}
+
+define fastcc <256 x double> @test_vp_fmul_v256f64_vr_merge(<256 x double> %passthru, <256 x double> %i0, double %s1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fmul_v256f64_vr_merge:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfmul.d %v0, %s0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <256 x double> undef, double %s1, i32 0
+  %i1 = shufflevector <256 x double> %yins, <256 x double> undef, <256 x i32> zeroinitializer
+  %vr = call <256 x double> @llvm.vp.fmul.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x i1> %m, i32 %n)
+  %r0 = call <256 x double> @llvm.vp.merge.v256f64(<256 x i1> %m, <256 x double> %vr, <256 x double> %passthru, i32 %n)
+  ret <256 x double> %r0
+}
diff --git a/llvm/test/CodeGen/VE/Vector/vp_fsub_merge.ll b/llvm/test/CodeGen/VE/Vector/vp_fsub_merge.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vp_fsub_merge.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=ve -mattr=+vpu | FileCheck %s
+
+declare <256 x float> @llvm.vp.merge.v256f32(<256 x i1>, <256 x float>, <256 x float>, i32)
+declare <256 x float> @llvm.vp.fsub.v256f32(<256 x float>, <256 x float>, <256 x i1>, i32)
+
+define fastcc <256 x float> @test_vp_fsub_v256f32_vv_merge(<256 x float> %passthru, <256 x float> %i0, <256 x float> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fsub_v256f32_vv_merge:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    pvfsub.up %v0, %v1, %v2, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %vr = call <256 x float> @llvm.vp.fsub.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x i1> %m, i32 %n)
+  %r0 = call <256 x float> @llvm.vp.merge.v256f32(<256 x i1> %m, <256 x float> %vr, <256 x float> %passthru, i32 %n)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vp_fsub_v256f32_rv_merge(<256 x float> %passthru, float %s0, <256 x float> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fsub_v256f32_rv_merge:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    pvfsub.up %v0, %s0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x float> undef, float %s0, i32 0
+  %i0 = shufflevector <256 x float> %xins, <256 x float> undef, <256 x i32> zeroinitializer
+  %vr = call <256 x float> @llvm.vp.fsub.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x i1> %m, i32 %n)
+  %r0 = call <256 x float> @llvm.vp.merge.v256f32(<256 x i1> %m, <256 x float> %vr, <256 x float> %passthru, i32 %n)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vp_fsub_v256f32_vr_merge(<256 x float> %passthru, <256 x float> %i0, float %s1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fsub_v256f32_vr_merge:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    lvl %s2
+; CHECK-NEXT:    vbrd %v2, %s0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    pvfsub.up %v0, %v1, %v2, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <256 x float> undef, float %s1, i32 0
+  %i1 = shufflevector <256 x float> %yins, <256 x float> undef, <256 x i32> zeroinitializer
+  %vr = call <256 x float> @llvm.vp.fsub.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x i1> %m, i32 %n)
+  %r0 = call <256 x float> @llvm.vp.merge.v256f32(<256 x i1> %m, <256 x float> %vr, <256 x float> %passthru, i32 %n)
+  ret <256 x float> %r0
+}
+
+
+declare <256 x double> @llvm.vp.merge.v256f64(<256 x i1>, <256 x double>, <256 x double>, i32)
+declare <256 x double> @llvm.vp.fsub.v256f64(<256 x double>, <256 x double>, <256 x i1>, i32)
+
+define fastcc <256 x double> @test_vp_fsub_v256f64_vv_merge(<256 x double> %passthru, <256 x double> %i0, <256 x double> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fsub_v256f64_vv_merge:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vfsub.d %v0, %v1, %v2, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %vr = call <256 x double> @llvm.vp.fsub.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x i1> %m, i32 %n)
+  %r0 = call <256 x double> @llvm.vp.merge.v256f64(<256 x i1> %m, <256 x double> %vr, <256 x double> %passthru, i32 %n)
+  ret <256 x double> %r0
+}
+
+define fastcc <256 x double> @test_vp_fsub_v256f64_rv_merge(<256 x double> %passthru, double %s0, <256 x double> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fsub_v256f64_rv_merge:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfsub.d %v0, %s0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x double> undef, double %s0, i32 0
+  %i0 = shufflevector <256 x double> %xins, <256 x double> undef, <256 x i32> zeroinitializer
+  %vr = call <256 x double> @llvm.vp.fsub.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x i1> %m, i32 %n)
+  %r0 = call <256 x double> @llvm.vp.merge.v256f64(<256 x i1> %m, <256 x double> %vr, <256 x double> %passthru, i32 %n)
+  ret <256 x double> %r0
+}
+
+define fastcc <256 x double> @test_vp_fsub_v256f64_vr_merge(<256 x double> %passthru, <256 x double> %i0, double %s1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fsub_v256f64_vr_merge:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lea %s2, 256
+; CHECK-NEXT:    lvl %s2
+; CHECK-NEXT:    vbrd %v2, %s0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfsub.d %v0, %v1, %v2, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <256 x double> undef, double %s1, i32 0
+  %i1 = shufflevector <256 x double> %yins, <256 x double> undef, <256 x i32> zeroinitializer
+  %vr = call <256 x double> @llvm.vp.fsub.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x i1> %m, i32 %n)
+  %r0 = call <256 x double> @llvm.vp.merge.v256f64(<256 x i1> %m, <256 x double> %vr, <256 x double> %passthru, i32 %n)
+  ret <256 x double> %r0
+}
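
Note: the new patterns fold a vvp_select (the VE-custom node that llvm.vp.merge is lowered to) wrapped around a masked VVP binary operator into the masked, passthru-carrying "ml_v" instruction variant; the select's false operand becomes the instruction's tied passthru, and the select's pivot supplies the AVL. For orientation, a minimal sketch of how such a multiclass is instantiated for one opcode. The actual defm lines live further down in VVPInstrPatternsVec.td; the operator and instruction base names below are illustrative assumptions, not verbatim from this patch:

    // Sketch only (assumed names): tie the vvp_fadd node to the VFADDD
    // ("vfadd.d") instruction family for v256f64, so that e.g.
    // vvp_select(vvp_fadd(x, y, ...), vfalse, mask, avl) selects VFADDDvvml_v.
    defm : Binary_rv<vvp_fadd, f64, v256f64, v256i1, "VFADDD">;
    defm : Binary_vr<vvp_fadd, f64, v256f64, v256i1, "VFADDD">;
    defm : Binary_vv<vvp_fadd, v256f64, v256i1, "VFADDD">;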