diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll
@@ -0,0 +1,303 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh < %s | FileCheck %s -check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh < %s | FileCheck %s -check-prefixes=CHECK,RV64
+
+; The two loads are contiguous and should be folded into one
+define void @widen_2xv4i16(ptr %x, ptr %z) {
+; CHECK-LABEL: widen_2xv4i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    addi a0, a0, 8
+; CHECK-NEXT:    vle16.v v9, (a0)
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 4
+; CHECK-NEXT:    vse16.v v8, (a1)
+; CHECK-NEXT:    ret
+  %a = load <4 x i16>, ptr %x
+  %b.gep = getelementptr i8, ptr %x, i64 8
+  %b = load <4 x i16>, ptr %b.gep
+  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store <8 x i16> %c, ptr %z
+  ret void
+}
+
+define void @widen_3xv4i16(ptr %x, ptr %z) {
+; RV32-LABEL: widen_3xv4i16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; RV32-NEXT:    vle16.v v8, (a0)
+; RV32-NEXT:    addi a2, a0, 8
+; RV32-NEXT:    vle16.v v10, (a2)
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vle16.v v12, (a0)
+; RV32-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
+; RV32-NEXT:    vslideup.vi v8, v10, 4
+; RV32-NEXT:    addi a0, a1, 16
+; RV32-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; RV32-NEXT:    vse16.v v12, (a0)
+; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT:    vse16.v v8, (a1)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: widen_3xv4i16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; RV64-NEXT:    vle16.v v8, (a0)
+; RV64-NEXT:    addi a2, a0, 8
+; RV64-NEXT:    vle16.v v10, (a2)
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vle16.v v12, (a0)
+; RV64-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
+; RV64-NEXT:    vslideup.vi v8, v10, 4
+; RV64-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
+; RV64-NEXT:    vslideup.vi v8, v12, 8
+; RV64-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
+; RV64-NEXT:    vslidedown.vi v10, v8, 2
+; RV64-NEXT:    addi a0, a1, 16
+; RV64-NEXT:    vse64.v v10, (a0)
+; RV64-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; RV64-NEXT:    vse16.v v8, (a1)
+; RV64-NEXT:    ret
+  %a = load <4 x i16>, ptr %x
+  %b.gep = getelementptr i8, ptr %x, i64 8
+  %b = load <4 x i16>, ptr %b.gep
+  %c.gep = getelementptr i8, ptr %b.gep, i64 8
+  %c = load <4 x i16>, ptr %c.gep
+  %d.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %d.1 = shufflevector <4 x i16> %c, <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  %d.2 = shufflevector <8 x i16> %d.0, <8 x i16> %d.1, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+  store <12 x i16> %d.2, ptr %z
+  ret void
+}
+
+define void @widen_4xv4i16(ptr %x, ptr %z) {
+; CHECK-LABEL: widen_4xv4i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    addi a2, a0, 8
+; CHECK-NEXT:    vle16.v v10, (a2)
+; CHECK-NEXT:    addi a2, a0, 16
+; CHECK-NEXT:    vle16.v v12, (a2)
+; CHECK-NEXT:    addi a0, a0, 24
+; CHECK-NEXT:    vle16.v v14, (a0)
+; CHECK-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v12, 8
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v14, 12
+; CHECK-NEXT:    vse16.v v8, (a1)
+; CHECK-NEXT:    ret
+  %a = load <4 x i16>, ptr %x
+  %b.gep = getelementptr i8, ptr %x, i64 8
+  %b = load <4 x i16>, ptr %b.gep
+  %c.gep = getelementptr i8, ptr %b.gep, i64 8
+  %c = load <4 x i16>, ptr %c.gep
+  %d.gep = getelementptr i8, ptr %c.gep, i64 8
+  %d = load <4 x i16>, ptr %d.gep
+  %e.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %e.1 = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %e.2 = shufflevector <8 x i16> %e.0, <8 x i16> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  store <16 x i16> %e.2, ptr %z
+  ret void
+}
+
+; Should be a strided load - with type coercion to i64
+define void @strided_constant(ptr %x, ptr %z) {
+; CHECK-LABEL: strided_constant:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vle16.v v9, (a0)
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 4
+; CHECK-NEXT:    vse16.v v8, (a1)
+; CHECK-NEXT:    ret
+  %a = load <4 x i16>, ptr %x
+  %b.gep = getelementptr i8, ptr %x, i64 16
+  %b = load <4 x i16>, ptr %b.gep
+  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store <8 x i16> %c, ptr %z
+  ret void
+}
+
+; Should be a strided load
+define void @strided_constant_64(ptr %x, ptr %z) {
+; CHECK-LABEL: strided_constant_64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    addi a0, a0, 64
+; CHECK-NEXT:    vle16.v v9, (a0)
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 4
+; CHECK-NEXT:    vse16.v v8, (a1)
+; CHECK-NEXT:    ret
+  %a = load <4 x i16>, ptr %x
+  %b.gep = getelementptr i8, ptr %x, i64 64
+  %b = load <4 x i16>, ptr %b.gep
+  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store <8 x i16> %c, ptr %z
+  ret void
+}
+
+; Vector is too large to fit into a single strided load
+define void @strided_constant_v4i32(ptr %x, ptr %z) {
+; CHECK-LABEL: strided_constant_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vle32.v v10, (a0)
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-NEXT:    vse32.v v8, (a1)
+; CHECK-NEXT:    ret
+  %a = load <4 x i32>, ptr %x
+  %b.gep = getelementptr i8, ptr %x, i64 16
+  %b = load <4 x i32>, ptr %b.gep
+  %c = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store <8 x i32> %c, ptr %z
+  ret void
+}
+
+; Interestingly, can be a stride 0 load
+define void @strided_constant_0(ptr %x, ptr %z) {
+; CHECK-LABEL: strided_constant_0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v9, v8
+; CHECK-NEXT:    vslideup.vi v9, v8, 4
+; CHECK-NEXT:    vse16.v v9, (a1)
+; CHECK-NEXT:    ret
+  %a = load <4 x i16>, ptr %x
+  %b = load <4 x i16>, ptr %x
+  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store <8 x i16> %c, ptr %z
+  ret void
+}
+
+
+define void @strided_runtime(ptr %x, ptr %z, i64 %s) {
+; CHECK-LABEL: strided_runtime:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    add a0, a0, a2
+; CHECK-NEXT:    vle16.v v9, (a0)
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 4
+; CHECK-NEXT:    vse16.v v8, (a1)
+; CHECK-NEXT:    ret
+  %a = load <4 x i16>, ptr %x
+  %b.gep = getelementptr i8, ptr %x, i64 %s
+  %b = load <4 x i16>, ptr %b.gep
+  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store <8 x i16> %c, ptr %z
+  ret void
+}
+
+
+define void @strided_runtime_4xv4i16(ptr %x, ptr %z, i64 %s) {
+; CHECK-LABEL: strided_runtime_4xv4i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    add a0, a0, a2
+; CHECK-NEXT:    vle16.v v10, (a0)
+; CHECK-NEXT:    add a0, a0, a2
+; CHECK-NEXT:    vle16.v v12, (a0)
+; CHECK-NEXT:    add a0, a0, a2
+; CHECK-NEXT:    vle16.v v14, (a0)
+; CHECK-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v12, 8
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v14, 12
+; CHECK-NEXT:    vse16.v v8, (a1)
+; CHECK-NEXT:    ret
+  %a = load <4 x i16>, ptr %x
+  %b.gep = getelementptr i8, ptr %x, i64 %s
+  %b = load <4 x i16>, ptr %b.gep
+  %c.gep = getelementptr i8, ptr %b.gep, i64 %s
+  %c = load <4 x i16>, ptr %c.gep
+  %d.gep = getelementptr i8, ptr %c.gep, i64 %s
+  %d = load <4 x i16>, ptr %d.gep
+  %e.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %e.1 = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %e.2 = shufflevector <8 x i16> %e.0, <8 x i16> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  store <16 x i16> %e.2, ptr %z
+  ret void
+}
+
+define void @strided_runtime_4xv4f16(ptr %x, ptr %z, i64 %s) {
+; CHECK-LABEL: strided_runtime_4xv4f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    add a0, a0, a2
+; CHECK-NEXT:    vle16.v v10, (a0)
+; CHECK-NEXT:    add a0, a0, a2
+; CHECK-NEXT:    vle16.v v12, (a0)
+; CHECK-NEXT:    add a0, a0, a2
+; CHECK-NEXT:    vle16.v v14, (a0)
+; CHECK-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v12, 8
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v14, 12
+; CHECK-NEXT:    vse16.v v8, (a1)
+; CHECK-NEXT:    ret
+  %a = load <4 x half>, ptr %x
+  %b.gep = getelementptr i8, ptr %x, i64 %s
+  %b = load <4 x half>, ptr %b.gep
+  %c.gep = getelementptr i8, ptr %b.gep, i64 %s
+  %c = load <4 x half>, ptr %c.gep
+  %d.gep = getelementptr i8, ptr %c.gep, i64 %s
+  %d = load <4 x half>, ptr %d.gep
+  %e.0 = shufflevector <4 x half> %a, <4 x half> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %e.1 = shufflevector <4 x half> %c, <4 x half> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %e.2 = shufflevector <8 x half> %e.0, <8 x half> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  store <16 x half> %e.2, ptr %z
+  ret void
+}
+
+define void @strided_runtime_4xv2f32(ptr %x, ptr %z, i64 %s) {
+; CHECK-LABEL: strided_runtime_4xv2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    add a0, a0, a2
+; CHECK-NEXT:    vle32.v v10, (a0)
+; CHECK-NEXT:    add a0, a0, a2
+; CHECK-NEXT:    vle32.v v12, (a0)
+; CHECK-NEXT:    add a0, a0, a2
+; CHECK-NEXT:    vle32.v v14, (a0)
+; CHECK-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v10, 2
+; CHECK-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v12, 4
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v14, 6
+; CHECK-NEXT:    vse32.v v8, (a1)
+; CHECK-NEXT:    ret
+  %a = load <2 x float>, ptr %x
+  %b.gep = getelementptr i8, ptr %x, i64 %s
+  %b = load <2 x float>, ptr %b.gep
+  %c.gep = getelementptr i8, ptr %b.gep, i64 %s
+  %c = load <2 x float>, ptr %c.gep
+  %d.gep = getelementptr i8, ptr %c.gep, i64 %s
+  %d = load <2 x float>, ptr %d.gep
+  %e.0 = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %e.1 = shufflevector <2 x float> %c, <2 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %e.2 = shufflevector <4 x float> %e.0, <4 x float> %e.1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store <8 x float> %e.2, ptr %z
+  ret void
+}