diff --git a/llvm/test/CodeGen/AArch64/active_lane_mask.ll b/llvm/test/CodeGen/AArch64/active_lane_mask.ll
--- a/llvm/test/CodeGen/AArch64/active_lane_mask.ll
+++ b/llvm/test/CodeGen/AArch64/active_lane_mask.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
 
+; == Scalable ==
+
 define <vscale x 16 x i1> @lane_mask_nxv16i1_i32(i32 %index, i32 %TC) {
 ; CHECK-LABEL: lane_mask_nxv16i1_i32:
 ; CHECK:       // %bb.0:
@@ -317,6 +319,265 @@
 }
 
 
+; == Fixed width ==
+
+define <16 x i1> @lane_mask_v16i1_i32(i32 %index, i32 %TC) {
+; CHECK-LABEL: lane_mask_v16i1_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI15_0
+; CHECK-NEXT:    adrp x9, .LCPI15_3
+; CHECK-NEXT:    adrp x10, .LCPI15_2
+; CHECK-NEXT:    dup v2.4s, w0
+; CHECK-NEXT:    dup v5.4s, w1
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI15_0]
+; CHECK-NEXT:    adrp x8, .LCPI15_1
+; CHECK-NEXT:    ldr q1, [x9, :lo12:.LCPI15_3]
+; CHECK-NEXT:    ldr q3, [x10, :lo12:.LCPI15_2]
+; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI15_1]
+; CHECK-NEXT:    uqadd v1.4s, v2.4s, v1.4s
+; CHECK-NEXT:    uqadd v3.4s, v2.4s, v3.4s
+; CHECK-NEXT:    uqadd v4.4s, v2.4s, v4.4s
+; CHECK-NEXT:    uqadd v0.4s, v2.4s, v0.4s
+; CHECK-NEXT:    cmhi v1.4s, v5.4s, v1.4s
+; CHECK-NEXT:    cmhi v2.4s, v5.4s, v3.4s
+; CHECK-NEXT:    cmhi v3.4s, v5.4s, v4.4s
+; CHECK-NEXT:    cmhi v0.4s, v5.4s, v0.4s
+; CHECK-NEXT:    uzp1 v1.8h, v2.8h, v1.8h
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v3.8h
+; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %TC)
+  ret <16 x i1> %active.lane.mask
+}
+
+define <8 x i1> @lane_mask_v8i1_i32(i32 %index, i32 %TC) {
+; CHECK-LABEL: lane_mask_v8i1_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI16_1
+; CHECK-NEXT:    adrp x9, .LCPI16_0
+; CHECK-NEXT:    dup v2.4s, w0
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI16_1]
+; CHECK-NEXT:    ldr q1, [x9, :lo12:.LCPI16_0]
+; CHECK-NEXT:    uqadd v0.4s, v2.4s, v0.4s
+; CHECK-NEXT:    uqadd v1.4s, v2.4s, v1.4s
+; CHECK-NEXT:    dup v2.4s, w1
+; CHECK-NEXT:    cmhi v0.4s, v2.4s, v0.4s
+; CHECK-NEXT:    cmhi v1.4s, v2.4s, v1.4s
+; CHECK-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    xtn v0.8b, v0.8h
+; CHECK-NEXT:    ret
+  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %TC)
+  ret <8 x i1> %active.lane.mask
+}
+
+define <4 x i1> @lane_mask_v4i1_i32(i32 %index, i32 %TC) {
+; CHECK-LABEL: lane_mask_v4i1_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI17_0
+; CHECK-NEXT:    dup v1.4s, w0
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI17_0]
+; CHECK-NEXT:    uqadd v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    dup v1.4s, w1
+; CHECK-NEXT:    cmhi v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %TC)
+  ret <4 x i1> %active.lane.mask
+}
+
+define <2 x i1> @lane_mask_v2i1_i32(i32 %index, i32 %TC) {
+; CHECK-LABEL: lane_mask_v2i1_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI18_0
+; CHECK-NEXT:    dup v0.2s, w0
+; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI18_0]
+; CHECK-NEXT:    uqadd v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    dup v1.2s, w1
+; CHECK-NEXT:    cmhi v0.2s, v1.2s, v0.2s
+; CHECK-NEXT:    ret
+  %active.lane.mask = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 %index, i32 %TC)
+  ret <2 x i1> %active.lane.mask
+}
+
+define <16 x i1> @lane_mask_v16i1_i64(i64 %index, i64 %TC) {
+; CHECK-LABEL: lane_mask_v16i1_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI19_0
+; CHECK-NEXT:    adrp x9, .LCPI19_1
+; CHECK-NEXT:    adrp x10, .LCPI19_2
+; CHECK-NEXT:    dup v1.2d, x0
+; CHECK-NEXT:    dup v17.2d, x1
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI19_0]
+; CHECK-NEXT:    adrp x8, .LCPI19_3
+; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI19_1]
+; CHECK-NEXT:    adrp x9, .LCPI19_4
+; CHECK-NEXT:    ldr q3, [x10, :lo12:.LCPI19_2]
+; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI19_3]
+; CHECK-NEXT:    adrp x8, .LCPI19_5
+; CHECK-NEXT:    ldr q5, [x9, :lo12:.LCPI19_4]
+; CHECK-NEXT:    adrp x9, .LCPI19_7
+; CHECK-NEXT:    uqadd v0.2d, v1.2d, v0.2d
+; CHECK-NEXT:    ldr q6, [x8, :lo12:.LCPI19_5]
+; CHECK-NEXT:    adrp x8, .LCPI19_6
+; CHECK-NEXT:    ldr q7, [x9, :lo12:.LCPI19_7]
+; CHECK-NEXT:    uqadd v2.2d, v1.2d, v2.2d
+; CHECK-NEXT:    ldr q16, [x8, :lo12:.LCPI19_6]
+; CHECK-NEXT:    uqadd v3.2d, v1.2d, v3.2d
+; CHECK-NEXT:    uqadd v4.2d, v1.2d, v4.2d
+; CHECK-NEXT:    uqadd v6.2d, v1.2d, v6.2d
+; CHECK-NEXT:    uqadd v7.2d, v1.2d, v7.2d
+; CHECK-NEXT:    uqadd v16.2d, v1.2d, v16.2d
+; CHECK-NEXT:    uqadd v1.2d, v1.2d, v5.2d
+; CHECK-NEXT:    cmhi v6.2d, v17.2d, v6.2d
+; CHECK-NEXT:    cmhi v5.2d, v17.2d, v7.2d
+; CHECK-NEXT:    cmhi v7.2d, v17.2d, v16.2d
+; CHECK-NEXT:    cmhi v1.2d, v17.2d, v1.2d
+; CHECK-NEXT:    cmhi v4.2d, v17.2d, v4.2d
+; CHECK-NEXT:    cmhi v3.2d, v17.2d, v3.2d
+; CHECK-NEXT:    cmhi v2.2d, v17.2d, v2.2d
+; CHECK-NEXT:    cmhi v0.2d, v17.2d, v0.2d
+; CHECK-NEXT:    uzp1 v5.4s, v7.4s, v5.4s
+; CHECK-NEXT:    uzp1 v1.4s, v1.4s, v6.4s
+; CHECK-NEXT:    uzp1 v3.4s, v3.4s, v4.4s
+; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    uzp1 v1.8h, v1.8h, v5.8h
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v3.8h
+; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 %index, i64 %TC)
+  ret <16 x i1> %active.lane.mask
+}
+
+define <8 x i1> @lane_mask_v8i1_i64(i64 %index, i64 %TC) {
+; CHECK-LABEL: lane_mask_v8i1_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI20_0
+; CHECK-NEXT:    adrp x9, .LCPI20_3
+; CHECK-NEXT:    adrp x10, .LCPI20_2
+; CHECK-NEXT:    dup v2.2d, x0
+; CHECK-NEXT:    dup v5.2d, x1
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI20_0]
+; CHECK-NEXT:    adrp x8, .LCPI20_1
+; CHECK-NEXT:    ldr q1, [x9, :lo12:.LCPI20_3]
+; CHECK-NEXT:    ldr q3, [x10, :lo12:.LCPI20_2]
+; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI20_1]
+; CHECK-NEXT:    uqadd v1.2d, v2.2d, v1.2d
+; CHECK-NEXT:    uqadd v3.2d, v2.2d, v3.2d
+; CHECK-NEXT:    uqadd v4.2d, v2.2d, v4.2d
+; CHECK-NEXT:    uqadd v0.2d, v2.2d, v0.2d
+; CHECK-NEXT:    cmhi v1.2d, v5.2d, v1.2d
+; CHECK-NEXT:    cmhi v2.2d, v5.2d, v3.2d
+; CHECK-NEXT:    cmhi v3.2d, v5.2d, v4.2d
+; CHECK-NEXT:    cmhi v0.2d, v5.2d, v0.2d
+; CHECK-NEXT:    uzp1 v1.4s, v2.4s, v1.4s
+; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v3.4s
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    xtn v0.8b, v0.8h
+; CHECK-NEXT:    ret
+  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 %index, i64 %TC)
+  ret <8 x i1> %active.lane.mask
+}
+
+define <4 x i1> @lane_mask_v4i1_i64(i64 %index, i64 %TC) {
+; CHECK-LABEL: lane_mask_v4i1_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI21_1
+; CHECK-NEXT:    adrp x9, .LCPI21_0
+; CHECK-NEXT:    dup v2.2d, x0
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI21_1]
+; CHECK-NEXT:    ldr q1, [x9, :lo12:.LCPI21_0]
+; CHECK-NEXT:    uqadd v0.2d, v2.2d, v0.2d
+; CHECK-NEXT:    uqadd v1.2d, v2.2d, v1.2d
+; CHECK-NEXT:    dup v2.2d, x1
+; CHECK-NEXT:    cmhi v0.2d, v2.2d, v0.2d
+; CHECK-NEXT:    cmhi v1.2d, v2.2d, v1.2d
+; CHECK-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %index, i64 %TC)
+  ret <4 x i1> %active.lane.mask
+}
+
+define <2 x i1> @lane_mask_v2i1_i64(i64 %index, i64 %TC) {
+; CHECK-LABEL: lane_mask_v2i1_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI22_0
+; CHECK-NEXT:    dup v1.2d, x0
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI22_0]
+; CHECK-NEXT:    uqadd v0.2d, v1.2d, v0.2d
+; CHECK-NEXT:    dup v1.2d, x1
+; CHECK-NEXT:    cmhi v0.2d, v1.2d, v0.2d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    ret
+  %active.lane.mask = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 %index, i64 %TC)
+  ret <2 x i1> %active.lane.mask
+}
+
+define <16 x i1> @lane_mask_v16i1_i8(i8 %index, i8 %TC) {
+; CHECK-LABEL: lane_mask_v16i1_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI23_0
+; CHECK-NEXT:    dup v1.16b, w0
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI23_0]
+; CHECK-NEXT:    uqadd v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    dup v1.16b, w1
+; CHECK-NEXT:    cmhi v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    ret
+  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i8(i8 %index, i8 %TC)
+  ret <16 x i1> %active.lane.mask
+}
+
+define <8 x i1> @lane_mask_v8i1_i8(i8 %index, i8 %TC) {
+; CHECK-LABEL: lane_mask_v8i1_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI24_0
+; CHECK-NEXT:    dup v0.8b, w0
+; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI24_0]
+; CHECK-NEXT:    uqadd v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    dup v1.8b, w1
+; CHECK-NEXT:    cmhi v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    ret
+  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i8(i8 %index, i8 %TC)
+  ret <8 x i1> %active.lane.mask
+}
+
+define <4 x i1> @lane_mask_v4i1_i8(i8 %index, i8 %TC) {
+; CHECK-LABEL: lane_mask_v4i1_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v0.4h, w0
+; CHECK-NEXT:    adrp x8, .LCPI25_0
+; CHECK-NEXT:    dup v2.4h, w1
+; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI25_0]
+; CHECK-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-NEXT:    bic v2.4h, #255, lsl #8
+; CHECK-NEXT:    add v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    movi d1, #0xff00ff00ff00ff
+; CHECK-NEXT:    umin v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    cmhi v0.4h, v2.4h, v0.4h
+; CHECK-NEXT:    ret
+  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i8(i8 %index, i8 %TC)
+  ret <4 x i1> %active.lane.mask
+}
+
+define <2 x i1> @lane_mask_v2i1_i8(i8 %index, i8 %TC) {
+; CHECK-LABEL: lane_mask_v2i1_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi d0, #0x0000ff000000ff
+; CHECK-NEXT:    dup v1.2s, w0
+; CHECK-NEXT:    adrp x8, .LCPI26_0
+; CHECK-NEXT:    dup v3.2s, w1
+; CHECK-NEXT:    and v1.8b, v1.8b, v0.8b
+; CHECK-NEXT:    ldr d2, [x8, :lo12:.LCPI26_0]
+; CHECK-NEXT:    add v1.2s, v1.2s, v2.2s
+; CHECK-NEXT:    and v2.8b, v3.8b, v0.8b
+; CHECK-NEXT:    umin v0.2s, v1.2s, v0.2s
+; CHECK-NEXT:    cmhi v0.2s, v2.2s, v0.2s
+; CHECK-NEXT:    ret
+  %active.lane.mask = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i8(i8 %index, i8 %TC)
+  ret <2 x i1> %active.lane.mask
+}
+
+
 declare <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i32(i32, i32)
 declare <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32, i32)
 declare <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32, i32)
@@ -334,3 +595,19 @@
 declare <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i8(i8, i8)
 declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i8(i8, i8)
 declare <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i8(i8, i8)
+
+
+declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)
+declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
+declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
+declare <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32, i32)
+
+declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64, i64)
+declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64, i64)
+declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64, i64)
+declare <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64, i64)
+
+declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i8(i8, i8)
+declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i8(i8, i8)
+declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i8(i8, i8)
+declare <2 x i1> @llvm.get.active.lane.mask.v2i1.i8(i8, i8)
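
Note (not part of the diff): per the LangRef, lane i of llvm.get.active.lane.mask(%base, %n) is "icmp ult (%base + i), %n" with the addition performed without wrapping, which is why the checked lowering materializes a constant-pool step vector, saturating-adds it to the splatted %index (uqadd), compares unsigned against the splatted %TC (cmhi), and narrows with uzp1/xtn. A saturated lane becomes UINT_MAX, which can never test ult against %n, so saturation preserves the non-wrapping semantics. A minimal hand-expanded IR sketch of the <4 x i1>/i32 case follows; the function name is hypothetical and the expansion uses llvm.uadd.sat to mirror uqadd, not the exact DAG the backend builds.

; Hand expansion of @llvm.get.active.lane.mask.v4i1.i32: lane i is active
; iff %index + i (computed without wrapping) is unsigned-less-than %TC.
define <4 x i1> @lane_mask_v4i1_i32_expanded(i32 %index, i32 %TC) {
  ; splat %index across all four lanes
  %idx.head = insertelement <4 x i32> poison, i32 %index, i64 0
  %idx.splat = shufflevector <4 x i32> %idx.head, <4 x i32> poison, <4 x i32> zeroinitializer
  ; add the step vector <0,1,2,3>; saturation keeps would-be wrapped lanes at
  ; UINT_MAX, which then fails the ult test below for every valid %TC
  %lanes = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %idx.splat, <4 x i32> <i32 0, i32 1, i32 2, i32 3>)
  ; splat %TC and do the unsigned compare (cmhi in the generated code)
  %tc.head = insertelement <4 x i32> poison, i32 %TC, i64 0
  %tc.splat = shufflevector <4 x i32> %tc.head, <4 x i32> poison, <4 x i32> zeroinitializer
  %mask = icmp ult <4 x i32> %lanes, %tc.splat
  ret <4 x i1> %mask
}

declare <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32>, <4 x i32>)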