diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1514,9 +1514,11 @@ if (!Subtarget->hasSVE()) return true; - // We can only support legal predicate result types. + // We can only support legal predicate result types. We can use the SVE + // whilelo instruction for generating fixed-width predicates too. if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 && - ResVT != MVT::nxv16i1) + ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 && + ResVT != MVT::v8i1 && ResVT != MVT::v16i1) return true; // The whilelo instruction only works with i32 or i64 scalar inputs. @@ -15390,6 +15392,39 @@ switch (IID) { default: break; + case Intrinsic::get_active_lane_mask: { + SDValue Res = SDValue(); + EVT VT = N->getValueType(0); + if (VT.isFixedLengthVector()) { + // We can use the SVE whilelo instruction to lower this intrinsic by + // creating the appropriate sequence of scalable vector operations and + // then extracting a fixed-width subvector from the scalable vector. + + SDLoc DL(N); + SDValue ID = + DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64); + + EVT WhileVT = EVT::getVectorVT( + *DAG.getContext(), MVT::i1, + ElementCount::getScalable(VT.getVectorNumElements())); + + // Get promoted scalable vector VT, i.e. promote nxv4i1 -> nxv4i32. + EVT PromVT = getPromotedVTForPredicate(WhileVT); + + // Get the fixed-width equivalent of PromVT for extraction. + EVT ExtVT = + EVT::getVectorVT(*DAG.getContext(), PromVT.getVectorElementType(), + VT.getVectorElementCount()); + + Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WhileVT, ID, + N->getOperand(1), N->getOperand(2)); + Res = DAG.getNode(ISD::SIGN_EXTEND, DL, PromVT, Res); + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, Res, + DAG.getConstant(0, DL, MVT::i64)); + Res = DAG.getNode(ISD::TRUNCATE, DL, VT, Res); + } + return Res; + } case Intrinsic::aarch64_neon_vcvtfxs2fp: case Intrinsic::aarch64_neon_vcvtfxu2fp: return tryCombineFixedPointConvert(N, DCI, DAG); diff --git a/llvm/test/CodeGen/AArch64/active_lane_mask.ll b/llvm/test/CodeGen/AArch64/active_lane_mask.ll --- a/llvm/test/CodeGen/AArch64/active_lane_mask.ll +++ b/llvm/test/CodeGen/AArch64/active_lane_mask.ll @@ -324,27 +324,9 @@ define <16 x i1> @lane_mask_v16i1_i32(i32 %index, i32 %TC) { ; CHECK-LABEL: lane_mask_v16i1_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI15_0 -; CHECK-NEXT: adrp x9, .LCPI15_3 -; CHECK-NEXT: adrp x10, .LCPI15_2 -; CHECK-NEXT: dup v2.4s, w0 -; CHECK-NEXT: dup v5.4s, w1 -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI15_0] -; CHECK-NEXT: adrp x8, .LCPI15_1 -; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI15_3] -; CHECK-NEXT: ldr q3, [x10, :lo12:.LCPI15_2] -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI15_1] -; CHECK-NEXT: uqadd v1.4s, v2.4s, v1.4s -; CHECK-NEXT: uqadd v3.4s, v2.4s, v3.4s -; CHECK-NEXT: uqadd v4.4s, v2.4s, v4.4s -; CHECK-NEXT: uqadd v0.4s, v2.4s, v0.4s -; CHECK-NEXT: cmhi v1.4s, v5.4s, v1.4s -; CHECK-NEXT: cmhi v2.4s, v5.4s, v3.4s -; CHECK-NEXT: cmhi v3.4s, v5.4s, v4.4s -; CHECK-NEXT: cmhi v0.4s, v5.4s, v0.4s -; CHECK-NEXT: uzp1 v1.8h, v2.8h, v1.8h -; CHECK-NEXT: uzp1 v0.8h, v0.8h, v3.8h -; CHECK-NEXT: uzp1 v0.16b, v0.16b, v1.16b +; CHECK-NEXT: whilelo p0.b, w0, w1 +; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %TC) ret <16 x i1> %active.lane.mask @@ -353,17 +335,8 @@ define <8 x i1> @lane_mask_v8i1_i32(i32 %index, i32 %TC) { ; CHECK-LABEL: lane_mask_v8i1_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI16_1 -; CHECK-NEXT: adrp x9, .LCPI16_0 -; CHECK-NEXT: dup v2.4s, w0 -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI16_1] -; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI16_0] -; CHECK-NEXT: uqadd v0.4s, v2.4s, v0.4s -; CHECK-NEXT: uqadd v1.4s, v2.4s, v1.4s -; CHECK-NEXT: dup v2.4s, w1 -; CHECK-NEXT: cmhi v0.4s, v2.4s, v0.4s -; CHECK-NEXT: cmhi v1.4s, v2.4s, v1.4s -; CHECK-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-NEXT: whilelo p0.h, w0, w1 +; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: xtn v0.8b, v0.8h ; CHECK-NEXT: ret %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %TC) @@ -373,12 +346,8 @@ define <4 x i1> @lane_mask_v4i1_i32(i32 %index, i32 %TC) { ; CHECK-LABEL: lane_mask_v4i1_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI17_0 -; CHECK-NEXT: dup v1.4s, w0 -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI17_0] -; CHECK-NEXT: uqadd v0.4s, v1.4s, v0.4s -; CHECK-NEXT: dup v1.4s, w1 -; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s +; CHECK-NEXT: whilelo p0.s, w0, w1 +; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %TC) @@ -388,12 +357,9 @@ define <2 x i1> @lane_mask_v2i1_i32(i32 %index, i32 %TC) { ; CHECK-LABEL: lane_mask_v2i1_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI18_0 -; CHECK-NEXT: dup v0.2s, w0 -; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI18_0] -; CHECK-NEXT: uqadd v0.2s, v0.2s, v1.2s -; CHECK-NEXT: dup v1.2s, w1 -; CHECK-NEXT: cmhi v0.2s, v1.2s, v0.2s +; CHECK-NEXT: whilelo p0.d, w0, w1 +; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: xtn v0.2s, v0.2d ; CHECK-NEXT: ret %active.lane.mask = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 %index, i32 %TC) ret <2 x i1> %active.lane.mask @@ -402,47 +368,9 @@ define <16 x i1> @lane_mask_v16i1_i64(i64 %index, i64 %TC) { ; CHECK-LABEL: lane_mask_v16i1_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI19_0 -; CHECK-NEXT: adrp x9, .LCPI19_1 -; CHECK-NEXT: adrp x10, .LCPI19_2 -; CHECK-NEXT: dup v1.2d, x0 -; CHECK-NEXT: dup v17.2d, x1 -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI19_0] -; CHECK-NEXT: adrp x8, .LCPI19_3 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI19_1] -; CHECK-NEXT: adrp x9, .LCPI19_4 -; CHECK-NEXT: ldr q3, [x10, :lo12:.LCPI19_2] -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI19_3] -; CHECK-NEXT: adrp x8, .LCPI19_5 -; CHECK-NEXT: ldr q5, [x9, :lo12:.LCPI19_4] -; CHECK-NEXT: adrp x9, .LCPI19_7 -; CHECK-NEXT: uqadd v0.2d, v1.2d, v0.2d -; CHECK-NEXT: ldr q6, [x8, :lo12:.LCPI19_5] -; CHECK-NEXT: adrp x8, .LCPI19_6 -; CHECK-NEXT: ldr q7, [x9, :lo12:.LCPI19_7] -; CHECK-NEXT: uqadd v2.2d, v1.2d, v2.2d -; CHECK-NEXT: ldr q16, [x8, :lo12:.LCPI19_6] -; CHECK-NEXT: uqadd v3.2d, v1.2d, v3.2d -; CHECK-NEXT: uqadd v4.2d, v1.2d, v4.2d -; CHECK-NEXT: uqadd v6.2d, v1.2d, v6.2d -; CHECK-NEXT: uqadd v7.2d, v1.2d, v7.2d -; CHECK-NEXT: uqadd v16.2d, v1.2d, v16.2d -; CHECK-NEXT: uqadd v1.2d, v1.2d, v5.2d -; CHECK-NEXT: cmhi v6.2d, v17.2d, v6.2d -; CHECK-NEXT: cmhi v5.2d, v17.2d, v7.2d -; CHECK-NEXT: cmhi v7.2d, v17.2d, v16.2d -; CHECK-NEXT: cmhi v1.2d, v17.2d, v1.2d -; CHECK-NEXT: cmhi v4.2d, v17.2d, v4.2d -; CHECK-NEXT: cmhi v3.2d, v17.2d, v3.2d -; CHECK-NEXT: cmhi v2.2d, v17.2d, v2.2d -; CHECK-NEXT: cmhi v0.2d, v17.2d, v0.2d -; CHECK-NEXT: uzp1 v5.4s, v7.4s, v5.4s -; CHECK-NEXT: uzp1 v1.4s, v1.4s, v6.4s -; CHECK-NEXT: uzp1 v3.4s, v3.4s, v4.4s -; CHECK-NEXT: uzp1 v0.4s, v0.4s, v2.4s -; CHECK-NEXT: uzp1 v1.8h, v1.8h, v5.8h -; CHECK-NEXT: uzp1 v0.8h, v0.8h, v3.8h -; CHECK-NEXT: uzp1 v0.16b, v0.16b, v1.16b +; CHECK-NEXT: whilelo p0.b, x0, x1 +; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 %index, i64 %TC) ret <16 x i1> %active.lane.mask @@ -451,27 +379,8 @@ define <8 x i1> @lane_mask_v8i1_i64(i64 %index, i64 %TC) { ; CHECK-LABEL: lane_mask_v8i1_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI20_0 -; CHECK-NEXT: adrp x9, .LCPI20_3 -; CHECK-NEXT: adrp x10, .LCPI20_2 -; CHECK-NEXT: dup v2.2d, x0 -; CHECK-NEXT: dup v5.2d, x1 -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI20_0] -; CHECK-NEXT: adrp x8, .LCPI20_1 -; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI20_3] -; CHECK-NEXT: ldr q3, [x10, :lo12:.LCPI20_2] -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI20_1] -; CHECK-NEXT: uqadd v1.2d, v2.2d, v1.2d -; CHECK-NEXT: uqadd v3.2d, v2.2d, v3.2d -; CHECK-NEXT: uqadd v4.2d, v2.2d, v4.2d -; CHECK-NEXT: uqadd v0.2d, v2.2d, v0.2d -; CHECK-NEXT: cmhi v1.2d, v5.2d, v1.2d -; CHECK-NEXT: cmhi v2.2d, v5.2d, v3.2d -; CHECK-NEXT: cmhi v3.2d, v5.2d, v4.2d -; CHECK-NEXT: cmhi v0.2d, v5.2d, v0.2d -; CHECK-NEXT: uzp1 v1.4s, v2.4s, v1.4s -; CHECK-NEXT: uzp1 v0.4s, v0.4s, v3.4s -; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-NEXT: whilelo p0.h, x0, x1 +; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: xtn v0.8b, v0.8h ; CHECK-NEXT: ret %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 %index, i64 %TC) @@ -481,17 +390,8 @@ define <4 x i1> @lane_mask_v4i1_i64(i64 %index, i64 %TC) { ; CHECK-LABEL: lane_mask_v4i1_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI21_1 -; CHECK-NEXT: adrp x9, .LCPI21_0 -; CHECK-NEXT: dup v2.2d, x0 -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI21_1] -; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI21_0] -; CHECK-NEXT: uqadd v0.2d, v2.2d, v0.2d -; CHECK-NEXT: uqadd v1.2d, v2.2d, v1.2d -; CHECK-NEXT: dup v2.2d, x1 -; CHECK-NEXT: cmhi v0.2d, v2.2d, v0.2d -; CHECK-NEXT: cmhi v1.2d, v2.2d, v1.2d -; CHECK-NEXT: uzp1 v0.4s, v1.4s, v0.4s +; CHECK-NEXT: whilelo p0.s, x0, x1 +; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %index, i64 %TC) @@ -501,12 +401,8 @@ define <2 x i1> @lane_mask_v2i1_i64(i64 %index, i64 %TC) { ; CHECK-LABEL: lane_mask_v2i1_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI22_0 -; CHECK-NEXT: dup v1.2d, x0 -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI22_0] -; CHECK-NEXT: uqadd v0.2d, v1.2d, v0.2d -; CHECK-NEXT: dup v1.2d, x1 -; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d +; CHECK-NEXT: whilelo p0.d, x0, x1 +; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: xtn v0.2s, v0.2d ; CHECK-NEXT: ret %active.lane.mask = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 %index, i64 %TC)