diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14517,20 +14517,20 @@
 // or unpredicated operation, which potentially allows better isel (perhaps
 // using immediate forms) or relaxing register reuse requirements.
 static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
-                                       SelectionDAG &DAG,
-                                       bool UnpredOp = false) {
+                                       SelectionDAG &DAG, bool UnpredOp = false,
+                                       bool SwapOperands = false) {
   assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
   assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
   SDValue Pg = N->getOperand(1);
+  SDValue Op1 = N->getOperand(SwapOperands ? 3 : 2);
+  SDValue Op2 = N->getOperand(SwapOperands ? 2 : 3);
 
   // ISD way to specify an all active predicate.
   if (isAllActivePredicate(Pg)) {
     if (UnpredOp)
-      return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), N->getOperand(2),
-                         N->getOperand(3));
-    else
-      return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg,
-                         N->getOperand(2), N->getOperand(3));
+      return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Op1, Op2);
+
+    return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg, Op1, Op2);
   }
 
   // FUTURE: SplatVector(true)
@@ -14652,6 +14652,8 @@
     return convertMergedOpToPredOp(N, ISD::ADD, DAG, true);
   case Intrinsic::aarch64_sve_sub:
     return convertMergedOpToPredOp(N, ISD::SUB, DAG, true);
+  case Intrinsic::aarch64_sve_subr:
+    return convertMergedOpToPredOp(N, ISD::SUB, DAG, true, true);
   case Intrinsic::aarch64_sve_and:
     return convertMergedOpToPredOp(N, ISD::AND, DAG, true);
   case Intrinsic::aarch64_sve_bic:
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll
@@ -263,6 +263,159 @@
   ret <vscale x 2 x i64> %out
 }
 
+; SUBR
+
+define <vscale x 16 x i8> @subr_i8(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: subr_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    subr z0.b, z0.b, #127
+; CHECK-NEXT:    ret
+  %pg = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %elt = insertelement <vscale x 16 x i8> undef, i8 127, i32 0
+  %splat = shufflevector <vscale x 16 x i8> %elt, <vscale x 16 x i8> undef, <vscale x 16 x i32> zeroinitializer
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.subr.nxv16i8(<vscale x 16 x i1> %pg,
+                                                                <vscale x 16 x i8> %a,
+                                                                <vscale x 16 x i8> %splat)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @subr_i16(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: subr_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    subr z0.h, z0.h, #127
+; CHECK-NEXT:    ret
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %elt = insertelement <vscale x 8 x i16> undef, i16 127, i32 0
+  %splat = shufflevector <vscale x 8 x i16> %elt, <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.subr.nxv8i16(<vscale x 8 x i1> %pg,
+                                                                <vscale x 8 x i16> %a,
+                                                                <vscale x 8 x i16> %splat)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 8 x i16> @subr_i16_out_of_range(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: subr_i16_out_of_range:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #257
+; CHECK-NEXT:    mov z1.h, w8
+; CHECK-NEXT:    sub z0.h, z1.h, z0.h
+; CHECK-NEXT:    ret
+  %pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %elt = insertelement <vscale x 8 x i16> undef, i16 257, i32 0
+  %splat = shufflevector <vscale x 8 x i16> %elt, <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.subr.nxv8i16(<vscale x 8 x i1> %pg,
+                                                                <vscale x 8 x i16> %a,
+                                                                <vscale x 8 x i16> %splat)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @subr_i32(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: subr_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    subr z0.s, z0.s, #127
+; CHECK-NEXT:    ret
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %elt = insertelement <vscale x 4 x i32> undef, i32 127, i32 0
+  %splat = shufflevector <vscale x 4 x i32> %elt, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.subr.nxv4i32(<vscale x 4 x i1> %pg,
+                                                                <vscale x 4 x i32> %a,
+                                                                <vscale x 4 x i32> %splat)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 4 x i32> @subr_i32_out_of_range(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: subr_i32_out_of_range:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #257
+; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    sub z0.s, z1.s, z0.s
+; CHECK-NEXT:    ret
+  %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %elt = insertelement <vscale x 4 x i32> undef, i32 257, i32 0
+  %splat = shufflevector <vscale x 4 x i32> %elt, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.subr.nxv4i32(<vscale x 4 x i1> %pg,
+                                                                <vscale x 4 x i32> %a,
+                                                                <vscale x 4 x i32> %splat)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @subr_i64(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: subr_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    subr z0.d, z0.d, #127
+; CHECK-NEXT:    ret
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %elt = insertelement <vscale x 2 x i64> undef, i64 127, i64 0
+  %splat = shufflevector <vscale x 2 x i64> %elt, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.subr.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                <vscale x 2 x i64> %a,
+                                                                <vscale x 2 x i64> %splat)
+  ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 2 x i64> @subr_i64_out_of_range(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: subr_i64_out_of_range:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #257
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    sub z0.d, z1.d, z0.d
+; CHECK-NEXT:    ret
+  %pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %elt = insertelement <vscale x 2 x i64> undef, i64 257, i64 0
+  %splat = shufflevector <vscale x 2 x i64> %elt, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.subr.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                <vscale x 2 x i64> %a,
+                                                                <vscale x 2 x i64> %splat)
+  ret <vscale x 2 x i64> %out
+}
+
+; As subr_i32 but where pg is i8 based and thus compatible for i32.
+define <vscale x 4 x i32> @subr_i32_ptrue_all_b(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: subr_i32_ptrue_all_b:
+; CHECK: subr z0.s, z0.s, #1
+; CHECK-NEXT: ret
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.subr.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As subr_i32 but where pg is i16 based and thus compatible for i32.
+define <vscale x 4 x i32> @subr_i32_ptrue_all_h(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: subr_i32_ptrue_all_h:
+; CHECK: subr z0.s, z0.s, #1
+; CHECK-NEXT: ret
+  %pg.h = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg.h)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.subr.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
+; As subr_i32 but where pg is i64 based, which is not compatible for i32 and
+; thus inactive lanes are important and the immediate form cannot be used.
+define <vscale x 4 x i32> @subr_i32_ptrue_all_d(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: subr_i32_ptrue_all_d:
+; CHECK-DAG: ptrue [[PG:p[0-9]+]].d
+; CHECK-DAG: mov [[DUP:z[0-9]+]].s, #1
+; CHECK-DAG: subr z0.s, [[PG]]/m, z0.s, [[DUP]].s
+; CHECK-NEXT: ret
+  %pg.d = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %pg.d)
+  %pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
+  %b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = tail call <vscale x 4 x i32> @llvm.aarch64.sve.subr.nxv4i32(<vscale x 4 x i1> %pg.s,
+                                                                     <vscale x 4 x i32> %a,
+                                                                     <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %out
+}
+
 ; SMAX
 
 define <vscale x 16 x i8> @smax_i8(<vscale x 16 x i8> %a) {
@@ -1894,6 +2047,11 @@
 declare <vscale x 4 x i32> @llvm.aarch64.sve.sub.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
 declare <vscale x 2 x i64> @llvm.aarch64.sve.sub.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
 
+declare <vscale x 16 x i8> @llvm.aarch64.sve.subr.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.subr.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.subr.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.subr.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>)
+
 declare <vscale x 16 x i8> @llvm.aarch64.sve.sqadd.x.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
 declare <vscale x 8 x i16> @llvm.aarch64.sve.sqadd.x.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
 declare <vscale x 4 x i32> @llvm.aarch64.sve.sqadd.x.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)