Index: lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.h
+++ lib/Target/AArch64/AArch64ISelLowering.h
@@ -152,7 +152,11 @@
   UMINV,
   SMAXV,
   UMAXV,
-
+  // Vector min/max
+  SMIN,
+  SMAX,
+  UMIN,
+  UMAX,
   // Vector bitwise negation
   NOT,
 
@@ -181,7 +185,6 @@
   /// need to re-interpret the data in SIMD vector registers in big-endian
   /// mode without emitting such REV instructions.
   NVCAST,
-
   SMULL,
   UMULL,
 
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8610,21 +8610,77 @@
   return SDValue();
 }
 
-// vselect (v1i1 setcc) ->
-//     vselect (v1iXX setcc)  (XX is the size of the compared operand type)
-// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
-// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
-// such VSELECT.
+/// \brief Generate a vector min/max node:
+///   vselect (setcc lt x, y), x, y -> [s|u]min x, y
+///   vselect (setcc gt x, y), x, y -> [s|u]max x, y
+static SDValue vcombineMinNumMaxNum(SDLoc DL, EVT VT, SDValue &SetCondCode,
+                                    SDValue IfTrue, SDValue IfFalse,
+                                    SelectionDAG &DAG) {
+
+  assert(SetCondCode.getOpcode() == ISD::SETCC &&
+         "No set condition code operand");
+  if (!SetCondCode.hasOneUse())
+    return SDValue();
+  if (!VT.isVector())
+    return SDValue();
+  if (!VT.isInteger())
+    return SDValue();
+
+  ISD::CondCode CC = cast<CondCodeSDNode>(SetCondCode.getOperand(2))->get();
+  SDValue CCLHS = SetCondCode.getOperand(0);
+  SDValue CCRHS = SetCondCode.getOperand(1);
+
+  if (!(CCLHS == IfTrue && CCRHS == IfFalse) &&
+      !(CCLHS == IfFalse && CCRHS == IfTrue))
+    return SDValue();
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  unsigned Opcode;
+  switch (CC) {
+  case ISD::SETLT:
+  case ISD::SETLE: {
+    Opcode = (CCLHS == IfTrue) ? AArch64ISD::SMIN : AArch64ISD::SMAX;
+    if (TLI.isOperationLegalOrCustom(Opcode, VT))
+      return DAG.getNode(Opcode, DL, VT, CCLHS, CCRHS);
+    return SDValue();
+  }
+  case ISD::SETULT:
+  case ISD::SETULE: {
+    Opcode = (CCLHS == IfTrue) ? AArch64ISD::UMIN : AArch64ISD::UMAX;
+    if (TLI.isOperationLegalOrCustom(Opcode, VT))
+      return DAG.getNode(Opcode, DL, VT, CCLHS, CCRHS);
+    return SDValue();
+  }
+  case ISD::SETGT:
+  case ISD::SETGE: {
+    Opcode = (CCLHS == IfTrue) ? AArch64ISD::SMAX : AArch64ISD::SMIN;
+    if (TLI.isOperationLegalOrCustom(Opcode, VT))
+      return DAG.getNode(Opcode, DL, VT, CCLHS, CCRHS);
+    return SDValue();
+  }
+  case ISD::SETUGT:
+  case ISD::SETUGE: {
+    Opcode = (CCLHS == IfTrue) ? AArch64ISD::UMAX : AArch64ISD::UMIN;
+    if (TLI.isOperationLegalOrCustom(Opcode, VT))
+      return DAG.getNode(Opcode, DL, VT, CCLHS, CCRHS);
+    return SDValue();
+  }
+  default:
+    // Equality and unordered condition codes do not form a min/max.
+    return SDValue();
+  }
+}
+/// \brief Combine vselect and setcc.
 static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
   SDValue N0 = N->getOperand(0);
-  EVT CCVT = N0.getValueType();
-  if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 ||
-      CCVT.getVectorElementType() != MVT::i1)
+  if (N0.getOpcode() != ISD::SETCC)
     return SDValue();
 
   EVT ResVT = N->getValueType(0);
   EVT CmpVT = N0.getOperand(0).getValueType();
+
   // Only combine when the result type is of the same size as the compared
   // operands.
   if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
     return SDValue();
 
@@ -8632,6 +8688,24 @@
 
   SDValue IfTrue = N->getOperand(1);
   SDValue IfFalse = N->getOperand(2);
+
+  // Min/Max
+
+  SDValue MinMax =
+      vcombineMinNumMaxNum(SDLoc(N), ResVT, N0, IfTrue, IfFalse, DAG);
+  if (MinMax.getNode())
+    return MinMax;
+
+  // vselect (v1i1 setcc) ->
+  //     vselect (v1iXX setcc)  (XX is the size of the compared operand type)
+  // FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
+  // condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
+  // such VSELECT.
+  EVT CCVT = N0.getValueType();
+  if (CCVT.getVectorNumElements() != 1 ||
+      CCVT.getVectorElementType() != MVT::i1)
+    return SDValue();
+
   SDValue SetCC = DAG.getSetCC(SDLoc(N),
                                CmpVT.changeVectorElementTypeToInteger(),
                                N0.getOperand(0), N0.getOperand(1),
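For reference, this is the shape of IR the new combine targets (the function name below is illustrative, not part of the patch): an icmp feeding a select of the same two operands. Once the vectorizer emits this idiom, vcombineMinNumMaxNum should fold the resulting vselect/setcc pair into a single AArch64ISD::SMIN node, provided the setcc has a single use and the result is an integer vector of the same size as the compared operands.

define <4 x i32> @smin_4s(<4 x i32> %a, <4 x i32> %b) {
  ; vselect (setlt a, b), a, b -> smin a, b during DAG combining.
  %cmp = icmp slt <4 x i32> %a, %b
  %sel = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b
  ret <4 x i32> %sel
}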
Index: lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.td
+++ lib/Target/AArch64/AArch64InstrInfo.td
@@ -266,6 +266,10 @@
 def AArch64uminv : SDNode<"AArch64ISD::UMINV", SDT_AArch64UnaryVec>;
 def AArch64smaxv : SDNode<"AArch64ISD::SMAXV", SDT_AArch64UnaryVec>;
 def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>;
+def AArch64smin : SDNode<"AArch64ISD::SMIN", SDT_AArch64binvec>;
+def AArch64umin : SDNode<"AArch64ISD::UMIN", SDT_AArch64binvec>;
+def AArch64smax : SDNode<"AArch64ISD::SMAX", SDT_AArch64binvec>;
+def AArch64umax : SDNode<"AArch64ISD::UMAX", SDT_AArch64binvec>;
 
 //===----------------------------------------------------------------------===//
 
@@ -2827,6 +2831,66 @@
 def : Pat<(AArch64bsl (v2i64 V128:$Rd), V128:$Rn, V128:$Rm),
           (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
 
+// UMAX
+def : Pat<(v8i8 (AArch64umax (v8i8 V64:$Rn), (v8i8 V64:$Rm))),
+          (UMAXv8i8 V64:$Rn, V64:$Rm)>;
+def : Pat<(v4i16 (AArch64umax (v4i16 V64:$Rn), (v4i16 V64:$Rm))),
+          (UMAXv4i16 V64:$Rn, V64:$Rm)>;
+def : Pat<(v2i32 (AArch64umax (v2i32 V64:$Rn), (v2i32 V64:$Rm))),
+          (UMAXv2i32 V64:$Rn, V64:$Rm)>;
+
+def : Pat<(v16i8 (AArch64umax (v16i8 V128:$Rn), (v16i8 V128:$Rm))),
+          (UMAXv16i8 V128:$Rn, V128:$Rm)>;
+def : Pat<(v8i16 (AArch64umax (v8i16 V128:$Rn), (v8i16 V128:$Rm))),
+          (UMAXv8i16 V128:$Rn, V128:$Rm)>;
+def : Pat<(v4i32 (AArch64umax (v4i32 V128:$Rn), (v4i32 V128:$Rm))),
+          (UMAXv4i32 V128:$Rn, V128:$Rm)>;
+
+// UMIN
+def : Pat<(v8i8 (AArch64umin (v8i8 V64:$Rn), (v8i8 V64:$Rm))),
+          (UMINv8i8 V64:$Rn, V64:$Rm)>;
+def : Pat<(v4i16 (AArch64umin (v4i16 V64:$Rn), (v4i16 V64:$Rm))),
+          (UMINv4i16 V64:$Rn, V64:$Rm)>;
+def : Pat<(v2i32 (AArch64umin (v2i32 V64:$Rn), (v2i32 V64:$Rm))),
+          (UMINv2i32 V64:$Rn, V64:$Rm)>;
+
+def : Pat<(v16i8 (AArch64umin (v16i8 V128:$Rn), (v16i8 V128:$Rm))),
+          (UMINv16i8 V128:$Rn, V128:$Rm)>;
+def : Pat<(v8i16 (AArch64umin (v8i16 V128:$Rn), (v8i16 V128:$Rm))),
+          (UMINv8i16 V128:$Rn, V128:$Rm)>;
+def : Pat<(v4i32 (AArch64umin (v4i32 V128:$Rn), (v4i32 V128:$Rm))),
+          (UMINv4i32 V128:$Rn, V128:$Rm)>;
+
+// SMAX
+def : Pat<(v8i8 (AArch64smax (v8i8 V64:$Rn), (v8i8 V64:$Rm))),
+          (SMAXv8i8 V64:$Rn, V64:$Rm)>;
+def : Pat<(v4i16 (AArch64smax (v4i16 V64:$Rn), (v4i16 V64:$Rm))),
+          (SMAXv4i16 V64:$Rn, V64:$Rm)>;
+def : Pat<(v2i32 (AArch64smax (v2i32 V64:$Rn), (v2i32 V64:$Rm))),
+          (SMAXv2i32 V64:$Rn, V64:$Rm)>;
+
+def : Pat<(v16i8 (AArch64smax (v16i8 V128:$Rn), (v16i8 V128:$Rm))),
+          (SMAXv16i8 V128:$Rn, V128:$Rm)>;
+def : Pat<(v8i16 (AArch64smax (v8i16 V128:$Rn), (v8i16 V128:$Rm))),
+          (SMAXv8i16 V128:$Rn, V128:$Rm)>;
+def : Pat<(v4i32 (AArch64smax (v4i32 V128:$Rn), (v4i32 V128:$Rm))),
+          (SMAXv4i32 V128:$Rn, V128:$Rm)>;
+
+// SMIN
+def : Pat<(v8i8 (AArch64smin (v8i8 V64:$Rn), (v8i8 V64:$Rm))),
+          (SMINv8i8 V64:$Rn, V64:$Rm)>;
+def : Pat<(v4i16 (AArch64smin (v4i16 V64:$Rn), (v4i16 V64:$Rm))),
+          (SMINv4i16 V64:$Rn, V64:$Rm)>;
+def : Pat<(v2i32 (AArch64smin (v2i32 V64:$Rn), (v2i32 V64:$Rm))),
+          (SMINv2i32 V64:$Rn, V64:$Rm)>;
+
+def : Pat<(v16i8 (AArch64smin (v16i8 V128:$Rn), (v16i8 V128:$Rm))),
+          (SMINv16i8 V128:$Rn, V128:$Rm)>;
+def : Pat<(v8i16 (AArch64smin (v8i16 V128:$Rn), (v8i16 V128:$Rm))),
+          (SMINv8i16 V128:$Rn, V128:$Rm)>;
+def : Pat<(v4i32 (AArch64smin (v4i32 V128:$Rn), (v4i32 V128:$Rm))),
+          (SMINv4i32 V128:$Rn, V128:$Rm)>;
+
 def : InstAlias<"mov{\t$dst.16b, $src.16b|.16b\t$dst, $src}",
                 (ORRv16i8 V128:$dst, V128:$src, V128:$src), 1>;
 def : InstAlias<"mov{\t$dst.8h, $src.8h|.8h\t$dst, $src}",
                 (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>;
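The patterns above map each new node onto the existing NEON min/max instruction definitions for both the 64-bit and 128-bit register classes. As a sketch of the expected end-to-end result (expected output, not verified here): the unsigned variant below should now select UMAXv4i32 and emit a single "umax v0.4s, v0.4s, v1.4s", where previously the vselect would typically lower to a compare and bit-select sequence.

define <4 x i32> @umax_4s(<4 x i32> %a, <4 x i32> %b) {
  ; Expected to lower to: umax v0.4s, v0.4s, v1.4s
  %cmp = icmp ugt <4 x i32> %a, %b
  %sel = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b
  ret <4 x i32> %sel
}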
Index: test/CodeGen/AArch64/aarch64-vmin-vmax-opt.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/aarch64-vmin-vmax-opt.ll
@@ -0,0 +1,66 @@
+; RUN: llc < %s -mtriple=arm64-none-linux-gnu -o - | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios8.0.0"
+
+@b = common global [10000 x i32] zeroinitializer, align 4
+@c = common global [10000 x i32] zeroinitializer, align 4
+@m = common global [10000 x i32] zeroinitializer, align 4
+
+; Function Attrs: nounwind ssp
+define void @foo() {
+; CHECK-LABEL: foo
+; CHECK: smax v[[OUT:[0-9]+]].4s, v[[IN1:[0-9]+]].4s, v[[IN2:[0-9]+]].4s
+; CHECK: smin v[[OUT:[0-9]+]].4s, v[[IN1:[0-9]+]].4s, v[[IN2:[0-9]+]].4s
+; CHECK: umax v[[OUT:[0-9]+]].4s, v[[IN1:[0-9]+]].4s, v[[IN2:[0-9]+]].4s
+; CHECK: umin v[[OUT:[0-9]+]].4s, v[[IN1:[0-9]+]].4s, v[[IN2:[0-9]+]].4s
+entry:
+  %index = add i64 1, 0
+  %broadcast.splatinsert = insertelement <4 x i64> undef, i64 %index, i32 0
+  %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer
+  %induction = add <4 x i64> %broadcast.splat, <i64 0, i64 1, i64 2, i64 3>
+  %0 = add nsw <4 x i64> %induction, <i64 1, i64 1, i64 1, i64 1>
+  %1 = extractelement <4 x i64> %0, i32 0
+  %2 = getelementptr inbounds [10000 x i32], [10000 x i32]* @b, i64 0, i64 %1
+  %3 = getelementptr i32, i32* %2, i32 0
+  %4 = bitcast i32* %3 to <4 x i32>*
+  %wide.load46 = load <4 x i32>, <4 x i32>* %4, align 4
+  %5 = getelementptr inbounds [10000 x i32], [10000 x i32]* @c, i64 0, i64 %1
+  %6 = getelementptr i32, i32* %5, i32 0
+  %7 = bitcast i32* %6 to <4 x i32>*
+  %wide.load47 = load <4 x i32>, <4 x i32>* %7, align 4
+  %8 = add nsw <4 x i32> %wide.load46, %wide.load47
+  %9 = getelementptr i32, i32* %2, i32 0
+  %10 = bitcast i32* %9 to <4 x i32>*
+  %wide.load = load <4 x i32>, <4 x i32>* %10, align 4
+  %11 = add nsw <4 x i32> %wide.load46, %wide.load
+  %12 = icmp sgt <4 x i32> %8, %11
+  %13 = select <4 x i1> %12, <4 x i32> %8, <4 x i32> %11
+  %i0 = extractelement <4 x i64> %induction, i32 0
+  %melti0 = getelementptr inbounds [10000 x i32], [10000 x i32]* @m, i64 0, i64 %i0
+  %14 = getelementptr i32, i32* %melti0, i32 0
+  %15 = bitcast i32* %14 to <4 x i32>*
+  store <4 x i32> %13, <4 x i32>* %15, align 4
+  %16 = icmp slt <4 x i32> %8, %11
+  %17 = select <4 x i1> %16, <4 x i32> %8, <4 x i32> %11
+  %i1 = extractelement <4 x i64> %induction, i32 1
+  %melti1 = getelementptr inbounds [10000 x i32], [10000 x i32]* @m, i64 0, i64 %i1
+  %s1 = getelementptr i32, i32* %melti1, i32 0
+  %s11 = bitcast i32* %s1 to <4 x i32>*
+  store <4 x i32> %17, <4 x i32>* %s11, align 4
+  %18 = icmp ugt <4 x i32> %8, %11
+  %19 = select <4 x i1> %18, <4 x i32> %8, <4 x i32> %11
+  %i2 = extractelement <4 x i64> %induction, i32 2
+  %melti2 = getelementptr inbounds [10000 x i32], [10000 x i32]* @m, i64 0, i64 %i2
+  %s2 = getelementptr i32, i32* %melti2, i32 0
+  %s22 = bitcast i32* %s2 to <4 x i32>*
+  store <4 x i32> %19, <4 x i32>* %s22, align 4
+  %20 = icmp ult <4 x i32> %8, %11
+  %21 = select <4 x i1> %20, <4 x i32> %8, <4 x i32> %11
+  %i3 = extractelement <4 x i64> %induction, i32 3
+  %melti3 = getelementptr inbounds [10000 x i32], [10000 x i32]* @m, i64 0, i64 %i3
+  %s3 = getelementptr i32, i32* %melti3, i32 0
+  %s33 = bitcast i32* %s3 to <4 x i32>*
+  store <4 x i32> %21, <4 x i32>* %s33, align 4
+  ret void
+}
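The test above only exercises the .4s forms through a vectorized-loop skeleton. A more direct check for one of the 64-bit register forms could look like the following sketch (a separate hypothetical test, not part of this patch):

; RUN: llc < %s -mtriple=arm64-none-linux-gnu -o - | FileCheck %s
define <8 x i8> @umin_8b(<8 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: umin_8b
; CHECK: umin v0.8b, v0.8b, v1.8b
  %cmp = icmp ult <8 x i8> %a, %b
  %sel = select <8 x i1> %cmp, <8 x i8> %a, <8 x i8> %b
  ret <8 x i8> %sel
}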