Index: lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- lib/Target/AArch64/AArch64ISelLowering.cpp +++ lib/Target/AArch64/AArch64ISelLowering.cpp @@ -8614,13 +8614,9 @@ unsigned Op, SelectionDAG &DAG) { EVT VTy = OpV->getOperand(0).getValueType(); - if (!VTy.isVector()) - return SDValue(); + assert(VTy.isVector() && "Expected a vector type"); int NumVecElts = VTy.getVectorNumElements(); - if (NumVecElts != 4 && NumVecElts != 8 && NumVecElts != 16) - return SDValue(); - int NumExpectedSteps = APInt(8, NumVecElts).logBase2(); SDValue PreOp = OpV; // Iterate over each step of the across vector reduction. @@ -8754,7 +8750,10 @@ return SDValue(); EVT EltTy = VTy.getVectorElementType(); - if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8) + int NumVecElts = VTy.getVectorNumElements(); + if (!((EltTy == MVT::i32 && NumVecElts == 4) || + (EltTy == MVT::i16 && (NumVecElts == 4 || NumVecElts == 8)) || + (EltTy == MVT::i8 && (NumVecElts == 8 || NumVecElts == 16)))) return SDValue(); // Check if extracting from the same vector. @@ -8828,7 +8827,10 @@ return SDValue(); EVT EltTy = VTy.getVectorElementType(); - if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8) + int NumVecElts = VTy.getVectorNumElements(); + if (!((EltTy == MVT::i32 && NumVecElts == 4) || + (EltTy == MVT::i16 && (NumVecElts == 4 || NumVecElts == 8)) || + (EltTy == MVT::i8 && (NumVecElts == 8 || NumVecElts == 16)))) return SDValue(); return tryMatchAcrossLaneShuffleForReduction(N, N0, ISD::ADD, DAG); Index: test/CodeGen/AArch64/aarch64-addv.ll =================================================================== --- test/CodeGen/AArch64/aarch64-addv.ll +++ test/CodeGen/AArch64/aarch64-addv.ll @@ -1,7 +1,21 @@ ; RUN: llc -march=aarch64 -aarch64-neon-syntax=generic < %s | FileCheck %s -define i8 @add_B(<16 x i8>* %arr) { -; CHECK-LABEL: add_B +define i8 @add_8B(<8 x i8>* %arr) { +; CHECK-LABEL: add_8B +; CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.8b + %bin.rdx = load <8 x i8>, <8 x i8>* %arr + %rdx.shuf = shufflevector <8 x i8> %bin.rdx, <8 x i8> undef, <8 x i32> + %bin.rdx11 = add <8 x i8> %bin.rdx, %rdx.shuf + %rdx.shuf12 = shufflevector <8 x i8> %bin.rdx11, <8 x i8> undef, <8 x i32> + %bin.rdx13 = add <8 x i8> %bin.rdx11, %rdx.shuf12 + %rdx.shuf13 = shufflevector <8 x i8> %bin.rdx13, <8 x i8> undef, <8 x i32> + %bin.rdx14 = add <8 x i8> %bin.rdx13, %rdx.shuf13 + %r = extractelement <8 x i8> %bin.rdx14, i32 0 + ret i8 %r +} + +define i8 @add_16B(<16 x i8>* %arr) { +; CHECK-LABEL: add_16B ; CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.16b %bin.rdx = load <16 x i8>, <16 x i8>* %arr %rdx.shuf0 = shufflevector <16 x i8> %bin.rdx, <16 x i8> undef, <16 x i32> @@ -16,8 +30,39 @@ ret i8 %r } -define i16 @add_H(<8 x i16>* %arr) { -; CHECK-LABEL: add_H +define i8 @add_32B(<32 x i8>* %arr) { +; CHECK-LABEL: add_32B +; CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.16b + %bin.rdx = load <32 x i8>, <32 x i8>* %arr + %rdx.shuf00 = shufflevector <32 x i8> %bin.rdx, <32 x i8> undef, <32 x i32> + %bin.rdx00 = add <32 x i8> %bin.rdx, %rdx.shuf00 + %rdx.shuf0 = shufflevector <32 x i8> %bin.rdx00, <32 x i8> undef, <32 x i32> + %bin.rdx0 = add <32 x i8> %bin.rdx00, %rdx.shuf0 + %rdx.shuf = shufflevector <32 x i8> %bin.rdx0, <32 x i8> undef, <32 x i32> + %bin.rdx11 = add <32 x i8> %bin.rdx0, %rdx.shuf + %rdx.shuf12 = shufflevector <32 x i8> %bin.rdx11, <32 x i8> undef, <32 x i32> + %bin.rdx13 = add <32 x i8> %bin.rdx11, %rdx.shuf12 + %rdx.shuf13 = shufflevector <32 x i8> %bin.rdx13, <32 x i8> undef, <32 x i32> + %bin.rdx14 = add <32 x i8> %bin.rdx13, %rdx.shuf13 + %r = extractelement <32 x i8> %bin.rdx14, i32 0 + ret i8 %r +} + +define i16 @add_4H( <4 x i16>* %arr) { +; CHECK-LABEL: add_4H +; CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.4h +entry: + %bin.rdx = load <4 x i16>, <4 x i16>* %arr + %rdx.shuf = shufflevector <4 x i16> %bin.rdx, <4 x i16> undef, <4 x i32> + %bin.rdx11 = add <4 x i16> %bin.rdx, %rdx.shuf + %rdx.shuf12 = shufflevector <4 x i16> %bin.rdx11, <4 x i16> undef, <4 x i32> + %bin.rdx13 = add <4 x i16> %bin.rdx11, %rdx.shuf12 + %r = extractelement <4 x i16> %bin.rdx13, i32 0 + ret i16 %r +} + +define i16 @add_8H(<8 x i16>* %arr) { +; CHECK-LABEL: add_8H ; CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.8h %bin.rdx = load <8 x i16>, <8 x i16>* %arr %rdx.shuf = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> @@ -30,8 +75,34 @@ ret i16 %r } -define i32 @add_S( <4 x i32>* %arr) { -; CHECK-LABEL: add_S +define i16 @add_16H(<16 x i16>* %arr) { +; CHECK-LABEL: add_16H +; CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.8h + %bin.rdx = load <16 x i16>, <16 x i16>* %arr + %rdx.shuf0 = shufflevector <16 x i16> %bin.rdx, <16 x i16> undef, <16 x i32> + %bin.rdx0 = add <16 x i16> %bin.rdx, %rdx.shuf0 + %rdx.shuf = shufflevector <16 x i16> %bin.rdx0, <16 x i16> undef, <16 x i32> + %bin.rdx11 = add <16 x i16> %bin.rdx0, %rdx.shuf + %rdx.shuf12 = shufflevector <16 x i16> %bin.rdx11, <16 x i16> undef, <16 x i32> + %bin.rdx13 = add <16 x i16> %bin.rdx11, %rdx.shuf12 + %rdx.shuf13 = shufflevector <16 x i16> %bin.rdx13, <16 x i16> undef, <16 x i32> + %bin.rdx14 = add <16 x i16> %bin.rdx13, %rdx.shuf13 + %r = extractelement <16 x i16> %bin.rdx14, i32 0 + ret i16 %r +} + +; CHECK-LABEL: add_2S +; CHECK-NOT: addv +define i32 @add_2S(<2 x i32>* %arr) { + %bin.rdx = load <2 x i32>, <2 x i32>* %arr + %rdx.shuf0 = shufflevector <2 x i32> %bin.rdx, <2 x i32> undef, <2 x i32> + %bin.rdx0 = add <2 x i32> %bin.rdx, %rdx.shuf0 + %r = extractelement <2 x i32> %bin.rdx0, i32 0 + ret i32 %r +} + +define i32 @add_4S( <4 x i32>* %arr) { +; CHECK-LABEL: add_4S ; CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s %bin.rdx = load <4 x i32>, <4 x i32>* %arr %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> @@ -42,8 +113,22 @@ ret i32 %r } -define i64 @add_D(<2 x i64>* %arr) { -; CHECK-LABEL: add_D +define i32 @add_8S(<8 x i32>* %arr) { +; CHECK-LABEL: add_8S +; CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s + %bin.rdx = load <8 x i32>, <8 x i32>* %arr + %rdx.shuf = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> + %bin.rdx11 = add <8 x i32> %bin.rdx, %rdx.shuf + %rdx.shuf12 = shufflevector <8 x i32> %bin.rdx11, <8 x i32> undef, <8 x i32> + %bin.rdx13 = add <8 x i32> %bin.rdx11, %rdx.shuf12 + %rdx.shuf13 = shufflevector <8 x i32> %bin.rdx13, <8 x i32> undef, <8 x i32> + %bin.rdx14 = add <8 x i32> %bin.rdx13, %rdx.shuf13 + %r = extractelement <8 x i32> %bin.rdx14, i32 0 + ret i32 %r +} + +define i64 @add_2D(<2 x i64>* %arr) { +; CHECK-LABEL: add_2D ; CHECK-NOT: addv %bin.rdx = load <2 x i64>, <2 x i64>* %arr %rdx.shuf0 = shufflevector <2 x i64> %bin.rdx, <2 x i64> undef, <2 x i32> Index: test/CodeGen/AArch64/aarch64-minmaxv.ll =================================================================== --- test/CodeGen/AArch64/aarch64-minmaxv.ll +++ test/CodeGen/AArch64/aarch64-minmaxv.ll @@ -3,9 +3,29 @@ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" target triple = "aarch64-linu--gnu" -; CHECK-LABEL: smax_B + +; CHECK-LABEL: smax_8B +; CHECK: smaxv {{b[0-9]+}}, {{v[0-9]+}}.8b +define i8 @smax_8B(<8 x i8>* nocapture readonly %arr) { + %rdx.minmax.select = load <8 x i8>, <8 x i8>* %arr + %rdx.shuf = shufflevector <8 x i8> %rdx.minmax.select, <8 x i8> undef, <8 x i32> + %rdx.minmax.cmp23 = icmp sgt <8 x i8> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i8> %rdx.minmax.select, <8 x i8> %rdx.shuf + %rdx.shuf25 = shufflevector <8 x i8> %rdx.minmax.select24, <8 x i8> undef, <8 x i32> + %rdx.minmax.cmp26 = icmp sgt <8 x i8> %rdx.minmax.select24, %rdx.shuf25 + %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i8> %rdx.minmax.select24, <8 x i8> %rdx.shuf25 + %rdx.shuf28 = shufflevector <8 x i8> %rdx.minmax.select27, <8 x i8> undef, <8 x i32> + %rdx.minmax.cmp29 = icmp sgt <8 x i8> %rdx.minmax.select27, %rdx.shuf28 + %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0 + %rdx.minmax.select27.elt = extractelement <8 x i8> %rdx.minmax.select27, i32 0 + %rdx.shuf28.elt = extractelement <8 x i8> %rdx.minmax.select27, i32 1 + %r = select i1 %rdx.minmax.cmp29.elt, i8 %rdx.minmax.select27.elt, i8 %rdx.shuf28.elt + ret i8 %r +} + +; CHECK-LABEL: smax_16B ; CHECK: smaxv {{b[0-9]+}}, {{v[0-9]+}}.16b -define i8 @smax_B(<16 x i8>* nocapture readonly %arr) { +define i8 @smax_16B(<16 x i8>* nocapture readonly %arr) { %arr.load = load <16 x i8>, <16 x i8>* %arr %rdx.shuf = shufflevector <16 x i8> %arr.load, <16 x i8> undef, <16 x i32> %rdx.minmax.cmp22 = icmp sgt <16 x i8> %arr.load, %rdx.shuf @@ -25,9 +45,25 @@ ret i8 %r } -; CHECK-LABEL: smax_H +; CHECK-LABEL: smax_4H +; CHECK: smaxv {{h[0-9]+}}, {{v[0-9]+}}.4h +define i16 @smax_4H(<4 x i16> * nocapture readonly %arr) { + %rdx.minmax.select = load <4 x i16>, <4 x i16>* %arr + %rdx.shuf = shufflevector <4 x i16> %rdx.minmax.select, <4 x i16> undef, <4 x i32> + %rdx.minmax.cmp18 = icmp sgt <4 x i16> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select19 = select <4 x i1> %rdx.minmax.cmp18, <4 x i16> %rdx.minmax.select, <4 x i16> %rdx.shuf + %rdx.shuf20 = shufflevector <4 x i16> %rdx.minmax.select19, <4 x i16> undef, <4 x i32> + %rdx.minmax.cmp21 = icmp sgt <4 x i16> %rdx.minmax.select19, %rdx.shuf20 + %rdx.minmax.cmp21.elt = extractelement <4 x i1> %rdx.minmax.cmp21, i32 0 + %rdx.minmax.select19.elt = extractelement <4 x i16> %rdx.minmax.select19, i32 0 + %rdx.shuf20.elt = extractelement <4 x i16> %rdx.minmax.select19, i32 1 + %r = select i1 %rdx.minmax.cmp21.elt, i16 %rdx.minmax.select19.elt, i16 %rdx.shuf20.elt + ret i16 %r +} + +; CHECK-LABEL: smax_8H ; CHECK: smaxv {{h[0-9]+}}, {{v[0-9]+}}.8h -define i16 @smax_H(<8 x i16>* nocapture readonly %arr) { +define i16 @smax_8H(<8 x i16>* nocapture readonly %arr) { %rdx.minmax.select = load <8 x i16>, <8 x i16>* %arr %rdx.shuf = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> %rdx.minmax.cmp23 = icmp sgt <8 x i16> %rdx.minmax.select, %rdx.shuf @@ -44,9 +80,22 @@ ret i16 %r } -; CHECK-LABEL: smax_S +; CHECK-LABEL: smax_2S +; CHECK-NOT: smaxv +define i32 @smax_2S(<2 x i32>* nocapture readonly %arr) { + %rdx.minmax.select = load <2 x i32>, <2 x i32>* %arr + %rdx.shuf = shufflevector <2 x i32> %rdx.minmax.select, <2 x i32> undef, <2 x i32> + %rdx.minmax.cmp18 = icmp sgt <2 x i32> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.cmp18.elt = extractelement <2 x i1> %rdx.minmax.cmp18, i32 0 + %rdx.minmax.select.elt = extractelement <2 x i32> %rdx.minmax.select, i32 0 + %rdx.shuf.elt = extractelement <2 x i32> %rdx.minmax.select, i32 1 + %r = select i1 %rdx.minmax.cmp18.elt, i32 %rdx.minmax.select.elt, i32 %rdx.shuf.elt + ret i32 %r +} + +; CHECK-LABEL: smax_4S ; CHECK: smaxv {{s[0-9]+}}, {{v[0-9]+}}.4s -define i32 @smax_S(<4 x i32> * nocapture readonly %arr) { +define i32 @smax_4S(<4 x i32> * nocapture readonly %arr) { %rdx.minmax.select = load <4 x i32>, <4 x i32>* %arr %rdx.shuf = shufflevector <4 x i32> %rdx.minmax.select, <4 x i32> undef, <4 x i32> %rdx.minmax.cmp18 = icmp sgt <4 x i32> %rdx.minmax.select, %rdx.shuf @@ -60,9 +109,28 @@ ret i32 %r } -; CHECK-LABEL: smax_D +; CHECK-LABEL: smax_8S +; CHECK-NOT: smaxv +define i32 @smax_8S(<8 x i32>* nocapture readonly %arr) { + %rdx.minmax.select = load <8 x i32>, <8 x i32>* %arr + %rdx.shuf = shufflevector <8 x i32> %rdx.minmax.select, <8 x i32> undef, <8 x i32> + %rdx.minmax.cmp23 = icmp sgt <8 x i32> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i32> %rdx.minmax.select, <8 x i32> %rdx.shuf + %rdx.shuf25 = shufflevector <8 x i32> %rdx.minmax.select24, <8 x i32> undef, <8 x i32> + %rdx.minmax.cmp26 = icmp sgt <8 x i32> %rdx.minmax.select24, %rdx.shuf25 + %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i32> %rdx.minmax.select24, <8 x i32> %rdx.shuf25 + %rdx.shuf28 = shufflevector <8 x i32> %rdx.minmax.select27, <8 x i32> undef, <8 x i32> + %rdx.minmax.cmp29 = icmp sgt <8 x i32> %rdx.minmax.select27, %rdx.shuf28 + %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0 + %rdx.minmax.select27.elt = extractelement <8 x i32> %rdx.minmax.select27, i32 0 + %rdx.shuf28.elt = extractelement <8 x i32> %rdx.minmax.select27, i32 1 + %r = select i1 %rdx.minmax.cmp29.elt, i32 %rdx.minmax.select27.elt, i32 %rdx.shuf28.elt + ret i32 %r +} + +; CHECK-LABEL: smax_2D ; CHECK-NOT: smaxv -define i64 @smax_D(<2 x i64>* nocapture readonly %arr) { +define i64 @smax_2D(<2 x i64>* nocapture readonly %arr) { %rdx.minmax.select = load <2 x i64>, <2 x i64>* %arr %rdx.shuf = shufflevector <2 x i64> %rdx.minmax.select, <2 x i64> undef, <2 x i32> %rdx.minmax.cmp18 = icmp sgt <2 x i64> %rdx.minmax.select, %rdx.shuf @@ -73,10 +141,28 @@ ret i64 %r } +; CHECK-LABEL: umax_8B +; CHECK: umaxv {{b[0-9]+}}, {{v[0-9]+}}.8b +define i8 @umax_8B(<8 x i8>* nocapture readonly %arr) { + %rdx.minmax.select = load <8 x i8>, <8 x i8>* %arr + %rdx.shuf = shufflevector <8 x i8> %rdx.minmax.select, <8 x i8> undef, <8 x i32> + %rdx.minmax.cmp23 = icmp ugt <8 x i8> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i8> %rdx.minmax.select, <8 x i8> %rdx.shuf + %rdx.shuf25 = shufflevector <8 x i8> %rdx.minmax.select24, <8 x i8> undef, <8 x i32> + %rdx.minmax.cmp26 = icmp ugt <8 x i8> %rdx.minmax.select24, %rdx.shuf25 + %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i8> %rdx.minmax.select24, <8 x i8> %rdx.shuf25 + %rdx.shuf28 = shufflevector <8 x i8> %rdx.minmax.select27, <8 x i8> undef, <8 x i32> + %rdx.minmax.cmp29 = icmp ugt <8 x i8> %rdx.minmax.select27, %rdx.shuf28 + %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0 + %rdx.minmax.select27.elt = extractelement <8 x i8> %rdx.minmax.select27, i32 0 + %rdx.shuf28.elt = extractelement <8 x i8> %rdx.minmax.select27, i32 1 + %r = select i1 %rdx.minmax.cmp29.elt, i8 %rdx.minmax.select27.elt, i8 %rdx.shuf28.elt + ret i8 %r +} -; CHECK-LABEL: umax_B +; CHECK-LABEL: umax_16B ; CHECK: umaxv {{b[0-9]+}}, {{v[0-9]+}}.16b -define i8 @umax_B(<16 x i8>* nocapture readonly %arr) { +define i8 @umax_16B(<16 x i8>* nocapture readonly %arr) { %rdx.minmax.select = load <16 x i8>, <16 x i8>* %arr %rdx.shuf = shufflevector <16 x i8> %rdx.minmax.select, <16 x i8> undef, <16 x i32> %rdx.minmax.cmp22 = icmp ugt <16 x i8> %rdx.minmax.select, %rdx.shuf @@ -96,9 +182,26 @@ ret i8 %r } -; CHECK-LABEL: umax_H + +; CHECK-LABEL: umax_4H +; CHECK: umaxv {{h[0-9]+}}, {{v[0-9]+}}.4h +define i16 @umax_4H(<4 x i16> * nocapture readonly %arr) { + %rdx.minmax.select = load <4 x i16>, <4 x i16>* %arr + %rdx.shuf = shufflevector <4 x i16> %rdx.minmax.select, <4 x i16> undef, <4 x i32> + %rdx.minmax.cmp18 = icmp ugt <4 x i16> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select19 = select <4 x i1> %rdx.minmax.cmp18, <4 x i16> %rdx.minmax.select, <4 x i16> %rdx.shuf + %rdx.shuf20 = shufflevector <4 x i16> %rdx.minmax.select19, <4 x i16> undef, <4 x i32> + %rdx.minmax.cmp21 = icmp ugt <4 x i16> %rdx.minmax.select19, %rdx.shuf20 + %rdx.minmax.cmp21.elt = extractelement <4 x i1> %rdx.minmax.cmp21, i32 0 + %rdx.minmax.select19.elt = extractelement <4 x i16> %rdx.minmax.select19, i32 0 + %rdx.shuf20.elt = extractelement <4 x i16> %rdx.minmax.select19, i32 1 + %r = select i1 %rdx.minmax.cmp21.elt, i16 %rdx.minmax.select19.elt, i16 %rdx.shuf20.elt + ret i16 %r +} + +; CHECK-LABEL: umax_8H ; CHECK: umaxv {{h[0-9]+}}, {{v[0-9]+}}.8h -define i16 @umax_H(<8 x i16>* nocapture readonly %arr) { +define i16 @umax_8H(<8 x i16>* nocapture readonly %arr) { %rdx.minmax.select = load <8 x i16>, <8 x i16>* %arr %rdx.shuf = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> %rdx.minmax.cmp23 = icmp ugt <8 x i16> %rdx.minmax.select, %rdx.shuf @@ -115,9 +218,22 @@ ret i16 %r } -; CHECK-LABEL: umax_S +; CHECK-LABEL: umax_2S +; CHECK-NOT: umaxv +define i32 @umax_2S(<2 x i32>* nocapture readonly %arr) { + %rdx.minmax.select = load <2 x i32>, <2 x i32>* %arr + %rdx.shuf = shufflevector <2 x i32> %rdx.minmax.select, <2 x i32> undef, <2 x i32> + %rdx.minmax.cmp18 = icmp ugt <2 x i32> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.cmp18.elt = extractelement <2 x i1> %rdx.minmax.cmp18, i32 0 + %rdx.minmax.select.elt = extractelement <2 x i32> %rdx.minmax.select, i32 0 + %rdx.shuf.elt = extractelement <2 x i32> %rdx.minmax.select, i32 1 + %r = select i1 %rdx.minmax.cmp18.elt, i32 %rdx.minmax.select.elt, i32 %rdx.shuf.elt + ret i32 %r +} + +; CHECK-LABEL: umax_4S ; CHECK: umaxv {{s[0-9]+}}, {{v[0-9]+}}.4s -define i32 @umax_S(<4 x i32>* nocapture readonly %arr) { +define i32 @umax_4S(<4 x i32>* nocapture readonly %arr) { %rdx.minmax.select = load <4 x i32>, <4 x i32>* %arr %rdx.shuf = shufflevector <4 x i32> %rdx.minmax.select, <4 x i32> undef, <4 x i32> %rdx.minmax.cmp18 = icmp ugt <4 x i32> %rdx.minmax.select, %rdx.shuf @@ -131,9 +247,28 @@ ret i32 %r } -; CHECK-LABEL: umax_D +; CHECK-LABEL: umax_8S +; CHECK-NOT: umaxv +define i32 @umax_8S(<8 x i32>* nocapture readonly %arr) { + %rdx.minmax.select = load <8 x i32>, <8 x i32>* %arr + %rdx.shuf = shufflevector <8 x i32> %rdx.minmax.select, <8 x i32> undef, <8 x i32> + %rdx.minmax.cmp23 = icmp ugt <8 x i32> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i32> %rdx.minmax.select, <8 x i32> %rdx.shuf + %rdx.shuf25 = shufflevector <8 x i32> %rdx.minmax.select24, <8 x i32> undef, <8 x i32> + %rdx.minmax.cmp26 = icmp ugt <8 x i32> %rdx.minmax.select24, %rdx.shuf25 + %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i32> %rdx.minmax.select24, <8 x i32> %rdx.shuf25 + %rdx.shuf28 = shufflevector <8 x i32> %rdx.minmax.select27, <8 x i32> undef, <8 x i32> + %rdx.minmax.cmp29 = icmp ugt <8 x i32> %rdx.minmax.select27, %rdx.shuf28 + %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0 + %rdx.minmax.select27.elt = extractelement <8 x i32> %rdx.minmax.select27, i32 0 + %rdx.shuf28.elt = extractelement <8 x i32> %rdx.minmax.select27, i32 1 + %r = select i1 %rdx.minmax.cmp29.elt, i32 %rdx.minmax.select27.elt, i32 %rdx.shuf28.elt + ret i32 %r +} + +; CHECK-LABEL: umax_2D ; CHECK-NOT: umaxv -define i64 @umax_D(<2 x i64>* nocapture readonly %arr) { +define i64 @umax_2D(<2 x i64>* nocapture readonly %arr) { %rdx.minmax.select = load <2 x i64>, <2 x i64>* %arr %rdx.shuf = shufflevector <2 x i64> %rdx.minmax.select, <2 x i64> undef, <2 x i32> %rdx.minmax.cmp18 = icmp ugt <2 x i64> %rdx.minmax.select, %rdx.shuf @@ -144,10 +279,28 @@ ret i64 %r } +; CHECK-LABEL: smin_8B +; CHECK: sminv {{b[0-9]+}}, {{v[0-9]+}}.8b +define i8 @smin_8B(<8 x i8>* nocapture readonly %arr) { + %rdx.minmax.select = load <8 x i8>, <8 x i8>* %arr + %rdx.shuf = shufflevector <8 x i8> %rdx.minmax.select, <8 x i8> undef, <8 x i32> + %rdx.minmax.cmp23 = icmp slt <8 x i8> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i8> %rdx.minmax.select, <8 x i8> %rdx.shuf + %rdx.shuf25 = shufflevector <8 x i8> %rdx.minmax.select24, <8 x i8> undef, <8 x i32> + %rdx.minmax.cmp26 = icmp slt <8 x i8> %rdx.minmax.select24, %rdx.shuf25 + %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i8> %rdx.minmax.select24, <8 x i8> %rdx.shuf25 + %rdx.shuf28 = shufflevector <8 x i8> %rdx.minmax.select27, <8 x i8> undef, <8 x i32> + %rdx.minmax.cmp29 = icmp slt <8 x i8> %rdx.minmax.select27, %rdx.shuf28 + %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0 + %rdx.minmax.select27.elt = extractelement <8 x i8> %rdx.minmax.select27, i32 0 + %rdx.shuf28.elt = extractelement <8 x i8> %rdx.minmax.select27, i32 1 + %r = select i1 %rdx.minmax.cmp29.elt, i8 %rdx.minmax.select27.elt, i8 %rdx.shuf28.elt + ret i8 %r +} -; CHECK-LABEL: smin_B +; CHECK-LABEL: smin_16B ; CHECK: sminv {{b[0-9]+}}, {{v[0-9]+}}.16b -define i8 @smin_B(<16 x i8>* nocapture readonly %arr) { +define i8 @smin_16B(<16 x i8>* nocapture readonly %arr) { %rdx.minmax.select = load <16 x i8>, <16 x i8>* %arr %rdx.shuf = shufflevector <16 x i8> %rdx.minmax.select, <16 x i8> undef, <16 x i32> %rdx.minmax.cmp22 = icmp slt <16 x i8> %rdx.minmax.select, %rdx.shuf @@ -167,9 +320,25 @@ ret i8 %r } -; CHECK-LABEL: smin_H +; CHECK-LABEL: smin_4H +; CHECK: sminv {{h[0-9]+}}, {{v[0-9]+}}.4h +define i16 @smin_4H(<4 x i16> * nocapture readonly %arr) { + %rdx.minmax.select = load <4 x i16>, <4 x i16>* %arr + %rdx.shuf = shufflevector <4 x i16> %rdx.minmax.select, <4 x i16> undef, <4 x i32> + %rdx.minmax.cmp18 = icmp slt <4 x i16> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select19 = select <4 x i1> %rdx.minmax.cmp18, <4 x i16> %rdx.minmax.select, <4 x i16> %rdx.shuf + %rdx.shuf20 = shufflevector <4 x i16> %rdx.minmax.select19, <4 x i16> undef, <4 x i32> + %rdx.minmax.cmp21 = icmp slt <4 x i16> %rdx.minmax.select19, %rdx.shuf20 + %rdx.minmax.cmp21.elt = extractelement <4 x i1> %rdx.minmax.cmp21, i32 0 + %rdx.minmax.select19.elt = extractelement <4 x i16> %rdx.minmax.select19, i32 0 + %rdx.shuf20.elt = extractelement <4 x i16> %rdx.minmax.select19, i32 1 + %r = select i1 %rdx.minmax.cmp21.elt, i16 %rdx.minmax.select19.elt, i16 %rdx.shuf20.elt + ret i16 %r +} + +; CHECK-LABEL: smin_8H ; CHECK: sminv {{h[0-9]+}}, {{v[0-9]+}}.8h -define i16 @smin_H(<8 x i16>* nocapture readonly %arr) { +define i16 @smin_8H(<8 x i16>* nocapture readonly %arr) { %rdx.minmax.select = load <8 x i16>, <8 x i16>* %arr %rdx.shuf = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> %rdx.minmax.cmp23 = icmp slt <8 x i16> %rdx.minmax.select, %rdx.shuf @@ -186,9 +355,22 @@ ret i16 %r } -; CHECK-LABEL: smin_S +; CHECK-LABEL: smin_2S +; CHECK-NOT: sminv +define i32 @smin_2S(<2 x i32>* nocapture readonly %arr) { + %rdx.minmax.select = load <2 x i32>, <2 x i32>* %arr + %rdx.shuf = shufflevector <2 x i32> %rdx.minmax.select, <2 x i32> undef, <2 x i32> + %rdx.minmax.cmp18 = icmp slt <2 x i32> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.cmp18.elt = extractelement <2 x i1> %rdx.minmax.cmp18, i32 0 + %rdx.minmax.select.elt = extractelement <2 x i32> %rdx.minmax.select, i32 0 + %rdx.shuf.elt = extractelement <2 x i32> %rdx.minmax.select, i32 1 + %r = select i1 %rdx.minmax.cmp18.elt, i32 %rdx.minmax.select.elt, i32 %rdx.shuf.elt + ret i32 %r +} + +; CHECK-LABEL: smin_4S ; CHECK: sminv {{s[0-9]+}}, {{v[0-9]+}}.4s -define i32 @smin_S(<4 x i32>* nocapture readonly %arr) { +define i32 @smin_4S(<4 x i32>* nocapture readonly %arr) { %rdx.minmax.select = load <4 x i32>, <4 x i32>* %arr %rdx.shuf = shufflevector <4 x i32> %rdx.minmax.select, <4 x i32> undef, <4 x i32> %rdx.minmax.cmp18 = icmp slt <4 x i32> %rdx.minmax.select, %rdx.shuf @@ -202,9 +384,28 @@ ret i32 %r } -; CHECK-LABEL: smin_D +; CHECK-LABEL: smin_8S +; CHECK-NOT: sminv +define i32 @smin_8S(<8 x i32>* nocapture readonly %arr) { + %rdx.minmax.select = load <8 x i32>, <8 x i32>* %arr + %rdx.shuf = shufflevector <8 x i32> %rdx.minmax.select, <8 x i32> undef, <8 x i32> + %rdx.minmax.cmp23 = icmp slt <8 x i32> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i32> %rdx.minmax.select, <8 x i32> %rdx.shuf + %rdx.shuf25 = shufflevector <8 x i32> %rdx.minmax.select24, <8 x i32> undef, <8 x i32> + %rdx.minmax.cmp26 = icmp slt <8 x i32> %rdx.minmax.select24, %rdx.shuf25 + %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i32> %rdx.minmax.select24, <8 x i32> %rdx.shuf25 + %rdx.shuf28 = shufflevector <8 x i32> %rdx.minmax.select27, <8 x i32> undef, <8 x i32> + %rdx.minmax.cmp29 = icmp slt <8 x i32> %rdx.minmax.select27, %rdx.shuf28 + %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0 + %rdx.minmax.select27.elt = extractelement <8 x i32> %rdx.minmax.select27, i32 0 + %rdx.shuf28.elt = extractelement <8 x i32> %rdx.minmax.select27, i32 1 + %r = select i1 %rdx.minmax.cmp29.elt, i32 %rdx.minmax.select27.elt, i32 %rdx.shuf28.elt + ret i32 %r +} + +; CHECK-LABEL: smin_2D ; CHECK-NOT: sminv -define i64 @smin_D(<2 x i64>* nocapture readonly %arr) { +define i64 @smin_2D(<2 x i64>* nocapture readonly %arr) { %rdx.minmax.select = load <2 x i64>, <2 x i64>* %arr %rdx.shuf = shufflevector <2 x i64> %rdx.minmax.select, <2 x i64> undef, <2 x i32> %rdx.minmax.cmp18 = icmp slt <2 x i64> %rdx.minmax.select, %rdx.shuf @@ -216,9 +417,28 @@ } -; CHECK-LABEL: umin_B +; CHECK-LABEL: umin_8B +; CHECK: uminv {{b[0-9]+}}, {{v[0-9]+}}.8b +define i8 @umin_8B(<8 x i8>* nocapture readonly %arr) { + %rdx.minmax.select = load <8 x i8>, <8 x i8>* %arr + %rdx.shuf = shufflevector <8 x i8> %rdx.minmax.select, <8 x i8> undef, <8 x i32> + %rdx.minmax.cmp23 = icmp ult <8 x i8> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i8> %rdx.minmax.select, <8 x i8> %rdx.shuf + %rdx.shuf25 = shufflevector <8 x i8> %rdx.minmax.select24, <8 x i8> undef, <8 x i32> + %rdx.minmax.cmp26 = icmp ult <8 x i8> %rdx.minmax.select24, %rdx.shuf25 + %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i8> %rdx.minmax.select24, <8 x i8> %rdx.shuf25 + %rdx.shuf28 = shufflevector <8 x i8> %rdx.minmax.select27, <8 x i8> undef, <8 x i32> + %rdx.minmax.cmp29 = icmp ult <8 x i8> %rdx.minmax.select27, %rdx.shuf28 + %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0 + %rdx.minmax.select27.elt = extractelement <8 x i8> %rdx.minmax.select27, i32 0 + %rdx.shuf28.elt = extractelement <8 x i8> %rdx.minmax.select27, i32 1 + %r = select i1 %rdx.minmax.cmp29.elt, i8 %rdx.minmax.select27.elt, i8 %rdx.shuf28.elt + ret i8 %r +} + +; CHECK-LABEL: umin_16B ; CHECK: uminv {{b[0-9]+}}, {{v[0-9]+}}.16b -define i8 @umin_B(<16 x i8>* nocapture readonly %arr) { +define i8 @umin_16B(<16 x i8>* nocapture readonly %arr) { %rdx.minmax.select = load <16 x i8>, <16 x i8>* %arr %rdx.shuf = shufflevector <16 x i8> %rdx.minmax.select, <16 x i8> undef, <16 x i32> %rdx.minmax.cmp22 = icmp ult <16 x i8> %rdx.minmax.select, %rdx.shuf @@ -238,9 +458,25 @@ ret i8 %r } -; CHECK-LABEL: umin_H +; CHECK-LABEL: umin_4H +; CHECK: uminv {{h[0-9]+}}, {{v[0-9]+}}.4h +define i16 @umin_4H(<4 x i16> * nocapture readonly %arr) { + %rdx.minmax.select = load <4 x i16>, <4 x i16>* %arr + %rdx.shuf = shufflevector <4 x i16> %rdx.minmax.select, <4 x i16> undef, <4 x i32> + %rdx.minmax.cmp18 = icmp ult <4 x i16> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select19 = select <4 x i1> %rdx.minmax.cmp18, <4 x i16> %rdx.minmax.select, <4 x i16> %rdx.shuf + %rdx.shuf20 = shufflevector <4 x i16> %rdx.minmax.select19, <4 x i16> undef, <4 x i32> + %rdx.minmax.cmp21 = icmp ult <4 x i16> %rdx.minmax.select19, %rdx.shuf20 + %rdx.minmax.cmp21.elt = extractelement <4 x i1> %rdx.minmax.cmp21, i32 0 + %rdx.minmax.select19.elt = extractelement <4 x i16> %rdx.minmax.select19, i32 0 + %rdx.shuf20.elt = extractelement <4 x i16> %rdx.minmax.select19, i32 1 + %r = select i1 %rdx.minmax.cmp21.elt, i16 %rdx.minmax.select19.elt, i16 %rdx.shuf20.elt + ret i16 %r +} + +; CHECK-LABEL: umin_8H ; CHECK: uminv {{h[0-9]+}}, {{v[0-9]+}}.8h -define i16 @umin_H(<8 x i16>* nocapture readonly %arr) { +define i16 @umin_8H(<8 x i16>* nocapture readonly %arr) { %rdx.minmax.select = load <8 x i16>, <8 x i16>* %arr %rdx.shuf = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> %rdx.minmax.cmp23 = icmp ult <8 x i16> %rdx.minmax.select, %rdx.shuf @@ -257,9 +493,22 @@ ret i16 %r } -; CHECK-LABEL: umin_S +; CHECK-LABEL: umin_2S +; CHECK-NOT: uminv +define i32 @umin_2S(<2 x i32>* nocapture readonly %arr) { + %rdx.minmax.select = load <2 x i32>, <2 x i32>* %arr + %rdx.shuf = shufflevector <2 x i32> %rdx.minmax.select, <2 x i32> undef, <2 x i32> + %rdx.minmax.cmp18 = icmp ult <2 x i32> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.cmp18.elt = extractelement <2 x i1> %rdx.minmax.cmp18, i32 0 + %rdx.minmax.select.elt = extractelement <2 x i32> %rdx.minmax.select, i32 0 + %rdx.shuf.elt = extractelement <2 x i32> %rdx.minmax.select, i32 1 + %r = select i1 %rdx.minmax.cmp18.elt, i32 %rdx.minmax.select.elt, i32 %rdx.shuf.elt + ret i32 %r +} + +; CHECK-LABEL: umin_4S ; CHECK: uminv {{s[0-9]+}}, {{v[0-9]+}}.4s -define i32 @umin_S(<4 x i32>* nocapture readonly %arr) { +define i32 @umin_4S(<4 x i32>* nocapture readonly %arr) { %rdx.minmax.select = load <4 x i32>, <4 x i32>* %arr %rdx.shuf = shufflevector <4 x i32> %rdx.minmax.select, <4 x i32> undef, <4 x i32> %rdx.minmax.cmp18 = icmp ult <4 x i32> %rdx.minmax.select, %rdx.shuf @@ -273,9 +522,28 @@ ret i32 %r } -; CHECK-LABEL: umin_D +; CHECK-LABEL: umin_8S +; CHECK-NOT: uminv +define i32 @umin_8S(<8 x i32>* nocapture readonly %arr) { + %rdx.minmax.select = load <8 x i32>, <8 x i32>* %arr + %rdx.shuf = shufflevector <8 x i32> %rdx.minmax.select, <8 x i32> undef, <8 x i32> + %rdx.minmax.cmp23 = icmp ult <8 x i32> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i32> %rdx.minmax.select, <8 x i32> %rdx.shuf + %rdx.shuf25 = shufflevector <8 x i32> %rdx.minmax.select24, <8 x i32> undef, <8 x i32> + %rdx.minmax.cmp26 = icmp ult <8 x i32> %rdx.minmax.select24, %rdx.shuf25 + %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i32> %rdx.minmax.select24, <8 x i32> %rdx.shuf25 + %rdx.shuf28 = shufflevector <8 x i32> %rdx.minmax.select27, <8 x i32> undef, <8 x i32> + %rdx.minmax.cmp29 = icmp ult <8 x i32> %rdx.minmax.select27, %rdx.shuf28 + %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0 + %rdx.minmax.select27.elt = extractelement <8 x i32> %rdx.minmax.select27, i32 0 + %rdx.shuf28.elt = extractelement <8 x i32> %rdx.minmax.select27, i32 1 + %r = select i1 %rdx.minmax.cmp29.elt, i32 %rdx.minmax.select27.elt, i32 %rdx.shuf28.elt + ret i32 %r +} + +; CHECK-LABEL: umin_2D ; CHECK-NOT: uminv -define i64 @umin_D(<2 x i64>* nocapture readonly %arr) { +define i64 @umin_2D(<2 x i64>* nocapture readonly %arr) { %rdx.minmax.select = load <2 x i64>, <2 x i64>* %arr %rdx.shuf = shufflevector <2 x i64> %rdx.minmax.select, <2 x i64> undef, <2 x i32> %rdx.minmax.cmp18 = icmp ult <2 x i64> %rdx.minmax.select, %rdx.shuf