Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -147,6 +147,11 @@
     setOperationAction(ISD::SABSDIFF, VT, Legal);
     setOperationAction(ISD::UABSDIFF, VT, Legal);
   }
+  if (!VT.isFloatingPoint() &&
+      VT != MVT::v2i64 && VT != MVT::v1i64)
+    for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
+      setOperationAction(Opcode, VT, Legal);
+
 }
 
 void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
Index: lib/Target/ARM/ARMInstrNEON.td
===================================================================
--- lib/Target/ARM/ARMInstrNEON.td
+++ lib/Target/ARM/ARMInstrNEON.td
@@ -5063,6 +5063,55 @@
                         "vmin", "f32", v4f32, v4f32, fminnan, 1>;
 
+def : Pat<(v8i8 (smin DPR:$Rn, DPR:$Rm)),
+          (VMINsv8i8 DPR:$Rn, DPR:$Rm)>;
+def : Pat<(v4i16 (smin DPR:$Rn, DPR:$Rm)),
+          (VMINsv4i16 DPR:$Rn, DPR:$Rm)>;
+def : Pat<(v2i32 (smin DPR:$Rn, DPR:$Rm)),
+          (VMINsv2i32 DPR:$Rn, DPR:$Rm)>;
+def : Pat<(v16i8 (smin QPR:$Rn, QPR:$Rm)),
+          (VMINsv16i8 QPR:$Rn, QPR:$Rm)>;
+def : Pat<(v8i16 (smin QPR:$Rn, QPR:$Rm)),
+          (VMINsv8i16 QPR:$Rn, QPR:$Rm)>;
+def : Pat<(v4i32 (smin QPR:$Rn, QPR:$Rm)),
+          (VMINsv4i32 QPR:$Rn, QPR:$Rm)>;
+def : Pat<(v8i8 (smax DPR:$Rn, DPR:$Rm)),
+          (VMAXsv8i8 DPR:$Rn, DPR:$Rm)>;
+def : Pat<(v4i16 (smax DPR:$Rn, DPR:$Rm)),
+          (VMAXsv4i16 DPR:$Rn, DPR:$Rm)>;
+def : Pat<(v2i32 (smax DPR:$Rn, DPR:$Rm)),
+          (VMAXsv2i32 DPR:$Rn, DPR:$Rm)>;
+def : Pat<(v16i8 (smax QPR:$Rn, QPR:$Rm)),
+          (VMAXsv16i8 QPR:$Rn, QPR:$Rm)>;
+def : Pat<(v8i16 (smax QPR:$Rn, QPR:$Rm)),
+          (VMAXsv8i16 QPR:$Rn, QPR:$Rm)>;
+def : Pat<(v4i32 (smax QPR:$Rn, QPR:$Rm)),
+          (VMAXsv4i32 QPR:$Rn, QPR:$Rm)>;
+def : Pat<(v8i8 (umin DPR:$Rn, DPR:$Rm)),
+          (VMINuv8i8 DPR:$Rn, DPR:$Rm)>;
+def : Pat<(v4i16 (umin DPR:$Rn, DPR:$Rm)),
+          (VMINuv4i16 DPR:$Rn, DPR:$Rm)>;
+def : Pat<(v2i32 (umin DPR:$Rn, DPR:$Rm)),
+          (VMINuv2i32 DPR:$Rn, DPR:$Rm)>;
+def : Pat<(v16i8 (umin QPR:$Rn, QPR:$Rm)),
+          (VMINuv16i8 QPR:$Rn, QPR:$Rm)>;
+def : Pat<(v8i16 (umin QPR:$Rn, QPR:$Rm)),
+          (VMINuv8i16 QPR:$Rn, QPR:$Rm)>;
+def : Pat<(v4i32 (umin QPR:$Rn, QPR:$Rm)),
+          (VMINuv4i32 QPR:$Rn, QPR:$Rm)>;
+def : Pat<(v8i8 (umax DPR:$Rn, DPR:$Rm)),
+          (VMAXuv8i8 DPR:$Rn, DPR:$Rm)>;
+def : Pat<(v4i16 (umax DPR:$Rn, DPR:$Rm)),
+          (VMAXuv4i16 DPR:$Rn, DPR:$Rm)>;
+def : Pat<(v2i32 (umax DPR:$Rn, DPR:$Rm)),
+          (VMAXuv2i32 DPR:$Rn, DPR:$Rm)>;
+def : Pat<(v16i8 (umax QPR:$Rn, QPR:$Rm)),
+          (VMAXuv16i8 QPR:$Rn, QPR:$Rm)>;
+def : Pat<(v8i16 (umax QPR:$Rn, QPR:$Rm)),
+          (VMAXuv8i16 QPR:$Rn, QPR:$Rm)>;
+def : Pat<(v4i32 (umax QPR:$Rn, QPR:$Rm)),
+          (VMAXuv4i32 QPR:$Rn, QPR:$Rm)>;
+
 // VMINNM
 let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in {
   def VMINNMND : N3VDIntnp<0b00110, 0b10, 0b1111, 0, 1,
Index: test/CodeGen/ARM/minmax.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/minmax.ll
@@ -0,0 +1,193 @@
+; RUN: llc < %s -mtriple=armv8-linux-gnu -mattr=+neon | FileCheck %s
+
+; CHECK-LABEL: t1
+; CHECK: vmax.s32 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
+define <4 x i32> @t1(<4 x i32> %a, <4 x i32> %b) {
+  %t1 = icmp sgt <4 x i32> %a, %b
+  %t2 = select <4 x i1> %t1, <4 x i32> %a, <4 x i32> %b
+  ret <4 x i32> %t2
+}
+
+; CHECK-LABEL: t2
+; CHECK: vmin.s32 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
+define <4 x i32> @t2(<4 x i32> %a, <4 x i32> %b) {
+  %t1 = icmp slt <4 x i32> %a, %b
+  %t2 = select <4 x i1> %t1, <4 x i32> %a, <4 x i32> %b
+  ret <4 x i32> %t2
+}
+
+; CHECK-LABEL: t3
+; CHECK: vmax.u32 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
+define <4 x i32> @t3(<4 x i32> %a, <4 x i32> %b) {
+  %t1 = icmp ugt <4 x i32> %a, %b
+  %t2 = select <4 x i1> %t1, <4 x i32> %a, <4 x i32> %b
+  ret <4 x i32> %t2
+}
+
+; CHECK-LABEL: t4
+; CHECK: vmin.u32 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
+define <4 x i32> @t4(<4 x i32> %a, <4 x i32> %b) {
+  %t1 = icmp ult <4 x i32> %a, %b
+  %t2 = select <4 x i1> %t1, <4 x i32> %a, <4 x i32> %b
+  ret <4 x i32> %t2
+}
+
+; CHECK-LABEL: t5
+; CHECK: vmax.s32 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+define <2 x i32> @t5(<2 x i32> %a, <2 x i32> %b) {
+  %t1 = icmp sgt <2 x i32> %a, %b
+  %t2 = select <2 x i1> %t1, <2 x i32> %a, <2 x i32> %b
+  ret <2 x i32> %t2
+}
+
+; CHECK-LABEL: t6
+; CHECK: vmin.s32 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+define <2 x i32> @t6(<2 x i32> %a, <2 x i32> %b) {
+  %t1 = icmp slt <2 x i32> %a, %b
+  %t2 = select <2 x i1> %t1, <2 x i32> %a, <2 x i32> %b
+  ret <2 x i32> %t2
+}
+
+; CHECK-LABEL: t7
+; CHECK: vmax.u32 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+define <2 x i32> @t7(<2 x i32> %a, <2 x i32> %b) {
+  %t1 = icmp ugt <2 x i32> %a, %b
+  %t2 = select <2 x i1> %t1, <2 x i32> %a, <2 x i32> %b
+  ret <2 x i32> %t2
+}
+
+; CHECK-LABEL: t8
+; CHECK: vmin.u32 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+define <2 x i32> @t8(<2 x i32> %a, <2 x i32> %b) {
+  %t1 = icmp ult <2 x i32> %a, %b
+  %t2 = select <2 x i1> %t1, <2 x i32> %a, <2 x i32> %b
+  ret <2 x i32> %t2
+}
+
+; CHECK-LABEL: t9
+; CHECK: vmax.s16 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
+define <8 x i16> @t9(<8 x i16> %a, <8 x i16> %b) {
+  %t1 = icmp sgt <8 x i16> %a, %b
+  %t2 = select <8 x i1> %t1, <8 x i16> %a, <8 x i16> %b
+  ret <8 x i16> %t2
+}
+
+; CHECK-LABEL: t10
+; CHECK: vmin.s16 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
+define <8 x i16> @t10(<8 x i16> %a, <8 x i16> %b) {
+  %t1 = icmp slt <8 x i16> %a, %b
+  %t2 = select <8 x i1> %t1, <8 x i16> %a, <8 x i16> %b
+  ret <8 x i16> %t2
+}
+
+; CHECK-LABEL: t11
+; CHECK: vmax.u16 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
+define <8 x i16> @t11(<8 x i16> %a, <8 x i16> %b) {
+  %t1 = icmp ugt <8 x i16> %a, %b
+  %t2 = select <8 x i1> %t1, <8 x i16> %a, <8 x i16> %b
+  ret <8 x i16> %t2
+}
+
+; CHECK-LABEL: t12
+; CHECK: vmin.u16 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
+define <8 x i16> @t12(<8 x i16> %a, <8 x i16> %b) {
+  %t1 = icmp ult <8 x i16> %a, %b
+  %t2 = select <8 x i1> %t1, <8 x i16> %a, <8 x i16> %b
+  ret <8 x i16> %t2
+}
+
+; CHECK-LABEL: t13
+; CHECK: vmax.s16 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+define <4 x i16> @t13(<4 x i16> %a, <4 x i16> %b) {
+  %t1 = icmp sgt <4 x i16> %a, %b
+  %t2 = select <4 x i1> %t1, <4 x i16> %a, <4 x i16> %b
+  ret <4 x i16> %t2
+}
+
+; CHECK-LABEL: t14
+; CHECK: vmin.s16 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+define <4 x i16> @t14(<4 x i16> %a, <4 x i16> %b) {
+  %t1 = icmp slt <4 x i16> %a, %b
+  %t2 = select <4 x i1> %t1, <4 x i16> %a, <4 x i16> %b
+  ret <4 x i16> %t2
+}
+
+; CHECK-LABEL: t15
+; CHECK: vmax.u16 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+define <4 x i16> @t15(<4 x i16> %a, <4 x i16> %b) {
+  %t1 = icmp ugt <4 x i16> %a, %b
+  %t2 = select <4 x i1> %t1, <4 x i16> %a, <4 x i16> %b
+  ret <4 x i16> %t2
+}
+
+; CHECK-LABEL: t16
+; CHECK: vmin.u16 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+define <4 x i16> @t16(<4 x i16> %a, <4 x i16> %b) {
+  %t1 = icmp ult <4 x i16> %a, %b
+  %t2 = select <4 x i1> %t1, <4 x i16> %a, <4 x i16> %b
+  ret <4 x i16> %t2
+}
+
+; CHECK-LABEL: t17
+; CHECK: vmax.s8 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
+define <16 x i8> @t17(<16 x i8> %a, <16 x i8> %b) {
+  %t1 = icmp sgt <16 x i8> %a, %b
+  %t2 = select <16 x i1> %t1, <16 x i8> %a, <16 x i8> %b
+  ret <16 x i8> %t2
+}
+
+; CHECK-LABEL: t18
+; CHECK: vmin.s8 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
+define <16 x i8> @t18(<16 x i8> %a, <16 x i8> %b) {
+  %t1 = icmp slt <16 x i8> %a, %b
+  %t2 = select <16 x i1> %t1, <16 x i8> %a, <16 x i8> %b
+  ret <16 x i8> %t2
+}
+
+; CHECK-LABEL: t19
+; CHECK: vmax.u8 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
+define <16 x i8> @t19(<16 x i8> %a, <16 x i8> %b) {
+  %t1 = icmp ugt <16 x i8> %a, %b
+  %t2 = select <16 x i1> %t1, <16 x i8> %a, <16 x i8> %b
+  ret <16 x i8> %t2
+}
+
+; CHECK-LABEL: t20
+; CHECK: vmin.u8 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
+define <16 x i8> @t20(<16 x i8> %a, <16 x i8> %b) {
+  %t1 = icmp ult <16 x i8> %a, %b
+  %t2 = select <16 x i1> %t1, <16 x i8> %a, <16 x i8> %b
+  ret <16 x i8> %t2
+}
+
+; CHECK-LABEL: t21
+; CHECK: vmax.s8 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+define <8 x i8> @t21(<8 x i8> %a, <8 x i8> %b) {
+  %t1 = icmp sgt <8 x i8> %a, %b
+  %t2 = select <8 x i1> %t1, <8 x i8> %a, <8 x i8> %b
+  ret <8 x i8> %t2
+}
+
+; CHECK-LABEL: t22
+; CHECK: vmin.s8 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+define <8 x i8> @t22(<8 x i8> %a, <8 x i8> %b) {
+  %t1 = icmp slt <8 x i8> %a, %b
+  %t2 = select <8 x i1> %t1, <8 x i8> %a, <8 x i8> %b
+  ret <8 x i8> %t2
+}
+
+; CHECK-LABEL: t23
+; CHECK: vmax.u8 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+define <8 x i8> @t23(<8 x i8> %a, <8 x i8> %b) {
+  %t1 = icmp ugt <8 x i8> %a, %b
+  %t2 = select <8 x i1> %t1, <8 x i8> %a, <8 x i8> %b
+  ret <8 x i8> %t2
+}
+
+; CHECK-LABEL: t24
+; CHECK: vmin.u8 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+define <8 x i8> @t24(<8 x i8> %a, <8 x i8> %b) {
+  %t1 = icmp ult <8 x i8> %a, %b
+  %t2 = select <8 x i1> %t1, <8 x i8> %a, <8 x i8> %b
+  ret <8 x i8> %t2
+}
Index: test/CodeGen/ARM/vselect_imax.ll
===================================================================
--- test/CodeGen/ARM/vselect_imax.ll
+++ test/CodeGen/ARM/vselect_imax.ll
@@ -3,8 +3,7 @@
 ; Make sure that ARM backend with NEON handles vselect.
 
 define void @vmax_v4i32(<4 x i32>* %m, <4 x i32> %a, <4 x i32> %b) {
-; CHECK: vcgt.s32 [[QR:q[0-9]+]], [[Q1:q[0-9]+]], [[Q2:q[0-9]+]]
-; CHECK: vbsl [[QR]], [[Q1]], [[Q2]]
+; CHECK: vmax.s32 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
   %cmpres = icmp sgt <4 x i32> %a, %b
   %maxres = select <4 x i1> %cmpres, <4 x i32> %a, <4 x i32> %b
   store <4 x i32> %maxres, <4 x i32>* %m
@@ -21,8 +20,8 @@
   %v0 = load %T0_10, %T0_10* %loadaddr
   %v1 = load %T0_10, %T0_10* %loadaddr2
   %c = icmp slt %T0_10 %v0, %v1
-; CHECK: vbsl
-; CHECK: vbsl
+; CHECK: vmin.s16
+; CHECK: vmin.s16
 ; COST: func_blend10
 ; COST: cost of 40 {{.*}} select
   %r = select %T1_10 %c, %T0_10 %v0, %T0_10 %v1
@@ -37,8 +36,8 @@
   %v0 = load %T0_14, %T0_14* %loadaddr
   %v1 = load %T0_14, %T0_14* %loadaddr2
   %c = icmp slt %T0_14 %v0, %v1
-; CHECK: vbsl
-; CHECK: vbsl
+; CHECK: vmin.s32
+; CHECK: vmin.s32
 ; COST: func_blend14
 ; COST: cost of 41 {{.*}} select
   %r = select %T1_14 %c, %T0_14 %v0, %T0_14 %v1
@@ -50,8 +49,8 @@
 ; CHECK-LABEL: func_blend15:
 define void @func_blend15(%T0_15* %loadaddr, %T0_15* %loadaddr2,
                           %T1_15* %blend, %T0_15* %storeaddr) {
-; CHECK: vbsl
-; CHECK: vbsl
+; CHECK: vmin.s32
+; CHECK: vmin.s32
   %v0 = load %T0_15, %T0_15* %loadaddr
   %v1 = load %T0_15, %T0_15* %loadaddr2
   %c = icmp slt %T0_15 %v0, %v1
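
Note on scope: the legality loop in ARMISelLowering.cpp above deliberately
skips v1i64 and v2i64, since NEON has no 64-bit integer VMIN/VMAX. As a
minimal sketch of what that exclusion means (illustrative only: the function
name is hypothetical and this case is not exercised by the tests above), a
64-bit-element min written with the same icmp+select idiom is expected to
stay on the generic compare-and-select path rather than fold to a single
min instruction:

  define <2 x i64> @smin_v2i64(<2 x i64> %a, <2 x i64> %b) {
    ; No vmin.s64 exists, so this should remain a comparison followed
    ; by a select (e.g. vbsl), not a single vmin.
    %c = icmp slt <2 x i64> %a, %b
    %r = select <2 x i1> %c, <2 x i64> %a, <2 x i64> %b
    ret <2 x i64> %r
  }

For the covered types, the icmp+select idiom used throughout minmax.ll is
recognized during SelectionDAG construction and emitted as
ISD::SMIN/SMAX/UMIN/UMAX once those nodes are marked Legal for the type,
which is what lets the new ARMInstrNEON.td patterns select a single
vmin/vmax.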