Index: llvm/lib/Target/ARM/ARMInstrMVE.td =================================================================== --- llvm/lib/Target/ARM/ARMInstrMVE.td +++ llvm/lib/Target/ARM/ARMInstrMVE.td @@ -2059,7 +2059,8 @@ : MVE_VHADDSUB<"vhsub", suffix, U, 0b1, size, pattern>; multiclass MVE_VHADD_m { + SDNode unpred_op, Intrinsic pred_int, PatFrag add_op, + SDNode shift_op> { def "" : MVE_VHADD_; defvar Inst = !cast(NAME); @@ -2068,6 +2069,9 @@ def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), (i32 VTI.Unsigned))), (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; + def : Pat<(VTI.Vec (shift_op (add_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)), (i32 1))), + (Inst MQPR:$Qm, MQPR:$Qn)>; + // Predicated add-and-divide-by-two def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))), @@ -2077,18 +2081,44 @@ } } -multiclass MVE_VHADD - : MVE_VHADD_m; +multiclass MVE_VHADD + : MVE_VHADD_m; + +def addnuw : PatFrag<(ops node:$lhs, node:$rhs), + (add node:$lhs, node:$rhs), [{ + return N->getFlags().hasNoUnsignedWrap(); +}]>; -defm MVE_VHADDs8 : MVE_VHADD; -defm MVE_VHADDs16 : MVE_VHADD; -defm MVE_VHADDs32 : MVE_VHADD; -defm MVE_VHADDu8 : MVE_VHADD; -defm MVE_VHADDu16 : MVE_VHADD; -defm MVE_VHADDu32 : MVE_VHADD; +def addnsw : PatFrag<(ops node:$lhs, node:$rhs), + (add node:$lhs, node:$rhs), [{ + return N->getFlags().hasNoSignedWrap(); +}]>; + +def subnuw : PatFrag<(ops node:$lhs, node:$rhs), + (sub node:$lhs, node:$rhs), [{ + return N->getFlags().hasNoUnsignedWrap(); +}]>; + +def subnsw : PatFrag<(ops node:$lhs, node:$rhs), + (sub node:$lhs, node:$rhs), [{ + return N->getFlags().hasNoSignedWrap(); +}]>; + +// Halving add/sub perform the arithmetic operation with an extra bit of +// precision, before performing the shift, to avoid clipping errors. 
We're not +// modelling that here with these patterns, but we're using no wrap forms of +// add/sub to ensure that the extra bit of information is not needed. +defm MVE_VHADDs8 : MVE_VHADD; +defm MVE_VHADDs16 : MVE_VHADD; +defm MVE_VHADDs32 : MVE_VHADD; +defm MVE_VHADDu8 : MVE_VHADD; +defm MVE_VHADDu16 : MVE_VHADD; +defm MVE_VHADDu32 : MVE_VHADD; multiclass MVE_VHSUB_m { + SDNode unpred_op, Intrinsic pred_int, PatFrag sub_op, + SDNode shift_op> { def "" : MVE_VHSUB_; defvar Inst = !cast(NAME); @@ -2098,6 +2128,10 @@ (i32 VTI.Unsigned))), (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; + def : Pat<(VTI.Vec (shift_op (sub_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)), (i32 1))), + (Inst MQPR:$Qm, MQPR:$Qn)>; + + // Predicated subtract-and-divide-by-two def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask), @@ -2108,15 +2142,16 @@ } } -multiclass MVE_VHSUB - : MVE_VHSUB_m; +multiclass MVE_VHSUB + : MVE_VHSUB_m; -defm MVE_VHSUBs8 : MVE_VHSUB; -defm MVE_VHSUBs16 : MVE_VHSUB; -defm MVE_VHSUBs32 : MVE_VHSUB; -defm MVE_VHSUBu8 : MVE_VHSUB; -defm MVE_VHSUBu16 : MVE_VHSUB; -defm MVE_VHSUBu32 : MVE_VHSUB; +defm MVE_VHSUBs8 : MVE_VHSUB; +defm MVE_VHSUBs16 : MVE_VHSUB; +defm MVE_VHSUBs32 : MVE_VHSUB; +defm MVE_VHSUBu8 : MVE_VHSUB; +defm MVE_VHSUBu16 : MVE_VHSUB; +defm MVE_VHSUBu32 : MVE_VHSUB; class MVE_VDUP pattern=[]> : MVE_p<(outs MQPR:$Qd), (ins rGPR:$Rt), NoItinerary, Index: llvm/test/CodeGen/Thumb2/mve-halving.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/Thumb2/mve-halving.ll @@ -0,0 +1,232 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s + +define arm_aapcs_vfpcc <16 x i8> @vhadds_v16i8(<16 x i8> %x, <16 x i8> %y) { +; CHECK-LABEL: vhadds_v16i8: +; CHECK: @ %bb.0: +; CHECK-NEXT: vadd.i8 q0, q0, q1 +; 
CHECK-NEXT: vshr.s8 q0, q0, #1 +; CHECK-NEXT: bx lr + %add = add <16 x i8> %x, %y + %half = ashr <16 x i8> %add, + ret <16 x i8> %half +} +define arm_aapcs_vfpcc <16 x i8> @vhaddu_v16i8(<16 x i8> %x, <16 x i8> %y) { +; CHECK-LABEL: vhaddu_v16i8: +; CHECK: @ %bb.0: +; CHECK-NEXT: vadd.i8 q0, q0, q1 +; CHECK-NEXT: vshr.u8 q0, q0, #1 +; CHECK-NEXT: bx lr + %add = add <16 x i8> %x, %y + %half = lshr <16 x i8> %add, + ret <16 x i8> %half +} +define arm_aapcs_vfpcc <8 x i16> @vhadds_v8i16(<8 x i16> %x, <8 x i16> %y) { +; CHECK-LABEL: vhadds_v8i16: +; CHECK: @ %bb.0: +; CHECK-NEXT: vadd.i16 q0, q0, q1 +; CHECK-NEXT: vshr.s16 q0, q0, #1 +; CHECK-NEXT: bx lr + %add = add <8 x i16> %x, %y + %half = ashr <8 x i16> %add, + ret <8 x i16> %half +} +define arm_aapcs_vfpcc <8 x i16> @vhaddu_v8i16(<8 x i16> %x, <8 x i16> %y) { +; CHECK-LABEL: vhaddu_v8i16: +; CHECK: @ %bb.0: +; CHECK-NEXT: vadd.i16 q0, q0, q1 +; CHECK-NEXT: vshr.u16 q0, q0, #1 +; CHECK-NEXT: bx lr + %add = add <8 x i16> %x, %y + %half = lshr <8 x i16> %add, + ret <8 x i16> %half +} +define arm_aapcs_vfpcc <4 x i32> @vhadds_v4i32(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: vhadds_v4i32: +; CHECK: @ %bb.0: +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: vshr.s32 q0, q0, #1 +; CHECK-NEXT: bx lr + %add = add <4 x i32> %x, %y + %half = ashr <4 x i32> %add, + ret <4 x i32> %half +} +define arm_aapcs_vfpcc <4 x i32> @vhaddu_v4i32(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: vhaddu_v4i32: +; CHECK: @ %bb.0: +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: vshr.u32 q0, q0, #1 +; CHECK-NEXT: bx lr + %add = add <4 x i32> %x, %y + %half = lshr <4 x i32> %add, + ret <4 x i32> %half +} +define arm_aapcs_vfpcc <16 x i8> @vhsubs_v16i8(<16 x i8> %x, <16 x i8> %y) { +; CHECK-LABEL: vhsubs_v16i8: +; CHECK: @ %bb.0: +; CHECK-NEXT: vsub.i8 q0, q0, q1 +; CHECK-NEXT: vshr.s8 q0, q0, #1 +; CHECK-NEXT: bx lr + %sub = sub <16 x i8> %x, %y + %half = ashr <16 x i8> %sub, + ret <16 x i8> %half +} +define arm_aapcs_vfpcc <16 x i8> 
@vhsubu_v16i8(<16 x i8> %x, <16 x i8> %y) { +; CHECK-LABEL: vhsubu_v16i8: +; CHECK: @ %bb.0: +; CHECK-NEXT: vsub.i8 q0, q0, q1 +; CHECK-NEXT: vshr.u8 q0, q0, #1 +; CHECK-NEXT: bx lr + %sub = sub <16 x i8> %x, %y + %half = lshr <16 x i8> %sub, + ret <16 x i8> %half +} +define arm_aapcs_vfpcc <8 x i16> @vhsubs_v8i16(<8 x i16> %x, <8 x i16> %y) { +; CHECK-LABEL: vhsubs_v8i16: +; CHECK: @ %bb.0: +; CHECK-NEXT: vsub.i16 q0, q0, q1 +; CHECK-NEXT: vshr.s16 q0, q0, #1 +; CHECK-NEXT: bx lr + %sub = sub <8 x i16> %x, %y + %half = ashr <8 x i16> %sub, + ret <8 x i16> %half +} +define arm_aapcs_vfpcc <8 x i16> @vhsubu_v8i16(<8 x i16> %x, <8 x i16> %y) { +; CHECK-LABEL: vhsubu_v8i16: +; CHECK: @ %bb.0: +; CHECK-NEXT: vsub.i16 q0, q0, q1 +; CHECK-NEXT: vshr.u16 q0, q0, #1 +; CHECK-NEXT: bx lr + %sub = sub <8 x i16> %x, %y + %half = lshr <8 x i16> %sub, + ret <8 x i16> %half +} +define arm_aapcs_vfpcc <4 x i32> @vhsubs_v4i32(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: vhsubs_v4i32: +; CHECK: @ %bb.0: +; CHECK-NEXT: vsub.i32 q0, q0, q1 +; CHECK-NEXT: vshr.s32 q0, q0, #1 +; CHECK-NEXT: bx lr + %sub = sub <4 x i32> %x, %y + %half = ashr <4 x i32> %sub, + ret <4 x i32> %half +} +define arm_aapcs_vfpcc <4 x i32> @vhsubu_v4i32(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: vhsubu_v4i32: +; CHECK: @ %bb.0: +; CHECK-NEXT: vsub.i32 q0, q0, q1 +; CHECK-NEXT: vshr.u32 q0, q0, #1 +; CHECK-NEXT: bx lr + %sub = sub <4 x i32> %x, %y + %half = lshr <4 x i32> %sub, + ret <4 x i32> %half +} + +define arm_aapcs_vfpcc <16 x i8> @vhadds_v16i8_nw(<16 x i8> %x, <16 x i8> %y) { +; CHECK-LABEL: vhadds_v16i8_nw: +; CHECK: @ %bb.0: +; CHECK-NEXT: vhadd.s8 q0, q0, q1 +; CHECK-NEXT: bx lr + %add = add nsw <16 x i8> %x, %y + %half = ashr <16 x i8> %add, + ret <16 x i8> %half +} +define arm_aapcs_vfpcc <16 x i8> @vhaddu_v16i8_nw(<16 x i8> %x, <16 x i8> %y) { +; CHECK-LABEL: vhaddu_v16i8_nw: +; CHECK: @ %bb.0: +; CHECK-NEXT: vhadd.u8 q0, q0, q1 +; CHECK-NEXT: bx lr + %add = add nuw <16 x i8> %x, %y + %half = 
lshr <16 x i8> %add, + ret <16 x i8> %half +} +define arm_aapcs_vfpcc <8 x i16> @vhadds_v8i16_nw(<8 x i16> %x, <8 x i16> %y) { +; CHECK-LABEL: vhadds_v8i16_nw: +; CHECK: @ %bb.0: +; CHECK-NEXT: vhadd.s16 q0, q0, q1 +; CHECK-NEXT: bx lr + %add = add nsw <8 x i16> %x, %y + %half = ashr <8 x i16> %add, + ret <8 x i16> %half +} +define arm_aapcs_vfpcc <8 x i16> @vhaddu_v8i16_nw(<8 x i16> %x, <8 x i16> %y) { +; CHECK-LABEL: vhaddu_v8i16_nw: +; CHECK: @ %bb.0: +; CHECK-NEXT: vhadd.u16 q0, q0, q1 +; CHECK-NEXT: bx lr + %add = add nuw <8 x i16> %x, %y + %half = lshr <8 x i16> %add, + ret <8 x i16> %half +} +define arm_aapcs_vfpcc <4 x i32> @vhadds_v4i32_nw(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: vhadds_v4i32_nw: +; CHECK: @ %bb.0: +; CHECK-NEXT: vhadd.s32 q0, q0, q1 +; CHECK-NEXT: bx lr + %add = add nsw <4 x i32> %x, %y + %half = ashr <4 x i32> %add, + ret <4 x i32> %half +} +define arm_aapcs_vfpcc <4 x i32> @vhaddu_v4i32_nw(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: vhaddu_v4i32_nw: +; CHECK: @ %bb.0: +; CHECK-NEXT: vhadd.u32 q0, q0, q1 +; CHECK-NEXT: bx lr + %add = add nuw <4 x i32> %x, %y + %half = lshr <4 x i32> %add, + ret <4 x i32> %half +} +define arm_aapcs_vfpcc <16 x i8> @vhsubs_v16i8_nw(<16 x i8> %x, <16 x i8> %y) { +; CHECK-LABEL: vhsubs_v16i8_nw: +; CHECK: @ %bb.0: +; CHECK-NEXT: vhsub.s8 q0, q0, q1 +; CHECK-NEXT: bx lr + %sub = sub nsw <16 x i8> %x, %y + %half = ashr <16 x i8> %sub, + ret <16 x i8> %half +} +define arm_aapcs_vfpcc <16 x i8> @vhsubu_v16i8_nw(<16 x i8> %x, <16 x i8> %y) { +; CHECK-LABEL: vhsubu_v16i8_nw: +; CHECK: @ %bb.0: +; CHECK-NEXT: vhsub.u8 q0, q0, q1 +; CHECK-NEXT: bx lr + %sub = sub nuw <16 x i8> %x, %y + %half = lshr <16 x i8> %sub, + ret <16 x i8> %half +} +define arm_aapcs_vfpcc <8 x i16> @vhsubs_v8i16_nw(<8 x i16> %x, <8 x i16> %y) { +; CHECK-LABEL: vhsubs_v8i16_nw: +; CHECK: @ %bb.0: +; CHECK-NEXT: vhsub.s16 q0, q0, q1 +; CHECK-NEXT: bx lr + %sub = sub nsw <8 x i16> %x, %y + %half = ashr <8 x i16> %sub, + ret <8 x i16> 
%half +} +define arm_aapcs_vfpcc <8 x i16> @vhsubu_v8i16_nw(<8 x i16> %x, <8 x i16> %y) { +; CHECK-LABEL: vhsubu_v8i16_nw: +; CHECK: @ %bb.0: +; CHECK-NEXT: vhsub.u16 q0, q0, q1 +; CHECK-NEXT: bx lr + %sub = sub nuw <8 x i16> %x, %y + %half = lshr <8 x i16> %sub, + ret <8 x i16> %half +} +define arm_aapcs_vfpcc <4 x i32> @vhsubs_v4i32_nw(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: vhsubs_v4i32_nw: +; CHECK: @ %bb.0: +; CHECK-NEXT: vhsub.s32 q0, q0, q1 +; CHECK-NEXT: bx lr + %sub = sub nsw <4 x i32> %x, %y + %half = ashr <4 x i32> %sub, + ret <4 x i32> %half +} +define arm_aapcs_vfpcc <4 x i32> @vhsubu_v4i32_nw(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: vhsubu_v4i32_nw: +; CHECK: @ %bb.0: +; CHECK-NEXT: vhsub.u32 q0, q0, q1 +; CHECK-NEXT: bx lr + %sub = sub nuw <4 x i32> %x, %y + %half = lshr <4 x i32> %sub, + ret <4 x i32> %half +} Index: llvm/test/CodeGen/Thumb2/mve-vhaddsub.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vhaddsub.ll +++ llvm/test/CodeGen/Thumb2/mve-vhaddsub.ll @@ -28,8 +28,7 @@ define arm_aapcs_vfpcc <4 x i32> @add_ashr_v4i32(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-LABEL: add_ashr_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vshr.s32 q0, q0, #1 +; CHECK-NEXT: vhadd.s32 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %0 = add nsw <4 x i32> %src1, %src2 @@ -100,8 +99,7 @@ define arm_aapcs_vfpcc <4 x i32> @sub_ashr_v4i32(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-LABEL: sub_ashr_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vsub.i32 q0, q0, q1 -; CHECK-NEXT: vshr.s32 q0, q0, #1 +; CHECK-NEXT: vhsub.s32 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %0 = sub nsw <4 x i32> %src1, %src2