Index: lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.td
+++ lib/Target/AArch64/AArch64InstrInfo.td
@@ -2733,6 +2733,33 @@
                                   BinOpFrag<(or node:$LHS, (vnot node:$RHS))> >;
 defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>;
 
+// SABD Vd.<T>, Vn.<T>, Vm.<T>: subtracts the elements of Vm from the corresponding
+// elements of Vn, and places the absolute values of the results in the elements of Vd.
+def : Pat<(xor (v8i8 (AArch64vashr (v8i8 (sub V64:$Rn, V64:$Rm)), (i32 7))),
+               (v8i8 (add (v8i8 (sub V64:$Rn, V64:$Rm)),
+                          (AArch64vashr (v8i8 (sub V64:$Rn, V64:$Rm)), (i32 7))))),
+          (SABDv8i8 V64:$Rn, V64:$Rm)>;
+def : Pat<(xor (v4i16 (AArch64vashr (v4i16 (sub V64:$Rn, V64:$Rm)), (i32 15))),
+               (v4i16 (add (v4i16 (sub V64:$Rn, V64:$Rm)),
+                           (AArch64vashr (v4i16 (sub V64:$Rn, V64:$Rm)), (i32 15))))),
+          (SABDv4i16 V64:$Rn, V64:$Rm)>;
+def : Pat<(xor (v2i32 (AArch64vashr (v2i32 (sub V64:$Rn, V64:$Rm)), (i32 31))),
+               (v2i32 (add (v2i32 (sub V64:$Rn, V64:$Rm)),
+                           (AArch64vashr (v2i32 (sub V64:$Rn, V64:$Rm)), (i32 31))))),
+          (SABDv2i32 V64:$Rn, V64:$Rm)>;
+def : Pat<(xor (v16i8 (AArch64vashr (v16i8 (sub V128:$Rn, V128:$Rm)), (i32 7))),
+               (v16i8 (add (v16i8 (sub V128:$Rn, V128:$Rm)),
+                           (AArch64vashr (v16i8 (sub V128:$Rn, V128:$Rm)), (i32 7))))),
+          (SABDv16i8 V128:$Rn, V128:$Rm)>;
+def : Pat<(xor (v8i16 (AArch64vashr (v8i16 (sub V128:$Rn, V128:$Rm)), (i32 15))),
+               (v8i16 (add (v8i16 (sub V128:$Rn, V128:$Rm)),
+                           (AArch64vashr (v8i16 (sub V128:$Rn, V128:$Rm)), (i32 15))))),
+          (SABDv8i16 V128:$Rn, V128:$Rm)>;
+def : Pat<(xor (v4i32 (AArch64vashr (v4i32 (sub V128:$Rn, V128:$Rm)), (i32 31))),
+               (v4i32 (add (v4i32 (sub V128:$Rn, V128:$Rm)),
+                           (AArch64vashr (v4i32 (sub V128:$Rn, V128:$Rm)), (i32 31))))),
+          (SABDv4i32 V128:$Rn, V128:$Rm)>;
+
 def : Pat<(AArch64bsl (v8i8 V64:$Rd), V64:$Rn, V64:$Rm),
           (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
 def : Pat<(AArch64bsl (v4i16 V64:$Rd), V64:$Rn, V64:$Rm),
Index: test/CodeGen/AArch64/arm64-sabd.ll
===================================================================
--- test/CodeGen/AArch64/arm64-sabd.ll
+++ test/CodeGen/AArch64/arm64-sabd.ll
@@ -0,0 +1,107 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+@a = common global [4 x i32] zeroinitializer
+@b = common global [4 x i32] zeroinitializer
+@c = common global [4 x i32] zeroinitializer
+
+; CHECK-LABEL: testv4i32:
+; CHECK: sabd
+define void @testv4i32() {
+  %1 = load <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*)
+  %2 = load <4 x i32>* bitcast ([4 x i32]* @b to <4 x i32>*)
+  %3 = sub nsw <4 x i32> %1, %2
+  %4 = icmp sgt <4 x i32> %3, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %5 = sub <4 x i32> zeroinitializer, %3
+  %6 = select <4 x i1> %4, <4 x i32> %3, <4 x i32> %5
+  store <4 x i32> %6, <4 x i32>* bitcast ([4 x i32]* @c to <4 x i32>*)
+  ret void
+}
+
+@d = common global [2 x i32] zeroinitializer
+@e = common global [2 x i32] zeroinitializer
+@f = common global [2 x i32] zeroinitializer
+
+; CHECK-LABEL: testv2i32:
+; CHECK: sabd
+define void @testv2i32() {
+  %1 = load <2 x i32>* bitcast ([2 x i32]* @d to <2 x i32>*)
+  %2 = load <2 x i32>* bitcast ([2 x i32]* @e to <2 x i32>*)
+  %3 = sub nsw <2 x i32> %1, %2
+  %4 = icmp sgt <2 x i32> %3, <i32 -1, i32 -1>
+  %5 = sub <2 x i32> zeroinitializer, %3
+  %6 = select <2 x i1> %4, <2 x i32> %3, <2 x i32> %5
+  store <2 x i32> %6, <2 x i32>* bitcast ([2 x i32]* @f to <2 x i32>*)
+  ret void
+}
+
+@g = common global [8 x i16] zeroinitializer
+@h = common global [8 x i16] zeroinitializer
+@i = common global [8 x i16] zeroinitializer
+
+; CHECK-LABEL: testv8i16:
+; CHECK: sabd
+define void @testv8i16() {
+  %1 = load <8 x i16>* bitcast ([8 x i16]* @g to <8 x i16>*)
+  %2 = load <8 x i16>* bitcast ([8 x i16]* @h to <8 x i16>*)
+  %3 = sub nsw <8 x i16> %1, %2
+  %4 = icmp sgt <8 x i16> %3, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+  %5 = sub <8 x i16> zeroinitializer, %3
+  %6 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> %5
+  store <8 x i16> %6, <8 x i16>* bitcast ([8 x i16]* @i to <8 x i16>*)
+  ret void
+}
+
+@j = common global [4 x i16] zeroinitializer
+@k = common global [4 x i16] zeroinitializer
+@l = common global [4 x i16] zeroinitializer
+
+; CHECK-LABEL: testv4i16:
+; CHECK: sabd
+define void @testv4i16() {
+  %1 = load <4 x i16>* bitcast ([4 x i16]* @j to <4 x i16>*)
+  %2 = load <4 x i16>* bitcast ([4 x i16]* @k to <4 x i16>*)
+  %3 = sub nsw <4 x i16> %1, %2
+  %4 = icmp sgt <4 x i16> %3, <i16 -1, i16 -1, i16 -1, i16 -1>
+  %5 = sub <4 x i16> zeroinitializer, %3
+  %6 = select <4 x i1> %4, <4 x i16> %3, <4 x i16> %5
+  store <4 x i16> %6, <4 x i16>* bitcast ([4 x i16]* @l to <4 x i16>*)
+  ret void
+}
+
+@m = common global [16 x i8] zeroinitializer
+@n = common global [16 x i8] zeroinitializer
+@o = common global [16 x i8] zeroinitializer
+
+; CHECK-LABEL: testv16i8:
+; CHECK: sabd
+define void @testv16i8() {
+  %1 = load <16 x i8>* bitcast ([16 x i8]* @m to <16 x i8>*)
+  %2 = load <16 x i8>* bitcast ([16 x i8]* @n to <16 x i8>*)
+  %3 = sub nsw <16 x i8> %1, %2
+  %4 = icmp sgt <16 x i8> %3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  %5 = sub <16 x i8> zeroinitializer, %3
+  %6 = select <16 x i1> %4, <16 x i8> %3, <16 x i8> %5
+  store <16 x i8> %6, <16 x i8>* bitcast ([16 x i8]* @o to <16 x i8>*)
+  ret void
+}
+
+@p = common global [8 x i8] zeroinitializer
+@q = common global [8 x i8] zeroinitializer
+@r = common global [8 x i8] zeroinitializer
+
+; CHECK-LABEL: testv8i8:
+; CHECK: sabd
+define void @testv8i8() {
+  %1 = load <8 x i8>* bitcast ([8 x i8]* @p to <8 x i8>*)
+  %2 = load <8 x i8>* bitcast ([8 x i8]* @q to <8 x i8>*)
+  %3 = sub nsw <8 x i8> %1, %2
+  %4 = icmp sgt <8 x i8> %3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  %5 = sub <8 x i8> zeroinitializer, %3
+  %6 = select <8 x i1> %4, <8 x i8> %3, <8 x i8> %5
+  store <8 x i8> %6, <8 x i8>* bitcast ([8 x i8]* @r to <8 x i8>*)
+  ret void
+}
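
A note on why the patterns have this shape: by the time instruction selection
runs, the compare-and-select absolute value in the tests above has already been
turned by the DAG combiner into a branch-free shift/add/xor sequence,
abs(x) = (x + s) ^ s with s = x >>s (element_bits - 1), and that is the exact
tree each Pat spells out; one Pat per element width is needed because the shift
amount (7, 15, or 31) is the element size minus one. A minimal scalar sketch of
the identity in C (illustrative only; abs_via_shift is a made-up name, not part
of the patch):

  #include <stdint.h>

  /* s is 0 when x is non-negative and all ones when x is negative, so
   * (x + s) ^ s yields either x or ~(x - 1) == -x.  The arithmetic is done
   * in uint32_t to avoid signed-overflow UB; like the vector form, the
   * result wraps for INT32_MIN. */
  static int32_t abs_via_shift(int32_t x) {
    int32_t s = x >> 31; /* arithmetic shift: 0 or -1 */
    return (int32_t)(((uint32_t)x + (uint32_t)s) ^ (uint32_t)s);
  }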