diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -99,11 +99,6 @@
 STATISTIC(NumShiftInserts, "Number of vector shift inserts");
 STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
 
-static cl::opt<bool>
-EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
-                           cl::desc("Allow AArch64 SLI/SRI formation"),
-                           cl::init(false));
-
 // FIXME: The necessary dtprel relocations don't seem to be supported
 // well in the GNU bfd and gold linkers at the moment. Therefore, by
 // default, for now, fall back to GeneralDynamic code generation.
@@ -7893,8 +7888,9 @@
 
 // Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
 // to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
-// BUILD_VECTORs with constant element C1, C2 is a constant, and C1 == ~C2.
-// Also, logical shift right -> sri, with the same structure.
+// BUILD_VECTOR with constant element C1, C2 is a constant, and:
+//   - for the SLI case: C1 == Ones(ElemSizeInBits) >> (ElemSizeInBits - C2)
+//   - for the SRI case: C1 == Ones(ElemSizeInBits) << (ElemSizeInBits - C2)
 static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
   EVT VT = N->getValueType(0);
 
@@ -7927,15 +7923,21 @@
   if (!isAllConstantBuildVector(And.getOperand(1), C1))
     return SDValue();
 
-  // Is C1 == ~C2, taking into account how much one can shift elements of a
-  // particular size?
+  // Is C1 == Ones(ElemSizeInBits) << (ElemSizeInBits - C2) or
+  // C1 == Ones(ElemSizeInBits) >> (ElemSizeInBits - C2), taking into account
+  // how much one can shift elements of a particular size?
   uint64_t C2 = C2node->getZExtValue();
   unsigned ElemSizeInBits = VT.getScalarSizeInBits();
   if (C2 > ElemSizeInBits)
     return SDValue();
   unsigned ElemMask = (1 << ElemSizeInBits) - 1;
-  if ((C1 & ElemMask) != (~C2 & ElemMask))
-    return SDValue();
+  if (IsShiftRight) {
+    if ((C1 & ElemMask) != ((ElemMask << (ElemSizeInBits - C2)) & ElemMask))
+      return SDValue();
+  } else {
+    if ((C1 & ElemMask) != ((ElemMask >> (ElemSizeInBits - C2)) & ElemMask))
+      return SDValue();
+  }
 
   SDValue X = And.getOperand(0);
   SDValue Y = Shift.getOperand(0);
@@ -7959,10 +7961,8 @@
 SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
                                              SelectionDAG &DAG) const {
   // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
-  if (EnableAArch64SlrGeneration) {
-    if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
-      return Res;
-  }
+  if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
+    return Res;
 
   EVT VT = Op.getValueType();
 
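For intuition behind the corrected predicate: SLI shifts Y left by C2 and inserts it into X, so only the low C2 bits of X survive and the AND mask must be Ones >> (ElemSizeInBits - C2); SRI shifts Y right, preserving the high C2 bits of X, so the mask must be Ones << (ElemSizeInBits - C2). The following is a minimal standalone C++ sketch of that check, separate from the patch itself; the helper names onesMask and requiredAndMask are illustrative and do not exist in the LLVM sources.

#include <cassert>
#include <cstdint>

// Standalone sketch, not part of the patch.
// All-ones value for one vector element, e.g. 0xFF for i8.
static uint64_t onesMask(unsigned ElemSizeInBits) {
  return ElemSizeInBits == 64 ? ~UINT64_C(0)
                              : (UINT64_C(1) << ElemSizeInBits) - 1;
}

// The AND mask C1 required for (or (and X, C1), (shift Y, C2)) to be a
// shift-insert, mirroring the corrected check in tryLowerToSLI above.
static uint64_t requiredAndMask(bool IsShiftRight, unsigned ElemSizeInBits,
                                uint64_t C2) {
  uint64_t Ones = onesMask(ElemSizeInBits);
  // SRI: Y >> C2 fills the low bits; X contributes the high C2 bits.
  if (IsShiftRight)
    return (Ones << (ElemSizeInBits - C2)) & Ones;
  // SLI: Y << C2 fills the high bits; X contributes the low C2 bits.
  return (Ones >> (ElemSizeInBits - C2)) & Ones;
}

int main() {
  // i8 elements shifted by #3, matching the tests below.
  assert(requiredAndMask(/*IsShiftRight=*/false, 8, 3) == 7);   // 0b00000111
  assert(requiredAndMask(/*IsShiftRight=*/true, 8, 3) == 224);  // 0b11100000
  // The old check C1 == ~C2 instead demanded 252 (~3 & 0xFF), which is not
  // the set of bits a shift-insert actually preserves.
  assert((~UINT64_C(3) & 0xFF) == 252);
  return 0;
}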
diff --git a/llvm/test/CodeGen/AArch64/arm64-sli-sri-opt.ll b/llvm/test/CodeGen/AArch64/arm64-sli-sri-opt.ll
--- a/llvm/test/CodeGen/AArch64/arm64-sli-sri-opt.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-sli-sri-opt.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -aarch64-shift-insert-generation=true -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
 
 define void @testLeftGood(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind {
 ; CHECK-LABEL: testLeftGood:
 ; CHECK: sli.16b v0, v1, #3
-  %and.i = and <16 x i8> %src1, <i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252>
+  %and.i = and <16 x i8> %src1, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   %vshl_n = shl <16 x i8> %src2, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
   %result = or <16 x i8> %and.i, %vshl_n
   store <16 x i8> %result, <16 x i8>* %dest, align 16
@@ -23,7 +23,7 @@
 define void @testRightGood(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind {
 ; CHECK-LABEL: testRightGood:
 ; CHECK: sri.16b v0, v1, #3
-  %and.i = and <16 x i8> %src1, <i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252>
+  %and.i = and <16 x i8> %src1, <i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224>
   %vshl_n = lshr <16 x i8> %src2, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
   %result = or <16 x i8> %and.i, %vshl_n
   store <16 x i8> %result, <16 x i8>* %dest, align 16
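Note on the test updates: with i8 elements and a shift of #3, the corrected predicate requires an AND mask of 0xFF >> 5 = 7 for the shl/SLI test and (0xFF << 5) & 0xFF = 224 for the lshr/SRI test, hence the new splat constants; the old C1 == ~C2 check accepted 252 (~3 & 0xFF) for both directions. With the aarch64-shift-insert-generation flag removed, the updated RUN line exercises the combine by default.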