Index: lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -926,6 +926,73 @@
   return nullptr;
 }
 
+/// Convert a table lookup to shufflevector if the mask is constant.
+/// This can only benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in
+/// which case we could lower the shufflevector with rev64 instructions
+/// as it's actually a byte reverse.
+static Value *simplifyTableLookup(const IntrinsicInst &II,
+                                  const DataLayout &DL,
+                                  InstCombiner::BuilderTy &Builder) {
+  auto *VecTy = cast<VectorType>(II.getType());
+  unsigned NumElts = VecTy->getNumElements();
+
+  // Only the <8 x i8> variants are handled; tbl1 also comes in <16 x i8>.
+  if (NumElts != 8)
+    return nullptr;
+
+  auto *Mask = dyn_cast<Constant>(II.getArgOperand(1));
+
+  // If the mask is coming from a vector load, try to fold it.
+  if (!Mask) {
+    auto *Vld1 = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
+
+    if (Vld1 && Vld1->getIntrinsicID() == Intrinsic::arm_neon_vld1) {
+      // Strip off any GEP address adjustments and pointer
+      // casts from the original object being addressed.
+      Value *V = GetUnderlyingObject(Vld1->getArgOperand(0), DL);
+
+      if (auto *GV = dyn_cast<GlobalVariable>(V)) {
+        if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
+          Constant *C = GV->getInitializer();
+          SmallVector<Constant *, 8> NewElements;
+
+          for (unsigned I = 0; I < NumElts; ++I) {
+            Constant *Elt = C->getAggregateElement(I);
+            if (!Elt)
+              return nullptr;
+            NewElements.push_back(Elt);
+          }
+          Mask = ConstantVector::get(NewElements);
+        }
+      }
+    }
+  }
+
+  if (!Mask)
+    return nullptr;
+
+  Constant *Indexes[8] = {nullptr};
+  auto *EltTy = Type::getInt32Ty(II.getContext());
+
+  // Check whether the mask matches the pattern { 7,6,5,4,3,2,1,0 }.
+  for (unsigned I = 0; I < NumElts; ++I) {
+    Constant *COp = Mask->getAggregateElement(I);
+    if (!COp || !isa<ConstantInt>(COp))
+      return nullptr;
+
+    uint8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();
+    if (Index != NumElts - 1 - I)
+      return nullptr;
+
+    Indexes[I] = ConstantInt::get(EltTy, Index);
+  }
+
+  auto *ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, NumElts));
+  auto *V1 = II.getArgOperand(0);
+  auto *V2 = Constant::getNullValue(V1->getType());
+  return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
+}
+
 /// Attempt to convert pshufb* to shufflevector if the mask is constant.
 static Value *simplifyX86pshufb(const IntrinsicInst &II,
                                 InstCombiner::BuilderTy &Builder) {
@@ -2995,6 +3062,13 @@
     break;
   }
 
+  case Intrinsic::arm_neon_vtbl1:
+  case Intrinsic::aarch64_neon_tbl1:
+    if (Value *V = simplifyTableLookup(*II, DL, Builder)) {
+      return replaceInstUsesWith(*II, V);
+    }
+    break;
+
   case Intrinsic::arm_neon_vmulls:
   case Intrinsic::arm_neon_vmullu:
   case Intrinsic::aarch64_neon_smull:
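Note: a sketch of the source-level pattern the constant-mask path targets,
assuming Clang lowers vtbl1_u8 to the NEON table-lookup intrinsics above
(hypothetical example; the function name is not part of the patch):

    #include <arm_neon.h>

    /* A table lookup with the reversed indices { 7,6,5,4,3,2,1,0 } reads
       the lanes back to front, i.e. it is a byte reverse. After this
       combine it becomes a shufflevector, which the backend can select
       as a single rev64 instead of materializing a table and mask. */
    uint8x8_t byte_reverse(uint8x8_t v) {
      const uint8x8_t mask = {7, 6, 5, 4, 3, 2, 1, 0};
      return vtbl1_u8(v, mask);
    }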
Index: test/Transforms/InstCombine/AArch64/table-lookup.ll
===================================================================
--- /dev/null
+++ test/Transforms/InstCombine/AArch64/table-lookup.ll
@@ -0,0 +1,14 @@
+; RUN: opt -instcombine -S -o - %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-arm-none-eabi"
+
+define <8 x i8> @table_lookup(<16 x i8> %vec) {
+entry:
+; CHECK-NOT: call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8
+; CHECK: shufflevector <16 x i8> %vec, <16 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  %tbl1 = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %vec, <8 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  ret <8 x i8> %tbl1
+}
+
+declare <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8>, <8 x i8>)
Index: test/Transforms/InstCombine/ARM/table-lookup.ll
===================================================================
--- /dev/null
+++ test/Transforms/InstCombine/ARM/table-lookup.ll
@@ -0,0 +1,19 @@
+; RUN: opt -instcombine -S -o - %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv8-arm-none-eabi"
+
+@big_endian_mask = hidden constant [8 x i8] c"\07\06\05\04\03\02\01\00", align 16
+
+define <8 x i8> @table_lookup(<8 x i8> %vec) {
+entry:
+; CHECK-NOT: call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0i8
+; CHECK-NOT: call <8 x i8> @llvm.arm.neon.vtbl1
+; CHECK: shufflevector <8 x i8> %vec, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  %mask = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0i8(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @big_endian_mask, i32 0, i32 0), i32 16)
+  %vtbl1 = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %vec, <8 x i8> %mask)
+  ret <8 x i8> %vtbl1
+}
+
+declare <8 x i8> @llvm.arm.neon.vld1.v8i8.p0i8(i8*, i32)
+declare <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8>, <8 x i8>)
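Note: the ARM test models the case where the mask is not an immediate
constant but is loaded from a constant global, roughly as in this sketch
(assuming Clang emits @llvm.arm.neon.vld1 for vld1_u8 on this target;
names are illustrative, not taken from the patch):

    #include <arm_neon.h>

    static const uint8_t big_endian_mask[8] = {7, 6, 5, 4, 3, 2, 1, 0};

    uint8x8_t byte_reverse_via_load(uint8x8_t v) {
      /* The combine looks through the vld1 to the global's initializer,
         recovers the constant mask, and then folds the table lookup. */
      return vtbl1_u8(v, vld1_u8(big_endian_mask));
    }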