Index: llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
===================================================================
--- llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
+++ llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
@@ -78,6 +78,7 @@
   static bool optimizeConvertFromSVBool(IntrinsicInst *I);
   static bool optimizePTest(IntrinsicInst *I);
   static bool optimizeVectorMul(IntrinsicInst *I);
+  static bool optimizeTBL(IntrinsicInst *I);
 
   static bool processPhiNode(IntrinsicInst *I);
 };
@@ -437,6 +438,39 @@
   return Changed;
 }
 
+bool SVEIntrinsicOpts::optimizeTBL(IntrinsicInst *I) {
+  assert(I->getIntrinsicID() == Intrinsic::aarch64_sve_tbl &&
+         "Unexpected opcode");
+
+  auto *OpVal = I->getOperand(0);
+  auto *OpIndices = I->getOperand(1);
+  VectorType *VTy = cast<VectorType>(I->getType());
+
+  // Check whether OpIndices is an aarch64_sve_dup_x intrinsic call with
+  // constant splat value < minimal element count of result.
+  auto *DupXIntrI = dyn_cast<IntrinsicInst>(OpIndices);
+  if (!DupXIntrI || DupXIntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup_x)
+    return false;
+
+  auto *SplatValue = dyn_cast<ConstantInt>(DupXIntrI->getOperand(0));
+  if (!SplatValue ||
+      SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
+    return false;
+
+  // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
+  // sve_dup_x(extractelement(OpVal, SplatValue)) for further optimization.
+  LLVMContext &Ctx = I->getContext();
+  IRBuilder<> Builder(Ctx);
+  Builder.SetInsertPoint(I);
+  auto *Extract = Builder.CreateExtractElement(OpVal, SplatValue);
+  auto *NewDupX =
+      Builder.CreateIntrinsic(Intrinsic::aarch64_sve_dup_x, {VTy}, {Extract});
+
+  I->replaceAllUsesWith(NewDupX);
+  I->eraseFromParent();
+  return true;
+}
+
 bool SVEIntrinsicOpts::optimizeConvertFromSVBool(IntrinsicInst *I) {
   assert(I->getIntrinsicID() == Intrinsic::aarch64_sve_convert_from_svbool &&
          "Unexpected opcode");
@@ -507,6 +541,8 @@
   case Intrinsic::aarch64_sve_ptest_first:
   case Intrinsic::aarch64_sve_ptest_last:
     return optimizePTest(IntrI);
+  case Intrinsic::aarch64_sve_tbl:
+    return optimizeTBL(IntrI);
   default:
     return false;
   }
@@ -560,6 +596,7 @@
     case Intrinsic::aarch64_sve_ptrue:
     case Intrinsic::aarch64_sve_mul:
     case Intrinsic::aarch64_sve_fmul:
+    case Intrinsic::aarch64_sve_tbl:
       for (User *U : F.users())
         Functions.insert(cast<Instruction>(U)->getFunction());
       break;
Index: llvm/test/CodeGen/AArch64/sve-tbl-dupx.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve-tbl-dupx.ll
@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -aarch64-sve-intrinsic-opts -dce < %s 2>%t | FileCheck %s
+; op2 = tbl(op1 dup_x(idx)) -> op2 = dup_x(extractelement(op1, idx))
+
+define <vscale x 16 x i8> @dup_ext_i8(<vscale x 16 x i8> %data) {
+; CHECK-LABEL: @dup_ext_i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <vscale x 16 x i8> [[DATA:%.*]], i8 1
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 16 x i8> @llvm.aarch64.sve.dup.x.nxv16i8(i8 [[TMP1]])
+; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP2]]
+;
+  %tmp = call <vscale x 16 x i8> @llvm.aarch64.sve.dup.x.nxv16i8(i8 1)
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.tbl.nxv16i8(<vscale x 16 x i8> %data, <vscale x 16 x i8> %tmp)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @dup_ext_i16(<vscale x 8 x i16> %data) {
+; CHECK-LABEL: @dup_ext_i16(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <vscale x 8 x i16> [[DATA:%.*]], i16 1
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 [[TMP1]])
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP2]]
+;
+  %tmp = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 1)
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.tbl.nxv8i16(<vscale x 8 x i16> %data, <vscale x 8 x i16> %tmp)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @dup_ext_i32(<vscale x 4 x i32> %data) {
+; CHECK-LABEL: @dup_ext_i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <vscale x 4 x i32> [[DATA:%.*]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 [[TMP1]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
+;
+  %tmp = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.tbl.nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x i32> %tmp)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @dup_ext_i64(<vscale x 2 x i64> %data) {
+; CHECK-LABEL: @dup_ext_i64(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <vscale x 2 x i64> [[DATA:%.*]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 [[TMP1]])
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
+;
+  %tmp = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 1)
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.tbl.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i64> %tmp)
+  ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 8 x half> @dup_ext_f16(<vscale x 8 x half> %data) {
+; CHECK-LABEL: @dup_ext_f16(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <vscale x 8 x half> [[DATA:%.*]], i16 1
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half [[TMP1]])
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP2]]
+;
+  %tmp = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 1)
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.tbl.nxv8f16(<vscale x 8 x half> %data, <vscale x 8 x i16> %tmp)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @dup_ext_f32(<vscale x 4 x float> %data) {
+; CHECK-LABEL: @dup_ext_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <vscale x 4 x float> [[DATA:%.*]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float [[TMP1]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP2]]
+;
+  %tmp = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.tbl.nxv4f32(<vscale x 4 x float> %data, <vscale x 4 x i32> %tmp)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double> @dup_ext_f64(<vscale x 2 x double> %data) {
+; CHECK-LABEL: @dup_ext_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <vscale x 2 x double> [[DATA:%.*]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double [[TMP1]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP2]]
+;
+  %tmp = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 1)
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.tbl.nxv2f64(<vscale x 2 x double> %data, <vscale x 2 x i64> %tmp)
+  ret <vscale x 2 x double> %out
+}
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.dup.x.nxv16i8(i8)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64)
+declare <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half)
+declare <vscale x 2 x float> @llvm.aarch64.sve.dup.x.nxv2f32(float)
+declare <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float)
+declare <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double)
+declare <vscale x 16 x i8> @llvm.aarch64.sve.tbl.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.tbl.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.tbl.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.tbl.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
+declare <vscale x 8 x half> @llvm.aarch64.sve.tbl.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i16>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.tbl.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i32>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.tbl.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i64>)
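For reference, a minimal sketch of the negative case guarded by the uge check in optimizeTBL: when the splatted index is not below the minimum element count of the result type, the transform bails out and the tbl call is left as-is. This IR is illustrative only and not part of the patch or its test; the function name @dup_ext_i8_out_of_range is hypothetical and the dup.x/tbl declarations are assumed to match those in sve-tbl-dupx.ll above.

; Index splat 16 is not < 16 (the minimum element count of <vscale x 16 x i8>),
; so aarch64-sve-intrinsic-opts keeps the tbl call unchanged.
define <vscale x 16 x i8> @dup_ext_i8_out_of_range(<vscale x 16 x i8> %data) {
  %tmp = call <vscale x 16 x i8> @llvm.aarch64.sve.dup.x.nxv16i8(i8 16)
  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.tbl.nxv16i8(<vscale x 16 x i8> %data, <vscale x 16 x i8> %tmp)
  ret <vscale x 16 x i8> %out
}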