diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1381,6 +1381,79 @@
   return None;
 }
 
+static Optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
+                                                      IntrinsicInst &II) {
+  IRBuilder<> Builder(II.getContext());
+  Builder.SetInsertPoint(&II);
+
+  auto VecTy = cast<ScalableVectorType>(II.getType());
+  Type *VecElTy = VecTy->getScalarType();
+
+  Type *WideElTy = nullptr;
+  if (VecElTy->isFloatTy())
+    WideElTy = Builder.getDoubleTy();
+  if (VecElTy->isHalfTy())
+    WideElTy = Builder.getFloatTy();
+  if (WideElTy == nullptr)
+    return None;
+
+  auto VecInsert = dyn_cast<IntrinsicInst>(II.getOperand(0));
+  if (!VecInsert || VecInsert->getIntrinsicID() != Intrinsic::vector_insert)
+    return None;
+
+  // Capture the sequence of scalars in reverse, i.e. y(3), x(2), y(1), x(0)
+  SmallVector<Value *> RSequence;
+  Value *CurrentValue = VecInsert->getOperand(1);
+  uint64_t NumElements = VecTy->getMinNumElements();
+  for (uint64_t i = NumElements; i > 0; i--) {
+    InsertElementInst *InsertElt = dyn_cast<InsertElementInst>(CurrentValue);
+    if (!InsertElt)
+      break;
+
+    // Bail if the insertelement indices aren't descending linearly
+    auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
+    if (Idx->getValue() != APInt(64, i - 1))
+      return None;
+
+    RSequence.push_back(InsertElt->getOperand(1));
+    CurrentValue = InsertElt->getOperand(0);
+  }
+
+  if (RSequence.size() < NumElements)
+    return None;
+
+  // Check for the pattern (y, x, y, x) or (y, x, y, x, y, x, y, x)
+  for (uint64_t i = 0; i < NumElements - 2; i++)
+    if (RSequence[i] != RSequence[i + 2])
+      return None;
+
+  // Simplify the redundant long chain of insertelement's to two insertelements
+  // splatted as one widened element
+  auto Zero = ConstantInt::get(Builder.getInt64Ty(), APInt::getZero(64));
+  auto One = ConstantInt::get(Builder.getInt64Ty(), APInt(64, 1));
+  Value *InsertEltX = Builder.CreateInsertElement(
+      UndefValue::get(CurrentValue->getType()), RSequence[1], Zero);
+  Value *InsertEltY =
+      Builder.CreateInsertElement(InsertEltX, RSequence[0], One);
+  CallInst *InsertSubvec = Builder.CreateInsertVector(
+      II.getType(), UndefValue::get(II.getType()), InsertEltY, Zero);
+
+  uint64_t HalfNumElements = NumElements / 2;
+  auto WideScalableTy = ScalableVectorType::get(WideElTy, HalfNumElements);
+  Value *WideCast =
+      Builder.CreateBitOrPointerCast(InsertSubvec, WideScalableTy);
+
+  auto ShuffleMaskTy =
+      ScalableVectorType::get(Builder.getInt32Ty(), HalfNumElements);
+  auto ShuffleMask = ConstantAggregateZero::get(ShuffleMaskTy);
+
+  Value *Shuffle = Builder.CreateShuffleVector(
+      WideCast, PoisonValue::get(WideScalableTy), ShuffleMask);
+  Value *RetCast = Builder.CreateBitOrPointerCast(Shuffle, II.getType());
+
+  return IC.replaceInstUsesWith(II, RetCast);
+}
+
 static Optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
                                                    IntrinsicInst &II) {
   Value *A = II.getArgOperand(0);
@@ -1497,6 +1570,8 @@
     return instCombineSVESel(IC, II);
   case Intrinsic::aarch64_sve_srshl:
     return instCombineSVESrshl(IC, II);
+  case Intrinsic::aarch64_sve_dupq_lane:
+    return instCombineSVEDupqLane(IC, II);
   }
 
   return None;
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
@@ -587,49 +587,35 @@
 define dso_local <vscale x 4 x float> @dupq_f32_repeat_complex(float %x, float %y) {
 ; CHECK-LABEL: dupq_f32_repeat_complex:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
-; CHECK-NEXT:    mov v2.16b, v0.16b
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
 ; CHECK-NEXT:    // kill: def $s1 killed $s1 def $q1
-; CHECK-NEXT:    mov v2.s[1], v1.s[0]
-; CHECK-NEXT:    mov v2.s[2], v0.s[0]
-; CHECK-NEXT:    mov v2.s[3], v1.s[0]
-; CHECK-NEXT:    mov z0.q, q2
+; CHECK-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-NEXT:    mov z0.d, d0
 ; CHECK-NEXT:    ret
   %1 = insertelement <4 x float> undef, float %x, i64 0
   %2 = insertelement <4 x float> %1, float %y, i64 1
-  %3 = insertelement <4 x float> %2, float %x, i64 2
-  %4 = insertelement <4 x float> %3, float %y, i64 3
-  %5 = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %4, i64 0)
-  %6 = tail call <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %5, i64 0)
+  %3 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %2, i64 0)
+  %4 = bitcast <vscale x 4 x float> %3 to <vscale x 2 x double>
+  %5 = shufflevector <vscale x 2 x double> %4, <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
+  %6 = bitcast <vscale x 2 x double> %5 to <vscale x 4 x float>
   ret <vscale x 4 x float> %6
 }
 
-define dso_local <vscale x 8 x half> @dupq_f16_repeat_complex(half %a, half %b) {
+define dso_local <vscale x 8 x half> @dupq_f16_repeat_complex(half %x, half %y) {
 ; CHECK-LABEL: dupq_f16_repeat_complex:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $q0
-; CHECK-NEXT:    mov v2.16b, v0.16b
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
 ; CHECK-NEXT:    // kill: def $h1 killed $h1 def $q1
-; CHECK-NEXT:    mov v2.h[1], v1.h[0]
-; CHECK-NEXT:    mov v2.h[2], v0.h[0]
-; CHECK-NEXT:    mov v2.h[3], v1.h[0]
-; CHECK-NEXT:    mov v2.h[4], v0.h[0]
-; CHECK-NEXT:    mov v2.h[5], v1.h[0]
-; CHECK-NEXT:    mov v2.h[6], v0.h[0]
-; CHECK-NEXT:    mov v2.h[7], v1.h[0]
-; CHECK-NEXT:    mov z0.q, q2
-; CHECK-NEXT:    ret
-  %1 = insertelement <8 x half> undef, half %a, i64 0
-  %2 = insertelement <8 x half> %1, half %b, i64 1
-  %3 = insertelement <8 x half> %2, half %a, i64 2
-  %4 = insertelement <8 x half> %3, half %b, i64 3
-  %5 = insertelement <8 x half> %4, half %a, i64 4
-  %6 = insertelement <8 x half> %5, half %b, i64 5
-  %7 = insertelement <8 x half> %6, half %a, i64 6
-  %8 = insertelement <8 x half> %7, half %b, i64 7
-  %9 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> %8, i64 0)
-  %10 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %9, i64 0)
-  ret <vscale x 8 x half> %10
+; CHECK-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-NEXT:    mov z0.s, s0
+; CHECK-NEXT:    ret
+  %1 = insertelement <8 x half> undef, half %x, i64 0
+  %2 = insertelement <8 x half> %1, half %y, i64 1
+  %3 = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> %2, i64 0)
+  %4 = bitcast <vscale x 8 x half> %3 to <vscale x 4 x float>
+  %5 = shufflevector <vscale x 4 x float> %4, <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+  %6 = bitcast <vscale x 4 x float> %5 to <vscale x 8 x half>
+  ret <vscale x 8 x half> %6
 }
 
 define <vscale x 16 x i8> @ext_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-dupqlane.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-dupqlane.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-dupqlane.ll
@@ -0,0 +1,172 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define dso_local <vscale x 4 x float> @dupq_f32_repeat_complex(float %x, float %y) {
+; CHECK-LABEL: @dupq_f32_repeat_complex(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> undef, float [[X:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[Y:%.*]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <vscale x 4 x float> [[TMP3]] to <vscale x 2 x double>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <vscale x 2 x double> [[TMP4]], <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <vscale x 2 x double> [[TMP5]] to <vscale x 4 x float>
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP6]]
+;
+  %1 = insertelement <4 x float> undef, float %x, i64 0
+  %2 = insertelement <4 x float> %1, float %y, i64 1
+  %3 = insertelement <4 x float> %2, float %x, i64 2
+  %4 = insertelement <4 x float> %3, float %y, i64 3
+  %5 = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %4, i64 0)
+  %6 = tail call <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %5, i64 0)
+  ret <vscale x 4 x float> %6
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_repeat_complex(half %a, half %b) {
+; CHECK-LABEL: @dupq_f16_repeat_complex(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x half> undef, half [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[B:%.*]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <vscale x 8 x half> [[TMP3]] to <vscale x 4 x float>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <vscale x 4 x float> [[TMP4]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <vscale x 4 x float> [[TMP5]] to <vscale x 8 x half>
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP6]]
+;
+  %1 = insertelement <8 x half> undef, half %a, i64 0
+  %2 = insertelement <8 x half> %1, half %b, i64 1
+  %3 = insertelement <8 x half> %2, half %a, i64 2
+  %4 = insertelement <8 x half> %3, half %b, i64 3
+  %5 = insertelement <8 x half> %4, half %a, i64 4
+  %6 = insertelement <8 x half> %5, half %b, i64 5
+  %7 = insertelement <8 x half> %6, half %a, i64 6
+  %8 = insertelement <8 x half> %7, half %b, i64 7
+  %9 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> %8, i64 0)
+  %10 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %9, i64 0)
+  ret <vscale x 8 x half> %10
+}
+
+define dso_local <vscale x 4 x float> @dupq_f32_complex_no_pattern(float %x, float %y, float %z) {
+; CHECK-LABEL: @dupq_f32_complex_no_pattern(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> undef, float [[X:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[Y:%.*]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[Z:%.*]], i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[Y]], i64 3
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> [[TMP4]], i64 0)
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> [[TMP5]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP6]]
+;
+  %1 = insertelement <4 x float> undef, float %x, i64 0
+  %2 = insertelement <4 x float> %1, float %y, i64 1
+  %3 = insertelement <4 x float> %2, float %z, i64 2
+  %4 = insertelement <4 x float> %3, float %y, i64 3
+  %5 = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %4, i64 0)
+  %6 = tail call <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %5, i64 0)
+  ret <vscale x 4 x float> %6
+}
+
+define dso_local <vscale x 4 x float> @dupq_f32_complex_rev(float %x, float %y) {
+; CHECK-LABEL: @dupq_f32_complex_rev(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> undef, float [[X:%.*]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[Y:%.*]], i64 2
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[X]], i64 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[Y]], i64 0
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> [[TMP4]], i64 0)
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> [[TMP5]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP6]]
+;
+  %1 = insertelement <4 x float> undef, float %x, i64 3
+  %2 = insertelement <4 x float> %1, float %y, i64 2
+  %3 = insertelement <4 x float> %2, float %x, i64 1
+  %4 = insertelement <4 x float> %3, float %y, i64 0
+  %5 = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %4, i64 0)
+  %6 = tail call <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %5, i64 0)
+  ret <vscale x 4 x float> %6
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_complex_missing_middle(half %a, half %b) {
+; CHECK-LABEL: @dupq_f16_complex_missing_middle(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x half> undef, half [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[B:%.*]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[A]], i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[B]], i64 3
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half [[A]], i64 6
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half [[B]], i64 7
+; CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> [[TMP6]], i64 0)
+; CHECK-NEXT:    [[TMP8:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> [[TMP7]], i64 0)
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP8]]
+;
+  %1 = insertelement <8 x half> undef, half %a, i64 0
+  %2 = insertelement <8 x half> %1, half %b, i64 1
+  %3 = insertelement <8 x half> %2, half %a, i64 2
+  %4 = insertelement <8 x half> %3, half %b, i64 3
+  %5 = insertelement <8 x half> %4, half %a, i64 6
+  %6 = insertelement <8 x half> %5, half %b, i64 7
+  %7 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> %6, i64 0)
+  %8 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %7, i64 0)
+  ret <vscale x 8 x half> %8
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_repeat_complex_bad_front(half %a, half %b) {
+; CHECK-LABEL: @dupq_f16_repeat_complex_bad_front(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x half> undef, half [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[A]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[A]], i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[B:%.*]], i64 3
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half [[A]], i64 4
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half [[B]], i64 5
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half [[A]], i64 6
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x half> [[TMP7]], half [[B]], i64 7
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> [[TMP8]], i64 0)
+; CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> [[TMP9]], i64 0)
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP10]]
+;
+  %1 = insertelement <8 x half> undef, half %a, i64 0
+  %2 = insertelement <8 x half> %1, half %a, i64 1
+  %3 = insertelement <8 x half> %2, half %a, i64 2
+  %4 = insertelement <8 x half> %3, half %b, i64 3
+  %5 = insertelement <8 x half> %4, half %a, i64 4
+  %6 = insertelement <8 x half> %5, half %b, i64 5
+  %7 = insertelement <8 x half> %6, half %a, i64 6
+  %8 = insertelement <8 x half> %7, half %b, i64 7
+  %9 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> %8, i64 0)
+  %10 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %9, i64 0)
+  ret <vscale x 8 x half> %10
+}
+
+define dso_local <vscale x 8 x half> @dupq_f16_repeat_complex_bad_back(half %a, half %b) {
+; CHECK-LABEL: @dupq_f16_repeat_complex_bad_back(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x half> undef, half [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[B:%.*]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[A]], i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[B]], i64 3
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half [[A]], i64 4
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half [[B]], i64 5
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half [[A]], i64 6
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x half> [[TMP7]], half [[A]], i64 7
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> [[TMP8]], i64 0)
+; CHECK-NEXT:    [[TMP10:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> [[TMP9]], i64 0)
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP10]]
+;
+  %1 = insertelement <8 x half> undef, half %a, i64 0
+  %2 = insertelement <8 x half> %1, half %b, i64 1
+  %3 = insertelement <8 x half> %2, half %a, i64 2
+  %4 = insertelement <8 x half> %3, half %b, i64 3
+  %5 = insertelement <8 x half> %4, half %a, i64 4
+  %6 = insertelement <8 x half> %5, half %b, i64 5
+  %7 = insertelement <8 x half> %6, half %a, i64 6
+  %8 = insertelement <8 x half> %7, half %a, i64 7
+  %9 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> %8, i64 0)
+  %10 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %9, i64 0)
+  ret <vscale x 8 x half> %10
+}
+
+declare <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half>, <8 x half>, i64)
+declare <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half>, i64)
+declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float>, <4 x float>, i64)
+declare <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float>, i64)
+declare <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32>, <4 x i32>, i64)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32>, i64)
+declare <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16>, <8 x i16>, i64)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16>, i64)
+
+attributes #0 = { "target-features"="+sve" }