diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -438,6 +438,18 @@
   return IC.replaceInstUsesWith(II, Insert);
 }
 
+static Optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
+                                                  IntrinsicInst &II) {
+  // Replace DupX with a regular IR splat.
+  IRBuilder<> Builder(II.getContext());
+  Builder.SetInsertPoint(&II);
+  auto *RetTy = cast<ScalableVectorType>(II.getType());
+  Value *Splat =
+      Builder.CreateVectorSplat(RetTy->getElementCount(), II.getArgOperand(0));
+
+  return IC.replaceInstUsesWith(II, Splat);
+}
+
 static Optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
                                                    IntrinsicInst &II) {
   LLVMContext &Ctx = II.getContext();
@@ -455,12 +467,9 @@
     return None;
 
   // Check that we have a compare of zero..
-  auto *DupX = dyn_cast<IntrinsicInst>(II.getArgOperand(2));
-  if (!DupX || DupX->getIntrinsicID() != Intrinsic::aarch64_sve_dup_x)
-    return None;
-
-  auto *DupXArg = dyn_cast<ConstantInt>(DupX->getArgOperand(0));
-  if (!DupXArg || !DupXArg->isZero())
+  auto *SplatValue =
+      dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
+  if (!SplatValue || !SplatValue->isZero())
     return None;
 
   // ..against a dupq
@@ -693,14 +702,11 @@
   IRBuilder<> Builder(II.getContext());
   Builder.SetInsertPoint(&II);
 
-  // Return true if a given instruction is an aarch64_sve_dup_x intrinsic call
-  // with a unit splat value, false otherwise.
-  auto IsUnitDupX = [](auto *I) {
-    auto *IntrI = dyn_cast<IntrinsicInst>(I);
-    if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup_x)
+  // Return true if a given instruction is a unit splat value, false otherwise.
+  auto IsUnitSplat = [](auto *I) {
+    auto *SplatValue = getSplatValue(I);
+    if (!SplatValue)
       return false;
-
-    auto *SplatValue = IntrI->getOperand(0);
     return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
   };
 
@@ -717,10 +723,10 @@
   // The OpMultiplier variable should always point to the dup (if any), so
   // swap if necessary.
-  if (IsUnitDup(OpMultiplicand) || IsUnitDupX(OpMultiplicand))
+  if (IsUnitDup(OpMultiplicand) || IsUnitSplat(OpMultiplicand))
     std::swap(OpMultiplier, OpMultiplicand);
 
-  if (IsUnitDupX(OpMultiplier)) {
+  if (IsUnitSplat(OpMultiplier)) {
     // [f]mul pg (dupx 1) %n => %n
     OpMultiplicand->takeName(&II);
     return IC.replaceInstUsesWith(II, OpMultiplicand);
@@ -767,13 +773,9 @@
   auto *OpIndices = II.getOperand(1);
   VectorType *VTy = cast<VectorType>(II.getType());
 
-  // Check whether OpIndices is an aarch64_sve_dup_x intrinsic call with
-  // constant splat value < minimal element count of result.
-  auto *DupXIntrI = dyn_cast<IntrinsicInst>(OpIndices);
-  if (!DupXIntrI || DupXIntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup_x)
-    return None;
-
-  auto *SplatValue = dyn_cast<ConstantInt>(DupXIntrI->getOperand(0));
+  // Check whether OpIndices is a constant splat value < minimal element count
+  // of result.
+  auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
   if (!SplatValue ||
       SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
     return None;
 
@@ -801,6 +803,8 @@
     return instCombineConvertFromSVBool(IC, II);
   case Intrinsic::aarch64_sve_dup:
     return instCombineSVEDup(IC, II);
+  case Intrinsic::aarch64_sve_dup_x:
+    return instCombineSVEDupX(IC, II);
   case Intrinsic::aarch64_sve_cmpne:
   case Intrinsic::aarch64_sve_cmpne_wide:
     return instCombineSVECmpNE(IC, II);
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul-idempotency.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul-idempotency.ll
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul-idempotency.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul-idempotency.ll
@@ -53,8 +53,7 @@
 ; Edge case -- make sure that the case where we're fmultiplying two dups
 ; together is sane.
 ; CHECK-LABEL: @idempotent_fmul_two_dups(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half 0xH3C00)
-; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP1]]
+; CHECK-NEXT:    ret <vscale x 8 x half> shufflevector (<vscale x 8 x half> insertelement (<vscale x 8 x half> poison, half 0xH3C00, i32 0), <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer)
 ;
   %1 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half 1.0)
   %2 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half 1.0)
@@ -65,9 +64,8 @@
 ; Non-idempotent fmuls -- we don't expect these to be optimised out.
 define <vscale x 8 x half> @non_idempotent_fmul_f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) #0 {
 ; CHECK-LABEL: @non_idempotent_fmul_f16(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half 0xH4000)
-; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[TMP1]])
-; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> shufflevector (<vscale x 8 x half> insertelement (<vscale x 8 x half> poison, half 0xH4000, i32 0), <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 8 x half> [[TMP1]]
 ;
   %1 = call <vscale x 8 x half> @llvm.aarch64.sve.dup.x.nxv8f16(half 2.0)
   %2 = call <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a, <vscale x 8 x half> %1)
@@ -76,9 +74,8 @@
 
 define <vscale x 4 x float> @non_idempotent_fmul_f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a) #0 {
 ; CHECK-LABEL: @non_idempotent_fmul_f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float 2.000000e+00)
-; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.fmul.nxv4f32(<vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x float> [[A:%.*]], <vscale x 4 x float> [[TMP1]])
-; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x float> @llvm.aarch64.sve.fmul.nxv4f32(<vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x float> [[A:%.*]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 2.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
 ;
   %1 = call <vscale x 4 x float> @llvm.aarch64.sve.dup.x.nxv4f32(float 2.0)
   %2 = call <vscale x 4 x float> @llvm.aarch64.sve.fmul.nxv4f32(<vscale x 4 x i1> %pg, <vscale x 4 x float> %a, <vscale x 4 x float> %1)
@@ -87,9 +84,8 @@
 
 define <vscale x 2 x double> @non_idempotent_fmul_f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a) #0 {
 ; CHECK-LABEL: @non_idempotent_fmul_f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 2.000000e+00)
-; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[TMP1]])
-; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 2.000000e+00, i32 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
 ;
   %1 = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double 2.0)
   %2 = call <vscale x 2 x double> @llvm.aarch64.sve.fmul.nxv2f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %a, <vscale x 2 x double> %1)
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-idempotency.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-idempotency.ll
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-idempotency.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-idempotency.ll
@@ -53,8 +53,7 @@
 ; Edge case -- make sure that the case where we're multiplying two dups
 ; together is sane.
 ; CHECK-LABEL: @idempotent_mul_two_dups(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 1)
-; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
+; CHECK-NEXT:    ret <vscale x 8 x i16> shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i32 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
 ;
   %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 1)
   %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 1)
@@ -65,9 +64,8 @@
 ; Non-idempotent muls -- we don't expect these to be optimised out.
 define <vscale x 8 x i16> @non_idempotent_mul_i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) #0 {
 ; CHECK-LABEL: @non_idempotent_mul_i16(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 2)
-; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]], <vscale x 8 x i16> [[TMP1]])
-; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> [[PG:%.*]], <vscale x 8 x i16> [[A:%.*]], <vscale x 8 x i16> shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 2, i32 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP1]]
 ;
   %1 = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.x.nxv8i16(i16 2)
   %2 = call <vscale x 8 x i16> @llvm.aarch64.sve.mul.nxv8i16(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> %1)
@@ -76,9 +74,8 @@
 
 define <vscale x 4 x i32> @non_idempotent_mul_i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) #0 {
 ; CHECK-LABEL: @non_idempotent_mul_i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 2)
-; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[TMP1]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> [[PG:%.*]], <vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 ;
   %1 = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 2)
   %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.mul.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %1)
@@ -87,9 +84,8 @@
 
 define <vscale x 2 x i64> @non_idempotent_mul_i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
 ; CHECK-LABEL: @non_idempotent_mul_i64(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 2)
-; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]], <vscale x 2 x i64> [[TMP1]])
-; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> [[PG:%.*]], <vscale x 2 x i64> [[A:%.*]], <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 2, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP1]]
 ;
   %1 = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.x.nxv2i64(i64 2)
   %2 = call <vscale x 2 x i64> @llvm.aarch64.sve.mul.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %1)
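Note (not part of the patch): the rewrite above relies on two existing LLVM APIs pairing up. IRBuilder::CreateVectorSplat emits the canonical insertelement + shufflevector splat idiom for scalable vectors, and llvm::getSplatValue (llvm/Analysis/VectorUtils.h) recovers the splatted scalar from that same idiom, so the combines no longer need to pattern-match aarch64_sve_dup_x directly. A minimal sketch of how these calls compose is below; the helper names matchZeroSplat and buildSplat are hypothetical and for illustration only, not code from this change.

// Illustrative sketch only -- not part of this patch.
#include "llvm/Analysis/VectorUtils.h" // llvm::getSplatValue
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Hypothetical helper mirroring the new zero-splat check in
// instCombineSVECmpNE: succeed only if V is a splat of a constant zero.
static ConstantInt *matchZeroSplat(Value *V) {
  // getSplatValue returns nullptr when V is not a recognisable splat, so
  // dyn_cast_or_null handles both "not a splat" and "not a ConstantInt".
  auto *Splat = dyn_cast_or_null<ConstantInt>(getSplatValue(V));
  return (Splat && Splat->isZero()) ? Splat : nullptr;
}

// Hypothetical helper mirroring instCombineSVEDupX: build the splat that
// replaces a dup_x call with scalar operand Scalar and result type RetTy.
static Value *buildSplat(IRBuilder<> &Builder, ScalableVectorType *RetTy,
                         Value *Scalar) {
  // CreateVectorSplat produces the insertelement/shufflevector form that
  // getSplatValue (and therefore the updated combines) understands.
  return Builder.CreateVectorSplat(RetTy->getElementCount(), Scalar);
}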