diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -12573,6 +12573,31 @@ Ops.push_back(&II->getArgOperandUse(1)); return true; + case Intrinsic::fma: { + // If the fma operates on a legal vector type, the indexed variants can be + // used, if the second operand is a splat of a valid index. + Value *IndexOp = II->getArgOperand(1); + if (!isSplatShuffle(IndexOp)) + return false; + auto *Shuffle = cast(IndexOp); + int Idx = Shuffle->getShuffleMask()[0]; + if (Idx < 0) + return false; + auto *VT = dyn_cast(Shuffle->getType()); + if (!VT) + return false; + EVT ActualVT = getValueType(II->getModule()->getDataLayout(), VT, + /*AllowUnknown*/ true); + MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other; + if ((!Subtarget->hasFullFP16() || + (ActualMVT != MVT::v4f16 && ActualMVT != MVT::v8f16)) && + ActualMVT != MVT::v4f32 && ActualMVT != MVT::v2f64) + return false; + if (unsigned(Idx) >= VT->getNumElements()) + return false; + Ops.push_back(&II->getArgOperandUse(1)); + return true; + } default: return false; } diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll --- a/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll +++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -codegenprepare -S | FileCheck %s +; RUN: opt < %s -codegenprepare -S | FileCheck --check-prefixes=CHECK,NOFP16 %s +; RUN: opt < %s -codegenprepare -S -mattr=+fullfp16 | FileCheck --check-prefixes=CHECK,FULLFP16 %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-unknown" @@ -498,29 +499,53 @@ declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>) define <8 x half> @sink_shufflevector_fma_v8f16(i1 %c, <8 x half> %a, <8 x half> %b) { -; CHECK-LABEL: @sink_shufflevector_fma_v8f16( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[S0:%.*]] = shufflevector <8 x half> [[A:%.*]], <8 x half> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[S1:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> -; CHECK-NEXT: [[S2:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> -; CHECK-NEXT: [[S3:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> -; CHECK-NEXT: [[S4:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> -; CHECK-NEXT: [[S5:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> -; CHECK-NEXT: [[S6:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> -; CHECK-NEXT: [[S7:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> -; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] -; CHECK: if.then: -; CHECK-NEXT: [[R_0:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[B:%.*]], <8 x half> [[S0]], <8 x half> [[B]]) -; CHECK-NEXT: [[R_1:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_0]], <8 x half> [[S1]], <8 x half> [[B]]) -; CHECK-NEXT: [[R_2:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_1]], <8 x half> [[S2]], <8 x half> [[B]]) -; CHECK-NEXT: [[R_3:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_2]], <8 x half> [[S3]], <8 x half> [[B]]) -; CHECK-NEXT: ret <8 x half> [[R_3]] -; CHECK: if.else: -; CHECK-NEXT: [[R_4:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[S4]], <8 x half> [[B]]) -; CHECK-NEXT: [[R_5:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_4]], <8 x half> [[S5]], <8 x half> [[B]]) -; CHECK-NEXT: [[R_6:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_5]], <8 x half> [[S6]], <8 x half> [[B]]) -; CHECK-NEXT: [[R_7:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_6]], <8 x half> [[S7]], <8 x half> [[B]]) -; CHECK-NEXT: ret <8 x half> [[R_7]] +; NOFP16-LABEL: @sink_shufflevector_fma_v8f16( +; NOFP16-NEXT: entry: +; NOFP16-NEXT: [[S0:%.*]] = shufflevector <8 x half> [[A:%.*]], <8 x half> poison, <8 x i32> zeroinitializer +; NOFP16-NEXT: [[S1:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> +; NOFP16-NEXT: [[S2:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> +; NOFP16-NEXT: [[S3:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> +; NOFP16-NEXT: [[S4:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> +; NOFP16-NEXT: [[S5:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> +; NOFP16-NEXT: [[S6:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> +; NOFP16-NEXT: [[S7:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> +; NOFP16-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; NOFP16: if.then: +; NOFP16-NEXT: [[R_0:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[B:%.*]], <8 x half> [[S0]], <8 x half> [[B]]) +; NOFP16-NEXT: [[R_1:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_0]], <8 x half> [[S1]], <8 x half> [[B]]) +; NOFP16-NEXT: [[R_2:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_1]], <8 x half> [[S2]], <8 x half> [[B]]) +; NOFP16-NEXT: [[R_3:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_2]], <8 x half> [[S3]], <8 x half> [[B]]) +; NOFP16-NEXT: ret <8 x half> [[R_3]] +; NOFP16: if.else: +; NOFP16-NEXT: [[R_4:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[S4]], <8 x half> [[B]]) +; NOFP16-NEXT: [[R_5:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_4]], <8 x half> [[S5]], <8 x half> [[B]]) +; NOFP16-NEXT: [[R_6:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_5]], <8 x half> [[S6]], <8 x half> [[B]]) +; NOFP16-NEXT: [[R_7:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_6]], <8 x half> [[S7]], <8 x half> [[B]]) +; NOFP16-NEXT: ret <8 x half> [[R_7]] +; +; FULLFP16-LABEL: @sink_shufflevector_fma_v8f16( +; FULLFP16-NEXT: entry: +; FULLFP16-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; FULLFP16: if.then: +; FULLFP16-NEXT: [[TMP0:%.*]] = shufflevector <8 x half> [[A:%.*]], <8 x half> poison, <8 x i32> zeroinitializer +; FULLFP16-NEXT: [[R_0:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[B:%.*]], <8 x half> [[TMP0]], <8 x half> [[B]]) +; FULLFP16-NEXT: [[TMP1:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> +; FULLFP16-NEXT: [[R_1:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_0]], <8 x half> [[TMP1]], <8 x half> [[B]]) +; FULLFP16-NEXT: [[TMP2:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> +; FULLFP16-NEXT: [[R_2:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_1]], <8 x half> [[TMP2]], <8 x half> [[B]]) +; FULLFP16-NEXT: [[TMP3:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> +; FULLFP16-NEXT: [[R_3:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_2]], <8 x half> [[TMP3]], <8 x half> [[B]]) +; FULLFP16-NEXT: ret <8 x half> [[R_3]] +; FULLFP16: if.else: +; FULLFP16-NEXT: [[TMP4:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> +; FULLFP16-NEXT: [[R_4:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[TMP4]], <8 x half> [[B]]) +; FULLFP16-NEXT: [[TMP5:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> +; FULLFP16-NEXT: [[R_5:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_4]], <8 x half> [[TMP5]], <8 x half> [[B]]) +; FULLFP16-NEXT: [[TMP6:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> +; FULLFP16-NEXT: [[R_6:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_5]], <8 x half> [[TMP6]], <8 x half> [[B]]) +; FULLFP16-NEXT: [[TMP7:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32> +; FULLFP16-NEXT: [[R_7:%.*]] = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> [[R_6]], <8 x half> [[TMP7]], <8 x half> [[B]]) +; FULLFP16-NEXT: ret <8 x half> [[R_7]] ; entry: %s0 = shufflevector <8 x half> %a, <8 x half> poison, <8 x i32> zeroinitializer @@ -553,18 +578,18 @@ define <4 x float> @sink_shufflevector_fma_v4f32(i1 %c, <8 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @sink_shufflevector_fma_v4f32( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[S0:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[S1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> -; CHECK-NEXT: [[S2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> -; CHECK-NEXT: [[S3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> ; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; CHECK: if.then: -; CHECK-NEXT: [[R_0:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[B:%.*]], <4 x float> [[S0]], <4 x float> [[B]]) -; CHECK-NEXT: [[R_1:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[R_0]], <4 x float> [[S1]], <4 x float> [[B]]) +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[R_0:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[B:%.*]], <4 x float> [[TMP0]], <4 x float> [[B]]) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> +; CHECK-NEXT: [[R_1:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[R_0]], <4 x float> [[TMP1]], <4 x float> [[B]]) ; CHECK-NEXT: ret <4 x float> [[R_1]] ; CHECK: if.else: -; CHECK-NEXT: [[R_2:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[B]], <4 x float> [[S2]], <4 x float> [[B]]) -; CHECK-NEXT: [[R_3:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[R_2]], <4 x float> [[S3]], <4 x float> [[B]]) +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> +; CHECK-NEXT: [[R_2:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[B]], <4 x float> [[TMP2]], <4 x float> [[B]]) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> +; CHECK-NEXT: [[R_3:%.*]] = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[R_2]], <4 x float> [[TMP3]], <4 x float> [[B]]) ; CHECK-NEXT: ret <4 x float> [[R_3]] ; entry: @@ -590,14 +615,14 @@ define <2 x double> @sink_shufflevector_fma_v2f64(i1 %c, <2 x double> %a, <2 x double> %b) { ; CHECK-LABEL: @sink_shufflevector_fma_v2f64( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[S0:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[S1:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> ; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; CHECK: if.then: -; CHECK-NEXT: [[R_0:%.*]] = tail call fast <2 x double> @llvm.fma.v2f64(<2 x double> [[B:%.*]], <2 x double> [[S0]], <2 x double> [[B]]) +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[R_0:%.*]] = tail call fast <2 x double> @llvm.fma.v2f64(<2 x double> [[B:%.*]], <2 x double> [[TMP0]], <2 x double> [[B]]) ; CHECK-NEXT: ret <2 x double> [[R_0]] ; CHECK: if.else: -; CHECK-NEXT: [[R_1:%.*]] = tail call fast <2 x double> @llvm.fma.v2f64(<2 x double> [[B]], <2 x double> [[S1]], <2 x double> [[B]]) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> +; CHECK-NEXT: [[R_1:%.*]] = tail call fast <2 x double> @llvm.fma.v2f64(<2 x double> [[B]], <2 x double> [[TMP1]], <2 x double> [[B]]) ; CHECK-NEXT: ret <2 x double> [[R_1]] ; entry: