diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -11924,6 +11924,12 @@
   return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
 }
 
+static bool isSplatShuffle(Value *V) {
+  if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
+    return is_splat(Shuf->getShuffleMask());
+  return false;
+}
+
 /// Check if sinking \p I's operands to I's basic block is profitable, because
 /// the operands can be folded into a target instruction, e.g.
 /// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
@@ -11934,12 +11940,24 @@
 
   if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
     switch (II->getIntrinsicID()) {
+    case Intrinsic::aarch64_neon_smull:
     case Intrinsic::aarch64_neon_umull:
-      if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
-        return false;
-      Ops.push_back(&II->getOperandUse(0));
-      Ops.push_back(&II->getOperandUse(1));
-      return true;
+      if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1))) {
+        Ops.push_back(&II->getOperandUse(0));
+        Ops.push_back(&II->getOperandUse(1));
+        return true;
+      }
+      LLVM_FALLTHROUGH;
+
+    case Intrinsic::aarch64_neon_sqdmull:
+    case Intrinsic::aarch64_neon_sqdmulh:
+    case Intrinsic::aarch64_neon_sqrdmulh:
+      // Sink splats for index lane variants
+      if (isSplatShuffle(II->getOperand(0)))
+        Ops.push_back(&II->getOperandUse(0));
+      if (isSplatShuffle(II->getOperand(1)))
+        Ops.push_back(&II->getOperandUse(1));
+      return !Ops.empty();
 
     case Intrinsic::aarch64_neon_pmull64:
       if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
diff --git a/llvm/test/CodeGen/AArch64/sinksplat.ll b/llvm/test/CodeGen/AArch64/sinksplat.ll
--- a/llvm/test/CodeGen/AArch64/sinksplat.ll
+++ b/llvm/test/CodeGen/AArch64/sinksplat.ll
@@ -7,12 +7,11 @@
 ; CHECK-NEXT:    fmov d1, d0
 ; CHECK-NEXT:    mov w8, #1
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    dup v1.4h, v1.h[3]
 ; CHECK-NEXT:  .LBB0_1: // %l1
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr d2, [x0]
 ; CHECK-NEXT:    subs w8, w8, #1
-; CHECK-NEXT:    smlal v0.4s, v2.4h, v1.4h
+; CHECK-NEXT:    smlal v0.4s, v2.4h, v1.h[3]
 ; CHECK-NEXT:    b.eq .LBB0_1
 ; CHECK-NEXT:  // %bb.2: // %l2
 ; CHECK-NEXT:    ret
@@ -40,12 +39,11 @@
 ; CHECK-NEXT:    fmov d1, d0
 ; CHECK-NEXT:    mov w8, #1
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    dup v1.4h, v1.h[3]
 ; CHECK-NEXT:  .LBB1_1: // %l1
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr d2, [x0]
 ; CHECK-NEXT:    subs w8, w8, #1
-; CHECK-NEXT:    umlal v0.4s, v2.4h, v1.4h
+; CHECK-NEXT:    umlal v0.4s, v2.4h, v1.h[3]
 ; CHECK-NEXT:    b.eq .LBB1_1
 ; CHECK-NEXT:  // %bb.2: // %l2
 ; CHECK-NEXT:    ret
@@ -73,12 +71,11 @@
 ; CHECK-NEXT:    mov v1.16b, v0.16b
 ; CHECK-NEXT:    mov w8, #1
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    dup v1.4s, v1.s[3]
 ; CHECK-NEXT:  .LBB2_1: // %l1
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr q2, [x0]
 ; CHECK-NEXT:    subs w8, w8, #1
-; CHECK-NEXT:    sqrdmulh v2.4s, v2.4s, v1.4s
+; CHECK-NEXT:    sqrdmulh v2.4s, v2.4s, v1.s[3]
 ; CHECK-NEXT:    sqadd v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    b.eq .LBB2_1
 ; CHECK-NEXT:  // %bb.2: // %l2
@@ -107,12 +104,11 @@
 ; CHECK-NEXT:    mov v1.16b, v0.16b
 ; CHECK-NEXT:    mov w8, #1
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    dup v1.4s, v1.s[3]
 ; CHECK-NEXT:  .LBB3_1: // %l1
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr q2, [x0]
 ; CHECK-NEXT:    subs w8, w8, #1
-; CHECK-NEXT:    sqrdmulh v2.4s, v2.4s, v1.4s
+; CHECK-NEXT:    sqrdmulh v2.4s, v2.4s, v1.s[3]
 ; CHECK-NEXT:    sqsub v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    b.eq .LBB3_1
 ; CHECK-NEXT:  // %bb.2: // %l2
@@ -141,12 +137,11 @@
 ; CHECK-NEXT:    mov v1.16b, v0.16b
 ; CHECK-NEXT:    mov w8, #1
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    dup v1.4s, v1.s[3]
 ; CHECK-NEXT:  .LBB4_1: // %l1
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr q2, [x0]
 ; CHECK-NEXT:    subs w8, w8, #1
-; CHECK-NEXT:    sqdmulh v2.4s, v2.4s, v1.4s
+; CHECK-NEXT:    sqdmulh v2.4s, v2.4s, v1.s[3]
 ; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    b.eq .LBB4_1
 ; CHECK-NEXT:  // %bb.2: // %l2
@@ -175,12 +170,11 @@
 ; CHECK-NEXT:    fmov d1, d0
 ; CHECK-NEXT:    mov w8, #1
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    dup v1.4h, v1.h[3]
 ; CHECK-NEXT:  .LBB5_1: // %l1
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr d2, [x0]
 ; CHECK-NEXT:    subs w8, w8, #1
-; CHECK-NEXT:    sqdmull v2.4s, v2.4h, v1.4h
+; CHECK-NEXT:    sqdmull v2.4s, v2.4h, v1.h[3]
 ; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    b.eq .LBB5_1
 ; CHECK-NEXT:  // %bb.2: // %l2
diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll
--- a/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll
@@ -150,6 +150,38 @@
   ret <8 x i16> %vmull1
 }
 
+; The masks used are suitable for smull, sink shufflevector to users.
+define <8 x i16> @sink_shufflevector_smull(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: @sink_shufflevector_smull(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> undef, <8 x i32>
+; CHECK-NEXT:    [[S2:%.*]] = shufflevector <16 x i8> [[B:%.*]], <16 x i8> undef, <8 x i32>
+; CHECK-NEXT:    [[VMULL0:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[TMP0]], <8 x i8> [[S2]])
+; CHECK-NEXT:    ret <8 x i16> [[VMULL0]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> undef, <8 x i32>
+; CHECK-NEXT:    [[S4:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> undef, <8 x i32>
+; CHECK-NEXT:    [[VMULL1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[TMP1]], <8 x i8> [[S4]])
+; CHECK-NEXT:    ret <8 x i16> [[VMULL1]]
+;
+entry:
+  %s1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32>
+  %s3 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32>
+  br i1 undef, label %if.then, label %if.else
+
+if.then:
+  %s2 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32>
+  %vmull0 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %s1, <8 x i8> %s2) #3
+  ret <8 x i16> %vmull0
+
+if.else:
+  %s4 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32>
+  %vmull1 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %s3, <8 x i8> %s4) #3
+  ret <8 x i16> %vmull1
+}
+
 ; Both exts and their shufflevector operands can be sunk.
 define <8 x i16> @sink_shufflevector_ext_subadd(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: @sink_shufflevector_ext_subadd(
@@ -271,8 +303,8 @@
 }
 
-; Function Attrs: nounwind readnone
-declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>) #2
+declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>)
+declare <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8>, <8 x i8>)
 
 ; The insertelement should be inserted before shufflevector, otherwise 'does not dominate all uses' error will occur.
 define <4 x i32> @sink_insertelement(i16 %e, i8 %f) {
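
Note: the IR shape this patch targets looks roughly like the reduced sketch below. It is only an illustration (function and value names are made up, not taken from the tests), mirroring the pattern exercised by the sinksplat.ll functions above: a single-lane splat shufflevector defined outside a loop feeding a NEON multiply intrinsic inside it. With the splat operand reported as sinkable by the shouldSinkOperands hook modified above, CodeGenPrepare places a copy of the shuffle next to its user in the loop block, and instruction selection can then fold it into the by-element encoding (for example sqdmulh v2.4s, v2.4s, v1.s[3] instead of a hoisted dup plus a plain sqdmulh). The same reasoning applies to the sqrdmulh, sqdmull, smull and umull cases handled in the switch.

; Reduced sketch (illustrative names): splat shuffle outside the loop,
; lane-wise sqdmulh inside it.
define <4 x i32> @sqdmulh_splat_loop(<4 x i32>* %p, <4 x i32> %q, i32 %n) {
entry:
  ; Splat of lane 3; without sinking this becomes a dup in the preheader.
  %splat = shufflevector <4 x i32> %q, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  br label %loop

loop:
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
  %acc = phi <4 x i32> [ zeroinitializer, %entry ], [ %mla, %loop ]
  %v = load <4 x i32>, <4 x i32>* %p
  ; Once the splat is sunk next to this call, ISel can pick the lane-indexed
  ; form: sqdmulh v2.4s, v2.4s, v1.s[3].
  %mul = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %v, <4 x i32> %splat)
  %mla = add <4 x i32> %acc, %mul
  %iv.next = add i32 %iv, 1
  %cond = icmp slt i32 %iv.next, %n
  br i1 %cond, label %loop, label %exit

exit:
  ret <4 x i32> %mla
}

declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>)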