diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -33,6 +33,7 @@
 #include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/Analysis/ObjCARCUtil.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/CallingConvLower.h"
@@ -14016,12 +14017,20 @@
     return true;
   }
   case Instruction::Mul: {
-    bool IsProfitable = false;
+    int NumZExts = 0, NumSExts = 0;
     for (auto &Op : I->operands()) {
       // Make sure we are not already sinking this operand
       if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
         continue;
 
+      if (match(&Op, m_SExt(m_Value()))) {
+        NumSExts++;
+        continue;
+      } else if (match(&Op, m_ZExt(m_Value()))) {
+        NumZExts++;
+        continue;
+      }
+
       ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
 
       // If the Shuffle is a splat and the operand is a zext/sext, sinking the
@@ -14031,11 +14040,14 @@
           match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
         Ops.push_back(&Shuffle->getOperandUse(0));
         Ops.push_back(&Op);
-        IsProfitable = true;
+        if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
+          NumSExts++;
+        else
+          NumZExts++;
         continue;
       }
 
-      if (!Shuffle || !Shuffle->isZeroEltSplat())
+      if (!Shuffle)
         continue;
 
       Value *ShuffleOperand = Shuffle->getOperand(0);
@@ -14054,15 +14066,27 @@
         continue;
 
       unsigned Opcode = OperandInstr->getOpcode();
-      if (Opcode != Instruction::SExt && Opcode != Instruction::ZExt)
-        continue;
+      if (Opcode == Instruction::SExt)
+        NumSExts++;
+      else if (Opcode == Instruction::ZExt)
+        NumZExts++;
+      else {
+        // If we find that the top bits are known 0, then we can sink and allow
+        // the backend to generate a umull.
+        unsigned Bitwidth = I->getType()->getScalarSizeInBits();
+        APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
+        const DataLayout &DL = I->getFunction()->getParent()->getDataLayout();
+        if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
+          continue;
+        NumZExts++;
+      }
 
       Ops.push_back(&Shuffle->getOperandUse(0));
       Ops.push_back(&Op);
-      IsProfitable = true;
     }
 
-    return IsProfitable;
+    // It is profitable to sink only if we found two extends of the same type.
+    return !Ops.empty() && (NumSExts == 2 || NumZExts == 2);
   }
   default:
     return false;
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
--- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -427,13 +427,12 @@
 ; CHECK-NEXT: mov w0, wzr
 ; CHECK-NEXT: ret
 ; CHECK-NEXT: .LBB5_4: // %vector.ph
-; CHECK-NEXT: dup v2.8b, w9
 ; CHECK-NEXT: and x11, x10, #0xfffffff0
-; CHECK-NEXT: movi v0.2d, #0000000000000000
 ; CHECK-NEXT: add x8, x0, #8
-; CHECK-NEXT: movi v1.2d, #0000000000000000
+; CHECK-NEXT: movi v0.2d, #0000000000000000
 ; CHECK-NEXT: mov x12, x11
-; CHECK-NEXT: sshll v2.8h, v2.8b, #0
+; CHECK-NEXT: movi v1.2d, #0000000000000000
+; CHECK-NEXT: dup v2.8h, w9
 ; CHECK-NEXT: .LBB5_5: // %vector.body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: ldp d3, d4, [x8, #-8]
@@ -704,9 +703,8 @@
 ; CHECK: // %bb.0: // %vector.header
 ; CHECK-NEXT: and w8, w3, #0xffff
 ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT: dup v0.4s, w8
+; CHECK-NEXT: dup v0.4h, w8
 ; CHECK-NEXT: and x8, x0, #0xfffffff8
-; CHECK-NEXT: xtn v0.4h, v0.4s
 ; CHECK-NEXT: .LBB10_1: // %vector.body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: add x9, x2, w0, uxtw #1
@@ -767,27 +765,24 @@
 define void @matrix_mul_unsigned_and_double(i32 %N, i32* nocapture %C, i16* nocapture readonly %A, i32 %val) {
 ; CHECK-LABEL: matrix_mul_unsigned_and_double:
 ; CHECK: // %bb.0: // %vector.header
-; CHECK-NEXT: and w8, w3, #0xffff
+; CHECK-NEXT: and w9, w3, #0xffff
 ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT: dup v0.4s, w8
 ; CHECK-NEXT: and x8, x0, #0xfffffff0
-; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: dup v0.8h, w9
 ; CHECK-NEXT: .LBB11_1: // %vector.body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: add x9, x2, w0, uxtw #1
+; CHECK-NEXT: add x10, x1, w0, uxtw #2
 ; CHECK-NEXT: subs x8, x8, #16
+; CHECK-NEXT: add w0, w0, #16
 ; CHECK-NEXT: ldr q1, [x9]
 ; CHECK-NEXT: ldur q2, [x9, #8]
-; CHECK-NEXT: add x9, x1, w0, uxtw #2
-; CHECK-NEXT: add w0, w0, #16
-; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT: ext v4.16b, v2.16b, v2.16b, #8
+; CHECK-NEXT: umull2 v3.4s, v0.8h, v1.8h
 ; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h
+; CHECK-NEXT: umull2 v4.4s, v0.8h, v2.8h
 ; CHECK-NEXT: umull v2.4s, v0.4h, v2.4h
-; CHECK-NEXT: umull v3.4s, v0.4h, v3.4h
-; CHECK-NEXT: umull v4.4s, v0.4h, v4.4h
-; CHECK-NEXT: stp q1, q3, [x9]
-; CHECK-NEXT: stp q2, q4, [x9, #32]
+; CHECK-NEXT: stp q1, q3, [x10]
+; CHECK-NEXT: stp q2, q4, [x10, #32]
 ; CHECK-NEXT: b.ne .LBB11_1
 ; CHECK-NEXT: // %bb.2: // %for.end12
 ; CHECK-NEXT: ret
diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll
--- a/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll
@@ -313,12 +313,12 @@
 ; CHECK-NEXT: for.cond4.preheader.lr.ph:
 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[F:%.*]], 0
 ; CHECK-NEXT: [[CONV25:%.*]] = sext i16 [[E:%.*]] to i32
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT143:%.*]] = insertelement <4 x i32> poison, i32 [[CONV25]], i32 0
 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_COND4_PREHEADER_US_PREHEADER:%.*]], label [[FOR_COND4_PREHEADER_PREHEADER:%.*]]
 ; CHECK: for.cond4.preheader.us.preheader:
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[CONV25]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT144:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i32> zeroinitializer, [[BROADCAST_SPLAT144]]
-; CHECK-NEXT: ret <4 x i32> [[TMP1]]
+; CHECK-NEXT: [[BROADCAST_SPLAT144:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT143]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP0:%.*]] = mul <4 x i32> zeroinitializer, [[BROADCAST_SPLAT144]]
+; CHECK-NEXT: ret <4 x i32> [[TMP0]]
 ; CHECK: for.cond4.preheader.preheader:
 ; CHECK-NEXT: ret <4 x i32> zeroinitializer
 ;