Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -33,6 +33,7 @@ #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/ObjCARCUtil.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/CallingConvLower.h" @@ -14019,12 +14020,20 @@ return true; } case Instruction::Mul: { - bool IsProfitable = false; + int NumZExts = 0, NumSExts = 0; for (auto &Op : I->operands()) { // Make sure we are not already sinking this operand if (any_of(Ops, [&](Use *U) { return U->get() == Op; })) continue; + if (match(&Op, m_SExt(m_Value()))) { + NumSExts++; + continue; + } else if (match(&Op, m_ZExt(m_Value()))) { + NumZExts++; + continue; + } + ShuffleVectorInst *Shuffle = dyn_cast(Op); // If the Shuffle is a splat and the operand is a zext/sext, sinking the @@ -14034,11 +14043,14 @@ match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) { Ops.push_back(&Shuffle->getOperandUse(0)); Ops.push_back(&Op); - IsProfitable = true; + if (match(Shuffle->getOperand(0), m_SExt(m_Value()))) + NumSExts++; + else + NumZExts++; continue; } - if (!Shuffle || !Shuffle->isZeroEltSplat()) + if (!Shuffle) continue; Value *ShuffleOperand = Shuffle->getOperand(0); @@ -14057,15 +14069,27 @@ continue; unsigned Opcode = OperandInstr->getOpcode(); - if (Opcode != Instruction::SExt && Opcode != Instruction::ZExt) - continue; + if (Opcode == Instruction::SExt) + NumSExts++; + else if (Opcode == Instruction::ZExt) + NumZExts++; + else { + // If we find that the top bits are known 0, then we can sink and allow + // the backend to generate a umull. + unsigned Bitwidth = I->getType()->getScalarSizeInBits(); + APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2); + const DataLayout &DL = I->getFunction()->getParent()->getDataLayout(); + if (!MaskedValueIsZero(OperandInstr, UpperMask, DL)) + continue; + NumZExts++; + } Ops.push_back(&Shuffle->getOperandUse(0)); Ops.push_back(&Op); - IsProfitable = true; } - return IsProfitable; + // Is it profitable to sink if we found two of the same type of extends. + return !Ops.empty() && (NumSExts == 2 || NumZExts == 2); } default: return false; Index: llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll =================================================================== --- llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll +++ llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll @@ -427,13 +427,12 @@ ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB5_4: // %vector.ph -; CHECK-NEXT: dup v2.8b, w9 ; CHECK-NEXT: and x11, x10, #0xfffffff0 -; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: add x8, x0, #8 -; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: mov x12, x11 -; CHECK-NEXT: sshll v2.8h, v2.8b, #0 +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: dup v2.8h, w9 ; CHECK-NEXT: .LBB5_5: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldp d3, d4, [x8, #-8] @@ -704,9 +703,8 @@ ; CHECK: // %bb.0: // %vector.header ; CHECK-NEXT: and w8, w3, #0xffff ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-NEXT: dup v0.4s, w8 +; CHECK-NEXT: dup v0.4h, w8 ; CHECK-NEXT: and x8, x0, #0xfffffff8 -; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: .LBB10_1: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: add x9, x2, w0, uxtw #1 @@ -767,27 +765,24 @@ define void @matrix_mul_unsigned_and_double(i32 %N, i32* nocapture %C, i16* nocapture readonly %A, i32 %val) { ; CHECK-LABEL: matrix_mul_unsigned_and_double: ; CHECK: // %bb.0: // %vector.header -; CHECK-NEXT: and w8, w3, #0xffff +; CHECK-NEXT: and w9, w3, #0xffff ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-NEXT: dup v0.4s, w8 ; CHECK-NEXT: and x8, x0, #0xfffffff0 -; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: dup v0.8h, w9 ; CHECK-NEXT: .LBB11_1: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: add x9, x2, w0, uxtw #1 +; CHECK-NEXT: add x10, x1, w0, uxtw #2 ; CHECK-NEXT: subs x8, x8, #16 +; CHECK-NEXT: add w0, w0, #16 ; CHECK-NEXT: ldr q1, [x9] ; CHECK-NEXT: ldur q2, [x9, #8] -; CHECK-NEXT: add x9, x1, w0, uxtw #2 -; CHECK-NEXT: add w0, w0, #16 -; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: ext v4.16b, v2.16b, v2.16b, #8 +; CHECK-NEXT: umull2 v3.4s, v0.8h, v1.8h ; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h +; CHECK-NEXT: umull2 v4.4s, v0.8h, v2.8h ; CHECK-NEXT: umull v2.4s, v0.4h, v2.4h -; CHECK-NEXT: umull v3.4s, v0.4h, v3.4h -; CHECK-NEXT: umull v4.4s, v0.4h, v4.4h -; CHECK-NEXT: stp q1, q3, [x9] -; CHECK-NEXT: stp q2, q4, [x9, #32] +; CHECK-NEXT: stp q1, q3, [x10] +; CHECK-NEXT: stp q2, q4, [x10, #32] ; CHECK-NEXT: b.ne .LBB11_1 ; CHECK-NEXT: // %bb.2: // %for.end12 ; CHECK-NEXT: ret Index: llvm/test/CodeGen/AArch64/sve-loopforms.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-loopforms.ll +++ llvm/test/CodeGen/AArch64/sve-loopforms.ll @@ -45,9 +45,9 @@ ; CHECK-NEXT: add x17, x11, x18 ; CHECK-NEXT: add x3, x18, x15 ; CHECK-NEXT: add x1, x17, x15 -; CHECK-NEXT: dup v2.8h, w12 -; CHECK-NEXT: dup v3.8h, w9 ; CHECK-NEXT: and x4, x15, #0xfffffff8 +; CHECK-NEXT: dup v2.8b, w12 +; CHECK-NEXT: dup v3.8b, w9 ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: add x16, x2, x18 ; CHECK-NEXT: add x18, x1, #1 @@ -59,8 +59,8 @@ ; CHECK-NEXT: mov x21, x0 ; CHECK-NEXT: dup v0.8h, w10 ; CHECK-NEXT: dup v1.8h, w14 -; CHECK-NEXT: xtn v2.8b, v2.8h -; CHECK-NEXT: xtn v3.8b, v3.8h +; CHECK-NEXT: dup v4.16b, w12 +; CHECK-NEXT: dup v5.16b, w9 ; CHECK-NEXT: b .LBB0_4 ; CHECK-NEXT: .LBB0_3: // %for.cond10.for.cond.cleanup12_crit_edge.us ; CHECK-NEXT: // in Loop: Header=BB0_4 Depth=1 @@ -144,22 +144,22 @@ ; CHECK-NEXT: add x24, x16, x23 ; CHECK-NEXT: add x25, x16, x22 ; CHECK-NEXT: add x23, x23, #8 -; CHECK-NEXT: ldr d4, [x24] -; CHECK-NEXT: ldr d5, [x25] -; CHECK-NEXT: ldur d6, [x24, #-1] +; CHECK-NEXT: ldr d6, [x24] +; CHECK-NEXT: ldr d7, [x25] +; CHECK-NEXT: ldur d16, [x24, #-1] ; CHECK-NEXT: add x24, x21, x22 -; CHECK-NEXT: umull v4.8h, v3.8b, v4.8b -; CHECK-NEXT: ldur d7, [x25, #-1] -; CHECK-NEXT: umull v5.8h, v3.8b, v5.8b +; CHECK-NEXT: umull v6.8h, v3.8b, v6.8b +; CHECK-NEXT: ldur d17, [x25, #-1] +; CHECK-NEXT: umull v7.8h, v3.8b, v7.8b ; CHECK-NEXT: add x22, x22, #8 ; CHECK-NEXT: add x25, x5, x22 ; CHECK-NEXT: cmp x25, #1 -; CHECK-NEXT: umlal v4.8h, v2.8b, v6.8b -; CHECK-NEXT: umlal v5.8h, v2.8b, v7.8b -; CHECK-NEXT: mul v4.8h, v4.8h, v0.8h -; CHECK-NEXT: mla v4.8h, v5.8h, v1.8h -; CHECK-NEXT: rshrn v4.8b, v4.8h, #6 -; CHECK-NEXT: stur d4, [x24, #-1] +; CHECK-NEXT: umlal v6.8h, v2.8b, v16.8b +; CHECK-NEXT: umlal v7.8h, v2.8b, v17.8b +; CHECK-NEXT: mul v6.8h, v6.8h, v0.8h +; CHECK-NEXT: mla v6.8h, v7.8h, v1.8h +; CHECK-NEXT: rshrn v6.8b, v6.8h, #6 +; CHECK-NEXT: stur d6, [x24, #-1] ; CHECK-NEXT: b.ne .LBB0_13 ; CHECK-NEXT: // %bb.14: // %vec.epilog.middle.block ; CHECK-NEXT: // in Loop: Header=BB0_4 Depth=1 @@ -175,30 +175,26 @@ ; CHECK-NEXT: // Parent Loop BB0_4 Depth=1 ; CHECK-NEXT: // => This Inner Loop Header: Depth=2 ; CHECK-NEXT: add x24, x23, x22 -; CHECK-NEXT: add x25, x16, x22 -; CHECK-NEXT: ldr q4, [x23, x22] -; CHECK-NEXT: ldur q5, [x24, #1] -; CHECK-NEXT: ldur q7, [x25, #1] -; CHECK-NEXT: ext v16.16b, v4.16b, v4.16b, #8 -; CHECK-NEXT: ldr q6, [x25] -; CHECK-NEXT: ext v17.16b, v5.16b, v5.16b, #8 -; CHECK-NEXT: umull v5.8h, v3.8b, v5.8b -; CHECK-NEXT: ext v19.16b, v7.16b, v7.16b, #8 -; CHECK-NEXT: umull v7.8h, v3.8b, v7.8b -; CHECK-NEXT: umull v17.8h, v3.8b, v17.8b -; CHECK-NEXT: umlal v5.8h, v2.8b, v4.8b -; CHECK-NEXT: ext v18.16b, v6.16b, v6.16b, #8 -; CHECK-NEXT: umull v4.8h, v3.8b, v19.8b -; CHECK-NEXT: umlal v17.8h, v2.8b, v16.8b -; CHECK-NEXT: umlal v7.8h, v2.8b, v6.8b -; CHECK-NEXT: mul v5.8h, v5.8h, v0.8h -; CHECK-NEXT: umlal v4.8h, v2.8b, v18.8b -; CHECK-NEXT: mul v6.8h, v17.8h, v0.8h -; CHECK-NEXT: mla v5.8h, v7.8h, v1.8h -; CHECK-NEXT: mla v6.8h, v4.8h, v1.8h -; CHECK-NEXT: rshrn v4.8b, v5.8h, #6 -; CHECK-NEXT: rshrn2 v4.16b, v6.8h, #6 -; CHECK-NEXT: str q4, [x21, x22] +; CHECK-NEXT: ldr q7, [x23, x22] +; CHECK-NEXT: ldur q6, [x24, #1] +; CHECK-NEXT: add x24, x16, x22 +; CHECK-NEXT: umull v16.8h, v5.8b, v6.8b +; CHECK-NEXT: ldur q18, [x24, #1] +; CHECK-NEXT: umull2 v6.8h, v5.16b, v6.16b +; CHECK-NEXT: ldr q17, [x24] +; CHECK-NEXT: umull v19.8h, v5.8b, v18.8b +; CHECK-NEXT: umlal v16.8h, v4.8b, v7.8b +; CHECK-NEXT: umull2 v18.8h, v5.16b, v18.16b +; CHECK-NEXT: umlal2 v6.8h, v4.16b, v7.16b +; CHECK-NEXT: umlal v19.8h, v4.8b, v17.8b +; CHECK-NEXT: mul v7.8h, v16.8h, v0.8h +; CHECK-NEXT: umlal2 v18.8h, v4.16b, v17.16b +; CHECK-NEXT: mul v6.8h, v6.8h, v0.8h +; CHECK-NEXT: mla v7.8h, v19.8h, v1.8h +; CHECK-NEXT: mla v6.8h, v18.8h, v1.8h +; CHECK-NEXT: rshrn v7.8b, v7.8h, #6 +; CHECK-NEXT: rshrn2 v7.16b, v6.8h, #6 +; CHECK-NEXT: str q7, [x21, x22] ; CHECK-NEXT: add x22, x22, #16 ; CHECK-NEXT: cmp x3, x22 ; CHECK-NEXT: b.ne .LBB0_16 @@ -486,21 +482,21 @@ ; CHECK-NEXT: add x4, x2, x16 ; CHECK-NEXT: neg x5, x13 ; CHECK-NEXT: add x6, x11, #1 -; CHECK-NEXT: mov w19, #1 -; CHECK-NEXT: mov x20, x0 +; CHECK-NEXT: mov w20, #1 +; CHECK-NEXT: mov x21, x0 ; CHECK-NEXT: cnth x3 -; CHECK-NEXT: mov z0.h, w14 -; CHECK-NEXT: mov z1.h, w9 -; CHECK-NEXT: mov z2.h, w10 -; CHECK-NEXT: mov z3.h, w15 -; CHECK-NEXT: rdvl x21, #1 +; CHECK-NEXT: mov z0.h, w10 +; CHECK-NEXT: mov z1.h, w15 +; CHECK-NEXT: rdvl x19, #1 ; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z2.h, w14 +; CHECK-NEXT: mov z3.h, w9 ; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: b .LBB1_4 ; CHECK-NEXT: .LBB1_3: // %for.cond10.for.cond.cleanup12_crit_edge.us ; CHECK-NEXT: // in Loop: Header=BB1_4 Depth=1 ; CHECK-NEXT: add x17, x17, x11 -; CHECK-NEXT: add x20, x20, x12 +; CHECK-NEXT: add x21, x21, x12 ; CHECK-NEXT: add x8, x8, #1 ; CHECK-NEXT: add x4, x4, x11 ; CHECK-NEXT: cmp w8, w7 @@ -534,7 +530,7 @@ ; CHECK-NEXT: madd w25, w9, w25, w27 ; CHECK-NEXT: mul w24, w24, w10 ; CHECK-NEXT: madd w24, w25, w15, w24 -; CHECK-NEXT: add x25, x20, x23 +; CHECK-NEXT: add x25, x21, x23 ; CHECK-NEXT: add x23, x23, #1 ; CHECK-NEXT: add w24, w24, #32 ; CHECK-NEXT: add x26, x5, x23 @@ -550,7 +546,7 @@ ; CHECK-NEXT: add x23, x1, x22 ; CHECK-NEXT: add x23, x2, x23 ; CHECK-NEXT: add x24, x0, x24 -; CHECK-NEXT: cmp x20, x23 +; CHECK-NEXT: cmp x21, x23 ; CHECK-NEXT: add x23, x18, x22 ; CHECK-NEXT: ccmp x17, x24, #2, lo ; CHECK-NEXT: add x22, x16, x22 @@ -558,7 +554,7 @@ ; CHECK-NEXT: add x25, x2, x23 ; CHECK-NEXT: cset w23, lo ; CHECK-NEXT: cmp x22, x24 -; CHECK-NEXT: ccmp x20, x25, #2, lo +; CHECK-NEXT: ccmp x21, x25, #2, lo ; CHECK-NEXT: mov x22, xzr ; CHECK-NEXT: b.lo .LBB1_6 ; CHECK-NEXT: // %bb.9: // %vector.memcheck @@ -566,16 +562,16 @@ ; CHECK-NEXT: tbnz w23, #0, .LBB1_6 ; CHECK-NEXT: // %bb.10: // %vector.main.loop.iter.check ; CHECK-NEXT: // in Loop: Header=BB1_4 Depth=1 -; CHECK-NEXT: cmp x21, x13 +; CHECK-NEXT: cmp x19, x13 ; CHECK-NEXT: b.ls .LBB1_12 ; CHECK-NEXT: // %bb.11: // in Loop: Header=BB1_4 Depth=1 ; CHECK-NEXT: mov x22, xzr ; CHECK-NEXT: b .LBB1_16 ; CHECK-NEXT: .LBB1_12: // %vector.ph ; CHECK-NEXT: // in Loop: Header=BB1_4 Depth=1 -; CHECK-NEXT: udiv x22, x13, x21 +; CHECK-NEXT: udiv x22, x13, x19 ; CHECK-NEXT: mov x24, xzr -; CHECK-NEXT: mul x22, x22, x21 +; CHECK-NEXT: mul x22, x22, x19 ; CHECK-NEXT: sub x23, x13, x22 ; CHECK-NEXT: .LBB1_13: // %vector.body ; CHECK-NEXT: // Parent Loop BB1_4 Depth=1 @@ -586,31 +582,31 @@ ; CHECK-NEXT: add x28, x27, #1 ; CHECK-NEXT: ld1b { z4.h }, p0/z, [x17, x24] ; CHECK-NEXT: ld1b { z5.h }, p0/z, [x25, #1, mul vl] -; CHECK-NEXT: ld1b { z6.h }, p0/z, [x25, x19] +; CHECK-NEXT: ld1b { z6.h }, p0/z, [x25, x20] ; CHECK-NEXT: ld1b { z7.h }, p0/z, [x26, #1, mul vl] ; CHECK-NEXT: ld1b { z16.h }, p0/z, [x4, x24] ; CHECK-NEXT: ld1b { z17.h }, p0/z, [x27, #1, mul vl] ; CHECK-NEXT: ld1b { z18.h }, p0/z, [x28, #1, mul vl] -; CHECK-NEXT: ld1b { z19.h }, p0/z, [x27, x19] -; CHECK-NEXT: mul z6.h, z1.h, z6.h -; CHECK-NEXT: mul z7.h, z1.h, z7.h -; CHECK-NEXT: mla z6.h, p0/m, z0.h, z4.h -; CHECK-NEXT: mul z18.h, z1.h, z18.h -; CHECK-NEXT: mla z7.h, p0/m, z0.h, z5.h -; CHECK-NEXT: mul z19.h, z1.h, z19.h -; CHECK-NEXT: mla z18.h, p0/m, z0.h, z17.h -; CHECK-NEXT: mla z19.h, p0/m, z0.h, z16.h -; CHECK-NEXT: mul z4.h, z7.h, z3.h -; CHECK-NEXT: mul z5.h, z6.h, z3.h -; CHECK-NEXT: mla z4.h, p0/m, z18.h, z2.h -; CHECK-NEXT: mla z5.h, p0/m, z19.h, z2.h +; CHECK-NEXT: ld1b { z19.h }, p0/z, [x27, x20] +; CHECK-NEXT: mul z6.h, z3.h, z6.h +; CHECK-NEXT: mul z7.h, z3.h, z7.h +; CHECK-NEXT: mla z6.h, p0/m, z2.h, z4.h +; CHECK-NEXT: mul z18.h, z3.h, z18.h +; CHECK-NEXT: mla z7.h, p0/m, z2.h, z5.h +; CHECK-NEXT: mul z19.h, z3.h, z19.h +; CHECK-NEXT: mla z18.h, p0/m, z2.h, z17.h +; CHECK-NEXT: mla z19.h, p0/m, z2.h, z16.h +; CHECK-NEXT: mul z4.h, z7.h, z1.h +; CHECK-NEXT: mul z5.h, z6.h, z1.h +; CHECK-NEXT: mla z4.h, p0/m, z18.h, z0.h +; CHECK-NEXT: mla z5.h, p0/m, z19.h, z0.h ; CHECK-NEXT: add z4.h, z4.h, #32 // =0x20 ; CHECK-NEXT: add z5.h, z5.h, #32 // =0x20 ; CHECK-NEXT: lsr z4.h, z4.h, #6 ; CHECK-NEXT: lsr z5.h, z5.h, #6 ; CHECK-NEXT: uzp1 z4.b, z5.b, z4.b -; CHECK-NEXT: st1b { z4.b }, p1, [x20, x24] -; CHECK-NEXT: add x24, x24, x21 +; CHECK-NEXT: st1b { z4.b }, p1, [x21, x24] +; CHECK-NEXT: add x24, x24, x19 ; CHECK-NEXT: cmp x22, x24 ; CHECK-NEXT: b.ne .LBB1_13 ; CHECK-NEXT: // %bb.14: // %middle.block @@ -633,17 +629,17 @@ ; CHECK-NEXT: add x26, x4, x24 ; CHECK-NEXT: ld1b { z5.h }, p0/z, [x17, x24] ; CHECK-NEXT: ld1b { z7.h }, p0/z, [x4, x24] -; CHECK-NEXT: ld1b { z4.h }, p0/z, [x25, x19] -; CHECK-NEXT: ld1b { z6.h }, p0/z, [x26, x19] -; CHECK-NEXT: mul z4.h, z1.h, z4.h -; CHECK-NEXT: mul z6.h, z1.h, z6.h -; CHECK-NEXT: mla z4.h, p0/m, z0.h, z5.h -; CHECK-NEXT: mla z6.h, p0/m, z0.h, z7.h -; CHECK-NEXT: mul z4.h, z4.h, z3.h -; CHECK-NEXT: mla z4.h, p0/m, z6.h, z2.h +; CHECK-NEXT: ld1b { z4.h }, p0/z, [x25, x20] +; CHECK-NEXT: ld1b { z6.h }, p0/z, [x26, x20] +; CHECK-NEXT: mul z4.h, z3.h, z4.h +; CHECK-NEXT: mul z6.h, z3.h, z6.h +; CHECK-NEXT: mla z4.h, p0/m, z2.h, z5.h +; CHECK-NEXT: mla z6.h, p0/m, z2.h, z7.h +; CHECK-NEXT: mul z4.h, z4.h, z1.h +; CHECK-NEXT: mla z4.h, p0/m, z6.h, z0.h ; CHECK-NEXT: add z4.h, z4.h, #32 // =0x20 ; CHECK-NEXT: lsr z4.h, z4.h, #6 -; CHECK-NEXT: st1b { z4.h }, p0, [x20, x24] +; CHECK-NEXT: st1b { z4.h }, p0, [x21, x24] ; CHECK-NEXT: add x24, x24, x3 ; CHECK-NEXT: cmp x22, x24 ; CHECK-NEXT: b.ne .LBB1_17 @@ -942,11 +938,11 @@ ; CHECK-NEXT: mov w5, #1 ; CHECK-NEXT: mov x6, x0 ; CHECK-NEXT: rdvl x1, #1 -; CHECK-NEXT: mov z0.h, w12 -; CHECK-NEXT: mov z1.h, w9 -; CHECK-NEXT: mov z2.h, w10 -; CHECK-NEXT: mov z3.h, w14 +; CHECK-NEXT: mov z0.h, w10 +; CHECK-NEXT: mov z1.h, w14 +; CHECK-NEXT: mov z2.h, w12 ; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z3.h, w9 ; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: b .LBB2_4 ; CHECK-NEXT: .LBB2_3: // %for.cond10.for.cond.cleanup12_crit_edge.us @@ -1039,18 +1035,18 @@ ; CHECK-NEXT: ld1b { z17.h }, p0/z, [x25, #1, mul vl] ; CHECK-NEXT: ld1b { z18.h }, p0/z, [x26, #1, mul vl] ; CHECK-NEXT: ld1b { z19.h }, p0/z, [x25, x5] -; CHECK-NEXT: mul z6.h, z1.h, z6.h -; CHECK-NEXT: mul z7.h, z1.h, z7.h -; CHECK-NEXT: mla z6.h, p0/m, z0.h, z4.h -; CHECK-NEXT: mul z18.h, z1.h, z18.h -; CHECK-NEXT: mla z7.h, p0/m, z0.h, z5.h -; CHECK-NEXT: mul z19.h, z1.h, z19.h -; CHECK-NEXT: mla z18.h, p0/m, z0.h, z17.h -; CHECK-NEXT: mla z19.h, p0/m, z0.h, z16.h -; CHECK-NEXT: mul z4.h, z7.h, z3.h -; CHECK-NEXT: mul z5.h, z6.h, z3.h -; CHECK-NEXT: mla z4.h, p0/m, z18.h, z2.h -; CHECK-NEXT: mla z5.h, p0/m, z19.h, z2.h +; CHECK-NEXT: mul z6.h, z3.h, z6.h +; CHECK-NEXT: mul z7.h, z3.h, z7.h +; CHECK-NEXT: mla z6.h, p0/m, z2.h, z4.h +; CHECK-NEXT: mul z18.h, z3.h, z18.h +; CHECK-NEXT: mla z7.h, p0/m, z2.h, z5.h +; CHECK-NEXT: mul z19.h, z3.h, z19.h +; CHECK-NEXT: mla z18.h, p0/m, z2.h, z17.h +; CHECK-NEXT: mla z19.h, p0/m, z2.h, z16.h +; CHECK-NEXT: mul z4.h, z7.h, z1.h +; CHECK-NEXT: mul z5.h, z6.h, z1.h +; CHECK-NEXT: mla z4.h, p0/m, z18.h, z0.h +; CHECK-NEXT: mla z5.h, p0/m, z19.h, z0.h ; CHECK-NEXT: add z4.h, z4.h, #32 // =0x20 ; CHECK-NEXT: add z5.h, z5.h, #32 // =0x20 ; CHECK-NEXT: lsr z4.h, z4.h, #6 @@ -1083,18 +1079,18 @@ ; CHECK-NEXT: uunpklo z6.h, z6.b ; CHECK-NEXT: uunpklo z19.h, z7.b ; CHECK-NEXT: uunpkhi z7.h, z7.b -; CHECK-NEXT: mul z6.h, z1.h, z6.h -; CHECK-NEXT: mul z18.h, z1.h, z18.h -; CHECK-NEXT: mul z7.h, z1.h, z7.h -; CHECK-NEXT: mul z19.h, z1.h, z19.h -; CHECK-NEXT: mla z18.h, p0/m, z0.h, z4.h -; CHECK-NEXT: mla z6.h, p0/m, z0.h, z16.h -; CHECK-NEXT: mla z19.h, p0/m, z0.h, z5.h -; CHECK-NEXT: mla z7.h, p0/m, z0.h, z17.h -; CHECK-NEXT: mul z4.h, z6.h, z3.h -; CHECK-NEXT: mul z5.h, z18.h, z3.h -; CHECK-NEXT: mla z5.h, p0/m, z7.h, z2.h -; CHECK-NEXT: mla z4.h, p0/m, z19.h, z2.h +; CHECK-NEXT: mul z6.h, z3.h, z6.h +; CHECK-NEXT: mul z18.h, z3.h, z18.h +; CHECK-NEXT: mul z7.h, z3.h, z7.h +; CHECK-NEXT: mul z19.h, z3.h, z19.h +; CHECK-NEXT: mla z18.h, p0/m, z2.h, z4.h +; CHECK-NEXT: mla z6.h, p0/m, z2.h, z16.h +; CHECK-NEXT: mla z19.h, p0/m, z2.h, z5.h +; CHECK-NEXT: mla z7.h, p0/m, z2.h, z17.h +; CHECK-NEXT: mul z4.h, z6.h, z1.h +; CHECK-NEXT: mul z5.h, z18.h, z1.h +; CHECK-NEXT: mla z5.h, p0/m, z7.h, z0.h +; CHECK-NEXT: mla z4.h, p0/m, z19.h, z0.h ; CHECK-NEXT: add z4.h, z4.h, #32 // =0x20 ; CHECK-NEXT: add z5.h, z5.h, #32 // =0x20 ; CHECK-NEXT: lsr z5.h, z5.h, #6 @@ -1372,25 +1368,25 @@ ; CHECK-NEXT: add x1, x17, x13 ; CHECK-NEXT: add x18, x16, x13 ; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add x3, x2, x17 +; CHECK-NEXT: add x4, x2, x17 ; CHECK-NEXT: add x17, x18, #1 ; CHECK-NEXT: add x18, x1, #1 ; CHECK-NEXT: add x1, x2, x16 -; CHECK-NEXT: mov w4, #1 +; CHECK-NEXT: mov w3, #1 ; CHECK-NEXT: mov x5, x0 -; CHECK-NEXT: mov z0.h, w14 -; CHECK-NEXT: mov z1.h, w9 -; CHECK-NEXT: mov z2.h, w10 -; CHECK-NEXT: mov z3.h, w15 +; CHECK-NEXT: mov z0.h, w10 +; CHECK-NEXT: mov z1.h, w15 ; CHECK-NEXT: rdvl x6, #1 +; CHECK-NEXT: mov z2.h, w14 ; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z3.h, w9 ; CHECK-NEXT: b .LBB3_4 ; CHECK-NEXT: .LBB3_3: // %for.cond10.for.cond.cleanup12_crit_edge.us ; CHECK-NEXT: // in Loop: Header=BB3_4 Depth=1 ; CHECK-NEXT: add x5, x5, x12 ; CHECK-NEXT: add x8, x8, #1 ; CHECK-NEXT: add x1, x1, x11 -; CHECK-NEXT: mov x3, x19 +; CHECK-NEXT: mov x4, x19 ; CHECK-NEXT: cmp w8, w7 ; CHECK-NEXT: b.eq .LBB3_10 ; CHECK-NEXT: .LBB3_4: // %for.cond10.preheader.us @@ -1404,14 +1400,14 @@ ; CHECK-NEXT: add x21, x0, x21 ; CHECK-NEXT: cmp x5, x20 ; CHECK-NEXT: add x20, x17, x19 -; CHECK-NEXT: ccmp x3, x21, #2, lo +; CHECK-NEXT: ccmp x4, x21, #2, lo ; CHECK-NEXT: add x19, x16, x19 ; CHECK-NEXT: add x19, x2, x19 ; CHECK-NEXT: add x22, x2, x20 ; CHECK-NEXT: cset w20, lo ; CHECK-NEXT: cmp x19, x21 ; CHECK-NEXT: ccmp x5, x22, #2, lo -; CHECK-NEXT: add x19, x3, x11 +; CHECK-NEXT: add x19, x4, x11 ; CHECK-NEXT: b.lo .LBB3_8 ; CHECK-NEXT: // %bb.5: // %for.cond10.preheader.us ; CHECK-NEXT: // in Loop: Header=BB3_4 Depth=1 @@ -1423,12 +1419,12 @@ ; CHECK-NEXT: .LBB3_7: // %vector.body ; CHECK-NEXT: // Parent Loop BB3_4 Depth=1 ; CHECK-NEXT: // => This Inner Loop Header: Depth=2 -; CHECK-NEXT: add x21, x3, x20 +; CHECK-NEXT: add x21, x4, x20 ; CHECK-NEXT: add x22, x1, x20 -; CHECK-NEXT: ld1b { z4.b }, p1/z, [x3, x20] +; CHECK-NEXT: ld1b { z4.b }, p1/z, [x4, x20] ; CHECK-NEXT: ld1b { z5.b }, p1/z, [x1, x20] -; CHECK-NEXT: ld1b { z6.b }, p1/z, [x21, x4] -; CHECK-NEXT: ld1b { z7.b }, p1/z, [x22, x4] +; CHECK-NEXT: ld1b { z6.b }, p1/z, [x21, x3] +; CHECK-NEXT: ld1b { z7.b }, p1/z, [x22, x3] ; CHECK-NEXT: uunpklo z16.h, z4.b ; CHECK-NEXT: uunpkhi z4.h, z4.b ; CHECK-NEXT: uunpkhi z17.h, z5.b @@ -1437,18 +1433,18 @@ ; CHECK-NEXT: uunpklo z6.h, z6.b ; CHECK-NEXT: uunpklo z19.h, z7.b ; CHECK-NEXT: uunpkhi z7.h, z7.b -; CHECK-NEXT: mul z6.h, z1.h, z6.h -; CHECK-NEXT: mul z18.h, z1.h, z18.h -; CHECK-NEXT: mul z7.h, z1.h, z7.h -; CHECK-NEXT: mul z19.h, z1.h, z19.h -; CHECK-NEXT: mla z18.h, p0/m, z0.h, z4.h -; CHECK-NEXT: mla z6.h, p0/m, z0.h, z16.h -; CHECK-NEXT: mla z19.h, p0/m, z0.h, z5.h -; CHECK-NEXT: mla z7.h, p0/m, z0.h, z17.h -; CHECK-NEXT: mul z4.h, z6.h, z3.h -; CHECK-NEXT: mul z5.h, z18.h, z3.h -; CHECK-NEXT: mla z5.h, p0/m, z7.h, z2.h -; CHECK-NEXT: mla z4.h, p0/m, z19.h, z2.h +; CHECK-NEXT: mul z6.h, z3.h, z6.h +; CHECK-NEXT: mul z18.h, z3.h, z18.h +; CHECK-NEXT: mul z7.h, z3.h, z7.h +; CHECK-NEXT: mul z19.h, z3.h, z19.h +; CHECK-NEXT: mla z18.h, p0/m, z2.h, z4.h +; CHECK-NEXT: mla z6.h, p0/m, z2.h, z16.h +; CHECK-NEXT: mla z19.h, p0/m, z2.h, z5.h +; CHECK-NEXT: mla z7.h, p0/m, z2.h, z17.h +; CHECK-NEXT: mul z4.h, z6.h, z1.h +; CHECK-NEXT: mul z5.h, z18.h, z1.h +; CHECK-NEXT: mla z5.h, p0/m, z7.h, z0.h +; CHECK-NEXT: mla z4.h, p0/m, z19.h, z0.h ; CHECK-NEXT: add z4.h, z4.h, #32 // =0x20 ; CHECK-NEXT: add z5.h, z5.h, #32 // =0x20 ; CHECK-NEXT: lsr z5.h, z5.h, #6 @@ -1465,7 +1461,7 @@ ; CHECK-NEXT: .LBB3_9: // %for.body13.us ; CHECK-NEXT: // Parent Loop BB3_4 Depth=1 ; CHECK-NEXT: // => This Inner Loop Header: Depth=2 -; CHECK-NEXT: add x21, x3, x20 +; CHECK-NEXT: add x21, x4, x20 ; CHECK-NEXT: ldrb w22, [x19, x20] ; CHECK-NEXT: add x23, x19, x20 ; CHECK-NEXT: ldrb w24, [x21] Index: llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll =================================================================== --- llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll +++ llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll @@ -313,12 +313,12 @@ ; CHECK-NEXT: for.cond4.preheader.lr.ph: ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[F:%.*]], 0 ; CHECK-NEXT: [[CONV25:%.*]] = sext i16 [[E:%.*]] to i32 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT143:%.*]] = insertelement <4 x i32> poison, i32 [[CONV25]], i32 0 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_COND4_PREHEADER_US_PREHEADER:%.*]], label [[FOR_COND4_PREHEADER_PREHEADER:%.*]] ; CHECK: for.cond4.preheader.us.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[CONV25]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT144:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i32> zeroinitializer, [[BROADCAST_SPLAT144]] -; CHECK-NEXT: ret <4 x i32> [[TMP1]] +; CHECK-NEXT: [[BROADCAST_SPLAT144:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT143]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = mul <4 x i32> zeroinitializer, [[BROADCAST_SPLAT144]] +; CHECK-NEXT: ret <4 x i32> [[TMP0]] ; CHECK: for.cond4.preheader.preheader: ; CHECK-NEXT: ret <4 x i32> zeroinitializer ;