diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2902,8 +2902,9 @@
   /// Try to optimize extending or truncating conversion instructions (like
   /// zext, trunc, fptoui, uitofp) for the target.
-  virtual bool optimizeExtendOrTruncateConversion(Instruction *I,
-                                                  Loop *L) const {
+  virtual bool
+  optimizeExtendOrTruncateConversion(Instruction *I, Loop *L,
+                                     const TargetTransformInfo &TTI) const {
     return false;
   }
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -8065,8 +8065,8 @@
       return true;
 
     if ((isa<UIToFPInst>(I) || isa<FPToUIInst>(I) || isa<TruncInst>(I)) &&
-        TLI->optimizeExtendOrTruncateConversion(I,
-                                                LI->getLoopFor(I->getParent())))
+        TLI->optimizeExtendOrTruncateConversion(
+            I, LI->getLoopFor(I->getParent()), *TTI))
       return true;
 
     if (isa<ZExtInst>(I) || isa<SExtInst>(I)) {
@@ -8078,7 +8078,7 @@
         return SinkCast(CI);
       } else {
         if (TLI->optimizeExtendOrTruncateConversion(
-                I, LI->getLoopFor(I->getParent())))
+                I, LI->getLoopFor(I->getParent()), *TTI))
           return true;
 
         bool MadeChange = optimizeExt(I);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -637,8 +637,8 @@
   bool shouldSinkOperands(Instruction *I,
                           SmallVectorImpl<Use *> &Ops) const override;
 
-  bool optimizeExtendOrTruncateConversion(Instruction *I,
-                                          Loop *L) const override;
+  bool optimizeExtendOrTruncateConversion(
+      Instruction *I, Loop *L, const TargetTransformInfo &TTI) const override;
 
   bool hasPairedLoad(EVT LoadedType, Align &RequiredAligment) const override;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14570,8 +14570,8 @@
   TI->eraseFromParent();
 }
 
-bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(Instruction *I,
-                                                               Loop *L) const {
+bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
+    Instruction *I, Loop *L, const TargetTransformInfo &TTI) const {
   // shuffle_vector instructions are serialized when targeting SVE,
   // see LowerSPLAT_VECTOR. This peephole is not beneficial.
   if (Subtarget->useSVEForFixedLengthVectors())
@@ -14596,7 +14596,18 @@
   // into i8x lanes. This is enabled for cases where it is beneficial.
   auto *ZExt = dyn_cast<ZExtInst>(I);
   if (ZExt && SrcTy->getElementType()->isIntegerTy(8)) {
-    auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
+    // If the ZExt can be lowered to a single ZExt to the next power-of-2 and
+    // the remaining ZExt folded into the user, don't use tbl lowering.
+    auto SrcWidth = SrcTy->getElementType()->getScalarSizeInBits();
+    auto DstWidth = DstTy->getElementType()->getScalarSizeInBits();
+    if (SrcWidth * 4 == DstWidth) {
+      if (TTI.getCastInstrCost(I->getOpcode(), DstTy,
+                               VectorType::getExtendedElementVectorType(SrcTy),
+                               TargetTransformInfo::getCastContextHint(I),
+                               TTI::TCK_SizeAndLatency, I) == TTI::TCC_Free)
+        return false;
+    }
+
     if (DstWidth % 8 == 0 && DstWidth > 16 && DstWidth < 64) {
       createTblShuffleForZExt(ZExt, Subtarget->isLittleEndian());
       return true;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -57,7 +57,7 @@
     VECTOR_LDST_FOUR_ELEMENTS
   };
 
-  bool isWideningInstruction(Type *Ty, unsigned Opcode,
+  bool isWideningInstruction(Type *DstTy, Type *SrcTy, unsigned Opcode,
                              ArrayRef<const Value *> Args);
 
   // A helper function called by 'getVectorInstrCost'.
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1768,7 +1768,8 @@
   llvm_unreachable("Unsupported register kind");
 }
 
-bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
+bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, Type *SrcTy,
+                                           unsigned Opcode,
                                            ArrayRef<const Value *> Args) {
 
   // A helper that returns a vector type from the given type. The number of
@@ -1826,7 +1827,9 @@
   // Legalize the source type and ensure it can be used in a widening
   // operation.
-  auto *SrcTy = toVectorTy(Extend->getSrcTy());
+  if (!SrcTy)
+    SrcTy = toVectorTy(Extend->getSrcTy());
+
   auto SrcTyL = getTypeLegalizationCost(SrcTy);
   unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
   if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
@@ -1856,7 +1859,7 @@
   if (I && I->hasOneUser()) {
     auto *SingleUser = cast<Instruction>(*I->user_begin());
     SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
-    if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
+    if (isWideningInstruction(Dst, Src, SingleUser->getOpcode(), Operands)) {
       // If the cast is the second operand, it is free. We will generate either
       // a "wide" or "long" version of the widening instruction.
       if (I == SingleUser->getOperand(1))
@@ -2466,7 +2469,8 @@
     // LT.first = 2 the cost is 28. If both operands are extensions it will not
     // need to scalarize so the cost can be cheaper (smull or umull).
-    if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
+    if (LT.second != MVT::v2i64 ||
+        isWideningInstruction(Ty, nullptr, Opcode, Args))
       return LT.first;
     return LT.first * 14;
   case ISD::ADD:
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-vector-zext.ll b/llvm/test/CodeGen/AArch64/sve-fixed-vector-zext.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-vector-zext.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-vector-zext.ll
@@ -6,26 +6,24 @@
 define internal i32 @test(ptr nocapture readonly %p1, i32 %i1, ptr nocapture readonly %p2, i32 %i2) {
 ; SVE256-LABEL: test:
-; SVE256: ld1b { z0.h }, p0/z,
-; SVE256: ld1b { z1.h }, p0/z,
-; SVE256: sub z0.h, z0.h, z1.h
-; SVE256-NEXT: sunpklo z1.s, z0.h
-; SVE256-NEXT: ext z0.b, z0.b, z0.b, #16
-; SVE256-NEXT: sunpklo z0.s, z0.h
-; SVE256-NEXT: add z0.s, z1.s, z0.s
-; SVE256-NEXT: uaddv d0, p1, z0.s
+; SVE256: ld1b { z0.h }, p0/z,
+; SVE256: ld1b { z1.h }, p0/z,
+; SVE256: sub z0.h, z0.h, z1.h
+; SVE256-NEXT: sunpklo z1.s, z0.h
+; SVE256-NEXT: ext z0.b, z0.b, z0.b, #16
+; SVE256-NEXT: sunpklo z0.s, z0.h
+; SVE256-NEXT: add z0.s, z1.s, z0.s
+; SVE256-NEXT: uaddv d0, p1, z0.s
 ; NEON-LABEL: test:
-; NEON: tbl
-; NEON-NEXT: tbl
-; NEON-NEXT: tbl
-; NEON-NEXT: tbl
-; NEON-NEXT: tbl
-; NEON-NEXT: tbl
-; NEON-NEXT: tbl
-; NEON-NEXT: tbl
-; NEON: addv
-
+; NEON: ldr q0, [x0, w9, sxtw]
+; NEON: ldr q1, [x2, w10, sxtw]
+; NEON: usubl2 v2.8h, v0.16b, v1.16b
+; NEON-NEXT: usubl v0.8h, v0.8b, v1.8b
+; NEON: saddl2 v1.4s, v0.8h, v2.8h
+; NEON-NEXT: saddl v0.4s, v0.4h, v2.4h
+; NEON-NEXT: add v0.4s, v0.4s, v1.4s
+; NEON-NEXT: addv s0, v0.4s
 L.entry:
   br label %L1
diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
--- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
@@ -2727,101 +2727,39 @@
 define i32 @test_pr62620_widening_instr(ptr %p1, ptr %p2, i64 %lx, i32 %h) {
 ; CHECK-LABEL: test_pr62620_widening_instr:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:  Lloh38:
-; CHECK-NEXT:    adrp x9, lCPI23_0@PAGE
-; CHECK-NEXT:  Lloh39:
-; CHECK-NEXT:    adrp x10, lCPI23_1@PAGE
-; CHECK-NEXT:  Lloh40:
-; CHECK-NEXT:    adrp x11, lCPI23_2@PAGE
-; CHECK-NEXT:  Lloh41:
-; CHECK-NEXT:    adrp x12, lCPI23_3@PAGE
 ; CHECK-NEXT:    mov x8, x0
 ; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:  Lloh42:
-; CHECK-NEXT:    ldr q0, [x9, lCPI23_0@PAGEOFF]
 ; CHECK-NEXT:    lsl x9, x2, #4
-; CHECK-NEXT:  Lloh43:
-; CHECK-NEXT:    ldr q1, [x10, lCPI23_1@PAGEOFF]
-; CHECK-NEXT:  Lloh44:
-; CHECK-NEXT:    ldr q2, [x11, lCPI23_2@PAGEOFF]
-; CHECK-NEXT:  Lloh45:
-; CHECK-NEXT:    ldr q3, [x12, lCPI23_3@PAGEOFF]
 ; CHECK-NEXT:  LBB23_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr q4, [x8, x9]
+; CHECK-NEXT:    ldr q0, [x8, x9]
 ; CHECK-NEXT:    subs w3, w3, #1
-; CHECK-NEXT:    ldr q5, [x1, x9]
-; CHECK-NEXT:    tbl.16b v6, { v4 }, v0
-; CHECK-NEXT:    tbl.16b v7, { v4 }, v1
-; CHECK-NEXT:    tbl.16b v16, { v4 }, v2
-; CHECK-NEXT:    tbl.16b v4, { v4 }, v3
-; CHECK-NEXT:    tbl.16b v17, { v5 }, v2
-; CHECK-NEXT:    tbl.16b v18, { v5 }, v3
-; CHECK-NEXT:    tbl.16b v19, { v5 }, v0
-; CHECK-NEXT:    tbl.16b v5, { v5 }, v1
-; CHECK-NEXT:    sabd.4s v16, v16, v17
-; CHECK-NEXT:    sabd.4s v4, v4, v18
-; CHECK-NEXT:    saba.4s v16, v7, v5
-; CHECK-NEXT:    saba.4s v4, v6, v19
-; CHECK-NEXT:    add.4s v4, v4, v16
-; CHECK-NEXT:    addv.4s s4, v4
-; CHECK-NEXT:    fmov w10, s4
+; CHECK-NEXT:    ldr q1, [x1, x9]
+; CHECK-NEXT:    uabdl.8h v2, v0, v1
+; CHECK-NEXT:    uabal2.8h v2, v0, v1
+; CHECK-NEXT:    uaddlv.8h s0, v2
+; CHECK-NEXT:    fmov w10, s0
 ; CHECK-NEXT:    add w0, w10, w0
 ; CHECK-NEXT:    b.ne LBB23_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh41, Lloh45
-; CHECK-NEXT:    .loh AdrpLdr Lloh40, Lloh44
-; CHECK-NEXT:    .loh AdrpLdr Lloh39, Lloh43
-; CHECK-NEXT:    .loh AdrpLdr Lloh38, Lloh42
 ;
 ; CHECK-BE-LABEL: test_pr62620_widening_instr:
 ; CHECK-BE:       // %bb.0: // %entry
-; CHECK-BE-NEXT:    adrp x10, .LCPI23_0
-; CHECK-BE-NEXT:    add x10, x10, :lo12:.LCPI23_0
 ; CHECK-BE-NEXT:    mov x8, x0
 ; CHECK-BE-NEXT:    lsl x9, x2, #4
 ; CHECK-BE-NEXT:    mov w0, wzr
 ; CHECK-BE-NEXT:    add x8, x8, x9
-; CHECK-BE-NEXT:    ld1 { v0.16b }, [x10]
-; CHECK-BE-NEXT:    adrp x10, .LCPI23_1
-; CHECK-BE-NEXT:    add x10, x10, :lo12:.LCPI23_1
 ; CHECK-BE-NEXT:    add x9, x1, x9
-; CHECK-BE-NEXT:    ld1 { v1.16b }, [x10]
-; CHECK-BE-NEXT:    adrp x10, .LCPI23_2
-; CHECK-BE-NEXT:    add x10, x10, :lo12:.LCPI23_2
-; CHECK-BE-NEXT:    ld1 { v2.16b }, [x10]
-; CHECK-BE-NEXT:    adrp x10, .LCPI23_3
-; CHECK-BE-NEXT:    add x10, x10, :lo12:.LCPI23_3
-; CHECK-BE-NEXT:    ld1 { v3.16b }, [x10]
 ; CHECK-BE-NEXT:  .LBB23_1: // %loop
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-BE-NEXT:    ld1 { v4.16b }, [x8]
+; CHECK-BE-NEXT:    ld1 { v0.16b }, [x8]
 ; CHECK-BE-NEXT:    subs w3, w3, #1
-; CHECK-BE-NEXT:    ld1 { v5.16b }, [x9]
-; CHECK-BE-NEXT:    tbl v6.16b, { v4.16b }, v0.16b
-; CHECK-BE-NEXT:    tbl v7.16b, { v4.16b }, v1.16b
-; CHECK-BE-NEXT:    tbl v17.16b, { v5.16b }, v0.16b
-; CHECK-BE-NEXT:    tbl v18.16b, { v5.16b }, v1.16b
-; CHECK-BE-NEXT:    tbl v16.16b, { v4.16b }, v3.16b
-; CHECK-BE-NEXT:    tbl v4.16b, { v4.16b }, v2.16b
-; CHECK-BE-NEXT:    tbl v19.16b, { v5.16b }, v3.16b
-; CHECK-BE-NEXT:    tbl v5.16b, { v5.16b }, v2.16b
-; CHECK-BE-NEXT:    rev32 v7.16b, v7.16b
-; CHECK-BE-NEXT:    rev32 v6.16b, v6.16b
-; CHECK-BE-NEXT:    rev32 v18.16b, v18.16b
-; CHECK-BE-NEXT:    rev32 v17.16b, v17.16b
-; CHECK-BE-NEXT:    rev32 v16.16b, v16.16b
-; CHECK-BE-NEXT:    rev32 v4.16b, v4.16b
-; CHECK-BE-NEXT:    rev32 v19.16b, v19.16b
-; CHECK-BE-NEXT:    rev32 v5.16b, v5.16b
-; CHECK-BE-NEXT:    sabd v7.4s, v7.4s, v18.4s
-; CHECK-BE-NEXT:    sabd v6.4s, v6.4s, v17.4s
-; CHECK-BE-NEXT:    saba v7.4s, v4.4s, v5.4s
-; CHECK-BE-NEXT:    saba v6.4s, v16.4s, v19.4s
-; CHECK-BE-NEXT:    add v4.4s, v6.4s, v7.4s
-; CHECK-BE-NEXT:    addv s4, v4.4s
-; CHECK-BE-NEXT:    fmov w10, s4
+; CHECK-BE-NEXT:    ld1 { v1.16b }, [x9]
+; CHECK-BE-NEXT:    uabdl v2.8h, v0.8b, v1.8b
+; CHECK-BE-NEXT:    uabal2 v2.8h, v0.16b, v1.16b
+; CHECK-BE-NEXT:    uaddlv s0, v2.8h
+; CHECK-BE-NEXT:    fmov w10, s0
 ; CHECK-BE-NEXT:    add w0, w10, w0
 ; CHECK-BE-NEXT:    b.ne .LBB23_1
 ; CHECK-BE-NEXT:  // %bb.2: // %exit
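Note (not part of the patch): a minimal, hypothetical LLVM IR sketch of the shape the new TTI check targets. The function name and values below are made up for illustration. The idea is an i8-to-i32 vector zext (SrcWidth * 4 == DstWidth) whose only user looks like a widening operation to the AArch64 cost model, so getCastInstrCost reports the remaining i16-to-i32 part of the extend as free and the tbl-based lowering is skipped in favor of a plain extend.

; Illustrative only: each zext has a single user (the sub), which the AArch64
; cost model can treat as a widening operation (usubl/usubl2-style), so the
; i16->i32 half of each extend is expected to be free and tbl lowering is not
; used for the zexts.
define <16 x i32> @zext_feeds_widening_sub(<16 x i8> %a, <16 x i8> %b) {
  %za = zext <16 x i8> %a to <16 x i32>
  %zb = zext <16 x i8> %b to <16 x i32>
  %sub = sub <16 x i32> %za, %zb
  ret <16 x i32> %sub
}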