diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -67,6 +67,7 @@ class FastISel; class FunctionLoweringInfo; class GlobalValue; +class Loop; class GISelKnownBits; class IntrinsicInst; class IRBuilderBase; @@ -2798,6 +2799,13 @@ return false; } + /// Try to optimize extending or truncating conversion instructions (like + /// zext, trunc, fptoui, uitofp) for the target. + virtual bool optimizeExtendOrTruncateConversion(Instruction *I, + Loop *L) const { + return false; + } + /// Return true if the target supplies and combines to a paired load /// two loaded values of type LoadedType next to each other in memory. /// RequiredAlignment gives the minimal alignment constraints that must be met diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -8055,6 +8055,10 @@ TargetLowering::TypeExpandInteger) { return SinkCast(CI); } else { + if (TLI->optimizeExtendOrTruncateConversion( + I, LI->getLoopFor(I->getParent()))) + return true; + bool MadeChange = optimizeExt(I); return MadeChange | optimizeExtUses(I); } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -606,6 +606,9 @@ bool shouldSinkOperands(Instruction *I, SmallVectorImpl<Use *> &Ops) const override; + bool optimizeExtendOrTruncateConversion(Instruction *I, + Loop *L) const override; + bool hasPairedLoad(EVT LoadedType, Align &RequiredAligment) const override; unsigned getMaxSupportedInterleaveFactor() const override { return 4; } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -29,6 +29,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/ObjCARCUtil.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -13183,6 +13184,60 @@ return false; } +static void createTblShuffleForZExt(ZExtInst *ZExt, bool IsLittleEndian) { + Value *Op = ZExt->getOperand(0); + auto *SrcTy = dyn_cast<FixedVectorType>(Op->getType()); + auto *DstTy = dyn_cast<FixedVectorType>(ZExt->getType()); + unsigned NumElts = SrcTy->getNumElements(); + IRBuilder<> Builder(ZExt); + SmallVector<int> Mask(4 * NumElts, NumElts); + // Create a mask that selects <0,0,0,Op[i]> for each lane of vector of i32 to + // replace the original ZExt. This can later be lowered to a set of tbl + // instructions. + for (unsigned i = 0; i < NumElts; i++) { + if (IsLittleEndian) + Mask[i * 4] = i; + else + Mask[i * 4 + 3] = i; + } + + auto *FirstEltZero = Builder.CreateInsertElement( + PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0)); + Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask); + Result = Builder.CreateBitCast(Result, DstTy); + ZExt->replaceAllUsesWith(Result); + ZExt->eraseFromParent(); +} + +bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(Instruction *I, + Loop *L) const { + // Try to optimize conversions using tbl. 
This requires materializing constant + // index vectors, which can increase code size and add loads. Skip the + // transform unless the conversion is in a loop block guaranteed to execute + // and we are not optimizing for size. + Function *F = I->getParent()->getParent(); + if (!L || L->getHeader() != I->getParent() || F->hasMinSize() || + F->hasOptSize()) + return false; + + auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType()); + auto *DstTy = dyn_cast<FixedVectorType>(I->getType()); + if (!SrcTy || !DstTy) + return false; + + // Convert 'zext <(8|16) x i8> %x to <(8|16) x i32>' to a shuffle that can be + // lowered to either 2 or 4 tbl instructions to insert the original i8 + // elements into i32 lanes. + auto *ZExt = dyn_cast<ZExtInst>(I); + if (ZExt && (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) && + SrcTy->getElementType()->isIntegerTy(8) && + DstTy->getElementType()->isIntegerTy(32)) { + createTblShuffleForZExt(ZExt, Subtarget->isLittleEndian()); + return true; + } + return false; +} + bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType, Align &RequiredAligment) const { if (!LoadedType.isSimple() || diff --git a/llvm/test/CodeGen/AArch64/vselect-ext.ll b/llvm/test/CodeGen/AArch64/vselect-ext.ll --- a/llvm/test/CodeGen/AArch64/vselect-ext.ll +++ b/llvm/test/CodeGen/AArch64/vselect-ext.ll @@ -573,35 +573,53 @@ define void @extension_in_loop_v16i8_to_v16i32(i8* %src, i32* %dst) { ; CHECK-LABEL: extension_in_loop_v16i8_to_v16i32: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: movi.2d v0, #0xffffffffffffffff +; CHECK-NEXT: Lloh2: +; CHECK-NEXT: adrp x9, lCPI24_0@PAGE +; CHECK-NEXT: Lloh3: +; CHECK-NEXT: adrp x10, lCPI24_1@PAGE +; CHECK-NEXT: Lloh4: +; CHECK-NEXT: adrp x11, lCPI24_2@PAGE +; CHECK-NEXT: Lloh5: +; CHECK-NEXT: adrp x12, lCPI24_3@PAGE +; CHECK-NEXT: movi.2d v2, #0xffffffffffffffff ; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: Lloh6: +; CHECK-NEXT: ldr q0, [x9, lCPI24_0@PAGEOFF] +; CHECK-NEXT: Lloh7: +; CHECK-NEXT: ldr q1, [x10, lCPI24_1@PAGEOFF] +; CHECK-NEXT: Lloh8: +; CHECK-NEXT: ldr q3, [x11, lCPI24_2@PAGEOFF] +; CHECK-NEXT: Lloh9: +; CHECK-NEXT: ldr q4, [x12, lCPI24_3@PAGEOFF] ; CHECK-NEXT: LBB24_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr q1, [x0, x8] +; CHECK-NEXT: ldr q5, [x0, x8] ; CHECK-NEXT: add x8, x8, #16 ; CHECK-NEXT: cmp x8, #128 -; CHECK-NEXT: cmgt.16b v2, v1, v0 -; CHECK-NEXT: ushll2.8h v3, v1, #0 -; CHECK-NEXT: sshll2.8h v4, v2, #0 -; CHECK-NEXT: ushll2.4s v5, v3, #0 -; CHECK-NEXT: ushll.4s v3, v3, #0 -; CHECK-NEXT: sshll2.4s v6, v4, #0 -; CHECK-NEXT: sshll.4s v4, v4, #0 -; CHECK-NEXT: ushll.8h v1, v1, #0 -; CHECK-NEXT: sshll.8h v2, v2, #0 +; CHECK-NEXT: cmgt.16b v6, v5, v2 +; CHECK-NEXT: tbl.16b v7, { v5 }, v0 +; CHECK-NEXT: tbl.16b v16, { v5 }, v1 +; CHECK-NEXT: sshll2.8h v18, v6, #0 +; CHECK-NEXT: tbl.16b v17, { v5 }, v3 +; CHECK-NEXT: sshll2.4s v19, v18, #0 +; CHECK-NEXT: sshll.4s v18, v18, #0 +; CHECK-NEXT: tbl.16b v5, { v5 }, v4 +; CHECK-NEXT: sshll.8h v6, v6, #0 +; CHECK-NEXT: and.16b v7, v7, v19 +; CHECK-NEXT: and.16b v16, v16, v18 +; CHECK-NEXT: stp q16, q7, [x1, #32] +; CHECK-NEXT: sshll2.4s v7, v6, #0 +; CHECK-NEXT: sshll.4s v6, v6, #0 +; CHECK-NEXT: and.16b v7, v17, v7 ; CHECK-NEXT: and.16b v5, v5, v6 -; CHECK-NEXT: and.16b v3, v3, v4 -; CHECK-NEXT: stp q3, q5, [x1, #32] -; CHECK-NEXT: sshll2.4s v4, v2, #0 -; CHECK-NEXT: sshll.4s v2, v2, #0 -; CHECK-NEXT: ushll2.4s v3, v1, #0 -; CHECK-NEXT: ushll.4s v1, v1, #0 -; CHECK-NEXT: and.16b v3, v3, v4 -; CHECK-NEXT: and.16b v1, v1, v2 -; 
CHECK-NEXT: stp q1, q3, [x1], #64 +; CHECK-NEXT: stp q5, q7, [x1], #64 ; CHECK-NEXT: b.ne LBB24_1 ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpLdr Lloh5, Lloh9 +; CHECK-NEXT: .loh AdrpLdr Lloh4, Lloh8 +; CHECK-NEXT: .loh AdrpLdr Lloh3, Lloh7 +; CHECK-NEXT: .loh AdrpLdr Lloh2, Lloh6 entry: br label %loop @@ -627,23 +645,23 @@ define void @extension_in_loop_as_shuffle_v16i8_to_v16i32(i8* %src, i32* %dst) { ; CHECK-LABEL: extension_in_loop_as_shuffle_v16i8_to_v16i32: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: Lloh2: +; CHECK-NEXT: Lloh10: ; CHECK-NEXT: adrp x9, lCPI25_0@PAGE -; CHECK-NEXT: Lloh3: +; CHECK-NEXT: Lloh11: ; CHECK-NEXT: adrp x10, lCPI25_1@PAGE -; CHECK-NEXT: Lloh4: +; CHECK-NEXT: Lloh12: ; CHECK-NEXT: adrp x11, lCPI25_2@PAGE -; CHECK-NEXT: Lloh5: +; CHECK-NEXT: Lloh13: ; CHECK-NEXT: adrp x12, lCPI25_3@PAGE ; CHECK-NEXT: movi.2d v2, #0xffffffffffffffff ; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: Lloh6: +; CHECK-NEXT: Lloh14: ; CHECK-NEXT: ldr q0, [x9, lCPI25_0@PAGEOFF] -; CHECK-NEXT: Lloh7: +; CHECK-NEXT: Lloh15: ; CHECK-NEXT: ldr q1, [x10, lCPI25_1@PAGEOFF] -; CHECK-NEXT: Lloh8: +; CHECK-NEXT: Lloh16: ; CHECK-NEXT: ldr q3, [x11, lCPI25_2@PAGEOFF] -; CHECK-NEXT: Lloh9: +; CHECK-NEXT: Lloh17: ; CHECK-NEXT: ldr q4, [x12, lCPI25_3@PAGEOFF] ; CHECK-NEXT: LBB25_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -670,10 +688,10 @@ ; CHECK-NEXT: b.ne LBB25_1 ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: ret -; CHECK-NEXT: .loh AdrpLdr Lloh5, Lloh9 -; CHECK-NEXT: .loh AdrpLdr Lloh4, Lloh8 -; CHECK-NEXT: .loh AdrpLdr Lloh3, Lloh7 -; CHECK-NEXT: .loh AdrpLdr Lloh2, Lloh6 +; CHECK-NEXT: .loh AdrpLdr Lloh13, Lloh17 +; CHECK-NEXT: .loh AdrpLdr Lloh12, Lloh16 +; CHECK-NEXT: .loh AdrpLdr Lloh11, Lloh15 +; CHECK-NEXT: .loh AdrpLdr Lloh10, Lloh14 entry: br label %loop @@ -700,23 +718,23 @@ define void @shuffle_in_loop_is_no_extend_v16i8_to_v16i32(i8* %src, i32* %dst) { ; CHECK-LABEL: shuffle_in_loop_is_no_extend_v16i8_to_v16i32: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: Lloh10: +; CHECK-NEXT: Lloh18: ; CHECK-NEXT: adrp x9, lCPI26_0@PAGE -; CHECK-NEXT: Lloh11: +; CHECK-NEXT: Lloh19: ; CHECK-NEXT: adrp x10, lCPI26_1@PAGE -; CHECK-NEXT: Lloh12: +; CHECK-NEXT: Lloh20: ; CHECK-NEXT: adrp x11, lCPI26_2@PAGE -; CHECK-NEXT: Lloh13: +; CHECK-NEXT: Lloh21: ; CHECK-NEXT: adrp x12, lCPI26_3@PAGE ; CHECK-NEXT: movi.2d v2, #0xffffffffffffffff ; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: Lloh14: +; CHECK-NEXT: Lloh22: ; CHECK-NEXT: ldr q0, [x9, lCPI26_0@PAGEOFF] -; CHECK-NEXT: Lloh15: +; CHECK-NEXT: Lloh23: ; CHECK-NEXT: ldr q1, [x10, lCPI26_1@PAGEOFF] -; CHECK-NEXT: Lloh16: +; CHECK-NEXT: Lloh24: ; CHECK-NEXT: ldr q3, [x11, lCPI26_2@PAGEOFF] -; CHECK-NEXT: Lloh17: +; CHECK-NEXT: Lloh25: ; CHECK-NEXT: ldr q4, [x12, lCPI26_3@PAGEOFF] ; CHECK-NEXT: LBB26_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -743,10 +761,10 @@ ; CHECK-NEXT: b.ne LBB26_1 ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: ret -; CHECK-NEXT: .loh AdrpLdr Lloh13, Lloh17 -; CHECK-NEXT: .loh AdrpLdr Lloh12, Lloh16 -; CHECK-NEXT: .loh AdrpLdr Lloh11, Lloh15 -; CHECK-NEXT: .loh AdrpLdr Lloh10, Lloh14 +; CHECK-NEXT: .loh AdrpLdr Lloh21, Lloh25 +; CHECK-NEXT: .loh AdrpLdr Lloh20, Lloh24 +; CHECK-NEXT: .loh AdrpLdr Lloh19, Lloh23 +; CHECK-NEXT: .loh AdrpLdr Lloh18, Lloh22 entry: br label %loop diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll --- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll @@ -2,31 +2,199 @@ ; RUN: 
llc -mtriple=arm64-apple-ios -o - %s | FileCheck %s ; RUN: llc -mtriple=aarch64_be-unknown-linux -o - %s | FileCheck --check-prefix=CHECK-BE %s +; CHECK-LABEL: lCPI0_0: +; CHECK-NEXT: .byte 0 ; 0x0 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 1 ; 0x1 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 2 ; 0x2 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 3 ; 0x3 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT:lCPI0_1: +; CHECK-NEXT: .byte 4 ; 0x4 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 5 ; 0x5 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 6 ; 0x6 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 7 ; 0x7 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT:lCPI0_2: +; CHECK-NEXT: .byte 8 ; 0x8 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 9 ; 0x9 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 10 ; 0xa +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 11 ; 0xb +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT:lCPI0_3: +; CHECK-NEXT: .byte 12 ; 0xc +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 13 ; 0xd +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 14 ; 0xe +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 15 ; 0xf +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff + +; CHECK-BE: .LCPI0_0: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 0 // 0x0 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 1 // 0x1 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 2 // 0x2 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 3 // 0x3 +; CHECK-BE-NEXT: .LCPI0_1: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 4 // 0x4 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 5 // 0x5 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 6 // 0x6 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 7 // 0x7 +; CHECK-BE-NEXT: .LCPI0_2: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 
// 0xff +; CHECK-BE-NEXT: .byte 8 // 0x8 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 9 // 0x9 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 10 // 0xa +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 11 // 0xb +; CHECK-BE-NEXT: .LCPI0_3: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 12 // 0xc +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 13 // 0xd +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 14 // 0xe +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 15 // 0xf + ; It's profitable to convert the zext to a shuffle, which in turn will be ; lowered to 4 tbl instructions. The masks are materialized outside the loop. define void @zext_v16i8_to_v16i32_in_loop(i8* %src, i32* %dst) { ; CHECK-LABEL: zext_v16i8_to_v16i32_in_loop: ; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: Lloh0: +; CHECK-NEXT: adrp x9, lCPI0_0@PAGE +; CHECK-NEXT: Lloh1: +; CHECK-NEXT: adrp x10, lCPI0_1@PAGE +; CHECK-NEXT: Lloh2: +; CHECK-NEXT: adrp x11, lCPI0_2@PAGE +; CHECK-NEXT: Lloh3: +; CHECK-NEXT: adrp x12, lCPI0_3@PAGE ; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: Lloh4: +; CHECK-NEXT: ldr q0, [x9, lCPI0_0@PAGEOFF] +; CHECK-NEXT: Lloh5: +; CHECK-NEXT: ldr q1, [x10, lCPI0_1@PAGEOFF] +; CHECK-NEXT: Lloh6: +; CHECK-NEXT: ldr q2, [x11, lCPI0_2@PAGEOFF] +; CHECK-NEXT: Lloh7: +; CHECK-NEXT: ldr q3, [x12, lCPI0_3@PAGEOFF] ; CHECK-NEXT: LBB0_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr q0, [x0, x8] +; CHECK-NEXT: ldr q4, [x0, x8] ; CHECK-NEXT: add x8, x8, #16 ; CHECK-NEXT: cmp x8, #128 -; CHECK-NEXT: ushll2.8h v1, v0, #0 -; CHECK-NEXT: ushll.8h v0, v0, #0 -; CHECK-NEXT: ushll2.4s v2, v1, #0 -; CHECK-NEXT: ushll.4s v1, v1, #0 -; CHECK-NEXT: ushll2.4s v3, v0, #0 -; CHECK-NEXT: ushll.4s v0, v0, #0 -; CHECK-NEXT: stp q1, q2, [x1, #32] -; CHECK-NEXT: stp q0, q3, [x1], #64 +; CHECK-NEXT: tbl.16b v5, { v4 }, v3 +; CHECK-NEXT: tbl.16b v6, { v4 }, v2 +; CHECK-NEXT: tbl.16b v7, { v4 }, v1 +; CHECK-NEXT: tbl.16b v4, { v4 }, v0 +; CHECK-NEXT: stp q6, q5, [x1, #32] +; CHECK-NEXT: stp q4, q7, [x1], #64 ; CHECK-NEXT: b.ne LBB0_1 ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpLdr Lloh3, Lloh7 +; CHECK-NEXT: .loh AdrpLdr Lloh2, Lloh6 +; CHECK-NEXT: .loh AdrpLdr Lloh1, Lloh5 +; CHECK-NEXT: .loh AdrpLdr Lloh0, Lloh4 ; ; CHECK-BE-LABEL: zext_v16i8_to_v16i32_in_loop: ; CHECK-BE: // %bb.0: // %entry +; CHECK-BE-NEXT: adrp x8, .LCPI0_0 +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI0_0 +; CHECK-BE-NEXT: ld1 { v0.16b }, [x8] +; CHECK-BE-NEXT: adrp x8, .LCPI0_1 +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI0_1 +; CHECK-BE-NEXT: ld1 { v1.16b }, [x8] +; CHECK-BE-NEXT: adrp x8, .LCPI0_2 +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI0_2 +; CHECK-BE-NEXT: ld1 { v2.16b }, [x8] +; CHECK-BE-NEXT: adrp x8, .LCPI0_3 +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI0_3 +; CHECK-BE-NEXT: ld1 { v3.16b }, [x8] ; CHECK-BE-NEXT: mov x8, xzr ; CHECK-BE-NEXT: .LBB0_1: // %loop ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 @@ -34,20 +202,18 @@ ; 
CHECK-BE-NEXT: add x10, x1, #32 ; CHECK-BE-NEXT: add x8, x8, #16 ; CHECK-BE-NEXT: cmp x8, #128 -; CHECK-BE-NEXT: ld1 { v0.16b }, [x9] +; CHECK-BE-NEXT: ld1 { v4.16b }, [x9] ; CHECK-BE-NEXT: add x9, x1, #48 -; CHECK-BE-NEXT: ushll2 v1.8h, v0.16b, #0 -; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-BE-NEXT: ushll2 v2.4s, v1.8h, #0 -; CHECK-BE-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-BE-NEXT: st1 { v2.4s }, [x9] +; CHECK-BE-NEXT: tbl v5.16b, { v4.16b }, v3.16b +; CHECK-BE-NEXT: tbl v6.16b, { v4.16b }, v0.16b +; CHECK-BE-NEXT: tbl v7.16b, { v4.16b }, v2.16b +; CHECK-BE-NEXT: tbl v4.16b, { v4.16b }, v1.16b +; CHECK-BE-NEXT: st1 { v5.16b }, [x9] ; CHECK-BE-NEXT: add x9, x1, #16 -; CHECK-BE-NEXT: ushll v2.4s, v0.4h, #0 -; CHECK-BE-NEXT: st1 { v1.4s }, [x10] -; CHECK-BE-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-BE-NEXT: st1 { v2.4s }, [x1] +; CHECK-BE-NEXT: st1 { v6.16b }, [x1] ; CHECK-BE-NEXT: add x1, x1, #64 -; CHECK-BE-NEXT: st1 { v0.4s }, [x9] +; CHECK-BE-NEXT: st1 { v7.16b }, [x10] +; CHECK-BE-NEXT: st1 { v4.16b }, [x9] ; CHECK-BE-NEXT: b.ne .LBB0_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: ret @@ -393,39 +559,123 @@ ret void } +; CHECK-LABEL: lCPI6_0: +; CHECK-NEXT: .byte 0 ; 0x0 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 1 ; 0x1 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 2 ; 0x2 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 3 ; 0x3 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: lCPI6_1: +; CHECK-NEXT: .byte 4 ; 0x4 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 5 ; 0x5 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 6 ; 0x6 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 7 ; 0x7 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff + +; CHECK-BE: .LCPI6_0: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 0 // 0x0 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 1 // 0x1 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 2 // 0x2 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 3 // 0x3 +; CHECK-BE-NEXT: .LCPI6_1: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 4 // 0x4 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 5 // 0x5 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 6 // 0x6 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 7 // 0x7 + define void @zext_v8i8_to_v8i32_in_loop(i8* %src, i32* %dst) { ; CHECK-LABEL: zext_v8i8_to_v8i32_in_loop: ; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: Lloh8: +; CHECK-NEXT: 
adrp x9, lCPI6_0@PAGE +; CHECK-NEXT: Lloh9: +; CHECK-NEXT: adrp x10, lCPI6_1@PAGE ; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: Lloh10: +; CHECK-NEXT: ldr q0, [x9, lCPI6_0@PAGEOFF] +; CHECK-NEXT: Lloh11: +; CHECK-NEXT: ldr q1, [x10, lCPI6_1@PAGEOFF] ; CHECK-NEXT: LBB6_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr d0, [x0, x8] +; CHECK-NEXT: ldr d2, [x0, x8] ; CHECK-NEXT: add x8, x8, #16 ; CHECK-NEXT: cmp x8, #128 -; CHECK-NEXT: ushll.8h v0, v0, #0 -; CHECK-NEXT: ushll2.4s v1, v0, #0 -; CHECK-NEXT: ushll.4s v0, v0, #0 -; CHECK-NEXT: stp q0, q1, [x1], #64 +; CHECK-NEXT: tbl.16b v3, { v2 }, v1 +; CHECK-NEXT: tbl.16b v2, { v2 }, v0 +; CHECK-NEXT: stp q2, q3, [x1], #64 ; CHECK-NEXT: b.ne LBB6_1 ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpLdr Lloh9, Lloh11 +; CHECK-NEXT: .loh AdrpLdr Lloh8, Lloh10 ; ; CHECK-BE-LABEL: zext_v8i8_to_v8i32_in_loop: ; CHECK-BE: // %bb.0: // %entry +; CHECK-BE-NEXT: adrp x8, .LCPI6_0 +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI6_0 +; CHECK-BE-NEXT: ld1 { v0.16b }, [x8] +; CHECK-BE-NEXT: adrp x8, .LCPI6_1 +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI6_1 +; CHECK-BE-NEXT: ld1 { v1.16b }, [x8] ; CHECK-BE-NEXT: mov x8, xzr ; CHECK-BE-NEXT: .LBB6_1: // %loop ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-BE-NEXT: add x9, x0, x8 ; CHECK-BE-NEXT: add x8, x8, #16 ; CHECK-BE-NEXT: cmp x8, #128 -; CHECK-BE-NEXT: ld1 { v0.8b }, [x9] +; CHECK-BE-NEXT: ld1 { v2.8b }, [x9] ; CHECK-BE-NEXT: add x9, x1, #16 -; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-BE-NEXT: ushll v1.4s, v0.4h, #0 -; CHECK-BE-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-BE-NEXT: st1 { v1.4s }, [x1] +; CHECK-BE-NEXT: tbl v3.16b, { v2.16b }, v0.16b +; CHECK-BE-NEXT: tbl v2.16b, { v2.16b }, v1.16b +; CHECK-BE-NEXT: st1 { v3.16b }, [x1] ; CHECK-BE-NEXT: add x1, x1, #64 -; CHECK-BE-NEXT: st1 { v0.4s }, [x9] +; CHECK-BE-NEXT: st1 { v2.16b }, [x9] ; CHECK-BE-NEXT: b.ne .LBB6_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: ret diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/zext-to-shuffle.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/zext-to-shuffle.ll --- a/llvm/test/Transforms/CodeGenPrepare/AArch64/zext-to-shuffle.ll +++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/zext-to-shuffle.ll @@ -15,10 +15,11 @@ ; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr i8, i8* [[SRC:%.*]], i64 [[IV]] ; CHECK-NEXT: [[SRC_GEP_CAST:%.*]] = bitcast i8* [[SRC_GEP]] to <16 x i8>* ; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[SRC_GEP_CAST]], align 16 -; CHECK-NEXT: [[EXT:%.*]] = zext <16 x i8> [[LOAD]] to <16 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[LOAD]], <16 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <64 x i32> <i32 0, i32 16, i32 16, i32 16, i32 1, i32 16, i32 16, i32 16, i32 2, i32 16, i32 16, i32 16, i32 3, i32 16, i32 16, i32 16, i32 4, i32 16, i32 16, i32 16, i32 5, i32 16, i32 16, i32 16, i32 6, i32 16, i32 16, i32 16, i32 7, i32 16, i32 16, i32 16, i32 8, i32 16, i32 16, i32 16, i32 9, i32 16, i32 16, i32 16, i32 10, i32 16, i32 16, i32 16, i32 11, i32 16, i32 16, i32 16, i32 12, i32 16, i32 16, i32 16, i32 13, i32 16, i32 16, i32 16, i32 14, i32 16, i32 16, i32 16, i32 15, i32 16, i32 16, i32 16> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <64 x i8> [[TMP0]] to <16 x i32> ; CHECK-NEXT: [[DST_GEP:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[IV]] ; CHECK-NEXT: [[DST_GEP_CAST:%.*]] = 
bitcast i32* [[DST_GEP]] to <16 x i32>* -; CHECK-NEXT: store <16 x i32> [[EXT]], <16 x i32>* [[DST_GEP_CAST]], align 64 +; CHECK-NEXT: store <16 x i32> [[TMP1]], <16 x i32>* [[DST_GEP_CAST]], align 64 ; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 16 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 128 ; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]]
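
Note on the rewrite shape: the CodeGenPrepare test above checks the <16 x i8> form of the shuffle that createTblShuffleForZExt emits. As an illustration only (not part of the patch), the sketch below hand-applies the same little-endian rewrite to the <8 x i8> case handled by the new hook; the function name is made up and the 8-element mask is extrapolated from the 16-element test. Mask index 8 selects the zero byte inserted into the second shuffle operand, so after the bitcast each i32 lane holds the zero-extended source byte (the big-endian path instead sets mask element i*4+3 to i).

; Illustrative sketch, little-endian: zext <8 x i8> %x to <8 x i32> rewritten
; as a byte shuffle plus bitcast, later lowered to tbl.
define <8 x i32> @zext_v8i8_sketch(<8 x i8> %x) {
  ; Second operand provides the zero byte at lane 8 of the combined index space.
  %zeros = insertelement <8 x i8> poison, i8 0, i64 0
  ; Lane i of %x goes to byte i*4 of the <32 x i8> result; all other bytes are 0.
  %shuf = shufflevector <8 x i8> %x, <8 x i8> %zeros,
            <32 x i32> <i32 0, i32 8, i32 8, i32 8, i32 1, i32 8, i32 8, i32 8,
                        i32 2, i32 8, i32 8, i32 8, i32 3, i32 8, i32 8, i32 8,
                        i32 4, i32 8, i32 8, i32 8, i32 5, i32 8, i32 8, i32 8,
                        i32 6, i32 8, i32 8, i32 8, i32 7, i32 8, i32 8, i32 8>
  ; Reinterpreting the 32 bytes as 8 x i32 yields the zero-extended lanes.
  %res = bitcast <32 x i8> %shuf to <8 x i32>
  ret <8 x i32> %res
}

Because the constant index vectors feeding the resulting tbl instructions are loop-invariant, gating the transform on the conversion sitting in a loop header (and on not optimizing for size) keeps the extra constant-pool loads out of the hot path, which is what the updated CHECK lines in zext-to-tbl.ll show.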