diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -67,6 +67,7 @@ class FastISel; class FunctionLoweringInfo; class GlobalValue; +class Loop; class GISelKnownBits; class IntrinsicInst; class IRBuilderBase; @@ -2798,6 +2799,13 @@ return false; } + /// Try to optimize extending or truncating conversion instructions (like + /// zext, trunc, fptoui, uitofp) for the target. + virtual bool optimizeExtendOrTruncateConversion(Instruction *I, + Loop *L) const { + return false; + } + /// Return true if the target supplies and combines to a paired load + /// two loaded values of type LoadedType next to each other in memory. + /// RequiredAlignment gives the minimal alignment constraints that must be met diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -8055,6 +8055,10 @@ TargetLowering::TypeExpandInteger) { return SinkCast(CI); } else { + if (TLI->optimizeExtendOrTruncateConversion( + I, LI->getLoopFor(I->getParent()))) + return true; + bool MadeChange = optimizeExt(I); return MadeChange | optimizeExtUses(I); } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -606,6 +606,9 @@ bool shouldSinkOperands(Instruction *I, SmallVectorImpl<Use *> &Ops) const override; + bool optimizeExtendOrTruncateConversion(Instruction *I, + Loop *L) const override; + bool hasPairedLoad(EVT LoadedType, Align &RequiredAligment) const override; unsigned getMaxSupportedInterleaveFactor() const override { return 4; } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp 
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -29,6 +29,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/ObjCARCUtil.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -13183,6 +13184,60 @@ return false; } +static void createTblShuffleForZExt(ZExtInst *ZExt, bool IsLittleEndian) { + Value *Op = ZExt->getOperand(0); + auto *SrcTy = dyn_cast<FixedVectorType>(Op->getType()); + auto *DstTy = dyn_cast<FixedVectorType>(ZExt->getType()); + unsigned NumElts = SrcTy->getNumElements(); + IRBuilder<> Builder(ZExt); + SmallVector<int> Mask(4 * NumElts, NumElts); + // Create a mask that selects <0,0,0,Op[i]> for each lane of vector of i32 to + // replace the original ZExt. This can later be lowered to a set of tbl + // instructions. + for (unsigned i = 0; i < NumElts; i++) { + if (IsLittleEndian) + Mask[i * 4] = i; + else + Mask[i * 4 + 3] = i; + } + + auto *FirstEltZero = Builder.CreateInsertElement( + PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0)); + Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask); + Result = Builder.CreateBitCast(Result, DstTy); + ZExt->replaceAllUsesWith(Result); + ZExt->eraseFromParent(); +} + +bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(Instruction *I, + Loop *L) const { + // Try to optimize conversions using tbl. This requires materializing constant + // index vectors, which can increase code size and add loads. Skip the + // transform unless the conversion is in a loop block guaranteed to execute + // and we are not optimizing for size. 
+ Function *F = I->getParent()->getParent(); + if (!L || L->getHeader() != I->getParent() || F->hasMinSize() || + F->hasOptSize()) + return false; + + auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType()); + auto *DstTy = dyn_cast<FixedVectorType>(I->getType()); + if (!SrcTy || !DstTy) + return false; + + // Convert 'zext <(8|16) x i8> %x to <(8|16) x i32>' to a shuffle that can be + // lowered to either 2 or 4 tbl instructions to insert the original i8 + // elements into i32 lanes. + auto *ZExt = dyn_cast<ZExtInst>(I); + if (ZExt && (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) && + SrcTy->getElementType()->isIntegerTy(8) && + DstTy->getElementType()->isIntegerTy(32)) { + createTblShuffleForZExt(ZExt, Subtarget->isLittleEndian()); + return true; + } + return false; +} + bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType, Align &RequiredAligment) const { if (!LoadedType.isSimple() || diff --git a/llvm/test/CodeGen/AArch64/vselect-ext.ll b/llvm/test/CodeGen/AArch64/vselect-ext.ll --- a/llvm/test/CodeGen/AArch64/vselect-ext.ll +++ b/llvm/test/CodeGen/AArch64/vselect-ext.ll @@ -573,35 +573,53 @@ define void @extension_in_loop_v16i8_to_v16i32(i8* %src, i32* %dst) { ; CHECK-LABEL: extension_in_loop_v16i8_to_v16i32: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: movi.2d v0, #0xffffffffffffffff +; CHECK-NEXT: Lloh2: +; CHECK-NEXT: adrp x9, lCPI24_0@PAGE +; CHECK-NEXT: Lloh3: +; CHECK-NEXT: adrp x10, lCPI24_1@PAGE +; CHECK-NEXT: Lloh4: +; CHECK-NEXT: adrp x11, lCPI24_2@PAGE +; CHECK-NEXT: Lloh5: +; CHECK-NEXT: adrp x12, lCPI24_3@PAGE +; CHECK-NEXT: movi.2d v2, #0xffffffffffffffff ; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: Lloh6: +; CHECK-NEXT: ldr q0, [x9, lCPI24_0@PAGEOFF] +; CHECK-NEXT: Lloh7: +; CHECK-NEXT: ldr q1, [x10, lCPI24_1@PAGEOFF] +; CHECK-NEXT: Lloh8: +; CHECK-NEXT: ldr q3, [x11, lCPI24_2@PAGEOFF] +; CHECK-NEXT: Lloh9: +; CHECK-NEXT: ldr q4, [x12, lCPI24_3@PAGEOFF] ; CHECK-NEXT: LBB24_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr q1, [x0, 
x8] +; CHECK-NEXT: ldr q5, [x0, x8] ; CHECK-NEXT: add x8, x8, #16 ; CHECK-NEXT: cmp x8, #128 -; CHECK-NEXT: cmgt.16b v2, v1, v0 -; CHECK-NEXT: ushll2.8h v3, v1, #0 -; CHECK-NEXT: sshll2.8h v4, v2, #0 -; CHECK-NEXT: ushll2.4s v5, v3, #0 -; CHECK-NEXT: ushll.4s v3, v3, #0 -; CHECK-NEXT: sshll2.4s v6, v4, #0 -; CHECK-NEXT: sshll.4s v4, v4, #0 -; CHECK-NEXT: ushll.8h v1, v1, #0 -; CHECK-NEXT: sshll.8h v2, v2, #0 +; CHECK-NEXT: cmgt.16b v6, v5, v2 +; CHECK-NEXT: tbl.16b v7, { v5 }, v0 +; CHECK-NEXT: tbl.16b v16, { v5 }, v1 +; CHECK-NEXT: sshll2.8h v18, v6, #0 +; CHECK-NEXT: tbl.16b v17, { v5 }, v3 +; CHECK-NEXT: sshll2.4s v19, v18, #0 +; CHECK-NEXT: sshll.4s v18, v18, #0 +; CHECK-NEXT: tbl.16b v5, { v5 }, v4 +; CHECK-NEXT: sshll.8h v6, v6, #0 +; CHECK-NEXT: and.16b v7, v7, v19 +; CHECK-NEXT: and.16b v16, v16, v18 +; CHECK-NEXT: stp q16, q7, [x1, #32] +; CHECK-NEXT: sshll2.4s v7, v6, #0 +; CHECK-NEXT: sshll.4s v6, v6, #0 +; CHECK-NEXT: and.16b v7, v17, v7 ; CHECK-NEXT: and.16b v5, v5, v6 -; CHECK-NEXT: and.16b v3, v3, v4 -; CHECK-NEXT: stp q3, q5, [x1, #32] -; CHECK-NEXT: sshll2.4s v4, v2, #0 -; CHECK-NEXT: sshll.4s v2, v2, #0 -; CHECK-NEXT: ushll2.4s v3, v1, #0 -; CHECK-NEXT: ushll.4s v1, v1, #0 -; CHECK-NEXT: and.16b v3, v3, v4 -; CHECK-NEXT: and.16b v1, v1, v2 -; CHECK-NEXT: stp q1, q3, [x1], #64 +; CHECK-NEXT: stp q5, q7, [x1], #64 ; CHECK-NEXT: b.ne LBB24_1 ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpLdr Lloh5, Lloh9 +; CHECK-NEXT: .loh AdrpLdr Lloh4, Lloh8 +; CHECK-NEXT: .loh AdrpLdr Lloh3, Lloh7 +; CHECK-NEXT: .loh AdrpLdr Lloh2, Lloh6 entry: br label %loop @@ -627,23 +645,23 @@ define void @extension_in_loop_as_shuffle_v16i8_to_v16i32(i8* %src, i32* %dst) { ; CHECK-LABEL: extension_in_loop_as_shuffle_v16i8_to_v16i32: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: Lloh2: +; CHECK-NEXT: Lloh10: ; CHECK-NEXT: adrp x9, lCPI25_0@PAGE -; CHECK-NEXT: Lloh3: +; CHECK-NEXT: Lloh11: ; CHECK-NEXT: adrp x10, lCPI25_1@PAGE -; CHECK-NEXT: Lloh4: +; 
CHECK-NEXT: Lloh12: ; CHECK-NEXT: adrp x11, lCPI25_2@PAGE -; CHECK-NEXT: Lloh5: +; CHECK-NEXT: Lloh13: ; CHECK-NEXT: adrp x12, lCPI25_3@PAGE ; CHECK-NEXT: movi.2d v2, #0xffffffffffffffff ; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: Lloh6: +; CHECK-NEXT: Lloh14: ; CHECK-NEXT: ldr q0, [x9, lCPI25_0@PAGEOFF] -; CHECK-NEXT: Lloh7: +; CHECK-NEXT: Lloh15: ; CHECK-NEXT: ldr q1, [x10, lCPI25_1@PAGEOFF] -; CHECK-NEXT: Lloh8: +; CHECK-NEXT: Lloh16: ; CHECK-NEXT: ldr q3, [x11, lCPI25_2@PAGEOFF] -; CHECK-NEXT: Lloh9: +; CHECK-NEXT: Lloh17: ; CHECK-NEXT: ldr q4, [x12, lCPI25_3@PAGEOFF] ; CHECK-NEXT: LBB25_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -670,10 +688,10 @@ ; CHECK-NEXT: b.ne LBB25_1 ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: ret -; CHECK-NEXT: .loh AdrpLdr Lloh5, Lloh9 -; CHECK-NEXT: .loh AdrpLdr Lloh4, Lloh8 -; CHECK-NEXT: .loh AdrpLdr Lloh3, Lloh7 -; CHECK-NEXT: .loh AdrpLdr Lloh2, Lloh6 +; CHECK-NEXT: .loh AdrpLdr Lloh13, Lloh17 +; CHECK-NEXT: .loh AdrpLdr Lloh12, Lloh16 +; CHECK-NEXT: .loh AdrpLdr Lloh11, Lloh15 +; CHECK-NEXT: .loh AdrpLdr Lloh10, Lloh14 entry: br label %loop @@ -700,23 +718,23 @@ define void @shuffle_in_loop_is_no_extend_v16i8_to_v16i32(i8* %src, i32* %dst) { ; CHECK-LABEL: shuffle_in_loop_is_no_extend_v16i8_to_v16i32: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: Lloh10: +; CHECK-NEXT: Lloh18: ; CHECK-NEXT: adrp x9, lCPI26_0@PAGE -; CHECK-NEXT: Lloh11: +; CHECK-NEXT: Lloh19: ; CHECK-NEXT: adrp x10, lCPI26_1@PAGE -; CHECK-NEXT: Lloh12: +; CHECK-NEXT: Lloh20: ; CHECK-NEXT: adrp x11, lCPI26_2@PAGE -; CHECK-NEXT: Lloh13: +; CHECK-NEXT: Lloh21: ; CHECK-NEXT: adrp x12, lCPI26_3@PAGE ; CHECK-NEXT: movi.2d v2, #0xffffffffffffffff ; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: Lloh14: +; CHECK-NEXT: Lloh22: ; CHECK-NEXT: ldr q0, [x9, lCPI26_0@PAGEOFF] -; CHECK-NEXT: Lloh15: +; CHECK-NEXT: Lloh23: ; CHECK-NEXT: ldr q1, [x10, lCPI26_1@PAGEOFF] -; CHECK-NEXT: Lloh16: +; CHECK-NEXT: Lloh24: ; CHECK-NEXT: ldr q3, [x11, lCPI26_2@PAGEOFF] -; 
CHECK-NEXT: Lloh17: +; CHECK-NEXT: Lloh25: ; CHECK-NEXT: ldr q4, [x12, lCPI26_3@PAGEOFF] ; CHECK-NEXT: LBB26_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -743,10 +761,10 @@ ; CHECK-NEXT: b.ne LBB26_1 ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: ret -; CHECK-NEXT: .loh AdrpLdr Lloh13, Lloh17 -; CHECK-NEXT: .loh AdrpLdr Lloh12, Lloh16 -; CHECK-NEXT: .loh AdrpLdr Lloh11, Lloh15 -; CHECK-NEXT: .loh AdrpLdr Lloh10, Lloh14 +; CHECK-NEXT: .loh AdrpLdr Lloh21, Lloh25 +; CHECK-NEXT: .loh AdrpLdr Lloh20, Lloh24 +; CHECK-NEXT: .loh AdrpLdr Lloh19, Lloh23 +; CHECK-NEXT: .loh AdrpLdr Lloh18, Lloh22 entry: br label %loop diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll --- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll @@ -2,31 +2,199 @@ ; RUN: llc -mtriple=arm64-apple-ios -o - %s | FileCheck %s ; RUN: llc -mtriple=aarch64_be-unknown-linux -o - %s | FileCheck --check-prefix=CHECK-BE %s +; CHECK-LABEL: lCPI0_0: +; CHECK-NEXT: .byte 0 ; 0x0 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 1 ; 0x1 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 2 ; 0x2 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 3 ; 0x3 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT:lCPI0_1: +; CHECK-NEXT: .byte 4 ; 0x4 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 5 ; 0x5 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 6 ; 0x6 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 7 ; 0x7 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 
255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT:lCPI0_2: +; CHECK-NEXT: .byte 8 ; 0x8 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 9 ; 0x9 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 10 ; 0xa +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 11 ; 0xb +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT:lCPI0_3: +; CHECK-NEXT: .byte 12 ; 0xc +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 13 ; 0xd +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 14 ; 0xe +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 15 ; 0xf +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff + +; CHECK-BE: .LCPI0_0: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 0 // 0x0 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 1 // 0x1 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 2 // 0x2 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 3 // 0x3 +; CHECK-BE-NEXT: .LCPI0_1: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 4 // 0x4 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 5 // 0x5 +; 
CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 6 // 0x6 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 7 // 0x7 +; CHECK-BE-NEXT: .LCPI0_2: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 8 // 0x8 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 9 // 0x9 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 10 // 0xa +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 11 // 0xb +; CHECK-BE-NEXT: .LCPI0_3: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 12 // 0xc +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 13 // 0xd +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 14 // 0xe +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 15 // 0xf + ; It's profitable to convert the zext to a shuffle, which in turn will be ; lowered to 4 tbl instructions. The masks are materialized outside the loop. 
define void @zext_v16i8_to_v16i32_in_loop(i8* %src, i32* %dst) { ; CHECK-LABEL: zext_v16i8_to_v16i32_in_loop: ; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: Lloh0: +; CHECK-NEXT: adrp x9, lCPI0_0@PAGE +; CHECK-NEXT: Lloh1: +; CHECK-NEXT: adrp x10, lCPI0_1@PAGE +; CHECK-NEXT: Lloh2: +; CHECK-NEXT: adrp x11, lCPI0_2@PAGE +; CHECK-NEXT: Lloh3: +; CHECK-NEXT: adrp x12, lCPI0_3@PAGE ; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: Lloh4: +; CHECK-NEXT: ldr q0, [x9, lCPI0_0@PAGEOFF] +; CHECK-NEXT: Lloh5: +; CHECK-NEXT: ldr q1, [x10, lCPI0_1@PAGEOFF] +; CHECK-NEXT: Lloh6: +; CHECK-NEXT: ldr q2, [x11, lCPI0_2@PAGEOFF] +; CHECK-NEXT: Lloh7: +; CHECK-NEXT: ldr q3, [x12, lCPI0_3@PAGEOFF] ; CHECK-NEXT: LBB0_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr q0, [x0, x8] +; CHECK-NEXT: ldr q4, [x0, x8] ; CHECK-NEXT: add x8, x8, #16 ; CHECK-NEXT: cmp x8, #128 -; CHECK-NEXT: ushll2.8h v1, v0, #0 -; CHECK-NEXT: ushll.8h v0, v0, #0 -; CHECK-NEXT: ushll2.4s v2, v1, #0 -; CHECK-NEXT: ushll.4s v1, v1, #0 -; CHECK-NEXT: ushll2.4s v3, v0, #0 -; CHECK-NEXT: ushll.4s v0, v0, #0 -; CHECK-NEXT: stp q1, q2, [x1, #32] -; CHECK-NEXT: stp q0, q3, [x1], #64 +; CHECK-NEXT: tbl.16b v5, { v4 }, v3 +; CHECK-NEXT: tbl.16b v6, { v4 }, v2 +; CHECK-NEXT: tbl.16b v7, { v4 }, v1 +; CHECK-NEXT: tbl.16b v4, { v4 }, v0 +; CHECK-NEXT: stp q6, q5, [x1, #32] +; CHECK-NEXT: stp q4, q7, [x1], #64 ; CHECK-NEXT: b.ne LBB0_1 ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpLdr Lloh3, Lloh7 +; CHECK-NEXT: .loh AdrpLdr Lloh2, Lloh6 +; CHECK-NEXT: .loh AdrpLdr Lloh1, Lloh5 +; CHECK-NEXT: .loh AdrpLdr Lloh0, Lloh4 ; ; CHECK-BE-LABEL: zext_v16i8_to_v16i32_in_loop: ; CHECK-BE: // %bb.0: // %entry +; CHECK-BE-NEXT: adrp x8, .LCPI0_0 +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI0_0 +; CHECK-BE-NEXT: ld1 { v0.16b }, [x8] +; CHECK-BE-NEXT: adrp x8, .LCPI0_1 +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI0_1 +; CHECK-BE-NEXT: ld1 { v1.16b }, [x8] +; CHECK-BE-NEXT: adrp x8, .LCPI0_2 +; 
CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI0_2 +; CHECK-BE-NEXT: ld1 { v2.16b }, [x8] +; CHECK-BE-NEXT: adrp x8, .LCPI0_3 +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI0_3 +; CHECK-BE-NEXT: ld1 { v3.16b }, [x8] ; CHECK-BE-NEXT: mov x8, xzr ; CHECK-BE-NEXT: .LBB0_1: // %loop ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 @@ -34,20 +202,18 @@ ; CHECK-BE-NEXT: add x10, x1, #32 ; CHECK-BE-NEXT: add x8, x8, #16 ; CHECK-BE-NEXT: cmp x8, #128 -; CHECK-BE-NEXT: ld1 { v0.16b }, [x9] +; CHECK-BE-NEXT: ld1 { v4.16b }, [x9] ; CHECK-BE-NEXT: add x9, x1, #48 -; CHECK-BE-NEXT: ushll2 v1.8h, v0.16b, #0 -; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-BE-NEXT: ushll2 v2.4s, v1.8h, #0 -; CHECK-BE-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-BE-NEXT: st1 { v2.4s }, [x9] +; CHECK-BE-NEXT: tbl v5.16b, { v4.16b }, v3.16b +; CHECK-BE-NEXT: tbl v6.16b, { v4.16b }, v0.16b +; CHECK-BE-NEXT: tbl v7.16b, { v4.16b }, v2.16b +; CHECK-BE-NEXT: tbl v4.16b, { v4.16b }, v1.16b +; CHECK-BE-NEXT: st1 { v5.16b }, [x9] ; CHECK-BE-NEXT: add x9, x1, #16 -; CHECK-BE-NEXT: ushll v2.4s, v0.4h, #0 -; CHECK-BE-NEXT: st1 { v1.4s }, [x10] -; CHECK-BE-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-BE-NEXT: st1 { v2.4s }, [x1] +; CHECK-BE-NEXT: st1 { v6.16b }, [x1] ; CHECK-BE-NEXT: add x1, x1, #64 -; CHECK-BE-NEXT: st1 { v0.4s }, [x9] +; CHECK-BE-NEXT: st1 { v7.16b }, [x10] +; CHECK-BE-NEXT: st1 { v4.16b }, [x9] ; CHECK-BE-NEXT: b.ne .LBB0_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: ret @@ -393,39 +559,123 @@ ret void } +; CHECK-LABEL: lCPI6_0: +; CHECK-NEXT: .byte 0 ; 0x0 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 1 ; 0x1 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 2 ; 0x2 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 3 ; 0x3 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: 
.byte 255 ; 0xff +; CHECK-NEXT: lCPI6_1: +; CHECK-NEXT: .byte 4 ; 0x4 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 5 ; 0x5 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 6 ; 0x6 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 7 ; 0x7 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff + +; CHECK-BE: .LCPI6_0: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 0 // 0x0 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 1 // 0x1 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 2 // 0x2 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 3 // 0x3 +; CHECK-BE-NEXT: .LCPI6_1: +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 4 // 0x4 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 5 // 0x5 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 6 // 0x6 +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 7 // 0x7 + define void @zext_v8i8_to_v8i32_in_loop(i8* %src, i32* %dst) { ; CHECK-LABEL: zext_v8i8_to_v8i32_in_loop: ; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: Lloh8: +; CHECK-NEXT: adrp x9, lCPI6_0@PAGE +; CHECK-NEXT: Lloh9: +; CHECK-NEXT: adrp x10, lCPI6_1@PAGE ; CHECK-NEXT: mov x8, 
xzr +; CHECK-NEXT: Lloh10: +; CHECK-NEXT: ldr q0, [x9, lCPI6_0@PAGEOFF] +; CHECK-NEXT: Lloh11: +; CHECK-NEXT: ldr q1, [x10, lCPI6_1@PAGEOFF] ; CHECK-NEXT: LBB6_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr d0, [x0, x8] +; CHECK-NEXT: ldr d2, [x0, x8] ; CHECK-NEXT: add x8, x8, #16 ; CHECK-NEXT: cmp x8, #128 -; CHECK-NEXT: ushll.8h v0, v0, #0 -; CHECK-NEXT: ushll2.4s v1, v0, #0 -; CHECK-NEXT: ushll.4s v0, v0, #0 -; CHECK-NEXT: stp q0, q1, [x1], #64 +; CHECK-NEXT: tbl.16b v3, { v2 }, v1 +; CHECK-NEXT: tbl.16b v2, { v2 }, v0 +; CHECK-NEXT: stp q2, q3, [x1], #64 ; CHECK-NEXT: b.ne LBB6_1 ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpLdr Lloh9, Lloh11 +; CHECK-NEXT: .loh AdrpLdr Lloh8, Lloh10 ; ; CHECK-BE-LABEL: zext_v8i8_to_v8i32_in_loop: ; CHECK-BE: // %bb.0: // %entry +; CHECK-BE-NEXT: adrp x8, .LCPI6_0 +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI6_0 +; CHECK-BE-NEXT: ld1 { v0.16b }, [x8] +; CHECK-BE-NEXT: adrp x8, .LCPI6_1 +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI6_1 +; CHECK-BE-NEXT: ld1 { v1.16b }, [x8] ; CHECK-BE-NEXT: mov x8, xzr ; CHECK-BE-NEXT: .LBB6_1: // %loop ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-BE-NEXT: add x9, x0, x8 ; CHECK-BE-NEXT: add x8, x8, #16 ; CHECK-BE-NEXT: cmp x8, #128 -; CHECK-BE-NEXT: ld1 { v0.8b }, [x9] +; CHECK-BE-NEXT: ld1 { v2.8b }, [x9] ; CHECK-BE-NEXT: add x9, x1, #16 -; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-BE-NEXT: ushll v1.4s, v0.4h, #0 -; CHECK-BE-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-BE-NEXT: st1 { v1.4s }, [x1] +; CHECK-BE-NEXT: tbl v3.16b, { v2.16b }, v0.16b +; CHECK-BE-NEXT: tbl v2.16b, { v2.16b }, v1.16b +; CHECK-BE-NEXT: st1 { v3.16b }, [x1] ; CHECK-BE-NEXT: add x1, x1, #64 -; CHECK-BE-NEXT: st1 { v0.4s }, [x9] +; CHECK-BE-NEXT: st1 { v2.16b }, [x9] ; CHECK-BE-NEXT: b.ne .LBB6_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: ret diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/zext-to-shuffle.ll 
b/llvm/test/Transforms/CodeGenPrepare/AArch64/zext-to-shuffle.ll --- a/llvm/test/Transforms/CodeGenPrepare/AArch64/zext-to-shuffle.ll +++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/zext-to-shuffle.ll @@ -15,10 +15,11 @@ ; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr i8, i8* [[SRC:%.*]], i64 [[IV]] ; CHECK-NEXT: [[SRC_GEP_CAST:%.*]] = bitcast i8* [[SRC_GEP]] to <16 x i8>* ; CHECK-NEXT: [[LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[SRC_GEP_CAST]], align 16 -; CHECK-NEXT: [[EXT:%.*]] = zext <16 x i8> [[LOAD]] to <16 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[LOAD]], <16 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <64 x i32> <i32 0, i32 16, i32 16, i32 16, i32 1, i32 16, i32 16, i32 16, i32 2, i32 16, i32 16, i32 16, i32 3, i32 16, i32 16, i32 16, i32 4, i32 16, i32 16, i32 16, i32 5, i32 16, i32 16, i32 16, i32 6, i32 16, i32 16, i32 16, i32 7, i32 16, i32 16, i32 16, i32 8, i32 16, i32 16, i32 16, i32 9, i32 16, i32 16, i32 16, i32 10, i32 16, i32 16, i32 16, i32 11, i32 16, i32 16, i32 16, i32 12, i32 16, i32 16, i32 16, i32 13, i32 16, i32 16, i32 16, i32 14, i32 16, i32 16, i32 16, i32 15, i32 16, i32 16, i32 16> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <64 x i8> [[TMP0]] to <16 x i32> ; CHECK-NEXT: [[DST_GEP:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[IV]] ; CHECK-NEXT: [[DST_GEP_CAST:%.*]] = bitcast i32* [[DST_GEP]] to <16 x i32>* -; CHECK-NEXT: store <16 x i32> [[EXT]], <16 x i32>* [[DST_GEP_CAST]], align 64 +; CHECK-NEXT: store <16 x i32> [[TMP1]], <16 x i32>* [[DST_GEP_CAST]], align 64 ; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 16 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 128 ; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]]