diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -13400,39 +13400,94 @@ static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) { IRBuilder<> Builder(TI); - SmallVector Parts; + SmallVector Parts, Parts2; + int NumElements = cast(TI->getType())->getNumElements(); + auto *SrcTy = cast(TI->getOperand(0)->getType()); + auto *DstTy = cast(TI->getType()); + assert(SrcTy->getElementType()->isIntegerTy() && + "Non-integer type source vector element is not supported"); + assert(DstTy->getElementType()->isIntegerTy(8) && + "Unsupported destination vector element type"); + unsigned SrcElemTySz = + cast(SrcTy->getElementType())->getBitWidth(); + unsigned TruncFactor = + SrcElemTySz / + cast(DstTy->getElementType())->getBitWidth(); + assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) && + "Unsupported source vector element type size"); Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16); - Parts.push_back(Builder.CreateBitCast( - Builder.CreateShuffleVector(TI->getOperand(0), {0, 1, 2, 3}), VecTy)); - Parts.push_back(Builder.CreateBitCast( - Builder.CreateShuffleVector(TI->getOperand(0), {4, 5, 6, 7}), VecTy)); - - Intrinsic::ID TblID = Intrinsic::aarch64_neon_tbl2; - unsigned NumElements = cast(TI->getType())->getNumElements(); - if (NumElements == 16) { - Parts.push_back(Builder.CreateBitCast( - Builder.CreateShuffleVector(TI->getOperand(0), {8, 9, 10, 11}), VecTy)); - Parts.push_back(Builder.CreateBitCast( - Builder.CreateShuffleVector(TI->getOperand(0), {12, 13, 14, 15}), - VecTy)); - TblID = Intrinsic::aarch64_neon_tbl4; - } + SmallVector MaskConst; - for (unsigned Idx = 0; Idx < NumElements * 4; Idx += 4) - MaskConst.push_back( - ConstantInt::get(Builder.getInt8Ty(), IsLittleEndian ? 
Idx : Idx + 3)); + for (int Itr = 0; Itr < 16; Itr++) { + if (Itr < NumElements) + MaskConst.push_back(ConstantInt::get( + Builder.getInt8Ty(), IsLittleEndian + ? Itr * TruncFactor + : Itr * TruncFactor + (TruncFactor - 1))); + else + MaskConst.push_back(ConstantInt::get(Builder.getInt8Ty(), 255)); + } + + int MaxTblSz = 128 * 4; + int MaxSrcSz = SrcElemTySz * NumElements; + int ElemsPerTbl = (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz); + assert(ElemsPerTbl <= 16 && "Maximum elements selected using TBL instruction cannot exceed 16!"); + + int ShuffleCount = 128/SrcElemTySz; + SmallVector ShuffleLanes; + for (int i = 0; i < ShuffleCount; ++i) + ShuffleLanes.push_back(i); + + SmallVector Results; + while (ShuffleLanes.back() < NumElements) { + Parts.push_back(Builder.CreateBitCast(Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy)); + + if (Parts.size() >= 4) { + auto *F = Intrinsic::getDeclaration(TI->getModule(), Intrinsic::aarch64_neon_tbl4, VecTy); + Parts.push_back(ConstantVector::get(MaskConst)); + Results.push_back(Builder.CreateCall(F, Parts)); + Parts.clear(); + } - for (unsigned Idx = NumElements * 4; Idx < 64; Idx += 4) - MaskConst.push_back(ConstantInt::get(Builder.getInt8Ty(), 255)); + for (int i = 0; i < ShuffleCount; ++i) + ShuffleLanes[i] += ShuffleCount; + } - Parts.push_back(ConstantVector::get(MaskConst)); - auto *F = - Intrinsic::getDeclaration(TI->getModule(), TblID, Parts[0]->getType()); - Value *Res = Builder.CreateCall(F, Parts); + assert((Parts.empty() || Results.empty()) && "Lowering trunc for vectors requiring different TBL instructions is not supported!"); + if(!Parts.empty()) { + Intrinsic::ID TblID; + switch (Parts.size()) { + case 1: TblID = Intrinsic::aarch64_neon_tbl1; break; + case 2: TblID = Intrinsic::aarch64_neon_tbl2; break; + case 3: TblID = Intrinsic::aarch64_neon_tbl3; break; + } + auto *F = Intrinsic::getDeclaration(TI->getModule(), TblID, VecTy); + 
Parts.push_back(ConstantVector::get(MaskConst)); + Results.push_back(Builder.CreateCall(F, Parts)); + } + + assert(Results.size() <= 2 && "Trunc lowering does not support generation of more than 2 tbl instructions!"); + Value *FinalResult = Results[0]; + if(Results.size() == 1) { + if (ElemsPerTbl < 16) { + std::vector FinalMask(ElemsPerTbl); + std::iota(FinalMask.begin(), FinalMask.end(), 0); + FinalResult = Builder.CreateShuffleVector(Results[0], FinalMask); + } + } + else { + std::vector FinalMask(ElemsPerTbl * Results.size()); + if(ElemsPerTbl < 16) { + std::iota(FinalMask.begin(), FinalMask.begin() + ElemsPerTbl, 0); + std::iota(FinalMask.begin() + ElemsPerTbl, FinalMask.end(), 16); + } + else { + std::iota(FinalMask.begin(), FinalMask.end(), 0); + } + FinalResult = Builder.CreateShuffleVector(Results[0], Results[1], FinalMask); + } - if (NumElements == 8) - Res = Builder.CreateShuffleVector(Res, {0, 1, 2, 3, 4, 5, 6, 7}); - TI->replaceAllUsesWith(Res); + TI->replaceAllUsesWith(FinalResult); TI->eraseFromParent(); } @@ -13495,13 +13550,15 @@ return true; } - // Convert 'trunc <(8|16) x i32> %x to <(8|16) x i8>' to a single tbl.4 - // instruction selecting the lowest 8 bits per lane of the input interpreted - // as 2 or 4 <4 x i32> vectors. + // Convert 'trunc <(8|16) x (i16|i32|i64)> %x to <(8|16) x i8>' using tbl + // instructions selecting the lowest 8 bits per lane of the input, which is + // interpreted as one, two, four or eight 128-bit vectors. 
auto *TI = dyn_cast(I); - if (TI && (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) && - SrcTy->getElementType()->isIntegerTy(32) && - DstTy->getElementType()->isIntegerTy(8)) { + if (TI && DstTy->getElementType()->isIntegerTy(8) && + ((SrcTy->getElementType()->isIntegerTy(16) || + SrcTy->getElementType()->isIntegerTy(32) || + SrcTy->getElementType()->isIntegerTy(64)) && + (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) { createTblForTrunc(TI, Subtarget->isLittleEndian()); return true; } diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll --- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll @@ -615,20 +615,22 @@ define void @sink_v8z16_0(i32 *%p, i32 *%d, i64 %n, <16 x i8> %a) { ; CHECK-LABEL: sink_v8z16_0: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: dup v0.8b, v0.b[0] -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: .LBB8_1: // %loop -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr d1, [x0] -; CHECK-NEXT: add x8, x8, #8 -; CHECK-NEXT: subs x2, x2, #8 -; CHECK-NEXT: umull v1.8h, v1.8b, v0.8b -; CHECK-NEXT: cmlt v1.8h, v1.8h, #0 -; CHECK-NEXT: xtn v1.8b, v1.8h -; CHECK-NEXT: str d1, [x0], #32 -; CHECK-NEXT: b.ne .LBB8_1 -; CHECK-NEXT: // %bb.2: // %exit -; CHECK-NEXT: ret +; CHECK-NEXT: adrp x9, .LCPI8_0 +; CHECK-NEXT: dup v0.8b, v0.b[0] +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI8_0] +; CHECK-NEXT: .LBB8_1: // %loop +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr d2, [x0] +; CHECK-NEXT: add x8, x8, #8 +; CHECK-NEXT: subs x2, x2, #8 +; CHECK-NEXT: umull v2.8h, v2.8b, v0.8b +; CHECK-NEXT: cmlt v2.8h, v2.8h, #0 +; CHECK-NEXT: tbl v2.16b, { v2.16b }, v1.16b +; CHECK-NEXT: str d2, [x0], #32 +; CHECK-NEXT: b.ne .LBB8_1 +; CHECK-NEXT: // %bb.2: // %exit +; CHECK-NEXT: ret entry: %ext = zext <16 x i8> %a to <16 x i16> %broadcast.splat = 
shufflevector <16 x i16> %ext, <16 x i16> poison, <8 x i32> @@ -657,23 +659,25 @@ define void @sink_v16s16_8(i32 *%p, i32 *%d, i64 %n, <16 x i8> %a) { ; CHECK-LABEL: sink_v16s16_8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: dup v1.8b, v0.b[10] -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: dup v0.16b, v0.b[10] -; CHECK-NEXT: .LBB9_1: // %loop -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr q2, [x0] -; CHECK-NEXT: add x8, x8, #8 -; CHECK-NEXT: subs x2, x2, #8 -; CHECK-NEXT: smull2 v3.8h, v2.16b, v0.16b -; CHECK-NEXT: smull v2.8h, v2.8b, v1.8b -; CHECK-NEXT: cmlt v3.8h, v3.8h, #0 -; CHECK-NEXT: cmlt v2.8h, v2.8h, #0 -; CHECK-NEXT: uzp1 v2.16b, v2.16b, v3.16b -; CHECK-NEXT: str q2, [x0], #32 -; CHECK-NEXT: b.ne .LBB9_1 -; CHECK-NEXT: // %bb.2: // %exit -; CHECK-NEXT: ret +; CHECK-NEXT: adrp x9, .LCPI9_0 +; CHECK-NEXT: dup v1.8b, v0.b[10] +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: dup v0.16b, v0.b[10] +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI9_0] +; CHECK-NEXT: .LBB9_1: // %loop +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr q3, [x0] +; CHECK-NEXT: add x8, x8, #8 +; CHECK-NEXT: subs x2, x2, #8 +; CHECK-NEXT: smull2 v4.8h, v3.16b, v0.16b +; CHECK-NEXT: smull v3.8h, v3.8b, v1.8b +; CHECK-NEXT: cmlt v5.8h, v4.8h, #0 +; CHECK-NEXT: cmlt v4.8h, v3.8h, #0 +; CHECK-NEXT: tbl v3.16b, { v4.16b, v5.16b }, v2.16b +; CHECK-NEXT: str q3, [x0], #32 +; CHECK-NEXT: b.ne .LBB9_1 +; CHECK-NEXT: // %bb.2: // %exit +; CHECK-NEXT: ret entry: %ext = sext <16 x i8> %a to <16 x i16> %broadcast.splat = shufflevector <16 x i16> %ext, <16 x i16> poison, <16 x i32> diff --git a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll --- a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll @@ -235,25 +235,60 @@ ret void } +; CHECK-LABEL: lCPI3_0: +; CHECK-NEXT: .byte 0 ; 0x0 +; CHECK-NEXT: .byte 8 ; 0x8 +; CHECK-NEXT: .byte 16 ; 0x10 +; CHECK-NEXT: .byte 24 ; 0x18 +; CHECK-NEXT: .byte 32 ; 
0x20 +; CHECK-NEXT: .byte 40 ; 0x28 +; CHECK-NEXT: .byte 48 ; 0x30 +; CHECK-NEXT: .byte 56 ; 0x38 +; CHECK-NEXT: .byte 64 ; 0x40 +; CHECK-NEXT: .byte 72 ; 0x48 +; CHECK-NEXT: .byte 80 ; 0x50 +; CHECK-NEXT: .byte 88 ; 0x58 +; CHECK-NEXT: .byte 96 ; 0x60 +; CHECK-NEXT: .byte 104 ; 0x68 +; CHECK-NEXT: .byte 112 ; 0x70 +; CHECK-NEXT: .byte 120 ; 0x78 + +; CHECK-BE-LABEL: .LCPI3_0: +; CHECK-BE-NEXT: .byte 7 // 0x7 +; CHECK-BE-NEXT: .byte 15 // 0xf +; CHECK-BE-NEXT: .byte 23 // 0x17 +; CHECK-BE-NEXT: .byte 31 // 0x1f +; CHECK-BE-NEXT: .byte 39 // 0x27 +; CHECK-BE-NEXT: .byte 47 // 0x2f +; CHECK-BE-NEXT: .byte 55 // 0x37 +; CHECK-BE-NEXT: .byte 63 // 0x3f +; CHECK-BE-NEXT: .byte 71 // 0x47 +; CHECK-BE-NEXT: .byte 79 // 0x4f +; CHECK-BE-NEXT: .byte 87 // 0x57 +; CHECK-BE-NEXT: .byte 95 // 0x5f +; CHECK-BE-NEXT: .byte 103 // 0x67 +; CHECK-BE-NEXT: .byte 111 // 0x6f +; CHECK-BE-NEXT: .byte 119 // 0x77 +; CHECK-BE-NEXT: .byte 127 // 0x7f define void @trunc_v16i64_to_v16i8_in_loop(ptr %A, ptr %dst) { ; CHECK-LABEL: trunc_v16i64_to_v16i8_in_loop: ; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: Lloh4: +; CHECK-NEXT: adrp x9, lCPI3_0@PAGE ; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: Lloh5: +; CHECK-NEXT: ldr q0, [x9, lCPI3_0@PAGEOFF] ; CHECK-NEXT: LBB3_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: add x9, x0, x8, lsl #7 -; CHECK-NEXT: ldp q3, q2, [x9, #96] -; CHECK-NEXT: ldp q1, q0, [x9, #32] -; CHECK-NEXT: uzp1.4s v2, v3, v2 -; CHECK-NEXT: ldp q5, q4, [x9, #64] -; CHECK-NEXT: uzp1.4s v0, v1, v0 -; CHECK-NEXT: ldp q3, q6, [x9] -; CHECK-NEXT: uzp1.4s v4, v5, v4 -; CHECK-NEXT: uzp1.8h v2, v4, v2 -; CHECK-NEXT: uzp1.4s v1, v3, v6 -; CHECK-NEXT: uzp1.8h v0, v1, v0 -; CHECK-NEXT: uzp1.16b v0, v0, v2 -; CHECK-NEXT: str q0, [x1, x8, lsl #4] +; CHECK-NEXT: ldp q1, q2, [x9] +; CHECK-NEXT: ldp q3, q4, [x9, #32] +; CHECK-NEXT: ldp q16, q17, [x9, #64] +; CHECK-NEXT: tbl.16b v1, { v1, v2, v3, v4 }, v0 +; CHECK-NEXT: ldp q18, q19, [x9, #96] +; CHECK-NEXT: tbl.16b v2, { v16, 
v17, v18, v19 }, v0 +; CHECK-NEXT: mov.d v1[1], v2[0] +; CHECK-NEXT: str q1, [x1, x8, lsl #4] ; CHECK-NEXT: add x8, x8, #1 ; CHECK-NEXT: cmp x8, #1000 ; CHECK-NEXT: b.eq LBB3_1 @@ -262,39 +297,38 @@ ; CHECK-BE-LABEL: trunc_v16i64_to_v16i8_in_loop: ; CHECK-BE: // %bb.0: // %entry -; CHECK-BE-NEXT: mov x8, xzr +; CHECK-BE-NEXT: adrp x8, .LCPI3_0 +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI3_0 +; CHECK-BE-NEXT: ld1 { v0.16b }, [x8] +; CHECK-BE-NEXT: mov x8, xzr ; CHECK-BE-NEXT: .LBB3_1: // %loop -; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-BE-NEXT: add x9, x0, x8, lsl #7 -; CHECK-BE-NEXT: add x10, x9, #48 -; CHECK-BE-NEXT: add x11, x9, #32 -; CHECK-BE-NEXT: ld1 { v5.2d }, [x9] -; CHECK-BE-NEXT: ld1 { v0.2d }, [x10] -; CHECK-BE-NEXT: add x10, x9, #80 -; CHECK-BE-NEXT: ld1 { v1.2d }, [x11] -; CHECK-BE-NEXT: add x11, x9, #112 -; CHECK-BE-NEXT: ld1 { v2.2d }, [x10] -; CHECK-BE-NEXT: add x10, x9, #96 -; CHECK-BE-NEXT: ld1 { v3.2d }, [x11] -; CHECK-BE-NEXT: uzp1 v0.4s, v1.4s, v0.4s -; CHECK-BE-NEXT: ld1 { v4.2d }, [x10] -; CHECK-BE-NEXT: add x10, x9, #64 -; CHECK-BE-NEXT: add x9, x9, #16 -; CHECK-BE-NEXT: ld1 { v6.2d }, [x10] -; CHECK-BE-NEXT: ld1 { v7.2d }, [x9] -; CHECK-BE-NEXT: add x9, x1, x8, lsl #4 -; CHECK-BE-NEXT: uzp1 v3.4s, v4.4s, v3.4s -; CHECK-BE-NEXT: add x8, x8, #1 -; CHECK-BE-NEXT: cmp x8, #1000 -; CHECK-BE-NEXT: uzp1 v2.4s, v6.4s, v2.4s -; CHECK-BE-NEXT: uzp1 v1.4s, v5.4s, v7.4s -; CHECK-BE-NEXT: uzp1 v2.8h, v2.8h, v3.8h -; CHECK-BE-NEXT: uzp1 v0.8h, v1.8h, v0.8h -; CHECK-BE-NEXT: uzp1 v0.16b, v0.16b, v2.16b -; CHECK-BE-NEXT: st1 { v0.16b }, [x9] -; CHECK-BE-NEXT: b.eq .LBB3_1 -; CHECK-BE-NEXT: // %bb.2: // %exit -; CHECK-BE-NEXT: ret +; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-BE-NEXT: add x9, x0, x8, lsl #7 +; CHECK-BE-NEXT: add x10, x9, #16 +; CHECK-BE-NEXT: add x11, x9, #32 +; CHECK-BE-NEXT: ld1 { v1.16b }, [x9] +; CHECK-BE-NEXT: ld1 { v2.16b }, [x10] +; CHECK-BE-NEXT: add x10, x9, #48 +; CHECK-BE-NEXT: ld1 { v3.16b }, 
[x11] +; CHECK-BE-NEXT: add x11, x9, #64 +; CHECK-BE-NEXT: ld1 { v4.16b }, [x10] +; CHECK-BE-NEXT: add x10, x9, #80 +; CHECK-BE-NEXT: ld1 { v16.16b }, [x11] +; CHECK-BE-NEXT: add x11, x9, #96 +; CHECK-BE-NEXT: add x9, x9, #112 +; CHECK-BE-NEXT: ld1 { v17.16b }, [x10] +; CHECK-BE-NEXT: tbl v1.16b, { v1.16b, v2.16b, v3.16b, v4.16b }, v0.16b +; CHECK-BE-NEXT: ld1 { v18.16b }, [x11] +; CHECK-BE-NEXT: ld1 { v19.16b }, [x9] +; CHECK-BE-NEXT: add x9, x1, x8, lsl #4 +; CHECK-BE-NEXT: add x8, x8, #1 +; CHECK-BE-NEXT: cmp x8, #1000 +; CHECK-BE-NEXT: tbl v2.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v0.16b +; CHECK-BE-NEXT: mov v1.d[1], v2.d[0] +; CHECK-BE-NEXT: st1 { v1.16b }, [x9] +; CHECK-BE-NEXT: b.eq .LBB3_1 +; CHECK-BE-NEXT: // %bb.2: // %exit +; CHECK-BE-NEXT: ret entry: br label %loop @@ -314,50 +348,87 @@ ret void } +; CHECK-LABEL: lCPI4_0: +; CHECK-NEXT: .byte 0 ; 0x0 +; CHECK-NEXT: .byte 8 ; 0x8 +; CHECK-NEXT: .byte 16 ; 0x10 +; CHECK-NEXT: .byte 24 ; 0x18 +; CHECK-NEXT: .byte 32 ; 0x20 +; CHECK-NEXT: .byte 40 ; 0x28 +; CHECK-NEXT: .byte 48 ; 0x30 +; CHECK-NEXT: .byte 56 ; 0x38 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff + +; CHECK-BE-LABEL: .LCPI4_0: +; CHECK-BE-NEXT: .byte 7 // 0x7 +; CHECK-BE-NEXT: .byte 15 // 0xf +; CHECK-BE-NEXT: .byte 23 // 0x17 +; CHECK-BE-NEXT: .byte 31 // 0x1f +; CHECK-BE-NEXT: .byte 39 // 0x27 +; CHECK-BE-NEXT: .byte 47 // 0x2f +; CHECK-BE-NEXT: .byte 55 // 0x37 +; CHECK-BE-NEXT: .byte 63 // 0x3f +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff define void @trunc_v8i64_to_v8i8_in_loop(ptr %A, 
ptr %dst) { ; CHECK-LABEL: trunc_v8i64_to_v8i8_in_loop: -; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: LBB4_1: ; %loop -; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add x9, x0, x8, lsl #6 -; CHECK-NEXT: ldp q1, q0, [x9, #32] -; CHECK-NEXT: ldp q3, q2, [x9] -; CHECK-NEXT: uzp1.4s v0, v1, v0 -; CHECK-NEXT: uzp1.4s v1, v3, v2 -; CHECK-NEXT: uzp1.8h v0, v1, v0 -; CHECK-NEXT: xtn.8b v0, v0 -; CHECK-NEXT: str d0, [x1, x8, lsl #3] -; CHECK-NEXT: add x8, x8, #1 -; CHECK-NEXT: cmp x8, #1000 -; CHECK-NEXT: b.eq LBB4_1 -; CHECK-NEXT: ; %bb.2: ; %exit -; CHECK-NEXT: ret +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: Lloh6: +; CHECK-NEXT: adrp x9, lCPI4_0@PAGE +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: Lloh7: +; CHECK-NEXT: ldr q0, [x9, lCPI4_0@PAGEOFF] +; CHECK-NEXT: LBB4_1: ; %loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add x9, x0, x8, lsl #6 +; CHECK-NEXT: ldp q1, q2, [x9] +; CHECK-NEXT: ldp q3, q4, [x9, #32] +; CHECK-NEXT: tbl.16b v1, { v1, v2, v3, v4 }, v0 +; CHECK-NEXT: str d1, [x1, x8, lsl #3] +; CHECK-NEXT: add x8, x8, #1 +; CHECK-NEXT: cmp x8, #1000 +; CHECK-NEXT: b.eq LBB4_1 +; CHECK-NEXT: ; %bb.2: ; %exit +; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpLdr Lloh6, Lloh7 ; CHECK-BE-LABEL: trunc_v8i64_to_v8i8_in_loop: ; CHECK-BE: // %bb.0: // %entry -; CHECK-BE-NEXT: mov x8, xzr -; CHECK-BE-NEXT: .LBB4_1: // %loop -; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-BE-NEXT: add x9, x0, x8, lsl #6 -; CHECK-BE-NEXT: add x10, x9, #48 -; CHECK-BE-NEXT: ld1 { v1.2d }, [x9] -; CHECK-BE-NEXT: ld1 { v0.2d }, [x10] -; CHECK-BE-NEXT: add x10, x9, #32 -; CHECK-BE-NEXT: add x9, x9, #16 -; CHECK-BE-NEXT: ld1 { v2.2d }, [x10] -; CHECK-BE-NEXT: ld1 { v3.2d }, [x9] -; CHECK-BE-NEXT: add x9, x1, x8, lsl #3 -; CHECK-BE-NEXT: add x8, x8, #1 -; CHECK-BE-NEXT: cmp x8, #1000 -; CHECK-BE-NEXT: uzp1 v0.4s, v2.4s, v0.4s -; CHECK-BE-NEXT: uzp1 v1.4s, v1.4s, v3.4s -; CHECK-BE-NEXT: uzp1 v0.8h, v1.8h, v0.8h -; CHECK-BE-NEXT: 
xtn v0.8b, v0.8h -; CHECK-BE-NEXT: st1 { v0.8b }, [x9] -; CHECK-BE-NEXT: b.eq .LBB4_1 +; CHECK-BE-NEXT: adrp x8, .LCPI4_0 +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI4_0 +; CHECK-BE-NEXT: ld1 { v0.16b }, [x8] +; CHECK-BE-NEXT: mov x8, xzr +; CHECK-BE-NEXT: .LBB4_1: // %loop +; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-BE-NEXT: add x9, x0, x8, lsl #6 +; CHECK-BE-NEXT: add x10, x9, #16 +; CHECK-BE-NEXT: add x11, x9, #32 +; CHECK-BE-NEXT: ld1 { v1.16b }, [x9] +; CHECK-BE-NEXT: add x9, x9, #48 +; CHECK-BE-NEXT: ld1 { v2.16b }, [x10] +; CHECK-BE-NEXT: ld1 { v3.16b }, [x11] +; CHECK-BE-NEXT: ld1 { v4.16b }, [x9] +; CHECK-BE-NEXT: add x9, x1, x8, lsl #3 +; CHECK-BE-NEXT: add x8, x8, #1 +; CHECK-BE-NEXT: cmp x8, #1000 +; CHECK-BE-NEXT: tbl v1.16b, { v1.16b, v2.16b, v3.16b, v4.16b }, v0.16b +; CHECK-BE-NEXT: st1 { v1.8b }, [x9] +; CHECK-BE-NEXT: b.eq .LBB4_1 ; CHECK-BE-NEXT: // %bb.2: // %exit -; CHECK-BE-NEXT: ret +; CHECK-BE-NEXT: ret entry: br label %loop @@ -554,3 +625,191 @@ exit: ret void } + +; CHECK-LABEL: lCPI7_0: +; CHECK-NEXT: .byte 0 ; 0x0 +; CHECK-NEXT: .byte 2 ; 0x2 +; CHECK-NEXT: .byte 4 ; 0x4 +; CHECK-NEXT: .byte 6 ; 0x6 +; CHECK-NEXT: .byte 8 ; 0x8 +; CHECK-NEXT: .byte 10 ; 0xa +; CHECK-NEXT: .byte 12 ; 0xc +; CHECK-NEXT: .byte 14 ; 0xe +; CHECK-NEXT: .byte 16 ; 0x10 +; CHECK-NEXT: .byte 18 ; 0x12 +; CHECK-NEXT: .byte 20 ; 0x14 +; CHECK-NEXT: .byte 22 ; 0x16 +; CHECK-NEXT: .byte 24 ; 0x18 +; CHECK-NEXT: .byte 26 ; 0x1a +; CHECK-NEXT: .byte 28 ; 0x1c +; CHECK-NEXT: .byte 30 ; 0x1e + +; CHECK-BE-LABEL: .LCPI7_0: +; CHECK-BE-NEXT: .byte 1 // 0x1 +; CHECK-BE-NEXT: .byte 3 // 0x3 +; CHECK-BE-NEXT: .byte 5 // 0x5 +; CHECK-BE-NEXT: .byte 7 // 0x7 +; CHECK-BE-NEXT: .byte 9 // 0x9 +; CHECK-BE-NEXT: .byte 11 // 0xb +; CHECK-BE-NEXT: .byte 13 // 0xd +; CHECK-BE-NEXT: .byte 15 // 0xf +; CHECK-BE-NEXT: .byte 17 // 0x11 +; CHECK-BE-NEXT: .byte 19 // 0x13 +; CHECK-BE-NEXT: .byte 21 // 0x15 +; CHECK-BE-NEXT: .byte 23 // 0x17 +; CHECK-BE-NEXT: .byte 25 // 
0x19 +; CHECK-BE-NEXT: .byte 27 // 0x1b +; CHECK-BE-NEXT: .byte 29 // 0x1d +; CHECK-BE-NEXT: .byte 31 // 0x1f + + +define void @trunc_v16i16_to_v16i8_in_loop(ptr %A, ptr %dst) { +; CHECK-LABEL: trunc_v16i16_to_v16i8_in_loop: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: Lloh8: +; CHECK-NEXT: adrp x9, lCPI7_0@PAGE +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: Lloh9: +; CHECK-NEXT: ldr q0, [x9, lCPI7_0@PAGEOFF] +; CHECK-NEXT: LBB7_1: ; %loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add x9, x0, x8, lsl #5 +; CHECK-NEXT: ldp q1, q2, [x9] +; CHECK-NEXT: tbl.16b v1, { v1, v2 }, v0 +; CHECK-NEXT: str q1, [x1, x8, lsl #4] +; CHECK-NEXT: add x8, x8, #1 +; CHECK-NEXT: cmp x8, #1000 +; CHECK-NEXT: b.eq LBB7_1 +; CHECK-NEXT: ; %bb.2: ; %exit +; CHECK-NEXT: ret + +; CHECK-BE-LABEL: trunc_v16i16_to_v16i8_in_loop: +; CHECK-BE: // %bb.0: // %entry +; CHECK-BE-NEXT: adrp x8, .LCPI7_0 +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI7_0 +; CHECK-BE-NEXT: ld1 { v0.16b }, [x8] +; CHECK-BE-NEXT: mov x8, xzr +; CHECK-BE-NEXT: .LBB7_1: // %loop +; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-BE-NEXT: add x9, x0, x8, lsl #5 +; CHECK-BE-NEXT: add x10, x9, #16 +; CHECK-BE-NEXT: ld1 { v1.16b }, [x9] +; CHECK-BE-NEXT: add x9, x1, x8, lsl #4 +; CHECK-BE-NEXT: add x8, x8, #1 +; CHECK-BE-NEXT: ld1 { v2.16b }, [x10] +; CHECK-BE-NEXT: cmp x8, #1000 +; CHECK-BE-NEXT: tbl v1.16b, { v1.16b, v2.16b }, v0.16b +; CHECK-BE-NEXT: st1 { v1.16b }, [x9] +; CHECK-BE-NEXT: b.eq .LBB7_1 +; CHECK-BE-NEXT: // %bb.2: // %exit +; CHECK-BE-NEXT: ret + +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep.A = getelementptr inbounds <16 x i16>, ptr %A, i64 %iv + %l.A = load <16 x i16>, ptr %gep.A + %trunc = trunc <16 x i16> %l.A to <16 x i8> + %gep.dst = getelementptr inbounds <16 x i8>, ptr %dst, i64 %iv + store <16 x i8> %trunc, ptr %gep.dst + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 1000 + br i1 %ec, label %loop, label %exit + +exit: + 
ret void +} + +; CHECK-LABEL: lCPI8_0: +; CHECK-NEXT: .byte 0 ; 0x0 +; CHECK-NEXT: .byte 2 ; 0x2 +; CHECK-NEXT: .byte 4 ; 0x4 +; CHECK-NEXT: .byte 6 ; 0x6 +; CHECK-NEXT: .byte 8 ; 0x8 +; CHECK-NEXT: .byte 10 ; 0xa +; CHECK-NEXT: .byte 12 ; 0xc +; CHECK-NEXT: .byte 14 ; 0xe +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff + +; CHECK-BE-LABEL: .LCPI8_0: +; CHECK-BE-NEXT: .byte 1 // 0x1 +; CHECK-BE-NEXT: .byte 3 // 0x3 +; CHECK-BE-NEXT: .byte 5 // 0x5 +; CHECK-BE-NEXT: .byte 7 // 0x7 +; CHECK-BE-NEXT: .byte 9 // 0x9 +; CHECK-BE-NEXT: .byte 11 // 0xb +; CHECK-BE-NEXT: .byte 13 // 0xd +; CHECK-BE-NEXT: .byte 15 // 0xf +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff +; CHECK-BE-NEXT: .byte 255 // 0xff + +define void @trunc_v8i16_to_v8i8_in_loop(ptr %A, ptr %dst) { +; CHECK-LABEL: trunc_v8i16_to_v8i8_in_loop: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: Lloh10: +; CHECK-NEXT: adrp x9, lCPI8_0@PAGE +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: Lloh11: +; CHECK-NEXT: ldr q0, [x9, lCPI8_0@PAGEOFF] +; CHECK-NEXT: LBB8_1: ; %loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr q1, [x0, x8, lsl #4] +; CHECK-NEXT: tbl.16b v1, { v1 }, v0 +; CHECK-NEXT: str d1, [x1, x8, lsl #3] +; CHECK-NEXT: add x8, x8, #1 +; CHECK-NEXT: cmp x8, #1000 +; CHECK-NEXT: b.eq LBB8_1 +; CHECK-NEXT: ; %bb.2: ; %exit +; CHECK-NEXT: ret + +; CHECK-BE-LABEL: trunc_v8i16_to_v8i8_in_loop: +; CHECK-BE: // %bb.0: // %entry +; CHECK-BE-NEXT: adrp x8, .LCPI8_0 +; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI8_0 +; CHECK-BE-NEXT: ld1 { v0.16b }, [x8] +; CHECK-BE-NEXT: mov x8, xzr +; CHECK-BE-NEXT: 
.LBB8_1: // %loop +; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-BE-NEXT: add x9, x0, x8, lsl #4 +; CHECK-BE-NEXT: ld1 { v1.16b }, [x9] +; CHECK-BE-NEXT: add x9, x1, x8, lsl #3 +; CHECK-BE-NEXT: add x8, x8, #1 +; CHECK-BE-NEXT: cmp x8, #1000 +; CHECK-BE-NEXT: tbl v1.16b, { v1.16b }, v0.16b +; CHECK-BE-NEXT: st1 { v1.8b }, [x9] +; CHECK-BE-NEXT: b.eq .LBB8_1 +; CHECK-BE-NEXT: // %bb.2: // %exit +; CHECK-BE-NEXT: ret + +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep.A = getelementptr inbounds <8 x i16>, ptr %A, i64 %iv + %l.A = load <8 x i16>, ptr %gep.A + %trunc = trunc <8 x i16> %l.A to <8 x i8> + %gep.dst = getelementptr inbounds <8 x i8>, ptr %dst, i64 %iv + store <8 x i8> %trunc, ptr %gep.dst + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 1000 + br i1 %ec, label %loop, label %exit + +exit: + ret void +}