This is an archive of the discontinued LLVM Phabricator instance.

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
13216–13217	Further to our big-endian discussion, I think this `bitcast` will put the least significant byte into elements 3, 7, 11, 15. So the for loop below should account for that.
13289	Shouldn't this be part of the patch that adds the code above?
13292	Destination type wrong.

fhahn mentioned this in rG791a7ae1ba3e: [AArch64] Add big-endian tests for trunc-to-tbl.ll. (Sep 15 2022, 7:13 AM)

Thanks Tim, I updated the code to account for big-endian targets by adjusting the mask as suggested offline.

Harbormaster completed remote builds in B186855: Diff 460402. (Sep 15 2022, 7:30 AM)

fhahn marked 2 inline comments as done. (Sep 15 2022, 7:30 AM)

fhahn added inline comments.

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
13216–13217	Thanks, I adjusted the loop below to use Idx+3 for the mask to account for that on big-endian.
13289	Yes, moved to D133494.
13292	Thanks, should be fixed!

Thanks Florian, think this one looks good now.

This revision is now accepted and ready to land. (Sep 16 2022, 1:25 AM)

This revision was landed with ongoing or failed builds. (Sep 16 2022, 4:43 AM)

Closed by commit rG8491d01cc385: [AArch64] Lower vector trunc using tbl. (authored by fhahn). · Explain Why

This revision was automatically updated to reflect the committed changes.

fhahn marked 2 inline comments as done.

fhahn added a commit: rG8491d01cc385: [AArch64] Lower vector trunc using tbl..

nilanjana_basu mentioned this in D137221: [MicroBenchmarks] Add benchmarks to check runtime of truncate or zero-extend vector operations in AArch64. (Nov 1 2022, 6:31 PM)

nilanjana_basu mentioned this in rT3b44b6bdd3e8: [MicroBenchmarks] Add benchmarks to check runtime of truncate or zero-extend…. (Nov 2 2022, 2:05 PM)

fhahn mentioned this in D138059: [MicroBenchmarks,AArch64] Added correctness test & other performance tests for truncate or zero-extend vector operations. (Nov 15 2022, 4:00 PM)

nilanjana_basu mentioned this in rT08de51078b0a: [MicroBenchmarks,AArch64] Added correctness test & other performance tests for…. (Dec 1 2022, 10:09 PM)

nilanjana_basu mentioned this in rG02d09ffc1b09: [AArch64] Extending lowering of 'trunc <(8|16) x i64> %x to <(8|16) x i8>' to…. (Dec 15 2022, 7:21 AM)

Revision Contents

Path

Size

llvm/

lib/

CodeGen/

CodeGenPrepare.cpp

5 lines

Target/

AArch64/

AArch64ISelLowering.cpp

50 lines

test/

CodeGen/

AArch64/

trunc-to-tbl.ll

134 lines

Diff 460699

llvm/lib/CodeGen/CodeGenPrepare.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 8,041 Lines • ▼ Show 20 Lines	if (CastInst *CI = dyn_cast<CastInst>(I)) {
// the address of globals out of a loop). If this is the case, we don't		// the address of globals out of a loop). If this is the case, we don't
// want to forward-subst the cast.		// want to forward-subst the cast.
if (isa<Constant>(CI->getOperand(0)))		if (isa<Constant>(CI->getOperand(0)))
return false;		return false;

if (OptimizeNoopCopyExpression(CI, TLI, DL))		if (OptimizeNoopCopyExpression(CI, TLI, DL))
return true;		return true;

if (isa<UIToFPInst>(I) && TLI->optimizeExtendOrTruncateConversion(		if ((isa<UIToFPInst>(I) \|\| isa<TruncInst>(I)) &&
I, LI->getLoopFor(I->getParent())))		TLI->optimizeExtendOrTruncateConversion(I,
		LI->getLoopFor(I->getParent())))
return true;		return true;

if (isa<ZExtInst>(I) \|\| isa<SExtInst>(I)) {		if (isa<ZExtInst>(I) \|\| isa<SExtInst>(I)) {
/// Sink a zext or sext into its user blocks if the target type doesn't		/// Sink a zext or sext into its user blocks if the target type doesn't
/// fit in one register		/// fit in one register
if (TLI->getTypeAction(CI->getContext(),		if (TLI->getTypeAction(CI->getContext(),
TLI->getValueType(*DL, CI->getType())) ==		TLI->getValueType(*DL, CI->getType())) ==
TargetLowering::TypeExpandInteger) {		TargetLowering::TypeExpandInteger) {
▲ Show 20 Lines • Show All 513 Lines • Show Last 20 Lines

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 13,203 Lines • ▼ Show 20 Lines	static void createTblShuffleForZExt(ZExtInst *ZExt, bool IsLittleEndian) {
auto *FirstEltZero = Builder.CreateInsertElement(		auto *FirstEltZero = Builder.CreateInsertElement(
PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0));		PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0));
Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);		Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
Result = Builder.CreateBitCast(Result, DstTy);		Result = Builder.CreateBitCast(Result, DstTy);
ZExt->replaceAllUsesWith(Result);		ZExt->replaceAllUsesWith(Result);
ZExt->eraseFromParent();		ZExt->eraseFromParent();
}		}

		// Replaces the vector truncate TI with a call to the aarch64_neon_tbl2 or
		// aarch64_neon_tbl4 intrinsic. TI's operand is split into 4-element pieces,
		// each bitcast to <16 x i8>; a constant byte-index mask then picks one byte
		// out of every group of four. IsLittleEndian selects which byte of each
		// group is picked. The visible caller only invokes this for truncates of
		// 8 or 16 x i32 to i8. TI is erased before returning.
		static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
		// New instructions are inserted immediately before TI.
		IRBuilder<> Builder(TI);
		// Operands for the tbl call: the table vectors first, the mask last.
		SmallVector<Value *> Parts;
		// Every table operand is reinterpreted as 16 x i8.
		Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
		// Source elements 0-3 form the first table operand.
		Parts.push_back(Builder.CreateBitCast(
		Builder.CreateShuffleVector(TI->getOperand(0), {0, 1, 2, 3}), VecTy));
		t.p.northoverUnsubmitted Done Reply Inline Actions Further to our big-endian discussion, I think this `bitcast` will put the least significant byte into elements 3, 7, 11, 15. So the for loop below should account for that. t.p.northover: Further to our big-endian discussion, I think this `bitcast` will put the least significant…
		fhahnAuthorUnsubmitted Done Reply Inline Actions Thanks, I adjusted the loop below to use Idx+3 for the mask to account for that on big-endian. fhahn: Thanks, I adjusted the loop below to use Idx+3 for the mask to account for that on big-endian.
		// Source elements 4-7 form the second table operand.
		Parts.push_back(Builder.CreateBitCast(
		Builder.CreateShuffleVector(TI->getOperand(0), {4, 5, 6, 7}), VecTy));

		// Two table operands (tbl2) by default; switched to tbl4 below when the
		// result has 16 elements.
		Intrinsic::ID TblID = Intrinsic::aarch64_neon_tbl2;
		unsigned NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
		if (NumElements == 16) {
		// Source elements 8-15 supply the third and fourth table operands.
		Parts.push_back(Builder.CreateBitCast(
		Builder.CreateShuffleVector(TI->getOperand(0), {8, 9, 10, 11}), VecTy));
		Parts.push_back(Builder.CreateBitCast(
		Builder.CreateShuffleVector(TI->getOperand(0), {12, 13, 14, 15}),
		VecTy));
		TblID = Intrinsic::aarch64_neon_tbl4;
		}
		// Build the byte-index mask: one index per result element, stepping by 4.
		// Little-endian uses byte Idx of each group; big-endian uses Idx + 3
		// because, per the review discussion, the bitcast puts the least
		// significant byte in elements 3, 7, 11, 15 there.
		SmallVector<Constant *, 16> MaskConst;
		for (unsigned Idx = 0; Idx < NumElements * 4; Idx += 4)
		MaskConst.push_back(
		ConstantInt::get(Builder.getInt8Ty(), IsLittleEndian ? Idx : Idx + 3));

		// Pad the mask to 16 entries with 255. This only adds entries when
		// NumElements is 8; those lanes are dropped by the shuffle further down.
		for (unsigned Idx = NumElements * 4; Idx < 64; Idx += 4)
		MaskConst.push_back(ConstantInt::get(Builder.getInt8Ty(), 255));

		// The mask is the final call operand.
		Parts.push_back(ConstantVector::get(MaskConst));
		// The intrinsic declaration is requested with the <16 x i8> operand type.
		auto *F =
		Intrinsic::getDeclaration(TI->getModule(), TblID, Parts[0]->getType());
		Value *Res = Builder.CreateCall(F, Parts);

		// For an 8-element truncate, keep only lanes 0-7 of the call result.
		if (NumElements == 8)
		Res = Builder.CreateShuffleVector(Res, {0, 1, 2, 3, 4, 5, 6, 7});
		// Redirect all users to the new value and delete the original trunc.
		TI->replaceAllUsesWith(Res);
		TI->eraseFromParent();
		}

bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(Instruction *I,		bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(Instruction *I,
Loop *L) const {		Loop *L) const {
// Try to optimize conversions using tbl. This requires materializing constant		// Try to optimize conversions using tbl. This requires materializing constant
// index vectors, which can increase code size and add loads. Skip the		// index vectors, which can increase code size and add loads. Skip the
// transform unless the conversion is in a loop block guaranteed to execute		// transform unless the conversion is in a loop block guaranteed to execute
// and we are not optimizing for size.		// and we are not optimizing for size.
Function *F = I->getParent()->getParent();		Function *F = I->getParent()->getParent();
if (!L \|\| L->getHeader() != I->getParent() \|\| F->hasMinSize() \|\|		if (!L \|\| L->getHeader() != I->getParent() \|\| F->hasMinSize() \|\|
Show All 23 Lines	if (UIToFP &&
DstTy->getElementType()->isFloatTy()) {		DstTy->getElementType()->isFloatTy()) {
IRBuilder<> Builder(I);		IRBuilder<> Builder(I);
auto *ZExt = cast<ZExtInst>(		auto *ZExt = cast<ZExtInst>(
Builder.CreateZExt(I->getOperand(0), VectorType::getInteger(DstTy)));		Builder.CreateZExt(I->getOperand(0), VectorType::getInteger(DstTy)));
auto *UI = Builder.CreateUIToFP(ZExt, DstTy);		auto *UI = Builder.CreateUIToFP(ZExt, DstTy);
I->replaceAllUsesWith(UI);		I->replaceAllUsesWith(UI);
I->eraseFromParent();		I->eraseFromParent();
createTblShuffleForZExt(ZExt, Subtarget->isLittleEndian());		createTblShuffleForZExt(ZExt, Subtarget->isLittleEndian());
return true;		return true;
		[Inline comment, marked done] t.p.northover: Shouldn't this be part of the patch that adds the code above?
		[Inline comment, marked done] fhahn (author): Yes, moved to D133494.
}		}

		// Convert 'trunc <(8\|16) x i32> %x to <(8\|16) x i8>' to a single tbl.4
		[Inline comment, marked done] t.p.northover: Destination type wrong.
		[Inline comment, marked done] fhahn (author): Thanks, should be fixed!
		// instruction selecting the lowest 8 bits per lane of the input interpreted
		// as 2 or 4 <4 x i32> vectors.
		auto *TI = dyn_cast<TruncInst>(I);
		if (TI && (SrcTy->getNumElements() == 8 \|\| SrcTy->getNumElements() == 16) &&
		SrcTy->getElementType()->isIntegerTy(32) &&
		DstTy->getElementType()->isIntegerTy(8)) {
		createTblForTrunc(TI, Subtarget->isLittleEndian());
		return true;
		}

return false;		return false;
}		}

bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,		bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
Align &RequiredAligment) const {		Align &RequiredAligment) const {
if (!LoadedType.isSimple() \|\|		if (!LoadedType.isSimple() \|\|
(!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))		(!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
return false;		return false;
▲ Show 20 Lines • Show All 9,165 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/trunc-to-tbl.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc -mtriple=arm64-apple-ios -o - %s \| FileCheck %s			; RUN: llc -mtriple=arm64-apple-ios -o - %s \| FileCheck %s
	; RUN: llc -mtriple=aarch64_be-unknown-linux -o - %s \| FileCheck --check-prefix=CHECK-BE %s			; RUN: llc -mtriple=aarch64_be-unknown-linux -o - %s \| FileCheck --check-prefix=CHECK-BE %s

				; CHECK-LABEL: lCPI0_0:
				; CHECK-NEXT: .byte 0 ; 0x0
				; CHECK-NEXT: .byte 4 ; 0x4
				; CHECK-NEXT: .byte 8 ; 0x8
				; CHECK-NEXT: .byte 12 ; 0xc
				; CHECK-NEXT: .byte 16 ; 0x10
				; CHECK-NEXT: .byte 20 ; 0x14
				; CHECK-NEXT: .byte 24 ; 0x18
				; CHECK-NEXT: .byte 28 ; 0x1c
				; CHECK-NEXT: .byte 32 ; 0x20
				; CHECK-NEXT: .byte 36 ; 0x24
				; CHECK-NEXT: .byte 40 ; 0x28
				; CHECK-NEXT: .byte 44 ; 0x2c
				; CHECK-NEXT: .byte 48 ; 0x30
				; CHECK-NEXT: .byte 52 ; 0x34
				; CHECK-NEXT: .byte 56 ; 0x38
				; CHECK-NEXT: .byte 60 ; 0x3c

				; CHECK-BE-LABEL: .LCPI0_0:
				; CHECK-BE-NEXT: .byte 3 // 0x3
				; CHECK-BE-NEXT: .byte 7 // 0x7
				; CHECK-BE-NEXT: .byte 11 // 0xb
				; CHECK-BE-NEXT: .byte 15 // 0xf
				; CHECK-BE-NEXT: .byte 19 // 0x13
				; CHECK-BE-NEXT: .byte 23 // 0x17
				; CHECK-BE-NEXT: .byte 27 // 0x1b
				; CHECK-BE-NEXT: .byte 31 // 0x1f
				; CHECK-BE-NEXT: .byte 35 // 0x23
				; CHECK-BE-NEXT: .byte 39 // 0x27
				; CHECK-BE-NEXT: .byte 43 // 0x2b
				; CHECK-BE-NEXT: .byte 47 // 0x2f
				; CHECK-BE-NEXT: .byte 51 // 0x33
				; CHECK-BE-NEXT: .byte 55 // 0x37
				; CHECK-BE-NEXT: .byte 59 // 0x3b
				; CHECK-BE-NEXT: .byte 63 // 0x3f

	; It's profitable to use a single tbl.4 instruction to lower the truncate.			; It's profitable to use a single tbl.4 instruction to lower the truncate.
	define void @trunc_v16i32_to_v16i8_in_loop(ptr %A, ptr %dst) {			define void @trunc_v16i32_to_v16i8_in_loop(ptr %A, ptr %dst) {
	; CHECK-LABEL: trunc_v16i32_to_v16i8_in_loop:			; CHECK-LABEL: trunc_v16i32_to_v16i8_in_loop:
	; CHECK: ; %bb.0: ; %entry			; CHECK: ; %bb.0: ; %entry
				; CHECK-NEXT: Lloh0:
				; CHECK-NEXT: adrp x9, lCPI0_0@PAGE
	; CHECK-NEXT: mov x8, xzr			; CHECK-NEXT: mov x8, xzr
				; CHECK-NEXT: Lloh1:
				; CHECK-NEXT: ldr q0, [x9, lCPI0_0@PAGEOFF]
	; CHECK-NEXT: LBB0_1: ; %loop			; CHECK-NEXT: LBB0_1: ; %loop
	; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1			; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
	; CHECK-NEXT: add x9, x0, x8, lsl #6			; CHECK-NEXT: add x9, x0, x8, lsl #6
	; CHECK-NEXT: ldp q1, q0, [x9, #32]			; CHECK-NEXT: ldp q1, q2, [x9]
	; CHECK-NEXT: ldp q3, q2, [x9]			; CHECK-NEXT: ldp q3, q4, [x9, #32]
	; CHECK-NEXT: uzp1.8h v0, v1, v0			; CHECK-NEXT: tbl.16b v1, { v1, v2, v3, v4 }, v0
	; CHECK-NEXT: uzp1.8h v1, v3, v2			; CHECK-NEXT: str q1, [x1, x8, lsl #4]
	; CHECK-NEXT: uzp1.16b v0, v1, v0
	; CHECK-NEXT: str q0, [x1, x8, lsl #4]
	; CHECK-NEXT: add x8, x8, #1			; CHECK-NEXT: add x8, x8, #1
	; CHECK-NEXT: cmp x8, #1000			; CHECK-NEXT: cmp x8, #1000
	; CHECK-NEXT: b.eq LBB0_1			; CHECK-NEXT: b.eq LBB0_1
	; CHECK-NEXT: ; %bb.2: ; %exit			; CHECK-NEXT: ; %bb.2: ; %exit
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
				; CHECK-NEXT: .loh AdrpLdr Lloh0, Lloh1
	;			;
	; CHECK-BE-LABEL: trunc_v16i32_to_v16i8_in_loop:			; CHECK-BE-LABEL: trunc_v16i32_to_v16i8_in_loop:
	; CHECK-BE: // %bb.0: // %entry			; CHECK-BE: // %bb.0: // %entry
				; CHECK-BE-NEXT: adrp x8, .LCPI0_0
				; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI0_0
				; CHECK-BE-NEXT: ld1 { v0.16b }, [x8]
	; CHECK-BE-NEXT: mov x8, xzr			; CHECK-BE-NEXT: mov x8, xzr
	; CHECK-BE-NEXT: .LBB0_1: // %loop			; CHECK-BE-NEXT: .LBB0_1: // %loop
	; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1			; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
	; CHECK-BE-NEXT: add x9, x0, x8, lsl #6			; CHECK-BE-NEXT: add x9, x0, x8, lsl #6
	; CHECK-BE-NEXT: add x10, x9, #48			; CHECK-BE-NEXT: add x10, x9, #16
	; CHECK-BE-NEXT: add x11, x9, #32			; CHECK-BE-NEXT: add x11, x9, #32
	; CHECK-BE-NEXT: ld1 { v0.4s }, [x9]			; CHECK-BE-NEXT: ld1 { v1.16b }, [x9]
	; CHECK-BE-NEXT: add x9, x9, #16			; CHECK-BE-NEXT: add x9, x9, #48
	; CHECK-BE-NEXT: ld1 { v1.4s }, [x10]			; CHECK-BE-NEXT: ld1 { v2.16b }, [x10]
	; CHECK-BE-NEXT: ld1 { v2.4s }, [x11]			; CHECK-BE-NEXT: ld1 { v3.16b }, [x11]
	; CHECK-BE-NEXT: ld1 { v3.4s }, [x9]			; CHECK-BE-NEXT: ld1 { v4.16b }, [x9]
	; CHECK-BE-NEXT: add x9, x1, x8, lsl #4			; CHECK-BE-NEXT: add x9, x1, x8, lsl #4
	; CHECK-BE-NEXT: add x8, x8, #1			; CHECK-BE-NEXT: add x8, x8, #1
	; CHECK-BE-NEXT: cmp x8, #1000			; CHECK-BE-NEXT: cmp x8, #1000
	; CHECK-BE-NEXT: uzp1 v1.8h, v2.8h, v1.8h			; CHECK-BE-NEXT: tbl v1.16b, { v1.16b, v2.16b, v3.16b, v4.16b }, v0.16b
	; CHECK-BE-NEXT: uzp1 v0.8h, v0.8h, v3.8h			; CHECK-BE-NEXT: st1 { v1.16b }, [x9]
	; CHECK-BE-NEXT: uzp1 v0.16b, v0.16b, v1.16b
	; CHECK-BE-NEXT: st1 { v0.16b }, [x9]
	; CHECK-BE-NEXT: b.eq .LBB0_1			; CHECK-BE-NEXT: b.eq .LBB0_1
	; CHECK-BE-NEXT: // %bb.2: // %exit			; CHECK-BE-NEXT: // %bb.2: // %exit
	; CHECK-BE-NEXT: ret			; CHECK-BE-NEXT: ret

	entry:			entry:
	br label %loop			br label %loop

	loop:			loop:
	%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]			%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
	%gep.A = getelementptr inbounds <16 x i32>, ptr %A, i64 %iv			%gep.A = getelementptr inbounds <16 x i32>, ptr %A, i64 %iv
	%l.A = load <16 x i32>, ptr %gep.A			%l.A = load <16 x i32>, ptr %gep.A
	%trunc = trunc <16 x i32> %l.A to <16 x i8>			%trunc = trunc <16 x i32> %l.A to <16 x i8>
	Show All 36 Lines
	; CHECK-BE-NEXT: ret			; CHECK-BE-NEXT: ret
	entry:			entry:
	%l.A = load <16 x i32>, ptr %A			%l.A = load <16 x i32>, ptr %A
	%trunc = trunc <16 x i32> %l.A to <16 x i8>			%trunc = trunc <16 x i32> %l.A to <16 x i8>
	store <16 x i8> %trunc, ptr %dst			store <16 x i8> %trunc, ptr %dst
	ret void			ret void
	}			}


				; CHECK-LABEL: lCPI2_0:
				; CHECK-NEXT: .byte 0 ; 0x0
				; CHECK-NEXT: .byte 4 ; 0x4
				; CHECK-NEXT: .byte 8 ; 0x8
				; CHECK-NEXT: .byte 12 ; 0xc
				; CHECK-NEXT: .byte 16 ; 0x10
				; CHECK-NEXT: .byte 20 ; 0x14
				; CHECK-NEXT: .byte 24 ; 0x18
				; CHECK-NEXT: .byte 28 ; 0x1c
				; CHECK-NEXT: .byte 255 ; 0xff
				; CHECK-NEXT: .byte 255 ; 0xff
				; CHECK-NEXT: .byte 255 ; 0xff
				; CHECK-NEXT: .byte 255 ; 0xff
				; CHECK-NEXT: .byte 255 ; 0xff
				; CHECK-NEXT: .byte 255 ; 0xff
				; CHECK-NEXT: .byte 255 ; 0xff
				; CHECK-NEXT: .byte 255 ; 0xff

				; CHECK-BE-LABEL: .LCPI2_0:
				; CHECK-BE-NEXT: .byte 3 // 0x3
				; CHECK-BE-NEXT: .byte 7 // 0x7
				; CHECK-BE-NEXT: .byte 11 // 0xb
				; CHECK-BE-NEXT: .byte 15 // 0xf
				; CHECK-BE-NEXT: .byte 19 // 0x13
				; CHECK-BE-NEXT: .byte 23 // 0x17
				; CHECK-BE-NEXT: .byte 27 // 0x1b
				; CHECK-BE-NEXT: .byte 31 // 0x1f
				; CHECK-BE-NEXT: .byte 255 // 0xff
				; CHECK-BE-NEXT: .byte 255 // 0xff
				; CHECK-BE-NEXT: .byte 255 // 0xff
				; CHECK-BE-NEXT: .byte 255 // 0xff
				; CHECK-BE-NEXT: .byte 255 // 0xff
				; CHECK-BE-NEXT: .byte 255 // 0xff
				; CHECK-BE-NEXT: .byte 255 // 0xff
				; CHECK-BE-NEXT: .byte 255 // 0xff
	; It's profitable to use a single tbl.2 instruction to lower the truncate.			; It's profitable to use a single tbl.2 instruction to lower the truncate.
	define void @trunc_v8i32_to_v8i8_in_loop(ptr %A, ptr %dst) {			define void @trunc_v8i32_to_v8i8_in_loop(ptr %A, ptr %dst) {
	; CHECK-LABEL: trunc_v8i32_to_v8i8_in_loop:			; CHECK-LABEL: trunc_v8i32_to_v8i8_in_loop:
	; CHECK: ; %bb.0: ; %entry			; CHECK: ; %bb.0: ; %entry
				; CHECK-NEXT: Lloh2:
				; CHECK-NEXT: adrp x9, lCPI2_0@PAGE
	; CHECK-NEXT: mov x8, xzr			; CHECK-NEXT: mov x8, xzr
				; CHECK-NEXT: Lloh3:
				; CHECK-NEXT: ldr q0, [x9, lCPI2_0@PAGEOFF]
	; CHECK-NEXT: LBB2_1: ; %loop			; CHECK-NEXT: LBB2_1: ; %loop
	; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1			; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
	; CHECK-NEXT: add x9, x0, x8, lsl #5			; CHECK-NEXT: add x9, x0, x8, lsl #5
	; CHECK-NEXT: ldp q1, q0, [x9]			; CHECK-NEXT: ldp q1, q2, [x9]
	; CHECK-NEXT: uzp1.8h v0, v1, v0			; CHECK-NEXT: tbl.16b v1, { v1, v2 }, v0
	; CHECK-NEXT: xtn.8b v0, v0			; CHECK-NEXT: str d1, [x1, x8, lsl #3]
	; CHECK-NEXT: str d0, [x1, x8, lsl #3]
	; CHECK-NEXT: add x8, x8, #1			; CHECK-NEXT: add x8, x8, #1
	; CHECK-NEXT: cmp x8, #1000			; CHECK-NEXT: cmp x8, #1000
	; CHECK-NEXT: b.eq LBB2_1			; CHECK-NEXT: b.eq LBB2_1
	; CHECK-NEXT: ; %bb.2: ; %exit			; CHECK-NEXT: ; %bb.2: ; %exit
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
				; CHECK-NEXT: .loh AdrpLdr Lloh2, Lloh3
	;			;
	; CHECK-BE-LABEL: trunc_v8i32_to_v8i8_in_loop:			; CHECK-BE-LABEL: trunc_v8i32_to_v8i8_in_loop:
	; CHECK-BE: // %bb.0: // %entry			; CHECK-BE: // %bb.0: // %entry
				; CHECK-BE-NEXT: adrp x8, .LCPI2_0
				; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI2_0
				; CHECK-BE-NEXT: ld1 { v0.16b }, [x8]
	; CHECK-BE-NEXT: mov x8, xzr			; CHECK-BE-NEXT: mov x8, xzr
	; CHECK-BE-NEXT: .LBB2_1: // %loop			; CHECK-BE-NEXT: .LBB2_1: // %loop
	; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1			; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
	; CHECK-BE-NEXT: add x9, x0, x8, lsl #5			; CHECK-BE-NEXT: add x9, x0, x8, lsl #5
	; CHECK-BE-NEXT: add x10, x9, #16			; CHECK-BE-NEXT: add x10, x9, #16
	; CHECK-BE-NEXT: ld1 { v0.4s }, [x9]			; CHECK-BE-NEXT: ld1 { v1.16b }, [x9]
	; CHECK-BE-NEXT: add x9, x1, x8, lsl #3			; CHECK-BE-NEXT: add x9, x1, x8, lsl #3
	; CHECK-BE-NEXT: add x8, x8, #1			; CHECK-BE-NEXT: add x8, x8, #1
	; CHECK-BE-NEXT: ld1 { v1.4s }, [x10]			; CHECK-BE-NEXT: ld1 { v2.16b }, [x10]
	; CHECK-BE-NEXT: cmp x8, #1000			; CHECK-BE-NEXT: cmp x8, #1000
	; CHECK-BE-NEXT: uzp1 v0.8h, v0.8h, v1.8h			; CHECK-BE-NEXT: tbl v1.16b, { v1.16b, v2.16b }, v0.16b
	; CHECK-BE-NEXT: xtn v0.8b, v0.8h			; CHECK-BE-NEXT: st1 { v1.8b }, [x9]
	; CHECK-BE-NEXT: st1 { v0.8b }, [x9]
	; CHECK-BE-NEXT: b.eq .LBB2_1			; CHECK-BE-NEXT: b.eq .LBB2_1
	; CHECK-BE-NEXT: // %bb.2: // %exit			; CHECK-BE-NEXT: // %bb.2: // %exit
	; CHECK-BE-NEXT: ret			; CHECK-BE-NEXT: ret

	entry:			entry:
	br label %loop			br label %loop

	loop:			loop:
	%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]			%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
	%gep.A = getelementptr inbounds <8 x i32>, ptr %A, i64 %iv			%gep.A = getelementptr inbounds <8 x i32>, ptr %A, i64 %iv
	%l.A = load <8 x i32>, ptr %gep.A			%l.A = load <8 x i32>, ptr %gep.A
	%trunc = trunc <8 x i32> %l.A to <8 x i8>			%trunc = trunc <8 x i32> %l.A to <8 x i8>
	Show All 9 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[AArch64] Lower vector trunc using tbl.ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 460699

llvm/lib/CodeGen/CodeGenPrepare.cpp

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

llvm/test/CodeGen/AArch64/trunc-to-tbl.ll

[AArch64] Lower vector trunc using tbl.
ClosedPublic