This is an archive of the discontinued LLVM Phabricator instance.

[X86] Transform amx pointer.
AbandonedPublic

Authored by LuoYuanke on Dec 23 2020, 6:03 PM.

Download Raw Diff

Details

Reviewers

craig.topper
pengfei
spatel
RKSimon
jyknight
Florian
lebedev.ri

Summary

The front end bitcasts <256 x i32> to x86_amx and generates load/store of <256 x i32>*.
The instruction combine pass then transforms load/store of <256 x i32>* into
load/store of x86_amx*. In the AMX type lowering pass, we lower these load/store
instructions to AMX load/store intrinsics with a stride value of 64.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

LuoYuanke created this revision.Dec 23 2020, 6:03 PM

Herald added subscribers: pengfei, hiraditya. · View Herald TranscriptDec 23 2020, 6:03 PM

LuoYuanke requested review of this revision.Dec 23 2020, 6:03 PM

Herald added a project: Restricted Project. · View Herald TranscriptDec 23 2020, 6:03 PM

Herald added a subscriber: llvm-commits. · View Herald Transcript

Harbormaster completed remote builds in B83443: Diff 313636.Dec 23 2020, 6:04 PM

LuoYuanke added a parent revision: D91927: [X86] Add x86_amx type for intel AMX..Dec 23 2020, 6:04 PM

LuoYuanke mentioned this in D98247: [X86][AMX] Prevent transforming load pointer from <256 x i32>* to x86_amx*..Mar 9 2021, 4:53 AM

Rebase.

Harbormaster completed remote builds in B94825: Diff 332053.Mar 19 2021, 6:29 PM

Support PHI operation.

Harbormaster completed remote builds in B94855: Diff 332098.Mar 20 2021, 5:41 AM

LuoYuanke edited the summary of this revision. (Show Details)Mar 20 2021, 5:44 AM

LuoYuanke added reviewers: craig.topper, pengfei, lebedev.ri, spatel, RKSimon, jyknight, Florian.

LuoYuanke edited the summary of this revision. (Show Details)

LuoYuanke added a subscriber: annita.zhang.

a few style comments

llvm/lib/Target/X86/X86LowerAMXType.cpp
102	isa<>
130	isa<>
348	''' if (auto *ST = dyn_cast<StoreInst>(I)) '''

What's the status here?

This review seems to be stuck/dead; consider abandoning it if it is no longer relevant.

Herald added a project: Restricted Project. · View Herald TranscriptJan 12 2023, 5:24 PM

Herald added a subscriber: StephenFan. · View Herald Transcript

We now have AMX cast intrinsics for converting between vector and x86_amx types, and the front end uses these AMX cast intrinsics instead of bitcast. I don't think we need this patch anymore.

Revision Contents

Path

Size

llvm/

lib/

Target/

X86/

X86LowerAMXType.cpp

136 lines

test/

CodeGen/

X86/

AMX/

amx-type.ll

92 lines

Diff 332098

llvm/lib/Target/X86/X86LowerAMXType.cpp

Show First 20 Lines • Show All 89 Lines • ▼ Show 20 Lines	case Intrinsic::x86_tdpbf16ps_internal: {
}		}
break;		break;
}		}
}		}

return std::make_pair(Row, Col);		return std::make_pair(Row, Col);
}		}

		static std::pair<Value , Value > getShape(PHINode *PN) {
		PHINode *InComingPN = nullptr;
		// Iterate the incoming value to find an AMX intrinsic.
		for (Value *IncValue : PN->incoming_values()) {
		if (dyn_cast<BitCastInst>(IncValue))
		RKSimonUnsubmitted Not Done Reply Inline Actions isa<> RKSimon: isa<>
		continue;
		// The first 2 operand is row and column when define an tile
		if (auto *II = dyn_cast<IntrinsicInst>(IncValue)) {
		Value *Row = II->getArgOperand(0);
		Value *Col = II->getArgOperand(1);
		return std::make_pair(Row, Col);
		}
		if (!InComingPN)
		InComingPN = dyn_cast<PHINode>(IncValue);
		}
		if (InComingPN)
		return getShape(InComingPN);

		// The incoming value is from load.
		return std::make_pair(nullptr, nullptr);
		}

		// %1 = load x86_amx, x86_amx* %0, align 64
		// %2 = call x86_amx @llvm.x86.tdpbssd.internal(%1, %1, %1, ...)
		// -->
		// %1 = call x86_amx @llvm.x86.tileloadd64.internal()
		// %2 = call x86_amx @llvm.x86.tdpbssd.internal(%1, %1, %1, ...)
		static void transformTileLoad(LoadInst *LD) {
		Value Row = nullptr, Col = nullptr;
		PHINode *PN = nullptr;
		for (auto UI = LD->use_begin(), UE = LD->use_end(); UI != UE;) {
		Use &U = *(UI++);
		if (dyn_cast<BitCastInst>(U.getUser()))
		RKSimonUnsubmitted Not Done Reply Inline Actions isa<> RKSimon: isa<>
		continue;
		if (auto *II = dyn_cast<IntrinsicInst>(U.getUser())) {
		unsigned OpNo = U.getOperandNo();
		std::tie(Row, Col) = getShape(II, OpNo);
		break;
		}
		// %1 = phi x86_amx [ %2, %for.body14 ], [ %3, %for.body24 ]
		if (!PN) {
		PN = dyn_cast<PHINode>(U.getUser());
		continue;
		}
		// store x86_amx %9, x86_amx* %addr, align 64
		}
		// No user is AMX intrinsic, we need get shape from PHI node.
		if (!Row) {
		if (PN)
		std::tie(Row, Col) = getShape(PN);
		// All users are store instruction. Transform them to vector
		// load/store.
		else {
		// TODO perform transform.
		return;
		}
		}

		IRBuilder<> Builder(LD);
		// Use the maximun column as stride.
		Value *Stride = Builder.getInt64(64);
		Value *I8Ptr =
		Builder.CreateBitCast(LD->getOperand(0), Builder.getInt8PtrTy());
		std::array<Value *, 4> Args = {Row, Col, I8Ptr, Stride};

		Value *NewInst =
		Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal, None, Args);
		LD->replaceAllUsesWith(NewInst);
		}

// %src = load <256 x i32>, <256 x i32>* %addr, align 64		// %src = load <256 x i32>, <256 x i32>* %addr, align 64
// %2 = bitcast <256 x i32> %src to x86_amx		// %2 = bitcast <256 x i32> %src to x86_amx
// -->		// -->
// %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col,		// %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col,
// i8* %addr, i64 %stride64)		// i8* %addr, i64 %stride64)
static void combineLoadBitcast(LoadInst LD, BitCastInst Bitcast) {		static void combineLoadBitcast(LoadInst LD, BitCastInst Bitcast) {
Value Row = nullptr, Col = nullptr;		Value Row = nullptr, Col = nullptr;
Use &U = *(Bitcast->use_begin());		Use &U = *(Bitcast->use_begin());
Show All 15 Lines
// %src = call x86_amx @llvm.x86.tileloadd64.internal(%row, %col, %addr,		// %src = call x86_amx @llvm.x86.tileloadd64.internal(%row, %col, %addr,
// %stride);		// %stride);
// %13 = bitcast x86_amx %src to <256 x i32>		// %13 = bitcast x86_amx %src to <256 x i32>
// store <256 x i32> %13, <256 x i32>* %addr, align 64		// store <256 x i32> %13, <256 x i32>* %addr, align 64
// -->		// -->
// call void @llvm.x86.tilestored64.internal(%row, %col, %addr,		// call void @llvm.x86.tilestored64.internal(%row, %col, %addr,
// %stride64, %13)		// %stride64, %13)
static void combineBitcastStore(BitCastInst Bitcast, StoreInst ST) {		static void combineBitcastStore(BitCastInst Bitcast, StoreInst ST) {
		Value Row, Col;
Value *Tile = Bitcast->getOperand(0);		Value *Tile = Bitcast->getOperand(0);
auto *II = cast<IntrinsicInst>(Tile);		if (auto *II = dyn_cast<IntrinsicInst>(Tile)) {
// Tile is output from AMX intrinsic. The first operand of the		// Tile is output from AMX intrinsic. The first operand of the
// intrinsic is row, the second operand of the intrinsic is column.		// intrinsic is row, the second operand of the intrinsic is column.
Value *Row = II->getOperand(0);		Row = II->getOperand(0);
Value *Col = II->getOperand(1);		Col = II->getOperand(1);
		} else if (auto *PN = dyn_cast<PHINode>(Tile))
		std::tie(Row, Col) = getShape(PN);
		// TODO else the def is load, transform load/store to vector load/store;

IRBuilder<> Builder(ST);		IRBuilder<> Builder(ST);
// Use the maximum column as stride. It must be the same with load		// Use the maximum column as stride. It must be the same with load
// stride.		// stride.
Value *Stride = Builder.getInt64(64);		Value *Stride = Builder.getInt64(64);
Value *I8Ptr =		Value *I8Ptr =
Builder.CreateBitCast(ST->getOperand(1), Builder.getInt8PtrTy());		Builder.CreateBitCast(ST->getOperand(1), Builder.getInt8PtrTy());
std::array<Value *, 5> Args = {Row, Col, I8Ptr, Stride, Tile};		std::array<Value *, 5> Args = {Row, Col, I8Ptr, Stride, Tile};
Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, None, Args);		Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, None, Args);
▲ Show 20 Lines • Show All 66 Lines • ▼ Show 20 Lines	if (Bitcast->getType()->isX86_AMXTy()) {
Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, None, Args);		Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, None, Args);
Value *NewInst = Builder.CreateLoad(Bitcast->getType(), AllocaAddr);		Value *NewInst = Builder.CreateLoad(Bitcast->getType(), AllocaAddr);
Bitcast->replaceAllUsesWith(NewInst);		Bitcast->replaceAllUsesWith(NewInst);
}		}

return true;		return true;
}		}

		// %addr = bitcast <256 x i32>* %tile to x86_amx*
		// store x86_amx %9, x86_amx* %addr, align 64
		// -->
		// call void @llvm.x86.tilestored64.internal(%row, %col, %addr,
		// %stride64, %9)
		static void transformTileStore(StoreInst *ST) {
		auto *II = cast<IntrinsicInst>(ST->getValueOperand());
		Value *Row = II->getOperand(0);
		Value *Col = II->getOperand(1);
		IRBuilder<> Builder(ST);
		// Use the maximum column as stride. It must be the same with load
		// stride.
		Value *Stride = Builder.getInt64(64);
		Value *I8Ptr =
		Builder.CreateBitCast(ST->getOperand(1), Builder.getInt8PtrTy());
		std::array<Value *, 5> Args = {Row, Col, I8Ptr, Stride,
		ST->getValueOperand()};
		Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, None, Args);
		}

namespace {		namespace {
class X86LowerAMXType {		class X86LowerAMXType {
Function &Func;		Function &Func;

public:		public:
X86LowerAMXType(Function &F) : Func(F) {}		X86LowerAMXType(Function &F) : Func(F) {}
bool visit();		bool visit();
};		};

bool X86LowerAMXType::visit() {		bool X86LowerAMXType::visit() {
SmallVector<Instruction *, 8> DeadInsts;		SmallVector<Instruction *, 8> DeadInsts;

for (BasicBlock *BB : post_order(&Func)) {		for (BasicBlock *BB : post_order(&Func)) {
for (BasicBlock::reverse_iterator II = BB->rbegin(), IE = BB->rend();		for (BasicBlock::reverse_iterator II = BB->rbegin(), IE = BB->rend();
II != IE;) {		II != IE;) {
Instruction &Inst = *II++;		Instruction &Inst = *II++;
auto *Bitcast = dyn_cast<BitCastInst>(&Inst);		auto *Bitcast = dyn_cast<BitCastInst>(&Inst);
if (!Bitcast)		if (!Bitcast)
continue;		continue;

Value *Src = Bitcast->getOperand(0);		Value *Src = Bitcast->getOperand(0);
		Type *Ty = Bitcast->getType();

		if (Ty->isPointerTy() &&
		cast<PointerType>(Ty)->getElementType()->isX86_AMXTy()) {
		for (auto UI = Bitcast->use_begin(), UE = Bitcast->use_end();
		UI != UE;) {
		Value *I = (UI++)->getUser();
		auto *LD = dyn_cast<LoadInst>(I);
		// %0 = bitcast <256 x i32>* %tile to x86_amx*
		// %1 = load x86_amx, x86_amx* %0, align 64
		if (LD) {
		transformTileLoad(LD);
		DeadInsts.push_back(LD);
		}
		auto *ST = dyn_cast<StoreInst>(I);
		if (ST) {
		RKSimonUnsubmitted Not Done Reply Inline Actions ''' if (auto *ST = dyn_cast<StoreInst>(I)) ''' RKSimon: ''' if (auto *ST = dyn_cast<StoreInst>(I)) '''
		// %addr = bitcast <256 x i32>* %tile to x86_amx*
		// store x86_amx %9, x86_amx* %addr, align 64
		// -->
		// call void @llvm.x86.tilestored64.internal(%row, %col, %addr,
		// %stride64, %9)
		transformTileStore(ST);
		DeadInsts.push_back(ST);
		}
		}
		// If the dst type is <256 x i32>*, it is valid intruction.
		// %0 = bitcast x86_amx* %tile to <256 x i32>*
		// %1 = load <256 x i32>, <256 x i32>* %0, align 64
		// store <256 x i32> %2, <256 x i32>* %0, align 64
		}
if (Bitcast->getType()->isX86_AMXTy()) {		if (Bitcast->getType()->isX86_AMXTy()) {
if (Bitcast->user_empty()) {		if (Bitcast->user_empty()) {
DeadInsts.push_back(Bitcast);		DeadInsts.push_back(Bitcast);
continue;		continue;
}		}
LoadInst *LD = dyn_cast<LoadInst>(Src);		LoadInst *LD = dyn_cast<LoadInst>(Src);
if (!LD) {		if (!LD) {
if (transformBitcast(Bitcast))		if (transformBitcast(Bitcast))
▲ Show 20 Lines • Show All 117 Lines • Show Last 20 Lines

llvm/test/CodeGen/X86/AMX/amx-type.ll

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py		; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -lower-amx-type %s -S \| FileCheck %s		; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -lower-amx-type %s -S \| FileCheck %s

%struct.__tile_str = type { i16, i16, <256 x i32> }		%struct.__tile_str = type { i16, i16, <256 x i32> }

@buf = dso_local global [1024 x i8] zeroinitializer, align 64		@buf = dso_local global [1024 x i8] zeroinitializer, align 64
@buf2 = dso_local global [1024 x i8] zeroinitializer, align 64		@buf2 = dso_local global [1024 x i8] zeroinitializer, align 64

		define dso_local void @test_amx_store(<256 x i32>* %in, i16 %m, i16 %n, i8 *%buf, i64 %s) #2 {
		; CHECK-LABEL: @test_amx_store(
		; CHECK-NEXT: entry:
		; CHECK-NEXT: [[T0:%.]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.]], i16 [[N:%.]], i8 [[BUF:%.]], i64 [[S:%.]])
		; CHECK-NEXT: [[ADDR:%.]] = bitcast <256 x i32> [[IN:%.]] to x86_amx
		; CHECK-NEXT: [[TMP0:%.]] = bitcast x86_amx [[ADDR]] to i8*
		; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], i8* [[TMP0]], i64 64, x86_amx [[T0]])
		; CHECK-NEXT: ret void
		;
		entry:
		%t0 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %m, i16 %n, i8* %buf, i64 %s) #3
		%addr = bitcast <256 x i32>* %in to x86_amx*
		store x86_amx %t0, x86_amx* %addr, align 64
		ret void
		}

		define dso_local void @test_amx_load(<256 x i32>* %in, i16 %m, i16 %n, i8 *%buf, i64 %s) #2 {
		; CHECK-LABEL: @test_amx_load(
		; CHECK-NEXT: entry:
		; CHECK-NEXT: [[T0:%.]] = bitcast <256 x i32> [[IN:%.]] to x86_amx
		; CHECK-NEXT: [[TMP0:%.]] = bitcast x86_amx [[T0]] to i8*
		; CHECK-NEXT: [[TMP1:%.]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.]], i16 [[N:%.]], i8 [[TMP0]], i64 64)
		; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], i8* [[BUF:%.]], i64 [[S:%.]], x86_amx [[TMP1]])
		; CHECK-NEXT: ret void
		;
		entry:
		%t0 = bitcast <256 x i32>* %in to x86_amx*
		%t1 = load x86_amx, x86_amx* %t0, align 64
		call void @llvm.x86.tilestored64.internal(i16 %m, i16 %n, i8* %buf, i64 %s, x86_amx %t1) #3
		ret void
		}

; test bitcast x86_amx to <256 x i32>		; test bitcast x86_amx to <256 x i32>
define dso_local void @test_user_empty(i16 %m, i16 %n, i8 *%buf, i64 %s) {		define dso_local void @test_user_empty(i16 %m, i16 %n, i8 *%buf, i64 %s) {
; CHECK-LABEL: @test_user_empty(		; CHECK-LABEL: @test_user_empty(
; CHECK-NEXT: entry:		; CHECK-NEXT: entry:
; CHECK-NEXT: [[T1:%.]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.]], i16 [[N:%.]], i8 [[BUF:%.]], i64 [[S:%.]])		; CHECK-NEXT: [[T1:%.]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.]], i16 [[N:%.]], i8 [[BUF:%.]], i64 [[S:%.]])
; CHECK-NEXT: ret void		; CHECK-NEXT: ret void
;		;
entry:		entry:
▲ Show 20 Lines • Show All 203 Lines • ▼ Show 20 Lines	;
%9 = load <256 x i32>, <256 x i32>* %8, align 64		%9 = load <256 x i32>, <256 x i32>* %8, align 64
%10 = bitcast <256 x i32> %9 to x86_amx		%10 = bitcast <256 x i32> %9 to x86_amx
%11 = shl i64 %1, 32		%11 = shl i64 %1, 32
%12 = ashr exact i64 %11, 32		%12 = ashr exact i64 %11, 32
tail call void @llvm.x86.tilestored64.internal(i16 %5, i16 %7, i8* %0, i64 %12, x86_amx %10)		tail call void @llvm.x86.tilestored64.internal(i16 %5, i16 %7, i8* %0, i64 %12, x86_amx %10)
ret void		ret void
}		}

		define linkonce_odr dso_local void @test_amxptr(<256 x i32>* %arrayidx16, <256 x i32>* %arrayidx29, <256 x i32>* %arrayidx35) {
		; CHECK-LABEL: @test_amxptr(
		; CHECK-NEXT: entry:
		; CHECK-NEXT: br label [[FOR_COND9:%.*]]
		; CHECK: for.cond9:
		; CHECK-NEXT: br i1 undef, label [[FOR_BODY14:%.]], label [[EXIT:%.]]
		; CHECK: for.body14:
		; CHECK-NEXT: [[TMP0:%.]] = bitcast <256 x i32> [[ARRAYIDX16:%.]] to x86_amx
		; CHECK-NEXT: [[TMP1:%.]] = bitcast x86_amx [[TMP0]] to i8*
		; CHECK-NEXT: [[TMP2:%.]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 1, i16 4, i8 [[TMP1]], i64 64)
		; CHECK-NEXT: br label [[FOR_COND18:%.*]]
		; CHECK: for.cond18:
		; CHECK-NEXT: [[TMP3:%.]] = phi x86_amx [ [[TMP2]], [[FOR_BODY14]] ], [ [[T11:%.]], [[FOR_BODY24:%.*]] ]
		; CHECK-NEXT: br i1 undef, label [[FOR_BODY24]], label [[FOR_COND_CLEANUP23:%.*]]
		; CHECK: for.cond.cleanup23:
		; CHECK-NEXT: [[TMP4:%.]] = bitcast <256 x i32> [[ARRAYIDX16]] to i8*
		; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 1, i16 4, i8* [[TMP4]], i64 64, x86_amx [[TMP3]])
		; CHECK-NEXT: br label [[FOR_COND9]]
		; CHECK: for.body24:
		; CHECK-NEXT: [[TMP5:%.]] = bitcast <256 x i32> [[ARRAYIDX29:%.]] to i8
		; CHECK-NEXT: [[TMP6:%.]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 1, i16 4, i8 [[TMP5]], i64 64)
		; CHECK-NEXT: [[TMP7:%.]] = bitcast <256 x i32> [[ARRAYIDX35:%.]] to i8
		; CHECK-NEXT: [[TMP8:%.]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 4, i16 4, i8 [[TMP7]], i64 64)
		; CHECK-NEXT: [[T11]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 1, i16 4, i16 4, x86_amx [[TMP3]], x86_amx [[TMP6]], x86_amx [[TMP8]])
		; CHECK-NEXT: br label [[FOR_COND18]]
		; CHECK: exit:
		; CHECK-NEXT: ret void
		;
		entry:
		br label %for.cond9

		for.cond9: ; preds = %for.cond.cleanup23, %entry
		br i1 undef, label %for.body14, label %exit

		for.body14: ; preds = %for.cond9
		%0 = bitcast <256 x i32>* %arrayidx16 to x86_amx*
		%t51 = load x86_amx, x86_amx* %0, align 64
		br label %for.cond18

		for.cond18: ; preds = %for.body24, %for.body14
		%1 = phi x86_amx [ %t51, %for.body14 ], [ %t11, %for.body24 ]
		br i1 undef, label %for.body24, label %for.cond.cleanup23

		for.cond.cleanup23: ; preds = %for.cond18
		%2 = bitcast x86_amx %1 to <256 x i32>
		store <256 x i32> %2, <256 x i32>* %arrayidx16, align 64
		br label %for.cond9

		for.body24: ; preds = %for.cond18
		%t6 = load <256 x i32>, <256 x i32>* %arrayidx29, align 64
		%t7 = load <256 x i32>, <256 x i32>* %arrayidx35, align 64
		%t9 = bitcast <256 x i32> %t6 to x86_amx
		%t10 = bitcast <256 x i32> %t7 to x86_amx
		%t11 = call x86_amx @llvm.x86.tdpbssd.internal(i16 1, i16 4, i16 4, x86_amx %1, x86_amx %t9, x86_amx %t10)
		br label %for.cond18

		exit: ; preds = %for.cond9
		ret void
		}

declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)		declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)		declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)		declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)