Diff 321665

clang/include/clang/Basic/BuiltinsX86_64.def

	Show First 20 Lines • Show All 97 Lines • ▼ Show 20 Lines
	TARGET_BUILTIN(__builtin_ia32_clui, "v", "n", "uintr")			TARGET_BUILTIN(__builtin_ia32_clui, "v", "n", "uintr")
	TARGET_BUILTIN(__builtin_ia32_stui, "v", "n", "uintr")			TARGET_BUILTIN(__builtin_ia32_stui, "v", "n", "uintr")
	TARGET_BUILTIN(__builtin_ia32_testui, "Uc", "n", "uintr")			TARGET_BUILTIN(__builtin_ia32_testui, "Uc", "n", "uintr")
	TARGET_BUILTIN(__builtin_ia32_senduipi, "vUWi", "n", "uintr")			TARGET_BUILTIN(__builtin_ia32_senduipi, "vUWi", "n", "uintr")

	// AMX internal builtin			// AMX internal builtin
	TARGET_BUILTIN(__builtin_ia32_tileloadd64_internal, "V256iUsUsvC*z", "n", "amx-tile")			TARGET_BUILTIN(__builtin_ia32_tileloadd64_internal, "V256iUsUsvC*z", "n", "amx-tile")
	TARGET_BUILTIN(__builtin_ia32_tdpbssd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")			TARGET_BUILTIN(__builtin_ia32_tdpbssd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
				TARGET_BUILTIN(__builtin_ia32_tdpbf16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-bf16")
	TARGET_BUILTIN(__builtin_ia32_tilestored64_internal, "vUsUsv*zV256i", "n", "amx-tile")			TARGET_BUILTIN(__builtin_ia32_tilestored64_internal, "vUsUsv*zV256i", "n", "amx-tile")
	TARGET_BUILTIN(__builtin_ia32_tilezero_internal, "V256iUsUs", "n", "amx-tile")			TARGET_BUILTIN(__builtin_ia32_tilezero_internal, "V256iUsUs", "n", "amx-tile")
	// AMX			// AMX
	TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile")			TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile")
	TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile")			TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile")
	TARGET_BUILTIN(__builtin_ia32_tilerelease, "v", "n", "amx-tile")			TARGET_BUILTIN(__builtin_ia32_tilerelease, "v", "n", "amx-tile")
	TARGET_BUILTIN(__builtin_ia32_tilezero, "vUc", "n", "amx-tile")			TARGET_BUILTIN(__builtin_ia32_tilezero, "vUc", "n", "amx-tile")

	Show All 14 Lines

clang/lib/Headers/amxintrin.h

	/===--------------- amxintrin.h - AMX intrinsics -- C/C++ -*---------------===			/===--------------- amxintrin.h - AMX intrinsics -- C/C++ -*---------------===
				Lint: Lint Inline Actions clang-format not found in user's PATH; not linting file. Lint: Lint: clang-format not found in user's PATH; not linting file.
	*			*
	* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.			* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	* See https://llvm.org/LICENSE.txt for license information.			* See https://llvm.org/LICENSE.txt for license information.
	* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception			* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	*			*
	*===------------------------------------------------------------------------===			*===------------------------------------------------------------------------===
	*/			*/
	▲ Show 20 Lines • Show All 210 Lines • ▼ Show 20 Lines
	/// \param src1			/// \param src1
	/// The 2nd source tile. Max size is 1024 Bytes.			/// The 2nd source tile. Max size is 1024 Bytes.
	#define _tile_dpbf16ps(dst, src0, src1) \			#define _tile_dpbf16ps(dst, src0, src1) \
	__builtin_ia32_tdpbf16ps((dst), (src0), (src1))			__builtin_ia32_tdpbf16ps((dst), (src0), (src1))

	#define __DEFAULT_FN_ATTRS_INT8 \			#define __DEFAULT_FN_ATTRS_INT8 \
	__attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))			__attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))

				/* Function attribute set: always inlined, no debug info, and compiled with
				 * the "amx-bf16" target feature enabled. */
				#define __DEFAULT_FN_ATTRS_BF16 \
				__attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))

	typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));			typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
	static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8			static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
	_tile_loadd_internal(unsigned short m, unsigned short n, const void *base,			_tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
	__SIZE_TYPE__ stride) {			__SIZE_TYPE__ stride) {
	return __builtin_ia32_tileloadd64_internal(m, n, base,			return __builtin_ia32_tileloadd64_internal(m, n, base,
	(__SIZE_TYPE__)(stride));			(__SIZE_TYPE__)(stride));
	}			}

	static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8			static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
	_tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,			_tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
	_tile1024i dst, _tile1024i src1, _tile1024i src2) {			_tile1024i dst, _tile1024i src1, _tile1024i src2) {
	return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);			return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
	}			}

				/// Thin wrapper that forwards the shape operands (m, n, k) and the three
				/// tile values unchanged to __builtin_ia32_tdpbf16ps_internal and returns
				/// the builtin's result.
				///
				/// The builtin is declared with the "amx-bf16" target feature, so this
				/// wrapper must be compiled with __DEFAULT_FN_ATTRS_BF16; the INT8 attribute
				/// set only enables "amx-int8".
				static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16
				_tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
				                        _tile1024i dst, _tile1024i src1, _tile1024i src2) {
				  return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
				}

	static __inline__ void __DEFAULT_FN_ATTRS_INT8			static __inline__ void __DEFAULT_FN_ATTRS_INT8
	_tile_stored_internal(unsigned short m, unsigned short n, void *base,			_tile_stored_internal(unsigned short m, unsigned short n, void *base,
	__SIZE_TYPE__ stride, _tile1024i tile) {			__SIZE_TYPE__ stride, _tile1024i tile) {
	return __builtin_ia32_tilestored64_internal(m, n, base,			return __builtin_ia32_tilestored64_internal(m, n, base,
	(__SIZE_TYPE__)(stride), tile);			(__SIZE_TYPE__)(stride), tile);
	}			}

	typedef struct __tile1024i_str {			typedef struct __tile1024i_str {
	Show All 10 Lines

	__DEFAULT_FN_ATTRS_INT8			__DEFAULT_FN_ATTRS_INT8
	static void __tile_dpbssd(__tile1024i *dst, __tile1024i src1,			static void __tile_dpbssd(__tile1024i *dst, __tile1024i src1,
	__tile1024i src2) {			__tile1024i src2) {
	dst->tile = _tile_dpbssd_internal(src1.row, src2.col, src1.col, dst->tile,			dst->tile = _tile_dpbssd_internal(src1.row, src2.col, src1.col, dst->tile,
	src1.tile, src2.tile);			src1.tile, src2.tile);
	}			}

				/// Updates dst->tile in place with the result of _tile_dpbf16ps_internal.
				/// The shape passed down is (src1.row, src2.col, src1.col); the current
				/// dst->tile is passed as the accumulator input, followed by src1.tile and
				/// src2.tile.
				///
				/// This is a BF16 operation, so it carries the "amx-bf16" attribute set
				/// rather than the "amx-int8" one.
				__DEFAULT_FN_ATTRS_BF16
				static void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src1,
				                            __tile1024i src2) {
				  dst->tile = _tile_dpbf16ps_internal(src1.row, src2.col, src1.col, dst->tile,
				                                      src1.tile, src2.tile);
				}

	__DEFAULT_FN_ATTRS_TILE			__DEFAULT_FN_ATTRS_TILE
	static void __tile_stored(void *base, __SIZE_TYPE__ stride, __tile1024i src) {			static void __tile_stored(void *base, __SIZE_TYPE__ stride, __tile1024i src) {
	_tile_stored_internal(src.row, src.col, base, stride, src.tile);			_tile_stored_internal(src.row, src.col, base, stride, src.tile);
	}			}

	__DEFAULT_FN_ATTRS_TILE			__DEFAULT_FN_ATTRS_TILE
	static void __tile_zero(__tile1024i *dst) {			static void __tile_zero(__tile1024i *dst) {
	dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);			dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
	}			}

	#endif /* __x86_64__ */			#endif /* __x86_64__ */
	#endif /* __AMXINTRIN_H */			#endif /* __AMXINTRIN_H */

llvm/include/llvm/IR/IntrinsicsX86.td

Show First 20 Lines • Show All 5,047 Lines • ▼ Show 20 Lines	def int_x86_tileloadd64_internal :
[llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],		[llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
[]>;		[]>;
def int_x86_tdpbssd_internal :		def int_x86_tdpbssd_internal :
GCCBuiltin<"__builtin_ia32_tdpbssd_internal">,		GCCBuiltin<"__builtin_ia32_tdpbssd_internal">,
Intrinsic<[llvm_x86amx_ty],		Intrinsic<[llvm_x86amx_ty],
[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty,		[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty,
llvm_x86amx_ty, llvm_x86amx_ty,		llvm_x86amx_ty, llvm_x86amx_ty,
llvm_x86amx_ty], []>;		llvm_x86amx_ty], []>;
		// Internal intrinsic exposed to clang as __builtin_ia32_tdpbf16ps_internal.
		// Takes three i16 operands followed by three x86_amx operands and returns
		// an x86_amx value; no intrinsic properties are attached ([]).
		def int_x86_tdpbf16ps_internal :
		GCCBuiltin<"__builtin_ia32_tdpbf16ps_internal">,
		Intrinsic<[llvm_x86amx_ty],
		[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty,
		llvm_x86amx_ty, llvm_x86amx_ty,
		llvm_x86amx_ty], []>;
def int_x86_tilestored64_internal :		def int_x86_tilestored64_internal :
GCCBuiltin<"__builtin_ia32_tilestored64_internal">,		GCCBuiltin<"__builtin_ia32_tilestored64_internal">,
Intrinsic<[], [llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty,		Intrinsic<[], [llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty,
llvm_i64_ty, llvm_x86amx_ty], []>;		llvm_i64_ty, llvm_x86amx_ty], []>;
def int_x86_tilezero_internal :		def int_x86_tilezero_internal :
GCCBuiltin<"__builtin_ia32_tilezero_internal">,		GCCBuiltin<"__builtin_ia32_tilezero_internal">,
Intrinsic<[llvm_x86amx_ty], [llvm_i16_ty, llvm_i16_ty],		Intrinsic<[llvm_x86amx_ty], [llvm_i16_ty, llvm_i16_ty],
[]>;		[]>;
Show All 15 Lines

llvm/lib/Target/X86/X86ExpandPseudo.cpp

//===------- X86ExpandPseudo.cpp - Expand pseudo instructions -------------===//		//===------- X86ExpandPseudo.cpp - Expand pseudo instructions -------------===//
		Lint: Lint Inline Actions clang-format not found in user's PATH; not linting file. Lint: Lint: clang-format not found in user's PATH; not linting file.
//		//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.		// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
//		//
▲ Show 20 Lines • Show All 461 Lines • ▼ Show 20 Lines	bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case X86::PTDPBSSDV: {		case X86::PTDPBSSDV: {
MI.untieRegOperand(4);		MI.untieRegOperand(4);
for (unsigned i = 3; i > 0; --i)		for (unsigned i = 3; i > 0; --i)
MI.RemoveOperand(i);		MI.RemoveOperand(i);
MI.setDesc(TII->get(X86::TDPBSSD));		MI.setDesc(TII->get(X86::TDPBSSD));
MI.tieOperands(0, 1);		MI.tieOperands(0, 1);
return true;		return true;
}		}
		case X86::PTDPBF16PSV: {
		MI.untieRegOperand(4);
		for (unsigned i = 3; i > 0; --i)
		MI.RemoveOperand(i);
		MI.setDesc(TII->get(X86::TDPBF16PS));
		MI.tieOperands(0, 1);
		return true;
		}
case X86::PTILESTOREDV: {		case X86::PTILESTOREDV: {
for (int i = 1; i >= 0; --i)		for (int i = 1; i >= 0; --i)
MI.RemoveOperand(i);		MI.RemoveOperand(i);
MI.setDesc(TII->get(X86::TILESTORED));		MI.setDesc(TII->get(X86::TILESTORED));
return true;		return true;
}		}
case X86::PTILEZEROV: {		case X86::PTILEZEROV: {
for (int i = 2; i > 0; --i) // Remove row, col		for (int i = 2; i > 0; --i) // Remove row, col
▲ Show 20 Lines • Show All 41 Lines • Show Last 20 Lines

llvm/lib/Target/X86/X86ISelDAGToDAG.cpp

//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//		//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
		Lint: Lint Inline Actions clang-format not found in user's PATH; not linting file. Lint: Lint: clang-format not found in user's PATH; not linting file.
//		//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.		// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
//		//
▲ Show 20 Lines • Show All 4,624 Lines • ▼ Show 20 Lines	case Intrinsic::x86_tdpbssd_internal: {
Node->getOperand(6),		Node->getOperand(6),
Node->getOperand(7),		Node->getOperand(7),
Chain};		Chain};
MachineSDNode *CNode =		MachineSDNode *CNode =
CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);		CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
ReplaceNode(Node, CNode);		ReplaceNode(Node, CNode);
return;		return;
}		}
		case Intrinsic::x86_tdpbf16ps_internal: {
		if (!Subtarget->hasAMXTILE())
		break;
		SDValue Chain = Node->getOperand(0);
		unsigned Opc = X86::PTDPBF16PSV;
		SDValue Ops[] = {Node->getOperand(2),
		Node->getOperand(3),
		Node->getOperand(4),
		Node->getOperand(5),
		Node->getOperand(6),
		Node->getOperand(7),
		Chain};
		MachineSDNode *CNode =
		CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
		ReplaceNode(Node, CNode);
		return;
		}
case Intrinsic::x86_tilezero_internal: {		case Intrinsic::x86_tilezero_internal: {
if (!Subtarget->hasAMXTILE())		if (!Subtarget->hasAMXTILE())
break;		break;
unsigned Opc = X86::PTILEZEROV;		unsigned Opc = X86::PTILEZEROV;
SDValue Chain = Node->getOperand(0);		SDValue Chain = Node->getOperand(0);
SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), Chain};		SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), Chain};
MachineSDNode *CNode =		MachineSDNode *CNode =
CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);		CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
▲ Show 20 Lines • Show All 1,365 Lines • Show Last 20 Lines

llvm/lib/Target/X86/X86InstrAMX.td

Show First 20 Lines • Show All 130 Lines • ▼ Show 20 Lines	let SchedRW = [WriteSystem] in {
let usesCustomInserter = 1 in {		let usesCustomInserter = 1 in {
// Pseudo instructions, using immediates instead of tile registers.		// Pseudo instructions, using immediates instead of tile registers.
// To be translated to the actual instructions in X86ISelLowering.cpp		// To be translated to the actual instructions in X86ISelLowering.cpp
def PTDPBF16PS : PseudoI<(outs), (ins u8imm:$src1,		def PTDPBF16PS : PseudoI<(outs), (ins u8imm:$src1,
u8imm:$src2, u8imm:$src3),		u8imm:$src2, u8imm:$src3),
[(int_x86_tdpbf16ps timm:$src1,		[(int_x86_tdpbf16ps timm:$src1,
timm:$src2, timm:$src3)]>;		timm:$src2, timm:$src3)]>;
}		}
		// Pseudo instruction for RA.
		// Takes three GR16 operands ($src1-$src3) and three TILE operands
		// ($src4-$src6) and defines one TILE result; $src4 is constrained to the
		// same register as $dst. It has no selection pattern ([]).
		let Constraints = "$src4 = $dst" in
		def PTDPBF16PSV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
		GR16:$src2, GR16:$src3, TILE:$src4,
		TILE:$src5, TILE:$src6), []>;
}		}
} // HasAMXTILE, HasAMXBF16		} // HasAMXTILE, HasAMXBF16

llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp

//===- llvm/CodeGen/TileShapeInfo.h - ---------------------------- C++ --===//		//===- llvm/CodeGen/TileShapeInfo.h - ---------------------------- C++ --===//
		Lint: Lint Inline Actions clang-format not found in user's PATH; not linting file. Lint: Lint: clang-format not found in user's PATH; not linting file.
//		//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.		// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
//		//
▲ Show 20 Lines • Show All 292 Lines • ▼ Show 20 Lines	static Value createTileDPBSSDLoops(BasicBlock Start, BasicBlock *End,
Value *NewVecC = B.CreateInsertElement(VecCPhi, ResElt, IdxC);		Value *NewVecC = B.CreateInsertElement(VecCPhi, ResElt, IdxC);
VecCPhi->addIncoming(NewVecC, InnerLoopLatch);		VecCPhi->addIncoming(NewVecC, InnerLoopLatch);
VecPhi_Row_Loop->addIncoming(NewVecC, RowLatch);		VecPhi_Row_Loop->addIncoming(NewVecC, RowLatch);
VecPhi_Col_Loop->addIncoming(NewVecC, ColLoopLatch);		VecPhi_Col_Loop->addIncoming(NewVecC, ColLoopLatch);

return NewVecC;		return NewVecC;
}		}

		/// Build a three-deep loop nest (rows, columns, inner) between \p Start and
		/// \p End that computes the tdpbf16ps result element by element on a
		/// <256 x i32> vector.
		///
		/// \param Start  Block the row loop is entered from.
		/// \param End    Block the row loop exits to.
		/// \param B      Builder used to emit all new instructions.
		/// \param DTU    Dominator tree updater forwarded to createLoop.
		/// \param LI     Loop info; the three new loops are registered here.
		/// \param Row    Trip count of the row loop.
		/// \param Col    Trip count of the column loop.
		/// \param K      Trip count of the inner loop.
		/// \param Acc    Accumulator tile; expected to be a bitcast from a vector.
		/// \param LHS    First source tile; expected to be a bitcast from a vector.
		/// \param RHS    Second source tile; expected to be a bitcast from a vector.
		/// \returns the vector value produced by the insertelement in the inner body.
		static Value *createTileDPBF16PSLoops(BasicBlock *Start, BasicBlock *End,
		                                      IRBuilderBase &B, DomTreeUpdater &DTU,
		                                      LoopInfo &LI, Value *Row, Value *Col,
		                                      Value *K, Value *Acc, Value *LHS,
		                                      Value *RHS) {
		  // Allocate the loops and wire up their nesting before creating any blocks.
		  Loop *RowLoop = LI.AllocateLoop();
		  Loop *ColLoop = LI.AllocateLoop();
		  Loop *InnerLoop = LI.AllocateLoop();
		  ColLoop->addChildLoop(InnerLoop);
		  RowLoop->addChildLoop(ColLoop);
		  if (Loop *ParentL = LI.getLoopFor(Start))
		    ParentL->addChildLoop(RowLoop);
		  else
		    LI.addTopLevelLoop(RowLoop);

		  BasicBlock *RowBody =
		      createLoop(Start, End, Row, B.getInt16(1), "tiledpbf16ps.unroll.rows", B,
		                 DTU, RowLoop, LI);
		  BasicBlock *RowLatch = RowBody->getSingleSuccessor();

		  BasicBlock *ColBody =
		      createLoop(RowBody, RowLatch, Col, B.getInt16(1),
		                 "tiledpbf16ps.unroll.cols", B, DTU, ColLoop, LI);
		  BasicBlock *ColLoopLatch = ColBody->getSingleSuccessor();

		  B.SetInsertPoint(ColBody->getTerminator());
		  // NOTE(review, pengfei): Can we create vecC with <256 x float>?
		  // NOTE(review, yubing): In fact, we are trying to find a bitcast whose
		  // operand is <256 x i32>.
		  BasicBlock *InnerBody =
		      createLoop(ColBody, ColLoopLatch, K, B.getInt16(1),
		                 "tiledpbf16ps.unroll.inner", B, DTU, InnerLoop, LI);

		  BasicBlock *ColumnLoopHeader = ColBody->getSinglePredecessor();
		  BasicBlock *RowLoopHeader = RowBody->getSinglePredecessor();
		  BasicBlock *InnerLoopHeader = InnerBody->getSinglePredecessor();
		  BasicBlock *InnerLoopLatch = InnerBody->getSingleSuccessor();
		  // The first instruction of each header is used as that loop's index.
		  Value *CurrentRow = &*RowLoopHeader->begin();
		  Value *CurrentCol = &*ColumnLoopHeader->begin();
		  Value *CurrentInner = &*InnerLoopHeader->begin();

		  FixedVectorType *V256I32Ty = FixedVectorType::get(B.getInt32Ty(), 256);

		  // Recover the vector operands behind the x86_amx bitcasts. The pointers
		  // start out null so that a non-bitcast operand trips the assert instead of
		  // reading an uninitialized pointer.
		  // TODO else create BitCast from x86amx to v256i32.
		  // Store x86amx to memory, and reload from memory
		  // to vector. However with -O0, it doesn't happen.
		  Value *VecC = nullptr, *VecA = nullptr, *VecB = nullptr;
		  if (auto *BitCast = dyn_cast<BitCastInst>(Acc))
		    VecC = BitCast->getOperand(0);
		  assert(VecC && VecC->getType()->isVectorTy() &&
		         "bitcast from non-v256i32 to x86amx");
		  // NOTE(review, pengfei): better to use EltCF32 or CF32
		  if (auto *BitCast = dyn_cast<BitCastInst>(LHS))
		    VecA = BitCast->getOperand(0);
		  assert(VecA && VecA->getType()->isVectorTy() &&
		         "bitcast from non-v256i32 to x86amx");
		  if (auto *BitCast = dyn_cast<BitCastInst>(RHS))
		    VecB = BitCast->getOperand(0);
		  assert(VecB && VecB->getType()->isVectorTy() &&
		         "bitcast from non-v256i32 to x86amx");

		  // NOTE(review, pengfei): ditto
		  // NOTE(review, pengfei): Better to define a variable for it and reuse.
		  // tiledpbf16ps.unroll.rows.header:
		  // %vec.phi.rows = phi <256 x i32> [ %vec_c, %continue ], [ %NewVecC,
		  // %tiledpbf16ps.unroll.rows.latch ]
		  B.SetInsertPoint(RowLoopHeader->getTerminator());
		  PHINode *VecPhi_Row_Loop = B.CreatePHI(V256I32Ty, 2, "vec.phi.row");
		  VecPhi_Row_Loop->addIncoming(VecC, Start);

		  // tiledpbf16ps.unroll.cols.header:
		  // %vec.phi.cols = phi <256 x i32> [ %vec.phi.rows,
		  // %tiledpbf16ps.unroll.rows.body ], [ %NewVecC,
		  // %tiledpbf16ps.unroll.cols.latch ]
		  B.SetInsertPoint(ColumnLoopHeader->getTerminator());
		  PHINode *VecPhi_Col_Loop = B.CreatePHI(V256I32Ty, 2, "vec.phi.col");
		  VecPhi_Col_Loop->addIncoming(VecPhi_Row_Loop, RowBody);

		  // Generate PHI vector for C.
		  B.SetInsertPoint(InnerLoopHeader->getTerminator());
		  PHINode *VecCPhi = B.CreatePHI(V256I32Ty, 2, "vec.phi");
		  VecCPhi->addIncoming(VecPhi_Col_Loop, ColBody);

		  // Generate accumulate multiply in the inner body.
		  B.SetInsertPoint(InnerBody->getTerminator());
		  // Element index = major index * 16 + minor index.
		  Value *RowStride = B.getInt16(16);
		  Value *IdxC = B.CreateAdd(B.CreateMul(CurrentRow, RowStride), CurrentCol);
		  Value *IdxA = B.CreateAdd(B.CreateMul(CurrentRow, RowStride), CurrentInner);
		  Value *IdxB = B.CreateAdd(B.CreateMul(CurrentInner, RowStride), CurrentCol);

		  FixedVectorType *V2I16Ty = FixedVectorType::get(B.getInt16Ty(), 2);
		  FixedVectorType *V2I32Ty = FixedVectorType::get(B.getInt32Ty(), 2);
		  FixedVectorType *V2F32Ty = FixedVectorType::get(B.getFloatTy(), 2);

		  // Reinterpret the i32 accumulator element as a float.
		  Value *EltC = B.CreateExtractElement(VecCPhi, IdxC);
		  Value *C_F32 = B.CreateBitCast(EltC, B.getFloatTy());

		  // Each i32 source element is viewed as two i16 halves.
		  Value *EltA = B.CreateExtractElement(VecA, IdxA);
		  Value *SubVecA = B.CreateBitCast(EltA, V2I16Ty);
		  Value *EltB = B.CreateExtractElement(VecB, IdxB);
		  Value *SubVecB = B.CreateBitCast(EltB, V2I16Ty);

		  // Widen each i16 half to i32, shift it into the upper 16 bits and
		  // reinterpret the result as a float.
		  Value *Shift16 = B.CreateVectorSplat(2, B.getInt32(16));
		  Value *A_V2F32 = B.CreateBitCast(
		      B.CreateShl(B.CreateZExt(SubVecA, V2I32Ty), Shift16), V2F32Ty);
		  Value *B_V2F32 = B.CreateBitCast(
		      B.CreateShl(B.CreateZExt(SubVecB, V2I32Ty), Shift16), V2F32Ty);

		  // acc + sum(a[i] * b[i]) over the two lanes, stored back as raw i32 bits.
		  Value *SubVecR = B.CreateFAddReduce(C_F32, B.CreateFMul(A_V2F32, B_V2F32));
		  Value *ResElt = B.CreateBitCast(SubVecR, B.getInt32Ty());
		  Value *NewVecC = B.CreateInsertElement(VecCPhi, ResElt, IdxC);

		  // Close the PHI cycles with the updated accumulator vector.
		  VecCPhi->addIncoming(NewVecC, InnerLoopLatch);
		  VecPhi_Row_Loop->addIncoming(NewVecC, RowLatch);
		  VecPhi_Col_Loop->addIncoming(NewVecC, ColLoopLatch);

		  return NewVecC;
		}

namespace {		namespace {
class X86LowerAMXIntrinsics {		class X86LowerAMXIntrinsics {
Function &Func;		Function &Func;

public:		public:
X86LowerAMXIntrinsics(Function &F, DominatorTree DT, LoopInfo LI)		X86LowerAMXIntrinsics(Function &F, DominatorTree DT, LoopInfo LI)
: Func(F), DT(DT), LI(LI) {}		: Func(F), DT(DT), LI(LI) {}
bool visit();		bool visit();

private:		private:
DominatorTree *DT;		DominatorTree *DT;
LoopInfo *LI;		LoopInfo *LI;
bool lowerTileLoad(Instruction *TileLoad);		bool lowerTileLoad(Instruction *TileLoad);
bool lowerTileDPBSSD(Instruction *TileDPBSSD);		bool lowerTileDPBSSD(Instruction *TileDPBSSD);
		bool lowerTileDPBF16PS(Instruction *TileDPBSSD);
bool lowerTileStore(Instruction *TileStore);		bool lowerTileStore(Instruction *TileStore);
bool lowerTileZero(Instruction *TileZero);		bool lowerTileZero(Instruction *TileZero);
		pengfeiUnsubmitted Not Done Reply Inline Actions Is it concise to use below? template <Intrinsic::ID IntrID> typename std::enable_if_t< IntrID == Intrinsic::x86_tdpbssd_internal \|\| IntrID == Intrinsic::x86_tdpbf16ps_internal, bool> lowerTileDP(Instruction TileDP); pengfei:* Is it concise to use below? ``` template <Intrinsic::ID IntrID> typename std::enable_if_t<…
};		};

bool X86LowerAMXIntrinsics::lowerTileDPBSSD(Instruction *TileDPBSSD) {		bool X86LowerAMXIntrinsics::lowerTileDPBSSD(Instruction *TileDPBSSD) {
Value M, N, K, C, A, B;		Value M, N, K, C, A, B;
match(TileDPBSSD, m_Intrinsic<Intrinsic::x86_tdpbssd_internal>(		match(TileDPBSSD, m_Intrinsic<Intrinsic::x86_tdpbssd_internal>(
m_Value(M), m_Value(N), m_Value(K), m_Value(C),		m_Value(M), m_Value(N), m_Value(K), m_Value(C),
m_Value(A), m_Value(B)));		m_Value(A), m_Value(B)));
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);		DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
Show All 21 Lines	if (match(I, m_BitCast(m_Value(Vec)))) {
I->replaceAllUsesWith(ResVec);		I->replaceAllUsesWith(ResVec);
I->eraseFromParent();		I->eraseFromParent();
}		}
}		}
TileDPBSSD->eraseFromParent();		TileDPBSSD->eraseFromParent();
return true;		return true;
}		}

		/// Replace one call to llvm.x86.tdpbf16ps.internal with a scalar loop nest.
		///
		/// The block is split at the call, the loop nest is emitted between the two
		/// halves by createTileDPBF16PSLoops, every bitcast user of the call is
		/// replaced with the vector the loops produce, and the call itself is erased.
		///
		/// \param TileDPBF16PS  The intrinsic call to lower.
		/// \returns true (the IR is always modified).
		bool X86LowerAMXIntrinsics::lowerTileDPBF16PS(Instruction *TileDPBF16PS) {
		  Value *M = nullptr, *N = nullptr, *K = nullptr;
		  Value *C = nullptr, *A = nullptr, *B = nullptr;
		  bool IsDPBF16PS =
		      match(TileDPBF16PS, m_Intrinsic<Intrinsic::x86_tdpbf16ps_internal>(
		                              m_Value(M), m_Value(N), m_Value(K), m_Value(C),
		                              m_Value(A), m_Value(B)));
		  (void)IsDPBF16PS;
		  assert(IsDPBF16PS && "expected a call to llvm.x86.tdpbf16ps.internal");

		  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);

		  // We visit the loop with (m, n/4, k/4):
		  // %n_dword = udiv i16 %n, 4
		  // %k_dword = udiv i16 %k, 4
		  // The builder is constructed on the call, so the divisions are emitted
		  // immediately before it (and therefore stay in the Start block).
		  IRBuilder<> Builder_Prepare(TileDPBF16PS);
		  Value *N_DWord = Builder_Prepare.CreateUDiv(N, Builder_Prepare.getInt16(4));
		  Value *K_DWord = Builder_Prepare.CreateUDiv(K, Builder_Prepare.getInt16(4));

		  BasicBlock *Start = TileDPBF16PS->getParent();
		  BasicBlock *End =
		      SplitBlock(Start, TileDPBF16PS, DT, LI, nullptr, "continue");
		  IRBuilder<> Builder(TileDPBF16PS);
		  Value *ResVec = createTileDPBF16PSLoops(Start, End, Builder, DTU, *LI, M,
		                                          N_DWord, K_DWord, C, A, B);

		  // Delete tdpbf16ps intrinsic and bitcast instruction.
		  // The iterator is advanced before the user is erased so it stays valid.
		  for (auto UI = TileDPBF16PS->use_begin(), UE = TileDPBF16PS->use_end();
		       UI != UE;) {
		    Instruction *I = cast<Instruction>((UI++)->getUser());
		    Value *Vec;
		    if (match(I, m_BitCast(m_Value(Vec)))) {
		      I->replaceAllUsesWith(ResVec);
		      I->eraseFromParent();
		    }
		  }
		  TileDPBF16PS->eraseFromParent();
		  return true;
		}

bool X86LowerAMXIntrinsics::lowerTileLoad(Instruction *TileLoad) {		bool X86LowerAMXIntrinsics::lowerTileLoad(Instruction *TileLoad) {
Value M, N, Ptr, Stride;		Value M, N, Ptr, Stride;
match(TileLoad, m_Intrinsic<Intrinsic::x86_tileloadd64_internal>(		match(TileLoad, m_Intrinsic<Intrinsic::x86_tileloadd64_internal>(
m_Value(M), m_Value(N), m_Value(Ptr), m_Value(Stride)));		m_Value(M), m_Value(N), m_Value(Ptr), m_Value(Stride)));
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);		DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
Instruction *InsertI = TileLoad;		Instruction *InsertI = TileLoad;
IRBuilder<> Builder_Prepare(TileLoad);		IRBuilder<> Builder_Prepare(TileLoad);
Builder_Prepare.SetInsertPoint(TileLoad);		Builder_Prepare.SetInsertPoint(TileLoad);
▲ Show 20 Lines • Show All 57 Lines • ▼ Show 20 Lines	bool X86LowerAMXIntrinsics::lowerTileZero(Instruction *TileZero) {
}		}
TileZero->eraseFromParent();		TileZero->eraseFromParent();
return true;		return true;
}		}

bool X86LowerAMXIntrinsics::visit() {		bool X86LowerAMXIntrinsics::visit() {
bool C;		bool C;
SmallVector<Instruction *, 8> TileDPBSSDs;		SmallVector<Instruction *, 8> TileDPBSSDs;
		SmallVector<Instruction *, 8> TileDPBF16PSs;
SmallVector<Instruction *, 8> TileLoads;		SmallVector<Instruction *, 8> TileLoads;
SmallVector<Instruction *, 8> TileStores;		SmallVector<Instruction *, 8> TileStores;
SmallVector<Instruction *, 8> TileZeros;		SmallVector<Instruction *, 8> TileZeros;

for (BasicBlock *BB : post_order(&Func)) {		for (BasicBlock *BB : post_order(&Func)) {
for (BasicBlock::reverse_iterator II = BB->rbegin(), IE = BB->rend();		for (BasicBlock::reverse_iterator II = BB->rbegin(), IE = BB->rend();
II != IE;) {		II != IE;) {
Instruction &Inst = *II++;		Instruction &Inst = *II++;
if (match(&Inst, m_Intrinsic<Intrinsic::x86_tdpbssd_internal>())) {		if (match(&Inst, m_Intrinsic<Intrinsic::x86_tdpbssd_internal>())) {
// %amx1 = bitcast <256 x i32> %vec to x86_amx		// %amx1 = bitcast <256 x i32> %vec to x86_amx
// %res = call x86_amx @llvm.x86.tdpbssd.internal(i16 m, i16 n, i16 k,		// %res = call x86_amx @llvm.x86.tdpbssd.internal(i16 m, i16 n, i16 k,
// x86_amx, %amx1, ...)		// x86_amx, %amx1, ...)
// %vec2 = bitcast x86_amx %res to <256 x i32>		// %vec2 = bitcast x86_amx %res to <256 x i32>
TileDPBSSDs.push_back(&Inst);		TileDPBSSDs.push_back(&Inst);
		} else if (match(&Inst, m_Intrinsic<Intrinsic::x86_tdpbf16ps_internal>())) {
		// %amx1 = bitcast <256 x i32> %vec to x86_amx
		// %res = call x86_amx @llvm.x86.tdpbf16ps.internal(i16 m, i16 n, i16 k,
		// x86_amx, %amx1, ...)
		// %vec2 = bitcast x86_amx %res to <256 x i32>
		TileDPBF16PSs.push_back(&Inst);
} else if (match(&Inst,		} else if (match(&Inst,
m_Intrinsic<Intrinsic::x86_tileloadd64_internal>())) {		m_Intrinsic<Intrinsic::x86_tileloadd64_internal>())) {
// %17 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %13, i16 %14,		// %17 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %13, i16 %14,
// i8* %15, i64 %16)		// i8* %15, i64 %16)
// %18 = bitcast x86_amx %17 to <256 x i32>		// %18 = bitcast x86_amx %17 to <256 x i32>
TileLoads.push_back(&Inst);		TileLoads.push_back(&Inst);
} else if (match(&Inst,		} else if (match(&Inst,
m_Intrinsic<Intrinsic::x86_tilestored64_internal>())) {		m_Intrinsic<Intrinsic::x86_tilestored64_internal>())) {
Show All 11 Lines	bool X86LowerAMXIntrinsics::visit() {
}		}

for (auto *Inst : TileLoads) {		for (auto *Inst : TileLoads) {
C \|= lowerTileLoad(Inst);		C \|= lowerTileLoad(Inst);
}		}
for (auto *Inst : TileDPBSSDs) {		for (auto *Inst : TileDPBSSDs) {
C \|= lowerTileDPBSSD(Inst);		C \|= lowerTileDPBSSD(Inst);
}		}
		for (auto *Inst : TileDPBF16PSs) {
		C \|= lowerTileDPBF16PS(Inst);
		}
for (auto *Inst : TileStores) {		for (auto *Inst : TileStores) {
C \|= lowerTileStore(Inst);		C \|= lowerTileStore(Inst);
}		}
for (auto *Inst : TileZeros) {		for (auto *Inst : TileZeros) {
C \|= lowerTileZero(Inst);		C \|= lowerTileZero(Inst);
}		}

return C;		return C;
▲ Show 20 Lines • Show All 48 Lines • Show Last 20 Lines

llvm/lib/Target/X86/X86LowerAMXType.cpp

//===- llvm/CodeGen/TileShapeInfo.h - ---------------------------- C++ --===//		//===- llvm/CodeGen/TileShapeInfo.h - ---------------------------- C++ --===//
		Lint: Lint Inline Actions clang-format not found in user's PATH; not linting file. Lint: Lint: clang-format not found in user's PATH; not linting file.
//		//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.		// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
//		//
▲ Show 20 Lines • Show All 53 Lines • ▼ Show 20 Lines	static std::pair<Value , Value > getShape(IntrinsicInst *II, unsigned OpNo) {
case Intrinsic::x86_tileloadd64_internal:		case Intrinsic::x86_tileloadd64_internal:
case Intrinsic::x86_tilestored64_internal: {		case Intrinsic::x86_tilestored64_internal: {
Row = II->getArgOperand(0);		Row = II->getArgOperand(0);
Col = II->getArgOperand(1);		Col = II->getArgOperand(1);
break;		break;
}		}
// a * b + c		// a * b + c
// The shape depends on which operand.		// The shape depends on which operand.
case Intrinsic::x86_tdpbssd_internal: {		case Intrinsic::x86_tdpbssd_internal:
		case Intrinsic::x86_tdpbf16ps_internal:{
switch (OpNo) {		switch (OpNo) {
case 3:		case 3:
Row = II->getArgOperand(0);		Row = II->getArgOperand(0);
Col = II->getArgOperand(1);		Col = II->getArgOperand(1);
break;		break;
case 4:		case 4:
Row = II->getArgOperand(0);		Row = II->getArgOperand(0);
Col = II->getArgOperand(2);		Col = II->getArgOperand(2);
▲ Show 20 Lines • Show All 273 Lines • Show Last 20 Lines

llvm/lib/Target/X86/X86PreTileConfig.cpp

	//===-- X86PreTileConfig.cpp - Tile Register Configure---------------------===//			//===-- X86PreTileConfig.cpp - Tile Register Configure---------------------===//
				Lint: Lint Inline Actions clang-format not found in user's PATH; not linting file. Lint: Lint: clang-format not found in user's PATH; not linting file.
	//			//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.			// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.			// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception			// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	//			//
	▲ Show 20 Lines • Show All 113 Lines • ▼ Show 20 Lines

	static ShapeT getShape(const MachineInstr &MI, MachineRegisterInfo *MRI) {			static ShapeT getShape(const MachineInstr &MI, MachineRegisterInfo *MRI) {
	unsigned Opcode = MI.getOpcode();			unsigned Opcode = MI.getOpcode();
	switch (Opcode) {			switch (Opcode) {
	default:			default:
	llvm_unreachable("Unexpected machine instruction on tile");			llvm_unreachable("Unexpected machine instruction on tile");
	case X86::PTILELOADDV:			case X86::PTILELOADDV:
	case X86::PTDPBSSDV:			case X86::PTDPBSSDV:
				case X86::PTDPBF16PSV:
	case X86::PTILEZEROV:			case X86::PTILEZEROV:
	MachineOperand &MO1 = const_cast<MachineOperand &>(MI.getOperand(1));			MachineOperand &MO1 = const_cast<MachineOperand &>(MI.getOperand(1));
	MachineOperand &MO2 = const_cast<MachineOperand &>(MI.getOperand(2));			MachineOperand &MO2 = const_cast<MachineOperand &>(MI.getOperand(2));
	ShapeT Shape(&MO1, &MO2, MRI);			ShapeT Shape(&MO1, &MO2, MRI);
	return Shape;			return Shape;
	}			}
	}			}

	▲ Show 20 Lines • Show All 78 Lines • ▼ Show 20 Lines

	static bool isAMXInstruction(MachineBasicBlock::iterator MII) {			static bool isAMXInstruction(MachineBasicBlock::iterator MII) {
	switch (MII->getOpcode()) {			switch (MII->getOpcode()) {
	default:			default:
	return false;			return false;
	case X86::PTILELOADDV:			case X86::PTILELOADDV:
	case X86::PTILESTOREDV:			case X86::PTILESTOREDV:
	case X86::PTDPBSSDV:			case X86::PTDPBSSDV:
				case X86::PTDPBF16PSV:
	case X86::PTILEZEROV:			case X86::PTILEZEROV:
	return true;			return true;
	}			}
	}			}

	struct BBInfo {			struct BBInfo {
	bool HasAMX = false;			bool HasAMX = false;
	bool HasCallBeforeAMX = false;			bool HasCallBeforeAMX = false;
	▲ Show 20 Lines • Show All 101 Lines • Show Last 20 Lines

llvm/lib/Target/X86/X86RegisterInfo.cpp

//===-- X86RegisterInfo.cpp - X86 Register Information --------------------===//		//===-- X86RegisterInfo.cpp - X86 Register Information --------------------===//
		Lint: Lint Inline Actions clang-format not found in user's PATH; not linting file. Lint: Lint: clang-format not found in user's PATH; not linting file.
//		//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.		// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
//		//
▲ Show 20 Lines • Show All 859 Lines • ▼ Show 20 Lines	static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM,
unsigned OpCode = MI->getOpcode();		unsigned OpCode = MI->getOpcode();
switch (OpCode) {		switch (OpCode) {
default:		default:
llvm_unreachable("Unexpected machine instruction on tile register!");		llvm_unreachable("Unexpected machine instruction on tile register!");
break;		break;
// We only collect the tile shape that is defined.		// We only collect the tile shape that is defined.
case X86::PTILELOADDV:		case X86::PTILELOADDV:
case X86::PTDPBSSDV:		case X86::PTDPBSSDV:
		case X86::PTDPBF16PSV:
case X86::PTILEZEROV:		case X86::PTILEZEROV:
MachineOperand &MO1 = MI->getOperand(1);		MachineOperand &MO1 = MI->getOperand(1);
MachineOperand &MO2 = MI->getOperand(2);		MachineOperand &MO2 = MI->getOperand(2);
ShapeT Shape(&MO1, &MO2, MRI);		ShapeT Shape(&MO1, &MO2, MRI);
VRM->assignVirt2Shape(VirtReg, Shape);		VRM->assignVirt2Shape(VirtReg, Shape);
return Shape;		return Shape;
}		}
}		}
▲ Show 20 Lines • Show All 52 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[X86] Pass to transform tdpbf16ps intrinsics to scalar operation.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 321665

clang/include/clang/Basic/BuiltinsX86_64.def

clang/lib/Headers/amxintrin.h

llvm/include/llvm/IR/IntrinsicsX86.td

llvm/lib/Target/X86/X86ExpandPseudo.cpp

llvm/lib/Target/X86/X86ISelDAGToDAG.cpp

llvm/lib/Target/X86/X86InstrAMX.td

llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp

llvm/lib/Target/X86/X86LowerAMXType.cpp

llvm/lib/Target/X86/X86PreTileConfig.cpp

llvm/lib/Target/X86/X86RegisterInfo.cpp

This is an archive of the discontinued LLVM Phabricator instance.

[X86] Pass to transform tdpbf16ps intrinsics to scalar operation.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 321665

clang/include/clang/Basic/BuiltinsX86_64.def

clang/lib/Headers/amxintrin.h

llvm/include/llvm/IR/IntrinsicsX86.td

llvm/lib/Target/X86/X86ExpandPseudo.cpp

llvm/lib/Target/X86/X86ISelDAGToDAG.cpp

llvm/lib/Target/X86/X86InstrAMX.td

llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp

llvm/lib/Target/X86/X86LowerAMXType.cpp

llvm/lib/Target/X86/X86PreTileConfig.cpp

llvm/lib/Target/X86/X86RegisterInfo.cpp

[X86] Pass to transform tdpbf16ps intrinsics to scalar operation.
ClosedPublic