This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/
-
lib/
-
CodeGen/GlobalISel/
-
GlobalISel/
-
IRTranslator.cpp
-
Target/AArch64/
-
AArch64/
-
AArch64ISelLowering.cpp
-
AArch64InstrInfo.td
-
test/CodeGen/AArch64/
-
CodeGen/
-
AArch64/
-
bf16-conversions.ll

Differential D99261

AArch64: support bfloat extend and trunc
Needs ReviewPublic

Authored by t.p.northover on Mar 24 2021, 6:32 AM.

Download Raw Diff

This revision needs review, but there are no reviewers specified.

Details

Reviewers: None

Summary

A useful step on the way to supporting full bfloat arithmetic is allowing code that immediately extends a bfloat before doing anything non-trivial, and truncates it back before storage.

This patch implements that by making sure we don't try any extload/truncstores and adding patterns for the relevant conversions.

I also had to ban GlobalISel from dealing with bfloat here because its type system only has s16, so it treats bfloat conversions as if they were normal half conversions.

Diff Detail

Event Timeline

t.p.northover created this revision.Mar 24 2021, 6:32 AM

Herald added subscribers: danielkiss, hiraditya, kristof.beyls, mcrosier. · View Herald TranscriptMar 24 2021, 6:32 AM

t.p.northover requested review of this revision.Mar 24 2021, 6:32 AM

Herald added a project: Restricted Project. · View Herald TranscriptMar 24 2021, 6:32 AM

Harbormaster completed remote builds in B95473: Diff 332960.Mar 24 2021, 11:54 AM

Revision Contents

Path

Size

llvm/

lib/

CodeGen/

GlobalISel/

IRTranslator.cpp

3 lines

Target/

AArch64/

AArch64ISelLowering.cpp

14 lines

AArch64InstrInfo.td

16 lines

test/

CodeGen/

AArch64/

bf16-conversions.ll

167 lines

Diff 332960

llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp

Show First 20 Lines • Show All 1,444 Lines • ▼ Show 20 Lines	if (getLLTForType(U.getOperand(0)->getType(), DL) ==
getLLTForType(U.getType(), DL))		getLLTForType(U.getType(), DL))
return translateCopy(U, *U.getOperand(0), MIRBuilder);		return translateCopy(U, *U.getOperand(0), MIRBuilder);

return translateCast(TargetOpcode::G_BITCAST, U, MIRBuilder);		return translateCast(TargetOpcode::G_BITCAST, U, MIRBuilder);
}		}

bool IRTranslator::translateCast(unsigned Opcode, const User &U,		bool IRTranslator::translateCast(unsigned Opcode, const User &U,
MachineIRBuilder &MIRBuilder) {		MachineIRBuilder &MIRBuilder) {
		if (U.getType()->getScalarType()->isBFloatTy() ||
		U.getOperand(0)->getType()->getScalarType()->isBFloatTy())
		return false;
Register Op = getOrCreateVReg(*U.getOperand(0));		Register Op = getOrCreateVReg(*U.getOperand(0));
Register Res = getOrCreateVReg(U);		Register Res = getOrCreateVReg(U);
MIRBuilder.buildInstr(Opcode, {Res}, {Op});		MIRBuilder.buildInstr(Opcode, {Res}, {Op});
return true;		return true;
}		}

bool IRTranslator::translateGetElementPtr(const User &U,		bool IRTranslator::translateGetElementPtr(const User &U,
MachineIRBuilder &MIRBuilder) {		MachineIRBuilder &MIRBuilder) {
▲ Show 20 Lines • Show All 1,774 Lines • Show Last 20 Lines

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 794 Lines • ▼ Show 20 Lines	#undef LCALLNAME5

// Make floating-point constants legal for the large code model, so they don't		// Make floating-point constants legal for the large code model, so they don't
// become loads from the constant pool.		// become loads from the constant pool.
if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {		if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
setOperationAction(ISD::ConstantFP, MVT::f32, Legal);		setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
setOperationAction(ISD::ConstantFP, MVT::f64, Legal);		setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
}		}

		// Converting f64 -> bf16 would need a double-round so we must libcall it
		// unless we have fast-math.
		setOperationAction(ISD::FP_ROUND, MVT::bf16, Custom);

// AArch64 does not have floating-point extending loads, i1 sign-extending		// AArch64 does not have floating-point extending loads, i1 sign-extending
// load, floating-point truncating stores, or v2i32->v2i16 truncating store.		// load, floating-point truncating stores, or v2i32->v2i16 truncating store.
for (MVT VT : MVT::fp_valuetypes()) {		for (MVT VT : MVT::fp_valuetypes()) {
setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);		setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
		setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);		setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);		setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);		setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
}		}
for (MVT VT : MVT::integer_valuetypes())		for (MVT VT : MVT::integer_valuetypes())
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);		setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);

setTruncStoreAction(MVT::f32, MVT::f16, Expand);		setTruncStoreAction(MVT::f32, MVT::f16, Expand);
		setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);		setTruncStoreAction(MVT::f64, MVT::f32, Expand);
setTruncStoreAction(MVT::f64, MVT::f16, Expand);		setTruncStoreAction(MVT::f64, MVT::f16, Expand);
		setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
setTruncStoreAction(MVT::f128, MVT::f80, Expand);		setTruncStoreAction(MVT::f128, MVT::f80, Expand);
setTruncStoreAction(MVT::f128, MVT::f64, Expand);		setTruncStoreAction(MVT::f128, MVT::f64, Expand);
setTruncStoreAction(MVT::f128, MVT::f32, Expand);		setTruncStoreAction(MVT::f128, MVT::f32, Expand);
setTruncStoreAction(MVT::f128, MVT::f16, Expand);		setTruncStoreAction(MVT::f128, MVT::f16, Expand);
		setTruncStoreAction(MVT::f128, MVT::bf16, Expand);

setOperationAction(ISD::BITCAST, MVT::i16, Custom);		setOperationAction(ISD::BITCAST, MVT::i16, Custom);
setOperationAction(ISD::BITCAST, MVT::f16, Custom);		setOperationAction(ISD::BITCAST, MVT::f16, Custom);
setOperationAction(ISD::BITCAST, MVT::bf16, Custom);		setOperationAction(ISD::BITCAST, MVT::bf16, Custom);

// Indexed loads and stores are supported.		// Indexed loads and stores are supported.
for (unsigned im = (unsigned)ISD::PRE_INC;		for (unsigned im = (unsigned)ISD::PRE_INC;
im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {		im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
▲ Show 20 Lines • Show All 2,299 Lines • ▼ Show 20 Lines	SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);		SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
EVT SrcVT = SrcVal.getValueType();		EVT SrcVT = SrcVal.getValueType();

if (SrcVT != MVT::f128) {		if (SrcVT != MVT::f128) {
// Expand cases where the input is a vector bigger than NEON.		// Expand cases where the input is a vector bigger than NEON.
if (useSVEForFixedLengthVectorVT(SrcVT))		if (useSVEForFixedLengthVectorVT(SrcVT))
return SDValue();		return SDValue();

		if (Op.getValueType() == MVT::bf16 && SrcVT != MVT::f32 &&
		!DAG.getTarget().Options.UnsafeFPMath) {
		report_fatal_error("No way to correctly truncate anything but float to bfloat");
		Lint: Pre-merge checks Inline Actions clang-format: please reformat the code - report_fatal_error("No way to correctly truncate anything but float to bfloat"); + report_fatal_error( + "No way to correctly truncate anything but float to bfloat"); Lint: Pre-merge checks: clang-format: please reformat the code ``` - report_fatal_error("No way to correctly…
		return SDValue();
		}

// It's legal except when f128 is involved		// It's legal except when f128 is involved
return Op;		return Op;
}		}

return SDValue();		return SDValue();
}		}

SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,		SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
▲ Show 20 Lines • Show All 14,367 Lines • Show Last 20 Lines

llvm/lib/Target/AArch64/AArch64InstrInfo.td

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 867 Lines • ▼ Show 20 Lines	def : Pat<(v2f32 (int_aarch64_neon_bfdot
(v4i32 (bitconvert		(v4i32 (bitconvert
(v8bf16 (insert_subvector undef,		(v8bf16 (insert_subvector undef,
(v4bf16 V64:$Rm),		(v4bf16 V64:$Rm),
(i64 0))))),		(i64 0))))),
VectorIndexS:$idx)))))),		VectorIndexS:$idx)))))),
(BF16DOTlanev4bf16 (v2f32 V64:$Rd), (v4bf16 V64:$Rn),		(BF16DOTlanev4bf16 (v2f32 V64:$Rd), (v4bf16 V64:$Rn),
(SUBREG_TO_REG (i32 0), V64:$Rm, dsub),		(SUBREG_TO_REG (i32 0), V64:$Rm, dsub),
VectorIndexS:$idx)>;		VectorIndexS:$idx)>;

		def : Pat<(bf16 (fpround f32:$src)), (BFCVT $src)>;
		def : Pat<(v4bf16 (fpround v4f32:$src)), (EXTRACT_SUBREG (BFCVTN $src), dsub)>;
}		}

// ARMv8.6A AArch64 matrix multiplication		// ARMv8.6A AArch64 matrix multiplication
let Predicates = [HasMatMulInt8] in {		let Predicates = [HasMatMulInt8] in {
def SMMLA : SIMDThreeSameVectorMatMul<0, 0, "smmla", int_aarch64_neon_smmla>;		def SMMLA : SIMDThreeSameVectorMatMul<0, 0, "smmla", int_aarch64_neon_smmla>;
def UMMLA : SIMDThreeSameVectorMatMul<0, 1, "ummla", int_aarch64_neon_ummla>;		def UMMLA : SIMDThreeSameVectorMatMul<0, 1, "ummla", int_aarch64_neon_ummla>;
def USMMLA : SIMDThreeSameVectorMatMul<1, 0, "usmmla", int_aarch64_neon_usmmla>;		def USMMLA : SIMDThreeSameVectorMatMul<1, 0, "usmmla", int_aarch64_neon_usmmla>;
defm USDOT : SIMDThreeSameVectorDot<0, 1, "usdot", int_aarch64_neon_usdot>;		defm USDOT : SIMDThreeSameVectorDot<0, 1, "usdot", int_aarch64_neon_usdot>;
▲ Show 20 Lines • Show All 3,277 Lines • ▼ Show 20 Lines
def : Pat<(v4bf16 (AArch64rev64 V64:$Rn)), (REV64v4i16 V64:$Rn)>;		def : Pat<(v4bf16 (AArch64rev64 V64:$Rn)), (REV64v4i16 V64:$Rn)>;
def : Pat<(v8f16 (AArch64rev32 V128:$Rn)), (REV32v8i16 V128:$Rn)>;		def : Pat<(v8f16 (AArch64rev32 V128:$Rn)), (REV32v8i16 V128:$Rn)>;
def : Pat<(v8f16 (AArch64rev64 V128:$Rn)), (REV64v8i16 V128:$Rn)>;		def : Pat<(v8f16 (AArch64rev64 V128:$Rn)), (REV64v8i16 V128:$Rn)>;
def : Pat<(v8bf16 (AArch64rev32 V128:$Rn)), (REV32v8i16 V128:$Rn)>;		def : Pat<(v8bf16 (AArch64rev32 V128:$Rn)), (REV32v8i16 V128:$Rn)>;
def : Pat<(v8bf16 (AArch64rev64 V128:$Rn)), (REV64v8i16 V128:$Rn)>;		def : Pat<(v8bf16 (AArch64rev64 V128:$Rn)), (REV64v8i16 V128:$Rn)>;
def : Pat<(v2f32 (AArch64rev64 V64:$Rn)), (REV64v2i32 V64:$Rn)>;		def : Pat<(v2f32 (AArch64rev64 V64:$Rn)), (REV64v2i32 V64:$Rn)>;
def : Pat<(v4f32 (AArch64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>;		def : Pat<(v4f32 (AArch64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>;

		def : Pat<(f32 (fpextend bf16:$src)),
		(EXTRACT_SUBREG (SHLLv4i16 (SUBREG_TO_REG (i64 0), $src, hsub)), ssub)>;

		def : Pat<(v4f32 (fpextend v4bf16:$src)), (SHLLv4i16 $src)>;

		def : Pat<(f64 (fpextend bf16:$src)),
		(FCVTDSr (EXTRACT_SUBREG
		(SHLLv4i16 (SUBREG_TO_REG (i64 0), $src, hsub)),
		ssub))>;

		def : Pat<(bf16 (fpround f64:$src)), (BFCVT (FCVTSDr $src))>;


// Patterns for vector long shift (by element width). These need to match all		// Patterns for vector long shift (by element width). These need to match all
// three of zext, sext and anyext so it's easier to pull the patterns out of the		// three of zext, sext and anyext so it's easier to pull the patterns out of the
// definition.		// definition.
multiclass SIMDVectorLShiftLongBySizeBHSPats<SDPatternOperator ext> {		multiclass SIMDVectorLShiftLongBySizeBHSPats<SDPatternOperator ext> {
def : Pat<(AArch64vshl (v8i16 (ext (v8i8 V64:$Rn))), (i32 8)),		def : Pat<(AArch64vshl (v8i16 (ext (v8i8 V64:$Rn))), (i32 8)),
(SHLLv8i8 V64:$Rn)>;		(SHLLv8i8 V64:$Rn)>;
def : Pat<(AArch64vshl (v8i16 (ext (extract_high_v16i8 V128:$Rn))), (i32 8)),		def : Pat<(AArch64vshl (v8i16 (ext (extract_high_v16i8 V128:$Rn))), (i32 8)),
(SHLLv16i8 V128:$Rn)>;		(SHLLv16i8 V128:$Rn)>;
▲ Show 20 Lines • Show All 3,808 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/bf16-conversions.ll

This file was added.

				; RUN: llc -mtriple=arm64-apple-macosx %s -o - -mattr=+bf16 | FileCheck %s
				; RUN: llc -mtriple=arm64-apple-macosx %s -o - -mattr=+bf16 -global-isel -global-isel-abort=0 | FileCheck %s
				; RUN: llc -mtriple=arm64-apple-macosx %s -o - -mattr=+bf16 -fast-isel | FileCheck %s

				define <4 x float> @extendvec_bf16_f32(<4 x bfloat> %in) {
				; CHECK-LABEL: extendvec_bf16_f32:
				; CHECK: shll.4s v0, v0, #16

				%res = fpext <4 x bfloat> %in to <4 x float>
				ret <4 x float> %res
				}

				define float @extend_bf16_f32(bfloat %in) {
				; CHECK-LABEL: extend_bf16_f32:
				; CHECK: shll.4s v0, v0, #16

				%res = fpext bfloat %in to float
				ret float %res
				}

				; Scalarized
				define <4 x double> @extendvec_bf16_f64(<4 x bfloat> %in) {
				; CHECK-LABEL: extendvec_bf16_f64:
				; CHECK: shll.4s v[[TMP:[0-9]]], {{.*}}, #16
				; CHECK: fcvt {{d.*}}, s[[TMP]]
				; CHECK: fcvt
				; CHECK: fcvt
				; CHECK: fcvt

				%res = fpext <4 x bfloat> %in to <4 x double>
				ret <4 x double> %res
				}

				define double @extend_bf16_f64(bfloat %in) {
				; CHECK-LABEL: extend_bf16_f64
				; CHECK: shll.4s v[[TMP:[0-9]]], v0, #16
				; CHECK: fcvt d0, s[[TMP]]
				%res = fpext bfloat %in to double
				ret double %res
				}

				define <4 x bfloat> @truncvec_f32_bf16(<4 x float> %in) {
				; CHECK-LABEL: truncvec_f32_bf16:
				; CHECK: bfcvtn.4h v0, v0
				%res = fptrunc <4 x float> %in to <4 x bfloat>
				ret <4 x bfloat> %res
				}

				define bfloat @trunc_f32_bf16(float %in) {
				; CHECK-LABEL: trunc_f32_bf16:
				; CHECK: bfcvt h0, s0
				%res = fptrunc float %in to bfloat
				ret bfloat %res
				}

				; Scalarized
				define <4 x bfloat> @truncvec_f64_bf16(<4 x double> %in) "unsafe-fp-math"="true" {
				; CHECK-LABEL: truncvec_f64_bf16:
				; CHECK: fcvt [[TMP:s[0-9]+]], {{d.*}}
				; CHECK: bfcvt {{h.*}}, [[TMP]]
				; CHECK: bfcvt
				; CHECK: bfcvt
				; CHECK: bfcvt

				%res = fptrunc <4 x double> %in to <4 x bfloat>
				ret <4 x bfloat> %res
				}

				define bfloat @trunc_f64_bf16(double %in) "unsafe-fp-math"="true" {
				; CHECK-LABEL: trunc_f64_bf16:
				; CHECK: fcvt [[TMP:s[0-9]+]], d0
				; CHECK: bfcvt h0, [[TMP]]

				%res = fptrunc double %in to bfloat
				ret bfloat %res
				}

				define float @extload_bf16_f32(bfloat* %ptr) {
				; CHECK-LABEL: extload_bf16_f32:
				; CHECK: ldr h[[TMP:[0-9]+]], [x0]
				; CHECK: shll.4s v0, v[[TMP]], #16

				%tmp = load bfloat, bfloat* %ptr
				%res = fpext bfloat %tmp to float
				ret float %res
				}

				define double @extload_bf16_f64(bfloat* %ptr) {
				; CHECK-LABEL: extload_bf16_f64:
				; CHECK: ldr h[[TMP:[0-9]+]], [x0]
				; CHECK: shll.4s v[[TMP1:[0-9]+]], v[[TMP]], #16
				; CHECK: fcvt d0, s[[TMP1]]

				%tmp = load bfloat, bfloat* %ptr
				%res = fpext bfloat %tmp to double
				ret double %res
				}

				define <4 x float> @extloadvec_bf16_f32(<4 x bfloat>* %ptr) {
				; CHECK-LABEL: extloadvec_bf16_f32:
				; CHECK: ldr d[[TMP:[0-9]+]], [x0]
				; CHECK: shll.4s v0, v[[TMP]], #16

				%tmp = load <4 x bfloat>, <4 x bfloat>* %ptr
				%res = fpext <4 x bfloat> %tmp to <4 x float>
				ret <4 x float> %res
				}

				; Scalarized
				define <4 x double> @extloadvec_bf16_f64(<4 x bfloat>* %ptr) {
				; CHECK-LABEL: extloadvec_bf16_f64:
				; CHECK: ldr d[[TMP:[0-9]+]], [x0]
				; CHECK: shll.4s v[[TMP1:[0-9]+]], v[[TMP]], #16
				; CHECK: fcvt {{d.*}}, s[[TMP1]]
				; CHECK: fcvt
				; CHECK: fcvt
				; CHECK: fcvt

				%tmp = load <4 x bfloat>, <4 x bfloat>* %ptr
				%res = fpext <4 x bfloat> %tmp to <4 x double>
				ret <4 x double> %res
				}

				define void @truncstore_f32_bf16(float %in, bfloat* %ptr) {
				; CHECK-LABEL: truncstore_f32_bf16:
				; CHECK: bfcvt [[TMP:h[0-9]+]], s0
				; CHECK: str [[TMP]], [x0]

				%val = fptrunc float %in to bfloat
				store bfloat %val, bfloat* %ptr
				ret void
				}

				define void @truncstore_f64_bf16(double %in, bfloat* %ptr) "unsafe-fp-math"="true" {
				; CHECK-LABEL: truncstore_f64_bf16:
				; CHECK: fcvt [[TMP:s[0-9]+]], d0
				; CHECK: bfcvt [[TMP1:h[0-9]+]], [[TMP]]
				; CHECK: str [[TMP1]], [x0]


				%val = fptrunc double %in to bfloat
				store bfloat %val, bfloat* %ptr
				ret void
				}

				define void @truncstorevec_f32_bf16(<4 x float> %in, <4 x bfloat>* %ptr) {
				; CHECK-LABEL: truncstorevec_f32_bf16:
				; CHECK: bfcvtn.4h v[[TMP:[0-9]+]], v0
				; CHECK: str d[[TMP]], [x0]
				%val = fptrunc <4 x float> %in to <4 x bfloat>
				store <4 x bfloat> %val, <4 x bfloat>* %ptr
				ret void
				}

				; Scalarized
				define void @truncstorevec_f64_bf16(<4 x double> %in, <4 x bfloat>* %ptr) "unsafe-fp-math"="true" {
				; CHECK-LABEL: truncstorevec_f64_bf16:
				; CHECK: fcvt [[TMP:s[0-9]+]], d0
				; CHECK: bfcvt [[TMP1:h[0-9]+]], [[TMP]]
				; CHECK: bfcvt
				; CHECK: bfcvt
				; CHECK: bfcvt

				%val = fptrunc <4 x double> %in to <4 x bfloat>
				store <4 x bfloat> %val, <4 x bfloat>* %ptr
				ret void
				}