Diff 192674

llvm/lib/Target/NVPTX/NVPTXISelLowering.h

	Show First 20 Lines • Show All 550 Lines • ▼ Show 20 Lines
	private:			private:
	const NVPTXSubtarget &STI; // cache the subtarget here			const NVPTXSubtarget &STI; // cache the subtarget here
	SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const;			SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const;

	SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;			SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;			SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;			SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;

				SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const;
				SDValue LowerFROUND32(SDValue Op, SelectionDAG &DAG) const;
				SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const;

	SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;			SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerLOADi1(SDValue Op, SelectionDAG &DAG) const;			SDValue LowerLOADi1(SDValue Op, SelectionDAG &DAG) const;

	SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;			SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const;			SDValue LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const;			SDValue LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const;

	SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;			SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
	Show All 14 Lines

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Show First 20 Lines • Show All 540 Lines • ▼ Show 20 Lines	NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// There's no neg.f16 instruction. Expand to (0-x).		// There's no neg.f16 instruction. Expand to (0-x).
setOperationAction(ISD::FNEG, MVT::f16, Expand);		setOperationAction(ISD::FNEG, MVT::f16, Expand);
setOperationAction(ISD::FNEG, MVT::v2f16, Expand);		setOperationAction(ISD::FNEG, MVT::v2f16, Expand);

// (would be) Library functions.		// (would be) Library functions.

// These map to conversion instructions for scalar FP types.		// These map to conversion instructions for scalar FP types.
for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,		for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
ISD::FROUND, ISD::FTRUNC}) {		ISD::FTRUNC}) {
setOperationAction(Op, MVT::f16, Legal);		setOperationAction(Op, MVT::f16, Legal);
setOperationAction(Op, MVT::f32, Legal);		setOperationAction(Op, MVT::f32, Legal);
setOperationAction(Op, MVT::f64, Legal);		setOperationAction(Op, MVT::f64, Legal);
setOperationAction(Op, MVT::v2f16, Expand);		setOperationAction(Op, MVT::v2f16, Expand);
}		}

		setOperationAction(ISD::FROUND, MVT::f16, Promote);
		setOperationAction(ISD::FROUND, MVT::v2f16, Expand);
		setOperationAction(ISD::FROUND, MVT::f32, Custom);
		setOperationAction(ISD::FROUND, MVT::f64, Custom);


// 'Expand' implements FCOPYSIGN without calling an external library.		// 'Expand' implements FCOPYSIGN without calling an external library.
setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);		setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand);		setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);		setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);		setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);

// These map to corresponding instructions for f32/f64. f16 must be		// These map to corresponding instructions for f32/f64. f16 must be
// promoted to f32. v2f16 is expanded to f16, which is then promoted		// promoted to f32. v2f16 is expanded to f16, which is then promoted
▲ Show 20 Lines • Show All 1,499 Lines • ▼ Show 20 Lines	else {
SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);		SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);		SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);

SDValue Ops[2] = { Lo, Hi };		SDValue Ops[2] = { Lo, Hi };
return DAG.getMergeValues(Ops, dl);		return DAG.getMergeValues(Ops, dl);
}		}
}		}

		// Custom lowering entry point for ISD::FROUND. Forwards to the f32 or f64
		// specific lowering based on the result type of Op; any other type is
		// treated as unreachable.
		SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
		  const EVT ResultVT = Op.getValueType();

		  if (ResultVT == MVT::f64)
		    return LowerFROUND64(Op, DAG);

		  if (ResultVT != MVT::f32)
		    llvm_unreachable("unhandled type");

		  return LowerFROUND32(Op, DAG);
		}

		// Lowers an f32 ISD::FROUND into a sequence of simpler DAG nodes.
		//
		// This is the rounding method used in CUDA libdevice, in C-like code:
		//   float roundf(float A)
		//   {
		//     float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
		//     RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
		//     return abs(A) < 0.5 ? (float)(int)A : RoundedA;
		//   }
		// The "(float) (int)" conversions are expressed with ISD::FTRUNC.
		//
		// Op is the FROUND node (operand 0 is the value to round); the returned
		// value has the same type as Op.
		SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
		                                           SelectionDAG &DAG) const {
		  SDLoc SL(Op);
		  SDValue A = Op.getOperand(0);
		  EVT VT = Op.getValueType();

		  SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);

		  // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
		  // Instead of selecting between +0.5 and -0.5, build a 0.5 that carries the
		  // sign bit of A and add it to A.
		  //
		  // Review note (tra): Do we have FP32-related constants defined somewhere in
		  // the LLVM tree? It would be easier to understand if we could use
		  // `1 << SIGN_BIT_SHIFT` here or `((0 + EXP_OFFSET) << EXP_SHIFT) | mantissa`
		  // below.
		  const unsigned SignBitMask = 0x80000000u;     // IEEE-754 binary32 sign bit.
		  const unsigned PointFiveInBits = 0x3F000000u; // Bit pattern of 0.5f.
		  SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
		  SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
		                             DAG.getConstant(SignBitMask, SL, MVT::i32));
		  SDValue PointFiveRaw =
		      DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
		                  DAG.getConstant(PointFiveInBits, SL, MVT::i32));
		  SDValue PointFive = DAG.getNode(ISD::BITCAST, SL, VT, PointFiveRaw);
		  SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFive);
		  SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);

		  // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
		  // getConstantFP takes the numeric *value*, so pass 2^23 itself
		  // (8388608.0), not the bit pattern of its double encoding. Above this
		  // magnitude a float has no fractional bits, and adding 0.5 could round an
		  // odd value up to the next integer.
		  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
		  const double TwoPow23 = 8388608.0;
		  SDValue IsLarge = DAG.getSetCC(
		      SL, SetCCVT, AbsA, DAG.getConstantFP(TwoPow23, SL, VT), ISD::SETOGT);
		  RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);

		  // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
		  // NOTE(review): presumably needed because A + 0.5 can round up to 1.0 for
		  // A just below 0.5 -- confirm against libdevice.
		  SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
		                                 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
		  SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A);
		  return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA);
		}

		// The implementation of round(double) is similar to that of round(float) in
		// that they both separate the value range into three regions and use a method
		// specific to the region to round the values. However, round(double) first
		// calculates the round of the absolute value and then adds the sign back while
		// round(float) directly rounds the value with sign.
		traUnsubmitted Not Done Reply Inline Actions This explains what's going on, but not why. Could you elaborate what's the reason behind double using different algorithm? tra: This explains what's going on, but not why. Could you elaborate what's the reason behind double…
		SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
		SelectionDAG &DAG) const {
		SDLoc SL(Op);
		SDValue A = Op.getOperand(0);
		EVT VT = Op.getValueType();

		SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);

		// double RoundedA = (double) (int) (abs(A) + 0.5f);
		SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA,
		DAG.getConstantFP(0.5, SL, VT));
		SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);

		// RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA;
		EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
		SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA,
		DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
		RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall,
		DAG.getConstantFP(0, SL, VT),
		RoundedA);

		// Add sign to rounded_A
		RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A);
		DAG.getNode(ISD::FTRUNC, SL, VT, A);

		// RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;
		SDValue IsLarge = DAG.getSetCC(SL, SetCCVT, AbsA,
		DAG.getConstantFP(0x4330000000000000, SL, VT),
		ISD::SETOGT);
		return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
		}



SDValue		SDValue
NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {		NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {		switch (Op.getOpcode()) {
case ISD::RETURNADDR:		case ISD::RETURNADDR:
return SDValue();		return SDValue();
case ISD::FRAMEADDR:		case ISD::FRAMEADDR:
return SDValue();		return SDValue();
case ISD::GlobalAddress:		case ISD::GlobalAddress:
Show All 14 Lines	case ISD::LOAD:
return LowerLOAD(Op, DAG);		return LowerLOAD(Op, DAG);
case ISD::SHL_PARTS:		case ISD::SHL_PARTS:
return LowerShiftLeftParts(Op, DAG);		return LowerShiftLeftParts(Op, DAG);
case ISD::SRA_PARTS:		case ISD::SRA_PARTS:
case ISD::SRL_PARTS:		case ISD::SRL_PARTS:
return LowerShiftRightParts(Op, DAG);		return LowerShiftRightParts(Op, DAG);
case ISD::SELECT:		case ISD::SELECT:
return LowerSelect(Op, DAG);		return LowerSelect(Op, DAG);
		case ISD::FROUND:
		return LowerFROUND(Op, DAG);
default:		default:
llvm_unreachable("Custom lowering not defined for operation");		llvm_unreachable("Custom lowering not defined for operation");
}		}
}		}

SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {		SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
SDValue Op0 = Op->getOperand(0);		SDValue Op0 = Op->getOperand(0);
SDValue Op1 = Op->getOperand(1);		SDValue Op1 = Op->getOperand(1);
▲ Show 20 Lines • Show All 2,693 Lines • Show Last 20 Lines

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

Show First 20 Lines • Show All 2,996 Lines • ▼ Show 20 Lines	def : Pat<(ffloor Float16Regs:$a),
(CVT_f16_f16 Float16Regs:$a, CvtRMI)>;		(CVT_f16_f16 Float16Regs:$a, CvtRMI)>;
def : Pat<(ffloor Float32Regs:$a),		def : Pat<(ffloor Float32Regs:$a),
(CVT_f32_f32 Float32Regs:$a, CvtRMI_FTZ)>, Requires<[doF32FTZ]>;		(CVT_f32_f32 Float32Regs:$a, CvtRMI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(ffloor Float32Regs:$a),		def : Pat<(ffloor Float32Regs:$a),
(CVT_f32_f32 Float32Regs:$a, CvtRMI)>, Requires<[doNoF32FTZ]>;		(CVT_f32_f32 Float32Regs:$a, CvtRMI)>, Requires<[doNoF32FTZ]>;
def : Pat<(ffloor Float64Regs:$a),		def : Pat<(ffloor Float64Regs:$a),
(CVT_f64_f64 Float64Regs:$a, CvtRMI)>;		(CVT_f64_f64 Float64Regs:$a, CvtRMI)>;

def : Pat<(f16 (fround Float16Regs:$a)),
(CVT_f16_f16 Float16Regs:$a, CvtRNI)>;
def : Pat<(fround Float32Regs:$a),
(CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(f32 (fround Float32Regs:$a)),
(CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
def : Pat<(f64 (fround Float64Regs:$a)),
(CVT_f64_f64 Float64Regs:$a, CvtRNI)>;

def : Pat<(ftrunc Float16Regs:$a),		def : Pat<(ftrunc Float16Regs:$a),
(CVT_f16_f16 Float16Regs:$a, CvtRZI)>;		(CVT_f16_f16 Float16Regs:$a, CvtRZI)>;
def : Pat<(ftrunc Float32Regs:$a),		def : Pat<(ftrunc Float32Regs:$a),
(CVT_f32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;		(CVT_f32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(ftrunc Float32Regs:$a),		def : Pat<(ftrunc Float32Regs:$a),
(CVT_f32_f32 Float32Regs:$a, CvtRZI)>, Requires<[doNoF32FTZ]>;		(CVT_f32_f32 Float32Regs:$a, CvtRZI)>, Requires<[doNoF32FTZ]>;
def : Pat<(ftrunc Float64Regs:$a),		def : Pat<(ftrunc Float64Regs:$a),
(CVT_f64_f64 Float64Regs:$a, CvtRZI)>;		(CVT_f64_f64 Float64Regs:$a, CvtRZI)>;
▲ Show 20 Lines • Show All 119 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/f16-instructions.ll

	Show First 20 Lines • Show All 1,102 Lines • ▼ Show 20 Lines
	; CHECK: ret;			; CHECK: ret;
	define half @test_nearbyint(half %a) #0 {			define half @test_nearbyint(half %a) #0 {
	%r = call half @llvm.nearbyint.f16(half %a)			%r = call half @llvm.nearbyint.f16(half %a)
	ret half %r			ret half %r
	}			}

	; CHECK-LABEL: test_round(			; CHECK-LABEL: test_round(
	; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_round_param_0];			; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_round_param_0];
	; CHECK: cvt.rni.f16.f16 [[R:%h[0-9]+]], [[A]];			; llvm.round can't be replaced with cvt.rni. We will have cuda runnable tests
				; to check for codegen correctness.
				; CHECK-NOT: cvt.rni.f16.f16 [[R:%h[0-9]+]], [[A]];
	; CHECK: st.param.b16 [func_retval0+0], [[R]];			; CHECK: st.param.b16 [func_retval0+0], [[R]];
	; CHECK: ret;			; CHECK: ret;
	define half @test_round(half %a) #0 {			define half @test_round(half %a) #0 {
	%r = call half @llvm.round.f16(half %a)			%r = call half @llvm.round.f16(half %a)
	ret half %r			ret half %r
	}			}

	; CHECK-LABEL: test_fmuladd(			; CHECK-LABEL: test_fmuladd(
	Show All 19 Lines

llvm/test/CodeGen/NVPTX/f16x2-instructions.ll

	Show First 20 Lines • Show All 1,374 Lines • ▼ Show 20 Lines
	define <2 x half> @test_nearbyint(<2 x half> %a) #0 {			define <2 x half> @test_nearbyint(<2 x half> %a) #0 {
	%r = call <2 x half> @llvm.nearbyint.f16(<2 x half> %a)			%r = call <2 x half> @llvm.nearbyint.f16(<2 x half> %a)
	ret <2 x half> %r			ret <2 x half> %r
	}			}

	; CHECK-LABEL: test_round(			; CHECK-LABEL: test_round(
	; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_round_param_0];			; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_round_param_0];
	; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]];			; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]];
	; CHECK-DAG: cvt.rni.f16.f16 [[R1:%h[0-9]+]], [[A1]];			; llvm.round can't be replaced with cvt.rni. We will have cuda runnable tests
	; CHECK-DAG: cvt.rni.f16.f16 [[R0:%h[0-9]+]], [[A0]];			; to check for codegen correctness.
				; CHECK-NOT: cvt.rni.f16.f16 [[R1:%h[0-9]+]], [[A1]];
	; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}			; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
	; CHECK: st.param.b32 [func_retval0+0], [[R]];			; CHECK: st.param.b32 [func_retval0+0], [[R]];
	; CHECK: ret;			; CHECK: ret;
	define <2 x half> @test_round(<2 x half> %a) #0 {			define <2 x half> @test_round(<2 x half> %a) #0 {
	%r = call <2 x half> @llvm.round.f16(<2 x half> %a)			%r = call <2 x half> @llvm.round.f16(<2 x half> %a)
	ret <2 x half> %r			ret <2 x half> %r
	}			}

	▲ Show 20 Lines • Show All 47 Lines • Show Last 20 Lines

llvm/test/CodeGen/NVPTX/math-intrins.ll

Show First 20 Lines • Show All 68 Lines • ▼ Show 20 Lines	define double @floor_double(double %a) {
%b = call double @llvm.floor.f64(double %a)		%b = call double @llvm.floor.f64(double %a)
ret double %b		ret double %b
}		}

; ---- round ----		; ---- round ----

; CHECK-LABEL: round_float		; CHECK-LABEL: round_float
define float @round_float(float %a) {		define float @round_float(float %a) {
; CHECK: cvt.rni.f32.f32		; CHECK: cvt.rzi.f32.f32
		traUnsubmitted Not Done Reply Inline Actions Why do we end up with .rzi (round to nearest integer in the direction of zero) here? Wasn't this patch supposed to change rounding to be away from zero? tra: Why do we end up with .rzi (round to nearest integer in the direction of zero) here? Wasn't…
		bixiaAuthorUnsubmitted Done Reply Inline Actions The emulated implementation is eventually translated to cvt.rzi. I really don't have a good idea of how to modify this test; I was also thinking of changing the original CHECK to CHECK-NOT, or deleting the test. What do you suggest? bixia: The emulated implementation is eventually translated to cvt.rzi. I really don't have a good…
%b = call float @llvm.round.f32(float %a)		%b = call float @llvm.round.f32(float %a)
ret float %b		ret float %b
}		}

; CHECK-LABEL: round_float_ftz		; CHECK-LABEL: round_float_ftz
define float @round_float_ftz(float %a) #1 {		define float @round_float_ftz(float %a) #1 {
; CHECK: cvt.rni.ftz.f32.f32		; CHECK: cvt.rzi.ftz.f32.f32
%b = call float @llvm.round.f32(float %a)		%b = call float @llvm.round.f32(float %a)
ret float %b		ret float %b
}		}

; CHECK-LABEL: round_double		; CHECK-LABEL: round_double
define double @round_double(double %a) {		define double @round_double(double %a) {
; CHECK: cvt.rni.f64.f64		; CHECK: cvt.rzi.f64.f64
%b = call double @llvm.round.f64(double %a)		%b = call double @llvm.round.f64(double %a)
ret double %b		ret double %b
}		}

; ---- nearbyint ----		; ---- nearbyint ----

; CHECK-LABEL: nearbyint_float		; CHECK-LABEL: nearbyint_float
define float @nearbyint_float(float %a) {		define float @nearbyint_float(float %a) {
▲ Show 20 Lines • Show All 187 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[NVPTX] Fix the codegen for llvm.round.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 192674

llvm/lib/Target/NVPTX/NVPTXISelLowering.h

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

llvm/test/CodeGen/NVPTX/f16-instructions.ll

llvm/test/CodeGen/NVPTX/f16x2-instructions.ll

llvm/test/CodeGen/NVPTX/math-intrins.ll

This is an archive of the discontinued LLVM Phabricator instance.

[NVPTX] Fix the codegen for llvm.round.ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 192674

llvm/lib/Target/NVPTX/NVPTXISelLowering.h

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

llvm/test/CodeGen/NVPTX/f16-instructions.ll

llvm/test/CodeGen/NVPTX/f16x2-instructions.ll

llvm/test/CodeGen/NVPTX/math-intrins.ll

[NVPTX] Fix the codegen for llvm.round.
ClosedPublic