This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
lib/Target/AMDGPU/
-
Target/
-
AMDGPU/
-
SIISelLowering.h
9
SIISelLowering.cpp
-
test/CodeGen/AMDGPU/
-
CodeGen/
-
AMDGPU/
-
fp_to_sint.ll
-
fp_to_uint.ll
-
fptosi.f16.ll
-
fptoui.f16.ll
-
sitofp.f16.ll
-
uitofp.f16.ll

Differential D26767

[AMDGPU] Promote f16/i16 conversions to f32/i32 + custom lower f16 = fp_round f64
ClosedPublic

Authored by kzhuravl on Nov 16 2016, 1:44 PM.

Download Raw Diff

Details

Reviewers

• tstellarAMD
arsenm

Summary

I will split this change in 2 when submitting (it is easier to test everything in bulk):

Promote f16/i16 conversions to f32/i32
Custom lower f16 = fp_round f64

Testing done:

Conformance: half (passed), conversions (passed)
make check-all (passed)

Diff Detail

Event Timeline

kzhuravl updated this revision to Diff 78256.Nov 16 2016, 1:44 PM

kzhuravl retitled this revision from to [AMDGPU] Promote f16/i16 conversions to f32/i32 + custom lower f16 = fp_round f64.

kzhuravl updated this object.

kzhuravl added reviewers: • tstellarAMD, arsenm.

kzhuravl added a subscriber: llvm-commits.

Herald edited edge metadata. · View Herald TranscriptNov 16 2016, 1:44 PM

Herald added subscribers: tony-tye, yaxunl, nhaehnle, wdng. · View Herald Transcript

kzhuravl added a subscriber: b-sumner.Nov 16 2016, 1:44 PM

arsenm added inline comments.Nov 16 2016, 1:50 PM

lib/Target/AMDGPU/SIISelLowering.cpp
3664–3668	This should be very unnecessary

kzhuravl added inline comments.Nov 16 2016, 1:51 PM

lib/Target/AMDGPU/SIISelLowering.cpp
3664–3668	I am getting a "cannot select: i16 = bitcast i16". Do you have a suggestion on how to solve it? Thanks

arsenm added inline comments.Nov 16 2016, 1:53 PM

lib/Target/AMDGPU/SIISelLowering.cpp
3664–3668	Where is that coming from? I thought getNode folded out trivial bitcasts like this already

kzhuravl added inline comments.Nov 16 2016, 1:54 PM

lib/Target/AMDGPU/SIISelLowering.cpp
3664–3668	It is coming right after: https://github.com/llvm-mirror/llvm/blob/master/lib/Target/AMDGPU/AMDGPUISelLowering.cpp#L2100 in the optimized dag phase

arsenm added inline comments.Nov 16 2016, 1:59 PM

lib/Target/AMDGPU/SIISelLowering.cpp
3664–3668	I don't see how that would create a bitcast, or create one that somehow bypasses the no-op fold

kzhuravl added inline comments.Nov 16 2016, 2:47 PM

lib/Target/AMDGPU/SIISelLowering.cpp
3664–3668	It gets inserted in legalize phase, when legalizing `fp_round`: t16: f16 = fp_round t14, TargetConstant:i32<0> t19: i16 = bitcast t16 Then `fp_round` gets legalized to `(truncate to i16 (fp_to_fp16)`. Another approach is to legalize `fp_round` to `(bitcast to f16 (truncate to i16 (fp_to_fp16)`?

arsenm added inline comments.Nov 16 2016, 3:06 PM

lib/Target/AMDGPU/SIISelLowering.cpp
3664–3668	I'm confused. Is it i16 to i16 or i16 to f16?

kzhuravl added inline comments.Nov 16 2016, 3:09 PM

lib/Target/AMDGPU/SIISelLowering.cpp
3664–3668	in the end it is i16 to i16. after `fp_round` gets legalized: t21: i32 = fp_to_fp16 t14 t22: i16 = truncate t21 t19: i16 = bitcast t22

If fp_round has an integer type after legalization, the legalization for it is broken

Address review feedback

Herald edited edge metadata. · View Herald TranscriptNov 16 2016, 3:22 PM

In D26767#597992, @arsenm wrote:

If fp_round has an integer type after legalization, the legalization for it is broken

Agreed, I have bitcasted it to f16 and removed bitcast from combining. However I found that in some existing cases we legalize fp_to_fp16 to integer types (which got me confused), namely:
https://github.com/llvm-mirror/llvm/blob/master/lib/Target/AMDGPU/AMDGPUISelLowering.cpp#L2100

Also, why are our f16->f32 and f32->f16 conversions using i32?
https://github.com/llvm-mirror/llvm/blob/master/lib/Target/AMDGPU/VOP1Instructions.td#L132

In D26767#598009, @kzhuravl wrote:

In D26767#597992, @arsenm wrote:

If fp_round has an integer type after legalization, the legalization for it is broken

Agreed, I have bitcasted it to f16 and removed bitcast from combining. However I found that in some existing cases we legalize fp_to_fp16 to integer types (which got me confused), namely:
https://github.com/llvm-mirror/llvm/blob/master/lib/Target/AMDGPU/AMDGPUISelLowering.cpp#L2100

Also, why are our f16->f32 and f32->f16 conversions using i32?
https://github.com/llvm-mirror/llvm/blob/master/lib/Target/AMDGPU/VOP1Instructions.td#L132

fp_to_fp16 is not the same as fp_round. fp_to_fp16 is f16 stored in an integer type

LGTM

lib/Target/AMDGPU/SIISelLowering.cpp
2063–2064	This can return the result of getNode directly

This revision is now accepted and ready to land.Nov 16 2016, 6:00 PM

kzhuravl updated this object.Nov 16 2016, 7:59 PM

kzhuravl edited edge metadata.

rL287201, rL287203, rL287204

Revision Contents

Path

Size

lib/

Target/

AMDGPU/

SIISelLowering.h

7 lines

SIISelLowering.cpp

68 lines

test/

CodeGen/

AMDGPU/

6 lines

10 lines

29 lines

44 lines

30 lines

32 lines

Diff 78256

lib/Target/AMDGPU/SIISelLowering.h

Show First 20 Lines • Show All 50 Lines • ▼ Show 20 Lines	class SITargetLowering final : public AMDGPUTargetLowering {
SDValue getFPExtOrFPTrunc(SelectionDAG &DAG,		SDValue getFPExtOrFPTrunc(SelectionDAG &DAG,
SDValue Op,		SDValue Op,
const SDLoc &DL,		const SDLoc &DL,
EVT VT) const;		EVT VT) const;

/// \brief Custom lowering for ISD::ConstantFP.		/// \brief Custom lowering for ISD::ConstantFP.
SDValue lowerConstantFP(SDValue Op, SelectionDAG &DAG) const;		SDValue lowerConstantFP(SDValue Op, SelectionDAG &DAG) const;

/// \brief Custom lowering for ISD::FP_TO_SINT, ISD::FP_TO_UINT.		/// \brief Custom lowering for ISD::FP_ROUND for MVT::f16.
SDValue lowerFpToInt(SDValue Op, SelectionDAG &DAG) const;		SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;

/// \brief Custom lowering for ISD::SINT_TO_FP, ISD::UINT_TO_FP.
SDValue lowerIntToFp(SDValue Op, SelectionDAG &DAG) const;

SDValue getSegmentAperture(unsigned AS, SelectionDAG &DAG) const;		SDValue getSegmentAperture(unsigned AS, SelectionDAG &DAG) const;
SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const;		SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const;		SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const;

void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;		void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;

SDValue performUCharToFloatCombine(SDNode *N,		SDValue performUCharToFloatCombine(SDNode *N,
▲ Show 20 Lines • Show All 125 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIISelLowering.cpp

Show First 20 Lines • Show All 264 Lines • ▼ Show 20 Lines	if (Subtarget->has16BitInsts()) {

setTruncStoreAction(MVT::i64, MVT::i16, Expand);		setTruncStoreAction(MVT::i64, MVT::i16, Expand);

setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);		setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);		AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);		setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);		AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);

setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);		setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
setOperationAction(ISD::FP_TO_UINT, MVT::i16, Custom);		setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);		setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
setOperationAction(ISD::UINT_TO_FP, MVT::i16, Custom);		setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

// F16 - Constant Actions.		// F16 - Constant Actions.
setOperationAction(ISD::ConstantFP, MVT::f16, Custom);		setOperationAction(ISD::ConstantFP, MVT::f16, Custom);

// F16 - Load/Store Actions.		// F16 - Load/Store Actions.
setOperationAction(ISD::LOAD, MVT::f16, Promote);		setOperationAction(ISD::LOAD, MVT::f16, Promote);
AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);		AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
setOperationAction(ISD::STORE, MVT::f16, Promote);		setOperationAction(ISD::STORE, MVT::f16, Promote);
AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);		AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);

// F16 - VOP1 Actions.		// F16 - VOP1 Actions.
		setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
setOperationAction(ISD::FCOS, MVT::f16, Promote);		setOperationAction(ISD::FCOS, MVT::f16, Promote);
setOperationAction(ISD::FSIN, MVT::f16, Promote);		setOperationAction(ISD::FSIN, MVT::f16, Promote);
		setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote);
		setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote);
		setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote);
		setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote);

// F16 - VOP2 Actions.		// F16 - VOP2 Actions.
setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);		setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);		setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
setOperationAction(ISD::FMINNUM, MVT::f16, Legal);		setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
setOperationAction(ISD::FDIV, MVT::f16, Promote);		setOperationAction(ISD::FDIV, MVT::f16, Promote);

// F16 - VOP3 Actions.		// F16 - VOP3 Actions.
setOperationAction(ISD::FMA, MVT::f16, Legal);		setOperationAction(ISD::FMA, MVT::f16, Legal);
if (!Subtarget->hasFP16Denormals())		if (!Subtarget->hasFP16Denormals())
setOperationAction(ISD::FMAD, MVT::f16, Legal);		setOperationAction(ISD::FMAD, MVT::f16, Legal);
}		}

		setTargetDAGCombine(ISD::BITCAST);
setTargetDAGCombine(ISD::FADD);		setTargetDAGCombine(ISD::FADD);
setTargetDAGCombine(ISD::FSUB);		setTargetDAGCombine(ISD::FSUB);
setTargetDAGCombine(ISD::FMINNUM);		setTargetDAGCombine(ISD::FMINNUM);
setTargetDAGCombine(ISD::FMAXNUM);		setTargetDAGCombine(ISD::FMAXNUM);
setTargetDAGCombine(ISD::SMIN);		setTargetDAGCombine(ISD::SMIN);
setTargetDAGCombine(ISD::SMAX);		setTargetDAGCombine(ISD::SMAX);
setTargetDAGCombine(ISD::UMIN);		setTargetDAGCombine(ISD::UMIN);
setTargetDAGCombine(ISD::UMAX);		setTargetDAGCombine(ISD::UMAX);
▲ Show 20 Lines • Show All 1,511 Lines • ▼ Show 20 Lines	SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);		case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);		case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);		case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);		case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
case ISD::TRAP: return lowerTRAP(Op, DAG);		case ISD::TRAP: return lowerTRAP(Op, DAG);

case ISD::ConstantFP:		case ISD::ConstantFP:
return lowerConstantFP(Op, DAG);		return lowerConstantFP(Op, DAG);
case ISD::FP_TO_SINT:		case ISD::FP_ROUND:
case ISD::FP_TO_UINT:		return lowerFP_ROUND(Op, DAG);
return lowerFpToInt(Op, DAG);
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
return lowerIntToFp(Op, DAG);
}		}
return SDValue();		return SDValue();
}		}

/// \brief Helper function for LowerBRCOND		/// \brief Helper function for LowerBRCOND
static SDNode *findUser(SDValue Value, unsigned Opcode) {		static SDNode *findUser(SDValue Value, unsigned Opcode) {

SDNode *Parent = Value.getNode();		SDNode *Parent = Value.getNode();
▲ Show 20 Lines • Show All 195 Lines • ▼ Show 20 Lines	SDValue SITargetLowering::lowerConstantFP(SDValue Op, SelectionDAG &DAG) const {
if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(Op)) {		if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(Op)) {
return DAG.getConstant(FP->getValueAPF().bitcastToAPInt().getZExtValue(),		return DAG.getConstant(FP->getValueAPF().bitcastToAPInt().getZExtValue(),
SDLoc(Op), MVT::i32);		SDLoc(Op), MVT::i32);
}		}

return SDValue();		return SDValue();
}		}

SDValue SITargetLowering::lowerFpToInt(SDValue Op, SelectionDAG &DAG) const {		SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
EVT DstVT = Op.getValueType();		EVT DstVT = Op.getValueType();
EVT SrcVT = Op.getOperand(0).getValueType();		EVT SrcVT = Op.getOperand(0).getValueType();
if (DstVT == MVT::i64) {
return Op.getOpcode() == ISD::FP_TO_SINT ?
AMDGPUTargetLowering::LowerFP_TO_SINT(Op, DAG) :
AMDGPUTargetLowering::LowerFP_TO_UINT(Op, DAG);
}

if (SrcVT == MVT::f16)
return Op;

SDLoc DL(Op);
SDValue OrigSrc = Op.getOperand(0);
SDValue FPRoundFlag = DAG.getIntPtrConstant(0, DL);
SDValue FPRoundSrc =
DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, OrigSrc, FPRoundFlag);

return DAG.getNode(Op.getOpcode(), DL, DstVT, FPRoundSrc);
}

SDValue SITargetLowering::lowerIntToFp(SDValue Op, SelectionDAG &DAG) const {		assert(DstVT == MVT::f16 &&
EVT DstVT = Op.getValueType();		"Do not know how to custom lower FP_ROUND for non-f16 type");
EVT SrcVT = Op.getOperand(0).getValueType();
if (SrcVT == MVT::i64) {
return Op.getOpcode() == ISD::SINT_TO_FP ?
AMDGPUTargetLowering::LowerSINT_TO_FP(Op, DAG) :
AMDGPUTargetLowering::LowerUINT_TO_FP(Op, DAG);
}

if (DstVT == MVT::f16)		if (SrcVT != MVT::f64)
return Op;		return Op;

SDLoc DL(Op);		SDLoc DL(Op);
SDValue OrigSrc = Op.getOperand(0);		SDValue Src = Op.getOperand(0);
SDValue SExtOrZExtOrTruncSrc = Op.getOpcode() == ISD::SINT_TO_FP ?		SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
DAG.getSExtOrTrunc(OrigSrc, DL, MVT::i32) :		SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
DAG.getZExtOrTrunc(OrigSrc, DL, MVT::i32);		return Trunc;

return DAG.getNode(Op.getOpcode(), DL, DstVT, SExtOrZExtOrTruncSrc);
}		}
		arsenmUnsubmitted Not Done Reply Inline Actions This can be return of the getNode directly arsenm: This can be return of the getNode directly

SDValue SITargetLowering::getSegmentAperture(unsigned AS,		SDValue SITargetLowering::getSegmentAperture(unsigned AS,
SelectionDAG &DAG) const {		SelectionDAG &DAG) const {
SDLoc SL;		SDLoc SL;
MachineFunction &MF = DAG.getMachineFunction();		MachineFunction &MF = DAG.getMachineFunction();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();		SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
unsigned UserSGPR = Info->getQueuePtrUserSGPR();		unsigned UserSGPR = Info->getQueuePtrUserSGPR();
assert(UserSGPR != AMDGPU::NoRegister);		assert(UserSGPR != AMDGPU::NoRegister);
▲ Show 20 Lines • Show All 1,583 Lines • ▼ Show 20 Lines
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,		SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {		DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;		SelectionDAG &DAG = DCI.DAG;
SDLoc DL(N);		SDLoc DL(N);

switch (N->getOpcode()) {		switch (N->getOpcode()) {
default:		default:
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);		return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
		case ISD::BITCAST: {
		if (N->getValueType(0) == N->getOperand(0).getValueType())
		return N->getOperand(0);
		break;
		}
		arsenmUnsubmitted Not Done Reply Inline Actions This should be very unnecessary arsenm: This should be very unnecessary
		kzhuravlAuthorUnsubmitted Not Done Reply Inline Actions I am getting a "cannot select: i16 = bitcast i16". Do you have a suggestion on how to solve it? Thanks kzhuravl: I am getting a "cannot select: i16 = bitcast i16". Do you have a suggestion on how to solve it?
		arsenmUnsubmitted Not Done Reply Inline Actions Where is that coming from? I thought getNode folded out trivial bitcasts like this already arsenm: Where is that coming from? I thought getNode folded out trivial bitcasts like this already
		kzhuravlAuthorUnsubmitted Not Done Reply Inline Actions It is coming right after: https://github.com/llvm-mirror/llvm/blob/master/lib/Target/AMDGPU/AMDGPUISelLowering.cpp#L2100 in the optimized dag phase kzhuravl: It is coming right after: https://github.com/llvm…
		arsenmUnsubmitted Not Done Reply Inline Actions I don't see how that would create a bitcast, or create one that somehow bypasses the no-op fold arsenm: I don't see how that would create a bitcast, or create one that somehow bypasses the no-op fold
		kzhuravlAuthorUnsubmitted Not Done Reply Inline Actions It gets inserted in legalize phase, when legalizing `fp_round`: t16: f16 = fp_round t14, TargetConstant:i32<0> t19: i16 = bitcast t16 Then `fp_round` gets legalized to `(truncate to i16 (fp_to_fp16)`. Another approach is to legalize `fp_round` to `(bitcast to f16 (truncate to i16 (fp_to_fp16)`? kzhuravl: It gets inserted in legalize phase, when legalizing `fp_round`: ``` t16: f16 = fp_round t14…
		arsenmUnsubmitted Not Done Reply Inline Actions I'm confused. Is it i16 to i16 or i16 to f16? arsenm: I'm confused. Is it i16 to i16 or i16 to f16?
		kzhuravlAuthorUnsubmitted Not Done Reply Inline Actions in the end it is i16 to i16. after `fp_round` gets legalized: t21: i32 = fp_to_fp16 t14 t22: i16 = truncate t21 t19: i16 = bitcast t22 kzhuravl: in the end it is i16 to i16. after `fp_round` gets legalized: ``` t21: i32 =…
case ISD::SETCC:		case ISD::SETCC:
return performSetCCCombine(N, DCI);		return performSetCCCombine(N, DCI);
case ISD::FMAXNUM:		case ISD::FMAXNUM:
case ISD::FMINNUM:		case ISD::FMINNUM:
case ISD::SMAX:		case ISD::SMAX:
case ISD::SMIN:		case ISD::SMIN:
case ISD::UMAX:		case ISD::UMAX:
case ISD::UMIN:		case ISD::UMIN:
▲ Show 20 Lines • Show All 571 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/fp_to_sint.ll

	Show First 20 Lines • Show All 243 Lines • ▼ Show 20 Lines
	define void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 {			define void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 {
	%in.fabs = call float @llvm.fabs.f32(float %in)			%in.fabs = call float @llvm.fabs.f32(float %in)
	%conv = fptosi float %in.fabs to i1			%conv = fptosi float %in.fabs to i1
	store i1 %conv, i1 addrspace(1)* %out			store i1 %conv, i1 addrspace(1)* %out
	ret void			ret void
	}			}

	; FUNC-LABEL: {{^}}fp_to_sint_f32_i16:			; FUNC-LABEL: {{^}}fp_to_sint_f32_i16:
	; SI: v_cvt_i32_f32_e32 v[[VAL:[0-9]+]], s{{[0-9]+}}			; GCN: v_cvt_i32_f32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}}
	; VI: v_cvt_f16_f32_e32 v[[IN_F16:[0-9]+]], s{{[0-9]+}}			; GCN: buffer_store_short [[VAL]]
	; VI: v_cvt_i16_f16_e32 v[[VAL:[0-9]+]], v[[IN_F16]]
	; SI: buffer_store_short v[[VAL]]
	define void @fp_to_sint_f32_i16(i16 addrspace(1)* %out, float %in) #0 {			define void @fp_to_sint_f32_i16(i16 addrspace(1)* %out, float %in) #0 {
	%sint = fptosi float %in to i16			%sint = fptosi float %in to i16
	store i16 %sint, i16 addrspace(1)* %out			store i16 %sint, i16 addrspace(1)* %out
	ret void			ret void
	}			}

	attributes #0 = { nounwind }			attributes #0 = { nounwind }
	attributes #1 = { nounwind readnone }			attributes #1 = { nounwind readnone }

test/CodeGen/AMDGPU/fp_to_uint.ll

	Show First 20 Lines • Show All 234 Lines • ▼ Show 20 Lines
	define void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 {			define void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 {
	%in.fabs = call float @llvm.fabs.f32(float %in)			%in.fabs = call float @llvm.fabs.f32(float %in)
	%conv = fptoui float %in.fabs to i1			%conv = fptoui float %in.fabs to i1
	store i1 %conv, i1 addrspace(1)* %out			store i1 %conv, i1 addrspace(1)* %out
	ret void			ret void
	}			}

	; FUNC-LABEL: {{^}}fp_to_uint_f32_to_i16:			; FUNC-LABEL: {{^}}fp_to_uint_f32_to_i16:
	; SI: v_cvt_u32_f32_e32 v[[VAL:[0-9]+]], s{{[0-9]+}}			; The reason different instructions are used on SI and VI is because for
	; VI: v_cvt_f16_f32_e32 v[[IN_F16:[0-9]+]], s{{[0-9]+}}			; SI fp_to_uint is legalized by the type legalizer and for VI it is
	; VI: v_cvt_u16_f16_e32 v[[VAL:[0-9]+]], v[[IN_F16]]			; legalized by the dag legalizer and they legalize fp_to_uint differently.
	; GCN: buffer_store_short v[[VAL]]			; SI: v_cvt_u32_f32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}}
				; VI: v_cvt_i32_f32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}}
				; GCN: buffer_store_short [[VAL]]
	define void @fp_to_uint_f32_to_i16(i16 addrspace(1)* %out, float %in) #0 {			define void @fp_to_uint_f32_to_i16(i16 addrspace(1)* %out, float %in) #0 {
	%uint = fptoui float %in to i16			%uint = fptoui float %in to i16
	store i16 %uint, i16 addrspace(1)* %out			store i16 %uint, i16 addrspace(1)* %out
	ret void			ret void
	}			}

	attributes #0 = { nounwind }			attributes #0 = { nounwind }
	attributes #1 = { nounwind readnone }			attributes #1 = { nounwind readnone }

test/CodeGen/AMDGPU/fptosi.f16.ll

; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s \| FileCheck -check-prefix=GCN -check-prefix=SI %s		; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s \| FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-unsafe-fp-math < %s \| FileCheck -check-prefix=GCN -check-prefix=VI %s		; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-unsafe-fp-math < %s \| FileCheck -check-prefix=GCN -check-prefix=VI %s

; GCN-LABEL: {{^}}fptosi_f16_to_i16		; GCN-LABEL: {{^}}fptosi_f16_to_i16
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]		; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]		; GCN: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_i32_f32_e32 v[[R_I16:[0-9]+]], v[[A_F32]]		; GCN: v_cvt_i32_f32_e32 v[[R_I16:[0-9]+]], v[[A_F32]]
; VI: v_cvt_i16_f16_e32 v[[R_I16:[0-9]+]], v[[A_F16]]
; GCN: buffer_store_short v[[R_I16]]		; GCN: buffer_store_short v[[R_I16]]
; GCN: s_endpgm		; GCN: s_endpgm
define void @fptosi_f16_to_i16(		define void @fptosi_f16_to_i16(
i16 addrspace(1)* %r,		i16 addrspace(1)* %r,
half addrspace(1)* %a) {		half addrspace(1)* %a) {
entry:		entry:
%a.val = load half, half addrspace(1)* %a		%a.val = load half, half addrspace(1)* %a
%r.val = fptosi half %a.val to i16		%r.val = fptosi half %a.val to i16
Show All 32 Lines	entry:
%r.val = fptosi half %a.val to i64		%r.val = fptosi half %a.val to i64
store i64 %r.val, i64 addrspace(1)* %r		store i64 %r.val, i64 addrspace(1)* %r
ret void		ret void
}		}

; GCN-LABEL: {{^}}fptosi_v2f16_to_v2i16		; GCN-LABEL: {{^}}fptosi_v2f16_to_v2i16
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]		; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]		; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]		; GCN: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]		; GCN: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_i32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]]		; GCN: v_cvt_i32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]]
; SI: v_cvt_i32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]]		; GCN: v_cvt_i32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]]
; VI: v_cvt_i16_f16_e32 v[[R_I16_0:[0-9]+]], v[[A_V2_F16]]
; VI: v_cvt_i16_f16_e32 v[[R_I16_1:[0-9]+]], v[[A_F16_1]]
; GCN: v_and_b32_e32 v[[R_I16_LO:[0-9]+]], 0xffff, v[[R_I16_0]]		; GCN: v_and_b32_e32 v[[R_I16_LO:[0-9]+]], 0xffff, v[[R_I16_0]]
; GCN: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]]		; GCN: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]]
; GCN: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_LO]]		; GCN: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_LO]]
; GCN: buffer_store_dword v[[R_V2_I16]]		; GCN: buffer_store_dword v[[R_V2_I16]]
; GCN: s_endpgm		; GCN: s_endpgm
define void @fptosi_v2f16_to_v2i16(		define void @fptosi_v2f16_to_v2i16(
<2 x i16> addrspace(1)* %r,		<2 x i16> addrspace(1)* %r,
<2 x half> addrspace(1)* %a) {		<2 x half> addrspace(1)* %a) {
entry:		entry:
%a.val = load <2 x half>, <2 x half> addrspace(1)* %a		%a.val = load <2 x half>, <2 x half> addrspace(1)* %a
%r.val = fptosi <2 x half> %a.val to <2 x i16>		%r.val = fptosi <2 x half> %a.val to <2 x i16>
store <2 x i16> %r.val, <2 x i16> addrspace(1)* %r		store <2 x i16> %r.val, <2 x i16> addrspace(1)* %r
ret void		ret void
}		}

; GCN-LABEL: {{^}}fptosi_v2f16_to_v2i32		; GCN-LABEL: {{^}}fptosi_v2f16_to_v2i32
; GCN: buffer_load_dword		; GCN: buffer_load_dword
; GCN: v_cvt_f32_f16_e32		; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32		; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_i32_f32_e32		; GCN: v_cvt_i32_f32_e32
; GCN: v_cvt_i32_f32_e32		; GCN: v_cvt_i32_f32_e32
; GCN: buffer_store_dwordx2		; GCN: buffer_store_dwordx2
; GCN: s_endpgm		; GCN: s_endpgm
define void @fptosi_v2f16_to_v2i32(		define void @fptosi_v2f16_to_v2i32(
<2 x i32> addrspace(1)* %r,		<2 x i32> addrspace(1)* %r,
<2 x half> addrspace(1)* %a) {		<2 x half> addrspace(1)* %a) {
entry:		entry:
%a.val = load <2 x half>, <2 x half> addrspace(1)* %a		%a.val = load <2 x half>, <2 x half> addrspace(1)* %a
%r.val = fptosi <2 x half> %a.val to <2 x i32>		%r.val = fptosi <2 x half> %a.val to <2 x i32>
store <2 x i32> %r.val, <2 x i32> addrspace(1)* %r		store <2 x i32> %r.val, <2 x i32> addrspace(1)* %r
ret void		ret void
Show All 19 Lines

test/CodeGen/AMDGPU/fptoui.f16.ll

	; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s \| FileCheck -check-prefix=GCN -check-prefix=SI %s			; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s \| FileCheck -check-prefix=GCN -check-prefix=SI %s
	; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-unsafe-fp-math < %s \| FileCheck -check-prefix=GCN -check-prefix=VI %s			; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-unsafe-fp-math < %s \| FileCheck -check-prefix=GCN -check-prefix=VI %s

	; GCN-LABEL: {{^}}fptoui_f16_to_i16			; GCN-LABEL: {{^}}fptoui_f16_to_i16
	; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]			; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
	; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]			; GCN: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
	; SI: v_cvt_u32_f32_e32 v[[R_I16:[0-9]+]], v[[A_F32]]			; SI: v_cvt_u32_f32_e32 v[[R_I16:[0-9]+]], v[[A_F32]]
	; VI: v_cvt_u16_f16_e32 v[[R_I16:[0-9]+]], v[[A_F16]]			; VI: v_cvt_i32_f32_e32 v[[R_I16:[0-9]+]], v[[A_F32]]
	; GCN: buffer_store_short v[[R_I16]]			; GCN: buffer_store_short v[[R_I16]]
	; GCN: s_endpgm			; GCN: s_endpgm
	define void @fptoui_f16_to_i16(			define void @fptoui_f16_to_i16(
	i16 addrspace(1)* %r,			i16 addrspace(1)* %r,
	half addrspace(1)* %a) {			half addrspace(1)* %a) {
	entry:			entry:
	%a.val = load half, half addrspace(1)* %a			%a.val = load half, half addrspace(1)* %a
	%r.val = fptoui half %a.val to i16			%r.val = fptoui half %a.val to i16
	Show All 30 Lines
	entry:			entry:
	%a.val = load half, half addrspace(1)* %a			%a.val = load half, half addrspace(1)* %a
	%r.val = fptoui half %a.val to i64			%r.val = fptoui half %a.val to i64
	store i64 %r.val, i64 addrspace(1)* %r			store i64 %r.val, i64 addrspace(1)* %r
	ret void			ret void
	}			}

	; GCN-LABEL: {{^}}fptoui_v2f16_to_v2i16			; GCN-LABEL: {{^}}fptoui_v2f16_to_v2i16
	; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]			; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
	; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]			; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
	; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]			; GCN-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
	; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]			; GCN-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
	; SI: v_cvt_u32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]]			; SI: v_cvt_u32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]]
	; SI: v_cvt_u32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]]			; SI: v_cvt_u32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]]
	; VI: v_cvt_u16_f16_e32 v[[R_I16_0:[0-9]+]], v[[A_V2_F16]]			; VI: v_cvt_i32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]]
	; VI: v_cvt_u16_f16_e32 v[[R_I16_1:[0-9]+]], v[[A_F16_1]]			; VI: v_cvt_i32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]]
	; VI: v_and_b32_e32 v[[R_I16_LO:[0-9]+]], 0xffff, v[[R_I16_0]]
	; GCN: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]]			; GCN: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]]
	; SI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_0]]			; GCN: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_0]]
	; VI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_LO]]
	; GCN: buffer_store_dword v[[R_V2_I16]]			; GCN: buffer_store_dword v[[R_V2_I16]]
	; GCN: s_endpgm			; GCN: s_endpgm
	define void @fptoui_v2f16_to_v2i16(			define void @fptoui_v2f16_to_v2i16(
	<2 x i16> addrspace(1)* %r,			<2 x i16> addrspace(1)* %r,
	<2 x half> addrspace(1)* %a) {			<2 x half> addrspace(1)* %a) {
	entry:			entry:
	%a.val = load <2 x half>, <2 x half> addrspace(1)* %a			%a.val = load <2 x half>, <2 x half> addrspace(1)* %a
	%r.val = fptoui <2 x half> %a.val to <2 x i16>			%r.val = fptoui <2 x half> %a.val to <2 x i16>
	store <2 x i16> %r.val, <2 x i16> addrspace(1)* %r			store <2 x i16> %r.val, <2 x i16> addrspace(1)* %r
	ret void			ret void
	}			}

	; GCN-LABEL: {{^}}fptoui_v2f16_to_v2i32			; GCN-LABEL: {{^}}fptoui_v2f16_to_v2i32
	; GCN: buffer_load_dword			; GCN: buffer_load_dword
	; GCN: v_cvt_f32_f16_e32			; GCN: v_cvt_f32_f16_e32
	; GCN: v_cvt_f32_f16_e32			; GCN: v_cvt_f32_f16_e32
	; GCN: v_cvt_u32_f32_e32			; GCN: v_cvt_u32_f32_e32
	; GCN: v_cvt_u32_f32_e32			; GCN: v_cvt_u32_f32_e32
	; GCN: buffer_store_dwordx2			; GCN: buffer_store_dwordx2
	; GCN: s_endpgm			; GCN: s_endpgm
	define void @fptoui_v2f16_to_v2i32(			define void @fptoui_v2f16_to_v2i32(
	<2 x i32> addrspace(1)* %r,			<2 x i32> addrspace(1)* %r,
	<2 x half> addrspace(1)* %a) {			<2 x half> addrspace(1)* %a) {
	entry:			entry:
	%a.val = load <2 x half>, <2 x half> addrspace(1)* %a			%a.val = load <2 x half>, <2 x half> addrspace(1)* %a
	%r.val = fptoui <2 x half> %a.val to <2 x i32>			%r.val = fptoui <2 x half> %a.val to <2 x i32>
	store <2 x i32> %r.val, <2 x i32> addrspace(1)* %r			store <2 x i32> %r.val, <2 x i32> addrspace(1)* %r
	ret void			ret void
	Show All 19 Lines

test/CodeGen/AMDGPU/sitofp.f16.ll

; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s \| FileCheck -check-prefix=GCN -check-prefix=SI %s		; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s \| FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-unsafe-fp-math < %s \| FileCheck -check-prefix=GCN -check-prefix=VI %s		; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-unsafe-fp-math < %s \| FileCheck -check-prefix=GCN -check-prefix=VI %s

; GCN-LABEL: {{^}}sitofp_i16_to_f16		; GCN-LABEL: {{^}}sitofp_i16_to_f16
; GCN: buffer_load_{{sshort\|ushort}} v[[A_I16:[0-9]+]]		; GCN: buffer_load_{{sshort\|ushort}} v[[A_I16:[0-9]+]]
; SI: v_cvt_f32_i32_e32 v[[A_F32:[0-9]+]], v[[A_I16]]		; GCN: v_cvt_f32_i32_e32 v[[A_F32:[0-9]+]], v[[A_I16]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]		; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
; VI: v_cvt_f16_i16_e32 v[[R_F16:[0-9]+]], v[[A_I16]]
; GCN: buffer_store_short v[[R_F16]]		; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm		; GCN: s_endpgm
define void @sitofp_i16_to_f16(		define void @sitofp_i16_to_f16(
half addrspace(1)* %r,		half addrspace(1)* %r,
i16 addrspace(1)* %a) {		i16 addrspace(1)* %a) {
entry:		entry:
%a.val = load i16, i16 addrspace(1)* %a		%a.val = load i16, i16 addrspace(1)* %a
%r.val = sitofp i16 %a.val to half		%r.val = sitofp i16 %a.val to half
Show All 15 Lines	entry:
%r.val = sitofp i32 %a.val to half		%r.val = sitofp i32 %a.val to half
store half %r.val, half addrspace(1)* %r		store half %r.val, half addrspace(1)* %r
ret void		ret void
}		}

; f16 = sitofp i64 is in sint_to_fp.i64.ll		; f16 = sitofp i64 is in sint_to_fp.i64.ll

; GCN-LABEL: {{^}}sitofp_v2i16_to_v2f16		; GCN-LABEL: {{^}}sitofp_v2i16_to_v2f16
; GCN: buffer_load_dword v[[A_V2_I16:[0-9]+]]		; GCN: buffer_load_dword
; SI: v_bfe_i32 v[[A_I16_0:[0-9]+]], v[[A_V2_I16]], 0, 16		; GCN: v_cvt_f32_i32_e32
; SI: v_ashrrev_i32_e32 v[[A_I16_1:[0-9]+]], 16, v[[A_V2_I16]]		; GCN: v_cvt_f32_i32_e32
; SI: v_cvt_f32_i32_e32 v[[A_F32_1:[0-9]+]], v[[A_I16_1]]		; GCN: v_cvt_f16_f32_e32
; SI: v_cvt_f32_i32_e32 v[[A_F32_0:[0-9]+]], v[[A_I16_0]]		; GCN: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]		; GCN-DAG: v_lshlrev_b32_e32
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]]		; GCN-DAG: v_or_b32_e32
; VI: v_lshrrev_b32_e32 v[[A_I16_1:[0-9]+]], 16, v[[A_V2_I16]]		; GCN: buffer_store_dword
; VI: v_cvt_f16_i16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_I16]]
; VI: v_cvt_f16_i16_e32 v[[R_F16_1:[0-9]+]], v[[A_I16_1]]
; VI: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm		; GCN: s_endpgm
define void @sitofp_v2i16_to_v2f16(		define void @sitofp_v2i16_to_v2f16(
<2 x half> addrspace(1)* %r,		<2 x half> addrspace(1)* %r,
<2 x i16> addrspace(1)* %a) {		<2 x i16> addrspace(1)* %a) {
entry:		entry:
%a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a		%a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a
%r.val = sitofp <2 x i16> %a.val to <2 x half>		%r.val = sitofp <2 x i16> %a.val to <2 x half>
store <2 x half> %r.val, <2 x half> addrspace(1)* %r		store <2 x half> %r.val, <2 x half> addrspace(1)* %r
ret void		ret void
Show All 24 Lines

test/CodeGen/AMDGPU/uitofp.f16.ll

; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s		; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s		; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s

; GCN-LABEL: {{^}}uitofp_i16_to_f16		; GCN-LABEL: {{^}}uitofp_i16_to_f16
; GCN: buffer_load_ushort v[[A_I16:[0-9]+]]		; GCN: buffer_load_ushort v[[A_I16:[0-9]+]]
; SI: v_cvt_f32_u32_e32 v[[A_F32:[0-9]+]], v[[A_I16]]		; SI: v_cvt_f32_u32_e32 v[[A_F32:[0-9]+]], v[[A_I16]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]		; VI: v_cvt_f32_i32_e32 v[[A_F32:[0-9]+]], v[[A_I16]]
; VI: v_cvt_f16_u16_e32 v[[R_F16:[0-9]+]], v[[A_I16]]		; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
; GCN: buffer_store_short v[[R_F16]]		; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm		; GCN: s_endpgm
define void @uitofp_i16_to_f16(		define void @uitofp_i16_to_f16(
half addrspace(1)* %r,		half addrspace(1)* %r,
i16 addrspace(1)* %a) {		i16 addrspace(1)* %a) {
entry:		entry:
%a.val = load i16, i16 addrspace(1)* %a		%a.val = load i16, i16 addrspace(1)* %a
%r.val = uitofp i16 %a.val to half		%r.val = uitofp i16 %a.val to half
Show All 15 Lines	entry:
%r.val = uitofp i32 %a.val to half		%r.val = uitofp i32 %a.val to half
store half %r.val, half addrspace(1)* %r		store half %r.val, half addrspace(1)* %r
ret void		ret void
}		}

; f16 = uitofp i64 is in uint_to_fp.i64.ll		; f16 = uitofp i64 is in uint_to_fp.i64.ll

; GCN-LABEL: {{^}}uitofp_v2i16_to_v2f16		; GCN-LABEL: {{^}}uitofp_v2i16_to_v2f16
; GCN: buffer_load_dword v[[A_V2_I16:[0-9]+]]		; GCN: buffer_load_dword
; SI: s_mov_b32 s[[MASK:[0-9]+]], 0xffff{{$}}		; SI: v_cvt_f32_u32_e32
; SI: v_and_b32_e32 v[[A_I16_0:[0-9]+]], s[[MASK]], v[[A_V2_I16]]		; SI: v_cvt_f32_u32_e32
; GCN: v_lshrrev_b32_e32 v[[A_I16_1:[0-9]+]], 16, v[[A_V2_I16]]		; VI: v_cvt_f32_i32_e32
; SI: v_cvt_f32_u32_e32 v[[A_F32_1:[0-9]+]], v[[A_I16_1]]		; VI: v_cvt_f32_i32_e32
; SI: v_cvt_f32_u32_e32 v[[A_F32_0:[0-9]+]], v[[A_I16_0]]		; GCN: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]		; GCN: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]]		; GCN-DAG: v_and_b32_e32
; VI: v_cvt_f16_u16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_I16]]		; GCN-DAG: v_lshlrev_b32_e32
; VI: v_cvt_f16_u16_e32 v[[R_F16_1:[0-9]+]], v[[A_I16_1]]		; GCN-DAG: v_or_b32_e32
; VI: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]		; GCN: buffer_store_dword
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], s[[MASK]], v[[R_F16_0]]
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm		; GCN: s_endpgm
define void @uitofp_v2i16_to_v2f16(		define void @uitofp_v2i16_to_v2f16(
<2 x half> addrspace(1)* %r,		<2 x half> addrspace(1)* %r,
<2 x i16> addrspace(1)* %a) {		<2 x i16> addrspace(1)* %a) {
entry:		entry:
%a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a		%a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a
%r.val = uitofp <2 x i16> %a.val to <2 x half>		%r.val = uitofp <2 x i16> %a.val to <2 x half>
store <2 x half> %r.val, <2 x half> addrspace(1)* %r		store <2 x half> %r.val, <2 x half> addrspace(1)* %r
ret void		ret void
Show All 24 Lines