This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Custom lower vector_shuffle for v4i16/v4f16
ClosedPublic

Authored by arsenm on Jun 27 2019, 8:57 AM.

Download Raw Diff

Details

Reviewers

Summary

Ordinarily it is lowered as a build_vector of each extract_vector_elt,
which in turn get lowered to bitcasts and bit shifts. Very little code
understands the lowered extract pattern, resulting in much worse
code. We treat concat_vectors of v2i16 as legal, so prefer that.

Diff Detail

Event Timeline

arsenm created this revision.Jun 27 2019, 8:57 AM

Herald added subscribers: t-tye, tpr, dstuttard and 5 others. · View Herald TranscriptJun 27 2019, 8:57 AM

LGTM with small suggestion.

lib/Target/AMDGPU/SIISelLowering.cpp
4750	It's better to swap conditions. That way you will not read beyond the array even if you accidentally pass an odd index.

This revision is now accepted and ready to land.Jun 27 2019, 2:24 PM

Fix test failure

arsenm requested review of this revision.Jul 2 2019, 8:09 AM

LGTM

This revision is now accepted and ready to land.Jul 2 2019, 11:37 AM

r364959

Revision Contents

Path

Size

lib/

Target/

AMDGPU/

SIISelLowering.h

1 line

SIISelLowering.cpp

62 lines

test/

CodeGen/

AMDGPU/

vector_shuffle.packed.ll

272 lines

Diff 207564

lib/Target/AMDGPU/SIISelLowering.h

Show First 20 Lines • Show All 117 Lines • ▼ Show 20 Lines	private:
SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;		SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;

SDValue getSegmentAperture(unsigned AS, const SDLoc &DL,		SDValue getSegmentAperture(unsigned AS, const SDLoc &DL,
SelectionDAG &DAG) const;		SelectionDAG &DAG) const;

SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const;		SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;		SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;		SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
		SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;		SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const;		SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const;		SDValue lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const;

SDNode adjustWritemask(MachineSDNode &N, SelectionDAG &DAG) const;		SDNode adjustWritemask(MachineSDNode &N, SelectionDAG &DAG) const;

SDValue performUCharToFloatCombine(SDNode *N,		SDValue performUCharToFloatCombine(SDNode *N,
DAGCombinerInfo &DCI) const;		DAGCombinerInfo &DCI) const;
▲ Show 20 Lines • Show All 247 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 624 Lines • ▼ Show 20 Lines	if (Subtarget->hasVOP3PInsts()) {
setOperationAction(ISD::FMINNUM_IEEE, MVT::v2f16, Legal);		setOperationAction(ISD::FMINNUM_IEEE, MVT::v2f16, Legal);
setOperationAction(ISD::FMAXNUM_IEEE, MVT::v2f16, Legal);		setOperationAction(ISD::FMAXNUM_IEEE, MVT::v2f16, Legal);

setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal);		setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal);

setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);		setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);		setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);

		setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f16, Custom);
		setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);

setOperationAction(ISD::SHL, MVT::v4i16, Custom);		setOperationAction(ISD::SHL, MVT::v4i16, Custom);
setOperationAction(ISD::SRA, MVT::v4i16, Custom);		setOperationAction(ISD::SRA, MVT::v4i16, Custom);
setOperationAction(ISD::SRL, MVT::v4i16, Custom);		setOperationAction(ISD::SRL, MVT::v4i16, Custom);
setOperationAction(ISD::ADD, MVT::v4i16, Custom);		setOperationAction(ISD::ADD, MVT::v4i16, Custom);
setOperationAction(ISD::SUB, MVT::v4i16, Custom);		setOperationAction(ISD::SUB, MVT::v4i16, Custom);
setOperationAction(ISD::MUL, MVT::v4i16, Custom);		setOperationAction(ISD::MUL, MVT::v4i16, Custom);

setOperationAction(ISD::SMIN, MVT::v4i16, Custom);		setOperationAction(ISD::SMIN, MVT::v4i16, Custom);
▲ Show 20 Lines • Show All 3,311 Lines • ▼ Show 20 Lines	SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);		case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);		case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);		case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);		case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
case ISD::INSERT_VECTOR_ELT:		case ISD::INSERT_VECTOR_ELT:
return lowerINSERT_VECTOR_ELT(Op, DAG);		return lowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT:		case ISD::EXTRACT_VECTOR_ELT:
return lowerEXTRACT_VECTOR_ELT(Op, DAG);		return lowerEXTRACT_VECTOR_ELT(Op, DAG);
		case ISD::VECTOR_SHUFFLE:
		return lowerVECTOR_SHUFFLE(Op, DAG);
case ISD::BUILD_VECTOR:		case ISD::BUILD_VECTOR:
return lowerBUILD_VECTOR(Op, DAG);		return lowerBUILD_VECTOR(Op, DAG);
case ISD::FP_ROUND:		case ISD::FP_ROUND:
return lowerFP_ROUND(Op, DAG);		return lowerFP_ROUND(Op, DAG);
case ISD::TRAP:		case ISD::TRAP:
return lowerTRAP(Op, DAG);		return lowerTRAP(Op, DAG);
case ISD::DEBUGTRAP:		case ISD::DEBUGTRAP:
return lowerDEBUGTRAP(Op, DAG);		return lowerDEBUGTRAP(Op, DAG);
▲ Show 20 Lines • Show All 767 Lines • ▼ Show 20 Lines	SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
if (ResultVT == MVT::f16) {		if (ResultVT == MVT::f16) {
SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);		SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);		return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
}		}

return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);		return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
}		}

		static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
		assert(Elt % 2 == 0);
		return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
		rampitec (Unsubmitted, Not Done): It's better to swap conditions. That way you will not read beyond the array even if you accidentally pass an odd index.
		}

		// Custom lowering for ISD::VECTOR_SHUFFLE. The result is assembled two
		// elements at a time: each pair of result lanes becomes one two-element
		// piece (PackVT), and the pieces are joined with CONCAT_VECTORS.
		//
		// Op is cast to ShuffleVectorSDNode to read the shuffle mask; the returned
		// value has the same type as Op.
		SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
		SelectionDAG &DAG) const {
		SDLoc SL(Op);
		EVT ResultVT = Op.getValueType();
		ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);

		// Two-element piece type: v2i16 for integer results, v2f16 otherwise.
		EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;
		EVT EltVT = PackVT.getVectorElementType();
		// Element count of the first source operand. Mask indices below this
		// value select from operand 0; indices at or above it select from
		// operand 1 (after subtracting SrcNumElts).
		int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();

		// vector_shuffle <0,1,6,7> lhs, rhs
		// -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
		//
		// vector_shuffle <6,7,2,3> lhs, rhs
		// -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
		//
		// vector_shuffle <6,7,0,1> lhs, rhs
		// -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)

		// Avoid scalarizing when both halves are reading from consecutive elements.
		SmallVector<SDValue, 4> Pieces;
		// Walk the result lanes in pairs (I, I + 1).
		for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
		if (elementPairIsContiguous(SVN->getMask(), I)) {
		// The pair reads two consecutive source elements starting at an even
		// mask index, so it can be taken as a single two-element subvector.
		const int Idx = SVN->getMaskElt(I);
		int VecIdx = Idx < SrcNumElts ? 0 : 1;
		int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
		SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL,
		PackVT, SVN->getOperand(VecIdx),
		DAG.getConstant(EltIdx, SL, MVT::i32));
		Pieces.push_back(SubVec);
		} else {
		// Otherwise extract the two lanes individually (each may come from a
		// different operand) and pack them with a build vector.
		// NOTE(review): undef mask lanes are presumably negative here; such a
		// value takes the "operand 0" path and is used directly as the extract
		// index. Confirm that this is the intended handling of undef lanes.
		const int Idx0 = SVN->getMaskElt(I);
		const int Idx1 = SVN->getMaskElt(I + 1);
		int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
		int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
		int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
		int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;

		SDValue Vec0 = SVN->getOperand(VecIdx0);
		SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
		Vec0, DAG.getConstant(EltIdx0, SL, MVT::i32));

		SDValue Vec1 = SVN->getOperand(VecIdx1);
		SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
		Vec1, DAG.getConstant(EltIdx1, SL, MVT::i32));
		Pieces.push_back(DAG.getBuildVector(PackVT, SL, { Elt0, Elt1 }));
		}
		}

		// Join the two-element pieces back into the full result vector.
		return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
		}

SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,		SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
SelectionDAG &DAG) const {		SelectionDAG &DAG) const {
SDLoc SL(Op);		SDLoc SL(Op);
EVT VT = Op.getValueType();		EVT VT = Op.getValueType();

if (VT == MVT::v4i16 \|\| VT == MVT::v4f16) {		if (VT == MVT::v4i16 \|\| VT == MVT::v4f16) {
EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);		EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);

▲ Show 20 Lines • Show All 5,794 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/vector_shuffle.packed.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py		; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s \| FileCheck -check-prefix=GFX9 %s		; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s \| FileCheck -check-prefix=GFX9 %s

define <4 x half> @shuffle_v4f16_23uu(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v4f16_23uu(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_23uu:		; GFX9-LABEL: shuffle_v4f16_23uu:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off		; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v1		; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0		%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1		%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>		%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

define <4 x half> @shuffle_v4f16_234u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v4f16_234u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_234u:		; GFX9-LABEL: shuffle_v4f16_234u:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off		; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off		; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off
; GFX9-NEXT: s_waitcnt vmcnt(1)		; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5		; GFX9-NEXT: v_mov_b32_e32 v0, v5
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v5
; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0		%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1		%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 undef>		%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 undef>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

define <4 x half> @shuffle_v4f16_u1u3(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v4f16_u1u3(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
▲ Show 20 Lines • Show All 116 Lines • ▼ Show 20 Lines
}		}

define <4 x half> @shuffle_v4f16_0101(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v4f16_0101(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_0101:		; GFX9-LABEL: shuffle_v4f16_0101:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off		; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX9-NEXT: v_mov_b32_e32 v1, v0		; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0		%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1		%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>		%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

Show All 13 Lines
define <4 x half> @shuffle_v4f16_0145(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v4f16_0145(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_0145:		; GFX9-LABEL: shuffle_v4f16_0145:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off		; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off		; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX9-NEXT: v_and_b32_e32 v0, v2, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX9-NEXT: v_and_b32_e32 v1, v2, v1
; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0
; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0		%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1		%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>		%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

define <4 x half> @shuffle_v4f16_0167(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v4f16_0167(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_0167:		; GFX9-LABEL: shuffle_v4f16_0167:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off		; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off		; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff		; GFX9-NEXT: v_mov_b32_e32 v1, v2
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX9-NEXT: v_and_b32_e32 v0, v1, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; GFX9-NEXT: v_and_b32_e32 v1, v1, v2
; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0
; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0		%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1		%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>		%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

define <4 x half> @shuffle_v4f16_2301(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v4f16_2301(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_2301:		; GFX9-LABEL: shuffle_v4f16_2301:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off		; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1		; GFX9-NEXT: v_mov_b32_e32 v0, v2
; GFX9-NEXT: v_and_b32_e32 v1, v2, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GFX9-NEXT: v_and_b32_e32 v2, v2, v0
; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v1
; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0		%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1		%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 0, i32 1>		%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

define <4 x half> @shuffle_v4f16_2323(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v4f16_2323(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_2323:		; GFX9-LABEL: shuffle_v4f16_2323:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off		; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v1		; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0		%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1		%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 2, i32 3>		%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

define <4 x half> @shuffle_v4f16_2345(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v4f16_2345(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_2345:		; GFX9-LABEL: shuffle_v4f16_2345:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off		; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off		; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off
; GFX9-NEXT: s_waitcnt vmcnt(1)		; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff		; GFX9-NEXT: v_mov_b32_e32 v0, v5
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; GFX9-NEXT: v_and_b32_e32 v1, v0, v1
; GFX9-NEXT: v_and_b32_e32 v2, v0, v2
; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v1
; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0		%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1		%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>		%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

define <4 x half> @shuffle_v4f16_2367(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v4f16_2367(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_2367:		; GFX9-LABEL: shuffle_v4f16_2367:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off		; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off		; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1		; GFX9-NEXT: v_mov_b32_e32 v0, v5
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; GFX9-NEXT: v_and_b32_e32 v1, v0, v1
; GFX9-NEXT: v_and_b32_e32 v3, v0, v3
; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v1
; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0		%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1		%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>		%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

define <4 x half> @shuffle_v4f16_4501(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v4f16_4501(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_4501:		; GFX9-LABEL: shuffle_v4f16_4501:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off		; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff		; GFX9-NEXT: v_mov_b32_e32 v1, v4
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX9-NEXT: v_and_b32_e32 v1, v2, v1
; GFX9-NEXT: v_and_b32_e32 v2, v2, v0
; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v1
; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0		%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1		%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 0, i32 1>		%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

define <4 x half> @shuffle_v4f16_4523(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v4f16_4523(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_4523:		; GFX9-LABEL: shuffle_v4f16_4523:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off		; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off		; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1		; GFX9-NEXT: v_mov_b32_e32 v0, v2
; GFX9-NEXT: v_and_b32_e32 v1, v0, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; GFX9-NEXT: v_and_b32_e32 v2, v0, v2
; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v2
; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0		%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1		%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>		%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

define <4 x half> @shuffle_v4f16_4545(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v4f16_4545(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_4545:		; GFX9-LABEL: shuffle_v4f16_4545:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off		; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX9-NEXT: v_mov_b32_e32 v1, v0		; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0		%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1		%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 4, i32 5>		%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 4, i32 5>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

Show All 9 Lines	; GFX9-NEXT: s_setpc_b64 s[30:31]
%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>		%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

define <4 x half> @shuffle_v4f16_6701(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v4f16_6701(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_6701:		; GFX9-LABEL: shuffle_v4f16_6701:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off		; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff		; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0		; GFX9-NEXT: v_mov_b32_e32 v1, v4
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; GFX9-NEXT: v_and_b32_e32 v2, v1, v2
; GFX9-NEXT: v_and_b32_e32 v1, v1, v0
; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v2
; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0		%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1		%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 0, i32 1>		%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

define <4 x half> @shuffle_v4f16_6723(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v4f16_6723(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_6723:		; GFX9-LABEL: shuffle_v4f16_6723:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off		; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off		; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1		; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: v_and_b32_e32 v1, v0, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; GFX9-NEXT: v_and_b32_e32 v3, v0, v3
; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v3
; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0		%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1		%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>		%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

define <4 x half> @shuffle_v4f16_6745(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v4f16_6745(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_6745:		; GFX9-LABEL: shuffle_v4f16_6745:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off		; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off
; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1		; GFX9-NEXT: v_mov_b32_e32 v0, v2
; GFX9-NEXT: v_and_b32_e32 v1, v2, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GFX9-NEXT: v_and_b32_e32 v2, v2, v0
; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v1
; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0		%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1		%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 4, i32 5>		%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 4, i32 5>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

define <4 x half> @shuffle_v4f16_6767(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v4f16_6767(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_6767:		; GFX9-LABEL: shuffle_v4f16_6767:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off		; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v1		; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0		%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1		%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 6, i32 7>		%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 6, i32 7>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

define <4 x half> @shuffle_v4f16_2356(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v4f16_2356(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_2356:		; GFX9-LABEL: shuffle_v4f16_2356:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off		; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off		; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT: s_waitcnt vmcnt(1)		; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX9-NEXT: v_and_b32_e32 v1, v0, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1		; GFX9-NEXT: v_and_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v1		; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0
; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v2		; GFX9-NEXT: v_mov_b32_e32 v0, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0		%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1		%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6>		%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

define <4 x half> @shuffle_v4f16_5623(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v4f16_5623(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_5623:		; GFX9-LABEL: shuffle_v4f16_5623:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off		; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off		; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
; GFX9-NEXT: s_waitcnt vmcnt(1)		; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff		; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX9-NEXT: v_and_b32_e32 v1, v0, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1		; GFX9-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v2		; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0
; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0		%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1		%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 2, i32 3>		%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 2, i32 3>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

define <4 x half> @shuffle_v4f16_3456(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v4f16_3456(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
▲ Show 20 Lines • Show All 56 Lines • ▼ Show 20 Lines	; GFX9-NEXT: s_setpc_b64 s[30:31]
%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 7, i32 3, i32 4>		%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 7, i32 3, i32 4>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

define <4 x i16> @shuffle_v4i16_2356(<4 x i16> addrspace(1)* %arg0, <4 x i16> addrspace(1)* %arg1) {		define <4 x i16> @shuffle_v4i16_2356(<4 x i16> addrspace(1)* %arg0, <4 x i16> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4i16_2356:		; GFX9-LABEL: shuffle_v4i16_2356:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off		; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off		; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff		; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_bfi_b32 v0, v4, v1, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_sdwa v1, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1		; GFX9-NEXT: v_and_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1		; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0
		; GFX9-NEXT: v_mov_b32_e32 v0, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0		%val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0
%val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1		%val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1
%shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6>		%shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6>
ret <4 x i16> %shuffle		ret <4 x i16> %shuffle
}		}

define <4 x i16> @shuffle_v4i16_0167(<4 x i16> addrspace(1)* %arg0, <4 x i16> addrspace(1)* %arg1) {		define <4 x i16> @shuffle_v4i16_0167(<4 x i16> addrspace(1)* %arg0, <4 x i16> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4i16_0167:		; GFX9-LABEL: shuffle_v4i16_0167:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off		; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off		; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff		; GFX9-NEXT: v_mov_b32_e32 v1, v2
; GFX9-NEXT: v_bfi_b32 v0, v1, v0, v0
; GFX9-NEXT: v_bfi_b32 v1, v1, v2, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0		%val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0
%val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1		%val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1
%shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>		%shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
ret <4 x i16> %shuffle		ret <4 x i16> %shuffle
}		}

define <4 x half> @shuffle_v4f16_0000(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v4f16_0000(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
▲ Show 20 Lines • Show All 66 Lines • ▼ Show 20 Lines	; GFX9-NEXT: s_setpc_b64 s[30:31]
%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 1, i32 6, i32 1>		%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 1, i32 6, i32 1>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

define <4 x half> @shuffle_v4f16_2333(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v4f16_2333(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_2333:		; GFX9-LABEL: shuffle_v4f16_2333:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off		; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1		; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX9-NEXT: v_and_b32_e32 v0, v2, v1		; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v0
; GFX9-NEXT: v_and_b32_e32 v1, v2, v3		; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1
; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0		; GFX9-NEXT: v_mov_b32_e32 v0, v2
; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0		%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1		%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>		%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

define <4 x half> @shuffle_v4f16_6667(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v4f16_6667(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_6667:		; GFX9-LABEL: shuffle_v4f16_6667:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off		; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1		; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX9-NEXT: v_and_b32_e32 v0, v2, v1		; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v0
; GFX9-NEXT: v_and_b32_e32 v1, v2, v3		; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1
; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0		; GFX9-NEXT: v_mov_b32_e32 v0, v2
; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0		%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1		%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>		%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

define <4 x half> @shuffle_v8f16_0101(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v8f16_0101(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v8f16_0101:		; GFX9-LABEL: shuffle_v8f16_0101:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v[0:1], off		; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX9-NEXT: v_mov_b32_e32 v1, v0		; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0		%val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
%val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1		%val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
%shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>		%shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

Show All 11 Lines
}		}

define <4 x half> @shuffle_v8f16_4589(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v8f16_4589(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v8f16_4589:		; GFX9-LABEL: shuffle_v8f16_4589:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:8		; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:8
; GFX9-NEXT: global_load_dword v1, v[2:3], off		; GFX9-NEXT: global_load_dword v1, v[2:3], off
; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX9-NEXT: v_and_b32_e32 v0, v2, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX9-NEXT: v_and_b32_e32 v1, v2, v1
; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0
; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0		%val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
%val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1		%val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
%shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 4, i32 5, i32 8, i32 9>		%shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 4, i32 5, i32 8, i32 9>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

define <4 x half> @shuffle_v8f16_10_11_2_3(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v8f16_10_11_2_3(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v8f16_10_11_2_3:		; GFX9-LABEL: shuffle_v8f16_10_11_2_3:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4		; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4
; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4		; GFX9-NEXT: global_load_dword v0, v[2:3], off offset:4
; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX9-NEXT: v_and_b32_e32 v1, v2, v1
; GFX9-NEXT: v_and_b32_e32 v2, v2, v0
; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v1
; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0		%val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
%val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1		%val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
%shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 10, i32 11, i32 2, i32 3>		%shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 10, i32 11, i32 2, i32 3>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

define <4 x half> @shuffle_v8f16_13_14_2_3(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v8f16_13_14_2_3(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v8f16_13_14_2_3:		; GFX9-LABEL: shuffle_v8f16_13_14_2_3:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4		; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4
; GFX9-NEXT: global_load_dwordx4 v[0:3], v[2:3], off		; GFX9-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff		; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v4		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v4, v0, v4		; GFX9-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1		; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0
; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v4
; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0		%val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
%val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1		%val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
%shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 13, i32 14, i32 2, i32 3>		%shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 13, i32 14, i32 2, i32 3>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

define <4 x half> @shuffle_v3f16_0122(<3 x half> addrspace(1)* %arg0, <3 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v3f16_0122(<3 x half> addrspace(1)* %arg0, <3 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v3f16_0122:		; GFX9-LABEL: shuffle_v3f16_0122:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off		; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0		; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1
; GFX9-NEXT: v_and_b32_e32 v0, v2, v0
; GFX9-NEXT: v_and_b32_e32 v2, v2, v1
; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0
; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2		; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <3 x half>, <3 x half> addrspace(1)* %arg0		%val0 = load <3 x half>, <3 x half> addrspace(1)* %arg0
%val1 = load <3 x half>, <3 x half> addrspace(1)* %arg1		%val1 = load <3 x half>, <3 x half> addrspace(1)* %arg1
%shuffle = shufflevector <3 x half> %val0, <3 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 2>		%shuffle = shufflevector <3 x half> %val0, <3 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

Show All 12 Lines	; GFX9-NEXT: s_setpc_b64 s[30:31]
%shuffle = shufflevector <2 x half> %val0, <2 x half> %val1, <4 x i32> <i32 0, i32 1, i32 1, i32 0>		%shuffle = shufflevector <2 x half> %val0, <2 x half> %val1, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

define <6 x half> @shuffle_v6f16_452367(<6 x half> addrspace(1)* %arg0, <6 x half> addrspace(1)* %arg1) {		define <6 x half> @shuffle_v6f16_452367(<6 x half> addrspace(1)* %arg0, <6 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v6f16_452367:		; GFX9-LABEL: shuffle_v6f16_452367:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx3 v[4:6], v[0:1], off		; GFX9-NEXT: v_mov_b32_e32 v4, v3
; GFX9-NEXT: global_load_dword v2, v[2:3], off		; GFX9-NEXT: v_mov_b32_e32 v3, v2
; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff		; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off
		; GFX9-NEXT: global_load_dword v3, v[3:4], off
; GFX9-NEXT: s_waitcnt vmcnt(1)		; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v5		; GFX9-NEXT: v_mov_b32_e32 v0, v2
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v6
; GFX9-NEXT: v_and_b32_e32 v3, v0, v6
; GFX9-NEXT: v_and_b32_e32 v5, v0, v5
; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v3
; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v5
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
		; GFX9-NEXT: v_mov_b32_e32 v2, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <6 x half>, <6 x half> addrspace(1)* %arg0		%val0 = load <6 x half>, <6 x half> addrspace(1)* %arg0
%val1 = load <6 x half>, <6 x half> addrspace(1)* %arg1		%val1 = load <6 x half>, <6 x half> addrspace(1)* %arg1
%shuffle = shufflevector <6 x half> %val0, <6 x half> %val1, <6 x i32> <i32 4, i32 5, i32 2, i32 3, i32 6, i32 7>		%shuffle = shufflevector <6 x half> %val0, <6 x half> %val1, <6 x i32> <i32 4, i32 5, i32 2, i32 3, i32 6, i32 7>
ret <6 x half> %shuffle		ret <6 x half> %shuffle
}		}

define amdgpu_kernel void @fma_shuffle(<4 x half> addrspace(1)* nocapture readonly %A, <4 x half> addrspace(1)* nocapture readonly %B, <4 x half> addrspace(1)* nocapture %C) {		define amdgpu_kernel void @fma_shuffle(<4 x half> addrspace(1)* nocapture readonly %A, <4 x half> addrspace(1)* nocapture readonly %B, <4 x half> addrspace(1)* nocapture %C) {
Show All 15 Lines
; GFX9-NEXT: global_load_dwordx2 v[6:7], v[4:5], off		; GFX9-NEXT: global_load_dwordx2 v[6:7], v[4:5], off
; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off		; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off		; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_fma_f16 v6, v0, v2, v6 op_sel_hi:[0,1,1]		; GFX9-NEXT: v_pk_fma_f16 v6, v0, v2, v6 op_sel_hi:[0,1,1]
; GFX9-NEXT: v_pk_fma_f16 v2, v1, v2, v7 op_sel_hi:[0,1,1]		; GFX9-NEXT: v_pk_fma_f16 v2, v1, v2, v7 op_sel_hi:[0,1,1]
; GFX9-NEXT: v_pk_fma_f16 v0, v0, v3, v6 op_sel:[1,0,0]		; GFX9-NEXT: v_pk_fma_f16 v0, v0, v3, v6 op_sel:[1,0,0]
; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0]		; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0]
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0
; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off		; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
; GFX9-NEXT: s_endpgm		; GFX9-NEXT: s_endpgm
entry:		entry:
%tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x()		%tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp12 = zext i32 %tmp1 to i64		%tmp12 = zext i32 %tmp1 to i64
%arrayidx = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %A, i64 %tmp12		%arrayidx = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %A, i64 %tmp12
%tmp14 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx, align 8		%tmp14 = load <4 x half>, <4 x half> addrspace(1)* %arrayidx, align 8
%arrayidx1 = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %B, i64 %tmp12		%arrayidx1 = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %B, i64 %tmp12
Show All 15 Lines	entry:
%tmp29 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 3, i32 3>		%tmp29 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 3, i32 3>
%tmp30 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp29, <2 x half> %tmp22, <2 x half> %tmp28)		%tmp30 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp29, <2 x half> %tmp22, <2 x half> %tmp28)
%tmp31 = shufflevector <2 x half> %tmp30, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>		%tmp31 = shufflevector <2 x half> %tmp30, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
%tmp32 = shufflevector <4 x half> %tmp25, <4 x half> %tmp31, <4 x i32> <i32 0, i32 1, i32 4, i32 5>		%tmp32 = shufflevector <4 x half> %tmp25, <4 x half> %tmp31, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
store <4 x half> %tmp32, <4 x half> addrspace(1)* %arrayidx2, align 8		store <4 x half> %tmp32, <4 x half> addrspace(1)* %arrayidx2, align 8
ret void		ret void
}		}

		define <4 x half> @shuffle_v4f16_0456(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
		; GFX9-LABEL: shuffle_v4f16_0456:
		; GFX9: ; %bb.0:
		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
		; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
		; GFX9-NEXT: s_waitcnt vmcnt(0)
		; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off
		; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff
		; GFX9-NEXT: v_and_b32_e32 v0, v3, v0
		; GFX9-NEXT: s_waitcnt vmcnt(0)
		; GFX9-NEXT: v_and_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
		; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
		; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v3
		; GFX9-NEXT: s_setpc_b64 s[30:31]
		%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
		%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
		%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
		ret <4 x half> %shuffle
		}

declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #0		declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #0
declare i32 @llvm.amdgcn.workitem.id.x() #0		declare i32 @llvm.amdgcn.workitem.id.x() #0

attributes #0 = { nounwind readnone speculatable }		attributes #0 = { nounwind readnone speculatable }