Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -137,6 +137,10 @@
     return false;
   }

+  static inline SDValue stripBitcast(SDValue Val) {
+    return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
+  }
+
   static bool allUsesHaveSourceMods(const SDNode *N,
                                     unsigned CostThreshold = 4);
   bool isFAbsFree(EVT VT) const override;
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3144,6 +3144,28 @@
     }
   }

+  // Equivalent of the above for accessing the high element of a vector as an
+  // integer operation.
+  // trunc (srl (bitcast (build_vector x, y)), 16) -> trunc (bitcast y)
+  if (Src.getOpcode() == ISD::SRL) {
+    if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
+      if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
+        SDValue BV = stripBitcast(Src.getOperand(0));
+        if (BV.getOpcode() == ISD::BUILD_VECTOR &&
+            BV.getValueType().getVectorNumElements() == 2) {
+          SDValue SrcElt = BV.getOperand(1);
+          EVT SrcEltVT = SrcElt.getValueType();
+          if (SrcEltVT.isFloatingPoint()) {
+            SrcElt = DAG.getNode(ISD::BITCAST, SL,
+                                 SrcEltVT.changeTypeToInteger(), SrcElt);
+          }
+
+          return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
+        }
+      }
+    }
+  }
+
   // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
   //
   // i16 (trunc (srl i64:x, K)), K <= 16 ->
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -84,6 +84,7 @@
   SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const;

   SDNode *adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -288,13 +288,24 @@
   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);

+  setOperationAction(ISD::BUILD_VECTOR, MVT::v4f16, Custom);
+  setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
+
   // Avoid stack access for these.
   // TODO: Generalize to more vector types.
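+  // Note: a v4i16/v4f16 value fits in a 64-bit register pair, so element
+  // access can be lowered to shift/mask operations on the underlying
+  // integer rather than a scratch (stack) round trip; conceptually,
+  // extracting element i of the packed bits B is (uint16_t)(B >> (i * 16)).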
   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
+  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
+  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
+
   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i16, Custom);
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f16, Custom);
+
   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
   // and output demarshalling
   setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
@@ -3333,6 +3344,8 @@
     return lowerINSERT_VECTOR_ELT(Op, DAG);
   case ISD::EXTRACT_VECTOR_ELT:
     return lowerEXTRACT_VECTOR_ELT(Op, DAG);
+  case ISD::BUILD_VECTOR:
+    return lowerBUILD_VECTOR(Op, DAG);
   case ISD::FP_ROUND:
     return lowerFP_ROUND(Op, DAG);
   case ISD::TRAP:
@@ -4157,34 +4170,72 @@
 SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
+  SDValue Vec = Op.getOperand(0);
+  SDValue InsVal = Op.getOperand(1);
   SDValue Idx = Op.getOperand(2);
+  EVT VecVT = Vec.getValueType();
+
+  assert(VecVT.getScalarSizeInBits() == 16);
+
+  unsigned NumElts = VecVT.getVectorNumElements();
+  SDLoc SL(Op);
+  auto KIdx = dyn_cast<ConstantSDNode>(Idx);
+
+  if (NumElts == 4 && KIdx) {
+    SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
+
+    SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
+                                 DAG.getConstant(0, SL, MVT::i32));
+    SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
+                                 DAG.getConstant(1, SL, MVT::i32));
+
+    SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
+    SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
+
+    unsigned Idx = KIdx->getZExtValue();
+    bool InsertLo = Idx < 2;
+    SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
+      InsertLo ? LoVec : HiVec,
+      DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
+      DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
+
+    InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
+
+    SDValue Concat = InsertLo ?
+      DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
+      DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
+
+    return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
+  }
+
+  assert(NumElts == 2 || NumElts == 4);
+
   if (isa<ConstantSDNode>(Idx))
     return SDValue();

+  EVT IntVT = NumElts == 2 ? MVT::i32 : MVT::i64;
+
   // Avoid stack access for dynamic indexing.
-  SDLoc SL(Op);
-  SDValue Vec = Op.getOperand(0);
-  SDValue Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Op.getOperand(1));
+  SDValue Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal);

   // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
-  SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Val);
+  SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Val);

   // Convert vector index to bit-index.
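+  // Scalar sketch of the masking below (illustration only): the bit-index
+  // is Idx * 16, the field mask is BFM = 0xffff << (Idx * 16), and
+  // v_bfi_b32 computes the select-by-mask (BFM & A) | (~BFM & B), taking
+  // the new element bits from A and the surrounding lanes from B.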
SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, DAG.getConstant(4, SL, MVT::i32)); - SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec); - - SDValue BFM = DAG.getNode(ISD::SHL, SL, MVT::i32, - DAG.getConstant(0xffff, SL, MVT::i32), + SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec); + SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT, + DAG.getConstant(0xffff, SL, IntVT), ScaledIdx); - SDValue LHS = DAG.getNode(ISD::AND, SL, MVT::i32, BFM, ExtVal); - SDValue RHS = DAG.getNode(ISD::AND, SL, MVT::i32, - DAG.getNOT(SL, BFM, MVT::i32), BCVec); + SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal); + SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT, + DAG.getNOT(SL, BFM, IntVT), BCVec); - SDValue BFI = DAG.getNode(ISD::OR, SL, MVT::i32, LHS, RHS); - return DAG.getNode(ISD::BITCAST, SL, Op.getValueType(), BFI); + SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS); + return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI); } SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, @@ -4194,6 +4245,9 @@ EVT ResultVT = Op.getValueType(); SDValue Vec = Op.getOperand(0); SDValue Idx = Op.getOperand(1); + EVT VecVT = Vec.getValueType(); + unsigned NumElts = VecVT.getVectorNumElements(); + assert(VecVT.getScalarSizeInBits() == 16 && (NumElts == 2 || NumElts == 4)); DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr); @@ -4204,19 +4258,43 @@ if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI)) return Combined; + EVT IntVT = NumElts == 2 ? MVT::i32 : MVT::i64; SDValue Four = DAG.getConstant(4, SL, MVT::i32); // Convert vector index to bit-index (* 16) SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, Four); - SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec); - SDValue Elt = DAG.getNode(ISD::SRL, SL, MVT::i32, BC, ScaledIdx); + SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec); + SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx); + + if (ResultVT == MVT::f16) { + SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt); + return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result); + } + + return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT); +} + +SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op, + SelectionDAG &DAG) const { + SDLoc SL(Op); + EVT VT = Op.getValueType(); + assert(VT == MVT::v4i16 || VT == MVT::v4f16); + + EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2); + + // Turn into pair of packed build_vectors. + // TODO: Special case for constants that can be materialized with s_mov_b64. + SDValue Lo = DAG.getBuildVector(HalfVT, SL, + { Op.getOperand(0), Op.getOperand(1) }); + SDValue Hi = DAG.getBuildVector(HalfVT, SL, + { Op.getOperand(2), Op.getOperand(3) }); - SDValue Result = Elt; - if (ResultVT.bitsLT(MVT::i32)) - Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result); + SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo); + SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi); - return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result); + SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi }); + return DAG.getNode(ISD::BITCAST, SL, VT, Blend); } bool Index: test/CodeGen/AMDGPU/extload-align.ll =================================================================== --- test/CodeGen/AMDGPU/extload-align.ll +++ test/CodeGen/AMDGPU/extload-align.ll @@ -7,17 +7,17 @@ ; size and not 4 corresponding to the sign-extended size (i32). 
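+; The accesses below are volatile so they cannot be merged into wider
+; loads/stores; only the 2-byte memory operand on the extending load is
+; being checked here.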
; DEBUG: {{^}}# Machine code for function extload_align: -; DEBUG: (load 2, addrspace 5) +; DEBUG: (volatile load 2 from %ir.a, addrspace 5) ; DEBUG: {{^}}# End machine code for function extload_align. define amdgpu_kernel void @extload_align(i32 addrspace(5)* %out, i32 %index) #0 { %v0 = alloca [4 x i16], addrspace(5) %a1 = getelementptr inbounds [4 x i16], [4 x i16] addrspace(5)* %v0, i32 0, i32 0 %a2 = getelementptr inbounds [4 x i16], [4 x i16] addrspace(5)* %v0, i32 0, i32 1 - store i16 0, i16 addrspace(5)* %a1 - store i16 1, i16 addrspace(5)* %a2 + store volatile i16 0, i16 addrspace(5)* %a1 + store volatile i16 1, i16 addrspace(5)* %a2 %a = getelementptr inbounds [4 x i16], [4 x i16] addrspace(5)* %v0, i32 0, i32 %index - %val = load i16, i16 addrspace(5)* %a + %val = load volatile i16, i16 addrspace(5)* %a %eval = sext i16 %val to i32 store i32 %eval, i32 addrspace(5)* %out ret void Index: test/CodeGen/AMDGPU/extract_vector_elt-f16.ll =================================================================== --- test/CodeGen/AMDGPU/extract_vector_elt-f16.ll +++ test/CodeGen/AMDGPU/extract_vector_elt-f16.ll @@ -70,31 +70,20 @@ ret void } -; GCN-LABEL: {{^}}extract_vector_elt_v4f16: -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_store_short -; GCN: buffer_store_short -define amdgpu_kernel void @extract_vector_elt_v4f16(half addrspace(1)* %out, <4 x half> %foo) #0 { - %p0 = extractelement <4 x half> %foo, i32 0 - %p1 = extractelement <4 x half> %foo, i32 2 - %out1 = getelementptr half, half addrspace(1)* %out, i32 10 - store half %p1, half addrspace(1)* %out, align 2 - store half %p0, half addrspace(1)* %out1, align 2 - ret void -} - ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3f16: -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort +; SICIVI: buffer_load_ushort +; SICIVI: buffer_load_ushort +; SICIVI: buffer_load_ushort -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short +; GFX9-DAG: global_load_short_d16_hi v +; GFX9-DAG: global_load_short_d16 v -; GCN: buffer_load_ushort -; GCN: buffer_store_short +; GCN-DAG: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4 +; GFX89: v_lshrrev_b64 v{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, v + +; SI: v_lshr_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} + +; GCN: {{buffer|global}}_store_short define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo, i32 %idx) #0 { %p0 = extractelement <3 x half> %foo, i32 %idx %out1 = getelementptr half, half addrspace(1)* %out, i32 1 @@ -102,23 +91,45 @@ ret void } -; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v4f16: -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort +; GCN-LABEL: {{^}}v_extractelement_v4f16_2: +; SI: buffer_load_dword [[LOAD:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: buffer_store_short [[LOAD]] -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short +; VI: flat_load_dword v +; VI: flat_store_short -; GCN: buffer_load_ushort -; GCN: buffer_store_short -define amdgpu_kernel void @dynamic_extract_vector_elt_v4f16(half addrspace(1)* %out, <4 x half> %foo, i32 %idx) #0 { - %p0 = extractelement <4 x half> %foo, i32 %idx - %out1 = getelementptr half, half addrspace(1)* %out, i32 1 - store half %p0, half addrspace(1)* %out +; GFX9: global_load_dword [[LOAD:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, off offset:4 +; GFX9: global_store_short_d16_hi 
v{{\[[0-9]+:[0-9]+\]}}, [[LOAD]]
+define amdgpu_kernel void @v_extractelement_v4f16_2(half addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
+  %tid.ext = sext i32 %tid to i64
+  %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
+  %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
+  %vec.extract = extractelement <4 x half> %vec, i32 2
+  store half %vec.extract, half addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_extractelement_v4f16_dynamic_vgpr:
+; GCN-DAG: {{flat|global|buffer}}_load_dword [[IDX:v[0-9]+]],
+; GCN-DAG: {{flat|global|buffer}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
+; GCN-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]
+
+; GFX89: v_lshrrev_b64 v{{\[}}[[SHIFT_LO:[0-9]+]]:[[SHIFT_HI:[0-9]+]]{{\]}}, [[SCALED_IDX]], v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[SHIFT_LO]]
+
+; SI: v_lshr_b64 v{{\[}}[[SHIFT_LO:[0-9]+]]:[[SHIFT_HI:[0-9]+]]{{\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}, [[SCALED_IDX]]
+; SI: buffer_store_short v[[SHIFT_LO]]
+define amdgpu_kernel void @v_extractelement_v4f16_dynamic_vgpr(half addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
+  %tid.ext = sext i32 %tid to i64
+  %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
+  %idx.val = load volatile i32, i32 addrspace(1)* undef
+  %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
+  %vec.extract = extractelement <4 x half> %vec, i32 %idx.val
+  store half %vec.extract, half addrspace(1)* %out.gep
+  ret void
+}
Index: test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
===================================================================
--- test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
+++ test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SICIVI %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SICIVI %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SICIVI %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SICIVI,GFX89 %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s

 ; GCN-LABEL: {{^}}extract_vector_elt_v2i16:
 ; GCN: s_load_dword [[VEC:s[0-9]+]]
@@ -96,20 +96,15 @@
 ; SICIVI: buffer_load_ushort
 ; SICIVI: buffer_load_ushort

-; SICIVI: buffer_store_short
-; SICIVI: buffer_store_short
-; SICIVI: buffer_store_short
+; GFX9-DAG: global_load_short_d16_hi v
+; GFX9-DAG: global_load_short_d16 v

-; SICIVI: buffer_load_ushort
-; SICIVI: buffer_store_short
+; GCN-DAG: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4
+; GFX89: v_lshrrev_b64 v{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, v
+
+; SI: v_lshr_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}

-; GFX9: buffer_load_ushort
-; GFX9: global_load_short_d16_hi
-; GFX9: global_load_short_d16 v
-; GFX9: buffer_store_dword
-; GFX9: buffer_store_dword
-; GFX9: buffer_load_ushort
-; GFX9: buffer_store_short
+; GCN: {{buffer|global}}_store_short
 define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo, i32 %idx) #0 {
   %p0 = extractelement <3 x i16> %foo, i32 %idx
   %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
@@ -117,29 +112,15 @@
   ret void
 }

-; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v4i16:
-; SICIVI: buffer_load_ushort
-; SICIVI: buffer_load_ushort
-; SICIVI: buffer_load_ushort
-; SICIVI: buffer_load_ushort
-
-; SICIVI: buffer_store_short
-; SICIVI: buffer_store_short
-; SICIVI: buffer_store_short
-; SICIVI: buffer_store_short
-
-; SICIVI: buffer_load_ushort
-; SICIVI: buffer_store_short
-
-; GFX9: s_load_dword
-; GFX9: buffer_store_dword
-; GFX9: buffer_store_dword
-; GFX9: buffer_load_ushort
-; GFX9: buffer_store_short
-define amdgpu_kernel void @dynamic_extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo, i32 %idx) #0 {
-  %p0 = extractelement <4 x i16> %foo, i32 %idx
-  %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
-  store i16 %p0, i16 addrspace(1)* %out
+; GCN-LABEL: {{^}}v_extractelement_v4i16_dynamic_sgpr:
+define amdgpu_kernel void @v_extractelement_v4i16_dynamic_sgpr(i16 addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %idx) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
+  %tid.ext = sext i32 %tid to i64
+  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
+  %vec = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
+  %vec.extract = extractelement <4 x i16> %vec, i32 %idx
+  store i16 %vec.extract, i16 addrspace(1)* %out.gep
   ret void
 }

Index: test/CodeGen/AMDGPU/insert_vector_elt.ll
===================================================================
--- test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -201,33 +201,6 @@
   ret void
 }

-; GCN-LABEL: {{^}}dynamic_insertelement_v4i16:
-; GCN: buffer_load_ushort v{{[0-9]+}}, off
-; GCN: buffer_load_ushort v{{[0-9]+}}, off
-; GCN: buffer_load_ushort v{{[0-9]+}}, off
-; GCN: buffer_load_ushort v{{[0-9]+}}, off
-
-; GCN-DAG: v_mov_b32_e32 [[BASE_FI:v[0-9]+]], 8{{$}}
-; GCN-DAG: s_and_b32 [[MASK_IDX:s[0-9]+]], s{{[0-9]+}}, 3{{$}}
-; GCN-DAG: v_or_b32_e32 [[IDX:v[0-9]+]], [[MASK_IDX]], [[BASE_FI]]{{$}}
-
-; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:14
-; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12
-; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:10
-; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8
-; GCN: buffer_store_short v{{[0-9]+}}, [[IDX]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
-
-; GCN-NO-TONGA: s_waitcnt expcnt
-
-; GCN: buffer_load_dwordx2
-
-; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off
-define amdgpu_kernel void @dynamic_insertelement_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, i32 %b) nounwind {
-  %vecins = insertelement <4 x i16> %a, i16 5, i32 %b
-  store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out, align 8
-  ret void
-}
-
 ; GCN-LABEL: {{^}}dynamic_insertelement_v2i8:
 ; GCN: 
buffer_load_ubyte v{{[0-9]+}}, off ; GCN: buffer_load_ubyte v{{[0-9]+}}, off Index: test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -1,6 +1,6 @@ -; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=CIVI -check-prefix=VI -check-prefix=GFX89 %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=CIVI -check-prefix=CI %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CIVI -check-prefix=VI -check-prefix=GFX89 %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CIVI -check-prefix=CI %s ; GCN-LABEL: {{^}}s_insertelement_v2i16_0: ; GCN: s_load_dword [[VEC:s[0-9]+]] @@ -484,6 +484,187 @@ ret void } +; GCN-LABEL: {{^}}v_insertelement_v4f16_0: +; GCN-DAG: s_load_dword [[VAL:s[0-9]+]] +; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} + +; GFX9-DAG: v_mov_b32_e32 [[BFI_MASK:v[0-9]+]], 0xffff{{$}} +; GFX9: v_bfi_b32 v[[INS_LO:[0-9]+]], [[BFI_MASK]], [[VAL]], v[[LO]] + +; CIVI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, v[[LO]] +; CIVI: v_or_b32_e32 v[[INS_LO:[0-9]+]], [[VAL]], [[AND]] + +; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[INS_LO]]:[[HI]]{{\]}} +define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext + %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep + %val.trunc = trunc i32 %val to i16 + %val.cvt = bitcast i16 %val.trunc to half + %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 0 + store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_insertelement_v4f16_1: +; GCN-DAG: s_load_dword [[VAL:s[0-9]+]] +; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} + +; GFX9: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[LO]] +; GFX9: v_lshl_or_b32 v[[INS_HALF:[0-9]+]], [[VAL]], 16, [[AND]] + +; VI: s_lshl_b32 [[VAL]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[COPY_VAL:v[0-9]+]], [[VAL]] +; VI: v_or_b32_sdwa v[[INS_HALF:[0-9]+]], v[[LO]], [[COPY_VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD + +; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, 
v[[LO]] +; CI: v_or_b32_e32 v[[INS_HALF:[0-9]+]], [[VAL]], [[AND]] + +; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[INS_HALF]]:[[HI]]{{\]}} +define amdgpu_kernel void @v_insertelement_v4f16_1(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext + %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep + %val.trunc = trunc i32 %val to i16 + %val.cvt = bitcast i16 %val.trunc to half + %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 1 + store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_insertelement_v4f16_2: +; GCN-DAG: s_load_dword [[VAL:s[0-9]+]] +; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} + +; GFX9-DAG: v_mov_b32_e32 [[BFI_MASK:v[0-9]+]], 0xffff{{$}} +; GFX9: v_bfi_b32 v[[INS_HI:[0-9]+]], [[BFI_MASK]], [[VAL]], v[[HI]] + +; CIVI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, v[[HI]] +; CIVI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL]], [[AND]] + +; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[INS_HI]]{{\]}} +define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext + %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep + %val.trunc = trunc i32 %val to i16 + %val.cvt = bitcast i16 %val.trunc to half + %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 2 + store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_insertelement_v4f16_3: +; GCN-DAG: s_load_dword [[VAL:s[0-9]+]] +; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} + +; GFX9: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[HI]] +; GFX9: v_lshl_or_b32 v[[INS_HI:[0-9]+]], [[VAL]], 16, [[AND]] + +; VI: s_lshl_b32 [[VAL]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[COPY_VAL:v[0-9]+]], [[VAL]] +; VI: v_or_b32_sdwa v[[INS_HI:[0-9]+]], v[[HI]], [[COPY_VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD + +; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[HI]] +; CI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL]], [[AND]] + +; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[INS_HI]]{{\]}} +define amdgpu_kernel void @v_insertelement_v4f16_3(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext + %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep + %val.trunc = trunc i32 %val to i16 + %val.cvt = bitcast i16 %val.trunc to half + %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 3 + store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_insertelement_v4i16_2: +; GCN-DAG: s_load_dword [[VAL:s[0-9]+]] +; GCN-DAG: 
{{flat|global}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
+
+; GFX9-DAG: v_mov_b32_e32 [[BFI_MASK:v[0-9]+]], 0xffff{{$}}
+; GFX9: v_bfi_b32 v[[INS_HI:[0-9]+]], [[BFI_MASK]], [[VAL]], v[[HI]]
+
+; CIVI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, v[[HI]]
+; CIVI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL]], [[AND]]
+
+; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[INS_HI]]{{\]}}
+define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
+  %tid.ext = sext i32 %tid to i64
+  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
+  %vec = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
+  %val.trunc = trunc i32 %val to i16
+  %val.cvt = bitcast i16 %val.trunc to i16
+  %vecins = insertelement <4 x i16> %vec, i16 %val.cvt, i32 2
+  store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out.gep
+  ret void
+}
+
+; FIXME: Better code on CI?
+; GCN-LABEL: {{^}}v_insertelement_v4i16_dynamic_vgpr:
+; GCN-DAG: {{flat|global}}_load_dword [[IDX:v[0-9]+]],
+; GCN-DAG: s_load_dword [[VAL:s[0-9]+]]
+; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
+
+; GCN-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]
+; GCN-DAG: s_mov_b32 s[[MASK_HI:[0-9]+]], 0
+; GCN-DAG: s_mov_b32 s[[MASK_LO:[0-9]+]], 0xffff{{$}}
+
+; GFX89: v_lshlrev_b64 v{{\[}}[[SHIFT_LO:[0-9]+]]:[[SHIFT_HI:[0-9]+]]{{\]}}, [[SCALED_IDX]], s{{\[}}[[MASK_LO]]:[[MASK_HI]]{{\]}}
+; GFX89-DAG: v_not_b32_e32 v[[NOT_SHIFT_LO:[0-9]+]], v[[SHIFT_LO]]
+; GFX89-DAG: v_not_b32_e32 v[[NOT_SHIFT_HI:[0-9]+]], v[[SHIFT_HI]]
+; GFX89-DAG: v_and_b32_e32 v[[MASK:[0-9]+]], [[VAL]], v[[SHIFT_LO]]
+
+; GFX89-DAG: v_and_b32_e32 v[[AND0:[0-9]+]], v[[NOT_SHIFT_LO]], v[[LO]]
+; GFX89-DAG: v_and_b32_e32 v[[AND1:[0-9]+]], v[[NOT_SHIFT_HI]], v[[HI]]
+; GFX89: v_or_b32_sdwa v[[OR_SDWA:[0-9]+]], v[[MASK]], v[[AND0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+
+
+; CI: v_lshl_b64 v{{\[}}[[SHIFT_LO:[0-9]+]]:[[SHIFT_HI:[0-9]+]]{{\]}}, s{{\[}}[[MASK_LO]]:[[MASK_HI]]{{\]}}, [[SCALED_IDX]]
+; CI-DAG: v_bfi_b32 v[[OR_SDWA:[0-9]+]], v[[SHIFT_LO]],
+; CI-DAG: v_bfi_b32 v[[AND1:[0-9]+]], v[[SHIFT_HI]], 0,
+
+; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[OR_SDWA]]:[[AND1]]{{\]}}
+define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
+  %tid.ext = sext i32 %tid to i64
+  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
+  %idx.val = load volatile i32, i32 addrspace(1)* undef
+  %vec = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
+  %val.trunc = trunc i32 %val to i16
+  %val.cvt = bitcast i16 %val.trunc to i16
+  %vecins = insertelement <4 x i16> %vec, i16 %val.cvt, i32 %idx.val
+  store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_insertelement_v4f16_dynamic_sgpr:
+define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val, i32 %idxval) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
+  %tid.ext = sext i32 %tid to i64
+  %in.gep = getelementptr inbounds <4 x half>, <4 x 
half> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext + %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep + %val.trunc = trunc i32 %val to i16 + %val.cvt = bitcast i16 %val.trunc to half + %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 %idxval + store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() #1 attributes #0 = { nounwind } Index: test/CodeGen/AMDGPU/min.ll =================================================================== --- test/CodeGen/AMDGPU/min.ll +++ test/CodeGen/AMDGPU/min.ll @@ -289,9 +289,9 @@ ; SI-NOT: v_min_u32_e32 ; VI: v_min_u16_e32 -; VI: v_min_u16_sdwa +; VI: v_min_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI: v_min_u16_e32 -; VI-NOT: v_min_u16_e32 +; VI-NOT: v_min_u16 ; GFX9: v_pk_min_u16 ; GFX9: v_pk_min_u16
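
For reference, the bit manipulation the new v4i16/v4f16 lowering relies on can be modeled in scalar code. The following is an illustrative sketch in plain C++; the function names are invented for the example and do not correspond to anything in the patch.

#include <cstdint>

// Extract element Idx of a <4 x i16> held in a 64-bit register pair: shift
// the packed bits right by Idx*16 and truncate. This matches the
// v_lshrrev_b64 / v_lshr_b64 sequences the updated tests check for.
static uint16_t extractElt(uint64_t Vec, unsigned Idx) {
  return static_cast<uint16_t>(Vec >> (Idx * 16));
}

// Per-bit select computed by v_bfi_b32, widened to 64 bits for the v4 case:
// take bits of A where Mask is set, bits of B elsewhere.
static uint64_t bfi(uint64_t Mask, uint64_t A, uint64_t B) {
  return (Mask & A) | (~Mask & B);
}

// Dynamic insert in the same spirit: build the 16-bit field mask for
// element Idx and blend the value, shifted into position, into the vector.
static uint64_t insertElt(uint64_t Vec, uint16_t Val, unsigned Idx) {
  uint64_t Mask = 0xffffull << (Idx * 16);
  return bfi(Mask, static_cast<uint64_t>(Val) << (Idx * 16), Vec);
}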