Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -27,7 +27,7 @@
   SDValue getImplicitArgPtr(SelectionDAG &DAG, const SDLoc &SL) const;
   SDValue lowerKernargMemParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
                                    const SDLoc &SL, SDValue Chain,
-                                   uint64_t Offset, bool Signed,
+                                   uint64_t Offset, unsigned Align, bool Signed,
                                    const ISD::InputArg *Arg = nullptr) const;
 
   SDValue lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1069,15 +1069,13 @@
 SDValue SITargetLowering::lowerKernargMemParameter(
   SelectionDAG &DAG, EVT VT, EVT MemVT,
   const SDLoc &SL, SDValue Chain,
-  uint64_t Offset, bool Signed,
+  uint64_t Offset, unsigned Align, bool Signed,
   const ISD::InputArg *Arg) const {
   const DataLayout &DL = DAG.getDataLayout();
   Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
   PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS);
   MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
 
-  unsigned Align = DL.getABITypeAlignment(Ty);
-
   SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
   SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
                              MachineMemOperand::MODereferenceable |
@@ -1664,7 +1662,15 @@
 
   SmallVector<SDValue, 16> Chains;
 
-  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
+  // FIXME: This is the minimum kernel argument alignment. We should improve
+  // this to the maximum alignment of the arguments.
+  //
+  // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
+  // kern arg offset.
+  const unsigned KernelArgBaseAlign = 16;
+  const unsigned ExplicitOffset = Subtarget->getExplicitKernelArgOffset(Fn);
+
+  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
     const ISD::InputArg &Arg = Ins[i];
     if (Skipped[i]) {
       InVals.push_back(DAG.getUNDEF(Arg.VT));
@@ -1678,14 +1684,14 @@
       VT = Ins[i].VT;
       EVT MemVT = VA.getLocVT();
 
-      const uint64_t Offset = Subtarget->getExplicitKernelArgOffset(Fn) +
-        VA.getLocMemOffset();
+      const uint64_t Offset = ExplicitOffset + VA.getLocMemOffset();
       Info->setABIArgOffset(Offset + MemVT.getStoreSize());
 
+      unsigned Align = MinAlign(KernelArgBaseAlign, Offset);
       // The first 36 bytes of the input buffer contains information about
-      // thread group and global sizes.
+      // thread group and global sizes for clover.
       SDValue Arg = lowerKernargMemParameter(
-        DAG, VT, MemVT, DL, Chain, Offset, Ins[i].Flags.isSExt(), &Ins[i]);
+        DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]);
       Chains.push_back(Arg.getValue(1));
 
       auto *ParamTy =
@@ -4305,7 +4311,7 @@
                                                  unsigned Offset) const {
   SDLoc SL(Op);
   SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
-                                           DAG.getEntryNode(), Offset, false);
+                                           DAG.getEntryNode(), Offset, 4, false);
   // The local size values will have the hi 16-bits as zero.
   return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
                      DAG.getValueType(VT));
@@ -4406,37 +4412,37 @@
       return emitNonHSAIntrinsicError(DAG, DL, VT);
 
     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                                    SI::KernelInputOffsets::NGROUPS_X, false);
+                                    SI::KernelInputOffsets::NGROUPS_X, 4, false);
   case Intrinsic::r600_read_ngroups_y:
     if (Subtarget->isAmdHsaOS())
       return emitNonHSAIntrinsicError(DAG, DL, VT);
 
     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                                    SI::KernelInputOffsets::NGROUPS_Y, false);
+                                    SI::KernelInputOffsets::NGROUPS_Y, 4, false);
   case Intrinsic::r600_read_ngroups_z:
     if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);
 
    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                                    SI::KernelInputOffsets::NGROUPS_Z, false);
+                                    SI::KernelInputOffsets::NGROUPS_Z, 4, false);
   case Intrinsic::r600_read_global_size_x:
     if (Subtarget->isAmdHsaOS())
       return emitNonHSAIntrinsicError(DAG, DL, VT);
 
     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                                    SI::KernelInputOffsets::GLOBAL_SIZE_X, false);
+                                    SI::KernelInputOffsets::GLOBAL_SIZE_X, 4, false);
   case Intrinsic::r600_read_global_size_y:
     if (Subtarget->isAmdHsaOS())
       return emitNonHSAIntrinsicError(DAG, DL, VT);
 
     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                                    SI::KernelInputOffsets::GLOBAL_SIZE_Y, false);
+                                    SI::KernelInputOffsets::GLOBAL_SIZE_Y, 4, false);
   case Intrinsic::r600_read_global_size_z:
     if (Subtarget->isAmdHsaOS())
       return emitNonHSAIntrinsicError(DAG, DL, VT);
 
     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                                    SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
+                                    SI::KernelInputOffsets::GLOBAL_SIZE_Z, 4, false);
   case Intrinsic::r600_read_local_size_x:
     if (Subtarget->isAmdHsaOS())
       return emitNonHSAIntrinsicError(DAG, DL, VT);
Index: test/CodeGen/AMDGPU/fneg-fabs.f16.ll
===================================================================
--- test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -88,11 +88,9 @@
 
 ; Combine turns this into integer op when bitcast source (from load)
 ; GCN-LABEL: {{^}}s_fneg_fabs_v2f16_bc_src:
-; CI: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, v{{[0-9]+}}
-; CI: v_or_b32_e32 [[OR:v[0-9]+]], v{{[0-9]+}}, [[SHL]]
-; CI: v_or_b32_e32 v{{[0-9]+}}, 0x80008000, [[OR]]
 
 ; FIXME: Random commute
+; CI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000
 ; VI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000
 ; GFX9: s_or_b32 s{{[0-9]+}}, 0x80008000, s{{[0-9]+}}
 define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(<2 x half> addrspace(1)* %out, <2 x half> %in) {
@@ -103,16 +101,12 @@
 }
 
 ; GCN-LABEL: {{^}}fneg_fabs_v4f16:
-; CI: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000
-; CI: v_lshlrev_b32_e32 [[SHL0:v[0-9]+]], 16, v{{[0-9]+}}
-; CI: v_or_b32_e32 [[OR0:v[0-9]+]], v{{[0-9]+}}, [[SHL0]]
-; CI: v_lshlrev_b32_e32 [[SHL1:v[0-9]+]], 16, v{{[0-9]+}}
-; CI: v_or_b32_e32 [[OR1:v[0-9]+]], v{{[0-9]+}}, [[SHL1]]
-; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], [[OR0]]
-; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], [[OR1]]
 
 ; FIXME: Random commute
-; GFX89: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000
+; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000
+
+; CI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
+; CI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
 
 ; VI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
 ; VI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
Index: test/CodeGen/AMDGPU/half.ll
===================================================================
--- test/CodeGen/AMDGPU/half.ll
+++ test/CodeGen/AMDGPU/half.ll
@@ -13,17 +13,10 @@
   ret void
 }
 
-; FIXME: Should always be the same
 ; GCN-LABEL: {{^}}load_v2f16_arg:
-; SI-DAG: buffer_load_ushort [[V0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
-; SI-DAG: buffer_load_ushort [[V1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46
-; SI: v_lshlrev_b32_e32 [[HI:v[0-9]+]], 16, [[V1]]
-; SI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[V0]], [[HI]]
-; SI: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-
-; VI: s_load_dword [[ARG:s[0-9]+]]
-; VI: v_mov_b32_e32 [[V_ARG:v[0-9]+]], [[ARG]]
-; VI: buffer_store_dword [[V_ARG]]
+; GCN: s_load_dword [[ARG:s[0-9]+]]
+; GCN: v_mov_b32_e32 [[V_ARG:v[0-9]+]], [[ARG]]
+; GCN: buffer_store_dword [[V_ARG]]
 define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
   store <2 x half> %arg, <2 x half> addrspace(1)* %out
   ret void
@@ -31,8 +24,8 @@
 
 ; GCN-LABEL: {{^}}load_v3f16_arg:
 ; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
+; GCN: s_load_dword s
+
 ; GCN-NOT: buffer_load
 ; GCN-DAG: buffer_store_dword
 ; GCN-DAG: buffer_store_short
@@ -43,19 +36,14 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}load_v4f16_arg:
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_store_dwordx2
 
 ; FIXME: Why not one load?
-; VI-DAG: s_load_dword [[ARG0_LO:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; VI-DAG: s_load_dword [[ARG0_HI:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
-; VI-DAG: v_mov_b32_e32 v[[V_ARG0_LO:[0-9]+]], [[ARG0_LO]]
-; VI-DAG: v_mov_b32_e32 v[[V_ARG0_HI:[0-9]+]], [[ARG0_HI]]
-; VI: buffer_store_dwordx2 v{{\[}}[[V_ARG0_LO]]:[[V_ARG0_HI]]{{\]}}
+; GCN-LABEL: {{^}}load_v4f16_arg:
+; GCN-DAG: s_load_dword [[ARG0_LO:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
+; GCN-DAG: s_load_dword [[ARG0_HI:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
+; GCN-DAG: v_mov_b32_e32 v[[V_ARG0_LO:[0-9]+]], [[ARG0_LO]]
+; GCN-DAG: v_mov_b32_e32 v[[V_ARG0_HI:[0-9]+]], [[ARG0_HI]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[V_ARG0_LO]]:[[V_ARG0_HI]]{{\]}}
 define amdgpu_kernel void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 {
   store <4 x half> %arg, <4 x half> addrspace(1)* %out
   ret void
Index: test/CodeGen/AMDGPU/kernel-args.ll
===================================================================
--- test/CodeGen/AMDGPU/kernel-args.ll
+++ test/CodeGen/AMDGPU/kernel-args.ll
@@ -162,10 +162,11 @@
 
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
+
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+
+; HSA: flat_load_ushort
 define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
 entry:
   store <2 x i8> %in, <2 x i8> addrspace(1)* %out
@@ -179,10 +180,9 @@
 
 ; EG: VTX_READ_16
 ; EG: VTX_READ_16
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-
-; VI: s_load_dword s
+; SI: s_load_dword s{{[0-9]+}}, s[0:1], 0xb
+; MESA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; HSA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
 define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
 entry:
   store <2 x i16> %in, <2 x i16> addrspace(1)* %out
@@ -226,11 +226,14 @@
 ; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40
 ; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
 ; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+
+; MESA-VI: buffer_load_ushort
+; MESA-VI: buffer_load_ubyte
+
+; HSA-VI: flat_load_ushort
 ; HSA-VI: flat_load_ubyte
 define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
 entry:
@@ -245,12 +248,9 @@
 ; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44
 ; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
 ; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48
-; MESA-GCN: buffer_load_ushort
-; MESA-GCN: buffer_load_ushort
-; MESA-GCN: buffer_load_ushort
-; HSA-VI: flat_load_ushort
-; HSA-VI: flat_load_ushort
-; HSA-VI: flat_load_ushort
+
+; GCN-DAG: s_load_dword s
+; GCN-DAG: {{buffer|flat}}_load_ushort
 define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
 entry:
   store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
@@ -293,14 +293,13 @@
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
+
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+
+; VI: s_load_dword s
 define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
 entry:
   store <4 x i8> %in, <4 x i8> addrspace(1)* %out
@@ -315,13 +314,14 @@
 ; EG: VTX_READ_16
 ; EG: VTX_READ_16
 
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
+; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0xb
+; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x9
 
-; VI: s_load_dword s
-; VI: s_load_dword s
+; MESA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; MESA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x30
+
+; HSA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x8
+; HSA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xc
 define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
 entry:
   store <4 x i16> %in, <4 x i16> addrspace(1)* %out
@@ -372,21 +372,17 @@
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; HSA-GCN: float_load_ubyte
-; HSA-GCN: float_load_ubyte
-; HSA-GCN: float_load_ubyte
-; HSA-GCN: float_load_ubyte
-; HSA-GCN: float_load_ubyte
-; HSA-GCN: float_load_ubyte
-; HSA-GCN: float_load_ubyte
-; HSA-GCN: float_load_ubyte
+
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+
+; VI: s_load_dwordx2
+; VI: s_load_dwordx2
 define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
 entry:
   store <8 x i8> %in, <8 x i8> addrspace(1)* %out
@@ -405,15 +401,11 @@
 ; EG: VTX_READ_16
 ; EG: VTX_READ_16
 
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
 
+; VI: s_load_dwordx2
 ; VI: s_load_dword s
 ; VI: s_load_dword s
 ; VI: s_load_dword s
@@ -481,38 +473,27 @@
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
+
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+
+; VI: s_load_dwordx2
+; VI: s_load_dwordx2
+; VI: s_load_dwordx2
 define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
 entry:
   store <16 x i8> %in, <16 x i8> addrspace(1)* %out
@@ -539,22 +520,13 @@
 ; EG: VTX_READ_16
 ; EG: VTX_READ_16
 
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
+; SI: s_load_dword s
+; SI: s_load_dword s
+; SI: s_load_dword s
+; SI: s_load_dword s
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
 
 ; VI: s_load_dword s
 ; VI: s_load_dword s
Index: test/CodeGen/AMDGPU/reduce-store-width-alignment.ll
===================================================================
--- test/CodeGen/AMDGPU/reduce-store-width-alignment.ll
+++ test/CodeGen/AMDGPU/reduce-store-width-alignment.ll
@@ -39,10 +39,8 @@
 }
 
 ; GCN-LABEL: {{^}}store_v4i16_as_v2i32_align_4:
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
+; GCN: s_load_dword s
+; GCN: s_load_dwordx2 s
 ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
 define amdgpu_kernel void @store_v4i16_as_v2i32_align_4(<2 x i32> addrspace(3)* align 4 %out, <4 x i16> %x) #0 {
   %x.bc = bitcast <4 x i16> %x to <2 x i32>
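Note (editorial, not part of the patch): the new lowering derives each kernel-argument load's alignment from the 16-byte kernarg segment base alignment and the argument's byte offset via llvm::MinAlign, instead of from the argument type's ABI alignment. MinAlign(A, B) is the largest power of two dividing both values, so an argument at byte offset 44 (0x2c) can be fetched with a single 4-byte-aligned s_load_dword, and an argument at a multiple of 16 keeps the full 16-byte alignment, which is what enables the wider s_load_dwordx2 patterns checked in the updated tests. The standalone C++ sketch below mirrors the MinAlign definition from llvm/Support/MathExtras.h; the offsets are illustrative values only, not taken from the patch.

#include <cstdint>
#include <cstdio>

// Mirrors llvm::MinAlign (llvm/Support/MathExtras.h): the largest power of
// two dividing both A and B, i.e. the lowest set bit of (A | B).
static uint64_t minAlign(uint64_t A, uint64_t B) {
  return (A | B) & (1 + ~(A | B));
}

int main() {
  const uint64_t KernelArgBaseAlign = 16; // minimum kernarg segment alignment
  // Hypothetical argument byte offsets within the kernarg segment.
  const uint64_t Offsets[] = {36, 40, 44, 46, 48, 64};
  for (uint64_t Offset : Offsets)
    printf("offset %2llu -> load alignment %2llu\n",
           (unsigned long long)Offset,
           (unsigned long long)minAlign(KernelArgBaseAlign, Offset));
  return 0;
}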