Restrict the AMDGPU byte/short buffer load/store lowering to scalar i8/i16.

The BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT and BUFFER_STORE_BYTE/SHORT paths were
selected purely on the scalar element type, so a vector whose elements are i8
or i16 matched as well. With the added isVector() checks such vectors skip
those paths and reach the generic memory-intrinsic node built right after each
check. The new test runs llc for gfx900 on the <2 x i16> buffer load/store
intrinsics and expects buffer_load_dword / buffer_store_dword.

Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6747,8 +6747,9 @@
                                  M, DAG, Ops);
 
     // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
-    if (LoadVT.getScalarType() == MVT::i8 ||
-        LoadVT.getScalarType() == MVT::i16)
+    if (!LoadVT.isVector() && // Vectors take the generic path.
+        (LoadVT.getScalarType() == MVT::i8 ||
+         LoadVT.getScalarType() == MVT::i16))
       return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
 
     return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
@@ -7407,7 +7408,8 @@
 
     // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
     EVT VDataType = VData.getValueType().getScalarType();
-    if (VDataType == MVT::i8 || VDataType == MVT::i16)
+    if (!VData.getValueType().isVector() && // Vectors take the generic path.
+        (VDataType == MVT::i8 || VDataType == MVT::i16))
       return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
 
     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load-store.v2i16.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load-store.v2i16.ll
@@ -0,0 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX900 %s
+
+declare <2 x i16> @llvm.amdgcn.buffer.load.v2i16(<4 x i32>, i32, i32, i1 immarg, i1 immarg)
+declare void @llvm.amdgcn.buffer.store.v2i16(<2 x i16>, <4 x i32>, i32, i32, i1 immarg, i1 immarg)
+
+define hidden <2 x i16> @buffer_load_v2i16(i16* %0, i32 %1, i32 %2) local_unnamed_addr {
+; GFX900-LABEL: buffer_load_v2i16:
+; GFX900:       buffer_load_v2i16$local:
+; GFX900-NEXT:  ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v4, -1
+; GFX900-NEXT:    v_mov_b32_e32 v5, 0x27000
+; GFX900-NEXT:    v_add_lshl_u32 v3, v3, v2, 1
+; GFX900-NEXT:    s_mov_b64 s[6:7], exec
+; GFX900-NEXT:  BB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX900-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX900-NEXT:    v_readfirstlane_b32 s9, v1
+; GFX900-NEXT:    v_readfirstlane_b32 s10, v4
+; GFX900-NEXT:    v_readfirstlane_b32 s11, v5
+; GFX900-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX900-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[4:5]
+; GFX900-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX900-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX900-NEXT:    s_nop 0
+; GFX900-NEXT:    buffer_load_dword v2, v3, s[8:11], 0 offen
+; GFX900-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX900-NEXT:    s_cbranch_execnz BB0_1
+; GFX900-NEXT:  ; %bb.2:
+; GFX900-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v0, v2
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+  %4 = ptrtoint i16* %0 to i64 ; pointer argument as a 64-bit integer
+  %5 = bitcast i64 %4 to <2 x i32> ; reinterpret as two 32-bit lanes
+  %6 = shufflevector <2 x i32> %5, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> ; widen to 4 lanes, upper two undef
+  %7 = shufflevector <4 x i32> %6, <4 x i32> <i32 undef, i32 undef, i32 -1, i32 159744>, <4 x i32> <i32 0, i32 1, i32 6, i32 7> ; lanes 2-3 become -1 and 159744 (0x27000)
+  %8 = add i32 %2, %1
+  %9 = shl i32 %8, 1 ; (%2 + %1) << 1
+  %10 = tail call <2 x i16> @llvm.amdgcn.buffer.load.v2i16(<4 x i32> %7, i32 0, i32 %9, i1 zeroext false, i1 zeroext false) ; checked above as a single buffer_load_dword
+  ret <2 x i16> %10
+}
+
+define hidden void @buffer_store_v2i16(i16* nocapture readonly %0, i16* %1, i32 %2, i32 %3) local_unnamed_addr {
+; GFX900-LABEL: buffer_store_v2i16:
+; GFX900:       buffer_store_v2i16$local:
+; GFX900-NEXT:  ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    flat_load_ushort v1, v[0:1]
+; GFX900-NEXT:    v_add_lshl_u32 v0, v5, v4, 1
+; GFX900-NEXT:    v_mov_b32_e32 v6, -1
+; GFX900-NEXT:    v_mov_b32_e32 v7, 0x27000
+; GFX900-NEXT:    s_mov_b64 s[6:7], exec
+; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_and_b32_e32 v4, 0xffff, v1
+; GFX900-NEXT:    v_lshl_or_b32 v1, v1, 16, v4
+; GFX900-NEXT:  BB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX900-NEXT:    v_readfirstlane_b32 s8, v2
+; GFX900-NEXT:    v_readfirstlane_b32 s9, v3
+; GFX900-NEXT:    v_readfirstlane_b32 s10, v6
+; GFX900-NEXT:    v_readfirstlane_b32 s11, v7
+; GFX900-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3]
+; GFX900-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[6:7]
+; GFX900-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX900-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX900-NEXT:    s_nop 0
+; GFX900-NEXT:    buffer_store_dword v1, v0, s[8:11], 0 offen
+; GFX900-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX900-NEXT:    s_cbranch_execnz BB1_1
+; GFX900-NEXT:  ; %bb.2:
+; GFX900-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+  %5 = ptrtoint i16* %1 to i64 ; second pointer argument as a 64-bit integer
+  %6 = bitcast i64 %5 to <2 x i32> ; reinterpret as two 32-bit lanes
+  %7 = shufflevector <2 x i32> %6, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> ; widen to 4 lanes, upper two undef
+  %8 = shufflevector <4 x i32> %7, <4 x i32> <i32 undef, i32 undef, i32 -1, i32 159744>, <4 x i32> <i32 0, i32 1, i32 6, i32 7> ; lanes 2-3 become -1 and 159744 (0x27000)
+  %9 = load i16, i16* %0, align 2 ; the i16 value to store, read through the first pointer
+  %10 = insertelement <2 x i16> undef, i16 %9, i32 0
+  %11 = shufflevector <2 x i16> %10, <2 x i16> undef, <2 x i32> zeroinitializer ; splat %9 into both lanes
+  %12 = add i32 %3, %2
+  %13 = shl i32 %12, 1 ; (%3 + %2) << 1
+  tail call void @llvm.amdgcn.buffer.store.v2i16(<2 x i16> %11, <4 x i32> %8, i32 0, i32 %13, i1 zeroext false, i1 zeroext false) ; checked above as a single buffer_store_dword
+  ret void
+}