diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -202,6 +202,11 @@ setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand); setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand); setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand); + setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand); + setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand); + setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand); + setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand); + setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand); setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); diff --git a/llvm/test/CodeGen/AMDGPU/trunc-store-vec-i16-to-i8.ll b/llvm/test/CodeGen/AMDGPU/trunc-store-vec-i16-to-i8.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/trunc-store-vec-i16-to-i8.ll @@ -0,0 +1,60 @@ +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}short_char: +; GCN: global_store_byte v +define protected amdgpu_kernel void @short_char(i8 addrspace(1)* %out) { +entry: + %tmp = load i16, i16 addrspace(1)* undef + %tmp1 = trunc i16 %tmp to i8 + store i8 %tmp1, i8 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}short2_char4: +; GCN: global_store_dword v +define protected amdgpu_kernel void @short2_char4(<4 x i8> addrspace(1)* %out) { +entry: + %tmp = load <2 x i16>, <2 x i16> addrspace(1)* undef, align 4 + %vecinit = shufflevector <2 x i16> %tmp, <2 x i16> undef, <4 x i32> + %vecinit2 = shufflevector <4 x i16> %vecinit, <4 x i16> , <4 x i32> + %tmp1 = trunc <4 x i16> %vecinit2 to <4 x i8> + store <4 x i8> %tmp1, <4 x i8> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}short4_char8: +; GCN: global_store_dwordx2 v +define protected amdgpu_kernel void @short4_char8(<8 x i8> addrspace(1)* %out) { +entry: + %tmp = load <4 x i16>, <4 x i16> addrspace(1)* undef, align 8 + %vecinit = shufflevector <4 x i16> %tmp, <4 x i16> undef, <8 x i32> + %vecinit2 = shufflevector <8 x i16> %vecinit, <8 x i16> , <8 x i32> + %tmp1 = trunc <8 x i16> %vecinit2 to <8 x i8> + store <8 x i8> %tmp1, <8 x i8> addrspace(1)* %out, align 8 + ret void +} + +; GCN-LABEL: {{^}}short8_char16: +; GCN: global_store_dwordx4 v +define protected amdgpu_kernel void @short8_char16(<16 x i8> addrspace(1)* %out) { +entry: + %tmp = load <8 x i16>, <8 x i16> addrspace(1)* undef, align 16 + %vecinit = shufflevector <8 x i16> %tmp, <8 x i16> undef, <16 x i32> + %vecinit2 = shufflevector <16 x i16> %vecinit, <16 x i16> , <16 x i32> + %tmp1 = trunc <16 x i16> %vecinit2 to <16 x i8> + store <16 x i8> %tmp1, <16 x i8> addrspace(1)* %out, align 16 + ret void +} + +; GCN-LABEL: {{^}}short16_char32: +; GCN: global_store_dwordx4 v +; GCN: global_store_dwordx4 v +define protected amdgpu_kernel void @short16_char32(<32 x i8> addrspace(1)* %out) { +entry: + %tmp = load <16 x i16>, <16 x i16> addrspace(1)* undef, align 32 + %vecinit = shufflevector <16 x i16> %tmp, <16 x i16> undef, <32 x i32> + %vecinit2 = shufflevector <32 x i16> %vecinit, <32 x i16> , <32 x i32> + %tmp1 = trunc <32 x i16> %vecinit2 to <32 x i8> + store <32 x i8> %tmp1, <32 x i8> addrspace(1)* %out, align 32 + ret void +}