Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -126,6 +126,11 @@
     addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
   }
 
+  if (Subtarget->hasVOP3PInsts()) {
+    addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
+    addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
+  }
+
   computeRegisterProperties(STI.getRegisterInfo());
 
   // We need to custom lower vector stores from local memory
@@ -201,7 +206,8 @@
 
   // We only support LOAD/STORE and vector manipulation ops for vectors
   // with > 4 elements.
-  for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64}) {
+  for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
+                 MVT::v2i64, MVT::v2f64}) {
     for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
       switch (Op) {
       case ISD::LOAD:
@@ -371,6 +377,41 @@
     setOperationAction(ISD::FMAD, MVT::f16, Legal);
   }
 
+  if (Subtarget->hasVOP3PInsts()) {
+    for (MVT VT : {MVT::v2i16, MVT::v2f16}) {
+      for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
+        switch (Op) {
+        case ISD::LOAD:
+        case ISD::STORE:
+        case ISD::BUILD_VECTOR:
+        case ISD::BITCAST:
+        case ISD::EXTRACT_VECTOR_ELT:
+        case ISD::INSERT_VECTOR_ELT:
+        case ISD::INSERT_SUBVECTOR:
+        case ISD::EXTRACT_SUBVECTOR:
+        case ISD::SCALAR_TO_VECTOR:
+          break;
+        case ISD::CONCAT_VECTORS:
+          setOperationAction(Op, VT, Custom);
+          break;
+        default:
+          setOperationAction(Op, VT, Expand);
+          break;
+        }
+      }
+    }
+
+    setOperationAction(ISD::STORE, MVT::v2i16, Promote);
+    AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
+    setOperationAction(ISD::STORE, MVT::v2f16, Promote);
+    AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
+
+    setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
+    AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
+    setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
+    AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
+  }
+
   setTargetDAGCombine(ISD::FADD);
   setTargetDAGCombine(ISD::FSUB);
   setTargetDAGCombine(ISD::FMINNUM);
Index: test/CodeGen/AMDGPU/inlineasm-packed.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/inlineasm-packed.ll
@@ -0,0 +1,57 @@
+; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}inline_asm_input_v2i16:
+; GCN: s_mov_b32 s{{[0-9]+}}, s{{[0-9]+}}
+define void @inline_asm_input_v2i16(i32 addrspace(1)* %out, <2 x i16> %in) #0 {
+entry:
+  %val = call i32 asm "s_mov_b32 $0, $1", "=r,r"(<2 x i16> %in) #0
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}inline_asm_input_v2f16:
+; GCN: s_mov_b32 s{{[0-9]+}}, s{{[0-9]+}}
+define void @inline_asm_input_v2f16(i32 addrspace(1)* %out, <2 x half> %in) #0 {
+entry:
+  %val = call i32 asm "s_mov_b32 $0, $1", "=r,r"(<2 x half> %in) #0
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}inline_asm_output_v2i16:
+; GCN: s_mov_b32 s{{[0-9]+}}, s{{[0-9]+}}
+define void @inline_asm_output_v2i16(<2 x i16> addrspace(1)* %out, i32 %in) #0 {
+entry:
+  %val = call <2 x i16> asm "s_mov_b32 $0, $1", "=r,r"(i32 %in) #0
+  store <2 x i16> %val, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}inline_asm_output_v2f16:
+; GCN: v_mov_b32 v{{[0-9]+}}, s{{[0-9]+}}
+define void @inline_asm_output_v2f16(<2 x half> addrspace(1)* %out, i32 %in) #0 {
+entry:
+  %val = call <2 x half> asm "v_mov_b32 $0, $1", "=v,r"(i32 %in) #0
+  store <2 x half> %val, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}inline_asm_packed_v2i16:
+; GCN: v_pk_add_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+define void @inline_asm_packed_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %in0, <2 x i16> %in1) #0 {
+entry:
+  %val = call <2 x i16> asm "v_pk_add_u16 $0, $1, $2", "=v,r,v"(<2 x i16> %in0, <2 x i16> %in1) #0
+  store <2 x i16> %val, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}inline_asm_packed_v2f16:
+; GCN: v_pk_add_f16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+define void @inline_asm_packed_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in0, <2 x half> %in1) #0 {
+entry:
+  %val = call <2 x half> asm "v_pk_add_f16 $0, $1, $2", "=v,r,v"(<2 x half> %in0, <2 x half> %in1) #0
+  store <2 x half> %val, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { nounwind }