Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -534,6 +534,9 @@
     // Legalization hack.
     setOperationAction(ISD::SELECT, MVT::v2i16, Custom);
     setOperationAction(ISD::SELECT, MVT::v2f16, Custom);
+
+    setOperationAction(ISD::FNEG, MVT::v2f16, Custom);
+    setOperationAction(ISD::FABS, MVT::v2f16, Custom);
   }
 
   for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) {
@@ -3703,6 +3706,28 @@
     Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
     return;
   }
+  case ISD::FNEG: {
+    SDLoc SL(N);
+    assert(N->getValueType(0) == MVT::v2f16);
+    SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
+
+    SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
+                             BC,
+                             DAG.getConstant(0x80008000, SL, MVT::i32));
+    Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
+    return;
+  }
+  case ISD::FABS: {
+    SDLoc SL(N);
+    assert(N->getValueType(0) == MVT::v2f16);
+    SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
+
+    SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
+                             BC,
+                             DAG.getConstant(0x7fff7fff, SL, MVT::i32));
+    Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
+    return;
+  }
   default:
     break;
   }
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -633,6 +633,11 @@
   (V_CVT_F32_F16_e64 SRCMODS.ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
 >;
 
+def : GCNPat <
+  (f32 (f16_to_fp (i32 (srl_oneuse (and_oneuse i32:$src0, 0x7fff0000), (i32 16))))),
+  (V_CVT_F32_F16_e64 SRCMODS.ABS, (i32 (V_LSHRREV_B32_e64 (i32 16), i32:$src0)), DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
 def : GCNPat <
   (f32 (f16_to_fp (or_oneuse i32:$src0, 0x8000))),
   (V_CVT_F32_F16_e64 SRCMODS.NEG_ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
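Note for readers tracing the lowering: both new ReplaceNodeResults cases bitcast the <2 x half> operand to i32 and apply one scalar bit operation, since IEEE-754 negation and absolute value only touch the sign bit (bit 15) of each packed half. A minimal standalone C++ sketch of the two identities (helper names are illustrative, not part of the patch):

  #include <cassert>
  #include <cstdint>

  // fneg <2 x half>: XOR flips bit 15 of each packed half, negating both
  // lanes for every input, zeros and NaNs included.
  uint32_t fnegPackedHalf2(uint32_t Bits) { return Bits ^ 0x80008000u; }

  // fabs <2 x half>: AND clears the sign bit of each packed half.
  uint32_t fabsPackedHalf2(uint32_t Bits) { return Bits & 0x7fff7fffu; }

  int main() {
    // In half precision, 0x3c00 is 1.0 and 0xbc00 is -1.0.
    assert(fnegPackedHalf2(0x3c00bc00u) == 0xbc003c00u);
    assert(fabsPackedHalf2(0x3c00bc00u) == 0x3c003c00u);
    return 0;
  }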
Index: test/CodeGen/AMDGPU/fabs.f16.ll
===================================================================
--- test/CodeGen/AMDGPU/fabs.f16.ll
+++ test/CodeGen/AMDGPU/fabs.f16.ll
@@ -28,16 +28,9 @@
   ret void
 }
 
-; FIXME: Should be able to use single and
 ; GCN-LABEL: {{^}}s_fabs_v2f16:
-; CI: s_movk_i32 [[MASK:s[0-9]+]], 0x7fff
-; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
-; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
-; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
-; CI: v_or_b32_e32
-
-; GFX89: s_load_dword [[VAL:s[0-9]+]]
-; GFX89: s_and_b32 s{{[0-9]+}}, [[VAL]], 0x7fff7fff
+; GCN: s_load_dword [[VAL:s[0-9]+]]
+; GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0x7fff7fff
 define amdgpu_kernel void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) {
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
   store <2 x half> %fabs, <2 x half> addrspace(1)* %out
@@ -45,18 +38,11 @@
 }
 
 ; GCN-LABEL: {{^}}s_fabs_v4f16:
-; CI: s_movk_i32 [[MASK:s[0-9]+]], 0x7fff
-; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
-; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
-; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
-; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
-
-
-; GFX89: s_load_dword s
-; GFX89: s_load_dword s
-; GFX89: s_mov_b32 [[MASK:s[0-9]+]], 0x7fff7fff
-; GFX89: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
-; GFX89: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
+; GCN: s_load_dword s
+; GCN: s_load_dword s
+; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x7fff7fff
+; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
+; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
 
 ; GCN: {{flat|global}}_store_dwordx2
 define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
@@ -108,14 +94,19 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}v_fabs_fold_v2f16:
+; FIXME: Should do fabs after conversion to avoid converting multiple
+; times in this particular case.
+
+; GCN-LABEL: {{^}}v_fabs_fold_self_v2f16:
 ; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
+
+; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
 ; CI: v_cvt_f32_f16_e32
 ; CI: v_cvt_f32_f16_e32
-; CI: v_mul_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}}
+; CI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; CI: v_cvt_f16_f32
-; CI: v_mul_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}}
+; CI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; CI: v_cvt_f16_f32
 
 ; VI: v_mul_f16_sdwa v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -123,7 +114,7 @@
 
 ; GFX9: v_and_b32_e32 [[FABS:v[0-9]+]], 0x7fff7fff, [[VAL]]
 ; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[FABS]], v{{[0-9]+$}}
-define amdgpu_kernel void @v_fabs_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_fabs_fold_self_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
   %val = load <2 x half>, <2 x half> addrspace(1)* %gep
@@ -133,6 +124,34 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}v_fabs_fold_v2f16:
+; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
+
+; CI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+; CI: v_cvt_f32_f16_e32
+; CI: v_cvt_f32_f16_e32
+; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; CI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: v_cvt_f16_f32
+; CI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: v_cvt_f16_f32
+
+; VI: v_mul_f16_sdwa v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI: v_mul_f16_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, s{{[0-9]+}}
+
+; GFX9: v_and_b32_e32 [[FABS:v[0-9]+]], 0x7fff7fff, [[VAL]]
+; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[FABS]], s{{[0-9]+$}}
+define amdgpu_kernel void @v_fabs_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in, i32 %other.val) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
+  %val = load <2 x half>, <2 x half> addrspace(1)* %gep
+  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
+  %other.val.cvt = bitcast i32 %other.val to <2 x half>
+  %fmul = fmul <2 x half> %fabs, %other.val.cvt
+  store <2 x half> %fmul, <2 x half> addrspace(1)* %out
+  ret void
+}
+
 ; GCN-LABEL: {{^}}v_extract_fabs_fold_v2f16:
 ; GCN-DAG: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
 ; CI-DAG: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
Index: test/CodeGen/AMDGPU/fneg-fabs.f16.ll
===================================================================
--- test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -123,8 +123,10 @@
 }
 
 ; GCN-LABEL: {{^}}fold_user_fneg_fabs_v2f16:
-; CI: v_cvt_f32_f16_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
-; CI: v_cvt_f32_f16_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
+; CI: s_load_dword s
+; CI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000
+; CI: v_cvt_f32_f16_e32 v{{[0-9]+}}, s{{[0-9]+}}
+; CI: v_cvt_f32_f16_e32 v{{[0-9]+}}, s{{[0-9]+}}
 ; CI: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
 ; CI: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
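Note on the new CI checks above: they depend on the fold added to SIInstructions.td, where masking the sign bit of the high half before shifting equals shifting first and clearing bit 15 afterwards, so the selector can emit V_LSHRREV_B32 and absorb the fabs into the ABS source modifier of V_CVT_F32_F16. A self-checking C++ sketch of that equivalence (illustrative, not from the patch):

  #include <cassert>
  #include <cstdint>

  int main() {
    // (x & 0x7fff0000) >> 16 extracts the high half with its sign cleared;
    // (x >> 16) & 0x7fff must agree for the pattern to be sound.
    for (uint32_t Hi = 0; Hi <= 0xffffu; ++Hi) {
      uint32_t X = (Hi << 16) | 0x1234u; // the low half does not matter
      assert(((X & 0x7fff0000u) >> 16) == ((X >> 16) & 0x7fffu));
    }
    return 0;
  }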
Index: test/CodeGen/AMDGPU/fneg.f16.ll
===================================================================
--- test/CodeGen/AMDGPU/fneg.f16.ll
+++ test/CodeGen/AMDGPU/fneg.f16.ll
@@ -60,17 +60,9 @@
   ret void
 }
 
-; FIXME: Terrible code with SI/CI.
 ; FIXME: scalar for VI, vector for gfx9
 ; GCN-LABEL: {{^}}s_fneg_v2f16:
-; CI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
-; CI: v_xor_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
-; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
-; CI: v_xor_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
-; CI: v_or_b32_e32
-
-; VI: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000
-
+; CIVI: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000
 ; GFX9: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}}
 define amdgpu_kernel void @s_fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) #0 {
   %fneg = fsub <2 x half> <half -0.000000e+00, half -0.000000e+00>, %in
@@ -78,6 +70,18 @@
   ret void
 }
 
+; FIXME: vector on gfx9
+; GCN-LABEL: {{^}}s_fneg_v2f16_nonload:
+; CIVI: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000
+; GFX9: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}}
+define amdgpu_kernel void @s_fneg_v2f16_nonload(<2 x half> addrspace(1)* %out) #0 {
+  %in = call i32 asm sideeffect "; def $0", "=s"()
+  %in.bc = bitcast i32 %in to <2 x half>
+  %fneg = fsub <2 x half> <half -0.000000e+00, half -0.000000e+00>, %in.bc
+  store <2 x half> %fneg, <2 x half> addrspace(1)* %out
+  ret void
+}
+
 ; GCN-LABEL: {{^}}v_fneg_v2f16:
 ; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
 ; GCN: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, [[VAL]]
@@ -107,8 +111,12 @@
 
 ; GCN-LABEL: {{^}}v_fneg_fold_v2f16:
 ; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
-; CI: v_cvt_f32_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}}
-; CI: v_cvt_f32_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}}
+; CI: v_xor_b32_e32 [[FNEG:v[0-9]+]], 0x80008000, [[VAL]]
+; CI: v_lshrrev_b32_e32
+; CI: v_lshrrev_b32_e32
+
+; CI: v_cvt_f32_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}
+; CI: v_cvt_f32_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}
 ; CI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; CI: v_cvt_f16_f32
 ; CI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
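Note on the IR idiom in the fneg tests above: negation is spelled as an fsub from negative zero. The -0.0 constant matters because subtracting from +0.0 would not negate +0.0 itself; in the default rounding mode, 0.0 - 0.0 is +0.0. A small C++ sketch with float standing in for half:

  #include <cassert>
  #include <cmath>

  int main() {
    // -0.0 - (+0.0) yields -0.0: the sign flips, matching fneg semantics.
    assert(std::signbit(-0.0f - 0.0f));
    // +0.0 - (+0.0) yields +0.0: plain '0.0 - x' loses the sign of zero.
    assert(!std::signbit(0.0f - 0.0f));
    return 0;
  }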