Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -1343,11 +1343,26 @@
 //===----------------------------------------------------------------------===//
 // Miscellaneous Patterns
 //===----------------------------------------------------------------------===//
+// Before GFX9, zero-extending an f16 to i32 is selected as a plain COPY, so no
+// masking instruction is emitted.
+let OtherPredicates = [ Predicate<"Subtarget->getGeneration() < AMDGPUSubtarget::GFX9"> ] in {
 def : GCNPat <
   (i32 (AMDGPUfp16_zext f16:$src)),
   (COPY $src)
 >;
+}
 
+// On GFX9 and newer, clear bits 31:16 explicitly with an AND.
+// NOTE(review): presumably some f16 producers there (e.g. v_mad_mixlo_f16) do
+// not zero the high half of the result register -- confirm.
+// This predicate is the exact complement of the one above, so which pattern is
+// selected never depends on pattern-sorting tie-breaks.
+let OtherPredicates = [ Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9"> ] in {
+def : GCNPat <
+  (i32 (AMDGPUfp16_zext f16:$src)),
+  (V_AND_B32_e64 $src, (V_MOV_B32_e32 (i32 0x0000ffff)))
+>;
+}
 
 def : GCNPat <
   (i32 (trunc i64:$a)),
Index: test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
===================================================================
--- test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
+++ test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
@@ -793,8 +793,8 @@
 }
 
 ; GCN-LABEL: {{^}}v_test_canonicalize_insertelement_v2f16:
-; GFX9: v_pk_mul_f16
-; GFX9: v_mul_f16_e32
+; GFX9-DAG: v_pk_mul_f16
+; GFX9-DAG: v_mul_f16_e32
 ; GFX9-NOT: v_max
 ; GFX9-NOT: v_pk_max
 define <2 x half> @v_test_canonicalize_insertelement_v2f16(<2 x half> %vec, half %val, i32 %idx) {
Index: test/CodeGen/AMDGPU/fptrunc.f16.ll
===================================================================
--- test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -141,7 +141,8 @@
 ; GCN-LABEL: {{^}}fptrunc_f32_to_f16_zext_i32:
 ; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
 ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
-; GCN-NOT: v[[R_F16]]
+; SIVI-NOT: v[[R_F16]]
+; GFX9: v_and_b32_e32 v[[R_F16]], 0xffff, v[[R_F16]]
 ; GCN: buffer_store_dword v[[R_F16]]
 define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32(
     i32 addrspace(1)* %r,
@@ -158,7 +159,8 @@
 ; GCN-LABEL: {{^}}fptrunc_fabs_f32_to_f16_zext_i32:
 ; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
 ; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], |v[[A_F32]]|
-; GCN-NOT: v[[R_F16]]
+; SIVI-NOT: v[[R_F16]]
+; GFX9: v_and_b32_e32 v[[R_F16]], 0xffff, v[[R_F16]]
 ; GCN: buffer_store_dword v[[R_F16]]
 define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32(
     i32 addrspace(1)* %r,
Index: test/CodeGen/AMDGPU/mad-mix-lo.ll
===================================================================
--- test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -286,6 +286,22 @@
   ret <4 x half> %cvt.result
 }
 
+; GCN-LABEL: mixlo_zext:
+; GCN: s_waitcnt
+; GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2{{$}}
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: s_setpc_b64
+
+; CIVI: v_mac_f32_e32
+; CIVI: v_cvt_f16_f32_e32
+define i32 @mixlo_zext(float %src0, float %src1, float %src2) #0 {
+  %result = call float @llvm.fmuladd.f32(float %src0, float %src1, float %src2)
+  %cvt.result = fptrunc float %result to half
+  %cvt.result.i16 = bitcast half %cvt.result to i16
+  %cvt.result.i32 = zext i16 %cvt.result.i16 to i32
+  ret i32 %cvt.result.i32
+}
+
 declare half @llvm.minnum.f16(half, half) #1
 declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>) #1
 declare <3 x half> @llvm.minnum.v3f16(<3 x half>, <3 x half>) #1