Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -52,7 +52,7 @@
   SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerFROUND32_16(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFROUND_LegalFTRUNC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const;
Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -2161,7 +2161,8 @@
 // Don't handle v2f16. The extra instructions to scalarize and repack around the
 // compare and vselect end up producing worse code than scalarizing the whole
 // operation.
-SDValue AMDGPUTargetLowering::LowerFROUND32_16(SDValue Op, SelectionDAG &DAG) const {
+SDValue AMDGPUTargetLowering::LowerFROUND_LegalFTRUNC(SDValue Op,
+                                                      SelectionDAG &DAG) const {
   SDLoc SL(Op);
   SDValue X = Op.getOperand(0);
   EVT VT = Op.getValueType();
@@ -2250,8 +2251,8 @@
 SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
 
-  if (VT == MVT::f32 || VT == MVT::f16)
-    return LowerFROUND32_16(Op, DAG);
+  if (isOperationLegal(ISD::FTRUNC, VT))
+    return LowerFROUND_LegalFTRUNC(Op, DAG);
 
   if (VT == MVT::f64)
     return LowerFROUND64(Op, DAG);
Index: llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
+++ llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
@@ -1,7 +1,8 @@
-; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI %s
+; RUN: llc -march=amdgcn -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,CI %s
 
-; FUNC-LABEL: {{^}}round_f64:
-; SI: s_endpgm
+; GCN-LABEL: {{^}}round_f64:
+; GCN: s_endpgm
 define amdgpu_kernel void @round_f64(double addrspace(1)* %out, double %x) #0 {
   %result = call double @llvm.round.f64(double %x) #1
   store double %result, double addrspace(1)* %out
@@ -11,7 +12,7 @@
 
 ; This is a pretty large function, so just test a few of the
 ; instructions that are necessary.
-; FUNC-LABEL: {{^}}v_round_f64:
+; GCN-LABEL: {{^}}v_round_f64:
 ; SI: buffer_load_dwordx2
 
 ; SI-DAG: v_bfe_u32 [[EXP:v[0-9]+]], v{{[0-9]+}}, 20, 11
@@ -25,7 +26,13 @@
 ; SI-DAG: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[BFIMASK]]
 
 ; SI: buffer_store_dwordx2
-; SI: s_endpgm
+
+; CI: v_trunc_f64
+; CI: v_add_f64
+; CI: v_bfi_b32
+; CI: v_cmp_ge_f64
+; CI: v_cndmask_b32
+; CI: v_add_f64
 define amdgpu_kernel void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
@@ -36,24 +43,24 @@
   ret void
 }
 
-; FUNC-LABEL: {{^}}round_v2f64:
-; SI: s_endpgm
+; GCN-LABEL: {{^}}round_v2f64:
+; GCN: s_endpgm
 define amdgpu_kernel void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 {
   %result = call <2 x double> @llvm.round.v2f64(<2 x double> %in) #1
   store <2 x double> %result, <2 x double> addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}round_v4f64:
-; SI: s_endpgm
+; GCN-LABEL: {{^}}round_v4f64:
+; GCN: s_endpgm
 define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 {
   %result = call <4 x double> @llvm.round.v4f64(<4 x double> %in) #1
   store <4 x double> %result, <4 x double> addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}round_v8f64:
-; SI: s_endpgm
+; GCN-LABEL: {{^}}round_v8f64:
+; GCN: s_endpgm
 define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %in) #0 {
   %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1
   store <8 x double> %result, <8 x double> addrspace(1)* %out
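
Note (reviewer context, not part of the patch): LowerFROUND_LegalFTRUNC expands
round(x) as trunc(x) plus a copysign'd 1.0 whenever the discarded fraction is at
least 0.5, which is the v_trunc_f64 / v_cmp_ge_f64 / v_bfi_b32 / v_cndmask_b32 /
v_add_f64 sequence the new CI checks match. A minimal scalar sketch of that
identity follows (hypothetical helper name; the real code builds SelectionDAG
nodes, not libm calls):

    #include <cmath>

    // Round-half-away-from-zero built only from trunc/fabs/copysign,
    // mirroring the DAG expansion used when FTRUNC is legal for the type.
    double roundViaTrunc(double X) {
      double T = std::trunc(X);               // requires a legal FTRUNC
      double AbsDiff = std::fabs(X - T);      // magnitude of the dropped fraction
      double SignOne = std::copysign(1.0, X); // +/-1.0 carrying the sign of X
      return T + (AbsDiff >= 0.5 ? SignOne : 0.0);
    }

Because CI and newer subtargets have v_trunc_f64, f64 FTRUNC is legal there and
FROUND can take this short path; SI lacks the instruction, so it still falls
through to the long bit-manipulation sequence in LowerFROUND64.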