Index: lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.h +++ lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -131,6 +131,8 @@ return false; } + static bool allUsesHaveSourceMods(const SDNode *N, + unsigned CostThreshold = 4); bool isFAbsFree(EVT VT) const override; bool isFNegFree(EVT VT) const override; bool isTruncateFree(EVT Src, EVT Dest) const override; Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -566,13 +566,19 @@ case AMDGPUISD::INTERP_P1: case AMDGPUISD::INTERP_P2: case AMDGPUISD::DIV_SCALE: + + // TODO: Should really be looking at the users of the bitcast. These are + // problematic because bitcasts are used to legalize all stores to integer + // types. + case ISD::BITCAST: return false; default: return true; } } -static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold = 4) { +bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N, + unsigned CostThreshold) { // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus // it is truly free to use a source modifier in all cases. If there are // multiple users but for each one will necessitate using VOP3, there will be Index: lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- lib/Target/AMDGPU/SIISelLowering.h +++ lib/Target/AMDGPU/SIISelLowering.h @@ -100,6 +100,7 @@ SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const; unsigned getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0, const SDNode *N1) const; Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -487,6 +487,7 @@ setTargetDAGCombine(ISD::FCANONICALIZE); setTargetDAGCombine(ISD::SCALAR_TO_VECTOR); setTargetDAGCombine(ISD::ZERO_EXTEND); + setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); // All memory operations. Some folding on the pointer operand is done to help // matching the constant offsets in the addressing modes. @@ -4530,6 +4531,24 @@ return SDValue(); } +SDValue SITargetLowering::performExtractVectorEltCombine( + SDNode *N, DAGCombinerInfo &DCI) const { + SDValue Vec = N->getOperand(0); + + SelectionDAG &DAG= DCI.DAG; + if (Vec.getOpcode() == ISD::FNEG && allUsesHaveSourceMods(N)) { + SDLoc SL(N); + EVT EltVT = N->getValueType(0); + SDValue Idx = N->getOperand(1); + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, + Vec.getOperand(0), Idx); + return DAG.getNode(ISD::FNEG, SL, EltVT, Elt); + } + + return SDValue(); +} + + unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0, const SDNode *N1) const { @@ -4818,6 +4837,8 @@ break; } + case ISD::EXTRACT_VECTOR_ELT: + return performExtractVectorEltCombine(N, DCI); } return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); } Index: test/CodeGen/AMDGPU/fneg-combines.ll =================================================================== --- test/CodeGen/AMDGPU/fneg-combines.ll +++ test/CodeGen/AMDGPU/fneg-combines.ll @@ -1471,11 +1471,10 @@ ; GCN-LABEL: {{^}}v_fneg_mul_legacy_multi_use_mul_legacy_f32: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN-DAG: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] -; GCN-DAG: v_xor_b32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], 0x80000000, [[ADD]] -; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]] -; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]] -; GCN: buffer_store_dword [[MUL]] +; GCN: v_mul_legacy_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]] +; GCN-NEXT: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[ADD]], 4.0 +; GCN-NEXT: buffer_store_dword [[ADD]] +; GCN-NEXT: buffer_store_dword [[MUL]] define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 Index: test/CodeGen/AMDGPU/fneg.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fneg.f16.ll +++ test/CodeGen/AMDGPU/fneg.f16.ll @@ -1,6 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=gfx901 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=GFX89 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN -check-prefix=GFX89 %s ; FIXME: Should be able to do scalar op ; GCN-LABEL: {{^}}s_fneg_f16: @@ -129,6 +129,41 @@ ret void } +; GCN-LABEL: {{^}}v_extract_fneg_fold_v2f16: +; GCN: flat_load_dword [[VAL:v[0-9]+]] +; CI-DAG: v_mul_f32_e32 v{{[0-9]+}}, -4.0, v{{[0-9]+}} +; CI-DAG: v_sub_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} + +; GFX89: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[VAL]] +; GFX89-DAG: v_mul_f16_e32 v{{[0-9]+}}, -4.0, [[VAL]] +; GFX89-DAG: v_sub_f16_e32 v{{[0-9]+}}, 2.0, [[ELT1]] +define amdgpu_kernel void @v_extract_fneg_fold_v2f16(<2 x half> addrspace(1)* %in) #0 { + %val = load <2 x half>, <2 x half> addrspace(1)* %in + %fneg = fsub <2 x half> , %val + %elt0 = extractelement <2 x half> %fneg, i32 0 + %elt1 = extractelement <2 x half> %fneg, i32 1 + + %fmul0 = fmul half %elt0, 4.0 + %fadd1 = fadd half %elt1, 2.0 + store volatile half %fmul0, half addrspace(1)* undef + store volatile half %fadd1, half addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}v_extract_fneg_no_fold_v2f16: +; GCN: flat_load_dword [[VAL:v[0-9]+]] +; GCN: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80008000, [[VAL]] +; GCN: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[NEG]] +define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(<2 x half> addrspace(1)* %in) #0 { + %val = load <2 x half>, <2 x half> addrspace(1)* %in + %fneg = fsub <2 x half> , %val + %elt0 = extractelement <2 x half> %fneg, i32 0 + %elt1 = extractelement <2 x half> %fneg, i32 1 + store volatile half %elt0, half addrspace(1)* undef + store volatile half %elt1, half addrspace(1)* undef + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() #1 attributes #0 = { nounwind }