Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -7349,18 +7349,24 @@ return SDValue(); // TODO: Support other binary operations. case ISD::FADD: + case ISD::FSUB: + case ISD::FMUL: case ISD::ADD: case ISD::UMIN: case ISD::UMAX: case ISD::SMIN: case ISD::SMAX: case ISD::FMAXNUM: - case ISD::FMINNUM: - return DAG.getNode(Opc, SL, EltVT, - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, - Vec.getOperand(0), Idx), - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, - Vec.getOperand(1), Idx)); + case ISD::FMINNUM: { + SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, + Vec.getOperand(0), Idx); + SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, + Vec.getOperand(1), Idx); + + DCI.AddToWorklist(Elt0.getNode()); + DCI.AddToWorklist(Elt1.getNode()); + return DAG.getNode(Opc, SL, EltVT, Elt0, Elt1, Vec->getFlags()); + } } } Index: test/CodeGen/AMDGPU/reduction.ll =================================================================== --- test/CodeGen/AMDGPU/reduction.ll +++ test/CodeGen/AMDGPU/reduction.ll @@ -1,19 +1,79 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s -; GCN-LABEL: {{^}}reduction_half4: +; GCN-LABEL: {{^}}reduction_fadd_v4f16: ; GFX9: v_pk_add_f16 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX9-NEXT: v_add_f16_sdwa v{{[0-9]+}}, [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI: v_add_f16_sdwa ; VI-NEXT: v_add_f16_e32 ; VI-NEXT: v_add_f16_e32 -define 
half @reduction_half4(<4 x half> %vec4) { +define half @reduction_fadd_v4f16(<4 x half> %vec4) { entry: %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> - %bin.rdx = fadd fast <4 x half> %vec4, %rdx.shuf + %bin.rdx = fadd <4 x half> %vec4, %rdx.shuf %rdx.shuf1 = shufflevector <4 x half> %bin.rdx, <4 x half> undef, <4 x i32> - %bin.rdx2 = fadd fast <4 x half> %bin.rdx, %rdx.shuf1 + %bin.rdx2 = fadd <4 x half> %bin.rdx, %rdx.shuf1 + %res = extractelement <4 x half> %bin.rdx2, i32 0 + ret half %res +} + +; GCN-LABEL: {{^}}reduction_fsub_v4f16: +; GFX9: s_waitcnt +; GFX9-NEXT: v_pk_add_f16 [[ADD:v[0-9]+]], v0, v1 neg_lo:[0,1] neg_hi:[0,1]{{$}} +; GFX9-NEXT: v_sub_f16_sdwa v0, [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_setpc_b64 + +; VI: v_sub_f16_sdwa +; VI-NEXT: v_sub_f16_e32 +; VI-NEXT: v_sub_f16_e32 +; VI-NEXT: s_setpc_b64 +define half @reduction_fsub_v4f16(<4 x half> %vec4) { +entry: + %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> + %bin.rdx = fsub <4 x half> %vec4, %rdx.shuf + %rdx.shuf1 = shufflevector <4 x half> %bin.rdx, <4 x half> undef, <4 x i32> + %bin.rdx2 = fsub <4 x half> %bin.rdx, %rdx.shuf1 + %res = extractelement <4 x half> %bin.rdx2, i32 0 + ret half %res +} + +; Make sure nsz is preserved when the operations are split. 
+; GCN-LABEL: {{^}}reduction_fsub_v4f16_preserve_fmf: +; GFX9: s_waitcnt +; GFX9-NEXT: v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]{{$}} +; GFX9-NEXT: v_sub_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_setpc_b64 + +; VI: s_waitcnt +; VI-NEXT: v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_sub_f16_e32 v0, v0, v1 +; VI-NEXT: v_sub_f16_e32 v0, v2, v0 +; VI-NEXT: s_setpc_b64 +define half @reduction_fsub_v4f16_preserve_fmf(<4 x half> %vec4) { +entry: + %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> + %bin.rdx = fsub nsz <4 x half> %vec4, %rdx.shuf + %rdx.shuf1 = shufflevector <4 x half> %bin.rdx, <4 x half> undef, <4 x i32> + %bin.rdx2 = fsub nsz <4 x half> %bin.rdx, %rdx.shuf1 + %res = extractelement <4 x half> %bin.rdx2, i32 0 + %neg.res = fsub half -0.0, %res + ret half %neg.res +} + +; GCN-LABEL: {{^}}reduction_fmul_v4f16: +; GFX9: v_pk_mul_f16 [[MUL:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX9-NEXT: v_mul_f16_sdwa v{{[0-9]+}}, [[MUL]], [[MUL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 + +; VI: v_mul_f16_sdwa +; VI-NEXT: v_mul_f16_e32 +; VI-NEXT: v_mul_f16_e32 +define half @reduction_fmul_v4f16(<4 x half> %vec4) { +entry: + %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> + %bin.rdx = fmul <4 x half> %vec4, %rdx.shuf + %rdx.shuf1 = shufflevector <4 x half> %bin.rdx, <4 x half> undef, <4 x i32> + %bin.rdx2 = fmul <4 x half> %bin.rdx, %rdx.shuf1 %res = extractelement <4 x half> %bin.rdx2, i32 0 ret half %res } @@ -52,19 +112,19 @@ define half @reduction_half8(<8 x half> %vec8) { entry: %rdx.shuf = shufflevector <8 x half> %vec8, <8 x half> undef, <8 x i32> - %bin.rdx = fadd fast <8 x half> %vec8, %rdx.shuf + %bin.rdx = fadd <8 x half> %vec8, %rdx.shuf %rdx.shuf1 = shufflevector <8 x half> %bin.rdx, <8 x half> undef, <8 x i32> - %bin.rdx2 = fadd fast <8 x half> %bin.rdx, 
%rdx.shuf1 + %bin.rdx2 = fadd <8 x half> %bin.rdx, %rdx.shuf1 %rdx.shuf3 = shufflevector <8 x half> %bin.rdx2, <8 x half> undef, <8 x i32> - %bin.rdx4 = fadd fast <8 x half> %bin.rdx2, %rdx.shuf3 + %bin.rdx4 = fadd <8 x half> %bin.rdx2, %rdx.shuf3 %res = extractelement <8 x half> %bin.rdx4, i32 0 ret half %res } ; GCN-LABEL: {{^}}reduction_v8i16: -; GFX9: v_pk_add_u16 [[ADD1]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} -; GFX9-NEXT: v_pk_add_u16 [[ADD2]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} -; GFX9-NEXT: v_pk_add_u16 [[ADD3]], [[ADD2]], [[ADD1]]{{$}} +; GFX9: v_pk_add_u16 [[ADD1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX9-NEXT: v_pk_add_u16 [[ADD2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX9-NEXT: v_pk_add_u16 [[ADD3:v[0-9]+]], [[ADD2]], [[ADD1]]{{$}} ; GFX9-NEXT: v_add_u16_sdwa v{{[0-9]+}}, [[ADD3]], [[ADD3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI: v_add_u16_sdwa @@ -92,9 +152,9 @@ ; GFX9-NEXT: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX9-NEXT: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX9: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} -; GFX9-NEXT: v_pk_add_f16 [[ADD1]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} -; GFX9-NEXT: v_pk_add_f16 [[ADD2]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} -; GFX9-NEXT: v_pk_add_f16 [[ADD3]], [[ADD2]], [[ADD1]]{{$}} +; GFX9-NEXT: v_pk_add_f16 [[ADD1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX9-NEXT: v_pk_add_f16 [[ADD2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX9-NEXT: v_pk_add_f16 [[ADD3:v[0-9]+]], [[ADD2]], [[ADD1]]{{$}} ; GFX9-NEXT: v_add_f16_sdwa v{{[0-9]+}}, [[ADD3]], [[ADD3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI: v_add_f16_sdwa @@ -116,13 +176,13 @@ define half @reduction_half16(<16 x half> %vec16) { entry: %rdx.shuf = shufflevector <16 x half> %vec16, <16 x half> undef, <16 x i32> - %bin.rdx = fadd fast <16 x half> %vec16, %rdx.shuf + %bin.rdx = fadd <16 x half> %vec16, %rdx.shuf %rdx.shuf1 = shufflevector <16 x half> %bin.rdx, <16 x 
half> undef, <16 x i32> - %bin.rdx2 = fadd fast <16 x half> %bin.rdx, %rdx.shuf1 + %bin.rdx2 = fadd <16 x half> %bin.rdx, %rdx.shuf1 %rdx.shuf3 = shufflevector <16 x half> %bin.rdx2, <16 x half> undef, <16 x i32> - %bin.rdx4 = fadd fast <16 x half> %bin.rdx2, %rdx.shuf3 + %bin.rdx4 = fadd <16 x half> %bin.rdx2, %rdx.shuf3 %rdx.shuf5 = shufflevector <16 x half> %bin.rdx4, <16 x half> undef, <16 x i32> - %bin.rdx6 = fadd fast <16 x half> %bin.rdx4, %rdx.shuf5 + %bin.rdx6 = fadd <16 x half> %bin.rdx4, %rdx.shuf5 %res = extractelement <16 x half> %bin.rdx6, i32 0 ret half %res } @@ -373,40 +433,77 @@ ret i16 %res } -; GCN-LABEL: {{^}}reduction_fmax_v4half: +; GCN-LABEL: {{^}}reduction_maxnum_v4f16: ; GFX9: v_pk_max_f16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX9-NEXT: v_max_f16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI: v_max_f16_sdwa ; VI-NEXT: v_max_f16_e32 ; VI-NEXT: v_max_f16_e32 -define half @reduction_fmax_v4half(<4 x half> %vec4) { +define half @reduction_maxnum_v4f16(<4 x half> %vec4) { entry: %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> - %rdx.minmax.cmp = fcmp fast ogt <4 x half> %vec4, %rdx.shuf + %rdx.minmax = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %vec4, <4 x half> %rdx.shuf) + %rdx.shuf1 = shufflevector <4 x half> %rdx.minmax, <4 x half> undef, <4 x i32> + %rdx.minmax3 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %rdx.minmax, <4 x half> %rdx.shuf1) + %res = extractelement <4 x half> %rdx.minmax3, i32 0 + ret half %res +} + +; GCN-LABEL: {{^}}reduction_minnum_v4f16: +; GFX9: v_pk_min_f16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX9-NEXT: v_min_f16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 + +; VI: v_min_f16_sdwa +; VI-NEXT: v_min_f16_e32 +; VI-NEXT: v_min_f16_e32 +define half @reduction_minnum_v4f16(<4 x half> %vec4) { +entry: + %rdx.shuf = shufflevector <4 x half> %vec4, <4 x 
half> undef, <4 x i32> + %rdx.minmax = call <4 x half> @llvm.minnum.v4f16(<4 x half> %vec4, <4 x half> %rdx.shuf) + %rdx.shuf1 = shufflevector <4 x half> %rdx.minmax, <4 x half> undef, <4 x i32> + %rdx.minmax3 = call <4 x half> @llvm.minnum.v4f16(<4 x half> %rdx.minmax, <4 x half> %rdx.shuf1) + %res = extractelement <4 x half> %rdx.minmax3, i32 0 + ret half %res +} + +; GCN-LABEL: {{^}}reduction_fast_max_pattern_v4f16: +; GFX9: v_pk_max_f16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX9-NEXT: v_max_f16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 + +; VI: v_max_f16_sdwa +; VI-NEXT: v_max_f16_e32 +; VI-NEXT: v_max_f16_e32 +define half @reduction_fast_max_pattern_v4f16(<4 x half> %vec4) { +entry: + %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> + %rdx.minmax.cmp = fcmp nnan nsz ogt <4 x half> %vec4, %rdx.shuf %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp, <4 x half> %vec4, <4 x half> %rdx.shuf %rdx.shuf1 = shufflevector <4 x half> %rdx.minmax.select, <4 x half> undef, <4 x i32> - %rdx.minmax.cmp2 = fcmp fast ogt <4 x half> %rdx.minmax.select, %rdx.shuf1 + %rdx.minmax.cmp2 = fcmp nnan nsz ogt <4 x half> %rdx.minmax.select, %rdx.shuf1 %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x half> %rdx.minmax.select, <4 x half> %rdx.shuf1 %res = extractelement <4 x half> %rdx.minmax.select3, i32 0 ret half %res } -; GCN-LABEL: {{^}}reduction_fmin_v4half: +; GCN-LABEL: {{^}}reduction_fast_min_pattern_v4f16: ; GFX9: v_pk_min_f16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX9-NEXT: v_min_f16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI: v_min_f16_sdwa ; VI-NEXT: v_min_f16_e32 ; VI-NEXT: v_min_f16_e32 -define half @reduction_fmin_v4half(<4 x half> %vec4) { +define half @reduction_fast_min_pattern_v4f16(<4 x half> %vec4) { entry: %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> - 
%rdx.minmax.cmp = fcmp fast olt <4 x half> %vec4, %rdx.shuf + %rdx.minmax.cmp = fcmp nnan nsz olt <4 x half> %vec4, %rdx.shuf %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp, <4 x half> %vec4, <4 x half> %rdx.shuf %rdx.shuf1 = shufflevector <4 x half> %rdx.minmax.select, <4 x half> undef, <4 x i32> - %rdx.minmax.cmp2 = fcmp fast olt <4 x half> %rdx.minmax.select, %rdx.shuf1 + %rdx.minmax.cmp2 = fcmp nnan nsz olt <4 x half> %rdx.minmax.select, %rdx.shuf1 %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x half> %rdx.minmax.select, <4 x half> %rdx.shuf1 %res = extractelement <4 x half> %rdx.minmax.select3, i32 0 ret half %res } + +declare <4 x half> @llvm.minnum.v4f16(<4 x half>, <4 x half>) +declare <4 x half> @llvm.maxnum.v4f16(<4 x half>, <4 x half>)